bm-preprocessing 1.0.0__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/PKG-INFO +1 -1
  2. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/USAGE.md +10 -2
  3. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/pyproject.toml +1 -1
  4. bm_preprocessing-1.2.0/src/bm_preprocessing/IR/__init__.py +9 -0
  5. bm_preprocessing-1.2.0/src/bm_preprocessing/IR/all_vis.py +30 -0
  6. bm_preprocessing-1.2.0/src/bm_preprocessing/IR/eval_metrics.py +26 -0
  7. bm_preprocessing-1.2.0/src/bm_preprocessing/IR/ndd.py +26 -0
  8. bm_preprocessing-1.2.0/src/bm_preprocessing/IR/rel.py +26 -0
  9. bm_preprocessing-1.2.0/src/bm_preprocessing/IR/sources/all_vis.py +294 -0
  10. bm_preprocessing-1.2.0/src/bm_preprocessing/IR/sources/eval_metrics.py +224 -0
  11. bm_preprocessing-1.2.0/src/bm_preprocessing/IR/sources/ndd.py +105 -0
  12. bm_preprocessing-1.2.0/src/bm_preprocessing/IR/sources/rel.py +116 -0
  13. bm_preprocessing-1.0.0/src/bm_preprocessing/IR/__init__.py +0 -5
  14. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/.gitignore +0 -0
  15. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/README.md +0 -0
  16. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/__init__.py +0 -0
  17. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/adaboost.py +0 -0
  18. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/all.py +0 -0
  19. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/all_vis.py +0 -0
  20. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/apriori.py +0 -0
  21. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/bagging.py +0 -0
  22. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/hash.py +0 -0
  23. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/hunts.py +0 -0
  24. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/hunts_test.py +0 -0
  25. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/id3.py +0 -0
  26. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/id3_test.py +0 -0
  27. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/lib_doc.py +0 -0
  28. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/metrics.py +0 -0
  29. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/preprocessing.py +0 -0
  30. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/adaboost.py +0 -0
  31. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/all.py +0 -0
  32. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/all_hunts_tree +0 -0
  33. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/all_vis.py +0 -0
  34. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/apriori.py +0 -0
  35. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/bagging.py +0 -0
  36. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/data.csv +0 -0
  37. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/hash.py +0 -0
  38. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/hunts.py +0 -0
  39. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/hunts_test.py +0 -0
  40. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/id3.py +0 -0
  41. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/id3_test.py +0 -0
  42. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/lib_doc.py +0 -0
  43. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/metrics.py +0 -0
  44. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/preprocessing.py +0 -0
  45. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/DM/sources/tennis.csv +0 -0
  46. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/IR/all.py +0 -0
  47. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/IR/sources/all.py +0 -0
  48. {bm_preprocessing-1.0.0 → bm_preprocessing-1.2.0}/src/bm_preprocessing/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: bm-preprocessing
3
- Version: 1.0.0
3
+ Version: 1.2.0
4
4
  Summary: A package to preprocess text data
5
5
  Requires-Python: >=3.8
6
6
  Requires-Dist: build>=1.2.2.post1
@@ -14,7 +14,7 @@ Create a file `example.py`:
14
14
 
15
15
  ```python
16
16
  # Import modules
17
- from bm_preprocessing.IR import all
17
+ from bm_preprocessing.IR import all, all_vis, eval_metrics, ndd, rel
18
18
  from bm_preprocessing.DM import adaboost, apriori, bagging, hash, hunts, hunts_test, id3, id3_test, lib_doc, metrics, preprocessing
19
19
  from bm_preprocessing.DM import all, all_vis
20
20
 
@@ -108,6 +108,10 @@ Then in the Python REPL:
108
108
 
109
109
  ```bash
110
110
  python -c "from bm_preprocessing.IR import all; print(all)"
111
+ python -c "from bm_preprocessing.IR import all_vis; print(all_vis)"
112
+ python -c "from bm_preprocessing.IR import eval_metrics; print(eval_metrics)"
113
+ python -c "from bm_preprocessing.IR import ndd; print(ndd)"
114
+ python -c "from bm_preprocessing.IR import rel; print(rel)"
111
115
  python -c "from bm_preprocessing.DM import all; print(all)"
112
116
  python -c "from bm_preprocessing.DM import all_vis; print(all_vis)"
113
117
  python -c "from bm_preprocessing.DM import apriori; print(apriori)"
@@ -129,7 +133,11 @@ python -c "from bm_preprocessing.DM import preprocessing; print(preprocessing)"
129
133
 
130
134
  | Import | Description |
131
135
  |--------|-------------|
132
- | `from bm_preprocessing.IR import all` | Information Retrieval (BM25, TF-IDF, Boolean) |
136
+ | `from bm_preprocessing.IR import all` | Information Retrieval (MinHash, LSH, Rocchio, Jaccard, VS) |
137
+ | `from bm_preprocessing.IR import all_vis` | IR algorithms with Matplotlib visualizations |
138
+ | `from bm_preprocessing.IR import eval_metrics` | Jaccard, PRF, Compression Ratio, MAP metrics & plots |
139
+ | `from bm_preprocessing.IR import ndd` | Near Duplicate Documents (MinHash & LSH) |
140
+ | `from bm_preprocessing.IR import rel` | Relevance feedback & query expansion (Rocchio & LCA) |
133
141
  | `from bm_preprocessing.DM import all` | All DM algorithms (Hunt's, ID3, Bagging, AdaBoost, metrics) |
134
142
  | `from bm_preprocessing.DM import all_vis` | All DM algorithms + graphviz & full visualization |
135
143
  | `from bm_preprocessing.DM import apriori` | Apriori algorithm |
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "bm-preprocessing"
7
- version = "1.0.0"
7
+ version = "1.2.0"
8
8
  description = "A package to preprocess text data"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -0,0 +1,9 @@
1
+ """IR subpackage - Information Retrieval source code."""
2
+
3
+ from .all import all
4
+ from .all_vis import all_vis
5
+ from .eval_metrics import eval_metrics
6
+ from .ndd import ndd
7
+ from .rel import rel
8
+
9
+ __all__ = ["all", "all_vis", "eval_metrics", "ndd", "rel"]
@@ -0,0 +1,30 @@
1
+ """Source code loader for IR/all_vis.py"""
2
+
3
+ from pathlib import Path
4
+
5
+
6
class SourceCodeModule:
    """Printable handle whose string form is the text of a source file."""

    def __init__(self, name: str, source_path: Path):
        # Nothing is read here; the file is opened on first access only.
        self.name = name
        self._source_path = source_path
        self._source_code = None

    @property
    def source_code(self) -> str:
        """Return the file's text, reading it from disk on first use."""
        cached = self._source_code
        if cached is not None:
            return cached
        cached = self._source_path.read_text(encoding="utf-8")
        self._source_code = cached
        return cached

    def __str__(self) -> str:
        return self.source_code

    def __repr__(self) -> str:
        return self.source_code
26
+
27
+
28
+ # Get the path to the source file
29
+ _source_file = Path(__file__).parent / "sources" / "all_vis.py"
30
+ all_vis = SourceCodeModule("IR.all_vis", _source_file)
@@ -0,0 +1,26 @@
1
+ """Source code loader for IR/eval_metrics.py"""
2
+ from pathlib import Path
3
+
4
class SourceCodeModule:
    """Wrapper around a source file that prints as that file's contents."""

    def __init__(self, name: str, source_path: Path):
        self.name, self._source_path, self._source_code = name, source_path, None

    @property
    def source_code(self) -> str:
        """Contents of the wrapped file; read once, then served from memory."""
        if self._source_code is not None:
            return self._source_code
        self._source_code = self._source_path.read_text(encoding="utf-8")
        return self._source_code

    def __str__(self) -> str:
        return self.source_code

    def __repr__(self) -> str:
        return self.source_code
22
+
23
+
24
+ # Get the path to the source file
25
+ _source_file = Path(__file__).parent / "sources" / "eval_metrics.py"
26
+ eval_metrics = SourceCodeModule("IR.eval_metrics", _source_file)
@@ -0,0 +1,26 @@
1
+ """Source code loader for IR/ndd.py"""
2
+ from pathlib import Path
3
+
4
class SourceCodeModule:
    """Object that renders as the text of the file it points at."""

    def __init__(self, name: str, source_path: Path):
        self.name = name
        self._source_path = source_path
        # Filled in by the ``source_code`` property the first time it is read.
        self._source_code = None

    @property
    def source_code(self) -> str:
        """Text of the source file (UTF-8), loaded on demand and then kept."""
        not_loaded_yet = self._source_code is None
        if not_loaded_yet:
            text = self._source_path.read_text(encoding="utf-8")
            self._source_code = text
        return self._source_code

    def __str__(self) -> str:
        return self.source_code

    def __repr__(self) -> str:
        return self.source_code
22
+
23
+
24
+ # Get the path to the source file
25
+ _source_file = Path(__file__).parent / "sources" / "ndd.py"
26
+ ndd = SourceCodeModule("IR.ndd", _source_file)
@@ -0,0 +1,26 @@
1
+ """Source code loader for IR/rel.py"""
2
+ from pathlib import Path
3
+
4
class SourceCodeModule:
    """Named reference to a source file; printing it shows the file's text."""

    def __init__(self, name: str, source_path: Path):
        self.name = name
        self._source_path = source_path
        self._source_code = None  # populated lazily by ``source_code``

    @property
    def source_code(self) -> str:
        """Read the file on first access and reuse the stored text afterwards."""
        if self._source_code is None:
            path = self._source_path
            self._source_code = path.read_text(encoding="utf-8")
        return self._source_code

    def __str__(self) -> str:
        text = self.source_code
        return text

    def __repr__(self) -> str:
        text = self.source_code
        return text
22
+
23
+
24
+ # Get the path to the source file
25
+ _source_file = Path(__file__).parent / "sources" / "rel.py"
26
+ rel = SourceCodeModule("IR.rel", _source_file)
@@ -0,0 +1,294 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import random, hashlib, nltk
5
+ from itertools import combinations
6
+ from nltk.corpus import stopwords
7
+ from nltk.tokenize import word_tokenize
8
+ from nltk.stem import PorterStemmer
9
+ from sklearn.feature_extraction.text import TfidfVectorizer
10
+ from sklearn.metrics.pairwise import cosine_similarity
11
+
12
+ nltk.download('punkt_tab')
13
+ nltk.download('stopwords')
14
+
15
+ random.seed(42)
16
+ np.random.seed(42)
17
+
18
+ docs = [
19
+ "information retrieval is the process of obtaining relevant documents",
20
+ "search engines use ranking algorithms for information retrieval",
21
+ "information retrieval systems index and rank documents",
22
+ "retrieval models help search engines find relevant documents",
23
+ "inverted index is widely used in information retrieval",
24
+ "query expansion improves retrieval effectiveness",
25
+ "query expansion adds related terms to the query",
26
+ "expansion techniques improve search results",
27
+ "duplicate documents appear frequently in search engines",
28
+ "near duplicate detection improves indexing"
29
+ ]
30
+
31
+ stop_words = set(stopwords.words('english'))
32
+ stemmer = PorterStemmer()
33
+
34
def preprocess(text):
    """Lower-case and tokenize *text*, then return the stems of the kept tokens.

    A token is kept only when it is alphanumeric and not in ``stop_words``.
    """
    stems = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum() or token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return stems
36
+
37
def sim_df(mat, title):
    """Print *mat* as a labelled similarity table and return it as a DataFrame.

    Args:
        mat: 2-D array-like of similarity values.
        title: Heading printed above the table.

    Returns:
        pandas.DataFrame holding the values rounded to 3 decimals, with rows
        and columns labelled ``Doc0``, ``Doc1``, ...
    """
    values = np.round(np.asarray(mat), 3)
    # Labels are derived from the matrix itself rather than from the
    # module-level ``docs`` list, so the helper works for a matrix of any size.
    n_rows, n_cols = values.shape
    df = pd.DataFrame(
        values,
        index=[f"Doc{i}" for i in range(n_rows)],
        columns=[f"Doc{j}" for j in range(n_cols)],
    )
    print(f"\n{title}")
    print(df)
    return df
44
+
45
def prf(tp, fp, fn):
    """Return (precision, recall, F-score), each rounded to 3 decimals.

    Any ratio whose denominator is zero is reported as 0.
    """
    predicted = tp + fp
    actual = tp + fn
    precision = tp / predicted if predicted else 0
    recall = tp / actual if actual else 0
    combined = precision + recall
    fscore = 2 * precision * recall / combined if combined else 0
    return tuple(round(value, 3) for value in (precision, recall, fscore))
50
+
51
+ processed_docs = [" ".join(preprocess(doc)) for doc in docs]
52
+ shingles = [set(preprocess(doc)) for doc in docs]
53
+
54
+ # MinHash
55
+ num_hash, max_shingle = 50, 1000
56
+ hash_funcs = [(random.randint(1, max_shingle), random.randint(0, max_shingle)) for _ in range(num_hash)]
57
+ vocab = list(set(word for doc in shingles for word in doc))
58
+ shingle_index = {w: i for i, w in enumerate(vocab)}
59
+
60
def h(x, a, b):
    """Hash index *x* with the linear map ``(a * x + b) mod max_shingle``."""
    linear = a * x + b
    return linear % max_shingle
61
+
62
+ signature = np.full((num_hash, len(docs)), np.inf)
63
+ for d, doc in enumerate(shingles):
64
+ for word in doc:
65
+ idx = shingle_index[word]
66
+ for i, (a, b) in enumerate(hash_funcs):
67
+ signature[i, d] = min(signature[i, d], h(idx, a, b))
68
+ signature = signature.astype(int)
69
+
70
# Estimated similarity of documents i and j = fraction of signature rows on
# which their MinHash values agree. Broadcasting compares every column pair at
# once; a plain ndarray is produced because NumPy no longer recommends np.matrix.
minhash_sim = (signature[:, :, None] == signature[:, None, :]).mean(axis=0)
sim_df(minhash_sim, "MinHash Similarity Table")
72
+
73
+ # LSH
74
def get_lsh_candidates(sig, bands):
    """Return candidate near-duplicate pairs found by banding a signature matrix.

    The rows of *sig* are split into ``bands`` consecutive groups of
    ``sig.shape[0] // bands`` rows. Two columns become a candidate pair when
    their values agree on every row of at least one band.

    Args:
        sig: 2-D NumPy signature array; columns are the items being compared.
        bands: Number of bands. Rows left over when ``bands`` does not divide
            the row count evenly are ignored.

    Returns:
        set of ``(i, j)`` column-index tuples with ``i < j``.

    Raises:
        ValueError: if ``bands`` leaves fewer than one row per band.
    """
    rows = sig.shape[0] // bands
    if rows < 1:
        # With zero rows every band slice is empty, so all columns would land
        # in the same bucket and every pair would be reported as a candidate.
        raise ValueError(
            f"bands={bands} leaves no rows per band for a signature with "
            f"{sig.shape[0]} rows"
        )

    buckets = {}
    for b in range(bands):
        start = b * rows
        for d in range(sig.shape[1]):
            # The raw bytes of the band identify it exactly, so no extra
            # digest is needed to build the bucket key.
            key = sig[start:start + rows, d].tobytes()
            buckets.setdefault((b, key), []).append(d)

    candidates = set()
    for group in buckets.values():
        # Columns were appended in increasing order, so combinations() already
        # yields each pair as (smaller, larger).
        candidates.update(combinations(group, 2))
    return candidates
87
+
88
+ candidates = get_lsh_candidates(signature, 10)
89
+ lsh_df = pd.DataFrame([(f"Doc{i}", f"Doc{j}") for i, j in sorted(candidates)], columns=["Document 1", "Document 2"])
90
+ print("\nLSH Candidate Pairs Table")
91
+ print(lsh_df)
92
+
93
+ # Rocchio
94
+ vectorizer = TfidfVectorizer()
95
+ tfidf = vectorizer.fit_transform(processed_docs)
96
+ query = "information retrieval"
97
+ q_vec = vectorizer.transform([" ".join(preprocess(query))])
98
+ scores = cosine_similarity(q_vec, tfidf)[0]
99
+ top_docs = scores.argsort()[::-1][:3]
100
+ relevant = tfidf[top_docs]
101
+ non_relevant = tfidf[[i for i in range(len(docs)) if i not in top_docs]]
102
+
103
+ alpha, beta, gamma = 1, 0.75, 0.15
104
+ new_query = alpha * q_vec + beta * np.asarray(relevant.mean(axis=0)) - gamma * np.asarray(non_relevant.mean(axis=0))
105
+ new_scores = cosine_similarity(np.asarray(new_query), tfidf)[0]
106
+
107
+ rocchio_df = pd.DataFrame({
108
+ "Document": [f"Doc{i}" for i in range(len(docs))],
109
+ "Original Score": np.round(scores, 3),
110
+ "Updated Score": np.round(new_scores, 3)
111
+ })
112
+ print("\nRocchio Score Table")
113
+ print(rocchio_df)
114
+
115
+ # LCA
116
+ top_k = scores.argsort()[::-1][:5]
117
+ term_freq = {}
118
+ for doc in [processed_docs[i] for i in top_k]:
119
+ for word in doc.split():
120
+ term_freq[word] = term_freq.get(word, 0) + 1
121
+ expanded_terms = sorted(term_freq, key=term_freq.get, reverse=True)[:5]
122
+ expanded_query = " ".join(preprocess(query)) + " " + " ".join(expanded_terms)
123
+ expanded_scores = cosine_similarity(vectorizer.transform([expanded_query]), tfidf)[0]
124
+
125
+ print("\nLCA Expanded Query")
126
+ print(expanded_query)
127
+
128
+ lca_df = pd.DataFrame({
129
+ "Document": [f"Doc{i}" for i in range(len(docs))],
130
+ "LCA Score": np.round(expanded_scores, 3)
131
+ })
132
+ print("\nLCA Score Table")
133
+ print(lca_df)
134
+
135
+ # Jaccard
136
def jaccard(a, b):
    """Return the Jaccard similarity ``|a & b| / |a | b|`` of two sets.

    Two empty sets are treated as identical (similarity 1.0) instead of
    raising ZeroDivisionError.
    """
    union = a | b
    if not union:
        return 1.0
    return len(a & b) / len(union)
137
# Pairwise Jaccard similarity of the documents' shingle sets. Built as a plain
# ndarray because NumPy no longer recommends np.matrix; [i, j] indexing and
# np.asarray() behave the same on it.
jaccard_matrix = np.array([[jaccard(left, right) for right in shingles] for left in shingles])
sim_df(jaccard_matrix, "Jaccard Similarity Table")
139
+
140
+ # Precision Recall Fscore with different bucket sizes
141
+ threshold = 0.30
142
+ ground_truth = {(i, j) for i in range(len(docs)) for j in range(i + 1, len(docs)) if float(jaccard_matrix[i, j]) >= threshold}
143
+
144
+ bucket_rows = []
145
+ for b in [5, 10, 25]:
146
+ if num_hash % b == 0:
147
+ cand = get_lsh_candidates(signature, b)
148
+ tp = len(cand & ground_truth)
149
+ fp = len(cand - ground_truth)
150
+ fn = len(ground_truth - cand)
151
+ p, r, f = prf(tp, fp, fn)
152
+ bucket_rows.append([b, len(cand), tp, fp, fn, p, r, f])
153
+
154
+ bucket_df = pd.DataFrame(bucket_rows, columns=["Bucket Size", "Candidate Pairs", "TP", "FP", "FN", "Precision", "Recall", "Fscore"])
155
+ print("\nPrecision Recall Fscore with Different Bucket Size")
156
+ print(bucket_df)
157
+
158
+ # Signature Size Compression Ratio Accuracy
159
+ original_size = len(vocab) * len(docs)
160
+ comp_rows = []
161
+ for rows_used in [10, 20, 30, 40, 50]:
162
+ sub_sig = signature[:rows_used, :]
163
+ correct, total = 0, 0
164
+ for i in range(len(docs)):
165
+ for j in range(i + 1, len(docs)):
166
+ approx = np.mean(sub_sig[:, i] == sub_sig[:, j]) >= threshold
167
+ actual = float(jaccard_matrix[i, j]) >= threshold
168
+ correct += int(approx == actual)
169
+ total += 1
170
+ comp_rows.append([
171
+ rows_used,
172
+ sub_sig.size,
173
+ round(sub_sig.size / original_size, 3),
174
+ round(correct / total, 3)
175
+ ])
176
+
177
+ compression_df = pd.DataFrame(comp_rows, columns=["Signature Rows Used", "Signature Size", "Compression Ratio", "Accuracy"])
178
+ print("\nSignature Size Compression Ratio Accuracy Table")
179
+ print(compression_df)
180
+
181
+ # MAP change for different term reweighting
182
+ training_queries = ["information retrieval", "query expansion", "search engines", "duplicate detection"]
183
+ query_relevance = {
184
+ "information retrieval": {0, 1, 2, 3, 4},
185
+ "query expansion": {5, 6, 7},
186
+ "search engines": {1, 3, 8},
187
+ "duplicate detection": {8, 9}
188
+ }
189
+ settings = [(1.0, 0.75, 0.15), (1.0, 0.50, 0.25), (1.0, 1.00, 0.50)]
190
+
191
def avg_precision(score_vector, relevant_ids):
    """Return the average precision of the ranking induced by *score_vector*.

    Args:
        score_vector: One score per document; higher scores rank first.
        relevant_ids: Collection of document indices judged relevant.

    Returns:
        The sum of precision-at-rank over the ranks that hold a relevant
        document, divided by the number of relevant documents. 0 is returned
        when ``relevant_ids`` is empty.
    """
    if len(relevant_ids) == 0:
        # Nothing can be retrieved for an empty relevant set; avoid dividing by zero.
        return 0
    ranked = np.argsort(score_vector)[::-1]
    hits, precision_sum = 0, 0
    for rank, doc_id in enumerate(ranked, 1):
        if doc_id in relevant_ids:
            hits += 1
            precision_sum += hits / rank
    return precision_sum / len(relevant_ids)
199
+
200
+ map_rows = []
201
+ for a, b, g in settings:
202
+ before_list, after_list = [], []
203
+ for tq in training_queries:
204
+ tq_vec = vectorizer.transform([" ".join(preprocess(tq))])
205
+ base = cosine_similarity(tq_vec, tfidf)[0]
206
+ top = base.argsort()[::-1][:3]
207
+ rel = tfidf[top]
208
+ nonrel = tfidf[[i for i in range(len(docs)) if i not in top]]
209
+ rq = a * tq_vec + b * np.asarray(rel.mean(axis=0)) - g * np.asarray(nonrel.mean(axis=0))
210
+ updated = cosine_similarity(np.asarray(rq), tfidf)[0]
211
+ before_list.append(avg_precision(base, query_relevance[tq]))
212
+ after_list.append(avg_precision(updated, query_relevance[tq]))
213
+ mb, ma = np.mean(before_list), np.mean(after_list)
214
+ change = ((ma - mb) / mb) * 100 if mb else 0
215
+ map_rows.append([a, b, g, round(mb, 3), round(ma, 3), round(change, 3)])
216
+
217
+ map_df = pd.DataFrame(map_rows, columns=["Alpha", "Beta", "Gamma", "MAP Before", "MAP After", "Percent Change in MAP"])
218
+ print("\nPercent Change in Mean Average Precision on Training Queries for Different Term Reweighting")
219
+ print(map_df)
220
+
221
+ # Graphs
222
+
223
+ # Additional Graphs
224
+
225
+ plt.figure()
226
+ plt.imshow(np.asarray(minhash_sim), cmap='viridis')
227
+ plt.colorbar()
228
+ plt.title("MinHash Similarity Heatmap")
229
+ plt.xlabel("Documents")
230
+ plt.ylabel("Documents")
231
+ plt.show()
232
+
233
+ plt.figure()
234
+ plt.imshow(np.asarray(jaccard_matrix), cmap='plasma')
235
+ plt.colorbar()
236
+ plt.title("Jaccard Similarity Heatmap")
237
+ plt.xlabel("Documents")
238
+ plt.ylabel("Documents")
239
+ plt.show()
240
+
241
# The bars show the mean cosine similarity between the query and all documents
# for the original and the Rocchio-updated query. That quantity is not MAP, so
# the title and axis label name what is actually plotted.
plt.figure()
plt.bar(["Before Rocchio", "After Rocchio"], [np.mean(scores), np.mean(new_scores)])
plt.title("Mean Cosine Similarity Before and After Rocchio")
plt.ylabel("Mean Cosine Similarity")
plt.show()
246
+
247
+ precision_val = bucket_df["Precision"].mean()
248
+ recall_val = bucket_df["Recall"].mean()
249
+ fscore_val = bucket_df["Fscore"].mean()
250
+
251
+ plt.figure()
252
+ plt.bar(["Precision", "Recall", "Fscore"], [precision_val, recall_val, fscore_val])
253
+ plt.title("Average Evaluation Metrics")
254
+ plt.ylabel("Value")
255
+ plt.show()
256
+
257
+ plt.figure()
258
+ plt.plot(bucket_df["Bucket Size"], bucket_df["Precision"], marker='o', label="Precision")
259
+ plt.plot(bucket_df["Bucket Size"], bucket_df["Recall"], marker='s', label="Recall")
260
+ plt.plot(bucket_df["Bucket Size"], bucket_df["Fscore"], marker='^', label="Fscore")
261
+ plt.title("PRF vs Bucket Size")
262
+ plt.xlabel("Bucket Size")
263
+ plt.ylabel("Value")
264
+ plt.legend()
265
+ plt.show()
266
+
267
+ plt.figure()
268
+ plt.plot(compression_df["Signature Rows Used"], compression_df["Compression Ratio"], marker='o', label="Compression Ratio")
269
+ plt.plot(compression_df["Signature Rows Used"], compression_df["Accuracy"], marker='s', label="Accuracy")
270
+ plt.title("Compression Ratio and Accuracy")
271
+ plt.xlabel("Signature Rows Used")
272
+ plt.ylabel("Value")
273
+ plt.legend()
274
+ plt.show()
275
+
276
+ labels = [f"a={r['Alpha']}, b={r['Beta']}, g={r['Gamma']}" for _, r in map_df.iterrows()]
277
+
278
+ plt.figure()
279
+ plt.plot(labels, map_df["MAP Before"], marker='o', label="MAP Before")
280
+ plt.plot(labels, map_df["MAP After"], marker='s', label="MAP After")
281
+ plt.title("MAP for Different Reweighting")
282
+ plt.xlabel("Term Reweighting")
283
+ plt.ylabel("MAP")
284
+ plt.xticks(rotation=20)
285
+ plt.legend()
286
+ plt.show()
287
+
288
+ plt.figure()
289
+ plt.bar(labels, map_df["Percent Change in MAP"])
290
+ plt.title("Percent Change in MAP")
291
+ plt.xlabel("Term Reweighting")
292
+ plt.ylabel("Percent Change")
293
+ plt.xticks(rotation=20)
294
+ plt.show()
@@ -0,0 +1,224 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import random
4
+ import hashlib
5
+ import nltk
6
+ from itertools import combinations
7
+ from nltk.corpus import stopwords
8
+ from nltk.tokenize import word_tokenize
9
+ from nltk.stem import PorterStemmer
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sklearn.metrics.pairwise import cosine_similarity
12
+ import matplotlib.pyplot as plt
13
+
14
+ nltk.download('punkt_tab', quiet=True)
15
+ nltk.download('stopwords', quiet=True)
16
+
17
+ random.seed(42)
18
+ np.random.seed(42)
19
+
20
+ docs = [
21
+ "information retrieval is the process of obtaining relevant documents",
22
+ "search engines use ranking algorithms for information retrieval",
23
+ "information retrieval systems index and rank documents",
24
+ "retrieval models help search engines find relevant documents",
25
+ "inverted index is widely used in information retrieval",
26
+ "query expansion improves retrieval effectiveness",
27
+ "query expansion adds related terms to the query",
28
+ "expansion techniques improve search results",
29
+ "duplicate documents appear frequently in search engines",
30
+ "near duplicate detection improves indexing"
31
+ ]
32
+
33
+ stop_words = set(stopwords.words('english'))
34
+ stemmer = PorterStemmer()
35
+
36
def preprocess(text):
    """Tokenize the lower-cased *text* and return stems of the retained tokens.

    Tokens that are not alphanumeric, or that appear in ``stop_words``, are dropped.
    """
    tokens = word_tokenize(text.lower())
    kept = (tok for tok in tokens if tok.isalnum() and tok not in stop_words)
    return [stemmer.stem(tok) for tok in kept]
38
+
39
def sim_df(mat, title):
    """Print *title* and return *mat* as a labelled similarity DataFrame.

    Only the heading is printed here; the table itself is returned for the
    caller to display.

    Args:
        mat: 2-D array-like of similarity values.
        title: Heading printed before the DataFrame is returned.

    Returns:
        pandas.DataFrame holding the values rounded to 3 decimals, with rows
        and columns labelled ``Doc0``, ``Doc1``, ...
    """
    values = np.round(np.asarray(mat), 3)
    # Labels are derived from the matrix itself rather than from the
    # module-level ``docs`` list, so the helper works for a matrix of any size.
    n_rows, n_cols = values.shape
    df = pd.DataFrame(
        values,
        index=[f"Doc{i}" for i in range(n_rows)],
        columns=[f"Doc{j}" for j in range(n_cols)],
    )
    print(f"\n{title}")
    return df
46
+
47
def prf(tp, fp, fn):
    """Compute precision, recall and F-score from TP/FP/FN counts.

    Each value is rounded to 3 decimals; a ratio with a zero denominator is 0.
    """
    if tp + fp:
        precision = tp / (tp + fp)
    else:
        precision = 0

    if tp + fn:
        recall = tp / (tp + fn)
    else:
        recall = 0

    if precision + recall:
        fscore = 2 * precision * recall / (precision + recall)
    else:
        fscore = 0

    return round(precision, 3), round(recall, 3), round(fscore, 3)
52
+
53
+ processed_docs = [" ".join(preprocess(doc)) for doc in docs]
54
+ shingles = [set(preprocess(doc)) for doc in docs]
55
+
56
+ num_hash, max_shingle = 50, 1000
57
+ hash_funcs = [(random.randint(1, max_shingle), random.randint(0, max_shingle)) for _ in range(num_hash)]
58
+ vocab = list(set(word for doc in shingles for word in doc))
59
+ shingle_index = {w: i for i, w in enumerate(vocab)}
60
+
61
def h(x, a, b):
    """Apply the linear hash ``(a * x + b) % max_shingle`` to index *x*."""
    scaled = a * x
    return (scaled + b) % max_shingle
62
+
63
+ signature = np.full((num_hash, len(docs)), np.inf)
64
+ for d, doc in enumerate(shingles):
65
+ for word in doc:
66
+ idx = shingle_index[word]
67
+ for i, (a, b) in enumerate(hash_funcs):
68
+ signature[i, d] = min(signature[i, d], h(idx, a, b))
69
+ signature = signature.astype(int)
70
+
71
+ # =====================================================================
72
+ # 1. JACCARD SIMILARITY
73
+ # =====================================================================
74
def jaccard(a, b):
    """Return the Jaccard similarity ``|a & b| / |a | b|`` of two sets.

    Two empty sets are treated as identical (similarity 1.0) instead of
    raising ZeroDivisionError.
    """
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return len(a & b) / union_size
75
# Pairwise Jaccard similarity of the documents' shingle sets, as a plain
# ndarray (NumPy no longer recommends np.matrix); [i, j] indexing and
# np.asarray() behave the same on it.
jaccard_matrix = np.array([[jaccard(left, right) for right in shingles] for left in shingles])
76
+
77
+ sim_df(jaccard_matrix, "JACCARD SIMILARITY TABLE")
78
+ df_jaccard = pd.DataFrame(np.round(np.asarray(jaccard_matrix), 3))
79
+ print("=" * 60)
80
+ print(df_jaccard)
81
+
82
+ # =====================================================================
83
+ # 2. PRECISION, RECALL, FSCORE WITH DIFFERENT BUCKET SIZES (LSH)
84
+ # =====================================================================
85
def get_lsh_candidates(sig, bands):
    """Return candidate pairs produced by LSH banding of a signature matrix.

    *sig* is cut into ``bands`` consecutive row groups of
    ``sig.shape[0] // bands`` rows each. Columns whose values match on every
    row of some band share a bucket, and every pair inside a bucket is a
    candidate.

    Args:
        sig: 2-D NumPy signature array; columns are the items being compared.
        bands: Number of bands. Rows left over when ``bands`` does not divide
            the row count evenly are ignored.

    Returns:
        set of ``(i, j)`` column-index tuples with ``i < j``.

    Raises:
        ValueError: if ``bands`` leaves fewer than one row per band.
    """
    n_rows, n_cols = sig.shape[0], sig.shape[1]
    rows = n_rows // bands
    if rows < 1:
        # Zero rows per band would make every band slice empty, so every
        # column would collide and all pairs would be returned.
        raise ValueError(
            f"bands={bands} leaves no rows per band for a signature with "
            f"{n_rows} rows"
        )

    buckets = {}
    for b in range(bands):
        band_rows = sig[b * rows:(b + 1) * rows]
        for d in range(n_cols):
            # The band's raw bytes identify it exactly; no digest is required.
            buckets.setdefault((b, band_rows[:, d].tobytes()), []).append(d)

    # Each bucket lists its columns in increasing order, so combinations()
    # yields pairs already ordered as (smaller, larger).
    return {
        pair
        for group in buckets.values()
        for pair in combinations(group, 2)
    }
98
+
99
+ threshold = 0.30
100
+ ground_truth = {(i, j) for i in range(len(docs)) for j in range(i + 1, len(docs)) if float(jaccard_matrix[i, j]) >= threshold}
101
+
102
+ bucket_rows = []
103
+ for b in [5, 10, 25]:
104
+ if num_hash % b == 0:
105
+ cand = get_lsh_candidates(signature, b)
106
+ tp = len(cand & ground_truth)
107
+ fp = len(cand - ground_truth)
108
+ fn = len(ground_truth - cand)
109
+ p, r, f = prf(tp, fp, fn)
110
+ bucket_rows.append([b, len(cand), tp, fp, fn, p, r, f])
111
+
112
+ bucket_df = pd.DataFrame(bucket_rows, columns=["Bucket Size", "Candidate Pairs", "TP", "FP", "FN", "Precision", "Recall", "Fscore"])
113
+ print("\nPRECISION, RECALL, FSCORE WITH DIFFERENT BUCKET SIZES")
114
+ print("=" * 60)
115
+ print(bucket_df)
116
+
117
+ # =====================================================================
118
+ # 3. SIGNATURE SIZE / COMPRESSION RATIO AND ACCURACY
119
+ # =====================================================================
120
+ original_size = len(vocab) * len(docs)
121
+ comp_rows = []
122
+
123
+ for rows_used in [10, 20, 30, 40, 50]:
124
+ sub_sig = signature[:rows_used, :]
125
+ correct, total = 0, 0
126
+ for i in range(len(docs)):
127
+ for j in range(i + 1, len(docs)):
128
+ approx = np.mean(sub_sig[:, i] == sub_sig[:, j]) >= threshold
129
+ actual = float(jaccard_matrix[i, j]) >= threshold
130
+ correct += int(approx == actual)
131
+ total += 1
132
+ comp_rows.append([
133
+ rows_used,
134
+ sub_sig.size,
135
+ round(sub_sig.size / original_size, 3),
136
+ round(correct / total, 3)
137
+ ])
138
+
139
+ compression_df = pd.DataFrame(comp_rows, columns=["Signature Rows Used", "Signature Size", "Compression Ratio", "Accuracy"])
140
+ print("\nSIGNATURE SIZE, COMPRESSION RATIO & ACCURACY TABLE")
141
+ print("=" * 60)
142
+ print(compression_df)
143
+
144
+ # =====================================================================
145
+ # 4. PERCENT CHANGE IN MEAN AVERAGE PRECISION ON TRAINING QUERIES
146
+ # =====================================================================
147
+ vectorizer = TfidfVectorizer()
148
+ tfidf = vectorizer.fit_transform(processed_docs)
149
+
150
+ training_queries = ["information retrieval", "query expansion", "search engines", "duplicate detection"]
151
+
152
+ # Ground truth relevant documents mapping for training queries
153
+ query_relevance = {
154
+ "information retrieval": {0, 1, 2, 3, 4},
155
+ "query expansion": {5, 6, 7},
156
+ "search engines": {1, 3, 8},
157
+ "duplicate detection": {8, 9}
158
+ }
159
+
160
+ # (Alpha, Beta, Gamma) settings for Term Reweighting
161
+ settings = [(1.0, 0.75, 0.15), (1.0, 0.50, 0.25), (1.0, 1.00, 0.50)]
162
+
163
def avg_precision(score_vector, relevant_ids):
    """Average precision of the ranking obtained by sorting scores descending.

    Precision is accumulated at each rank that holds a relevant document and
    the total is divided by the number of relevant documents; 0 is returned
    when ``relevant_ids`` is empty.
    """
    order = np.argsort(score_vector)[::-1]
    found = 0
    running = 0
    for position, doc_id in enumerate(order, start=1):
        if doc_id not in relevant_ids:
            continue
        found += 1
        running += found / position
    if len(relevant_ids) > 0:
        return running / len(relevant_ids)
    return 0
171
+
172
+ map_rows = []
173
+ for a, b, g in settings:
174
+ before_list, after_list = [], []
175
+ for tq in training_queries:
176
+ tq_vec = vectorizer.transform([" ".join(preprocess(tq))])
177
+ base = cosine_similarity(tq_vec, tfidf)[0]
178
+
179
+ # Pseudo-relevance for Rocchio on this query
180
+ top = base.argsort()[::-1][:3]
181
+ rel = tfidf[top]
182
+ nonrel = tfidf[[i for i in range(len(docs)) if i not in top]]
183
+
184
+ rq = a * tq_vec + b * np.asarray(rel.mean(axis=0)) - g * np.asarray(nonrel.mean(axis=0))
185
+ updated = cosine_similarity(np.asarray(rq), tfidf)[0]
186
+
187
+ # Calculate Average Precision
188
+ before_list.append(avg_precision(base, query_relevance[tq]))
189
+ after_list.append(avg_precision(updated, query_relevance[tq]))
190
+
191
+ mb, ma = np.mean(before_list), np.mean(after_list)
192
+ change = ((ma - mb) / mb) * 100 if mb else 0
193
+ map_rows.append([a, b, g, round(mb, 3), round(ma, 3), round(change, 3)])
194
+
195
+ map_df = pd.DataFrame(map_rows, columns=["Alpha", "Beta", "Gamma", "MAP Before", "MAP After", "Percent Change in MAP"])
196
+ print("\nPERCENT CHANGE IN MEAN AVERAGE PRECISION ON TRAINING QUERIES")
197
+ print("=" * 60)
198
+ print(map_df)
199
+
200
+ # =====================================================================
201
+ # 5. VISUALIZATIONS
202
+ # =====================================================================
203
+ plt.figure(figsize=(12, 5))
204
+ plt.subplot(1, 2, 1)
205
+ plt.plot(bucket_df["Bucket Size"], bucket_df["Precision"], marker='o', label="Precision")
206
+ plt.plot(bucket_df["Bucket Size"], bucket_df["Recall"], marker='s', label="Recall")
207
+ plt.plot(bucket_df["Bucket Size"], bucket_df["Fscore"], marker='^', label="Fscore")
208
+ plt.title("PRF vs Bucket Size")
209
+ plt.xlabel("Bucket Size")
210
+ plt.ylabel("Value")
211
+ plt.legend()
212
+
213
+ plt.subplot(1, 2, 2)
214
+ plt.plot(compression_df["Signature Rows Used"], compression_df["Compression Ratio"], marker='o', label="Compression")
215
+ plt.plot(compression_df["Signature Rows Used"], compression_df["Accuracy"], marker='s', label="Accuracy")
216
+ plt.title("Compression Ratio and Accuracy")
217
+ plt.xlabel("Signature Rows Used")
218
+ plt.ylabel("Value")
219
+ plt.legend()
220
+
221
+ plt.tight_layout()
222
+ plt.savefig("eval_metrics_plots.png", dpi=150)
223
+ print("\nMetrics plots saved to 'eval_metrics_plots.png'.")
224
+ plt.show()
@@ -0,0 +1,105 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import random
4
+ import hashlib
5
+ import nltk
6
+ from itertools import combinations
7
+ from nltk.corpus import stopwords
8
+ from nltk.tokenize import word_tokenize
9
+ from nltk.stem import PorterStemmer
10
+
11
+ nltk.download('punkt_tab', quiet=True)
12
+ nltk.download('stopwords', quiet=True)
13
+
14
+ random.seed(42)
15
+ np.random.seed(42)
16
+
17
+ # =====================================================================
18
+ # READING FROM CORPUS (Example Code)
19
+ # =====================================================================
20
+ """
21
+ To read documents from a local corpus directory instead of the hardcoded list below,
22
+ you can use the following snippet:
23
+
24
+ import os
25
+ corpus_dir = "path/to/your/corpus/folder"
26
+ docs = []
27
+ for filename in os.listdir(corpus_dir):
28
+ if filename.endswith(".txt"):
29
+ with open(os.path.join(corpus_dir, filename), "r", encoding="utf-8") as f:
30
+ docs.append(f.read())
31
+ """
32
+
33
+ docs = [
34
+ "information retrieval is the process of obtaining relevant documents",
35
+ "search engines use ranking algorithms for information retrieval",
36
+ "information retrieval systems index and rank documents",
37
+ "retrieval models help search engines find relevant documents",
38
+ "inverted index is widely used in information retrieval",
39
+ "query expansion improves retrieval effectiveness",
40
+ "query expansion adds related terms to the query",
41
+ "expansion techniques improve search results",
42
+ "duplicate documents appear frequently in search engines",
43
+ "near duplicate detection improves indexing"
44
+ ]
45
+
46
+ stop_words = set(stopwords.words('english'))
47
+ stemmer = PorterStemmer()
48
+
49
+ def preprocess(text):
50
+ return [stemmer.stem(w) for w in word_tokenize(text.lower()) if w.isalnum() and w not in stop_words]
51
+
52
+ def sim_df(mat, title):
53
+ df = pd.DataFrame(np.round(np.asarray(mat), 3),
54
+ index=[f"Doc{i}" for i in range(len(docs))],
55
+ columns=[f"Doc{i}" for i in range(len(docs))])
56
+ print(f"\n{title}")
57
+ print(df)
58
+ return df
59
+
60
+ processed_docs = [" ".join(preprocess(doc)) for doc in docs]
61
+ shingles = [set(preprocess(doc)) for doc in docs]
62
+
63
+ # =====================================================================
64
+ # 1. MINHASH
65
+ # =====================================================================
66
+ num_hash, max_shingle = 50, 1000
67
+ hash_funcs = [(random.randint(1, max_shingle), random.randint(0, max_shingle)) for _ in range(num_hash)]
68
+ vocab = list(set(word for doc in shingles for word in doc))
69
+ shingle_index = {w: i for i, w in enumerate(vocab)}
70
+
71
+ def h(x, a, b): return (a * x + b) % max_shingle
72
+
73
+ signature = np.full((num_hash, len(docs)), np.inf)
74
+ for d, doc in enumerate(shingles):
75
+ for word in doc:
76
+ idx = shingle_index[word]
77
+ for i, (a, b) in enumerate(hash_funcs):
78
+ signature[i, d] = min(signature[i, d], h(idx, a, b))
79
+ signature = signature.astype(int)
80
+
81
+ minhash_sim = np.matrix([[np.mean(signature[:, i] == signature[:, j]) for j in range(len(docs))] for i in range(len(docs))])
82
+ sim_df(minhash_sim, "MinHash Similarity Table")
83
+
84
+ # =====================================================================
85
+ # 2. LOCALITY SENSITIVE HASHING (LSH)
86
+ # =====================================================================
87
+ def get_lsh_candidates(sig, bands):
88
+ rows = sig.shape[0] // bands
89
+ buckets, candidates = {}, set()
90
+ for b in range(bands):
91
+ for d in range(sig.shape[1]):
92
+ band = tuple(sig[b * rows:(b + 1) * rows, d])
93
+ key = hashlib.md5(str(band).encode()).hexdigest()
94
+ buckets.setdefault((b, key), []).append(d)
95
+ for group in buckets.values():
96
+ if len(group) > 1:
97
+ for pair in combinations(group, 2):
98
+ candidates.add(tuple(sorted(pair)))
99
+ return candidates
100
+
101
+ bands = 10
102
+ candidates = get_lsh_candidates(signature, bands)
103
+ lsh_df = pd.DataFrame([(f"Doc{i}", f"Doc{j}") for i, j in sorted(candidates)], columns=["Document 1", "Document 2"])
104
+ print(f"\nLSH Candidate Pairs Table (Bands={bands})")
105
+ print(lsh_df)
@@ -0,0 +1,116 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import nltk
4
+ from nltk.corpus import stopwords
5
+ from nltk.tokenize import word_tokenize
6
+ from nltk.stem import PorterStemmer
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+
10
+ nltk.download('punkt_tab', quiet=True)
11
+ nltk.download('stopwords', quiet=True)
12
+
13
+ # =====================================================================
14
+ # READING FROM CORPUS (Example Code)
15
+ # =====================================================================
16
+ """
17
+ To read documents from a local corpus directory:
18
+
19
+ import os
20
+ corpus_dir = "path/to/your/corpus/folder"
21
+ docs = []
22
+ for filename in os.listdir(corpus_dir):
23
+ if filename.endswith(".txt"):
24
+ with open(os.path.join(corpus_dir, filename), "r", encoding="utf-8") as f:
25
+ docs.append(f.read())
26
+ """
27
+
28
+ docs = [
29
+ "information retrieval is the process of obtaining relevant documents",
30
+ "search engines use ranking algorithms for information retrieval",
31
+ "information retrieval systems index and rank documents",
32
+ "retrieval models help search engines find relevant documents",
33
+ "inverted index is widely used in information retrieval",
34
+ "query expansion improves retrieval effectiveness",
35
+ "query expansion adds related terms to the query",
36
+ "expansion techniques improve search results",
37
+ "duplicate documents appear frequently in search engines",
38
+ "near duplicate detection improves indexing"
39
+ ]
40
+
41
+ stop_words = set(stopwords.words('english'))
42
+ stemmer = PorterStemmer()
43
+
44
+ def preprocess(text):
45
+ return [stemmer.stem(w) for w in word_tokenize(text.lower()) if w.isalnum() and w not in stop_words]
46
+
47
+ processed_docs = [" ".join(preprocess(doc)) for doc in docs]
48
+
49
+ vectorizer = TfidfVectorizer()
50
+ tfidf = vectorizer.fit_transform(processed_docs)
51
+ query = "information retrieval"
52
+ processed_query = " ".join(preprocess(query))
53
+ q_vec = vectorizer.transform([processed_query])
54
+
55
+ # =====================================================================
56
+ # 1. ROCCHIO'S FEEDBACK ALGORITHM
57
+ # =====================================================================
58
+ scores = cosine_similarity(q_vec, tfidf)[0]
59
+
60
+ # Pseudo-relevance assumption: top 3 docs are relevant (in a real system, top 10-20)
61
+ num_pseudo_relevant = 3
62
+ top_docs = scores.argsort()[::-1][:num_pseudo_relevant]
63
+
64
+ alpha, beta, gamma = 1.0, 0.75, 0.15
65
+
66
+ relevant = tfidf[top_docs]
67
+ non_relevant = tfidf[[i for i in range(len(docs)) if i not in top_docs]]
68
+
69
+ new_query = alpha * q_vec + beta * np.asarray(relevant.mean(axis=0)) - gamma * np.asarray(non_relevant.mean(axis=0))
70
+ new_scores = cosine_similarity(np.asarray(new_query), tfidf)[0]
71
+
72
+ rocchio_df = pd.DataFrame({
73
+ "Document": [f"Doc{i}" for i in range(len(docs))],
74
+ "Original Score": np.round(scores, 3),
75
+ "Updated Score (Rocchio)": np.round(new_scores, 3)
76
+ })
77
+ print("\nROCCHIO ALGORITHM SCORE TABLE")
78
+ print("=" * 60)
79
+ print(rocchio_df)
80
+
81
+ # =====================================================================
82
+ # 2. LOCAL CONTEXT ANALYSIS (LCA)
83
+ # =====================================================================
84
+ # Measures the co-occurrence of a term with all query terms based on information
85
+ # from pseudo-relevant documents (top 10-20 documents returned by initial search).
86
+ # Since our corpus is small, we'll use top 5 pseudo-relevant documents.
87
+
88
num_lca_pseudo_relevant = 5
top_k_lca = scores.argsort()[::-1][:num_lca_pseudo_relevant]
top_docs_lca = [processed_docs[i] for i in top_k_lca]

# Count how often each stemmed term occurs across the pseudo-relevant docs.
term_freq = {}
for doc in top_docs_lca:
    for word in doc.split():
        term_freq[word] = term_freq.get(word, 0) + 1

# Pick the most frequent terms that are not already part of the query.
# Without this filter the query's own stems (which dominate the top-ranked
# documents) fill the expansion slots and the "expanded" query mostly just
# repeats the original terms.
num_expansion_terms = 5
_query_terms = set(processed_query.split())
expanded_terms = sorted(
    (term for term in term_freq if term not in _query_terms),
    key=term_freq.get,
    reverse=True,
)[:num_expansion_terms]
expanded_query = " ".join([processed_query, *expanded_terms])

expanded_vec = vectorizer.transform([expanded_query])
expanded_scores = cosine_similarity(expanded_vec, tfidf)[0]

print("\nLOCAL CONTEXT ANALYSIS (LCA)")
print("=" * 60)
print(f"Original Query: {processed_query}")
print(f"Expanded Query: {expanded_query}")

lca_df = pd.DataFrame({
    "Document": [f"Doc{i}" for i in range(len(docs))],
    "Original Score": np.round(scores, 3),
    "LCA Expanded Score": np.round(expanded_scores, 3),
})
print("\nLCA SCORE TABLE")
print(lca_df)
@@ -1,5 +0,0 @@
1
- """IR subpackage - Information Retrieval source code."""
2
-
3
- from .all import all
4
-
5
- __all__ = ["all"]