pymrsf-0.4.0.tar.gz

pymrsf-0.4.0/.gitignore ADDED
@@ -0,0 +1,46 @@
+ # Environment variables
+ .env
+
+ # Python cache
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+
+ # Virtual environment
+ venv/
+ env/
+ ENV/
+
+ # MRSF database and indexes
+ mrsf.db
+ mrsf.faiss
+ mrsf.faiss.meta
+
+ # Results and outputs
+ *.csv
+ mrsf_results_full.csv
+
+ # Large model files
+ models/
+ *.gguf
+ *.tar.gz
+ *.bin
+ *.safetensors
+
+ # Distribution files (can be rebuilt)
+ dist/
+ build/
+ *.egg-info/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ Thumbs.db
pymrsf-0.4.0/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Eric Monthe
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
pymrsf-0.4.0/PKG-INFO ADDED
@@ -0,0 +1,204 @@
+ Metadata-Version: 2.4
+ Name: pymrsf
+ Version: 0.4.0
+ Summary: Novelty-Aware RAG scoring — Filter chunks by information gain, not just relevance
+ Project-URL: Homepage, https://github.com/riiseup08/mrsf
+ Author: Eric Monthe
+ License: MIT
+ License-File: LICENSE
+ Keywords: embeddings,llm,novelty,rag,retrieval,semantic
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: >=3.9
+ Requires-Dist: faiss-cpu>=1.7.4
+ Requires-Dist: httpx>=0.25.0
+ Requires-Dist: llama-cpp-python>=0.2.0
+ Requires-Dist: msgpack>=1.0.0
+ Requires-Dist: numpy>=1.24.0
+ Requires-Dist: requests>=2.31.0
+ Requires-Dist: tiktoken>=0.5.0
+ Provides-Extra: dev
+ Requires-Dist: pytest; extra == 'dev'
+ Requires-Dist: pytest-cov; extra == 'dev'
+ Description-Content-Type: text/markdown
+
+ # pymrsf — Novelty-Aware RAG Chunk Scoring
+
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+ [![Tests](https://img.shields.io/badge/tests-33%20passing-brightgreen)]()
+
+ **Stop wasting context window on information your LLM already knows.**
+
+ `pymrsf` scores RAG chunks by measuring **information gain** — not just relevance. It uses the model's own predictive surprise to detect which chunks contain genuinely new information.
+
+ ## The Problem
+
+ Standard RAG retrieves chunks by *relevance* (cosine similarity). But a chunk can be highly relevant while containing *only facts the model already memorized during training*. You waste precious context window on redundant information.
+
+ Also, if the LLM already *knows the answer* to the query, even novel chunks are less useful. And if two chunks say the same thing, you don't need both.
+
+ ## The Solution
+
+ `pymrsf` introduces **multi-factor novelty-aware scoring**:
+
+ | Factor | What It Measures | Weight |
+ |--------|-----------------|--------|
+ | **Novelty** | How much *new* information does this chunk contain? | 40% |
+ | **Relevance** | How related is this chunk to the query? | 40% |
+ | **Query Ignorance** | Does the model *not* know the answer to your question? | 20% |
+ | **Diversity** | Does a better chunk already cover this content? | Dedup |
+
+ ## Quick Start
+
+ ```python
+ from pymrsf.rag import filter_chunks
+
+ # Your retrieved chunks
+ chunks = [
+     "Backpropagation computes gradients using the chain rule.",
+     "Neural networks are inspired by the human brain.",
+     "The sky is blue because of Rayleigh scattering.",
+ ]
+
+ # Filter to only useful chunks
+ query = "How does backpropagation work?"
+ useful = filter_chunks(chunks, query, min_rag_score=50, verbose=True)
+
+ # → Pass only useful chunks to your LLM
+ answer = llm.complete(query, context=useful)
+ ```
+
+ ## Installation
+
+ ```bash
+ pip install llama-cpp-python faiss-cpu msgpack tiktoken
+ git clone https://github.com/riiseup08/mrsf.git
+ cd mrsf
+ pip install -e .
+ ```
+
+ ## Features
+
+ ### 🎯 RAG Chunk Scoring (Core Feature)
+
+ ```python
+ from pymrsf.rag import score_chunk, score_chunks, score_chunks_batch
+
+ # Single chunk scoring
+ result = score_chunk(
+     "Backpropagation computes gradients using the chain rule.",
+     query="How does backpropagation work?",
+     verbose=True
+ )
+ print(result["rag_score"])        # 72/100
+ print(result["verdict"])          # "good"
+ print(result["query_knowledge"])  # how well the model already knows the query
+
+ # Batch scoring (3-5x faster for many chunks)
+ results = score_chunks_batch(chunks, query)
+
+ # Custom weights (adjust the formula)
+ weights = {"novelty": 0.5, "relevance": 0.3, "query_ignorance": 0.2}
+ result = score_chunk(chunk, query, weights=weights)
+ ```
+
+ ### 🔍 Knowledge Probing
+
+ ```python
+ from pymrsf import probe
+
+ result = probe("To be or not to be, that is the question.")
+ print(f"Knowledge: {result['knowledge_score']}/100 ({result['label']})")
+ # → Knowledge: 92/100 (memorized) — Shakespeare is well-known
+
+ result = probe("My proprietary algorithm uses a novel attention mechanism.")
+ print(f"Knowledge: {result['knowledge_score']}/100 ({result['label']})")
+ # → Knowledge: 15/100 (unknown) — novel content!
+ ```
+
+ ### 🔧 RAG Pipeline Filter
+
+ ```python
+ from pymrsf.rag import filter_chunks
+
+ chunks = retriever.get(query, top_k=20)  # your retriever
+
+ # Only keep chunks worth sending to the LLM
+ good = filter_chunks(
+     chunks,
+     query,
+     min_rag_score=50,          # skip low-value chunks
+     top_k=5,                   # limit context window usage
+     diversity_threshold=0.85,  # dedup similar chunks
+     verbose=True,
+ )
+
+ answer = llm.complete(query, context=good)
+ ```
+
+ ### 📦 Delta Compression (Experimental)
+
+ Store text efficiently using LLM surprises:
+
+ ```python
+ from pymrsf import mrsf_write, mrsf_read, save_index
+
+ # Write (stores only surprise tokens = ~40% compression)
+ mrsf_write("The Eiffel Tower is in Paris.")
+ save_index()
+
+ # Read (reconstructs from delta + model)
+ results = mrsf_read("famous landmark in France")
+ ```
+
+ ## Configuration
+
+ Create a `.env` file:
+
+ ```bash
+ PYMRSF_PROVIDER=local
+ PYMRSF_MODEL_PATH=./models/mistral-7b-v0.1.Q4_K_M.gguf
+ ```
+
+ ## Scoring Concepts
+
+ ### RAG Score Formula
+ ```
+ rag_score = novelty × 0.40 + relevance × 0.40 + query_ignorance × 0.20
+ ```
+
+ ### What the Scores Mean
+
+ | Score | Verdict | Action |
+ |-------|---------|--------|
+ | 80-100 | Excellent | Prioritize this chunk |
+ | 60-79 | Good | Include in context |
+ | 40-59 | Moderate | Include if space allows |
+ | 20-39 | Weak | Skip if better chunks exist |
+ | 0-19 | Skip | Model already knows this |
+
+ ## Project Structure
+
+ ```
+ pymrsf/
+ ├── __init__.py    # Public API exports
+ ├── core.py        # Provider routing (local + openai), lazy model loading
+ ├── embeddings.py  # Ollama embedding API client
+ ├── probe.py       # Knowledge probing (how well does the model know a text?)
+ ├── rag.py         # RAG chunk scoring with novelty + relevance + diversity
+ ├── storage.py     # Delta compression storage (experimental)
+ ├── inspect.py     # Token-level visualization tools
+ └── benchmark.py   # Compression/latency benchmarks
+ ```
+
+ ## Project Status
+
+ **Alpha** — The RAG novelty scoring works and solves a real problem. The delta compression/storage system is experimental.
+
+ ## License
+
+ MIT
pymrsf-0.4.0/README.md ADDED
@@ -0,0 +1,177 @@
+ # pymrsf — Novelty-Aware RAG Chunk Scoring
+
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
+ [![Tests](https://img.shields.io/badge/tests-33%20passing-brightgreen)]()
+
+ **Stop wasting context window on information your LLM already knows.**
+
+ `pymrsf` scores RAG chunks by measuring **information gain** — not just relevance. It uses the model's own predictive surprise to detect which chunks contain genuinely new information.
+
+ ## The Problem
+
+ Standard RAG retrieves chunks by *relevance* (cosine similarity). But a chunk can be highly relevant while containing *only facts the model already memorized during training*. You waste precious context window on redundant information.
+
+ Also, if the LLM already *knows the answer* to the query, even novel chunks are less useful. And if two chunks say the same thing, you don't need both.
+
+ ## The Solution
+
+ `pymrsf` introduces **multi-factor novelty-aware scoring**:
+
+ | Factor | What It Measures | Weight |
+ |--------|-----------------|--------|
+ | **Novelty** | How much *new* information does this chunk contain? | 40% |
+ | **Relevance** | How related is this chunk to the query? | 40% |
+ | **Query Ignorance** | Does the model *not* know the answer to your question? | 20% |
+ | **Diversity** | Does a better chunk already cover this content? | Dedup |
+
+ ## Quick Start
+
+ ```python
+ from pymrsf.rag import filter_chunks
+
+ # Your retrieved chunks
+ chunks = [
+     "Backpropagation computes gradients using the chain rule.",
+     "Neural networks are inspired by the human brain.",
+     "The sky is blue because of Rayleigh scattering.",
+ ]
+
+ # Filter to only useful chunks
+ query = "How does backpropagation work?"
+ useful = filter_chunks(chunks, query, min_rag_score=50, verbose=True)
+
+ # → Pass only useful chunks to your LLM
+ answer = llm.complete(query, context=useful)
+ ```
+
+ ## Installation
+
+ ```bash
+ pip install llama-cpp-python faiss-cpu msgpack tiktoken
+ git clone https://github.com/riiseup08/mrsf.git
+ cd mrsf
+ pip install -e .
+ ```
+
+ ## Features
+
+ ### 🎯 RAG Chunk Scoring (Core Feature)
+
+ ```python
+ from pymrsf.rag import score_chunk, score_chunks, score_chunks_batch
+
+ # Single chunk scoring
+ result = score_chunk(
+     "Backpropagation computes gradients using the chain rule.",
+     query="How does backpropagation work?",
+     verbose=True
+ )
+ print(result["rag_score"])        # 72/100
+ print(result["verdict"])          # "good"
+ print(result["query_knowledge"])  # how well the model already knows the query
+
+ # Batch scoring (3-5x faster for many chunks)
+ results = score_chunks_batch(chunks, query)
+
+ # Custom weights (adjust the formula)
+ weights = {"novelty": 0.5, "relevance": 0.3, "query_ignorance": 0.2}
+ result = score_chunk(chunk, query, weights=weights)
+ ```
+
+ ### 🔍 Knowledge Probing
+
+ ```python
+ from pymrsf import probe
+
+ result = probe("To be or not to be, that is the question.")
+ print(f"Knowledge: {result['knowledge_score']}/100 ({result['label']})")
+ # → Knowledge: 92/100 (memorized) — Shakespeare is well-known
+
+ result = probe("My proprietary algorithm uses a novel attention mechanism.")
+ print(f"Knowledge: {result['knowledge_score']}/100 ({result['label']})")
+ # → Knowledge: 15/100 (unknown) — novel content!
+ ```
+
+ ### 🔧 RAG Pipeline Filter
+
+ ```python
+ from pymrsf.rag import filter_chunks
+
+ chunks = retriever.get(query, top_k=20)  # your retriever
+
+ # Only keep chunks worth sending to the LLM
+ good = filter_chunks(
+     chunks,
+     query,
+     min_rag_score=50,          # skip low-value chunks
+     top_k=5,                   # limit context window usage
+     diversity_threshold=0.85,  # dedup similar chunks
+     verbose=True,
+ )
+
+ answer = llm.complete(query, context=good)
+ ```
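+
+ For intuition, `diversity_threshold` acts like a greedy cosine-similarity cutoff: walk the chunks from best-scored to worst and drop any chunk that is too similar to one already kept. A minimal sketch of that idea, assuming you already have embeddings and scores — illustrative only, not pymrsf's actual implementation (`greedy_dedup` and its arguments are hypothetical):
+
+ ```python
+ import numpy as np
+
+ def greedy_dedup(chunks, embeddings, scores, threshold=0.85):
+     """Keep a chunk only if no better-scoring kept chunk is too similar."""
+     order = np.argsort(scores)[::-1]  # best chunk first
+     unit = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+     kept = []
+     for i in order:
+         if all(float(unit[i] @ unit[j]) < threshold for j in kept):
+             kept.append(i)
+     return [chunks[i] for i in sorted(kept)]  # restore original chunk order
+ ```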
+
+ ### 📦 Delta Compression (Experimental)
+
+ Store text efficiently using LLM surprises:
+
+ ```python
+ from pymrsf import mrsf_write, mrsf_read, save_index
+
+ # Write (stores only surprise tokens = ~40% compression)
+ mrsf_write("The Eiffel Tower is in Paris.")
+ save_index()
+
+ # Read (reconstructs from delta + model)
+ results = mrsf_read("famous landmark in France")
+ ```
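+
+ The intuition behind the delta: tokens the model predicts confidently can be regenerated at read time, so only the "surprising" tokens need storing. A sketch of that selection step under stated assumptions — `tokens` and `logprobs` would come from a token-scoring pass, and the 3-bit cutoff is illustrative, not pymrsf's actual storage format:
+
+ ```python
+ import math
+
+ SURPRISE_BITS = 3.0  # assumed cutoff: store tokens costing > 3 bits to predict
+
+ def select_surprise_tokens(tokens, logprobs):
+     """Return (position, token) pairs the model would not reliably regenerate."""
+     return [
+         (i, tok)
+         for i, (tok, lp) in enumerate(zip(tokens, logprobs))
+         if -lp / math.log(2) > SURPRISE_BITS  # convert nats to bits of surprise
+     ]
+ ```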
+
+ ## Configuration
+
+ Create a `.env` file:
+
+ ```bash
+ PYMRSF_PROVIDER=local
+ PYMRSF_MODEL_PATH=./models/mistral-7b-v0.1.Q4_K_M.gguf
+ ```
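+
+ These are ordinary environment variables; the actual provider routing lives in `core.py`. A sketch of the equivalent selection logic, assuming the `.env` file has been loaded into the process environment (the default value below is an assumption, not pymrsf's code):
+
+ ```python
+ import os
+
+ provider = os.environ.get("PYMRSF_PROVIDER", "local")  # assumed default
+ if provider == "local":
+     # the local provider needs a GGUF model on disk for llama-cpp-python
+     model_path = os.environ["PYMRSF_MODEL_PATH"]
+ ```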
+
+ ## Scoring Concepts
+
+ ### RAG Score Formula
+ ```
+ rag_score = novelty × 0.40 + relevance × 0.40 + query_ignorance × 0.20
+ ```
+
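+ Worked example: a chunk with novelty 80, relevance 60, and query ignorance 50 comes out as:
+
+ ```
+ rag_score = 80 × 0.40 + 60 × 0.40 + 50 × 0.20 = 32 + 24 + 10 = 66   → "good"
+ ```
+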
+ ### What the Scores Mean
+
+ | Score | Verdict | Action |
+ |-------|---------|--------|
+ | 80-100 | Excellent | Prioritize this chunk |
+ | 60-79 | Good | Include in context |
+ | 40-59 | Moderate | Include if space allows |
+ | 20-39 | Weak | Skip if better chunks exist |
+ | 0-19 | Skip | Model already knows this |
+
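+ The bands translate directly into a small helper — a sketch derived from the table, not pymrsf's own code (the lower-cased labels are assumed from the `result["verdict"]` example above):
+
+ ```python
+ def verdict(rag_score: int) -> str:
+     """Map a 0-100 rag_score onto the bands in the table above."""
+     if rag_score >= 80:
+         return "excellent"
+     if rag_score >= 60:
+         return "good"
+     if rag_score >= 40:
+         return "moderate"
+     if rag_score >= 20:
+         return "weak"
+     return "skip"
+ ```
+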
+ ## Project Structure
+
+ ```
+ pymrsf/
+ ├── __init__.py    # Public API exports
+ ├── core.py        # Provider routing (local + openai), lazy model loading
+ ├── embeddings.py  # Ollama embedding API client
+ ├── probe.py       # Knowledge probing (how well does the model know a text?)
+ ├── rag.py         # RAG chunk scoring with novelty + relevance + diversity
+ ├── storage.py     # Delta compression storage (experimental)
+ ├── inspect.py     # Token-level visualization tools
+ └── benchmark.py   # Compression/latency benchmarks
+ ```
+
+ ## Project Status
+
+ **Alpha** — The RAG novelty scoring works and solves a real problem. The delta compression/storage system is experimental.
+
+ ## License
+
+ MIT
pymrsf-0.4.0/demo_novelty.py ADDED
@@ -0,0 +1,157 @@
+ #!/usr/bin/env python3
+ """
+ pymrsf demo — see the novelty-aware RAG scoring in action.
+
+ This script tests 3 things:
+ 1. Knowledge probing : Does the model know Shakespeare better than novel text?
+ 2. RAG scoring       : Do novel+relevant chunks score higher?
+ 3. Diversity dedup   : Are duplicate chunks filtered out?
+
+ Run: python demo_novelty.py
+ """
+
+ import sys
+
+ # ── Test 1: Knowledge Probe ──────────────────────────────────────────────────
+
+ print("=" * 65)
+ print("TEST 1: Knowledge Probing")
+ print("Does the model know famous text better than novel text?")
+ print("=" * 65)
+
+ from pymrsf import probe
+
+ famous = [
+     "To be or not to be, that is the question. Whether 'tis nobler in the mind to suffer.",
+     "The quick brown fox jumps over the lazy dog.",
+ ]
+
+ novel = [
+     "My proprietary attention mechanism uses a novel sparse gating function.",
+     "The Zeta-7 protocol encrypts data using quantum-resistant lattice cryptography.",
+ ]
+
+ print("\n📚 FAMOUS TEXT (should score HIGH — model knows this):")
+ for text in famous:
+     result = probe(text)
+     if "error" in result:
+         print(f" ⚠️  {result['error']}")
+         print("    (This is expected if no local model is loaded)")
+         sys.exit(1)  # every later test also needs a model, so stop here
+     bar = "█" * (result['knowledge_score'] // 4) + "░" * (25 - result['knowledge_score'] // 4)
+     print(f" [{bar}] {result['knowledge_score']:>2}/100 {result['label'].upper():<12} {text[:40]}...")
+
+ print("\n🔬 NOVEL TEXT (should score LOW — model hasn't seen this):")
+ for text in novel:
+     result = probe(text)
+     bar = "█" * (result['knowledge_score'] // 4) + "░" * (25 - result['knowledge_score'] // 4)
+     print(f" [{bar}] {result['knowledge_score']:>2}/100 {result['label'].upper():<12} {text[:40]}...")
+
+ # Verify the claim
+ print(f"\n{'─'*65}")
+ famous_avg = sum(probe(t)['knowledge_score'] for t in famous) / len(famous)
+ novel_avg = sum(probe(t)['knowledge_score'] for t in novel) / len(novel)
+ print(f" Famous avg: {famous_avg:.0f}/100 | Novel avg: {novel_avg:.0f}/100")
+ if famous_avg > novel_avg:
+     print(f" ✅ CONFIRMED: Model knows the famous text {famous_avg - novel_avg:.0f} points better!")
+ else:
+     print(" ⚠️ Unexpected result — but that's interesting data too!")
+
+
+ # ── Test 2: RAG Chunk Scoring ─────────────────────────────────────────────────
+
+ print("\n\n" + "=" * 65)
+ print("TEST 2: RAG Chunk Scoring")
+ print("Do novel+relevant chunks score higher than known+irrelevant ones?")
+ print("=" * 65)
+
+ from pymrsf.rag import score_chunks_batch, filter_chunks
+
+ # Simulated RAG chunks for the query "How does backpropagation work?"
+ query = "How does backpropagation work?"
+ rag_chunks = [
+     "Backpropagation computes gradients using the chain rule. It propagates error backwards.",
+     "Neural networks are inspired by the human brain. They consist of layers of neurons.",
+     "The sky is blue because of Rayleigh scattering. This is well-known physics.",
+     "A novel optimization technique uses second-order gradients for faster convergence.",
+ ]
+
+ print(f"\nQuery: '{query}'")
+ print(f"Chunks to score: {len(rag_chunks)}")
+ print()
+
+ results = score_chunks_batch(rag_chunks, query, diversity_threshold=0.90)
+
+ for r in results:
+     bar = "█" * (r['rag_score'] // 4) + "░" * (25 - r['rag_score'] // 4)
+     status = "✅" if r['rag_score'] >= 50 else "❌"
+     print(f" {status} [{bar}] RAG={r['rag_score']:>2} N={r['novelty_score']:>2} R={r['relevance_score']:>2} "
+           f"Q={r['query_knowledge']:>2} {r['verdict'].upper():<10} {r['chunk'][:50]}...")
+
+ # assumes score_chunks_batch returns results sorted best-first
+ best_score = results[0]['rag_score']
+ worst_score = results[-1]['rag_score']
+ print(f"\n Best chunk:  {results[0]['chunk'][:50]}... ({best_score}/100)")
+ print(f" Worst chunk: {results[-1]['chunk'][:50]}... ({worst_score}/100)")
+ if best_score > worst_score:
+     print(" ✅ RAG scoring successfully ranked chunks by usefulness!")
+ else:
+     print(" ⚠️ Scores are similar — fine-tuning the threshold may help")
+
+
+ # ── Test 3: Diversity Dedup ───────────────────────────────────────────────────
+
+ print("\n\n" + "=" * 65)
+ print("TEST 3: Diversity Dedup")
+ print("Does the filter remove duplicate chunks?")
+ print("=" * 65)
+
+ # Create chunks where two are very similar
+ dup_chunks = [
+     "Backpropagation computes gradients using the chain rule error propagation.",
+     "Backpropagation uses chain rule to compute gradients by propagating error.",
+     "The sky is blue because of Rayleigh scattering.",
+     "The sky appears blue due to Rayleigh scattering of sunlight.",
+ ]
+
+ print(f"\nQuery: '{query}'")
+ print(f"Input: {len(dup_chunks)} chunks (2 pairs of near-duplicates)")
+ print()
+
+ # First: score without dedup
+ print("--- WITHOUT dedup ---")
+ raw = score_chunks_batch(dup_chunks, query, diversity_threshold=1.0)
+ for r in raw:
+     print(f" [{r['rag_score']:>2}/100] {r['chunk'][:55]}...")
+
+ # Then: with dedup (default threshold 0.85)
+ print("\n--- WITH dedup (threshold=0.85) ---")
+ filtered = score_chunks_batch(dup_chunks, query, diversity_threshold=0.85)
+ dup_count = sum(1 for r in filtered if r['rag_score'] == 0)  # deduped chunks come back scored 0
+ for r in filtered:
+     if r['rag_score'] > 0:
+         print(f" ✅ [{r['rag_score']:>2}/100] {r['chunk'][:55]}...")
+     else:
+         print(f" ❌ [{r['rag_score']:>2}/100] {r['chunk'][:55]}... (DUPLICATE)")
+
+ if dup_count > 0:
+     print(f"\n ✅ Dedup removed {dup_count} duplicate chunk(s)!")
+ else:
+     print("\n ℹ️ No duplicates detected (threshold may need adjustment)")
+
+
+ # ── Summary ───────────────────────────────────────────────────────────────────
+
+ print("\n\n" + "=" * 65)
+ print("SUMMARY")
+ print("=" * 65)
+ print("\n pymrsf gives you:")
+ print(" 1. Which chunks contain NEW information (not just relevant)")
+ print(" 2. Whether the model already KNOWS THE ANSWER to your query")
+ print(" 3. Automatic removal of DUPLICATE chunks")
+ print(" 4. A tunable RAG score to optimize context window usage")
+ print("\n Next step: Drop filter_chunks() into your RAG pipeline:")
+ print("     from pymrsf.rag import filter_chunks")
+ print("     good_chunks = filter_chunks(retriever.get(query), query)")
+ print("     answer = llm.complete(query, context=good_chunks)")
+ print("=" * 65)