@jayarrowz/mcp-arsr 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.gitattributes +2 -0
- package/Dockerfile +19 -0
- package/LICENSE +21 -0
- package/README.md +125 -0
- package/dist/src/index.d.ts +1 -0
- package/dist/src/index.js +325 -0
- package/dist/src/schemas/tools.d.ts +85 -0
- package/dist/src/schemas/tools.js +100 -0
- package/dist/src/services/llm.d.ts +56 -0
- package/dist/src/services/llm.js +361 -0
- package/dist/src/types.d.ts +53 -0
- package/dist/src/types.js +7 -0
- package/glama.json +6 -0
- package/package.json +31 -0
- package/smithery.yaml +13 -0
- package/src/index.ts +395 -0
- package/src/schemas/tools.ts +118 -0
- package/src/services/llm.ts +480 -0
- package/src/types.ts +67 -0
- package/tsconfig.json +16 -0
package/.gitattributes
ADDED
package/Dockerfile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Generated by https://smithery.ai. See: https://smithery.ai/docs/config#dockerfile
FROM node:lts-alpine

WORKDIR /app

# Copy the package manifests first so the dependency layer is cached
# independently of application-source changes.
COPY package.json package-lock.json ./
# `npm ci` installs exactly what the lockfile pins (reproducible builds) and
# fails fast if package.json and package-lock.json disagree; --ignore-scripts
# keeps third-party lifecycle scripts from running during the image build.
RUN npm ci --ignore-scripts

# Copy the remainder of the application
COPY . .

# Build the application
RUN npm run build

# Expose port if necessary (optional)
# EXPOSE 3000

CMD [ "node", "dist/src/index.js" ]
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Github User JayArrowz
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# ARSR MCP Server
|
|
2
|
+
|
|
3
|
+
**Adaptive Retrieval-Augmented Self-Refinement** — a closed-loop MCP server that lets LLMs iteratively verify and correct their own claims using uncertainty-guided retrieval.
|
|
4
|
+
|
|
5
|
+
## What it does
|
|
6
|
+
|
|
7
|
+
Unlike one-shot RAG (retrieve → generate), ARSR runs a refinement loop:
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
Generate draft → Decompose claims → Score uncertainty
|
|
11
|
+
↑ ↓
|
|
12
|
+
Decide stop ← Revise with evidence ← Retrieve for low-confidence claims
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
The key insight: **retrieval is guided by uncertainty**. Only claims the model is unsure about trigger evidence fetching, and the queries are adversarial — designed to *disprove* the claim, not just confirm it.
|
|
16
|
+
|
|
17
|
+
## Architecture
|
|
18
|
+
|
|
19
|
+
The server exposes 6 MCP tools. The outer LLM (Claude, GPT, etc.) orchestrates the loop by calling them in sequence:
|
|
20
|
+
|
|
21
|
+
| # | Tool | Purpose |
|
|
22
|
+
|---|------|---------|
|
|
23
|
+
| 1 | `arsr_draft_response` | Generate initial candidate answer (returns `is_refusal` flag) |
|
|
24
|
+
| 2 | `arsr_decompose_claims` | Split into atomic verifiable claims |
|
|
25
|
+
| 3 | `arsr_score_uncertainty` | Estimate confidence via semantic entropy |
|
|
26
|
+
| 4 | `arsr_retrieve_evidence` | Web search for low-confidence claims |
|
|
27
|
+
| 5 | `arsr_revise_response` | Rewrite draft with evidence |
|
|
28
|
+
| 6 | `arsr_should_continue` | Decide: iterate or finalize |
|
|
29
|
+
|
|
30
|
+
**Inner LLM**: Tools 1-5 use Claude Haiku internally for intelligence (query generation, claim extraction, evidence evaluation). This keeps costs low while the outer model handles orchestration.
|
|
31
|
+
|
|
32
|
+
**Refusal detection**: `arsr_draft_response` returns a structured `is_refusal` flag (classified by the inner LLM) indicating whether the draft is a non-answer. When `is_refusal` is true, downstream tools (`decompose`, `revise`) pivot to extracting claims from the original query and building an answer from retrieved evidence instead of trying to refine a refusal.
|
|
33
|
+
|
|
34
|
+
**Web Search**: `arsr_retrieve_evidence` uses the Anthropic API's built-in web search tool — no external search API keys needed.
|
|
35
|
+
|
|
36
|
+
## Setup
|
|
37
|
+
|
|
38
|
+
### Prerequisites
|
|
39
|
+
|
|
40
|
+
- Node.js 18+
|
|
41
|
+
- An Anthropic API key
|
|
42
|
+
|
|
43
|
+
### Install & Build
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
cd arsr-mcp-server
|
|
47
|
+
npm install
|
|
48
|
+
npm run build
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Environment
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
export ANTHROPIC_API_KEY="sk-ant-..."
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Run
|
|
58
|
+
|
|
59
|
+
**stdio mode** (for Claude Desktop, Cursor, etc.):
|
|
60
|
+
```bash
|
|
61
|
+
npm start
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
**HTTP mode** (for remote access):
|
|
65
|
+
```bash
|
|
66
|
+
TRANSPORT=http PORT=3001 npm start
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Claude Desktop Configuration
|
|
70
|
+
|
|
71
|
+
Add to your `claude_desktop_config.json`:
|
|
72
|
+
|
|
73
|
+
```json
|
|
74
|
+
{
|
|
75
|
+
"mcpServers": {
|
|
76
|
+
"arsr": {
|
|
77
|
+
"command": "node",
|
|
78
|
+
"args": ["/path/to/arsr-mcp-server/dist/src/index.js"],
|
|
79
|
+
"env": {
|
|
80
|
+
"ANTHROPIC_API_KEY": "sk-ant-..."
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## How the outer LLM uses it
|
|
88
|
+
|
|
89
|
+
The orchestrating LLM calls the tools in sequence:
|
|
90
|
+
|
|
91
|
+
```
|
|
92
|
+
1. draft = arsr_draft_response({ query: "When was Tesla founded?" })
|
|
93
|
+
// draft.is_refusal indicates if the inner LLM refused to answer
|
|
94
|
+
2. claims = arsr_decompose_claims({ draft: draft.draft, original_query: "When was Tesla founded?", is_refusal: draft.is_refusal })
|
|
95
|
+
3. scored = arsr_score_uncertainty({ claims: claims.claims })
|
|
96
|
+
4. low = scored.scored.filter(c => c.confidence < 0.85)
|
|
97
|
+
5. evidence = arsr_retrieve_evidence({ claims_to_check: low })
|
|
98
|
+
6. revised = arsr_revise_response({ draft: draft.draft, evidence: evidence.evidence, scored: scored.scored, original_query: "When was Tesla founded?", is_refusal: draft.is_refusal })
|
|
99
|
+
7. decision = arsr_should_continue({ iteration: 1, scored: revised_scores })
|
|
100
|
+
→ if "continue": go to step 2 with revised text
|
|
101
|
+
→ if "stop": return revised.revised to user
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Configuration
|
|
105
|
+
|
|
106
|
+
All settings can be overridden via environment variables, falling back to defaults if unset:
|
|
107
|
+
|
|
108
|
+
| Setting | Env var | Default | Description |
|
|
109
|
+
|---------|---------|---------|-------------|
|
|
110
|
+
| `max_iterations` | `ARSR_MAX_ITERATIONS` | `3` | Budget limit for refinement loops |
|
|
111
|
+
| `confidence_threshold` | `ARSR_CONFIDENCE_THRESHOLD` | `0.85` | Claims above this skip retrieval |
|
|
112
|
+
| `entropy_samples` | `ARSR_ENTROPY_SAMPLES` | `3` | Rephrasings for semantic entropy |
|
|
113
|
+
| `retrieval_strategy` | `ARSR_RETRIEVAL_STRATEGY` | `adversarial` | `adversarial`, `confirmatory`, or `balanced` |
|
|
114
|
+
| `inner_model` | `ARSR_INNER_MODEL` | `claude-haiku-4-5-20251001` | Model for internal intelligence |
|
|
115
|
+
|
|
116
|
+
## Cost estimate
|
|
117
|
+
|
|
118
|
+
Per refinement loop iteration (assuming ~5 claims, 3 low-confidence):
|
|
119
|
+
- Inner LLM calls: ~6-10 Haiku calls ≈ $0.002-0.005
|
|
120
|
+
- Web searches: 6-9 queries ≈ included in API
|
|
121
|
+
- Typical total for 2 iterations: **< $0.02**
|
|
122
|
+
|
|
123
|
+
## License
|
|
124
|
+
|
|
125
|
+
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Generated declaration stub for dist/src/index.js: the entry point exports
// nothing; the empty export marks this declaration file as an ES module.
export {};
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
// ARSR MCP Server entry point (compiled output). Registers the six
// refinement-loop tools on a single McpServer instance, then starts either a
// stdio or streamable-HTTP transport (selected at the bottom of this file).
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
import express from "express";
import { DraftInputSchema, DecomposeInputSchema, ScoreInputSchema, RetrieveInputSchema, ReviseInputSchema, ContinueInputSchema, } from "./schemas/tools.js";
import { generateDraft, decomposeClaims, scoreClaims, retrieveEvidence, reviseDraft, shouldContinue, } from "./services/llm.js";
import { DEFAULT_CONFIG } from "./types.js";
// Single server instance shared by whichever transport is started below.
// NOTE(review): version here is "0.1.0" while the published package is 1.0.0 —
// consider keeping these in sync.
const server = new McpServer({
    name: "arsr-mcp-server",
    version: "0.1.0",
});
|
|
12
|
+
// ---------------------------------------------------------------------------
// Tool 1/6: generate the initial candidate answer (inner LLM call).
// ---------------------------------------------------------------------------
server.registerTool("arsr_draft_response", {
    title: "Draft Response",
    description: `Generate an initial candidate response to a user query. This is the first step in the ARSR refinement loop.

The draft is generated by an inner LLM and may contain inaccuracies — that's expected. Subsequent tools (decompose, score, retrieve, revise) will iteratively correct it.

Args:
- query (string): The user's question to answer
- context (string, optional): Additional context or constraints

Returns:
{ "draft": "The generated response text", "is_refusal": false }`,
    inputSchema: DraftInputSchema,
    annotations: {
        readOnlyHint: true,
        destructiveHint: false,
        idempotentHint: false,
        openWorldHint: true,
    },
}, async (params) => {
    try {
        // generateDraft returns both the draft text and a refusal classification
        // that downstream tools (decompose, revise) use to change behavior.
        const { draft, is_refusal } = await generateDraft(params.query, params.context);
        const output = { draft, is_refusal };
        return {
            content: [{ type: "text", text: JSON.stringify(output, null, 2) }],
            structuredContent: output,
        };
    }
    catch (error) {
        // Errors are reported via isError rather than thrown, per MCP convention.
        return {
            isError: true,
            content: [{
                    type: "text",
                    text: `Error generating draft: ${error instanceof Error ? error.message : String(error)}. Ensure ANTHROPIC_API_KEY is set.`,
                }],
        };
    }
});
// ---------------------------------------------------------------------------
// Tool 2/6: split a draft into atomic, independently verifiable claims.
// ---------------------------------------------------------------------------
server.registerTool("arsr_decompose_claims", {
    title: "Decompose Claims",
    description: `Split a draft response into individually verifiable atomic claims.

Each claim is a single factual statement that can be independently fact-checked. Opinions, hedges, and meta-commentary are excluded.

If is_refusal is true (from arsr_draft_response output) and original_query is provided, claims will be extracted from the query instead of the draft.

Args:
- draft (string): The response text to decompose
- original_query (string, optional): The user's original question, used as fallback if draft is a refusal
- is_refusal (boolean, optional): Whether the draft was classified as a refusal by arsr_draft_response

Returns:
{ "claims": [{ "id": "c1", "text": "The claim as a statement", "source_span": "exact quote from draft" }] }`,
    inputSchema: DecomposeInputSchema,
    annotations: {
        readOnlyHint: true,
        destructiveHint: false,
        idempotentHint: true,
        openWorldHint: false,
    },
}, async (params) => {
    try {
        // is_refusal defaults to false so plain drafts decompose normally.
        const claims = await decomposeClaims(params.draft, params.original_query, params.is_refusal ?? false);
        const output = { claims, count: claims.length };
        return {
            content: [{ type: "text", text: JSON.stringify(output, null, 2) }],
            structuredContent: output,
        };
    }
    catch (error) {
        return {
            isError: true,
            content: [{
                    type: "text",
                    text: `Error decomposing claims: ${error instanceof Error ? error.message : String(error)}`,
                }],
        };
    }
});
// ---------------------------------------------------------------------------
// Tool 3/6: score each claim's confidence/entropy so retrieval can be
// targeted at the uncertain ones.
// ---------------------------------------------------------------------------
server.registerTool("arsr_score_uncertainty", {
    title: "Score Claim Uncertainty",
    description: `Estimate confidence for each claim using semantic entropy and consistency analysis.

Each claim receives a confidence score (0-1) and entropy score (0-1). Low confidence / high entropy indicates the claim should be fact-checked.

Args:
- claims (array): Claims to score, each with { id, text, source_span }
- n_samples (number, optional): Number of rephrasings for entropy (default: 3)

Returns:
{ "scored": [{ "id": "c1", "text": "...", "confidence": 0.92, "entropy": 0.15, "method": "semantic_entropy" }] }

Use the confidence_threshold (default 0.85) to filter which claims need evidence retrieval.`,
    inputSchema: ScoreInputSchema,
    annotations: {
        readOnlyHint: true,
        destructiveHint: false,
        idempotentHint: false, // Stochastic
        openWorldHint: false,
    },
}, async (params) => {
    try {
        // NOTE(review): params.n_samples is accepted by ScoreInputSchema but is
        // not forwarded to scoreClaims — confirm whether that is intended.
        const scored = await scoreClaims(params.claims);
        // ScoreInputSchema enforces .min(1) claims, so this average is defined.
        const avgConfidence = scored.reduce((s, c) => s + c.confidence, 0) / scored.length;
        const lowConfCount = scored.filter((c) => c.confidence < DEFAULT_CONFIG.confidence_threshold).length;
        const output = {
            scored,
            summary: {
                total_claims: scored.length,
                // Rounded to 3 decimal places for readable summaries.
                avg_confidence: Math.round(avgConfidence * 1000) / 1000,
                low_confidence_count: lowConfCount,
                threshold: DEFAULT_CONFIG.confidence_threshold,
            },
        };
        return {
            content: [{ type: "text", text: JSON.stringify(output, null, 2) }],
            structuredContent: output,
        };
    }
    catch (error) {
        return {
            isError: true,
            content: [{
                    type: "text",
                    text: `Error scoring claims: ${error instanceof Error ? error.message : String(error)}`,
                }],
        };
    }
});
// ---------------------------------------------------------------------------
// Tool 4/6: web-search for evidence on the low-confidence claims.
// ---------------------------------------------------------------------------
server.registerTool("arsr_retrieve_evidence", {
    title: "Retrieve Evidence",
    description: `Fetch evidence for low-confidence claims using uncertainty-guided retrieval.

For each claim, the inner LLM generates smart search queries (adversarial by default — designed to DISPROVE the claim), executes web searches, and evaluates each result's stance (supports/contradicts/neutral).

Args:
- claims_to_check (array): Low-confidence ScoredClaims to investigate
- strategy (string, optional): "adversarial" (default), "confirmatory", or "balanced"

Returns:
{ "evidence": [{ "claim_id": "c1", "docs": [...], "overall_stance": "contradicted", "summary": "..." }] }

IMPORTANT: Only pass claims with confidence BELOW the threshold. Do not waste budget on high-confidence claims.`,
    inputSchema: RetrieveInputSchema,
    annotations: {
        readOnlyHint: true,
        destructiveHint: false,
        idempotentHint: false,
        openWorldHint: true,
    },
}, async (params) => {
    try {
        // Re-tag every incoming claim with the method string the retrieval
        // service expects, regardless of what the caller supplied.
        const evidence = await retrieveEvidence(params.claims_to_check.map((c) => ({
            ...c,
            method: "semantic_entropy",
        })), params.strategy ?? "adversarial");
        const supported = evidence.filter((e) => e.overall_stance === "supported").length;
        const contradicted = evidence.filter((e) => e.overall_stance === "contradicted").length;
        const mixed = evidence.filter((e) => e.overall_stance === "mixed").length;
        const output = {
            evidence,
            summary: {
                claims_checked: evidence.length,
                supported,
                contradicted,
                mixed,
                // Whatever is not supported/contradicted/mixed is "insufficient".
                insufficient: evidence.length - supported - contradicted - mixed,
            },
        };
        return {
            content: [{ type: "text", text: JSON.stringify(output, null, 2) }],
            structuredContent: output,
        };
    }
    catch (error) {
        return {
            isError: true,
            content: [{
                    type: "text",
                    text: `Error retrieving evidence: ${error instanceof Error ? error.message : String(error)}`,
                }],
        };
    }
});
// ---------------------------------------------------------------------------
// Tool 5/6: rewrite the draft using the gathered evidence.
// ---------------------------------------------------------------------------
server.registerTool("arsr_revise_response", {
    title: "Revise Response",
    description: `Rewrite the draft integrating evidence findings. Corrects contradicted claims, hedges mixed claims, and flags irreconcilable conflicts.

IMPORTANT: If the draft was a refusal/non-answer (is_refusal from arsr_draft_response), pass original_query and is_refusal. The revision engine will then generate a COMPLETELY NEW answer from the evidence instead of trying to edit a non-answer.

Args:
- draft (string): The current draft to revise
- evidence (array): Evidence from retrieve_evidence
- scored (array): Scored claims from score_uncertainty
- original_query (string, optional): The user's original question, critical if draft was a refusal
- is_refusal (boolean, optional): Whether the draft was classified as a refusal by arsr_draft_response

Returns:
{
  "revised": "The corrected response text",
  "changes": [{ "claim_id": "c1", "action": "corrected", "original": "...", "revised": "...", "reason": "..." }],
  "conflicts": [{ "claim_id": "c2", "description": "Sources disagree about..." }]
}`,
    inputSchema: ReviseInputSchema,
    annotations: {
        readOnlyHint: true,
        destructiveHint: false,
        idempotentHint: false,
        openWorldHint: false,
    },
}, async (params) => {
    try {
        // Same method re-tagging as in arsr_retrieve_evidence above.
        const result = await reviseDraft(params.draft, params.evidence, params.scored.map((s) => ({
            ...s,
            method: "semantic_entropy",
        })), params.original_query, params.is_refusal ?? false);
        return {
            content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
            structuredContent: result,
        };
    }
    catch (error) {
        return {
            isError: true,
            content: [{
                    type: "text",
                    text: `Error revising response: ${error instanceof Error ? error.message : String(error)}`,
                }],
        };
    }
});
// ---------------------------------------------------------------------------
// Tool 6/6: loop-control decision — iterate again or finalize.
// ---------------------------------------------------------------------------
server.registerTool("arsr_should_continue", {
    title: "Should Continue",
    description: `Decide whether to run another refinement iteration or finalize the response.

Uses three stopping criteria:
1. Budget: stop if iteration >= max_iterations
2. Threshold: stop if ALL claims exceed confidence_threshold
3. Convergence: stop if average confidence didn't improve by ≥0.02

Args:
- iteration (number): Current iteration (1-based)
- scored (array): Current confidence scores [{ id, confidence, entropy }]
- budget (number, optional): Max iterations (default: 3)
- confidence_threshold (number, optional): Target confidence (default: 0.85)
- previous_avg_confidence (number | null, optional): Prior iteration's avg confidence

Returns:
{ "decision": "continue" | "stop", "reason": "Explanation of why" }`,
    inputSchema: ContinueInputSchema,
    annotations: {
        readOnlyHint: true,
        destructiveHint: false,
        idempotentHint: true,
        openWorldHint: false,
    },
}, async (params) => {
    try {
        // ContinueInputSchema only carries { id, confidence, entropy }; pad the
        // remaining ScoredClaim fields with placeholders before delegating.
        const result = await shouldContinue(params.iteration, params.scored.map((s) => ({
            ...s,
            text: "",
            source_span: "",
            method: "semantic_entropy",
        })), params.budget ?? DEFAULT_CONFIG.max_iterations, params.confidence_threshold ?? DEFAULT_CONFIG.confidence_threshold, params.previous_avg_confidence ?? null);
        return {
            content: [{ type: "text", text: JSON.stringify(result, null, 2) }],
            structuredContent: result,
        };
    }
    catch (error) {
        return {
            isError: true,
            content: [{
                    type: "text",
                    text: `Error in continue decision: ${error instanceof Error ? error.message : String(error)}`,
                }],
        };
    }
});
|
|
291
|
+
/**
 * Start the server on a stdio transport (Claude Desktop, Cursor, etc.).
 * Logs go to stderr because stdout carries the MCP protocol stream.
 */
async function runStdio() {
    const transport = new StdioServerTransport();
    await server.connect(transport);
    console.error("ARSR MCP Server running on stdio");
}
/**
 * Start the server in stateless streamable-HTTP mode: a fresh transport is
 * created per POST /mcp request (sessionIdGenerator: undefined = no sessions).
 */
async function runHTTP() {
    const app = express();
    app.use(express.json());
    app.post("/mcp", async (req, res) => {
        const transport = new StreamableHTTPServerTransport({
            sessionIdGenerator: undefined,
            enableJsonResponse: true,
        });
        res.on("close", () => transport.close());
        try {
            await server.connect(transport);
            await transport.handleRequest(req, res, req.body);
        }
        catch (error) {
            // Without this catch, a failure here becomes an unhandled promise
            // rejection and the HTTP request hangs with no response.
            console.error("Error handling MCP request:", error);
            if (!res.headersSent) {
                res.status(500).json({
                    jsonrpc: "2.0",
                    error: { code: -32603, message: "Internal server error" },
                    id: null,
                });
            }
        }
    });
    // Explicit radix 10: never rely on parseInt's input-dependent base guessing.
    const port = Number.parseInt(process.env.PORT || "3001", 10);
    app.listen(port, () => {
        console.error(`ARSR MCP Server running on http://localhost:${port}/mcp`);
    });
}
// Transport selection: TRANSPORT=http enables the Express endpoint; anything
// else (including unset) falls back to stdio.
const transport = process.env.TRANSPORT || "stdio";
if (transport === "http") {
    runHTTP().catch((error) => {
        console.error("Server error:", error);
        process.exit(1);
    });
}
else {
    runStdio().catch((error) => {
        console.error("Server error:", error);
        process.exit(1);
    });
}
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
// Generated type declarations for the ARSR tool input schemas
// (compiled from src/schemas/tools.ts). Do not edit by hand.
import { z } from "zod";
/** Input for arsr_draft_response. */
export declare const DraftInputSchema: z.ZodObject<{
    query: z.ZodString;
    context: z.ZodOptional<z.ZodString>;
}, z.core.$strict>;
export type DraftInput = z.infer<typeof DraftInputSchema>;
/** Input for arsr_decompose_claims. */
export declare const DecomposeInputSchema: z.ZodObject<{
    draft: z.ZodString;
    original_query: z.ZodOptional<z.ZodString>;
    is_refusal: z.ZodOptional<z.ZodBoolean>;
}, z.core.$strict>;
export type DecomposeInput = z.infer<typeof DecomposeInputSchema>;
/** Input for arsr_score_uncertainty. */
export declare const ScoreInputSchema: z.ZodObject<{
    claims: z.ZodArray<z.ZodObject<{
        id: z.ZodString;
        text: z.ZodString;
        source_span: z.ZodString;
    }, z.core.$strip>>;
    n_samples: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
}, z.core.$strict>;
export type ScoreInput = z.infer<typeof ScoreInputSchema>;
/** Input for arsr_retrieve_evidence. */
export declare const RetrieveInputSchema: z.ZodObject<{
    claims_to_check: z.ZodArray<z.ZodObject<{
        id: z.ZodString;
        text: z.ZodString;
        source_span: z.ZodString;
        confidence: z.ZodNumber;
        entropy: z.ZodNumber;
        method: z.ZodString;
    }, z.core.$strip>>;
    strategy: z.ZodOptional<z.ZodDefault<z.ZodEnum<{
        adversarial: "adversarial";
        confirmatory: "confirmatory";
        balanced: "balanced";
    }>>>;
}, z.core.$strict>;
export type RetrieveInput = z.infer<typeof RetrieveInputSchema>;
/** Input for arsr_revise_response. */
export declare const ReviseInputSchema: z.ZodObject<{
    draft: z.ZodString;
    evidence: z.ZodArray<z.ZodObject<{
        claim_id: z.ZodString;
        claim_text: z.ZodString;
        docs: z.ZodArray<z.ZodObject<{
            title: z.ZodString;
            url: z.ZodString;
            snippet: z.ZodString;
            stance: z.ZodEnum<{
                supports: "supports";
                contradicts: "contradicts";
                neutral: "neutral";
                unclear: "unclear";
            }>;
        }, z.core.$strip>>;
        overall_stance: z.ZodEnum<{
            supported: "supported";
            contradicted: "contradicted";
            mixed: "mixed";
            insufficient: "insufficient";
        }>;
        summary: z.ZodString;
    }, z.core.$strip>>;
    scored: z.ZodArray<z.ZodObject<{
        id: z.ZodString;
        text: z.ZodString;
        source_span: z.ZodString;
        confidence: z.ZodNumber;
        entropy: z.ZodNumber;
        method: z.ZodString;
    }, z.core.$strip>>;
    original_query: z.ZodOptional<z.ZodString>;
    is_refusal: z.ZodOptional<z.ZodBoolean>;
}, z.core.$strict>;
export type ReviseInput = z.infer<typeof ReviseInputSchema>;
/** Input for arsr_should_continue. */
export declare const ContinueInputSchema: z.ZodObject<{
    iteration: z.ZodNumber;
    scored: z.ZodArray<z.ZodObject<{
        id: z.ZodString;
        confidence: z.ZodNumber;
        entropy: z.ZodNumber;
    }, z.core.$strip>>;
    budget: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
    confidence_threshold: z.ZodOptional<z.ZodDefault<z.ZodNumber>>;
    previous_avg_confidence: z.ZodOptional<z.ZodDefault<z.ZodNullable<z.ZodNumber>>>;
}, z.core.$strict>;
export type ContinueInput = z.infer<typeof ContinueInputSchema>;
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import { z } from "zod";
// Zod input schemas for the six ARSR MCP tools.
//
// FIX: fields with defaults previously chained `.default(x).optional()`.
// In Zod, `.optional()` wrapping `.default()` lets `undefined` pass straight
// through, so the documented defaults (3, 0.85, "adversarial", null) never
// applied at parse time. `.default()` alone already permits the field to be
// omitted, so dropping the trailing `.optional()` is backward compatible for
// callers and makes the defaults real.
/** Input schema for arsr_draft_response. */
export const DraftInputSchema = z.object({
    query: z.string()
        .min(1, "Query must not be empty")
        .describe("The user's question or request to answer"),
    context: z.string()
        .optional()
        .describe("Optional additional context, prior conversation, or domain constraints"),
}).strict();
/** Input schema for arsr_decompose_claims. */
export const DecomposeInputSchema = z.object({
    draft: z.string()
        .min(1, "Draft must not be empty")
        .describe("The candidate response text to decompose into atomic claims"),
    original_query: z.string()
        .optional()
        .describe("The user's original question. If the draft is a refusal/non-answer, claims will be extracted from this query instead."),
    is_refusal: z.boolean()
        .optional()
        .describe("Whether the draft was classified as a refusal/non-answer by arsr_draft_response. If true (and original_query is provided), claims are extracted from the query instead of the draft."),
}).strict();
/** Input schema for arsr_score_uncertainty. */
export const ScoreInputSchema = z.object({
    claims: z.array(z.object({
        id: z.string().describe("Claim identifier (e.g. 'c1')"),
        text: z.string().describe("The claim as a standalone factual statement"),
        source_span: z.string().describe("The exact substring from the original draft"),
    })).min(1, "At least one claim required")
        .describe("Array of atomic claims to score"),
    n_samples: z.number()
        .int().min(1).max(10)
        .default(3)
        .describe("Number of rephrasings for semantic entropy (default: 3)"),
}).strict();
/** Input schema for arsr_retrieve_evidence. */
export const RetrieveInputSchema = z.object({
    claims_to_check: z.array(z.object({
        id: z.string(),
        text: z.string(),
        source_span: z.string(),
        confidence: z.number(),
        entropy: z.number(),
        method: z.string(),
    })).min(1, "At least one claim required")
        .describe("Low-confidence claims to gather evidence for"),
    strategy: z.enum(["adversarial", "confirmatory", "balanced"])
        .default("adversarial")
        .describe("Retrieval strategy: 'adversarial' generates counter-queries, 'confirmatory' seeks support, 'balanced' does both"),
}).strict();
/** Input schema for arsr_revise_response. */
export const ReviseInputSchema = z.object({
    draft: z.string()
        .min(1)
        .describe("The current draft text to revise"),
    evidence: z.array(z.object({
        claim_id: z.string(),
        claim_text: z.string(),
        docs: z.array(z.object({
            title: z.string(),
            url: z.string(),
            snippet: z.string(),
            stance: z.enum(["supports", "contradicts", "neutral", "unclear"]),
        })),
        overall_stance: z.enum(["supported", "contradicted", "mixed", "insufficient"]),
        summary: z.string(),
    })).describe("Evidence gathered for each claim"),
    scored: z.array(z.object({
        id: z.string(),
        text: z.string(),
        source_span: z.string(),
        confidence: z.number(),
        entropy: z.number(),
        method: z.string(),
    })).describe("The scored claims from the uncertainty step"),
    original_query: z.string()
        .optional()
        .describe("The user's original question. If the draft was a refusal, this is used to generate a new answer from evidence."),
    is_refusal: z.boolean()
        .optional()
        .describe("Whether the draft was classified as a refusal/non-answer by arsr_draft_response. If true, a new response is generated from evidence instead of revising the refusal."),
}).strict();
/** Input schema for arsr_should_continue. */
export const ContinueInputSchema = z.object({
    iteration: z.number()
        .int().min(1)
        .describe("Current iteration number (1-based)"),
    scored: z.array(z.object({
        id: z.string(),
        confidence: z.number(),
        entropy: z.number(),
    })).describe("Current confidence scores for all claims"),
    budget: z.number()
        .int().min(1).max(10)
        .default(3)
        .describe("Maximum iterations allowed (default: 3)"),
    confidence_threshold: z.number()
        .min(0).max(1)
        .default(0.85)
        .describe("Stop when all claims exceed this confidence (default: 0.85)"),
    previous_avg_confidence: z.number()
        .nullable()
        .default(null)
        .describe("Average confidence from previous iteration for convergence detection"),
}).strict();
|