multivon-mcp 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- multivon_mcp-0.1.0/LICENSE +11 -0
- multivon_mcp-0.1.0/PKG-INFO +160 -0
- multivon_mcp-0.1.0/README.md +134 -0
- multivon_mcp-0.1.0/multivon_mcp/__init__.py +28 -0
- multivon_mcp-0.1.0/multivon_mcp/server.py +67 -0
- multivon_mcp-0.1.0/multivon_mcp/tools/__init__.py +18 -0
- multivon_mcp-0.1.0/multivon_mcp/tools/audit_tools.py +104 -0
- multivon_mcp-0.1.0/multivon_mcp/tools/discover_tools.py +173 -0
- multivon_mcp-0.1.0/multivon_mcp/tools/eval_tools.py +215 -0
- multivon_mcp-0.1.0/multivon_mcp/tools/pdfhell_tools.py +113 -0
- multivon_mcp-0.1.0/multivon_mcp.egg-info/PKG-INFO +160 -0
- multivon_mcp-0.1.0/multivon_mcp.egg-info/SOURCES.txt +17 -0
- multivon_mcp-0.1.0/multivon_mcp.egg-info/dependency_links.txt +1 -0
- multivon_mcp-0.1.0/multivon_mcp.egg-info/entry_points.txt +2 -0
- multivon_mcp-0.1.0/multivon_mcp.egg-info/requires.txt +3 -0
- multivon_mcp-0.1.0/multivon_mcp.egg-info/top_level.txt +1 -0
- multivon_mcp-0.1.0/pyproject.toml +46 -0
- multivon_mcp-0.1.0/setup.cfg +4 -0
- multivon_mcp-0.1.0/tests/test_server.py +151 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Apache License
|
|
2
|
+
Version 2.0, January 2004
|
|
3
|
+
http://www.apache.org/licenses/
|
|
4
|
+
|
|
5
|
+
Copyright 2026 Multivon
|
|
6
|
+
|
|
7
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
8
|
+
you may not use this file except in compliance with the License.
|
|
9
|
+
You may obtain a copy of the License at
|
|
10
|
+
|
|
11
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: multivon-mcp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MCP server exposing multivon-eval + pdfhell as agent-callable tools. Drop into Claude Desktop, Cursor, Cline, or any MCP-compatible AI coding agent.
|
|
5
|
+
Author-email: Multivon <hello@multivon.ai>
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Project-URL: Homepage, https://multivon.ai
|
|
8
|
+
Project-URL: Repository, https://github.com/multivon-ai/multivon-mcp
|
|
9
|
+
Project-URL: Issues, https://github.com/multivon-ai/multivon-mcp/issues
|
|
10
|
+
Project-URL: Agent docs, https://multivon.ai/agents
|
|
11
|
+
Keywords: mcp,llm,evaluation,agents,claude,cursor,cline,anthropic
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: mcp[cli]>=1.0
|
|
23
|
+
Requires-Dist: multivon-eval>=0.7.3
|
|
24
|
+
Requires-Dist: pdfhell>=0.1.0
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# multivon-mcp
|
|
28
|
+
|
|
29
|
+
**MCP server that gives AI coding agents direct access to evaluation tools.** Drop into Claude Desktop, Claude Code, Cursor, Cline, or any [Model Context Protocol](https://modelcontextprotocol.io)–compatible agent.
|
|
30
|
+
|
|
31
|
+
When the agent is helping you build an LLM product, it can:
|
|
32
|
+
|
|
33
|
+
- Score a RAG output for hallucination without you writing the scaffolding
|
|
34
|
+
- Generate an adversarial PDF on demand to test your document AI
|
|
35
|
+
- Run the full pdfhell mini-suite against a model and analyse the results
|
|
36
|
+
- Produce a hash-chained audit pack for procurement diligence
|
|
37
|
+
- Discover the full evaluation capability catalog as JSON
|
|
38
|
+
|
|
39
|
+
No copy-paste, no `python -c "..."`, no asking the agent to figure out the SDK calls.
|
|
40
|
+
|
|
41
|
+
## Install
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install multivon-mcp
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Bare install pulls `multivon-eval`, `pdfhell`, and the MCP SDK. The provider SDKs (`anthropic`, `openai`, `google-genai`) come along too — bring your own API key in env.
|
|
48
|
+
|
|
49
|
+
## Configure your agent
|
|
50
|
+
|
|
51
|
+
### Claude Desktop / Claude Code
|
|
52
|
+
|
|
53
|
+
Add to `~/Library/Application Support/Claude/claude_desktop_config.json` (macOS) or `%APPDATA%\Claude\claude_desktop_config.json` (Windows):
|
|
54
|
+
|
|
55
|
+
```json
|
|
56
|
+
{
|
|
57
|
+
"mcpServers": {
|
|
58
|
+
"multivon": {
|
|
59
|
+
"command": "multivon-mcp",
|
|
60
|
+
"env": {
|
|
61
|
+
"ANTHROPIC_API_KEY": "sk-ant-...",
|
|
62
|
+
"OPENAI_API_KEY": "sk-proj-...",
|
|
63
|
+
"GOOGLE_API_KEY": "AIza..."
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Restart Claude. The 9 tools become available; ask Claude `"use multivon to evaluate this RAG output"` and it figures out which tool to call.
|
|
71
|
+
|
|
72
|
+
### Cursor
|
|
73
|
+
|
|
74
|
+
`cursor.json` or via Settings → MCP:
|
|
75
|
+
|
|
76
|
+
```json
|
|
77
|
+
{ "mcpServers": { "multivon": { "command": "multivon-mcp" } } }
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Cline / OpenCode / any MCP-compatible agent
|
|
81
|
+
|
|
82
|
+
Same shape — point at the `multivon-mcp` console script.
|
|
83
|
+
|
|
84
|
+
### Local dev / debugging
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
mcp dev multivon_mcp.server
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Opens the MCP Inspector UI in your browser. You can call any tool by name, see the JSON schemas, and watch the requests/responses.
|
|
91
|
+
|
|
92
|
+
## The 9 tools
|
|
93
|
+
|
|
94
|
+
| Tool | What it does | API key needed |
|
|
95
|
+
|---|---|---|
|
|
96
|
+
| `eval_discover` | Returns the full machine-readable capability catalog (evaluators, traps, suites, calibration data, versions). Call this first. | No |
|
|
97
|
+
| `pdfhell_make` | Generates one adversarial PDF + its answer key. Useful for inspecting what a trap looks like. | No |
|
|
98
|
+
| `pdfhell_run` | Runs the pdfhell adversarial-PDF benchmark against a vision model. Returns pass rate, per-trap CIs, suite hash. | Yes (vision provider) |
|
|
99
|
+
| `eval_faithfulness` | QAG-graded faithfulness — is a RAG output grounded in the retrieved context? | Yes (judge) |
|
|
100
|
+
| `eval_hallucination` | QAG-graded hallucination detection — does an output contain content NOT in context? | Yes (judge) |
|
|
101
|
+
| `eval_relevance` | QAG-graded answer-vs-question relevance. | Yes (judge) |
|
|
102
|
+
| `eval_answer_accuracy` | QAG-graded semantic equivalence vs ground truth. | Yes (judge) |
|
|
103
|
+
| `eval_tool_call_accuracy` | Deterministic agent tool-call correctness. No LLM. | No |
|
|
104
|
+
| `eval_audit_pack` | Build a hash-chained, procurement-ready ZIP from a pdfhell run. | No |
|
|
105
|
+
|
|
106
|
+
## Example session
|
|
107
|
+
|
|
108
|
+
```
|
|
109
|
+
User: I just shipped a RAG endpoint. Can you check it for hallucinations?
|
|
110
|
+
|
|
111
|
+
Claude: I'll use multivon to evaluate it.
|
|
112
|
+
[calls eval_discover to see what's available]
|
|
113
|
+
[calls eval_faithfulness with your input/context/output]
|
|
114
|
+
|
|
115
|
+
→ score: 0.667 (passed: False), threshold: 0.9
|
|
116
|
+
reason: 2/3 claims grounded
|
|
117
|
+
✓ "annual renewal" — supported by context
|
|
118
|
+
✓ "30-day notice" — supported by context
|
|
119
|
+
✗ "automatic upgrade" — NOT in context
|
|
120
|
+
|
|
121
|
+
Claude: Your RAG hallucinated the "automatic upgrade" detail. The context
|
|
122
|
+
doesn't mention upgrades. I'd add a Hallucination evaluator to your CI
|
|
123
|
+
gate, threshold ≥0.85, and re-prompt with explicit "only use facts
|
|
124
|
+
from context" instructions.
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Why these 9 tools (not all 44)
|
|
128
|
+
|
|
129
|
+
`eval_discover` returns the full 44-evaluator catalog, so the agent can always introspect everything. The 9 tools we expose directly are the ones agents actually call mid-edit:
|
|
130
|
+
|
|
131
|
+
- RAG checks (faithfulness, hallucination, relevance) — most common need
|
|
132
|
+
- Agent traces (tool_call_accuracy) — second most common
|
|
133
|
+
- Document AI (pdfhell.run, pdfhell.make) — for any RAG-on-PDFs flow
|
|
134
|
+
- Audit pack — when procurement is involved
|
|
135
|
+
- Discover — meta-capability for planning
|
|
136
|
+
|
|
137
|
+
Exposing all 44 evaluators as MCP tools would bloat the agent's context window and overwhelm tool-selection. If you need an evaluator that's not directly exposed, the agent can still use `multivon-eval` as a library — `eval_discover` returns the import paths.
|
|
138
|
+
|
|
139
|
+
## Dependencies
|
|
140
|
+
|
|
141
|
+
- `mcp[cli] >= 1.0` — official MCP Python SDK + the `mcp dev` inspector
|
|
142
|
+
- `multivon-eval >= 0.7.3` — the evaluator surface this wraps
|
|
143
|
+
- `pdfhell >= 0.1.0` — the adversarial-PDF benchmark this wraps
|
|
144
|
+
|
|
145
|
+
All Apache 2.0.
|
|
146
|
+
|
|
147
|
+
## License
|
|
148
|
+
|
|
149
|
+
Apache 2.0.
|
|
150
|
+
|
|
151
|
+
## Citing
|
|
152
|
+
|
|
153
|
+
```bibtex
|
|
154
|
+
@software{multivon_mcp,
|
|
155
|
+
title = {multivon-mcp: MCP server exposing multivon-eval + pdfhell as agent-callable tools},
|
|
156
|
+
author = {Multivon},
|
|
157
|
+
year = {2026},
|
|
158
|
+
url = {https://github.com/multivon-ai/multivon-mcp},
|
|
159
|
+
}
|
|
160
|
+
```
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# multivon-mcp
|
|
2
|
+
|
|
3
|
+
**MCP server that gives AI coding agents direct access to evaluation tools.** Drop into Claude Desktop, Claude Code, Cursor, Cline, or any [Model Context Protocol](https://modelcontextprotocol.io)–compatible agent.
|
|
4
|
+
|
|
5
|
+
When the agent is helping you build an LLM product, it can:
|
|
6
|
+
|
|
7
|
+
- Score a RAG output for hallucination without you writing the scaffolding
|
|
8
|
+
- Generate an adversarial PDF on demand to test your document AI
|
|
9
|
+
- Run the full pdfhell mini-suite against a model and analyse the results
|
|
10
|
+
- Produce a hash-chained audit pack for procurement diligence
|
|
11
|
+
- Discover the full evaluation capability catalog as JSON
|
|
12
|
+
|
|
13
|
+
No copy-paste, no `python -c "..."`, no asking the agent to figure out the SDK calls.
|
|
14
|
+
|
|
15
|
+
## Install
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install multivon-mcp
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Bare install pulls `multivon-eval`, `pdfhell`, and the MCP SDK. The provider SDKs (`anthropic`, `openai`, `google-genai`) come along too — bring your own API key in env.
|
|
22
|
+
|
|
23
|
+
## Configure your agent
|
|
24
|
+
|
|
25
|
+
### Claude Desktop / Claude Code
|
|
26
|
+
|
|
27
|
+
Add to `~/Library/Application Support/Claude/claude_desktop_config.json` (macOS) or `%APPDATA%\Claude\claude_desktop_config.json` (Windows):
|
|
28
|
+
|
|
29
|
+
```json
|
|
30
|
+
{
|
|
31
|
+
"mcpServers": {
|
|
32
|
+
"multivon": {
|
|
33
|
+
"command": "multivon-mcp",
|
|
34
|
+
"env": {
|
|
35
|
+
"ANTHROPIC_API_KEY": "sk-ant-...",
|
|
36
|
+
"OPENAI_API_KEY": "sk-proj-...",
|
|
37
|
+
"GOOGLE_API_KEY": "AIza..."
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Restart Claude. The 9 tools become available; ask Claude `"use multivon to evaluate this RAG output"` and it figures out which tool to call.
|
|
45
|
+
|
|
46
|
+
### Cursor
|
|
47
|
+
|
|
48
|
+
`cursor.json` or via Settings → MCP:
|
|
49
|
+
|
|
50
|
+
```json
|
|
51
|
+
{ "mcpServers": { "multivon": { "command": "multivon-mcp" } } }
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Cline / OpenCode / any MCP-compatible agent
|
|
55
|
+
|
|
56
|
+
Same shape — point at the `multivon-mcp` console script.
|
|
57
|
+
|
|
58
|
+
### Local dev / debugging
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
mcp dev multivon_mcp.server
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Opens the MCP Inspector UI in your browser. You can call any tool by name, see the JSON schemas, and watch the requests/responses.
|
|
65
|
+
|
|
66
|
+
## The 9 tools
|
|
67
|
+
|
|
68
|
+
| Tool | What it does | API key needed |
|
|
69
|
+
|---|---|---|
|
|
70
|
+
| `eval_discover` | Returns the full machine-readable capability catalog (evaluators, traps, suites, calibration data, versions). Call this first. | No |
|
|
71
|
+
| `pdfhell_make` | Generates one adversarial PDF + its answer key. Useful for inspecting what a trap looks like. | No |
|
|
72
|
+
| `pdfhell_run` | Runs the pdfhell adversarial-PDF benchmark against a vision model. Returns pass rate, per-trap CIs, suite hash. | Yes (vision provider) |
|
|
73
|
+
| `eval_faithfulness` | QAG-graded faithfulness — is a RAG output grounded in the retrieved context? | Yes (judge) |
|
|
74
|
+
| `eval_hallucination` | QAG-graded hallucination detection — does an output contain content NOT in context? | Yes (judge) |
|
|
75
|
+
| `eval_relevance` | QAG-graded answer-vs-question relevance. | Yes (judge) |
|
|
76
|
+
| `eval_answer_accuracy` | QAG-graded semantic equivalence vs ground truth. | Yes (judge) |
|
|
77
|
+
| `eval_tool_call_accuracy` | Deterministic agent tool-call correctness. No LLM. | No |
|
|
78
|
+
| `eval_audit_pack` | Build a hash-chained, procurement-ready ZIP from a pdfhell run. | No |
|
|
79
|
+
|
|
80
|
+
## Example session
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
User: I just shipped a RAG endpoint. Can you check it for hallucinations?
|
|
84
|
+
|
|
85
|
+
Claude: I'll use multivon to evaluate it.
|
|
86
|
+
[calls eval_discover to see what's available]
|
|
87
|
+
[calls eval_faithfulness with your input/context/output]
|
|
88
|
+
|
|
89
|
+
→ score: 0.667 (passed: False), threshold: 0.9
|
|
90
|
+
reason: 2/3 claims grounded
|
|
91
|
+
✓ "annual renewal" — supported by context
|
|
92
|
+
✓ "30-day notice" — supported by context
|
|
93
|
+
✗ "automatic upgrade" — NOT in context
|
|
94
|
+
|
|
95
|
+
Claude: Your RAG hallucinated the "automatic upgrade" detail. The context
|
|
96
|
+
doesn't mention upgrades. I'd add a Hallucination evaluator to your CI
|
|
97
|
+
gate, threshold ≥0.85, and re-prompt with explicit "only use facts
|
|
98
|
+
from context" instructions.
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Why these 9 tools (not all 44)
|
|
102
|
+
|
|
103
|
+
`eval_discover` returns the full 44-evaluator catalog, so the agent can always introspect everything. The 9 tools we expose directly are the ones agents actually call mid-edit:
|
|
104
|
+
|
|
105
|
+
- RAG checks (faithfulness, hallucination, relevance) — most common need
|
|
106
|
+
- Agent traces (tool_call_accuracy) — second most common
|
|
107
|
+
- Document AI (pdfhell.run, pdfhell.make) — for any RAG-on-PDFs flow
|
|
108
|
+
- Audit pack — when procurement is involved
|
|
109
|
+
- Discover — meta-capability for planning
|
|
110
|
+
|
|
111
|
+
Exposing all 44 evaluators as MCP tools would bloat the agent's context window and overwhelm tool-selection. If you need an evaluator that's not directly exposed, the agent can still use `multivon-eval` as a library — `eval_discover` returns the import paths.
|
|
112
|
+
|
|
113
|
+
## Dependencies
|
|
114
|
+
|
|
115
|
+
- `mcp[cli] >= 1.0` — official MCP Python SDK + the `mcp dev` inspector
|
|
116
|
+
- `multivon-eval >= 0.7.3` — the evaluator surface this wraps
|
|
117
|
+
- `pdfhell >= 0.1.0` — the adversarial-PDF benchmark this wraps
|
|
118
|
+
|
|
119
|
+
All Apache 2.0.
|
|
120
|
+
|
|
121
|
+
## License
|
|
122
|
+
|
|
123
|
+
Apache 2.0.
|
|
124
|
+
|
|
125
|
+
## Citing
|
|
126
|
+
|
|
127
|
+
```bibtex
|
|
128
|
+
@software{multivon_mcp,
|
|
129
|
+
title = {multivon-mcp: MCP server exposing multivon-eval + pdfhell as agent-callable tools},
|
|
130
|
+
author = {Multivon},
|
|
131
|
+
year = {2026},
|
|
132
|
+
url = {https://github.com/multivon-ai/multivon-mcp},
|
|
133
|
+
}
|
|
134
|
+
```
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""multivon-mcp — agent-callable evaluation tools.
|
|
2
|
+
|
|
3
|
+
Drop into Claude Desktop, Cursor, Cline, or any MCP-compatible agent
|
|
4
|
+
to give it direct access to multivon-eval + pdfhell tools without
|
|
5
|
+
shelling out to ``python -c`` or copy-pasting code.
|
|
6
|
+
|
|
7
|
+
Quickstart::
|
|
8
|
+
|
|
9
|
+
pip install multivon-mcp
|
|
10
|
+
|
|
11
|
+
# Claude Desktop / Claude Code config (mcpServers):
|
|
12
|
+
{
|
|
13
|
+
"mcpServers": {
|
|
14
|
+
"multivon": {"command": "multivon-mcp"}
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
After registering, ask Claude:
|
|
19
|
+
"use multivon to evaluate this RAG output for faithfulness"
|
|
20
|
+
|
|
21
|
+
The agent discovers the 8 available tools via the MCP capabilities
|
|
22
|
+
handshake and calls them directly.
|
|
23
|
+
"""
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
__version__ = "0.1.0"
|
|
27
|
+
|
|
28
|
+
__all__ = ["__version__"]
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""FastMCP server entry point.
|
|
2
|
+
|
|
3
|
+
``multivon-mcp`` (the console script) starts this server in stdio
|
|
4
|
+
transport mode — exactly what Claude Desktop / Cursor / Cline expect
|
|
5
|
+
when configured via ``mcpServers``.
|
|
6
|
+
|
|
7
|
+
The 9 tools registered:
|
|
8
|
+
|
|
9
|
+
pdfhell.run — evaluate a vision model on the suite
|
|
10
|
+
pdfhell.make — generate one trap PDF + answer key
|
|
11
|
+
eval.faithfulness — QAG-graded RAG faithfulness
|
|
12
|
+
eval.hallucination — QAG-graded hallucination detection
|
|
13
|
+
eval.relevance — QAG-graded answer-vs-question relevance
|
|
14
|
+
eval.tool_call_accuracy — agent tool-call correctness (no LLM judge)
|
|
15
|
+
eval.answer_accuracy — QAG-graded semantic-equivalence
|
|
16
|
+
eval.audit_pack — build a hash-chained audit ZIP from a run
|
|
17
|
+
eval.discover — full machine-readable capability catalog
|
|
18
|
+
|
|
19
|
+
Why 9 (not 44): the narrow set is the surface AI coding agents actually
|
|
20
|
+
need mid-edit. The full evaluator catalog stays available via
|
|
21
|
+
``eval.discover`` for the agents that want to inspect everything.
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import sys
|
|
26
|
+
|
|
27
|
+
from mcp.server.fastmcp import FastMCP
|
|
28
|
+
|
|
29
|
+
from . import __version__
|
|
30
|
+
from .tools import register_all
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def build_server() -> FastMCP:
    """Build and configure the FastMCP server.

    Factored out of :func:`main` so tests can introspect the registered
    tools without actually serving over stdio.

    Returns:
        A ``FastMCP`` instance with every tool group registered.
    """
    # NOTE: the tool names referenced in ``instructions`` must match the
    # names the tool modules actually register. FastMCP derives tool
    # names from the decorated function names, which use underscores
    # (e.g. ``eval_audit_pack`` in audit_tools; the README lists all 9
    # underscore names). Agents copy these strings verbatim when picking
    # a tool, so dotted names like "eval.discover" would never resolve.
    mcp = FastMCP(
        name="multivon-mcp",
        instructions=(
            "Multivon's evaluation toolkit for AI agents. Use eval_discover "
            "at session start to see every available evaluator + trap family. "
            "For RAG outputs, prefer eval_faithfulness + eval_hallucination. "
            "For agent traces, use eval_tool_call_accuracy. For document AI, "
            "use pdfhell_run with a vision model. All judge calls require "
            "the matching provider's API key in env "
            "(ANTHROPIC_API_KEY / OPENAI_API_KEY / GOOGLE_API_KEY)."
        ),
    )
    register_all(mcp)
    return mcp
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def main() -> None:
    """Console-script entry point — serves the MCP server over stdio.

    Stdio transport is what Claude Desktop / Cursor / Cline expect when
    launched via an ``mcpServers`` config entry. Pass ``--version`` to
    print the package version and exit instead of serving. For local
    debugging, ``mcp dev multivon_mcp.server`` opens the MCP Inspector
    UI on a local port.
    """
    if "--version" in sys.argv:
        print(f"multivon-mcp {__version__}")
        return
    build_server().run()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Tool modules. Each registers a small group of related MCP tools on
|
|
2
|
+
the FastMCP server instance passed in."""
|
|
3
|
+
|
|
4
|
+
from .pdfhell_tools import register as register_pdfhell
|
|
5
|
+
from .eval_tools import register as register_eval
|
|
6
|
+
from .audit_tools import register as register_audit
|
|
7
|
+
from .discover_tools import register as register_discover
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def register_all(mcp) -> None:
|
|
11
|
+
"""Register every tool group on the FastMCP server."""
|
|
12
|
+
register_pdfhell(mcp)
|
|
13
|
+
register_eval(mcp)
|
|
14
|
+
register_audit(mcp)
|
|
15
|
+
register_discover(mcp)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
__all__ = ["register_all"]
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Audit-pack MCP tool.
|
|
2
|
+
|
|
3
|
+
Wraps pdfhell's audit-pack generation. The agent calls this after a
|
|
4
|
+
pdfhell run to produce a procurement-ready ZIP with hash-chained
|
|
5
|
+
manifest, PDFs, answer keys, JUnit XML, and a human-readable README.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def register(mcp) -> None:
    """Attach the audit-pack tool group to the FastMCP server."""

    @mcp.tool()
    def eval_audit_pack(
        run_json_path: str,
        cases_dir: str,
        output_zip_path: str,
    ) -> dict[str, Any]:
        """Build a hash-chained audit ZIP from a pdfhell run.

        Bundles the run JSON, the case PDFs + answer keys, JUnit XML,
        and a SHA-256 manifest into a single ZIP suitable for a
        procurement diligence appendix.

        Args:
            run_json_path: Path to a pdfhell run JSON (from ``pdfhell run --out``).
            cases_dir: Directory holding the case PDFs + answer keys the
                run evaluated (the same directory the run used).
            output_zip_path: Destination path for the audit ZIP.

        Returns:
            ``{"path": "/abs/path/to.zip", "size_bytes": N, "manifest": {...}}``.
            The manifest dict mirrors the one inside the ZIP, so an agent
            can verify the contents without unpacking anything. On a bad
            input path, ``{"error": "..."}`` is returned instead.
        """
        # pdfhell is imported lazily so merely registering the tool
        # never requires the dependency to load.
        import zipfile

        from pdfhell.auditpack import build_audit_pack
        from pdfhell.scorer import SuiteReport, CaseScore

        run_file = Path(run_json_path).expanduser().resolve()
        if not run_file.is_file():
            return {"error": f"run JSON not found: {run_file}"}
        case_root = Path(cases_dir).expanduser().resolve()
        if not case_root.is_dir():
            return {"error": f"cases dir not found: {case_root}"}

        # Rehydrate a SuiteReport from the run JSON; only the fields the
        # audit-pack builder reads are needed.
        payload = json.loads(run_file.read_text(encoding="utf-8"))

        scores = []
        for entry in payload.get("cases", []):
            scores.append(
                CaseScore(
                    case_id=entry["case_id"],
                    trap_family=entry["trap_family"],
                    correct=bool(entry["correct"]),
                    fell_for_trap=bool(entry.get("fell_for_trap", False)),
                    refused=bool(entry.get("refused", False)),
                    matched_expected=bool(entry.get("matched_expected", False)),
                    matched_forbidden=list(entry.get("matched_forbidden", [])),
                    model_output=entry.get("model_output", ""),
                    expected=entry.get("expected", ""),
                    failure_mode=entry.get("failure_mode", ""),
                )
            )

        report = SuiteReport(
            model=payload["model"],
            suite=payload["suite"],
            n=payload["n"],
            pass_rate=payload["pass_rate"],
            per_trap_pass=payload.get("per_trap_pass", {}),
            per_trap_fell_for_trap=payload.get("per_trap_fell_for_trap", {}),
            refused_rate=payload.get("refused_rate", 0.0),
            cases=scores,
            suite_version=payload.get("suite_version", ""),
            suite_hash=payload.get("suite_hash", ""),
        )

        zip_target = Path(output_zip_path).expanduser().resolve()
        build_audit_pack(report, case_root, zip_target)

        # Surface a compact manifest summary the agent can use directly.
        with zipfile.ZipFile(zip_target, "r") as archive:
            manifest = json.loads(archive.read("manifest.json").decode("utf-8"))

        return {
            "path": str(zip_target),
            "size_bytes": zip_target.stat().st_size,
            "manifest": {
                "pdfhell_version": manifest["pdfhell_version"],
                "model": manifest["model"],
                "suite": manifest["suite"],
                "suite_version": manifest.get("suite_version", ""),
                "suite_hash": manifest.get("suite_hash", ""),
                "n": manifest["n"],
                "passed": manifest["passed"],
                "pass_rate": manifest["pass_rate"],
                "pass_rate_ci_95": manifest.get("pass_rate_ci_95", []),
                "file_count": len(manifest["files"]),
            },
        }
|