docreadi-mcp 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docreadi_mcp-0.1.0/.gitignore +67 -0
- docreadi_mcp-0.1.0/PKG-INFO +106 -0
- docreadi_mcp-0.1.0/PUBLISHING.md +53 -0
- docreadi_mcp-0.1.0/README.md +84 -0
- docreadi_mcp-0.1.0/docreadi_mcp/__init__.py +17 -0
- docreadi_mcp-0.1.0/docreadi_mcp/__main__.py +28 -0
- docreadi_mcp-0.1.0/docreadi_mcp/catalog.py +34 -0
- docreadi_mcp-0.1.0/docreadi_mcp/client.py +272 -0
- docreadi_mcp-0.1.0/docreadi_mcp/config.py +41 -0
- docreadi_mcp-0.1.0/docreadi_mcp/errors.py +11 -0
- docreadi_mcp-0.1.0/docreadi_mcp/files.py +62 -0
- docreadi_mcp-0.1.0/docreadi_mcp/server.py +219 -0
- docreadi_mcp-0.1.0/pyproject.toml +37 -0
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# Secrets — never commit
|
|
2
|
+
.env
|
|
3
|
+
|
|
4
|
+
# Claude Code local settings (gitignore the directory's contents
|
|
5
|
+
# via /* so the directory itself stays trackable, then carve out
|
|
6
|
+
# the project-shared settings.json — added 2026-05-26 for the
|
|
7
|
+
# team-wide ruff pre-push hook. Using `.claude/` here would
|
|
8
|
+
# block the !.claude/settings.json re-include because git can't
|
|
9
|
+
# re-include a file under an excluded directory).
|
|
10
|
+
.claude/*
|
|
11
|
+
!.claude/settings.json
|
|
12
|
+
# Custom subagents are shared, durable assets (brand-guardian, marketing-agent
|
|
13
|
+
# …) — version them. Re-include the dir, then the .md definitions inside it.
|
|
14
|
+
!.claude/agents/
|
|
15
|
+
!.claude/agents/*.md
|
|
16
|
+
|
|
17
|
+
# Python
|
|
18
|
+
__pycache__/
|
|
19
|
+
*.py[cod]
|
|
20
|
+
*.pyo
|
|
21
|
+
*.pyd
|
|
22
|
+
.Python
|
|
23
|
+
*.egg-info/
|
|
24
|
+
dist/
|
|
25
|
+
build/
|
|
26
|
+
.eggs/
|
|
27
|
+
*.egg
|
|
28
|
+
.venv/
|
|
29
|
+
venv/
|
|
30
|
+
env/
|
|
31
|
+
|
|
32
|
+
# Docker volumes / data
|
|
33
|
+
data/
|
|
34
|
+
*.sqlite
|
|
35
|
+
*.db
|
|
36
|
+
|
|
37
|
+
# IDE
|
|
38
|
+
.vscode/
|
|
39
|
+
.idea/
|
|
40
|
+
*.swp
|
|
41
|
+
*.swo
|
|
42
|
+
|
|
43
|
+
# OS
|
|
44
|
+
.DS_Store
|
|
45
|
+
Thumbs.db
|
|
46
|
+
|
|
47
|
+
# Logs
|
|
48
|
+
*.log
|
|
49
|
+
logs/
|
|
50
|
+
|
|
51
|
+
# Test artifacts
|
|
52
|
+
.pytest_cache/
|
|
53
|
+
.coverage
|
|
54
|
+
htmlcov/
|
|
55
|
+
|
|
56
|
+
# WSL metadata garbage from Windows copy-paste (foo.png:Zone.Identifier)
|
|
57
|
+
*:Zone.Identifier
|
|
58
|
+
|
|
59
|
+
# Generated/temp HTML renders of the markdown review docs — never track
|
|
60
|
+
PRODUCT_REVIEW.html
|
|
61
|
+
PRODUCT_REVIEW_tmp.html
|
|
62
|
+
*_tmp.html
|
|
63
|
+
|
|
64
|
+
# Local-only working / handover docs + ad-hoc probes — never track
|
|
65
|
+
HANDOVER.md
|
|
66
|
+
PIPELINE_CONCURRENCY_GOVERNOR_PLAN.md
|
|
67
|
+
test_ocr_concurrency.py
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docreadi-mcp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MCP server for DocReadi — document data extraction for finance, in your AI client.
|
|
5
|
+
Project-URL: Homepage, https://docreadi.com
|
|
6
|
+
Project-URL: Documentation, https://api.docreadi.com/api/docs-guide
|
|
7
|
+
Project-URL: Repository, https://github.com/greenmartian138/doc_intelligence
|
|
8
|
+
Author-email: DocReadi <support@docreadi.com>
|
|
9
|
+
License: Proprietary
|
|
10
|
+
Keywords: docreadi,documents,extraction,finance,invoice,mcp,ocr
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Environment :: Console
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Topic :: Office/Business :: Financial
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Requires-Dist: httpx<1,>=0.27.0
|
|
20
|
+
Requires-Dist: mcp>=1.2.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# docreadi-mcp
|
|
24
|
+
|
|
25
|
+
A **local** [MCP](https://modelcontextprotocol.io) server for **DocReadi** —
|
|
26
|
+
document data extraction for finance. It runs on your machine, reads documents
|
|
27
|
+
off your disk, and calls the hosted DocReadi API with your API key, so an AI
|
|
28
|
+
client (Claude Desktop, Claude Code, Cursor, …) can extract documents and query
|
|
29
|
+
your DocReadi corpus without leaving the chat.
|
|
30
|
+
|
|
31
|
+
> **Status: v1, in progress** (see `../../MCP_SERVER_PLAN.md`). Tools land
|
|
32
|
+
> phase by phase; the table under "Tools" below lists what this release ships.
|
|
33
|
+
|
|
34
|
+
## How it works
|
|
35
|
+
|
|
36
|
+
Your MCP client spawns `docreadi-mcp` as a local subprocess and talks to it over
|
|
37
|
+
stdio. The server is a thin courier — all extraction/storage happens on the
|
|
38
|
+
hosted DocReadi API; the server just translates tool calls into HTTPS requests
|
|
39
|
+
and (for ingestion) reads local files. Your API key lives only in your client
|
|
40
|
+
config, never in this repo.
|
|
41
|
+
|
|
42
|
+
## Install / configure
|
|
43
|
+
|
|
44
|
+
Add the server to your MCP client config with your DocReadi API key
|
|
45
|
+
(get one at **docreadi.com → Settings → API keys**):
|
|
46
|
+
|
|
47
|
+
```jsonc
|
|
48
|
+
{
|
|
49
|
+
"mcpServers": {
|
|
50
|
+
"docreadi": {
|
|
51
|
+
"command": "uvx",
|
|
52
|
+
"args": ["docreadi-mcp"],
|
|
53
|
+
"env": {
|
|
54
|
+
"DOCREADI_API_KEY": "dr_live_…"
|
|
55
|
+
// "DOCREADI_BASE_URL": "https://api.docreadi.com" // override if self-hosting
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
| Env var | Required | Default |
|
|
63
|
+
|---|---|---|
|
|
64
|
+
| `DOCREADI_API_KEY` | yes | — |
|
|
65
|
+
| `DOCREADI_BASE_URL` | no | `https://api.docreadi.com` |
|
|
66
|
+
|
|
67
|
+
## Tools
|
|
68
|
+
|
|
69
|
+
| Tool | What it does |
|
|
70
|
+
|---|---|
|
|
71
|
+
| `check_connection` | First-run self-test — is DocReadi reachable and is your API key valid? Never errors; returns a diagnostic. |
|
|
72
|
+
| `extract_document` | Extract structured data from a **local** PDF/JPG/PNG — uploads, waits for extraction, returns the fields + line items + totals + confidence. |
|
|
73
|
+
| `get_document` | Fetch a document's status + extracted data by id. |
|
|
74
|
+
| `extract_adhoc` | One-off, non-destructive extraction of caller-defined fields from a document already in DocReadi (e.g. "who signed these delivery notes?"). |
|
|
75
|
+
| `classify_document` | (Re)classify a document's type; returns the type + reasoning. |
|
|
76
|
+
| `search_documents` | Search/list the workspace's documents (by vendor, number, type, status, or date range) — a summary row per match. |
|
|
77
|
+
| `list_counterparties` | List vendors/customers (VAT, aliases, doc counts), paginated. |
|
|
78
|
+
| `list_reports` | List the workspace's saved reports. |
|
|
79
|
+
| `run_report` | Run a saved report and return its rows — the way to query/search the corpus. |
|
|
80
|
+
| `export_report_csv` | Run a saved report and write the result as a CSV to a local path. |
|
|
81
|
+
|
|
82
|
+
_(`search_documents` is the raw query surface; for richer column selection and
|
|
83
|
+
saved queries, author a report in the UI and use `run_report` /
|
|
84
|
+
`export_report_csv`.)_
|
|
85
|
+
|
|
86
|
+
## Development
|
|
87
|
+
|
|
88
|
+
This package lives in the DocReadi monorepo so it's reviewed and CI'd alongside
|
|
89
|
+
the API it wraps. Its layering keeps the dependency-free parts testable in the
|
|
90
|
+
main CI without the `mcp` SDK:
|
|
91
|
+
|
|
92
|
+
- `docreadi_mcp/config.py` — env config.
|
|
93
|
+
- `docreadi_mcp/catalog.py` — pure-data tool→endpoint registry (the maintenance
|
|
94
|
+
anchor: `tests/test_mcp_catalog_contract.py` binds it to `agent_guide.GUIDE`).
|
|
95
|
+
- `docreadi_mcp/client.py` — httpx client (tested with `MockTransport`).
|
|
96
|
+
- `docreadi_mcp/server.py` — thin FastMCP glue (lazy-imports the `mcp` SDK).
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
# from the repo root
|
|
100
|
+
pip install -e services/mcp # installs mcp + httpx
|
|
101
|
+
DOCREADI_API_KEY=dr_… python -m docreadi_mcp # run over stdio
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**Maintenance contract:** when a DocReadi API endpoint a tool wraps changes,
|
|
105
|
+
update `docreadi_mcp/catalog.py` (and the tool) — the contract test fails CI
|
|
106
|
+
otherwise. See `MCP_SERVER_PLAN.md` §5.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Publishing `docreadi-mcp` to PyPI
|
|
2
|
+
|
|
3
|
+
This is a **founder action** — it needs a PyPI account + an API token (a secret
|
|
4
|
+
that must never go in the repo, chat, or CI logs without a protected store). The
|
|
5
|
+
package itself is ready; this is the one step an agent can't do.
|
|
6
|
+
|
|
7
|
+
## One-time setup
|
|
8
|
+
|
|
9
|
+
1. Create a PyPI account at <https://pypi.org> (and ideally
|
|
10
|
+
<https://test.pypi.org> for a dry run).
|
|
11
|
+
2. Create a **project-scoped API token** (Account → API tokens). For the very
|
|
12
|
+
first upload the project doesn't exist yet, so use an account-scoped token,
|
|
13
|
+
then rotate to a project-scoped one afterwards.
|
|
14
|
+
3. Reserve the name by doing the first upload (the name `docreadi-mcp` is set in
|
|
15
|
+
`pyproject.toml`).
|
|
16
|
+
|
|
17
|
+
## Build + upload
|
|
18
|
+
|
|
19
|
+
From `services/mcp/` (in a clean venv):
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
python -m pip install --upgrade build twine
|
|
23
|
+
python -m build # writes dist/docreadi_mcp-<v>.tar.gz + .whl
|
|
24
|
+
python -m twine check dist/* # metadata sanity
|
|
25
|
+
# dry run first (optional):
|
|
26
|
+
python -m twine upload --repository testpypi dist/*
|
|
27
|
+
# real upload:
|
|
28
|
+
python -m twine upload dist/* # paste the token as the password (user: __token__)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Then verify the user-facing install path works from a clean machine:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
DOCREADI_API_KEY=dr_live_… uvx docreadi-mcp # should start and wait on stdio
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Releasing a new version
|
|
38
|
+
|
|
39
|
+
1. Bump the version in **both** `pyproject.toml` and
|
|
40
|
+
`docreadi_mcp/__init__.py` (`__version__`) — they must match (the
|
|
41
|
+
`User-Agent` header reports `__version__`).
|
|
42
|
+
2. `python -m build && python -m twine upload dist/*` (delete the old `dist/`
|
|
43
|
+
first).
|
|
44
|
+
3. PyPI is append-only — you cannot overwrite a version, only yank it. Never
|
|
45
|
+
reuse a version number.
|
|
46
|
+
|
|
47
|
+
## Notes
|
|
48
|
+
|
|
49
|
+
- **No registry listing yet** (open decision #3 in `MCP_SERVER_PLAN.md`) — list
|
|
50
|
+
in an MCP registry only after a hosted/remote v2.
|
|
51
|
+
- CI does **not** publish (no token in GitHub Actions). Publishing stays a
|
|
52
|
+
manual, founder-gated step until a release workflow with a protected token is
|
|
53
|
+
set up deliberately.
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# docreadi-mcp
|
|
2
|
+
|
|
3
|
+
A **local** [MCP](https://modelcontextprotocol.io) server for **DocReadi** —
|
|
4
|
+
document data extraction for finance. It runs on your machine, reads documents
|
|
5
|
+
off your disk, and calls the hosted DocReadi API with your API key, so an AI
|
|
6
|
+
client (Claude Desktop, Claude Code, Cursor, …) can extract documents and query
|
|
7
|
+
your DocReadi corpus without leaving the chat.
|
|
8
|
+
|
|
9
|
+
> **Status: v1, in progress** (see `../../MCP_SERVER_PLAN.md`). Tools land
|
|
10
|
+
> phase by phase; the table under "Tools" below lists what this release ships.
|
|
11
|
+
|
|
12
|
+
## How it works
|
|
13
|
+
|
|
14
|
+
Your MCP client spawns `docreadi-mcp` as a local subprocess and talks to it over
|
|
15
|
+
stdio. The server is a thin courier — all extraction/storage happens on the
|
|
16
|
+
hosted DocReadi API; the server just translates tool calls into HTTPS requests
|
|
17
|
+
and (for ingestion) reads local files. Your API key lives only in your client
|
|
18
|
+
config, never in this repo.
|
|
19
|
+
|
|
20
|
+
## Install / configure
|
|
21
|
+
|
|
22
|
+
Add the server to your MCP client config with your DocReadi API key
|
|
23
|
+
(get one at **docreadi.com → Settings → API keys**):
|
|
24
|
+
|
|
25
|
+
```jsonc
|
|
26
|
+
{
|
|
27
|
+
"mcpServers": {
|
|
28
|
+
"docreadi": {
|
|
29
|
+
"command": "uvx",
|
|
30
|
+
"args": ["docreadi-mcp"],
|
|
31
|
+
"env": {
|
|
32
|
+
"DOCREADI_API_KEY": "dr_live_…"
|
|
33
|
+
// "DOCREADI_BASE_URL": "https://api.docreadi.com" // override if self-hosting
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
| Env var | Required | Default |
|
|
41
|
+
|---|---|---|
|
|
42
|
+
| `DOCREADI_API_KEY` | yes | — |
|
|
43
|
+
| `DOCREADI_BASE_URL` | no | `https://api.docreadi.com` |
|
|
44
|
+
|
|
45
|
+
## Tools
|
|
46
|
+
|
|
47
|
+
| Tool | What it does |
|
|
48
|
+
|---|---|
|
|
49
|
+
| `check_connection` | First-run self-test — is DocReadi reachable and is your API key valid? Never errors; returns a diagnostic. |
|
|
50
|
+
| `extract_document` | Extract structured data from a **local** PDF/JPG/PNG — uploads, waits for extraction, returns the fields + line items + totals + confidence. |
|
|
51
|
+
| `get_document` | Fetch a document's status + extracted data by id. |
|
|
52
|
+
| `extract_adhoc` | One-off, non-destructive extraction of caller-defined fields from a document already in DocReadi (e.g. "who signed these delivery notes?"). |
|
|
53
|
+
| `classify_document` | (Re)classify a document's type; returns the type + reasoning. |
|
|
54
|
+
| `search_documents` | Search/list the workspace's documents (by vendor, number, type, status, or date range) — a summary row per match. |
|
|
55
|
+
| `list_counterparties` | List vendors/customers (VAT, aliases, doc counts), paginated. |
|
|
56
|
+
| `list_reports` | List the workspace's saved reports. |
|
|
57
|
+
| `run_report` | Run a saved report and return its rows — the way to query/search the corpus. |
|
|
58
|
+
| `export_report_csv` | Run a saved report and write the result as a CSV to a local path. |
|
|
59
|
+
|
|
60
|
+
_(`search_documents` is the raw query surface; for richer column selection and
|
|
61
|
+
saved queries, author a report in the UI and use `run_report` /
|
|
62
|
+
`export_report_csv`.)_
|
|
63
|
+
|
|
64
|
+
## Development
|
|
65
|
+
|
|
66
|
+
This package lives in the DocReadi monorepo so it's reviewed and CI'd alongside
|
|
67
|
+
the API it wraps. Its layering keeps the dependency-free parts testable in the
|
|
68
|
+
main CI without the `mcp` SDK:
|
|
69
|
+
|
|
70
|
+
- `docreadi_mcp/config.py` — env config.
|
|
71
|
+
- `docreadi_mcp/catalog.py` — pure-data tool→endpoint registry (the maintenance
|
|
72
|
+
anchor: `tests/test_mcp_catalog_contract.py` binds it to `agent_guide.GUIDE`).
|
|
73
|
+
- `docreadi_mcp/client.py` — httpx client (tested with `MockTransport`).
|
|
74
|
+
- `docreadi_mcp/server.py` — thin FastMCP glue (lazy-imports the `mcp` SDK).
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
# from the repo root
|
|
78
|
+
pip install -e services/mcp # installs mcp + httpx
|
|
79
|
+
DOCREADI_API_KEY=dr_… python -m docreadi_mcp # run over stdio
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
**Maintenance contract:** when a DocReadi API endpoint a tool wraps changes,
|
|
83
|
+
update `docreadi_mcp/catalog.py` (and the tool) — the contract test fails CI
|
|
84
|
+
otherwise. See `MCP_SERVER_PLAN.md` §5.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""DocReadi MCP server — local-first wrapper over the DocReadi REST API.
|
|
2
|
+
|
|
3
|
+
A small process the user's MCP client (Claude Desktop / Claude Code / …) spawns
|
|
4
|
+
locally. It reads files off the user's disk and calls the hosted DocReadi API
|
|
5
|
+
with the user's API key — so an agent can extract documents and query a corpus
|
|
6
|
+
without leaving the chat. See MCP_SERVER_PLAN.md.
|
|
7
|
+
|
|
8
|
+
Layering (deliberate, so the maintenance-contract + client tests run in CI
|
|
9
|
+
without the `mcp` SDK installed):
|
|
10
|
+
* config.py — env config (DOCREADI_API_KEY / DOCREADI_BASE_URL). No deps.
|
|
11
|
+
* catalog.py — pure-data tool→endpoint registry. No deps. The contract test
|
|
12
|
+
(tests/test_mcp_catalog_contract.py) binds it to agent_guide.GUIDE.
|
|
13
|
+
* client.py — httpx client wrapping the REST API. Tested with MockTransport.
|
|
14
|
+
* server.py — thin FastMCP glue (imports the `mcp` SDK lazily).
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Console entry point: `docreadi-mcp` (and `python -m docreadi_mcp`).
|
|
2
|
+
|
|
3
|
+
Runs the MCP server over stdio — the user's MCP client spawns this as a
|
|
4
|
+
subprocess. A missing API key (ConfigError) prints an actionable message and
|
|
5
|
+
exits non-zero rather than dumping a traceback.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import sys
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def main() -> int:
    """Build the MCP server and serve it until the client disconnects.

    Returns the process exit status: ``0`` once ``server.run()`` returns, or
    ``2`` when ``build_server()`` raises ``ConfigError`` — in that case the
    message is written to stderr instead of a traceback.
    """
    # Imported here rather than at module top so that importing this module
    # stays cheap and side-effect free.
    from .config import ConfigError

    try:
        from .server import build_server

        mcp_server = build_server()
    except ConfigError as config_problem:
        print(f"docreadi-mcp: {config_problem}", file=sys.stderr)
        return 2
    else:
        mcp_server.run()  # FastMCP defaults to stdio transport
        return 0


if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Tool → REST-endpoint catalog (pure data, no dependencies).
|
|
2
|
+
|
|
3
|
+
This is the **maintenance anchor** (MCP_SERVER_PLAN.md §5): every MCP tool
|
|
4
|
+
declares which DocReadi endpoint(s) it wraps, as the EXACT
|
|
5
|
+
``"METHOD /path"`` string that appears in ``agent_guide.GUIDE["endpoints"]``.
|
|
6
|
+
``tests/test_mcp_catalog_contract.py`` asserts every entry here resolves to a
|
|
7
|
+
real GUIDE endpoint — so a tool can never wrap an endpoint that was removed or
|
|
8
|
+
renamed without CI going red.
|
|
9
|
+
|
|
10
|
+
Keep this module dependency-free so that contract test runs in the main CI
|
|
11
|
+
without the `mcp` SDK installed.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
# tool name → the GUIDE endpoint(s) it calls, each "METHOD /path" verbatim from
|
|
16
|
+
# agent_guide.GUIDE. A tool may wrap more than one (e.g. submit + poll).
|
|
17
|
+
# Endpoints wrapped by more than one tool are spelled once, so the copies in
# the table below cannot drift apart.
_GET_DOCUMENT = "GET /ingest/document/{document_id}"
_LIST_REPORTS = "GET /api/v1/reports"

TOOL_ENDPOINTS: dict[str, list[str]] = {
    # Connectivity self-test: unauthenticated probe, then an authenticated read.
    "check_connection": ["GET /health", _LIST_REPORTS],
    # Ingestion: submit the file, then poll the document record.
    "extract_document": ["POST /ingest/process", _GET_DOCUMENT],
    # Single-document operations.
    "get_document": [_GET_DOCUMENT],
    "extract_adhoc": ["POST /api/v1/documents/{id}/extract-adhoc"],
    "classify_document": ["POST /classify"],
    # Listing / search.
    "search_documents": ["GET /api/v1/documents"],
    "list_counterparties": ["GET /api/v1/counterparties"],
    # Saved reports.
    "list_reports": [_LIST_REPORTS],
    "run_report": ["GET /api/v1/reports/{report_id}"],
    "export_report_csv": ["GET /api/v1/reports/{report_id}/csv"],
}
|
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
"""Thin synchronous HTTP client over the DocReadi REST API.
|
|
2
|
+
|
|
3
|
+
All the real work — extraction, storage — happens on the hosted API; this client
|
|
4
|
+
is a courier. It is synchronous (no pytest-asyncio in the repo's test env, and FastMCP runs
|
|
5
|
+
sync tool functions in a threadpool just fine). Errors map to a single clean,
|
|
6
|
+
user-facing ``DocReadiError`` so a tool failure reads as a sentence, not a stack
|
|
7
|
+
trace.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import time
|
|
12
|
+
from typing import Any, Callable
|
|
13
|
+
|
|
14
|
+
import httpx
|
|
15
|
+
|
|
16
|
+
from . import __version__
|
|
17
|
+
from .config import Config
|
|
18
|
+
from .errors import DocReadiError
|
|
19
|
+
|
|
20
|
+
# Terminal document statuses (see routes/ingest.py + the GUIDE status list).
|
|
21
|
+
_TERMINAL_OK = {"extracted", "approved"}
|
|
22
|
+
_TERMINAL_FAIL = {"failed", "voided", "rejected"}
|
|
23
|
+
|
|
24
|
+
__all__ = ["DocReadiClient", "DocReadiError"]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _format_http_error(resp: httpx.Response) -> str:
|
|
28
|
+
"""Turn a 4xx/5xx into a readable sentence. Prefers the API's own ``detail``
|
|
29
|
+
field; adds guidance for the auth/permission cases an agent will hit."""
|
|
30
|
+
detail = None
|
|
31
|
+
try:
|
|
32
|
+
body = resp.json()
|
|
33
|
+
if isinstance(body, dict):
|
|
34
|
+
detail = body.get("detail") or body.get("error")
|
|
35
|
+
except (ValueError, httpx.HTTPError):
|
|
36
|
+
detail = (resp.text or "").strip()[:300] or None
|
|
37
|
+
|
|
38
|
+
code = resp.status_code
|
|
39
|
+
if code == 401:
|
|
40
|
+
return "DocReadi rejected the API key (401). Check DOCREADI_API_KEY in your MCP config."
|
|
41
|
+
if code == 403:
|
|
42
|
+
return (
|
|
43
|
+
f"DocReadi denied this request (403){f': {detail}' if detail else ''}. "
|
|
44
|
+
"The key may be a read-only (reviewer) key, or lack permission for this action."
|
|
45
|
+
)
|
|
46
|
+
if code == 404:
|
|
47
|
+
return f"Not found (404){f': {detail}' if detail else ''}."
|
|
48
|
+
if code == 429:
|
|
49
|
+
return "DocReadi rate-limited this request (429). Wait a moment and retry."
|
|
50
|
+
return f"DocReadi API error ({code}){f': {detail}' if detail else ''}."
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class DocReadiClient:
    """Synchronous courier for the DocReadi REST API.

    Construct from a ``Config``; pass a custom ``transport`` (e.g.
    ``httpx.MockTransport``) in tests. Transport failures and 4xx/5xx
    responses are raised as ``DocReadiError`` carrying a readable sentence.
    Usable as a context manager: leaving the ``with`` block closes the
    underlying ``httpx.Client``.
    """

    def __init__(
        self,
        config: Config,
        *,
        transport: httpx.BaseTransport | None = None,
        timeout: float = 60.0,
    ) -> None:
        """Create the underlying ``httpx.Client``.

        Args:
            config: Supplies the base URL and the API key (sent as the
                ``X-Api-Key`` header on every request).
            transport: Optional httpx transport override, for tests.
            timeout: httpx timeout in seconds, applied to every request.
        """
        self._config = config
        self._http = httpx.Client(
            base_url=config.base_url,
            headers={
                "X-Api-Key": config.api_key,
                "User-Agent": f"docreadi-mcp/{__version__}",
                "Accept": "application/json",
            },
            timeout=timeout,
            transport=transport,
        )

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._http.close()

    def __enter__(self) -> "DocReadiClient":
        return self

    def __exit__(self, *exc: object) -> None:
        self.close()

    @staticmethod
    def _path_segment(value: object) -> str:
        """Percent-encode ``value`` so it occupies exactly one URL path segment.

        Ids arrive from the calling agent; without escaping, a value holding
        ``/``, ``?`` or ``#`` would change which endpoint the request reaches.
        Unreserved characters (letters, digits, ``-._~``) pass through
        unchanged, so ordinary ids produce the same path as before.
        """
        from urllib.parse import quote

        return quote(str(value), safe="")

    def _request(self, method: str, path: str, **kwargs: Any) -> Any:
        """Send one request and decode the response.

        Returns ``None`` for an empty body, the decoded JSON when the body
        parses as JSON, and the raw text otherwise.

        Raises:
            DocReadiError: on any ``httpx.HTTPError`` while sending, or on a
                response with status >= 400.
        """
        try:
            resp = self._http.request(method, path, **kwargs)
        except httpx.HTTPError as exc:
            raise DocReadiError(
                f"Could not reach DocReadi at {self._config.base_url}: {exc}"
            ) from exc
        if resp.status_code >= 400:
            raise DocReadiError(_format_http_error(resp))
        if not resp.content:
            return None
        try:
            return resp.json()
        except ValueError:
            return resp.text

    # ── Tools (one method per wrapped endpoint) ──────────────────────────────

    def check_connection(self) -> dict:
        """Run two cheap probes for a first-run self-test.

        First ``GET /health`` (is the service reachable?), then
        ``GET /api/v1/reports`` (does the API key work?). ``DocReadiError``
        from either probe is captured rather than raised: the returned dict
        always has ``base_url``, ``service_reachable`` and ``authenticated``,
        plus ``health`` / ``message`` / ``error`` where applicable, so the
        agent can explain *what* is wrong.
        """
        result: dict[str, Any] = {
            "base_url": self._config.base_url,
            "service_reachable": False,
            "authenticated": False,
        }
        try:
            health = self._request("GET", "/health")
            result["service_reachable"] = True
            if isinstance(health, dict):
                result["health"] = health
        except DocReadiError as exc:
            # Unreachable service: the key check would be meaningless, stop here.
            result["error"] = str(exc)
            return result
        try:
            self._request("GET", "/api/v1/reports")
            result["authenticated"] = True
            result["message"] = "Connected — DocReadi is reachable and the API key is valid."
        except DocReadiError as exc:
            result["error"] = str(exc)
            result["message"] = (
                "DocReadi is reachable but the API key check failed — see error."
            )
        return result

    def get_document(self, document_id: str) -> Any:
        """GET /ingest/document/{document_id} — status + extracted data."""
        return self._request(
            "GET", f"/ingest/document/{self._path_segment(document_id)}"
        )

    def ingest(
        self,
        file_bytes: bytes,
        filename: str,
        content_type: str,
        *,
        source: str = "mcp",
        document_type: str | None = None,
        template_id: str | None = None,
    ) -> Any:
        """POST /ingest/process — upload a document. Returns immediately with
        ``{document_id, status: ...}``; the pipeline runs in the background.

        ``document_type`` and ``template_id`` are only sent when truthy.
        """
        files = {"file": (filename, file_bytes, content_type)}
        data: dict[str, str] = {"source": source}
        if document_type:
            data["document_type"] = document_type
        if template_id:
            data["template_id"] = template_id
        return self._request("POST", "/ingest/process", files=files, data=data)

    def extract_and_wait(
        self,
        file_bytes: bytes,
        filename: str,
        content_type: str,
        *,
        document_type: str | None = None,
        template_id: str | None = None,
        poll_interval: float = 2.0,
        timeout: float = 180.0,
        _sleep: Callable[[float], None] = time.sleep,
        _clock: Callable[[], float] = time.monotonic,
    ) -> Any:
        """Upload a document and poll until extraction finishes.

        Returns the final document record once its status is one of the
        successful terminal statuses.

        Args:
            poll_interval: Seconds to sleep between status fetches.
            timeout: Seconds after which polling gives up.
            _sleep, _clock: Injection points for tests.

        Raises:
            DocReadiError: if ingest returns no ``document_id``, if the
                document ends in a failed terminal status, or if it has not
                finished within ``timeout`` (the doc still exists — fetch it
                later with get_document).
        """
        ingested = self.ingest(
            file_bytes,
            filename,
            content_type,
            document_type=document_type,
            template_id=template_id,
        )
        doc_id = ingested.get("document_id") if isinstance(ingested, dict) else None
        if not doc_id:
            raise DocReadiError(f"Ingest did not return a document_id (got {ingested!r}).")

        deadline = _clock() + timeout
        while True:
            current = self.get_document(doc_id)
            status = (
                str(current.get("status") or "").lower()
                if isinstance(current, dict)
                else ""
            )
            if status in _TERMINAL_OK:
                return current
            if status in _TERMINAL_FAIL:
                # A non-empty status can only have come from a dict record.
                err = current.get("error") or current.get("error_message")
                raise DocReadiError(
                    f"Document {doc_id} ended in status '{status}'"
                    + (f": {err}" if err else "") + "."
                )
            # The deadline is checked after a fetch, so there is always at
            # least one status check even with a zero timeout.
            if _clock() >= deadline:
                raise DocReadiError(
                    f"Extraction of {doc_id} did not finish within {timeout:.0f}s "
                    f"(last status '{status or 'unknown'}'). It is still processing — "
                    f"fetch it later with get_document."
                )
            _sleep(poll_interval)

    def extract_adhoc(
        self,
        document_id: str,
        fields: list,
        *,
        system_instructions: str | None = None,
    ) -> Any:
        """POST /api/v1/documents/{id}/extract-adhoc — one-off, non-destructive
        extraction of caller-defined fields from an already-ingested document.

        ``system_instructions`` is only sent when truthy.
        """
        body: dict[str, Any] = {"fields": fields}
        if system_instructions:
            body["system_instructions"] = system_instructions
        return self._request(
            "POST",
            f"/api/v1/documents/{self._path_segment(document_id)}/extract-adhoc",
            json=body,
        )

    def classify(self, document_id: str) -> Any:
        """POST /classify — (re)classify a document's type."""
        return self._request("POST", "/classify", json={"document_id": document_id})

    def list_counterparties(
        self, *, page: int = 1, page_size: int = 100, status: str | None = None
    ) -> Any:
        """GET /api/v1/counterparties — paginated vendors/customers.

        ``status`` is only sent as a query parameter when truthy.
        """
        params: dict[str, Any] = {"page": page, "page_size": page_size}
        if status:
            params["status"] = status
        return self._request("GET", "/api/v1/counterparties", params=params)

    def list_documents(
        self,
        *,
        page: int = 1,
        page_size: int = 100,
        status: str | None = None,
        document_type: str | None = None,
        q: str | None = None,
        created_after: str | None = None,
        created_before: str | None = None,
    ) -> Any:
        """GET /api/v1/documents — list/search documents (newest first).

        Each optional filter is only sent as a query parameter when truthy.
        """
        params: dict[str, Any] = {"page": page, "page_size": page_size}
        if status:
            params["status"] = status
        if document_type:
            params["document_type"] = document_type
        if q:
            params["q"] = q
        if created_after:
            params["created_after"] = created_after
        if created_before:
            params["created_before"] = created_before
        return self._request("GET", "/api/v1/documents", params=params)

    def list_reports(self) -> Any:
        """GET /api/v1/reports — the workspace's saved reports."""
        return self._request("GET", "/api/v1/reports")

    def run_report(
        self, report_id: str, *, page: int = 1, page_size: int = 100, dedup: str = "primary"
    ) -> Any:
        """GET /api/v1/reports/{report_id} — run a saved report, return rows."""
        params = {"page": page, "page_size": page_size, "dedup": dedup}
        return self._request(
            "GET", f"/api/v1/reports/{self._path_segment(report_id)}", params=params
        )

    def report_csv(self, report_id: str, *, dedup: str = "primary") -> str:
        """GET /api/v1/reports/{report_id}/csv — the report as CSV text.

        Returns an empty string when the response body is empty.
        """
        out = self._request(
            "GET",
            f"/api/v1/reports/{self._path_segment(report_id)}/csv",
            params={"dedup": dedup},
        )
        if out is None:
            # _request reports an empty body as None; str(None) would hand the
            # caller the literal text "None" to write into the CSV file.
            return ""
        return out if isinstance(out, str) else str(out)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Runtime config for the DocReadi MCP server — read from the environment.
|
|
2
|
+
|
|
3
|
+
The user supplies these in their MCP client config (see services/mcp/README.md):
|
|
4
|
+
|
|
5
|
+
"env": { "DOCREADI_API_KEY": "dr_live_…", "DOCREADI_BASE_URL": "https://…" }
|
|
6
|
+
|
|
7
|
+
No secrets are ever read from or written to the repo — the key lives only in the
|
|
8
|
+
user's local client config.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
|
|
15
|
+
DEFAULT_BASE_URL = "https://api.docreadi.com"


@dataclass(frozen=True)
class Config:
    """Immutable runtime settings, as produced by ``load_config``."""

    api_key: str  # stripped of surrounding whitespace; never empty
    base_url: str  # no surrounding whitespace and no trailing "/"


class ConfigError(RuntimeError):
    """Raised when required configuration is missing — surfaced to the user at
    server startup with an actionable message."""


def load_config(env: dict | None = None) -> Config:
    """Build a Config from the environment.

    Args:
        env: Mapping to read instead of ``os.environ`` (for tests).

    Returns:
        A ``Config`` whose ``base_url`` is ``DOCREADI_BASE_URL`` with
        surrounding whitespace and trailing slashes removed, or
        ``DEFAULT_BASE_URL`` when that variable is unset or normalises to
        nothing.

    Raises:
        ConfigError: when ``DOCREADI_API_KEY`` is absent or blank.
    """
    src = os.environ if env is None else env
    api_key = (src.get("DOCREADI_API_KEY") or "").strip()
    if not api_key:
        raise ConfigError(
            "DOCREADI_API_KEY is not set. Add it to your MCP client config under "
            'the docreadi server\'s "env" (get a key at '
            "https://docreadi.com → Settings → API keys)."
        )
    # Normalise BEFORE applying the default: a whitespace-only or "/"-only
    # value is truthy as given but empty once cleaned, and an empty base URL
    # would make every request fail.
    base_url = (src.get("DOCREADI_BASE_URL") or "").strip().rstrip("/")
    if not base_url:
        base_url = DEFAULT_BASE_URL
    return Config(api_key=api_key, base_url=base_url)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Shared, dependency-free error type.
|
|
2
|
+
|
|
3
|
+
Lives in its own module so both `client.py` (httpx) and `files.py` (pure
|
|
4
|
+
stdlib) can raise it without `files.py` pulling in httpx.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DocReadiError(RuntimeError):
    """User-facing failure of a DocReadi operation.

    The message is surfaced as the MCP tool's error text, so it should read
    as a plain sentence rather than a stack trace.
    """
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Local file reading for the ingest tools (pure stdlib — no httpx/mcp).
|
|
2
|
+
|
|
3
|
+
The whole point of a *local* MCP server is disk access: the agent passes a path,
|
|
4
|
+
we read the bytes and POST them. Kept dependency-free so its test runs in the
|
|
5
|
+
main CI.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from .errors import DocReadiError
|
|
12
|
+
|
|
13
|
+
# DocReadi accepts PDF + common image formats (see routes/ingest.py).
CONTENT_TYPES: dict[str, str] = {
    ".pdf": "application/pdf",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
}

# Guard against an agent passing a huge/wrong path. 50 MB is comfortably above
# any real invoice; the API enforces its own limit too.
MAX_BYTES = 50 * 1024 * 1024


def read_document_file(file_path: str) -> tuple[str, bytes, str]:
    """Return (filename, content_bytes, content_type) for a local document.

    Args:
        file_path: path to the file; a leading ``~`` is expanded.

    Returns:
        The file's base name, its full contents, and the MIME type looked up
        from its (case-insensitive) extension in CONTENT_TYPES.

    Raises:
        DocReadiError: (clean, user-facing) for a missing path, a non-file, an
            unsupported extension, an oversize file, or a file the OS refuses
            to stat/read.
    """
    p = Path(file_path).expanduser()
    if not p.exists():
        raise DocReadiError(f"No file at {p}")
    if not p.is_file():
        raise DocReadiError(f"{p} is not a file (is it a directory?).")
    ext = p.suffix.lower()
    if ext not in CONTENT_TYPES:
        raise DocReadiError(
            f"Unsupported file type '{ext or '(none)'}' — DocReadi accepts "
            f"PDF, JPG, PNG."
        )
    try:
        size = p.stat().st_size
        if size > MAX_BYTES:
            raise DocReadiError(
                f"{p.name} is {size / 1024 / 1024:.1f} MB — over the "
                f"{MAX_BYTES // 1024 // 1024} MB limit."
            )
        content = p.read_bytes()
    except OSError as exc:
        # Permission problems, or the file vanishing between the checks above
        # and the read, should surface as a sentence, not a raw OSError.
        raise DocReadiError(f"Could not read {p}: {exc}") from exc
    return p.name, content, CONTENT_TYPES[ext]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def write_text_file(out_path: str, text: str) -> dict:
    """Write ``text`` to a local file as UTF-8, returning {path, bytes}.

    The text is encoded once and written in binary mode so the file holds
    exactly the bytes counted in the result — no platform newline translation
    (text mode would turn a CSV's ``\\r\\n`` into ``\\r\\r\\n`` on Windows).

    Args:
        out_path: destination file path; a leading ``~`` is expanded.
        text: content to write (non-str values are passed through ``str()``).

    Returns:
        ``{"path": <expanded path as str>, "bytes": <number of bytes written>}``.

    Raises:
        DocReadiError: when the target is a directory, its parent folder is
            missing, or the OS refuses the write (so the tool gives an
            actionable message instead of an OSError).
    """
    p = Path(out_path).expanduser()
    if p.is_dir():
        raise DocReadiError(f"{p} is a directory — give a file path to write to.")
    if not p.parent.exists():
        raise DocReadiError(f"The folder {p.parent} does not exist.")
    data = text if isinstance(text, str) else str(text)
    payload = data.encode("utf-8")
    try:
        p.write_bytes(payload)
    except OSError as exc:
        raise DocReadiError(f"Could not write {p}: {exc}") from exc
    return {"path": str(p), "bytes": len(payload)}
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""FastMCP glue — registers the DocReadi tools and runs over stdio.
|
|
2
|
+
|
|
3
|
+
Deliberately thin: each tool is a few lines that call `DocReadiClient`. The
|
|
4
|
+
`mcp` SDK is imported lazily inside `build_server` so the rest of the package
|
|
5
|
+
(config / client / catalog) and their tests don't require the SDK installed.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from .catalog import TOOL_ENDPOINTS
|
|
10
|
+
from .client import DocReadiClient
|
|
11
|
+
from .config import Config, load_config
|
|
12
|
+
from .files import read_document_file, write_text_file
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def build_server(client: DocReadiClient | None = None, config: Config | None = None):
    """Create the FastMCP server and register every DocReadi tool on it.

    Pass ``client`` to inject a ready-made DocReadiClient (tests use this to
    exercise the SDK wiring without real config or network). When ``client``
    is None one is built from ``config``, or from ``load_config()`` if that
    is not given either.

    Raises RuntimeError if a registered tool name is missing from
    ``catalog.TOOL_ENDPOINTS``.
    """
    # Settle the client before importing the SDK, so a missing
    # DOCREADI_API_KEY is reported as the friendly ConfigError first.
    if client is None:
        settings = config or load_config()
        client = DocReadiClient(settings)

    from mcp.server.fastmcp import FastMCP  # lazy — only needed to actually run

    server = FastMCP("docreadi")

    @server.tool()
    def check_connection() -> dict:
        """Check that DocReadi is reachable and your API key works — run this
        first if anything seems wrong. Returns {service_reachable, authenticated,
        message, …}; it never errors, so use it to diagnose a bad DOCREADI_API_KEY
        or DOCREADI_BASE_URL before trying other tools."""
        return client.check_connection()

    @server.tool()
    def extract_document(
        file_path: str, document_type: str = "", template_id: str = ""
    ) -> dict:
        """Extract structured data from a local document file (PDF, JPG, or PNG)
        on this machine. Uploads it to DocReadi, waits for extraction to finish
        (usually well under a minute), and returns the document's status and the
        structured extracted data (fields, line items, totals, confidence).

        Args:
            file_path: path to the document file on this machine.
            document_type: optional — force a type (e.g. 'ap_invoice'); leave
                empty to auto-detect.
            template_id: optional — id of a custom extraction template to use.
        """
        # Empty strings mean "not supplied" and are sent as None.
        forced_type = document_type or None
        template = template_id or None
        filename, content, content_type = read_document_file(file_path)
        return client.extract_and_wait(
            content,
            filename,
            content_type,
            document_type=forced_type,
            template_id=template,
        )

    @server.tool()
    def get_document(document_id: str) -> dict:
        """Fetch a DocReadi document by its id: its processing status and, once
        extraction has finished, the structured extracted data.

        Args:
            document_id: the DocReadi document UUID (e.g. returned when a
                document was ingested).
        """
        return client.get_document(document_id)

    @server.tool()
    def extract_adhoc(
        document_id: str, fields: list, system_instructions: str = ""
    ) -> dict:
        """One-off, non-destructive extraction of specific fields from a document
        that is already in DocReadi — without changing its stored extraction.
        Use it to answer an ad-hoc question ("who signed these delivery notes?").

        Args:
            document_id: the DocReadi document UUID.
            fields: the fields to extract — a list of
                {"name": str, "type": str, "description": str} objects
                (type one of string/number/date/boolean/list_of_strings).
            system_instructions: optional extra guidance for the model.
        """
        guidance = system_instructions or None
        return client.extract_adhoc(document_id, fields, system_instructions=guidance)

    @server.tool()
    def classify_document(document_id: str) -> dict:
        """Classify (or re-classify) a document already in DocReadi — returns the
        detected document_type and the model's reasoning. (Classification runs
        automatically during ingestion; use this only to re-run it.)

        Args:
            document_id: the DocReadi document UUID.
        """
        return client.classify(document_id)

    @server.tool()
    def search_documents(
        q: str = "",
        status: str = "",
        document_type: str = "",
        created_after: str = "",
        created_before: str = "",
        page: int = 1,
        page_size: int = 100,
    ) -> dict:
        """Search/list the documents in this DocReadi workspace (newest first).
        The quick way to find documents by vendor, number, type, status, or date
        — returns a summary row per document (id, type, status, number, vendor,
        date, total, currency, filename). Use get_document for the full data, or
        a saved report (list_reports / run_report) for richer columns.

        Args:
            q: free-text match over vendor name / document number / filename.
            status: filter by processing status (e.g. 'extracted', 'approved',
                'flagged').
            document_type: filter by type (e.g. 'ap_invoice').
            created_after: only documents created on/after this date (YYYY-MM-DD).
            created_before: only documents created on/before this date
                (YYYY-MM-DD, inclusive).
            page: 1-based page number.
            page_size: rows per page (max 500).
        """
        # Blank filters are forwarded as None.
        filters = {
            "status": status or None,
            "document_type": document_type or None,
            "q": q or None,
            "created_after": created_after or None,
            "created_before": created_before or None,
        }
        return client.list_documents(page=page, page_size=page_size, **filters)

    @server.tool()
    def list_counterparties(
        page: int = 1, page_size: int = 100, status: str = ""
    ) -> dict:
        """List this workspace's counterparties (vendors / customers) with their
        VAT numbers, aliases, and document counts. Paginated.

        Args:
            page: 1-based page number.
            page_size: rows per page (max 500).
            status: optional filter — 'confirmed' or 'candidate'.
        """
        status_filter = status or None
        return client.list_counterparties(
            page=page, page_size=page_size, status=status_filter
        )

    @server.tool()
    def list_reports() -> list:
        """List the saved reports in this DocReadi workspace (id, name,
        description, columns). Use run_report to fetch a report's rows — saved
        reports are how you query/search the document corpus."""
        return client.list_reports()

    @server.tool()
    def run_report(
        report_id: str, page: int = 1, page_size: int = 100, dedup: str = "primary"
    ) -> dict:
        """Run a saved report and return its rows — the way to query/search the
        documents in this workspace (e.g. "last month's approved AP invoices").

        Args:
            report_id: the saved report's id (from list_reports).
            page: 1-based page number.
            page_size: rows per page.
            dedup: 'primary' (default, one row per document), 'all', or
                'duplicates'.
        """
        return client.run_report(report_id, page=page, page_size=page_size, dedup=dedup)

    @server.tool()
    def export_report_csv(report_id: str, out_path: str, dedup: str = "primary") -> dict:
        """Run a saved report and write the full result as a CSV file to a LOCAL
        path on this machine. Returns {path, bytes}.

        Args:
            report_id: the saved report's id (from list_reports).
            out_path: where to write the CSV on this machine (e.g.
                '/Users/me/Desktop/report.csv').
            dedup: 'primary' (default), 'all', or 'duplicates'.
        """
        return write_text_file(out_path, client.report_csv(report_id, dedup=dedup))

    # Guardrail: every registered tool must be declared in the catalog (which
    # the contract test binds to the API reference). Catches "added a tool,
    # forgot the catalog entry" at startup.
    undeclared = set(_tool_names(server)).difference(TOOL_ENDPOINTS)
    if undeclared:
        raise RuntimeError(
            f"MCP tools not declared in catalog.TOOL_ENDPOINTS: {sorted(undeclared)}"
        )

    return server
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _tool_names(mcp) -> list[str]:
|
|
208
|
+
"""Best-effort introspection of registered tool names across FastMCP
|
|
209
|
+
versions (the internal accessor has shifted)."""
|
|
210
|
+
mgr = getattr(mcp, "_tool_manager", None)
|
|
211
|
+
if mgr is not None and hasattr(mgr, "_tools"):
|
|
212
|
+
return list(mgr._tools.keys())
|
|
213
|
+
lister = getattr(mcp, "list_tools", None)
|
|
214
|
+
if callable(lister):
|
|
215
|
+
try:
|
|
216
|
+
return [t.name for t in lister()]
|
|
217
|
+
except Exception:
|
|
218
|
+
pass
|
|
219
|
+
return list(TOOL_ENDPOINTS) # fall back to the declared set
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "docreadi-mcp"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "MCP server for DocReadi — document data extraction for finance, in your AI client."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = { text = "Proprietary" }
|
|
8
|
+
authors = [{ name = "DocReadi", email = "support@docreadi.com" }]
|
|
9
|
+
keywords = ["mcp", "docreadi", "invoice", "extraction", "documents", "finance", "ocr"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 4 - Beta",
|
|
12
|
+
"Environment :: Console",
|
|
13
|
+
"Intended Audience :: Developers",
|
|
14
|
+
"Operating System :: OS Independent",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Topic :: Office/Business :: Financial",
|
|
17
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
18
|
+
]
|
|
19
|
+
dependencies = [
|
|
20
|
+
"mcp>=1.2.0",
|
|
21
|
+
"httpx>=0.27.0,<1",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.urls]
|
|
25
|
+
Homepage = "https://docreadi.com"
|
|
26
|
+
Documentation = "https://api.docreadi.com/api/docs-guide"
|
|
27
|
+
Repository = "https://github.com/greenmartian138/doc_intelligence"
|
|
28
|
+
|
|
29
|
+
[project.scripts]
|
|
30
|
+
docreadi-mcp = "docreadi_mcp.__main__:main"
|
|
31
|
+
|
|
32
|
+
[build-system]
|
|
33
|
+
requires = ["hatchling"]
|
|
34
|
+
build-backend = "hatchling.build"
|
|
35
|
+
|
|
36
|
+
[tool.hatch.build.targets.wheel]
|
|
37
|
+
packages = ["docreadi_mcp"]
|