docreadi-mcp 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,67 @@
1
+ # Secrets — never commit
2
+ .env
3
+
4
+ # Claude Code local settings (gitignore the directory's contents
5
+ # via /* so the directory itself stays trackable, then carve out
6
+ # the project-shared settings.json — added 2026-05-26 for the
7
+ # team-wide ruff pre-push hook. Using `.claude/` here would
8
+ # block the !.claude/settings.json re-include because git can't
9
+ # re-include a file under an excluded directory).
10
+ .claude/*
11
+ !.claude/settings.json
12
+ # Custom subagents are shared, durable assets (brand-guardian, marketing-agent
13
+ # …) — version them. Re-include the dir, then the .md definitions inside it.
14
+ !.claude/agents/
15
+ !.claude/agents/*.md
16
+
17
+ # Python
18
+ __pycache__/
19
+ *.py[cod]
20
+ *.pyo
21
+ *.pyd
22
+ .Python
23
+ *.egg-info/
24
+ dist/
25
+ build/
26
+ .eggs/
27
+ *.egg
28
+ .venv/
29
+ venv/
30
+ env/
31
+
32
+ # Docker volumes / data
33
+ data/
34
+ *.sqlite
35
+ *.db
36
+
37
+ # IDE
38
+ .vscode/
39
+ .idea/
40
+ *.swp
41
+ *.swo
42
+
43
+ # OS
44
+ .DS_Store
45
+ Thumbs.db
46
+
47
+ # Logs
48
+ *.log
49
+ logs/
50
+
51
+ # Test artifacts
52
+ .pytest_cache/
53
+ .coverage
54
+ htmlcov/
55
+
56
+ # WSL metadata garbage from Windows copy-paste (foo.png:Zone.Identifier)
57
+ *:Zone.Identifier
58
+
59
+ # Generated/temp HTML renders of the markdown review docs — never track
60
+ PRODUCT_REVIEW.html
61
+ PRODUCT_REVIEW_tmp.html
62
+ *_tmp.html
63
+
64
+ # Local-only working / handover docs + ad-hoc probes — never track
65
+ HANDOVER.md
66
+ PIPELINE_CONCURRENCY_GOVERNOR_PLAN.md
67
+ test_ocr_concurrency.py
@@ -0,0 +1,106 @@
1
+ Metadata-Version: 2.4
2
+ Name: docreadi-mcp
3
+ Version: 0.1.0
4
+ Summary: MCP server for DocReadi — document data extraction for finance, in your AI client.
5
+ Project-URL: Homepage, https://docreadi.com
6
+ Project-URL: Documentation, https://api.docreadi.com/api/docs-guide
7
+ Project-URL: Repository, https://github.com/greenmartian138/doc_intelligence
8
+ Author-email: DocReadi <support@docreadi.com>
9
+ License: Proprietary
10
+ Keywords: docreadi,documents,extraction,finance,invoice,mcp,ocr
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Topic :: Office/Business :: Financial
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: httpx<1,>=0.27.0
20
+ Requires-Dist: mcp>=1.2.0
21
+ Description-Content-Type: text/markdown
22
+
23
+ # docreadi-mcp
24
+
25
+ A **local** [MCP](https://modelcontextprotocol.io) server for **DocReadi** —
26
+ document data extraction for finance. It runs on your machine, reads documents
27
+ off your disk, and calls the hosted DocReadi API with your API key, so an AI
28
+ client (Claude Desktop, Claude Code, Cursor, …) can extract documents and query
29
+ your DocReadi corpus without leaving the chat.
30
+
31
+ > **Status: v1, in progress** (see `../../MCP_SERVER_PLAN.md`). Tools land
32
+ > phase by phase; the "Tools" table below lists what this release provides.
33
+
34
+ ## How it works
35
+
36
+ Your MCP client spawns `docreadi-mcp` as a local subprocess and talks to it over
37
+ stdio. The server is a thin courier — all extraction/storage happens on the
38
+ hosted DocReadi API; the server just translates tool calls into HTTPS requests
39
+ and (for ingestion) reads local files. Your API key lives only in your client
40
+ config, never in this repo.
41
+
42
+ ## Install / configure
43
+
44
+ Add the server to your MCP client config with your DocReadi API key
45
+ (get one at **docreadi.com → Settings → API keys**):
46
+
47
+ ```jsonc
48
+ {
49
+ "mcpServers": {
50
+ "docreadi": {
51
+ "command": "uvx",
52
+ "args": ["docreadi-mcp"],
53
+ "env": {
54
+ "DOCREADI_API_KEY": "dr_live_…"
55
+ // "DOCREADI_BASE_URL": "https://api.docreadi.com" // override if self-hosting
56
+ }
57
+ }
58
+ }
59
+ }
60
+ ```
61
+
62
+ | Env var | Required | Default |
63
+ |---|---|---|
64
+ | `DOCREADI_API_KEY` | yes | — |
65
+ | `DOCREADI_BASE_URL` | no | `https://api.docreadi.com` |
66
+
67
+ ## Tools
68
+
69
+ | Tool | What it does |
70
+ |---|---|
71
+ | `check_connection` | First-run self-test — is DocReadi reachable and is your API key valid? Never errors; returns a diagnostic. |
72
+ | `extract_document` | Extract structured data from a **local** PDF/JPG/PNG — uploads, waits for extraction, returns the fields + line items + totals + confidence. |
73
+ | `get_document` | Fetch a document's status + extracted data by id. |
74
+ | `extract_adhoc` | One-off, non-destructive extraction of caller-defined fields from a document already in DocReadi (e.g. "who signed these delivery notes?"). |
75
+ | `classify_document` | (Re)classify a document's type; returns the type + reasoning. |
76
+ | `search_documents` | Search/list the workspace's documents (by vendor, number, type, status, or date range) — a summary row per match. |
77
+ | `list_counterparties` | List vendors/customers (VAT, aliases, doc counts), paginated. |
78
+ | `list_reports` | List the workspace's saved reports. |
79
+ | `run_report` | Run a saved report and return its rows — the way to query/search the corpus. |
80
+ | `export_report_csv` | Run a saved report and write the result as a CSV to a local path. |
81
+
82
+ _(`search_documents` is the raw query surface; for richer column selection and
83
+ saved queries, author a report in the UI and use `run_report` /
84
+ `export_report_csv`.)_
85
+
86
+ ## Development
87
+
88
+ This package lives in the DocReadi monorepo so it's reviewed and CI'd alongside
89
+ the API it wraps. Its layering keeps the dependency-free parts testable in the
90
+ main CI without the `mcp` SDK:
91
+
92
+ - `docreadi_mcp/config.py` — env config.
93
+ - `docreadi_mcp/catalog.py` — pure-data tool→endpoint registry (the maintenance
94
+ anchor: `tests/test_mcp_catalog_contract.py` binds it to `agent_guide.GUIDE`).
95
+ - `docreadi_mcp/client.py` — httpx client (tested with `MockTransport`).
96
+ - `docreadi_mcp/server.py` — thin FastMCP glue (lazy-imports the `mcp` SDK).
97
+
98
+ ```bash
99
+ # from the repo root
100
+ pip install -e services/mcp # installs mcp + httpx
101
+ DOCREADI_API_KEY=dr_… python -m docreadi_mcp # run over stdio
102
+ ```
103
+
104
+ **Maintenance contract:** when a DocReadi API endpoint a tool wraps changes,
105
+ update `docreadi_mcp/catalog.py` (and the tool) — the contract test fails CI
106
+ otherwise. See `MCP_SERVER_PLAN.md` §5.
@@ -0,0 +1,53 @@
1
+ # Publishing `docreadi-mcp` to PyPI
2
+
3
+ This is a **founder action** — it needs a PyPI account and an API token. The
4
+ token is a secret: never put it in the repo, in chat, or in CI logs, and keep it
5
+ only in a protected secret store. The package itself is ready; this is the one step an agent can't do.
6
+
7
+ ## One-time setup
8
+
9
+ 1. Create a PyPI account at <https://pypi.org> (and ideally
10
+ <https://test.pypi.org> for a dry run).
11
+ 2. Create a **project-scoped API token** (Account → API tokens). For the very
12
+ first upload the project doesn't exist yet, so use an account-scoped token,
13
+ then rotate to a project-scoped one afterwards.
14
+ 3. Reserve the name by doing the first upload (the name `docreadi-mcp` is set in
15
+ `pyproject.toml`).
16
+
17
+ ## Build + upload
18
+
19
+ From `services/mcp/` (in a clean venv):
20
+
21
+ ```bash
22
+ python -m pip install --upgrade build twine
23
+ python -m build # writes dist/docreadi_mcp-<v>.tar.gz + .whl
24
+ python -m twine check dist/* # metadata sanity
25
+ # dry run first (optional):
26
+ python -m twine upload --repository testpypi dist/*
27
+ # real upload:
28
+ python -m twine upload dist/* # paste the token as the password (user: __token__)
29
+ ```
30
+
31
+ Then verify the user-facing install path works from a clean machine:
32
+
33
+ ```bash
34
+ DOCREADI_API_KEY=dr_live_… uvx docreadi-mcp # should start and wait on stdio
35
+ ```
36
+
37
+ ## Releasing a new version
38
+
39
+ 1. Bump the version in **both** `pyproject.toml` and
40
+ `docreadi_mcp/__init__.py` (`__version__`) — they must match (the
41
+ `User-Agent` header reports `__version__`).
42
+ 2. `python -m build && python -m twine upload dist/*` (delete the old `dist/`
43
+ first).
44
+ 3. PyPI is append-only — you cannot overwrite a version, only yank it. Never
45
+ reuse a version number.
46
+
47
+ ## Notes
48
+
49
+ - **No registry listing yet** (open decision #3 in `MCP_SERVER_PLAN.md`) — list
50
+ in an MCP registry only after a hosted/remote v2.
51
+ - CI does **not** publish (no token in GitHub Actions). Publishing stays a
52
+ manual, founder-gated step until a release workflow with a protected token is
53
+ set up deliberately.
@@ -0,0 +1,84 @@
1
+ # docreadi-mcp
2
+
3
+ A **local** [MCP](https://modelcontextprotocol.io) server for **DocReadi** —
4
+ document data extraction for finance. It runs on your machine, reads documents
5
+ off your disk, and calls the hosted DocReadi API with your API key, so an AI
6
+ client (Claude Desktop, Claude Code, Cursor, …) can extract documents and query
7
+ your DocReadi corpus without leaving the chat.
8
+
9
+ > **Status: v1, in progress** (see `../../MCP_SERVER_PLAN.md`). Tools land
10
+ > phase by phase; the "Tools" table below lists what this release provides.
11
+
12
+ ## How it works
13
+
14
+ Your MCP client spawns `docreadi-mcp` as a local subprocess and talks to it over
15
+ stdio. The server is a thin courier — all extraction/storage happens on the
16
+ hosted DocReadi API; the server just translates tool calls into HTTPS requests
17
+ and (for ingestion) reads local files. Your API key lives only in your client
18
+ config, never in this repo.
19
+
20
+ ## Install / configure
21
+
22
+ Add the server to your MCP client config with your DocReadi API key
23
+ (get one at **docreadi.com → Settings → API keys**):
24
+
25
+ ```jsonc
26
+ {
27
+ "mcpServers": {
28
+ "docreadi": {
29
+ "command": "uvx",
30
+ "args": ["docreadi-mcp"],
31
+ "env": {
32
+ "DOCREADI_API_KEY": "dr_live_…"
33
+ // "DOCREADI_BASE_URL": "https://api.docreadi.com" // override if self-hosting
34
+ }
35
+ }
36
+ }
37
+ }
38
+ ```
39
+
40
+ | Env var | Required | Default |
41
+ |---|---|---|
42
+ | `DOCREADI_API_KEY` | yes | — |
43
+ | `DOCREADI_BASE_URL` | no | `https://api.docreadi.com` |
44
+
45
+ ## Tools
46
+
47
+ | Tool | What it does |
48
+ |---|---|
49
+ | `check_connection` | First-run self-test — is DocReadi reachable and is your API key valid? Never errors; returns a diagnostic. |
50
+ | `extract_document` | Extract structured data from a **local** PDF/JPG/PNG — uploads, waits for extraction, returns the fields + line items + totals + confidence. |
51
+ | `get_document` | Fetch a document's status + extracted data by id. |
52
+ | `extract_adhoc` | One-off, non-destructive extraction of caller-defined fields from a document already in DocReadi (e.g. "who signed these delivery notes?"). |
53
+ | `classify_document` | (Re)classify a document's type; returns the type + reasoning. |
54
+ | `search_documents` | Search/list the workspace's documents (by vendor, number, type, status, or date range) — a summary row per match. |
55
+ | `list_counterparties` | List vendors/customers (VAT, aliases, doc counts), paginated. |
56
+ | `list_reports` | List the workspace's saved reports. |
57
+ | `run_report` | Run a saved report and return its rows — the way to query/search the corpus. |
58
+ | `export_report_csv` | Run a saved report and write the result as a CSV to a local path. |
59
+
60
+ _(`search_documents` is the raw query surface; for richer column selection and
61
+ saved queries, author a report in the UI and use `run_report` /
62
+ `export_report_csv`.)_
63
+
64
+ ## Development
65
+
66
+ This package lives in the DocReadi monorepo so it's reviewed and CI'd alongside
67
+ the API it wraps. Its layering keeps the dependency-free parts testable in the
68
+ main CI without the `mcp` SDK:
69
+
70
+ - `docreadi_mcp/config.py` — env config.
71
+ - `docreadi_mcp/catalog.py` — pure-data tool→endpoint registry (the maintenance
72
+ anchor: `tests/test_mcp_catalog_contract.py` binds it to `agent_guide.GUIDE`).
73
+ - `docreadi_mcp/client.py` — httpx client (tested with `MockTransport`).
74
+ - `docreadi_mcp/server.py` — thin FastMCP glue (lazy-imports the `mcp` SDK).
75
+
76
+ ```bash
77
+ # from the repo root
78
+ pip install -e services/mcp # installs mcp + httpx
79
+ DOCREADI_API_KEY=dr_… python -m docreadi_mcp # run over stdio
80
+ ```
81
+
82
+ **Maintenance contract:** when a DocReadi API endpoint a tool wraps changes,
83
+ update `docreadi_mcp/catalog.py` (and the tool) — the contract test fails CI
84
+ otherwise. See `MCP_SERVER_PLAN.md` §5.
@@ -0,0 +1,17 @@
1
+ """DocReadi MCP server — local-first wrapper over the DocReadi REST API.
2
+
3
+ A small process the user's MCP client (Claude Desktop / Claude Code / …) spawns
4
+ locally. It reads files off the user's disk and calls the hosted DocReadi API
5
+ with the user's API key — so an agent can extract documents and query a corpus
6
+ without leaving the chat. See MCP_SERVER_PLAN.md.
7
+
8
+ Layering (deliberate, so the maintenance-contract + client tests run in CI
9
+ without the `mcp` SDK installed):
10
+ * config.py — env config (DOCREADI_API_KEY / DOCREADI_BASE_URL). No deps.
11
+ * catalog.py — pure-data tool→endpoint registry. No deps. The contract test
12
+ (tests/test_mcp_catalog_contract.py) binds it to agent_guide.GUIDE.
13
+ * client.py — httpx client wrapping the REST API. Tested with MockTransport.
14
+ * server.py — thin FastMCP glue (imports the `mcp` SDK lazily).
15
+ """
16
+
17
+ __version__ = "0.1.0"
@@ -0,0 +1,28 @@
1
+ """Console entry point: `docreadi-mcp` (and `python -m docreadi_mcp`).
2
+
3
+ Runs the MCP server over stdio — the user's MCP client spawns this as a
4
+ subprocess. A missing API key (ConfigError) prints an actionable message and
5
+ exits non-zero rather than dumping a traceback.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import sys
10
+
11
+
12
def main() -> int:
    """Build the MCP server, run it, and return a process exit code.

    Returns:
        2 when building the server raises ``ConfigError`` (the message is
        printed to stderr as a single line instead of a traceback), otherwise
        0 once ``server.run()`` has returned.
    """
    from .config import ConfigError

    try:
        # Imported inside the try so the import and the build share one
        # ConfigError handler.
        from .server import build_server

        app = build_server()
    except ConfigError as problem:
        print(f"docreadi-mcp: {problem}", file=sys.stderr)
        return 2
    else:
        app.run()  # FastMCP defaults to stdio transport
        return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,34 @@
1
+ """Tool → REST-endpoint catalog (pure data, no dependencies).
2
+
3
+ This is the **maintenance anchor** (MCP_SERVER_PLAN.md §5): every MCP tool
4
+ declares which DocReadi endpoint(s) it wraps, as the EXACT
5
+ ``"METHOD /path"`` string that appears in ``agent_guide.GUIDE["endpoints"]``.
6
+ ``tests/test_mcp_catalog_contract.py`` asserts every entry here resolves to a
7
+ real GUIDE endpoint — so a tool can never wrap an endpoint that was removed or
8
+ renamed without CI going red.
9
+
10
+ Keep this module dependency-free so that contract test runs in the main CI
11
+ without the `mcp` SDK installed.
12
+ """
13
+ from __future__ import annotations
14
+
15
# Registry: MCP tool name -> the GUIDE endpoint(s) that tool calls. Each entry
# is the "METHOD /path" string copied verbatim from agent_guide.GUIDE. A tool
# that makes several calls (e.g. submit + poll) lists every endpoint.
TOOL_ENDPOINTS: dict[str, list[str]] = {
    "check_connection": [
        "GET /health",
        "GET /api/v1/reports",
    ],
    "extract_document": [
        "POST /ingest/process",
        "GET /ingest/document/{document_id}",
    ],
    "get_document": [
        "GET /ingest/document/{document_id}",
    ],
    "extract_adhoc": [
        "POST /api/v1/documents/{id}/extract-adhoc",
    ],
    "classify_document": [
        "POST /classify",
    ],
    "search_documents": [
        "GET /api/v1/documents",
    ],
    "list_counterparties": [
        "GET /api/v1/counterparties",
    ],
    "list_reports": [
        "GET /api/v1/reports",
    ],
    "run_report": [
        "GET /api/v1/reports/{report_id}",
    ],
    "export_report_csv": [
        "GET /api/v1/reports/{report_id}/csv",
    ],
}
@@ -0,0 +1,272 @@
1
+ """Thin synchronous HTTP client over the DocReadi REST API.
2
+
3
+ All the real work — extraction, storage — happens on the hosted API; this client
4
+ is a courier. Sync (no pytest-asyncio in the repo's test env, and FastMCP runs
5
+ sync tool functions in a threadpool just fine). Errors map to a single clean,
6
+ user-facing ``DocReadiError`` so a tool failure reads as a sentence, not a stack
7
+ trace.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import time
12
+ from typing import Any, Callable
13
+
14
+ import httpx
15
+
16
+ from . import __version__
17
+ from .config import Config
18
+ from .errors import DocReadiError
19
+
20
+ # Terminal document statuses (see routes/ingest.py + the GUIDE status list).
21
+ _TERMINAL_OK = {"extracted", "approved"}
22
+ _TERMINAL_FAIL = {"failed", "voided", "rejected"}
23
+
24
+ __all__ = ["DocReadiClient", "DocReadiError"]
25
+
26
+
27
+ def _format_http_error(resp: httpx.Response) -> str:
28
+ """Turn a 4xx/5xx into a readable sentence. Prefers the API's own ``detail``
29
+ field; adds guidance for the auth/permission cases an agent will hit."""
30
+ detail = None
31
+ try:
32
+ body = resp.json()
33
+ if isinstance(body, dict):
34
+ detail = body.get("detail") or body.get("error")
35
+ except (ValueError, httpx.HTTPError):
36
+ detail = (resp.text or "").strip()[:300] or None
37
+
38
+ code = resp.status_code
39
+ if code == 401:
40
+ return "DocReadi rejected the API key (401). Check DOCREADI_API_KEY in your MCP config."
41
+ if code == 403:
42
+ return (
43
+ f"DocReadi denied this request (403){f': {detail}' if detail else ''}. "
44
+ "The key may be a read-only (reviewer) key, or lack permission for this action."
45
+ )
46
+ if code == 404:
47
+ return f"Not found (404){f': {detail}' if detail else ''}."
48
+ if code == 429:
49
+ return "DocReadi rate-limited this request (429). Wait a moment and retry."
50
+ return f"DocReadi API error ({code}){f': {detail}' if detail else ''}."
51
+
52
+
53
class DocReadiClient:
    """Synchronous wrapper around the DocReadi REST API.

    Construct it from a ``Config``. Tests can pass a custom ``transport``
    (e.g. ``httpx.MockTransport``) so no real network traffic happens. Failed
    requests are raised as ``DocReadiError`` with a readable message.
    """

    def __init__(
        self,
        config: Config,
        *,
        transport: httpx.BaseTransport | None = None,
        timeout: float = 60.0,
    ) -> None:
        """Create the underlying ``httpx.Client``.

        Args:
            config: Supplies ``base_url`` and ``api_key``.
            transport: Optional httpx transport override (used by tests).
            timeout: Timeout value handed to ``httpx.Client``.
        """
        self._config = config
        self._http = httpx.Client(
            base_url=config.base_url,
            headers={
                "X-Api-Key": config.api_key,
                "User-Agent": f"docreadi-mcp/{__version__}",
                "Accept": "application/json",
            },
            timeout=timeout,
            transport=transport,
        )

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._http.close()

    def __enter__(self) -> "DocReadiClient":
        return self

    def __exit__(self, *exc: object) -> None:
        self.close()

    @staticmethod
    def _path_segment(value: object) -> str:
        """Percent-encode a caller-supplied id for use as ONE URL path segment.

        Ids come from tool arguments; without escaping, a ``/``, ``?`` or ``#``
        inside one would change which endpoint (or query string) is requested.
        """
        from urllib.parse import quote  # local import: keeps the module header untouched

        return quote(str(value), safe="")

    def _send(self, method: str, path: str, **kwargs: Any) -> httpx.Response:
        """Issue a request and return the raw response.

        Raises:
            DocReadiError: when the request itself fails (``httpx.HTTPError``)
                or the response status is 400 or above.
        """
        try:
            resp = self._http.request(method, path, **kwargs)
        except httpx.HTTPError as exc:
            raise DocReadiError(
                f"Could not reach DocReadi at {self._config.base_url}: {exc}"
            ) from exc
        if resp.status_code >= 400:
            raise DocReadiError(_format_http_error(resp))
        return resp

    def _request(self, method: str, path: str, **kwargs: Any) -> Any:
        """Issue a request and decode the body.

        Returns:
            ``None`` for an empty body, the parsed JSON when the body is JSON,
            otherwise the body text.

        Raises:
            DocReadiError: see ``_send``.
        """
        resp = self._send(method, path, **kwargs)
        if not resp.content:
            return None
        try:
            return resp.json()
        except ValueError:
            return resp.text

    # ── Tools (one method per wrapped endpoint) ──────────────────────────────

    def check_connection(self) -> dict:
        """Run two cheap probes for a first-run self-test.

        First GET /health (is the service reachable?), then GET
        /api/v1/reports (does the API key work?). A ``DocReadiError`` from
        either probe is captured under ``"error"`` in the result rather than
        raised, so the caller can explain what is wrong.

        Returns:
            A dict with ``base_url``, ``service_reachable``, ``authenticated``
            and, depending on the outcome, ``health``, ``message``, ``error``.
        """
        result: dict[str, Any] = {
            "base_url": self._config.base_url,
            "service_reachable": False,
            "authenticated": False,
        }
        try:
            health = self._request("GET", "/health")
        except DocReadiError as exc:
            # Unreachable service: the key check would be meaningless, stop here.
            result["error"] = str(exc)
            return result
        result["service_reachable"] = True
        if isinstance(health, dict):
            result["health"] = health

        try:
            self._request("GET", "/api/v1/reports")
        except DocReadiError as exc:
            result["error"] = str(exc)
            result["message"] = (
                "DocReadi is reachable but the API key check failed — see error."
            )
        else:
            result["authenticated"] = True
            result["message"] = "Connected — DocReadi is reachable and the API key is valid."
        return result

    def get_document(self, document_id: str) -> Any:
        """GET /ingest/document/{document_id} — status + extracted data."""
        return self._request(
            "GET", f"/ingest/document/{self._path_segment(document_id)}"
        )

    def ingest(
        self,
        file_bytes: bytes,
        filename: str,
        content_type: str,
        *,
        source: str = "mcp",
        document_type: str | None = None,
        template_id: str | None = None,
    ) -> Any:
        """POST /ingest/process — upload a document as multipart form data.

        ``document_type`` and ``template_id`` are only sent when truthy.
        Returns immediately with ``{document_id, status: ...}``; the pipeline
        runs in the background.
        """
        files = {"file": (filename, file_bytes, content_type)}
        data: dict[str, str] = {"source": source}
        if document_type:
            data["document_type"] = document_type
        if template_id:
            data["template_id"] = template_id
        return self._request("POST", "/ingest/process", files=files, data=data)

    def extract_and_wait(
        self,
        file_bytes: bytes,
        filename: str,
        content_type: str,
        *,
        document_type: str | None = None,
        template_id: str | None = None,
        poll_interval: float = 2.0,
        timeout: float = 180.0,
        _sleep: Callable[[float], None] = time.sleep,
        _clock: Callable[[], float] = time.monotonic,
    ) -> Any:
        """Upload a document and poll until extraction finishes.

        Args:
            file_bytes, filename, content_type: The upload, passed to ``ingest``.
            document_type, template_id: Optional hints, passed to ``ingest``.
            poll_interval: Seconds to sleep between status checks.
            timeout: Seconds after which polling gives up.
            _sleep, _clock: Injectable sleep and clock functions (for tests).

        Returns:
            The final document record once its status is terminal-OK.

        Raises:
            DocReadiError: ingest returned no ``document_id``, the document
                ended in a failure status, or it did not finish within
                ``timeout`` (the doc still exists — fetch it later with
                get_document).
        """
        ingested = self.ingest(
            file_bytes,
            filename,
            content_type,
            document_type=document_type,
            template_id=template_id,
        )
        doc_id = ingested.get("document_id") if isinstance(ingested, dict) else None
        if not doc_id:
            raise DocReadiError(f"Ingest did not return a document_id (got {ingested!r}).")

        deadline = _clock() + timeout
        while True:
            current = self.get_document(doc_id)
            # Treat a non-dict reply as "no status yet" and keep polling.
            record = current if isinstance(current, dict) else {}
            status = str(record.get("status") or "").lower()
            if status in _TERMINAL_OK:
                return current
            if status in _TERMINAL_FAIL:
                err = record.get("error") or record.get("error_message")
                raise DocReadiError(
                    f"Document {doc_id} ended in status '{status}'"
                    + (f": {err}" if err else "") + "."
                )
            if _clock() >= deadline:
                raise DocReadiError(
                    f"Extraction of {doc_id} did not finish within {timeout:.0f}s "
                    f"(last status '{status or 'unknown'}'). It is still processing — "
                    f"fetch it later with get_document."
                )
            _sleep(poll_interval)

    def extract_adhoc(
        self,
        document_id: str,
        fields: list,
        *,
        system_instructions: str | None = None,
    ) -> Any:
        """POST /api/v1/documents/{id}/extract-adhoc — one-off, non-destructive
        extraction of caller-defined fields from an already-ingested document.

        ``system_instructions`` is only included in the JSON body when truthy.
        """
        body: dict[str, Any] = {"fields": fields}
        if system_instructions:
            body["system_instructions"] = system_instructions
        return self._request(
            "POST",
            f"/api/v1/documents/{self._path_segment(document_id)}/extract-adhoc",
            json=body,
        )

    def classify(self, document_id: str) -> Any:
        """POST /classify — (re)classify a document's type."""
        return self._request("POST", "/classify", json={"document_id": document_id})

    def list_counterparties(
        self, *, page: int = 1, page_size: int = 100, status: str | None = None
    ) -> Any:
        """GET /api/v1/counterparties — paginated vendors/customers.

        ``status`` is only sent as a query parameter when truthy.
        """
        params: dict[str, Any] = {"page": page, "page_size": page_size}
        if status:
            params["status"] = status
        return self._request("GET", "/api/v1/counterparties", params=params)

    def list_documents(
        self,
        *,
        page: int = 1,
        page_size: int = 100,
        status: str | None = None,
        document_type: str | None = None,
        q: str | None = None,
        created_after: str | None = None,
        created_before: str | None = None,
    ) -> Any:
        """GET /api/v1/documents — list/search documents (newest first).

        Each optional filter is only sent as a query parameter when truthy.
        """
        params: dict[str, Any] = {"page": page, "page_size": page_size}
        optional_filters = {
            "status": status,
            "document_type": document_type,
            "q": q,
            "created_after": created_after,
            "created_before": created_before,
        }
        params.update({key: value for key, value in optional_filters.items() if value})
        return self._request("GET", "/api/v1/documents", params=params)

    def list_reports(self) -> Any:
        """GET /api/v1/reports — the workspace's saved reports."""
        return self._request("GET", "/api/v1/reports")

    def run_report(
        self, report_id: str, *, page: int = 1, page_size: int = 100, dedup: str = "primary"
    ) -> Any:
        """GET /api/v1/reports/{report_id} — run a saved report, return rows."""
        params = {"page": page, "page_size": page_size, "dedup": dedup}
        return self._request(
            "GET", f"/api/v1/reports/{self._path_segment(report_id)}", params=params
        )

    def report_csv(self, report_id: str, *, dedup: str = "primary") -> str:
        """GET /api/v1/reports/{report_id}/csv — the report as CSV text.

        Returns the response text verbatim. It deliberately bypasses the JSON
        decoding in ``_request``: an empty body must come back as ``""`` (not
        the string ``"None"``), and a CSV whose content happens to parse as
        JSON must not be re-serialised.
        """
        resp = self._send(
            "GET",
            f"/api/v1/reports/{self._path_segment(report_id)}/csv",
            params={"dedup": dedup},
        )
        return resp.text
@@ -0,0 +1,41 @@
1
+ """Runtime config for the DocReadi MCP server — read from the environment.
2
+
3
+ The user supplies these in their MCP client config (see services/mcp/README.md):
4
+
5
+ "env": { "DOCREADI_API_KEY": "dr_live_…", "DOCREADI_BASE_URL": "https://…" }
6
+
7
+ No secrets are ever read from or written to the repo — the key lives only in the
8
+ user's local client config.
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ from dataclasses import dataclass
14
+
15
# Used when DOCREADI_BASE_URL is unset or blank.
DEFAULT_BASE_URL = "https://api.docreadi.com"


@dataclass(frozen=True)
class Config:
    """Immutable runtime settings for the MCP server."""

    # Value of DOCREADI_API_KEY with surrounding whitespace removed.
    api_key: str
    # API root with no trailing slash.
    base_url: str


class ConfigError(RuntimeError):
    """Raised when required configuration is missing — surfaced to the user at
    server startup with an actionable message."""


def load_config(env: dict | None = None) -> Config:
    """Build a ``Config`` from environment variables.

    Args:
        env: Mapping to read instead of ``os.environ`` (overridable for tests).

    Returns:
        A ``Config`` whose ``api_key`` is stripped and whose ``base_url`` is
        stripped, has no trailing slash, and falls back to ``DEFAULT_BASE_URL``
        when DOCREADI_BASE_URL is unset or blank.

    Raises:
        ConfigError: DOCREADI_API_KEY is absent or blank.
    """
    src = os.environ if env is None else env
    api_key = (src.get("DOCREADI_API_KEY") or "").strip()
    if not api_key:
        raise ConfigError(
            "DOCREADI_API_KEY is not set. Add it to your MCP client config under "
            'the docreadi server\'s "env" (get a key at '
            "https://docreadi.com → Settings → API keys)."
        )
    # Normalise BEFORE applying the default: a whitespace-only (or "/"-only)
    # value would otherwise survive the fallback and leave base_url empty.
    base_url = (src.get("DOCREADI_BASE_URL") or "").strip().rstrip("/")
    return Config(api_key=api_key, base_url=base_url or DEFAULT_BASE_URL)
@@ -0,0 +1,11 @@
1
+ """Shared, dependency-free error type.
2
+
3
+ Lives in its own module so both `client.py` (httpx) and `files.py` (pure
4
+ stdlib) can raise it without `files.py` pulling in httpx.
5
+ """
6
+ from __future__ import annotations
7
+
8
+
9
class DocReadiError(RuntimeError):
    """User-facing failure from a DocReadi operation.

    Its message is surfaced as the MCP tool's error text, so it should read as
    a sentence rather than a stack trace.
    """
@@ -0,0 +1,62 @@
1
+ """Local file reading for the ingest tools (pure stdlib — no httpx/mcp).
2
+
3
+ The whole point of a *local* MCP server is disk access: the agent passes a path,
4
+ we read the bytes and POST them. Kept dependency-free so its test runs in the
5
+ main CI.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+
11
+ from .errors import DocReadiError
12
+
13
# DocReadi accepts PDF + common image formats (see routes/ingest.py).
# Keys are lower-case file suffixes, values the MIME type sent with the upload.
CONTENT_TYPES: dict[str, str] = {
    ".pdf": "application/pdf",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
}

# Guard against an agent passing a huge/wrong path. 50 MB is comfortably above
# any real invoice; the API enforces its own limit too.
MAX_BYTES = 50 * 1024 * 1024


def read_document_file(file_path: str) -> tuple[str, bytes, str]:
    """Read a local document and return ``(filename, content_bytes, content_type)``.

    Args:
        file_path: Path to the file; a leading ``~`` is expanded.

    Raises:
        DocReadiError: the path is missing, is not a regular file, has an
            unsupported extension, is larger than ``MAX_BYTES``, or cannot be
            read (e.g. permission denied).
    """
    p = Path(file_path).expanduser()
    if not p.exists():
        raise DocReadiError(f"No file at {p}")
    if not p.is_file():
        raise DocReadiError(f"{p} is not a file (is it a directory?).")

    ext = p.suffix.lower()  # suffix matching is case-insensitive
    content_type = CONTENT_TYPES.get(ext)
    if content_type is None:
        raise DocReadiError(
            f"Unsupported file type '{ext or '(none)'}' — DocReadi accepts "
            f"PDF, JPG, PNG."
        )

    try:
        size = p.stat().st_size
        if size > MAX_BYTES:
            # DocReadiError is not an OSError, so it passes through the handler below.
            raise DocReadiError(
                f"{p.name} is {size / 1024 / 1024:.1f} MB — over the "
                f"{MAX_BYTES // 1024 // 1024} MB limit."
            )
        payload = p.read_bytes()
    except OSError as exc:
        # Keep the promise of a clean, user-facing error for OS-level failures
        # (unreadable file, file removed between the checks and the read, …).
        raise DocReadiError(f"Could not read {p}: {exc}") from exc
    return p.name, payload, content_type
49
+
50
+
51
def write_text_file(out_path: str, text: str) -> dict:
    """Write ``text`` to a local file as UTF-8 and return ``{path, bytes}``.

    The encoded bytes are written verbatim, with no platform newline
    translation, so line endings already present in ``text`` (e.g. ``\\r\\n``
    in CSV) are preserved and the reported ``bytes`` equals the size on disk.

    Args:
        out_path: Destination file path; a leading ``~`` is expanded.
        text: Content to write; a non-string value is converted with ``str``.

    Returns:
        ``{"path": <written path>, "bytes": <number of bytes written>}``.

    Raises:
        DocReadiError: the target is a directory, its parent folder does not
            exist, or the write fails at the OS level (e.g. permission denied).
    """
    p = Path(out_path).expanduser()
    if p.is_dir():
        raise DocReadiError(f"{p} is a directory — give a file path to write to.")
    if not p.parent.exists():
        raise DocReadiError(f"The folder {p.parent} does not exist.")

    data = text if isinstance(text, str) else str(text)
    encoded = data.encode("utf-8")
    try:
        p.write_bytes(encoded)
    except OSError as exc:
        # Surface an actionable message instead of a raw OSError.
        raise DocReadiError(f"Could not write {p}: {exc}") from exc
    return {"path": str(p), "bytes": len(encoded)}
@@ -0,0 +1,219 @@
1
+ """FastMCP glue — registers the DocReadi tools and runs over stdio.
2
+
3
+ Deliberately thin: each tool is a few lines that call `DocReadiClient`. The
4
+ `mcp` SDK is imported lazily inside `build_server` so the rest of the package
5
+ (config / client / catalog) and their tests don't require the SDK installed.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from .catalog import TOOL_ENDPOINTS
10
+ from .client import DocReadiClient
11
+ from .config import Config, load_config
12
+ from .files import read_document_file, write_text_file
13
+
14
+
15
def build_server(client: DocReadiClient | None = None, config: Config | None = None):
    """Construct the FastMCP server with every tool registered.

    ``client`` is injectable for tests (so the SDK wiring can be exercised
    without real config/network). In normal use both args are None and config is
    read from the environment.

    Args:
        client: pre-built DocReadiClient to use. When None, one is created from
            ``config`` (or from ``load_config()`` when ``config`` is also None).
        config: configuration used only when ``client`` is None.

    Returns:
        The FastMCP instance (named "docreadi") with all tools registered.

    Raises:
        RuntimeError: if a registered tool name is not a key of
            ``catalog.TOOL_ENDPOINTS``.
    """
    # Resolve config/client FIRST so a missing DOCREADI_API_KEY surfaces as the
    # friendly ConfigError before we touch the SDK.
    if client is None:
        client = DocReadiClient(config or load_config())

    from mcp.server.fastmcp import FastMCP  # lazy — only needed to actually run

    mcp = FastMCP("docreadi")

    # Convention for the tools below: optional string parameters default to ""
    # and are passed to the client as None via ``value or None``.

    @mcp.tool()
    def check_connection() -> dict:
        """Check that DocReadi is reachable and your API key works — run this
        first if anything seems wrong. Returns {service_reachable, authenticated,
        message, …}; it never errors, so use it to diagnose a bad DOCREADI_API_KEY
        or DOCREADI_BASE_URL before trying other tools."""
        return client.check_connection()

    @mcp.tool()
    def extract_document(
        file_path: str, document_type: str = "", template_id: str = ""
    ) -> dict:
        """Extract structured data from a local document file (PDF, JPG, or PNG)
        on this machine. Uploads it to DocReadi, waits for extraction to finish
        (usually well under a minute), and returns the document's status and the
        structured extracted data (fields, line items, totals, confidence).

        Args:
            file_path: path to the document file on this machine.
            document_type: optional — force a type (e.g. 'ap_invoice'); leave
                empty to auto-detect.
            template_id: optional — id of a custom extraction template to use.
        """
        # Local validation (path, extension, size) happens before any upload.
        filename, content, content_type = read_document_file(file_path)
        return client.extract_and_wait(
            content, filename, content_type,
            document_type=document_type or None,
            template_id=template_id or None,
        )

    @mcp.tool()
    def get_document(document_id: str) -> dict:
        """Fetch a DocReadi document by its id: its processing status and, once
        extraction has finished, the structured extracted data.

        Args:
            document_id: the DocReadi document UUID (e.g. returned when a
                document was ingested).
        """
        return client.get_document(document_id)

    @mcp.tool()
    def extract_adhoc(
        document_id: str, fields: list, system_instructions: str = ""
    ) -> dict:
        """One-off, non-destructive extraction of specific fields from a document
        that is already in DocReadi — without changing its stored extraction.
        Use it to answer an ad-hoc question ("who signed these delivery notes?").

        Args:
            document_id: the DocReadi document UUID.
            fields: the fields to extract — a list of
                {"name": str, "type": str, "description": str} objects
                (type one of string/number/date/boolean/list_of_strings).
            system_instructions: optional extra guidance for the model.
        """
        return client.extract_adhoc(
            document_id, fields, system_instructions=system_instructions or None
        )

    @mcp.tool()
    def classify_document(document_id: str) -> dict:
        """Classify (or re-classify) a document already in DocReadi — returns the
        detected document_type and the model's reasoning. (Classification runs
        automatically during ingestion; use this only to re-run it.)

        Args:
            document_id: the DocReadi document UUID.
        """
        return client.classify(document_id)

    @mcp.tool()
    def search_documents(
        q: str = "",
        status: str = "",
        document_type: str = "",
        created_after: str = "",
        created_before: str = "",
        page: int = 1,
        page_size: int = 100,
    ) -> dict:
        """Search/list the documents in this DocReadi workspace (newest first).
        The quick way to find documents by vendor, number, type, status, or date
        — returns a summary row per document (id, type, status, number, vendor,
        date, total, currency, filename). Use get_document for the full data, or
        a saved report (list_reports / run_report) for richer columns.

        Args:
            q: free-text match over vendor name / document number / filename.
            status: filter by processing status (e.g. 'extracted', 'approved',
                'flagged').
            document_type: filter by type (e.g. 'ap_invoice').
            created_after: only documents created on/after this date (YYYY-MM-DD).
            created_before: only documents created on/before this date
                (YYYY-MM-DD, inclusive).
            page: 1-based page number.
            page_size: rows per page (max 500).
        """
        # page/page_size are forwarded unchanged; no clamping is done here.
        return client.list_documents(
            page=page,
            page_size=page_size,
            status=status or None,
            document_type=document_type or None,
            q=q or None,
            created_after=created_after or None,
            created_before=created_before or None,
        )

    @mcp.tool()
    def list_counterparties(
        page: int = 1, page_size: int = 100, status: str = ""
    ) -> dict:
        """List this workspace's counterparties (vendors / customers) with their
        VAT numbers, aliases, and document counts. Paginated.

        Args:
            page: 1-based page number.
            page_size: rows per page (max 500).
            status: optional filter — 'confirmed' or 'candidate'.
        """
        return client.list_counterparties(
            page=page, page_size=page_size, status=status or None
        )

    @mcp.tool()
    def list_reports() -> list:
        """List the saved reports in this DocReadi workspace (id, name,
        description, columns). Use run_report to fetch a report's rows — saved
        reports are how you query/search the document corpus."""
        return client.list_reports()

    @mcp.tool()
    def run_report(
        report_id: str, page: int = 1, page_size: int = 100, dedup: str = "primary"
    ) -> dict:
        """Run a saved report and return its rows — the way to query/search the
        documents in this workspace (e.g. "last month's approved AP invoices").

        Args:
            report_id: the saved report's id (from list_reports).
            page: 1-based page number.
            page_size: rows per page.
            dedup: 'primary' (default, one row per document), 'all', or
                'duplicates'.
        """
        return client.run_report(
            report_id, page=page, page_size=page_size, dedup=dedup
        )

    @mcp.tool()
    def export_report_csv(report_id: str, out_path: str, dedup: str = "primary") -> dict:
        """Run a saved report and write the full result as a CSV file to a LOCAL
        path on this machine. Returns {path, bytes}.

        Args:
            report_id: the saved report's id (from list_reports).
            out_path: where to write the CSV on this machine (e.g.
                '/Users/me/Desktop/report.csv').
            dedup: 'primary' (default), 'all', or 'duplicates'.
        """
        # The report is fetched first, so a failed fetch never touches out_path.
        csv_text = client.report_csv(report_id, dedup=dedup)
        return write_text_file(out_path, csv_text)

    # Guardrail: every registered tool must be declared in the catalog (which
    # the contract test binds to the API reference). Catches "added a tool,
    # forgot the catalog entry" at startup.
    # NOTE: the check is one-directional (registered -> catalog), and if
    # _tool_names has to fall back to the declared set it cannot fail.
    registered = set(_tool_names(mcp))
    undeclared = registered - set(TOOL_ENDPOINTS)
    if undeclared:
        raise RuntimeError(
            f"MCP tools not declared in catalog.TOOL_ENDPOINTS: {sorted(undeclared)}"
        )

    return mcp
205
+
206
+
207
+ def _tool_names(mcp) -> list[str]:
208
+ """Best-effort introspection of registered tool names across FastMCP
209
+ versions (the internal accessor has shifted)."""
210
+ mgr = getattr(mcp, "_tool_manager", None)
211
+ if mgr is not None and hasattr(mgr, "_tools"):
212
+ return list(mgr._tools.keys())
213
+ lister = getattr(mcp, "list_tools", None)
214
+ if callable(lister):
215
+ try:
216
+ return [t.name for t in lister()]
217
+ except Exception:
218
+ pass
219
+ return list(TOOL_ENDPOINTS) # fall back to the declared set
@@ -0,0 +1,37 @@
1
# Build backend comes first, following the usual pyproject.toml section order
# ([build-system], [project], [project.*], [tool.*]).
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "docreadi-mcp"
version = "0.1.0"
description = "MCP server for DocReadi — document data extraction for finance, in your AI client."
readme = "README.md"
requires-python = ">=3.10"
license = { text = "Proprietary" }
authors = [{ name = "DocReadi", email = "support@docreadi.com" }]
keywords = ["mcp", "docreadi", "invoice", "extraction", "documents", "finance", "ocr"]
classifiers = [
    "Development Status :: 4 - Beta",
    "Environment :: Console",
    "Intended Audience :: Developers",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Topic :: Office/Business :: Financial",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
    "mcp>=1.2.0",
    "httpx>=0.27.0,<1",
]

[project.urls]
Homepage = "https://docreadi.com"
Documentation = "https://api.docreadi.com/api/docs-guide"
Repository = "https://github.com/greenmartian138/doc_intelligence"

[project.scripts]
# Console command `docreadi-mcp` -> docreadi_mcp.__main__:main
docreadi-mcp = "docreadi_mcp.__main__:main"

[tool.hatch.build.targets.wheel]
# The import package uses an underscore while the distribution name uses a
# hyphen, so the package directory is named explicitly.
packages = ["docreadi_mcp"]