localparse-mcp 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,56 @@
1
+ # --- Secrets (do NOT commit) ---
2
+ .env
3
+ .env.*
4
+ !.env.example
5
+ **/service-account.json
6
+ **/credentials.json
7
+ **/*.pem
8
+ **/*.key
9
+ # Supabase JWT/JWKS material and any local key dumps (the legacy "jtw" typo too).
10
+ **/supabase_jtw.json
11
+ **/supabase_jwt.json
12
+ **/*.jwks.json
13
+
14
+ # --- Downloaded data (large, regenerable) ---
15
+ **/downloads/
16
+ **/cases/
17
+ **/stores/
18
+ **/blocks.parquet
19
+ **/entities.db
20
+ **/index/
21
+
22
+ # --- Generated eval / audit reports ---
23
+ doc_extraction_v2/reports/
24
+ doc_extraction_v2/scripts/reports/
25
+
26
+ # --- Real-PDF test artifacts (regenerable, large) ---
27
+ doc_extraction_v2/tests/real_pdfs/reports/
28
+ .cache/doc-extraction-v2/
29
+
30
+ # --- Extraction artifacts (regenerable per parse) ---
31
+ **/figures/
32
+
33
+ # --- Python ---
34
+ .venv/
35
+ venv/
36
+ __pycache__/
37
+ *.pyc
38
+ *.pyo
39
+ *.egg-info/
40
+ build/
41
+ dist/
42
+ .pytest_cache/
43
+ .mypy_cache/
44
+ .ruff_cache/
45
+ .coverage
46
+ htmlcov/
47
+
48
+ # --- OS / editor ---
49
+ .DS_Store
50
+ Thumbs.db
51
+ .idea/
52
+ .vscode/
53
+ *.swp
54
+
55
+ # --- Logs ---
56
+ *.log
@@ -0,0 +1,87 @@
1
+ Metadata-Version: 2.4
2
+ Name: localparse-mcp
3
+ Version: 0.1.0
4
+ Summary: MCP server for LocalParse — accurate document parsing for Cursor, Claude Desktop, and other MCP clients.
5
+ Project-URL: Homepage, https://localparse.com
6
+ Project-URL: Source, https://github.com/stevencoveta/Agent-ingestor
7
+ Author: LocalParse
8
+ License: MIT
9
+ Keywords: document parsing,localparse,mcp,model context protocol,ocr,pdf
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: localparse>=0.1.0
12
+ Requires-Dist: mcp>=1.2.0
13
+ Requires-Dist: requests>=2.28
14
+ Description-Content-Type: text/markdown
15
+
16
+ # localparse-mcp
17
+
18
+ An [MCP](https://modelcontextprotocol.io) server that gives your coding agent
19
+ (Cursor, Claude Desktop, etc.) **accurate document parsing** via
20
+ [LocalParse](https://localparse.com) — table-detection recovery + arithmetic
21
+ identity checks — instead of letting it write throwaway OCR scripts.
22
+
23
+ ## Tools
24
+
25
+ | Tool | What it does |
26
+ |---|---|
27
+ | `parse_document(path, result_type)` | Parse a local file (PDF/image/PPTX/…). |
28
+ | `parse_url(url, result_type)` | Download a remote doc and parse it. |
29
+ | `parse_folder(path, case_id, resume, result_type)` | Ingest a whole folder into a named case; with `resume` (default), a re-run parses only new or changed files. |
30
+ | `get_structured(job_id)` | Fetch the structured ingestion contract for a job. |
31
+ | `case_manifest(case_id)` | List a case's ingested docs (path + hash + status). |
32
+ | `case_failures(case_id, include_resolved)` | List a case's failed docs for reingest. |
33
+
34
+ `result_type` ∈ `markdown` (default), `text`, `json`, `structured`.
35
+
36
+ ## Configure your client
37
+
38
+ You need a LocalParse API key (`LOCALPARSE_API_KEY`).
39
+
40
+ ### Cursor — `~/.cursor/mcp.json` (or project `.cursor/mcp.json`)
41
+
42
+ ```json
43
+ {
44
+ "mcpServers": {
45
+ "localparse": {
46
+ "command": "uvx",
47
+ "args": ["localparse-mcp"],
48
+ "env": { "LOCALPARSE_API_KEY": "lp-your-key" }
49
+ }
50
+ }
51
+ }
52
+ ```
53
+
54
+ ### Claude Desktop — `claude_desktop_config.json`
55
+
56
+ ```json
57
+ {
58
+ "mcpServers": {
59
+ "localparse": {
60
+ "command": "uvx",
61
+ "args": ["localparse-mcp"],
62
+ "env": { "LOCALPARSE_API_KEY": "lp-your-key" }
63
+ }
64
+ }
65
+ }
66
+ ```
67
+
68
+ Then ask the agent things like *"parse the PDFs in ./data-room into case acme and
69
+ tell me which tables don't reconcile."*
70
+
71
+ ## Local development (before it's on PyPI)
72
+
73
+ Run the server straight from the repo and point the client at it:
74
+
75
+ ```json
76
+ {
77
+ "mcpServers": {
78
+ "localparse": {
79
+ "command": "uv",
80
+ "args": ["run", "--directory", "/abs/path/to/Agent-ingestor", "localparse-mcp"],
81
+ "env": { "LOCALPARSE_API_KEY": "lp-your-key" }
82
+ }
83
+ }
84
+ }
85
+ ```
86
+
87
+ Point at a self-hosted parser by also setting `LOCALPARSE_BASE_URL`.
@@ -0,0 +1,72 @@
1
+ # localparse-mcp
2
+
3
+ An [MCP](https://modelcontextprotocol.io) server that gives your coding agent
4
+ (Cursor, Claude Desktop, etc.) **accurate document parsing** via
5
+ [LocalParse](https://localparse.com) — table-detection recovery + arithmetic
6
+ identity checks — instead of letting it write throwaway OCR scripts.
7
+
8
+ ## Tools
9
+
10
+ | Tool | What it does |
11
+ |---|---|
12
+ | `parse_document(path, result_type)` | Parse a local file (PDF/image/PPTX/…). |
13
+ | `parse_url(url, result_type)` | Download a remote doc and parse it. |
14
+ | `parse_folder(path, case_id, resume, result_type)` | Ingest a whole folder into a named case; with `resume` (default), a re-run parses only new or changed files. |
15
+ | `get_structured(job_id)` | Fetch the structured ingestion contract for a job. |
16
+ | `case_manifest(case_id)` | List a case's ingested docs (path + hash + status). |
17
+ | `case_failures(case_id, include_resolved)` | List a case's failed docs for reingest. |
18
+
19
+ `result_type` ∈ `markdown` (default), `text`, `json`, `structured`.
20
+
21
+ ## Configure your client
22
+
23
+ You need a LocalParse API key (`LOCALPARSE_API_KEY`).
24
+
25
+ ### Cursor — `~/.cursor/mcp.json` (or project `.cursor/mcp.json`)
26
+
27
+ ```json
28
+ {
29
+ "mcpServers": {
30
+ "localparse": {
31
+ "command": "uvx",
32
+ "args": ["localparse-mcp"],
33
+ "env": { "LOCALPARSE_API_KEY": "lp-your-key" }
34
+ }
35
+ }
36
+ }
37
+ ```
38
+
39
+ ### Claude Desktop — `claude_desktop_config.json`
40
+
41
+ ```json
42
+ {
43
+ "mcpServers": {
44
+ "localparse": {
45
+ "command": "uvx",
46
+ "args": ["localparse-mcp"],
47
+ "env": { "LOCALPARSE_API_KEY": "lp-your-key" }
48
+ }
49
+ }
50
+ }
51
+ ```
52
+
53
+ Then ask the agent things like *"parse the PDFs in ./data-room into case acme and
54
+ tell me which tables don't reconcile."*
55
+
56
+ ## Local development (before it's on PyPI)
57
+
58
+ Run the server straight from the repo and point the client at it:
59
+
60
+ ```json
61
+ {
62
+ "mcpServers": {
63
+ "localparse": {
64
+ "command": "uv",
65
+ "args": ["run", "--directory", "/abs/path/to/Agent-ingestor", "localparse-mcp"],
66
+ "env": { "LOCALPARSE_API_KEY": "lp-your-key" }
67
+ }
68
+ }
69
+ }
70
+ ```
71
+
72
+ Point at a self-hosted parser by also setting `LOCALPARSE_BASE_URL`.
@@ -0,0 +1,28 @@
1
+ [project]
2
+ name = "localparse-mcp"
3
+ version = "0.1.0"
4
+ description = "MCP server for LocalParse — accurate document parsing for Cursor, Claude Desktop, and other MCP clients."
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ license = { text = "MIT" }
8
+ authors = [{ name = "LocalParse" }]
9
+ keywords = ["mcp", "model context protocol", "document parsing", "pdf", "ocr", "localparse"]
10
+ dependencies = [
11
+ "localparse>=0.1.0",
12
+ "mcp>=1.2.0",
13
+ "requests>=2.28",
14
+ ]
15
+
16
+ [project.scripts]
17
+ localparse-mcp = "localparse_mcp.server:main"
18
+
19
+ [project.urls]
20
+ Homepage = "https://localparse.com"
21
+ Source = "https://github.com/stevencoveta/Agent-ingestor"
22
+
23
+ [build-system]
24
+ requires = ["hatchling"]
25
+ build-backend = "hatchling.build"
26
+
27
+ [tool.hatch.build.targets.wheel]
28
+ packages = ["src/localparse_mcp"]
@@ -0,0 +1,7 @@
1
+ """MCP server for LocalParse — accurate document parsing for MCP clients."""
2
+
3
+ from __future__ import annotations
4
+
5
+ __version__ = "0.1.0"
6
+
7
+ __all__ = ["__version__"]
@@ -0,0 +1,188 @@
1
+ """MCP server exposing LocalParse's accurate document parsing to MCP clients.
2
+
3
+ Run locally over stdio (how Cursor / Claude Desktop launch it):
4
+
5
+ LOCALPARSE_API_KEY=lp-... uvx localparse-mcp
6
+
7
+ The tools let an agent parse local files / folders / URLs with a real parser
8
+ (table-detection recovery + arithmetic identity checks) instead of writing
9
+ ad-hoc OCR scripts.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import os
16
+ import tempfile
17
+ from pathlib import Path
18
+ from urllib.parse import urlparse
19
+
20
+ import requests
21
+ from localparse import LocalParse, LocalParseError, ParseResult
22
+ from mcp.server.fastmcp import FastMCP
23
+
24
+ mcp = FastMCP("localparse")
25
+
26
+ _client: LocalParse | None = None
27
+
28
+
29
def _get_client() -> LocalParse:
    """Return the process-wide LocalParse client, creating it on first use.

    The endpoint comes from ``LOCALPARSE_BASE_URL``. When that variable is
    unset *or set to an empty string* the hosted API
    (``https://api.localparse.com``) is used, so a blank entry in an MCP
    client's ``env`` block cannot produce a client with no endpoint.

    ``LOCALPARSE_API_KEY`` is not read in this function and no key is passed
    to the constructor; the ``LocalParse`` constructor is presumably what
    picks the key up from the environment and raises when it is missing
    (confirm against the ``localparse`` package). The client is cached only
    after the constructor returns, so a failed construction is retried on the
    next call.
    """
    global _client
    if _client is None:
        # ``or`` instead of a ``.get`` default: an empty value also falls back.
        base_url = (
            os.environ.get("LOCALPARSE_BASE_URL") or "https://api.localparse.com"
        )
        _client = LocalParse(base_url=base_url)
    return _client
41
+
42
+
43
def _render(result: ParseResult, result_type: str) -> str:
    """Turn a parse result into the string handed back to the caller.

    ``markdown``/``md`` and ``text``/``txt`` select the matching attribute of
    *result* (a ``None`` or empty value becomes ``""``). Any other
    ``result_type`` yields ``result.data`` serialized as indented JSON with
    non-ASCII characters left unescaped.
    """
    wants_markdown = result_type in ("markdown", "md")
    wants_text = result_type in ("text", "txt")
    if not (wants_markdown or wants_text):
        return json.dumps(result.data, indent=2, ensure_ascii=False)
    rendered = result.markdown if wants_markdown else result.text
    return rendered or ""
49
+
50
+
51
@mcp.tool()
def parse_document(path: str, result_type: str = "markdown") -> str:
    """Parse a local document with LocalParse and return its contents.

    Use this instead of writing OCR/pdf-to-text scripts: LocalParse recovers
    tables a layout model would miss and arithmetic-checks numeric Totals, so
    the output is far more reliable on financial/tabular PDFs, scans, and decks.
    Supports PDF, images, PPTX, and more. ``result_type`` is one of ``markdown``
    (default), ``text``, ``json``, or ``structured`` (the ingestion contract).
    """
    # NOTE(review): the docstring above is kept verbatim; FastMCP is understood
    # to expose it to clients as the tool description — confirm before rewording.
    # Client construction sits inside the ``try`` as well, so a LocalParseError
    # raised while building the client is returned as text, not raised.
    try:
        parsed = _get_client().parse(path, result_type=result_type)
    except LocalParseError as err:
        return f"LocalParse error: {err}"
    else:
        return _render(parsed, result_type)
66
+
67
+
68
@mcp.tool()
def parse_url(url: str, result_type: str = "markdown") -> str:
    """Download a remote document (e.g. a PDF URL) and parse it with LocalParse.

    Same accuracy guarantees as ``parse_document``. ``result_type`` is one of
    ``markdown`` (default), ``text``, ``json``, or ``structured``.
    """
    # The temp file reuses the extension found in the URL path and falls back
    # to ``.pdf`` when the path has no file name or no suffix.
    filename = Path(urlparse(url).path).name or "download.pdf"
    extension = Path(filename).suffix or ".pdf"
    downloaded: Path | None = None
    try:
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            # delete=False: the file must outlive this ``with`` so it can be
            # parsed by path; the ``finally`` below removes it.
            with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as sink:
                downloaded = Path(sink.name)
                for piece in response.iter_content(chunk_size=64 * 1024):
                    sink.write(piece)
        parsed = _get_client().parse(downloaded, result_type=result_type)
    except requests.RequestException as err:
        return f"Download failed: {err}"
    except LocalParseError as err:
        return f"LocalParse error: {err}"
    finally:
        # Runs on success and on every failure path once the file exists.
        if downloaded is not None:
            downloaded.unlink(missing_ok=True)
    return _render(parsed, result_type)
94
+
95
+
96
@mcp.tool()
def parse_folder(
    path: str,
    case_id: str,
    resume: bool = True,
    result_type: str = "markdown",
) -> str:
    """Parse every document in a local folder into a named LocalParse "case".

    Ideal for ingesting a docs/ or data-room folder. Each file is persisted under
    ``case_id`` with a folder-relative path; with ``resume`` (default) a re-run
    only parses new/changed files (unchanged ones are skipped by content hash).
    Returns a JSON summary: parsed/skipped counts and, per parsed file, its
    ``job_id``, persisted ``doc_id``, and any arithmetic ``identity_violations``
    (a non-zero count means a Total didn't reconcile — worth reviewing).
    """
    # Collected by the progress callback below, then folded into the summary.
    parsed: list[dict[str, object]] = []
    skipped: list[str] = []

    def _progress(rel: str, res: ParseResult | None) -> None:
        # A ``None`` result is recorded as a skipped file; anything else
        # contributes one entry to ``parsed``.
        if res is None:
            skipped.append(rel)
            return
        identity = res.identity_check or {}
        persisted = res.persisted or {}
        parsed.append(
            {
                "source_path": rel,
                "job_id": res.job_id,
                "persisted_doc_id": persisted.get("doc_id"),
                "identity_violations": identity.get("violations"),
            }
        )

    try:
        _get_client().parse_folder(
            path,
            case_id=case_id,
            resume=resume,
            result_type=result_type,
            on_progress=_progress,
        )
    except (LocalParseError, ValueError) as exc:
        # On failure only the error text is returned; entries gathered so far
        # are not reported.
        return f"LocalParse error: {exc}"

    # Key order matches the original summary layout.
    summary: dict[str, object] = {
        "case_id": case_id,
        "parsed": parsed,
        "skipped": skipped,
        "parsed_count": len(parsed),
        "skipped_count": len(skipped),
    }
    # ensure_ascii=False keeps non-ASCII file names readable and matches the
    # other JSON-returning tools in this module.
    return json.dumps(summary, indent=2, ensure_ascii=False)
143
+
144
+
145
@mcp.tool()
def get_structured(job_id: str) -> str:
    """Fetch the versioned structured-document contract for a finished job.

    Returns the JSON ingestion contract (ordered blocks with provenance:
    page, section_path, type, table_markdown, bbox, needs_review, ...).
    """
    # Always requests the "structured" flavour; a LocalParseError is returned
    # as text instead of being raised.
    try:
        fetched = _get_client().get_result(job_id, result_type="structured")
    except LocalParseError as err:
        return f"LocalParse error: {err}"
    else:
        payload = fetched.data
        return json.dumps(payload, indent=2, ensure_ascii=False)
157
+
158
+
159
@mcp.tool()
def case_manifest(case_id: str) -> str:
    """List a case's already-ingested documents (source_path + content hash + status).

    Lets the agent see what's in a case before re-ingesting or to plan a delta.
    """
    # Whatever the client returns is serialized as indented JSON; a
    # LocalParseError is returned as text instead of being raised.
    try:
        listing = _get_client().case_manifest(case_id)
    except LocalParseError as err:
        return f"LocalParse error: {err}"
    else:
        return json.dumps(listing, indent=2, ensure_ascii=False)
170
+
171
+
172
@mcp.tool()
def case_failures(case_id: str, include_resolved: bool = False) -> str:
    """List a case's failed documents (error, attempts, timestamps) for reingest."""
    # ``include_resolved`` is forwarded to the client unchanged; a
    # LocalParseError is returned as text instead of being raised.
    try:
        failed = _get_client().case_failures(
            case_id,
            include_resolved=include_resolved,
        )
    except LocalParseError as err:
        return f"LocalParse error: {err}"
    else:
        return json.dumps(failed, indent=2, ensure_ascii=False)
180
+
181
+
182
def main() -> None:
    """Start the LocalParse MCP server (console-script entry point).

    Calls ``mcp.run()`` with no arguments, i.e. the server's default
    transport, which the original note describes as stdio.
    """
    mcp.run()
185
+
186
+
187
+ if __name__ == "__main__":
188
+ main()
@@ -0,0 +1,78 @@
1
+ """Unit tests for the LocalParse MCP server (fake client, no network)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+ import localparse_mcp.server as server
8
+ import pytest
9
+ from localparse import AuthenticationError, ParseResult
10
+
11
+
12
def test_render_flavours() -> None:
    """``_render`` returns markdown, text, or a JSON dump per ``result_type``."""
    markdown_result = ParseResult(
        job_id="j", result_type="markdown", data={"markdown": "# Hi"}
    )
    assert server._render(markdown_result, "markdown") == "# Hi"

    text_result = ParseResult(job_id="j", result_type="text", data={"text": "hello"})
    assert server._render(text_result, "text") == "hello"

    # The JSON flavour is compared after decoding, so formatting is irrelevant.
    json_result = ParseResult(job_id="j", result_type="json", data={"pages": [1]})
    decoded = json.loads(server._render(json_result, "json"))
    assert decoded == {"pages": [1]}
19
+
20
+
21
def test_get_client_requires_key(monkeypatch: pytest.MonkeyPatch) -> None:
    """Building the client without ``LOCALPARSE_API_KEY`` raises ``AuthenticationError``."""
    monkeypatch.delenv("LOCALPARSE_API_KEY", raising=False)
    # Reset the cached client through monkeypatch rather than by plain
    # assignment, so the module global is restored when the test finishes and
    # no state leaks into other tests.
    monkeypatch.setattr(server, "_client", None)
    with pytest.raises(AuthenticationError):
        server._get_client()
26
+
27
+
28
def test_parse_document_tool(monkeypatch: pytest.MonkeyPatch) -> None:
    """``parse_document`` returns the markdown produced by the client."""

    class StubClient:
        # Stands in for the real client: echoes a fixed markdown payload.
        def parse(self, path: str, result_type: str = "markdown") -> ParseResult:
            payload = {"markdown": "# Doc"}
            return ParseResult(job_id="j", result_type=result_type, data=payload)

    # The class itself is the factory: calling it yields a fresh stub each time.
    monkeypatch.setattr(server, "_get_client", StubClient)
    rendered = server.parse_document("x.pdf", "markdown")
    assert rendered == "# Doc"
37
+
38
+
39
def test_parse_document_tool_surfaces_errors(monkeypatch: pytest.MonkeyPatch) -> None:
    """A client error is returned as text by ``parse_document``, not raised."""

    class FailingClient:
        # Every parse attempt fails with an authentication error.
        def parse(self, path: str, result_type: str = "markdown") -> ParseResult:
            raise AuthenticationError("bad key")

    monkeypatch.setattr(server, "_get_client", FailingClient)
    message = server.parse_document("x.pdf")
    assert "LocalParse error" in message
    assert "bad key" in message
47
+
48
+
49
def test_parse_folder_tool_summary(monkeypatch: pytest.MonkeyPatch) -> None:
    """``parse_folder`` summarizes one parsed and one skipped file as JSON."""
    parsed_result = ParseResult(
        job_id="j1",
        result_type="markdown",
        data={
            "job_metadata": {
                "identity_check": {"violations": 2},
                "persisted": {"doc_id": "d1"},
            }
        },
    )

    class StubClient:
        # Reports one parsed file, then one skipped file (``None`` result).
        def parse_folder(self, path, *, case_id, resume, result_type, on_progress):  # noqa: ANN001
            on_progress("a.pdf", parsed_result)
            on_progress("b.pdf", None)

    monkeypatch.setattr(server, "_get_client", StubClient)
    summary = json.loads(server.parse_folder("dir", "acme"))

    assert summary["parsed_count"] == 1
    assert summary["skipped_count"] == 1
    assert summary["skipped"] == ["b.pdf"]
    expected_entry = {
        "source_path": "a.pdf",
        "job_id": "j1",
        "persisted_doc_id": "d1",
        "identity_violations": 2,
    }
    assert summary["parsed"][0] == expected_entry