localparse-mcp 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- localparse_mcp-0.1.0/.gitignore +56 -0
- localparse_mcp-0.1.0/PKG-INFO +87 -0
- localparse_mcp-0.1.0/README.md +72 -0
- localparse_mcp-0.1.0/pyproject.toml +28 -0
- localparse_mcp-0.1.0/src/localparse_mcp/__init__.py +7 -0
- localparse_mcp-0.1.0/src/localparse_mcp/server.py +188 -0
- localparse_mcp-0.1.0/tests/test_server.py +78 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# --- Secrets (do NOT commit) ---
|
|
2
|
+
.env
|
|
3
|
+
.env.*
|
|
4
|
+
!.env.example
|
|
5
|
+
**/service-account.json
|
|
6
|
+
**/credentials.json
|
|
7
|
+
**/*.pem
|
|
8
|
+
**/*.key
|
|
9
|
+
# Supabase JWT/JWKS material and any local key dumps (the legacy "jtw" typo too).
|
|
10
|
+
**/supabase_jtw.json
|
|
11
|
+
**/supabase_jwt.json
|
|
12
|
+
**/*.jwks.json
|
|
13
|
+
|
|
14
|
+
# --- Downloaded data (large, regenerable) ---
|
|
15
|
+
**/downloads/
|
|
16
|
+
**/cases/
|
|
17
|
+
**/stores/
|
|
18
|
+
**/blocks.parquet
|
|
19
|
+
**/entities.db
|
|
20
|
+
**/index/
|
|
21
|
+
|
|
22
|
+
# --- Generated eval / audit reports ---
|
|
23
|
+
doc_extraction_v2/reports/
|
|
24
|
+
doc_extraction_v2/scripts/reports/
|
|
25
|
+
|
|
26
|
+
# --- Real-PDF test artifacts (regenerable, large) ---
|
|
27
|
+
doc_extraction_v2/tests/real_pdfs/reports/
|
|
28
|
+
.cache/doc-extraction-v2/
|
|
29
|
+
|
|
30
|
+
# --- Extraction artifacts (regenerable per parse) ---
|
|
31
|
+
**/figures/
|
|
32
|
+
|
|
33
|
+
# --- Python ---
|
|
34
|
+
.venv/
|
|
35
|
+
venv/
|
|
36
|
+
__pycache__/
|
|
37
|
+
*.pyc
|
|
38
|
+
*.pyo
|
|
39
|
+
*.egg-info/
|
|
40
|
+
build/
|
|
41
|
+
dist/
|
|
42
|
+
.pytest_cache/
|
|
43
|
+
.mypy_cache/
|
|
44
|
+
.ruff_cache/
|
|
45
|
+
.coverage
|
|
46
|
+
htmlcov/
|
|
47
|
+
|
|
48
|
+
# --- OS / editor ---
|
|
49
|
+
.DS_Store
|
|
50
|
+
Thumbs.db
|
|
51
|
+
.idea/
|
|
52
|
+
.vscode/
|
|
53
|
+
*.swp
|
|
54
|
+
|
|
55
|
+
# --- Logs ---
|
|
56
|
+
*.log
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: localparse-mcp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MCP server for LocalParse — accurate document parsing for Cursor, Claude Desktop, and other MCP clients.
|
|
5
|
+
Project-URL: Homepage, https://localparse.com
|
|
6
|
+
Project-URL: Source, https://github.com/stevencoveta/Agent-ingestor
|
|
7
|
+
Author: LocalParse
|
|
8
|
+
License: MIT
|
|
9
|
+
Keywords: document parsing,localparse,mcp,model context protocol,ocr,pdf
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: localparse>=0.1.0
|
|
12
|
+
Requires-Dist: mcp>=1.2.0
|
|
13
|
+
Requires-Dist: requests>=2.28
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# localparse-mcp
|
|
17
|
+
|
|
18
|
+
An [MCP](https://modelcontextprotocol.io) server that gives your coding agent
|
|
19
|
+
(Cursor, Claude Desktop, etc.) **accurate document parsing** via
|
|
20
|
+
[LocalParse](https://localparse.com) — table-detection recovery + arithmetic
|
|
21
|
+
identity checks — instead of letting it write throwaway OCR scripts.
|
|
22
|
+
|
|
23
|
+
## Tools
|
|
24
|
+
|
|
25
|
+
| Tool | What it does |
|
|
26
|
+
|---|---|
|
|
27
|
+
| `parse_document(path, result_type)` | Parse a local file (PDF/image/PPTX/…). |
|
|
28
|
+
| `parse_url(url, result_type)` | Download a remote doc and parse it. |
|
|
29
|
+
| `parse_folder(path, case_id, resume, result_type)` | Ingest a whole folder into a named case; a re-run parses only new or changed files. |
|
|
30
|
+
| `get_structured(job_id)` | Fetch the structured ingestion contract for a job. |
|
|
31
|
+
| `case_manifest(case_id)` | List a case's ingested docs (path + hash + status). |
|
|
32
|
+
| `case_failures(case_id, include_resolved)` | List a case's failed docs for reingest. |
|
|
33
|
+
|
|
34
|
+
`result_type` ∈ `markdown` (default), `text`, `json`, `structured`.
|
|
35
|
+
|
|
36
|
+
## Configure your client
|
|
37
|
+
|
|
38
|
+
You need a LocalParse API key (`LOCALPARSE_API_KEY`).
|
|
39
|
+
|
|
40
|
+
### Cursor — `~/.cursor/mcp.json` (or project `.cursor/mcp.json`)
|
|
41
|
+
|
|
42
|
+
```json
|
|
43
|
+
{
|
|
44
|
+
"mcpServers": {
|
|
45
|
+
"localparse": {
|
|
46
|
+
"command": "uvx",
|
|
47
|
+
"args": ["localparse-mcp"],
|
|
48
|
+
"env": { "LOCALPARSE_API_KEY": "lp-your-key" }
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Claude Desktop — `claude_desktop_config.json`
|
|
55
|
+
|
|
56
|
+
```json
|
|
57
|
+
{
|
|
58
|
+
"mcpServers": {
|
|
59
|
+
"localparse": {
|
|
60
|
+
"command": "uvx",
|
|
61
|
+
"args": ["localparse-mcp"],
|
|
62
|
+
"env": { "LOCALPARSE_API_KEY": "lp-your-key" }
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Then ask the agent things like *"parse the PDFs in ./data-room into case acme and
|
|
69
|
+
tell me which tables don't reconcile."*
|
|
70
|
+
|
|
71
|
+
## Local development (before it's on PyPI)
|
|
72
|
+
|
|
73
|
+
Run the server straight from the repo and point the client at it:
|
|
74
|
+
|
|
75
|
+
```json
|
|
76
|
+
{
|
|
77
|
+
"mcpServers": {
|
|
78
|
+
"localparse": {
|
|
79
|
+
"command": "uv",
|
|
80
|
+
"args": ["run", "--directory", "/abs/path/to/Agent-ingestor", "localparse-mcp"],
|
|
81
|
+
"env": { "LOCALPARSE_API_KEY": "lp-your-key" }
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Point at a self-hosted parser by also setting `LOCALPARSE_BASE_URL`.
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
# localparse-mcp
|
|
2
|
+
|
|
3
|
+
An [MCP](https://modelcontextprotocol.io) server that gives your coding agent
|
|
4
|
+
(Cursor, Claude Desktop, etc.) **accurate document parsing** via
|
|
5
|
+
[LocalParse](https://localparse.com) — table-detection recovery + arithmetic
|
|
6
|
+
identity checks — instead of letting it write throwaway OCR scripts.
|
|
7
|
+
|
|
8
|
+
## Tools
|
|
9
|
+
|
|
10
|
+
| Tool | What it does |
|
|
11
|
+
|---|---|
|
|
12
|
+
| `parse_document(path, result_type)` | Parse a local file (PDF/image/PPTX/…). |
|
|
13
|
+
| `parse_url(url, result_type)` | Download a remote doc and parse it. |
|
|
14
|
+
| `parse_folder(path, case_id, resume, result_type)` | Ingest a whole folder into a named case; a re-run parses only new or changed files. |
|
|
15
|
+
| `get_structured(job_id)` | Fetch the structured ingestion contract for a job. |
|
|
16
|
+
| `case_manifest(case_id)` | List a case's ingested docs (path + hash + status). |
|
|
17
|
+
| `case_failures(case_id, include_resolved)` | List a case's failed docs for reingest. |
|
|
18
|
+
|
|
19
|
+
`result_type` ∈ `markdown` (default), `text`, `json`, `structured`.
|
|
20
|
+
|
|
21
|
+
## Configure your client
|
|
22
|
+
|
|
23
|
+
You need a LocalParse API key (`LOCALPARSE_API_KEY`).
|
|
24
|
+
|
|
25
|
+
### Cursor — `~/.cursor/mcp.json` (or project `.cursor/mcp.json`)
|
|
26
|
+
|
|
27
|
+
```json
|
|
28
|
+
{
|
|
29
|
+
"mcpServers": {
|
|
30
|
+
"localparse": {
|
|
31
|
+
"command": "uvx",
|
|
32
|
+
"args": ["localparse-mcp"],
|
|
33
|
+
"env": { "LOCALPARSE_API_KEY": "lp-your-key" }
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### Claude Desktop — `claude_desktop_config.json`
|
|
40
|
+
|
|
41
|
+
```json
|
|
42
|
+
{
|
|
43
|
+
"mcpServers": {
|
|
44
|
+
"localparse": {
|
|
45
|
+
"command": "uvx",
|
|
46
|
+
"args": ["localparse-mcp"],
|
|
47
|
+
"env": { "LOCALPARSE_API_KEY": "lp-your-key" }
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Then ask the agent things like *"parse the PDFs in ./data-room into case acme and
|
|
54
|
+
tell me which tables don't reconcile."*
|
|
55
|
+
|
|
56
|
+
## Local development (before it's on PyPI)
|
|
57
|
+
|
|
58
|
+
Run the server straight from the repo and point the client at it:
|
|
59
|
+
|
|
60
|
+
```json
|
|
61
|
+
{
|
|
62
|
+
"mcpServers": {
|
|
63
|
+
"localparse": {
|
|
64
|
+
"command": "uv",
|
|
65
|
+
"args": ["run", "--directory", "/abs/path/to/Agent-ingestor", "localparse-mcp"],
|
|
66
|
+
"env": { "LOCALPARSE_API_KEY": "lp-your-key" }
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Point at a self-hosted parser by also setting `LOCALPARSE_BASE_URL`.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "localparse-mcp"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "MCP server for LocalParse — accurate document parsing for Cursor, Claude Desktop, and other MCP clients."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
authors = [{ name = "LocalParse" }]
|
|
9
|
+
keywords = ["mcp", "model context protocol", "document parsing", "pdf", "ocr", "localparse"]
|
|
10
|
+
dependencies = [
|
|
11
|
+
"localparse>=0.1.0",
|
|
12
|
+
"mcp>=1.2.0",
|
|
13
|
+
"requests>=2.28",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
[project.scripts]
|
|
17
|
+
localparse-mcp = "localparse_mcp.server:main"
|
|
18
|
+
|
|
19
|
+
[project.urls]
|
|
20
|
+
Homepage = "https://localparse.com"
|
|
21
|
+
Source = "https://github.com/stevencoveta/Agent-ingestor"
|
|
22
|
+
|
|
23
|
+
[build-system]
|
|
24
|
+
requires = ["hatchling"]
|
|
25
|
+
build-backend = "hatchling.build"
|
|
26
|
+
|
|
27
|
+
[tool.hatch.build.targets.wheel]
|
|
28
|
+
packages = ["src/localparse_mcp"]
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""MCP server exposing LocalParse's accurate document parsing to MCP clients.
|
|
2
|
+
|
|
3
|
+
Run locally over stdio (how Cursor / Claude Desktop launch it):
|
|
4
|
+
|
|
5
|
+
LOCALPARSE_API_KEY=lp-... uvx localparse-mcp
|
|
6
|
+
|
|
7
|
+
The tools let an agent parse local files / folders / URLs with a real parser
|
|
8
|
+
(table-detection recovery + arithmetic identity checks) instead of writing
|
|
9
|
+
ad-hoc OCR scripts.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import os
|
|
16
|
+
import tempfile
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from urllib.parse import urlparse
|
|
19
|
+
|
|
20
|
+
import requests
|
|
21
|
+
from localparse import LocalParse, LocalParseError, ParseResult
|
|
22
|
+
from mcp.server.fastmcp import FastMCP
|
|
23
|
+
|
|
24
|
+
# FastMCP application object; the @mcp.tool() functions below register on it.
mcp = FastMCP("localparse")

# Process-wide client, created lazily by _get_client() on the first tool call.
_client: LocalParse | None = None


def _get_client() -> LocalParse:
    """Return the shared LocalParse client, building it on first use.

    ``LOCALPARSE_BASE_URL`` selects the API endpoint; when it is unset or
    empty, the hosted API (``https://api.localparse.com``) is used.

    The API key is not read here.
    NOTE(review): presumably the ``LocalParse`` constructor picks up
    ``LOCALPARSE_API_KEY`` itself and raises a ``LocalParseError`` subclass
    when it is missing -- confirm against the ``localparse`` client.

    Returns:
        The cached ``LocalParse`` instance. If construction raises, nothing is
        cached and the next call tries again.
    """
    global _client
    if _client is None:
        # ``or`` instead of a ``get`` default: a variable that is present but
        # empty must fall back to the hosted API rather than become
        # ``base_url=""``.
        base_url = os.environ.get("LOCALPARSE_BASE_URL") or "https://api.localparse.com"
        _client = LocalParse(base_url=base_url)
    return _client
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _render(result: ParseResult, result_type: str) -> str:
|
|
44
|
+
if result_type in ("markdown", "md"):
|
|
45
|
+
return result.markdown or ""
|
|
46
|
+
if result_type in ("text", "txt"):
|
|
47
|
+
return result.text or ""
|
|
48
|
+
return json.dumps(result.data, indent=2, ensure_ascii=False)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@mcp.tool()
def parse_document(path: str, result_type: str = "markdown") -> str:
    """Parse a local document with LocalParse and return its contents.

    Use this instead of writing OCR/pdf-to-text scripts: LocalParse recovers
    tables a layout model would miss and arithmetic-checks numeric Totals, so
    the output is far more reliable on financial/tabular PDFs, scans, and decks.
    Supports PDF, images, PPTX, and more. ``result_type`` is one of ``markdown``
    (default), ``text``, ``json``, or ``structured`` (the ingestion contract).
    """
    # The docstring doubles as the tool description shown to MCP clients, so
    # implementation notes live in comments instead.
    try:
        parsed = _get_client().parse(path, result_type=result_type)
    except LocalParseError as err:
        # Client errors come back as plain text so the calling agent can read
        # them; other exception types are not handled here.
        return f"LocalParse error: {err}"
    else:
        return _render(parsed, result_type)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@mcp.tool()
def parse_url(url: str, result_type: str = "markdown") -> str:
    """Download a remote document (e.g. a PDF URL) and parse it with LocalParse.

    Same accuracy guarantees as ``parse_document``. ``result_type`` is one of
    ``markdown`` (default), ``text``, ``json``, or ``structured``.
    """
    # Carry the URL's file extension over to the temp file; fall back to
    # ".pdf" when the URL path has no file name or no extension.
    url_filename = Path(urlparse(url).path).name
    extension = Path(url_filename or "download.pdf").suffix or ".pdf"
    downloaded: Path | None = None
    try:
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()
            # delete=False: the file has to outlive this ``with`` block so the
            # client can open it by path; the ``finally`` below removes it.
            with tempfile.NamedTemporaryFile(suffix=extension, delete=False) as sink:
                downloaded = Path(sink.name)
                for piece in response.iter_content(chunk_size=65536):
                    sink.write(piece)
        parsed = _get_client().parse(downloaded, result_type=result_type)
    except requests.RequestException as err:
        return f"Download failed: {err}"
    except LocalParseError as err:
        return f"LocalParse error: {err}"
    finally:
        # Runs on success and on every failure path once the file exists.
        if downloaded is not None:
            downloaded.unlink(missing_ok=True)
    return _render(parsed, result_type)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@mcp.tool()
def parse_folder(
    path: str,
    case_id: str,
    resume: bool = True,
    result_type: str = "markdown",
) -> str:
    """Parse every document in a local folder into a named LocalParse "case".

    Ideal for ingesting a docs/ or data-room folder. Each file is persisted under
    ``case_id`` with a folder-relative path; with ``resume`` (default) a re-run
    only parses new/changed files (unchanged ones are skipped by content hash).
    Returns a JSON summary: parsed/skipped counts and, per parsed file, its
    ``job_id``, persisted ``doc_id``, and any arithmetic ``identity_violations``
    (a non-zero count means a Total didn't reconcile — worth reviewing).
    """
    # The docstring doubles as the tool description shown to MCP clients, so
    # implementation notes live in comments instead.
    parsed: list[dict[str, object]] = []
    skipped: list[str] = []

    def _progress(rel: str, res: ParseResult | None) -> None:
        # Progress callback handed to the client: a ``None`` result is
        # recorded as a skipped file, anything else as a parsed one.
        if res is None:
            skipped.append(rel)
            return
        identity = res.identity_check or {}
        parsed.append(
            {
                "source_path": rel,
                "job_id": res.job_id,
                "persisted_doc_id": (res.persisted or {}).get("doc_id"),
                "identity_violations": identity.get("violations"),
            }
        )

    try:
        _get_client().parse_folder(
            path,
            case_id=case_id,
            resume=resume,
            result_type=result_type,
            on_progress=_progress,
        )
    except (LocalParseError, ValueError) as exc:
        # On failure only the error text is returned; entries collected so
        # far are dropped.
        return f"LocalParse error: {exc}"

    summary: dict[str, object] = {
        "case_id": case_id,
        "parsed": parsed,
        "skipped": skipped,
        "parsed_count": len(parsed),
        "skipped_count": len(skipped),
    }
    # ensure_ascii=False matches the other tools, so non-ASCII file names in
    # the summary stay readable instead of being \u-escaped.
    return json.dumps(summary, indent=2, ensure_ascii=False)
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
@mcp.tool()
def get_structured(job_id: str) -> str:
    """Fetch the versioned structured-document contract for a finished job.

    Returns the JSON ingestion contract (ordered blocks with provenance:
    page, section_path, type, table_markdown, bbox, needs_review, ...).
    """
    # The docstring doubles as the tool description shown to MCP clients.
    try:
        contract = _get_client().get_result(job_id, result_type="structured")
    except LocalParseError as err:
        # Client errors are returned as text rather than raised.
        return f"LocalParse error: {err}"
    else:
        return json.dumps(contract.data, ensure_ascii=False, indent=2)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@mcp.tool()
def case_manifest(case_id: str) -> str:
    """List a case's already-ingested documents (source_path + content hash + status).

    Lets the agent see what's in a case before re-ingesting or to plan a delta.
    """
    # The docstring doubles as the tool description shown to MCP clients.
    try:
        entries = _get_client().case_manifest(case_id)
    except LocalParseError as err:
        # Client errors are returned as text rather than raised.
        return f"LocalParse error: {err}"
    else:
        # Whatever the client returned is serialized as-is.
        return json.dumps(entries, ensure_ascii=False, indent=2)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
@mcp.tool()
def case_failures(case_id: str, include_resolved: bool = False) -> str:
    """List a case's failed documents (error, attempts, timestamps) for reingest."""
    # The docstring doubles as the tool description shown to MCP clients.
    try:
        failed_docs = _get_client().case_failures(
            case_id,
            include_resolved=include_resolved,
        )
    except LocalParseError as err:
        # Client errors are returned as text rather than raised.
        return f"LocalParse error: {err}"
    else:
        # Whatever the client returned is serialized as-is.
        return json.dumps(failed_docs, ensure_ascii=False, indent=2)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def main() -> None:
    """Start the MCP server over stdio; target of the ``localparse-mcp`` script."""
    mcp.run()


# Also allow running the module directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Unit tests for the LocalParse MCP server (fake client, no network)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
|
|
7
|
+
import localparse_mcp.server as server
|
|
8
|
+
import pytest
|
|
9
|
+
from localparse import AuthenticationError, ParseResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_render_flavours() -> None:
    """``_render`` returns the markdown, text, or JSON flavour that was asked for."""
    markdown_result = ParseResult(
        job_id="j", result_type="markdown", data={"markdown": "# Hi"}
    )
    assert server._render(markdown_result, "markdown") == "# Hi"

    text_result = ParseResult(job_id="j", result_type="text", data={"text": "hello"})
    assert server._render(text_result, "text") == "hello"

    json_result = ParseResult(job_id="j", result_type="json", data={"pages": [1]})
    rendered = server._render(json_result, "json")
    assert json.loads(rendered) == {"pages": [1]}
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def test_get_client_requires_key(monkeypatch: pytest.MonkeyPatch) -> None:
    """Building the client without ``LOCALPARSE_API_KEY`` raises ``AuthenticationError``."""
    monkeypatch.delenv("LOCALPARSE_API_KEY", raising=False)
    # Reset the cached client through monkeypatch, not plain assignment, so
    # the module global is restored afterwards and cannot leak into other
    # tests.
    monkeypatch.setattr(server, "_client", None)
    with pytest.raises(AuthenticationError):
        server._get_client()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_parse_document_tool(monkeypatch: pytest.MonkeyPatch) -> None:
    """``parse_document`` returns the markdown produced by the client."""

    class StubClient:
        # Stand-in for the LocalParse client: always returns the same document.
        def parse(self, path: str, result_type: str = "markdown") -> ParseResult:
            payload = {"markdown": "# Doc"}
            return ParseResult(job_id="j", result_type=result_type, data=payload)

    # The class is itself a zero-argument factory, like ``_get_client``.
    monkeypatch.setattr(server, "_get_client", StubClient)
    rendered = server.parse_document("x.pdf", "markdown")
    assert rendered == "# Doc"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_parse_document_tool_surfaces_errors(monkeypatch: pytest.MonkeyPatch) -> None:
    """A client error is returned as readable text instead of propagating."""

    class FailingClient:
        # Stand-in for the LocalParse client: every parse attempt fails.
        def parse(self, path: str, result_type: str = "markdown") -> ParseResult:
            raise AuthenticationError("bad key")

    monkeypatch.setattr(server, "_get_client", FailingClient)
    message = server.parse_document("x.pdf")
    assert "LocalParse error" in message
    assert "bad key" in message
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_parse_folder_tool_summary(monkeypatch: pytest.MonkeyPatch) -> None:
    """``parse_folder`` summarises one parsed and one skipped file."""
    parsed_result = ParseResult(
        job_id="j1",
        result_type="markdown",
        data={
            "job_metadata": {
                "identity_check": {"violations": 2},
                "persisted": {"doc_id": "d1"},
            }
        },
    )

    class StubClient:
        # Reports one parsed file, then one skipped file (``None`` result).
        def parse_folder(self, path, *, case_id, resume, result_type, on_progress):  # noqa: ANN001
            on_progress("a.pdf", parsed_result)
            on_progress("b.pdf", None)

    monkeypatch.setattr(server, "_get_client", StubClient)
    summary = json.loads(server.parse_folder("dir", "acme"))

    assert summary["parsed_count"] == 1
    assert summary["skipped_count"] == 1
    assert summary["skipped"] == ["b.pdf"]
    expected_entry = {
        "source_path": "a.pdf",
        "job_id": "j1",
        "persisted_doc_id": "d1",
        "identity_violations": 2,
    }
    assert summary["parsed"][0] == expected_entry
|