flint-slating 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flint_slating-0.1.0/.github/workflows/license-check.yml +35 -0
- flint_slating-0.1.0/.github/workflows/release.yml +97 -0
- flint_slating-0.1.0/.github/workflows/test.yml +28 -0
- flint_slating-0.1.0/.gitignore +14 -0
- flint_slating-0.1.0/Dockerfile +40 -0
- flint_slating-0.1.0/LICENSE +21 -0
- flint_slating-0.1.0/PKG-INFO +188 -0
- flint_slating-0.1.0/README.md +168 -0
- flint_slating-0.1.0/docker-compose.yml +54 -0
- flint_slating-0.1.0/pyproject.toml +61 -0
- flint_slating-0.1.0/src/flint_slating/__init__.py +0 -0
- flint_slating-0.1.0/src/flint_slating/__main__.py +62 -0
- flint_slating-0.1.0/src/flint_slating/app.py +41 -0
- flint_slating-0.1.0/src/flint_slating/config.py +45 -0
- flint_slating-0.1.0/src/flint_slating/images.py +100 -0
- flint_slating-0.1.0/src/flint_slating/jobs.py +262 -0
- flint_slating-0.1.0/src/flint_slating/mcp_server.py +26 -0
- flint_slating-0.1.0/src/flint_slating/outputs.py +139 -0
- flint_slating-0.1.0/src/flint_slating/pdf_reader.py +420 -0
- flint_slating-0.1.0/src/flint_slating/pdf_source.py +184 -0
- flint_slating-0.1.0/src/flint_slating/routes.py +175 -0
- flint_slating-0.1.0/src/flint_slating/schema.py +40 -0
- flint_slating-0.1.0/src/flint_slating/tables.py +91 -0
- flint_slating-0.1.0/src/flint_slating/tools.py +364 -0
- flint_slating-0.1.0/tests/__init__.py +0 -0
- flint_slating-0.1.0/tests/conftest.py +86 -0
- flint_slating-0.1.0/tests/test_http_routes.py +31 -0
- flint_slating-0.1.0/tests/test_jobs.py +61 -0
- flint_slating-0.1.0/tests/test_pdf_reader.py +56 -0
- flint_slating-0.1.0/tests/test_pdf_source.py +54 -0
- flint_slating-0.1.0/tests/test_tools.py +64 -0
- flint_slating-0.1.0/uv.lock +2749 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Audits the license of every installed dependency (direct and transitive)
# and fails when one of the copyleft licenses on the deny-list below appears.
name: License check

on:
  pull_request:
    branches: [main, develop]
  push:
    branches: [main, develop]

permissions:
  contents: read

jobs:
  licenses:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v3

      # --locked fails the job when uv.lock is out of date with
      # pyproject.toml instead of silently re-resolving, so the audit below
      # always covers exactly the committed dependency set.
      - name: Sync deps (including dev for pip-licenses)
        run: uv sync --locked

      # We use a fail-on deny-list rather than an allow-list because PyPI
      # license metadata is wildly inconsistent (MIT-CMU, "Apache 2.0
      # License", multi-license SPDX expressions, etc.). The real concern
      # is keeping copyleft out — specifically the PyMuPDF AGPL, Marker
      # GPL, and Unstructured's AGPL transitive (ultralytics) trapdoors
      # we explicitly designed this stack to avoid.
      #
      # NOTE(review): pip-licenses compares --fail-on entries by exact name
      # unless --partial-match is passed, so SPDX-style identifiers such as
      # "GPL-3.0-or-later" would not be caught by this list — confirm that
      # is intended before relying on it.
      - name: Check transitive licenses
        run: |
          uv run pip-licenses \
            --from=mixed \
            --fail-on="GPL;GPL v2;GPL v3;GNU General Public License;GNU General Public License v2 (GPLv2);GNU General Public License v3 (GPLv3);AGPL;Affero;GNU Affero General Public License v3;GNU Affero General Public License v3 (AGPLv3);LGPL;GNU Lesser General Public License;GNU Lesser General Public License v2 (LGPLv2);GNU Lesser General Public License v3 (LGPLv3);GNU Library or Lesser General Public License (LGPL);SSPL;Server Side Public License" \
            --order=license
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# Tag-driven release: pushing a v* tag runs the pre-flight gate, then
# publishes a multi-arch image to GHCR and a wheel + sdist to PyPI.
name: Release

on:
  push:
    tags: ['v*']

# Read-only by default. Each publish job opts in to the single write scope
# it needs, so the gate job never holds a token that can push packages or
# request an OIDC identity.
permissions:
  contents: read

jobs:
  # Pre-flight checks. Both publish jobs depend on this, so a failed gate
  # prevents *any* artifact from shipping (no half-shipped state).
  gate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0  # full history needed for the ancestor check below

      - name: Install uv
        uses: astral-sh/setup-uv@v3

      # Strip the "refs/tags/v" prefix from the pushed ref and compare the
      # remainder with [project].version read from pyproject.toml.
      - name: Verify tag matches pyproject version
        run: |
          TAG="${GITHUB_REF#refs/tags/v}"
          PROJECT_VERSION=$(uv run python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")
          if [ "$TAG" != "$PROJECT_VERSION" ]; then
            echo "tag $TAG != pyproject $PROJECT_VERSION"
            exit 1
          fi

      # Refuse tags whose commit is not an ancestor of origin/main.
      - name: Verify tag is reachable from origin/main
        run: |
          git fetch --no-tags origin main
          if ! git merge-base --is-ancestor "$GITHUB_SHA" origin/main; then
            echo "tagged commit $GITHUB_SHA is not on origin/main"
            echo "release tags must come from main — back-merge through, then re-tag from main"
            exit 1
          fi

  docker:
    needs: gate
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write  # GHCR push
    steps:
      - uses: actions/checkout@v4

      - name: Set up QEMU (arm64 emulation)
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to GHCR
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata (tags + labels)
        id: meta
        uses: docker/metadata-action@v5
        with:
          images: ghcr.io/${{ github.repository }}
          tags: |
            type=semver,pattern={{version}}
            type=semver,pattern={{major}}.{{minor}}
            type=raw,value=latest

      - name: Build and push (linux/amd64 + linux/arm64)
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: linux/amd64,linux/arm64
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max

  pypi:
    needs: gate
    runs-on: ubuntu-latest
    environment: pypi  # matches the trusted-publisher config on PyPI
    permissions:
      contents: read
      id-token: write  # PyPI trusted publishing
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v3

      - name: Build wheel + sdist
        run: uv build

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Lints the code base and runs the offline test suite on pull requests and
# pushes to main / develop.
name: Test

on:
  pull_request:
    branches: [main, develop]
  push:
    branches: [main, develop]

permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v3

      # --locked fails the job when uv.lock is out of date with
      # pyproject.toml instead of silently re-resolving, so tests always run
      # against the committed dependency set.
      - name: Sync deps
        run: uv sync --locked

      - name: Lint
        run: uv run ruff check src tests

      # The marker expression deselects tests tagged "network" or "docling".
      - name: Tests (excluding network + docling marker)
        run: uv run pytest -m "not network and not docling" -q
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Runtime image for the flint-slating HTTP daemon, built on the uv base image.
FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim

WORKDIR /app

# CA certs for httpx URL downloads. No system PDF tools needed — the
# whole PDF stack installs from prebuilt wheels (docling, pypdf, pdfplumber,
# pypdfium2). git is dev-time only.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ca-certificates \
    && rm -rf /var/lib/apt/lists/*

# Resolve deps first so they cache across source changes. --locked fails the
# build when uv.lock is stale instead of silently re-resolving.
COPY pyproject.toml uv.lock ./
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --locked --no-dev --no-install-project

# README.md is part of the package metadata (pyproject.toml -> readme).
COPY README.md ./
COPY src/ src/
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync --locked --no-dev

# Pre-fetch Docling's layout model so the first user-facing call is hot.
# Failure here is not fatal — the runtime will re-download on first use.
# --no-sync keeps `uv run` from re-syncing the environment (which by default
# would add the dev group back on top of the --no-dev install above).
# NOTE(review): confirm that constructing DocumentConverter actually triggers
# the model download; if the pipeline loads lazily this step is a no-op.
ENV DOCLING_ARTIFACTS_PATH=/opt/docling-models
RUN uv run --no-sync python -c "from docling.document_converter import DocumentConverter; DocumentConverter()" || true

ENV PYTHONUNBUFFERED=1 \
    OUTPUT_ROOT=/data/output \
    CACHE_ROOT=/data/cache \
    PORT=35833 \
    HOST=0.0.0.0

EXPOSE 35833
VOLUME ["/data"]

# Container always runs the HTTP transport — stdio across a container
# boundary doesn't make sense. HTTP is the default; --transport http is
# explicit so anyone reading the Dockerfile knows what mode it runs in.
# --no-sync starts the server from the environment baked into the image
# rather than resolving and installing packages at container start.
CMD ["uv", "run", "--no-sync", "python", "-m", "flint_slating", "--transport", "http"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Parkview Lab
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: flint-slating
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MCP server that reads PDFs (metadata, TOC, Markdown, text, images, tables) for downstream LLM consumers.
|
|
5
|
+
Author: Parkview Lab
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Python: >=3.13
|
|
9
|
+
Requires-Dist: anyio
|
|
10
|
+
Requires-Dist: docling>=2.0
|
|
11
|
+
Requires-Dist: fastapi
|
|
12
|
+
Requires-Dist: httpx
|
|
13
|
+
Requires-Dist: mcp[cli]>=1.27.0
|
|
14
|
+
Requires-Dist: pdfplumber>=0.11
|
|
15
|
+
Requires-Dist: pydantic
|
|
16
|
+
Requires-Dist: pypdf>=5.0
|
|
17
|
+
Requires-Dist: starlette
|
|
18
|
+
Requires-Dist: uvicorn[standard]
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# flint-slating
|
|
22
|
+
|
|
23
|
+
MCP server that reads PDFs and exposes them to LLM consumers as
|
|
24
|
+
structured Markdown, plus the usual ancillaries: metadata, outline,
|
|
25
|
+
images, tables.
|
|
26
|
+
|
|
27
|
+
Designed to pair with a separate "wiki" MCP server that handles the
|
|
28
|
+
*writing* side — an agent calls `flint-slating` to read PDFs and another
|
|
29
|
+
MCP to persist notes about them into a frontmattered-markdown knowledge
|
|
30
|
+
base.
|
|
31
|
+
|
|
32
|
+
## What it does
|
|
33
|
+
|
|
34
|
+
Built on a permissive-license PDF stack:
|
|
35
|
+
|
|
36
|
+
| Library | License | Role |
|
|
37
|
+
|---|---|---|
|
|
38
|
+
| [Docling](https://github.com/docling-project/docling) | MIT | PDF → Markdown with heading hierarchy, multi-column reading order, and Markdown tables |
|
|
39
|
+
| [pypdf](https://github.com/py-pdf/pypdf) | BSD-3 | metadata, TOC, page count, encryption checks, image enumeration |
|
|
40
|
+
| [pdfplumber](https://github.com/jsvine/pdfplumber) | MIT | per-page table extraction |
|
|
41
|
+
|
|
42
|
+
**There is no PyMuPDF, no MuPDF, no AGPL or GPL anywhere in the
|
|
43
|
+
dependency tree.** A CI license-check job rejects PRs that pull in
|
|
44
|
+
copyleft transitive deps.
|
|
45
|
+
|
|
46
|
+
## Transports
|
|
47
|
+
|
|
48
|
+
Two transports off the same MCP server, selected via `--transport`:
|
|
49
|
+
|
|
50
|
+
| Transport | Run via | Use case |
|
|
51
|
+
|---|---|---|
|
|
52
|
+
| **Streamable-HTTP** (default) | `uvx flint-slating` or `--transport http` | Long-lived local daemon, container, or shared service. |
|
|
53
|
+
| **stdio** | `uvx flint-slating --transport stdio` | The standard MCP integration shape — drop into `claude_desktop_config.json` or any `mcp.json`. |
|
|
54
|
+
|
|
55
|
+
## Run
|
|
56
|
+
|
|
57
|
+
### As an HTTP daemon (default)
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
uvx flint-slating # listens on PORT (default 35833)
|
|
61
|
+
curl http://127.0.0.1:35833/health
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Or install it as a persistent tool:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
uv tool install flint-slating
|
|
68
|
+
flint-slating
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### As a stdio MCP server
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
uvx flint-slating --transport stdio
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Wire into Claude Code's MCP config:
|
|
78
|
+
|
|
79
|
+
```json
|
|
80
|
+
{
|
|
81
|
+
"mcpServers": {
|
|
82
|
+
"flint-slating": {
|
|
83
|
+
"command": "uvx",
|
|
84
|
+
"args": ["flint-slating", "--transport", "stdio"]
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Docker
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
docker run --rm \
|
|
94
|
+
-p 35833:35833 \
|
|
95
|
+
-v $(pwd)/pdfs:/pdfs:ro \
|
|
96
|
+
-v flint-slating-data:/data \
|
|
97
|
+
ghcr.io/parkviewlab/flint-slating:latest
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Or use [`docker-compose.yml`](docker-compose.yml) for a persistent stack.
|
|
101
|
+
|
|
102
|
+
## MCP tools
|
|
103
|
+
|
|
104
|
+
All PDF tools take a `source` argument with one of:
|
|
105
|
+
|
|
106
|
+
- `{"path": "/abs/path/to/file.pdf"}` — local file
|
|
107
|
+
- `{"url": "https://..."}` — streamed to a content-addressed cache
|
|
108
|
+
- `{"bytes_b64": "...", "filename": "x.pdf"}` — base64 upload (size-capped)
|
|
109
|
+
|
|
110
|
+
| Tool | What it does |
|
|
111
|
+
|---|---|
|
|
112
|
+
| `pdf_info` | `{page_count, metadata, is_encrypted, sha256}` |
|
|
113
|
+
| `pdf_toc` | flat outline `[{level, title, page}]` |
|
|
114
|
+
| `pdf_read_text` | plain text by page range (fast — pypdf, no ML) |
|
|
115
|
+
| `pdf_read_markdown` | high-quality Markdown via Docling (hybrid sync/async — see below) |
|
|
116
|
+
| `pdf_read_chunks` | per-page Markdown chunks with tables/images/toc_items (hybrid sync/async) |
|
|
117
|
+
| `pdf_list_images` | enumerate images: `[{page, index, name, width, height, ext}]` |
|
|
118
|
+
| `pdf_extract_image` | base64 bytes of one image |
|
|
119
|
+
| `pdf_find_tables` | per-page Markdown tables via pdfplumber |
|
|
120
|
+
| `get_job_status` | poll a background job |
|
|
121
|
+
| `get_job_result` | fetch a finished job's artifact |
|
|
122
|
+
| `cancel_job` | cancel a running job |
|
|
123
|
+
|
|
124
|
+
### Hybrid sync/async
|
|
125
|
+
|
|
126
|
+
`pdf_read_markdown` and `pdf_read_chunks` run inline when
|
|
127
|
+
`page_count <= SYNC_PAGE_THRESHOLD` (default 20). For larger PDFs they
|
|
128
|
+
queue a background job and return a `job_id` — poll `get_job_status`
|
|
129
|
+
until `state=="done"`, then call `get_job_result` (or, in HTTP mode,
|
|
130
|
+
fetch `output_url` directly).
|
|
131
|
+
|
|
132
|
+
**stdio mode** transparently waits for the job inline — there's no HTTP
|
|
133
|
+
server to download from, so the originating tool call blocks until the
|
|
134
|
+
result is ready and returns it directly.
|
|
135
|
+
|
|
136
|
+
## HTTP endpoints (HTTP mode only)
|
|
137
|
+
|
|
138
|
+
- `GET /health` — `{ok, version, uptime_seconds}`
|
|
139
|
+
- `GET /admin/version` — package and dependency versions, Docling model status
|
|
140
|
+
- `GET /admin/jobs` — recent job list
|
|
141
|
+
- `GET /outputs/{job_id}/result.md` — finished Markdown
|
|
142
|
+
- `GET /outputs/{job_id}/result.json` — finished chunked output
|
|
143
|
+
- `GET /outputs/{job_id}/log.jsonl` — append-only job log
|
|
144
|
+
- `POST /sse` — MCP Streamable-HTTP transport
|
|
145
|
+
|
|
146
|
+
## Configuration
|
|
147
|
+
|
|
148
|
+
| Env var | Default (daemon) | Default (container) | Purpose |
|
|
149
|
+
|---|---|---|---|
|
|
150
|
+
| `PORT` | `35833` | `35833` | HTTP bind port |
|
|
151
|
+
| `HOST` | `0.0.0.0` | `0.0.0.0` | HTTP bind address |
|
|
152
|
+
| `OUTPUT_ROOT` | `./output` | `/data/output` | Per-job output dirs |
|
|
153
|
+
| `CACHE_ROOT` | `./cache` | `/data/cache` | Materialized URL / base64 PDFs |
|
|
154
|
+
| `OUTPUT_EXPIRY_DAYS` | `7` | `7` | Sweep finished jobs older than N days |
|
|
155
|
+
| `MAX_INLINE_PDF_BYTES` | `25 MB` | `25 MB` | Cap on base64 upload size |
|
|
156
|
+
| `MAX_URL_PDF_BYTES` | `200 MB` | `200 MB` | Cap on URL download size |
|
|
157
|
+
| `SYNC_PAGE_THRESHOLD` | `20` | `20` | Inline-vs-job cutoff for Markdown conversion |
|
|
158
|
+
| `DOCLING_ARTIFACTS_PATH` | `~/.cache/docling` | `/opt/docling-models` | Docling layout-model cache |
|
|
159
|
+
| `ENABLE_OCR` | `false` | `false` | Enable Docling OCR (Tesseract required) |
|
|
160
|
+
| `PUBLIC_BASE_URL` | `http://localhost:35833` | `http://localhost:35833` | Used to build `output_url` |
|
|
161
|
+
|
|
162
|
+
## Resource notes
|
|
163
|
+
|
|
164
|
+
- Docling downloads a ~200–500 MB layout model on first use. The
|
|
165
|
+
container image pre-fetches it at build time; the daemon warms it at
|
|
166
|
+
startup (`stdio_entry` / HTTP lifespan).
|
|
167
|
+
- pypdf, pdfplumber, and the URL / base64 paths are fast and have no ML
|
|
168
|
+
overhead — use `pdf_info`, `pdf_toc`, `pdf_read_text`, and
|
|
169
|
+
`pdf_find_tables` whenever Markdown isn't strictly needed.
|
|
170
|
+
|
|
171
|
+
## Releasing
|
|
172
|
+
|
|
173
|
+
Tag-driven CI publishes to both PyPI (`flint-slating`) and GHCR
|
|
174
|
+
(`ghcr.io/parkviewlab/flint-slating`):
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
# Bump version in pyproject.toml first, then:
|
|
178
|
+
git tag v0.1.0
|
|
179
|
+
git push origin v0.1.0
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
The release workflow refuses tags that don't match `pyproject.toml`'s
|
|
183
|
+
`version`, or that aren't on `origin/main`.
|
|
184
|
+
|
|
185
|
+
## License
|
|
186
|
+
|
|
187
|
+
[MIT](LICENSE). flint-slating only depends on permissively licensed
|
|
188
|
+
libraries; the CI `license-check` job enforces this on every PR.
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# flint-slating
|
|
2
|
+
|
|
3
|
+
MCP server that reads PDFs and exposes them to LLM consumers as
|
|
4
|
+
structured Markdown, plus the usual ancillaries: metadata, outline,
|
|
5
|
+
images, tables.
|
|
6
|
+
|
|
7
|
+
Designed to pair with a separate "wiki" MCP server that handles the
|
|
8
|
+
*writing* side — an agent calls `flint-slating` to read PDFs and another
|
|
9
|
+
MCP to persist notes about them into a frontmattered-markdown knowledge
|
|
10
|
+
base.
|
|
11
|
+
|
|
12
|
+
## What it does
|
|
13
|
+
|
|
14
|
+
Built on a permissive-license PDF stack:
|
|
15
|
+
|
|
16
|
+
| Library | License | Role |
|
|
17
|
+
|---|---|---|
|
|
18
|
+
| [Docling](https://github.com/docling-project/docling) | MIT | PDF → Markdown with heading hierarchy, multi-column reading order, and Markdown tables |
|
|
19
|
+
| [pypdf](https://github.com/py-pdf/pypdf) | BSD-3 | metadata, TOC, page count, encryption checks, image enumeration |
|
|
20
|
+
| [pdfplumber](https://github.com/jsvine/pdfplumber) | MIT | per-page table extraction |
|
|
21
|
+
|
|
22
|
+
**There is no PyMuPDF, no MuPDF, no AGPL or GPL anywhere in the
|
|
23
|
+
dependency tree.** A CI license-check job rejects PRs that pull in
|
|
24
|
+
copyleft transitive deps.
|
|
25
|
+
|
|
26
|
+
## Transports
|
|
27
|
+
|
|
28
|
+
Two transports off the same MCP server, selected via `--transport`:
|
|
29
|
+
|
|
30
|
+
| Transport | Run via | Use case |
|
|
31
|
+
|---|---|---|
|
|
32
|
+
| **Streamable-HTTP** (default) | `uvx flint-slating` or `--transport http` | Long-lived local daemon, container, or shared service. |
|
|
33
|
+
| **stdio** | `uvx flint-slating --transport stdio` | The standard MCP integration shape — drop into `claude_desktop_config.json` or any `mcp.json`. |
|
|
34
|
+
|
|
35
|
+
## Run
|
|
36
|
+
|
|
37
|
+
### As an HTTP daemon (default)
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
uvx flint-slating # listens on PORT (default 35833)
|
|
41
|
+
curl http://127.0.0.1:35833/health
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Or install it as a persistent tool:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
uv tool install flint-slating
|
|
48
|
+
flint-slating
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### As a stdio MCP server
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
uvx flint-slating --transport stdio
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Wire into Claude Code's MCP config:
|
|
58
|
+
|
|
59
|
+
```json
|
|
60
|
+
{
|
|
61
|
+
"mcpServers": {
|
|
62
|
+
"flint-slating": {
|
|
63
|
+
"command": "uvx",
|
|
64
|
+
"args": ["flint-slating", "--transport", "stdio"]
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Docker
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
docker run --rm \
|
|
74
|
+
-p 35833:35833 \
|
|
75
|
+
-v $(pwd)/pdfs:/pdfs:ro \
|
|
76
|
+
-v flint-slating-data:/data \
|
|
77
|
+
ghcr.io/parkviewlab/flint-slating:latest
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Or use [`docker-compose.yml`](docker-compose.yml) for a persistent stack.
|
|
81
|
+
|
|
82
|
+
## MCP tools
|
|
83
|
+
|
|
84
|
+
All PDF tools take a `source` argument with one of:
|
|
85
|
+
|
|
86
|
+
- `{"path": "/abs/path/to/file.pdf"}` — local file
|
|
87
|
+
- `{"url": "https://..."}` — streamed to a content-addressed cache
|
|
88
|
+
- `{"bytes_b64": "...", "filename": "x.pdf"}` — base64 upload (size-capped)
|
|
89
|
+
|
|
90
|
+
| Tool | What it does |
|
|
91
|
+
|---|---|
|
|
92
|
+
| `pdf_info` | `{page_count, metadata, is_encrypted, sha256}` |
|
|
93
|
+
| `pdf_toc` | flat outline `[{level, title, page}]` |
|
|
94
|
+
| `pdf_read_text` | plain text by page range (fast — pypdf, no ML) |
|
|
95
|
+
| `pdf_read_markdown` | high-quality Markdown via Docling (hybrid sync/async — see below) |
|
|
96
|
+
| `pdf_read_chunks` | per-page Markdown chunks with tables/images/toc_items (hybrid sync/async) |
|
|
97
|
+
| `pdf_list_images` | enumerate images: `[{page, index, name, width, height, ext}]` |
|
|
98
|
+
| `pdf_extract_image` | base64 bytes of one image |
|
|
99
|
+
| `pdf_find_tables` | per-page Markdown tables via pdfplumber |
|
|
100
|
+
| `get_job_status` | poll a background job |
|
|
101
|
+
| `get_job_result` | fetch a finished job's artifact |
|
|
102
|
+
| `cancel_job` | cancel a running job |
|
|
103
|
+
|
|
104
|
+
### Hybrid sync/async
|
|
105
|
+
|
|
106
|
+
`pdf_read_markdown` and `pdf_read_chunks` run inline when
|
|
107
|
+
`page_count <= SYNC_PAGE_THRESHOLD` (default 20). For larger PDFs they
|
|
108
|
+
queue a background job and return a `job_id` — poll `get_job_status`
|
|
109
|
+
until `state=="done"`, then call `get_job_result` (or, in HTTP mode,
|
|
110
|
+
fetch `output_url` directly).
|
|
111
|
+
|
|
112
|
+
**stdio mode** transparently waits for the job inline — there's no HTTP
|
|
113
|
+
server to download from, so the originating tool call blocks until the
|
|
114
|
+
result is ready and returns it directly.
|
|
115
|
+
|
|
116
|
+
## HTTP endpoints (HTTP mode only)
|
|
117
|
+
|
|
118
|
+
- `GET /health` — `{ok, version, uptime_seconds}`
|
|
119
|
+
- `GET /admin/version` — package and dependency versions, Docling model status
|
|
120
|
+
- `GET /admin/jobs` — recent job list
|
|
121
|
+
- `GET /outputs/{job_id}/result.md` — finished Markdown
|
|
122
|
+
- `GET /outputs/{job_id}/result.json` — finished chunked output
|
|
123
|
+
- `GET /outputs/{job_id}/log.jsonl` — append-only job log
|
|
124
|
+
- `POST /sse` — MCP Streamable-HTTP transport
|
|
125
|
+
|
|
126
|
+
## Configuration
|
|
127
|
+
|
|
128
|
+
| Env var | Default (daemon) | Default (container) | Purpose |
|
|
129
|
+
|---|---|---|---|
|
|
130
|
+
| `PORT` | `35833` | `35833` | HTTP bind port |
|
|
131
|
+
| `HOST` | `0.0.0.0` | `0.0.0.0` | HTTP bind address |
|
|
132
|
+
| `OUTPUT_ROOT` | `./output` | `/data/output` | Per-job output dirs |
|
|
133
|
+
| `CACHE_ROOT` | `./cache` | `/data/cache` | Materialized URL / base64 PDFs |
|
|
134
|
+
| `OUTPUT_EXPIRY_DAYS` | `7` | `7` | Sweep finished jobs older than N days |
|
|
135
|
+
| `MAX_INLINE_PDF_BYTES` | `25 MB` | `25 MB` | Cap on base64 upload size |
|
|
136
|
+
| `MAX_URL_PDF_BYTES` | `200 MB` | `200 MB` | Cap on URL download size |
|
|
137
|
+
| `SYNC_PAGE_THRESHOLD` | `20` | `20` | Inline-vs-job cutoff for Markdown conversion |
|
|
138
|
+
| `DOCLING_ARTIFACTS_PATH` | `~/.cache/docling` | `/opt/docling-models` | Docling layout-model cache |
|
|
139
|
+
| `ENABLE_OCR` | `false` | `false` | Enable Docling OCR (Tesseract required) |
|
|
140
|
+
| `PUBLIC_BASE_URL` | `http://localhost:35833` | `http://localhost:35833` | Used to build `output_url` |
|
|
141
|
+
|
|
142
|
+
## Resource notes
|
|
143
|
+
|
|
144
|
+
- Docling downloads a ~200–500 MB layout model on first use. The
|
|
145
|
+
container image pre-fetches it at build time; the daemon warms it at
|
|
146
|
+
startup (`stdio_entry` / HTTP lifespan).
|
|
147
|
+
- pypdf, pdfplumber, and the URL / base64 paths are fast and have no ML
|
|
148
|
+
overhead — use `pdf_info`, `pdf_toc`, `pdf_read_text`, and
|
|
149
|
+
`pdf_find_tables` whenever Markdown isn't strictly needed.
|
|
150
|
+
|
|
151
|
+
## Releasing
|
|
152
|
+
|
|
153
|
+
Tag-driven CI publishes to both PyPI (`flint-slating`) and GHCR
|
|
154
|
+
(`ghcr.io/parkviewlab/flint-slating`):
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
# Bump version in pyproject.toml first, then:
|
|
158
|
+
git tag v0.1.0
|
|
159
|
+
git push origin v0.1.0
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
The release workflow refuses tags that don't match `pyproject.toml`'s
|
|
163
|
+
`version`, or that aren't on `origin/main`.
|
|
164
|
+
|
|
165
|
+
## License
|
|
166
|
+
|
|
167
|
+
[MIT](LICENSE). flint-slating only depends on permissively licensed
|
|
168
|
+
libraries; the CI `license-check` job enforces this on every PR.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Example Docker Compose stack for flint-slating.
#
# Copy this file, replace every CHANGE-ME value, then start the stack:
#   docker compose up -d
#
# Move to a newer image with:
#   docker compose pull && docker compose up -d

services:
  flint-slating:
    image: ghcr.io/parkviewlab/flint-slating:latest
    container_name: flint-slating
    restart: unless-stopped

    ports:
      - '35833:35833'

    environment:
      OUTPUT_ROOT: /data/output
      CACHE_ROOT: /data/cache

      # Finished job directories older than this many days are purged
      # automatically; 0 turns the purge off.
      OUTPUT_EXPIRY_DAYS: '7'

      # CHANGE-ME: the address at which clients can reach this daemon from
      # outside the container. The job tools use it to build the absolute
      # `output_url` values they return.
      # For instance:
      #   PUBLIC_BASE_URL: "http://192.168.1.50:35833"
      #   PUBLIC_BASE_URL: "https://flint.example.com"
      PUBLIC_BASE_URL: 'http://CHANGE-ME:35833'

      # Location of the Docling layout model cache, which is baked into the
      # image at build time.
      DOCLING_ARTIFACTS_PATH: /opt/docling-models

      # Switch to "true" to turn on Docling OCR. OCR needs tesseract, which
      # this image does not ship — extend the Dockerfile or use a mount.
      ENABLE_OCR: 'false'

    volumes:
      # Default storage: a named volume managed by Docker.
      - flint-slating-data:/data

      # Alternative — bind-mount a directory from the host instead:
      # - /srv/flint-slating:/data

      # To read PDFs that live on the host through `source.path`, mount
      # their directory read-only:
      # - /srv/pdf-corpus:/pdfs:ro

    extra_hosts:
      - 'host.docker.internal:host-gateway'

volumes:
  flint-slating-data:
|