flint-slating 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. flint_slating-0.1.0/.github/workflows/license-check.yml +35 -0
  2. flint_slating-0.1.0/.github/workflows/release.yml +97 -0
  3. flint_slating-0.1.0/.github/workflows/test.yml +28 -0
  4. flint_slating-0.1.0/.gitignore +14 -0
  5. flint_slating-0.1.0/Dockerfile +40 -0
  6. flint_slating-0.1.0/LICENSE +21 -0
  7. flint_slating-0.1.0/PKG-INFO +188 -0
  8. flint_slating-0.1.0/README.md +168 -0
  9. flint_slating-0.1.0/docker-compose.yml +54 -0
  10. flint_slating-0.1.0/pyproject.toml +61 -0
  11. flint_slating-0.1.0/src/flint_slating/__init__.py +0 -0
  12. flint_slating-0.1.0/src/flint_slating/__main__.py +62 -0
  13. flint_slating-0.1.0/src/flint_slating/app.py +41 -0
  14. flint_slating-0.1.0/src/flint_slating/config.py +45 -0
  15. flint_slating-0.1.0/src/flint_slating/images.py +100 -0
  16. flint_slating-0.1.0/src/flint_slating/jobs.py +262 -0
  17. flint_slating-0.1.0/src/flint_slating/mcp_server.py +26 -0
  18. flint_slating-0.1.0/src/flint_slating/outputs.py +139 -0
  19. flint_slating-0.1.0/src/flint_slating/pdf_reader.py +420 -0
  20. flint_slating-0.1.0/src/flint_slating/pdf_source.py +184 -0
  21. flint_slating-0.1.0/src/flint_slating/routes.py +175 -0
  22. flint_slating-0.1.0/src/flint_slating/schema.py +40 -0
  23. flint_slating-0.1.0/src/flint_slating/tables.py +91 -0
  24. flint_slating-0.1.0/src/flint_slating/tools.py +364 -0
  25. flint_slating-0.1.0/tests/__init__.py +0 -0
  26. flint_slating-0.1.0/tests/conftest.py +86 -0
  27. flint_slating-0.1.0/tests/test_http_routes.py +31 -0
  28. flint_slating-0.1.0/tests/test_jobs.py +61 -0
  29. flint_slating-0.1.0/tests/test_pdf_reader.py +56 -0
  30. flint_slating-0.1.0/tests/test_pdf_source.py +54 -0
  31. flint_slating-0.1.0/tests/test_tools.py +64 -0
  32. flint_slating-0.1.0/uv.lock +2749 -0
@@ -0,0 +1,35 @@
1
+ name: License check
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [main, develop]
6
+ push:
7
+ branches: [main, develop]
8
+
9
+ permissions:
10
+ contents: read
11
+
12
+ jobs:
13
+ licenses:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Install uv
19
+ uses: astral-sh/setup-uv@v3
20
+
21
+ - name: Sync deps (including dev for pip-licenses)
22
+ run: uv sync
23
+
24
+ # We use a fail-on deny-list rather than an allow-list because PyPI
25
+ # license metadata is wildly inconsistent (MIT-CMU, "Apache 2.0
26
+ # License", multi-license SPDX expressions, etc.). The real concern
27
+ # is keeping copyleft out — specifically the PyMuPDF AGPL, Marker
28
+ # GPL, and Unstructured's AGPL transitive (ultralytics) trapdoors
29
+ # we explicitly designed this stack to avoid.
30
+ - name: Check transitive licenses
31
+ run: |
32
+ uv run pip-licenses \
33
+ --from=mixed \
34
+ --fail-on="GPL;GPL v2;GPL v3;GNU General Public License;GNU General Public License v2 (GPLv2);GNU General Public License v3 (GPLv3);AGPL;Affero;GNU Affero General Public License v3;GNU Affero General Public License v3 (AGPLv3);LGPL;GNU Lesser General Public License;GNU Lesser General Public License v2 (LGPLv2);GNU Lesser General Public License v3 (LGPLv3);GNU Library or Lesser General Public License (LGPL);SSPL;Server Side Public License" \
35
+ --order=license
@@ -0,0 +1,97 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags: ['v*']
6
+
7
+ permissions:
8
+ contents: read
9
+ packages: write # GHCR push
10
+ id-token: write # PyPI trusted publishing
11
+
12
+ jobs:
13
+ # Pre-flight checks. Both publish jobs depend on this, so a failed gate
14
+ # prevents *any* artifact from shipping (a bad tag never half-ships).
15
+ gate:
16
+ runs-on: ubuntu-latest
17
+ steps:
18
+ - uses: actions/checkout@v4
19
+ with:
20
+ fetch-depth: 0 # full history needed for the ancestor check below
21
+
22
+ - name: Install uv
23
+ uses: astral-sh/setup-uv@v3
24
+
25
+ - name: Verify tag matches pyproject version
26
+ run: |
27
+ TAG="${GITHUB_REF#refs/tags/v}"
28
+ PROJECT_VERSION=$(uv run python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")
29
+ if [ "$TAG" != "$PROJECT_VERSION" ]; then
30
+ echo "tag $TAG != pyproject $PROJECT_VERSION"
31
+ exit 1
32
+ fi
33
+
34
+ - name: Verify tag is reachable from origin/main
35
+ run: |
36
+ git fetch --no-tags origin main
37
+ if ! git merge-base --is-ancestor "$GITHUB_SHA" origin/main; then
38
+ echo "tagged commit $GITHUB_SHA is not on origin/main"
39
+ echo "release tags must come from main — back-merge through, then re-tag from main"
40
+ exit 1
41
+ fi
42
+
43
+ docker:
44
+ needs: gate
45
+ runs-on: ubuntu-latest
46
+ steps:
47
+ - uses: actions/checkout@v4
48
+
49
+ - name: Set up QEMU (arm64 emulation)
50
+ uses: docker/setup-qemu-action@v3
51
+
52
+ - name: Set up Docker Buildx
53
+ uses: docker/setup-buildx-action@v3
54
+
55
+ - name: Log in to GHCR
56
+ uses: docker/login-action@v3
57
+ with:
58
+ registry: ghcr.io
59
+ username: ${{ github.actor }}
60
+ password: ${{ secrets.GITHUB_TOKEN }}
61
+
62
+ - name: Extract metadata (tags + labels)
63
+ id: meta
64
+ uses: docker/metadata-action@v5
65
+ with:
66
+ images: ghcr.io/${{ github.repository }}
67
+ tags: |
68
+ type=semver,pattern={{version}}
69
+ type=semver,pattern={{major}}.{{minor}}
70
+ type=raw,value=latest
71
+
72
+ - name: Build and push (linux/amd64 + linux/arm64)
73
+ uses: docker/build-push-action@v5
74
+ with:
75
+ context: .
76
+ platforms: linux/amd64,linux/arm64
77
+ push: true
78
+ tags: ${{ steps.meta.outputs.tags }}
79
+ labels: ${{ steps.meta.outputs.labels }}
80
+ cache-from: type=gha
81
+ cache-to: type=gha,mode=max
82
+
83
+ pypi:
84
+ needs: gate
85
+ runs-on: ubuntu-latest
86
+ environment: pypi # matches the trusted-publisher config on PyPI
87
+ steps:
88
+ - uses: actions/checkout@v4
89
+
90
+ - name: Install uv
91
+ uses: astral-sh/setup-uv@v3
92
+
93
+ - name: Build wheel + sdist
94
+ run: uv build
95
+
96
+ - name: Publish to PyPI
97
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,28 @@
1
+ name: Test
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [main, develop]
6
+ push:
7
+ branches: [main, develop]
8
+
9
+ permissions:
10
+ contents: read
11
+
12
+ jobs:
13
+ test:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Install uv
19
+ uses: astral-sh/setup-uv@v3
20
+
21
+ - name: Sync deps
22
+ run: uv sync
23
+
24
+ - name: Lint
25
+ run: uv run ruff check src tests
26
+
27
+ - name: Tests (excluding network + docling marker)
28
+ run: uv run pytest -m "not network and not docling" -q
@@ -0,0 +1,14 @@
1
+ __pycache__/
2
+ *.pyc
3
+ .venv/
4
+ .pytest_cache/
5
+ .ruff_cache/
6
+ .ty_cache/
7
+ *.egg-info/
8
+ dist/
9
+ build/
10
+ .DS_Store
11
+ .claude/
12
+ output/
13
+ cache/
14
+ .docling-models/
@@ -0,0 +1,40 @@
1
+ FROM ghcr.io/astral-sh/uv:python3.13-bookworm-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # CA certs for httpx URL downloads. No system PDF tools needed — the
6
+ # whole PDF stack installs from prebuilt wheels (docling, pypdf, pdfplumber,
7
+ # pypdfium2). git is dev-time only.
8
+ RUN apt-get update \
9
+ && apt-get install -y --no-install-recommends ca-certificates \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ # Resolve deps first so they cache across source changes.
13
+ COPY pyproject.toml uv.lock ./
14
+ RUN --mount=type=cache,target=/root/.cache/uv \
15
+ uv sync --no-dev --no-install-project
16
+
17
+ # README.md is part of the package metadata (pyproject.toml -> readme).
18
+ COPY README.md ./
19
+ COPY src/ src/
20
+ RUN --mount=type=cache,target=/root/.cache/uv \
21
+ uv sync --no-dev
22
+
23
+ # Pre-fetch Docling's layout model so the first user-facing call is hot.
24
+ # Failure here is not fatal — the runtime will re-download on first use.
25
+ ENV DOCLING_ARTIFACTS_PATH=/opt/docling-models
26
+ RUN uv run python -c "from docling.document_converter import DocumentConverter; DocumentConverter()" || true
27
+
28
+ ENV PYTHONUNBUFFERED=1 \
29
+ OUTPUT_ROOT=/data/output \
30
+ CACHE_ROOT=/data/cache \
31
+ PORT=35833 \
32
+ HOST=0.0.0.0
33
+
34
+ EXPOSE 35833
35
+ VOLUME ["/data"]
36
+
37
+ # Container always runs the HTTP transport — stdio across a container
38
+ # boundary doesn't make sense. HTTP is the default; --transport http is
39
+ # explicit so anyone reading the Dockerfile knows what mode it runs in.
40
+ CMD ["uv", "run", "python", "-m", "flint_slating", "--transport", "http"]
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Parkview Lab
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,188 @@
1
+ Metadata-Version: 2.4
2
+ Name: flint-slating
3
+ Version: 0.1.0
4
+ Summary: MCP server that reads PDFs (metadata, TOC, Markdown, text, images, tables) for downstream LLM consumers.
5
+ Author: Parkview Lab
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.13
9
+ Requires-Dist: anyio
10
+ Requires-Dist: docling>=2.0
11
+ Requires-Dist: fastapi
12
+ Requires-Dist: httpx
13
+ Requires-Dist: mcp[cli]>=1.27.0
14
+ Requires-Dist: pdfplumber>=0.11
15
+ Requires-Dist: pydantic
16
+ Requires-Dist: pypdf>=5.0
17
+ Requires-Dist: starlette
18
+ Requires-Dist: uvicorn[standard]
19
+ Description-Content-Type: text/markdown
20
+
21
+ # flint-slating
22
+
23
+ MCP server that reads PDFs and exposes them to LLM consumers as
24
+ structured Markdown, plus the usual ancillaries: metadata, outline,
25
+ images, tables.
26
+
27
+ Designed to pair with a separate "wiki" MCP server that handles the
28
+ *writing* side — an agent calls `flint-slating` to read PDFs and another
29
+ MCP to persist notes about them into a frontmattered-markdown knowledge
30
+ base.
31
+
32
+ ## What it does
33
+
34
+ Built on a permissive-license PDF stack:
35
+
36
+ | Library | License | Role |
37
+ |---|---|---|
38
+ | [Docling](https://github.com/docling-project/docling) | MIT | PDF → Markdown with heading hierarchy, multi-column reading order, and Markdown tables |
39
+ | [pypdf](https://github.com/py-pdf/pypdf) | BSD-3 | metadata, TOC, page count, encryption checks, image enumeration |
40
+ | [pdfplumber](https://github.com/jsvine/pdfplumber) | MIT | per-page table extraction |
41
+
42
+ **There is no PyMuPDF, no MuPDF, no AGPL or GPL anywhere in the
43
+ dependency tree.** A CI license-check job rejects PRs that pull in
44
+ copyleft transitive deps.
45
+
46
+ ## Transports
47
+
48
+ Two transports off the same MCP server, selected via `--transport`:
49
+
50
+ | Transport | Run via | Use case |
51
+ |---|---|---|
52
+ | **Streamable-HTTP** (default) | `uvx flint-slating` or `--transport http` | Long-lived local daemon, container, or shared service. |
53
+ | **stdio** | `uvx flint-slating --transport stdio` | The standard MCP integration shape — drop into `claude_desktop_config.json` or any `mcp.json`. |
54
+
55
+ ## Run
56
+
57
+ ### As an HTTP daemon (default)
58
+
59
+ ```bash
60
+ uvx flint-slating # listens on PORT (default 35833)
61
+ curl http://127.0.0.1:35833/health
62
+ ```
63
+
64
+ Or pin it:
65
+
66
+ ```bash
67
+ uv tool install flint-slating
68
+ flint-slating
69
+ ```
70
+
71
+ ### As a stdio MCP server
72
+
73
+ ```bash
74
+ uvx flint-slating --transport stdio
75
+ ```
76
+
77
+ Wire into Claude Code's MCP config:
78
+
79
+ ```json
80
+ {
81
+ "mcpServers": {
82
+ "flint-slating": {
83
+ "command": "uvx",
84
+ "args": ["flint-slating", "--transport", "stdio"]
85
+ }
86
+ }
87
+ }
88
+ ```
89
+
90
+ ### Docker
91
+
92
+ ```bash
93
+ docker run --rm \
94
+ -p 35833:35833 \
95
+ -v "$(pwd)/pdfs:/pdfs:ro" \
96
+ -v flint-slating-data:/data \
97
+ ghcr.io/parkviewlab/flint-slating:latest
98
+ ```
99
+
100
+ Or use [`docker-compose.yml`](docker-compose.yml) for a persistent stack.
101
+
102
+ ## MCP tools
103
+
104
+ All PDF tools take a `source` argument with one of:
105
+
106
+ - `{"path": "/abs/path/to/file.pdf"}` — local file
107
+ - `{"url": "https://..."}` — streamed to a content-addressed cache
108
+ - `{"bytes_b64": "...", "filename": "x.pdf"}` — base64 upload (size-capped)
109
+
110
+ | Tool | What it does |
111
+ |---|---|
112
+ | `pdf_info` | `{page_count, metadata, is_encrypted, sha256}` |
113
+ | `pdf_toc` | flat outline `[{level, title, page}]` |
114
+ | `pdf_read_text` | plain text by page range (fast — pypdf, no ML) |
115
+ | `pdf_read_markdown` | high-quality Markdown via Docling (hybrid sync/async — see below) |
116
+ | `pdf_read_chunks` | per-page Markdown chunks with tables/images/toc_items (hybrid sync/async) |
117
+ | `pdf_list_images` | enumerate images: `[{page, index, name, width, height, ext}]` |
118
+ | `pdf_extract_image` | base64 bytes of one image |
119
+ | `pdf_find_tables` | per-page Markdown tables via pdfplumber |
120
+ | `get_job_status` | poll a background job |
121
+ | `get_job_result` | fetch a finished job's artifact |
122
+ | `cancel_job` | cancel a running job |
123
+
124
+ ### Hybrid sync/async
125
+
126
+ `pdf_read_markdown` and `pdf_read_chunks` run inline when
127
+ `page_count <= SYNC_PAGE_THRESHOLD` (default 20). For larger PDFs they
128
+ queue a background job and return a `job_id` — poll `get_job_status`
129
+ until `state=="done"`, then call `get_job_result` (or, in HTTP mode,
130
+ fetch `output_url` directly).
131
+
132
+ **stdio mode** transparently waits for the job inline — there's no HTTP
133
+ server to download from, so the originating tool call blocks until the
134
+ result is ready and returns it directly.
135
+
136
+ ## HTTP endpoints (HTTP mode only)
137
+
138
+ - `GET /health` — `{ok, version, uptime_seconds}`
139
+ - `GET /admin/version` — package and dependency versions, Docling model status
140
+ - `GET /admin/jobs` — recent job list
141
+ - `GET /outputs/{job_id}/result.md` — finished Markdown
142
+ - `GET /outputs/{job_id}/result.json` — finished chunked output
143
+ - `GET /outputs/{job_id}/log.jsonl` — append-only job log
144
+ - `POST /sse` — MCP Streamable-HTTP transport
145
+
146
+ ## Configuration
147
+
148
+ | Env var | Default (daemon) | Default (container) | Purpose |
149
+ |---|---|---|---|
150
+ | `PORT` | `35833` | `35833` | HTTP bind port |
151
+ | `HOST` | `0.0.0.0` | `0.0.0.0` | HTTP bind address |
152
+ | `OUTPUT_ROOT` | `./output` | `/data/output` | Per-job output dirs |
153
+ | `CACHE_ROOT` | `./cache` | `/data/cache` | Materialized URL / base64 PDFs |
154
+ | `OUTPUT_EXPIRY_DAYS` | `7` | `7` | Sweep finished jobs older than N days |
155
+ | `MAX_INLINE_PDF_BYTES` | `25 MB` | `25 MB` | Cap on base64 upload size |
156
+ | `MAX_URL_PDF_BYTES` | `200 MB` | `200 MB` | Cap on URL download size |
157
+ | `SYNC_PAGE_THRESHOLD` | `20` | `20` | Inline-vs-job cutoff for Markdown conversion |
158
+ | `DOCLING_ARTIFACTS_PATH` | `~/.cache/docling` | `/opt/docling-models` | Docling layout-model cache |
159
+ | `ENABLE_OCR` | `false` | `false` | Enable Docling OCR (Tesseract required) |
160
+ | `PUBLIC_BASE_URL` | `http://localhost:35833` | `http://localhost:35833` | Used to build `output_url` |
161
+
162
+ ## Resource notes
163
+
164
+ - Docling downloads a ~200–500 MB layout model on first use. The
165
+ container image pre-fetches it at build time; the daemon warms it at
166
+ startup (`stdio_entry` / HTTP lifespan).
167
+ - pypdf, pdfplumber, and the URL / base64 paths are fast and have no ML
168
+ overhead — use `pdf_info`, `pdf_toc`, `pdf_read_text`, and
169
+ `pdf_find_tables` whenever Markdown isn't strictly needed.
170
+
171
+ ## Releasing
172
+
173
+ Tag-driven CI publishes to both PyPI (`flint-slating`) and GHCR
174
+ (`ghcr.io/parkviewlab/flint-slating`):
175
+
176
+ ```bash
177
+ # Bump version in pyproject.toml first, then:
178
+ git tag v0.1.0
179
+ git push origin v0.1.0
180
+ ```
181
+
182
+ The release workflow refuses tags that don't match `pyproject.toml`'s
183
+ `version`, or that aren't on `origin/main`.
184
+
185
+ ## License
186
+
187
+ [MIT](LICENSE). flint-slating depends only on permissively licensed
188
+ libraries; the CI `license-check` job enforces this on every PR.
@@ -0,0 +1,168 @@
1
+ # flint-slating
2
+
3
+ MCP server that reads PDFs and exposes them to LLM consumers as
4
+ structured Markdown, plus the usual ancillaries: metadata, outline,
5
+ images, tables.
6
+
7
+ Designed to pair with a separate "wiki" MCP server that handles the
8
+ *writing* side — an agent calls `flint-slating` to read PDFs and another
9
+ MCP to persist notes about them into a frontmattered-markdown knowledge
10
+ base.
11
+
12
+ ## What it does
13
+
14
+ Built on a permissive-license PDF stack:
15
+
16
+ | Library | License | Role |
17
+ |---|---|---|
18
+ | [Docling](https://github.com/docling-project/docling) | MIT | PDF → Markdown with heading hierarchy, multi-column reading order, and Markdown tables |
19
+ | [pypdf](https://github.com/py-pdf/pypdf) | BSD-3 | metadata, TOC, page count, encryption checks, image enumeration |
20
+ | [pdfplumber](https://github.com/jsvine/pdfplumber) | MIT | per-page table extraction |
21
+
22
+ **There is no PyMuPDF, no MuPDF, no AGPL or GPL anywhere in the
23
+ dependency tree.** A CI license-check job rejects PRs that pull in
24
+ copyleft transitive deps.
25
+
26
+ ## Transports
27
+
28
+ Two transports off the same MCP server, selected via `--transport`:
29
+
30
+ | Transport | Run via | Use case |
31
+ |---|---|---|
32
+ | **Streamable-HTTP** (default) | `uvx flint-slating` or `--transport http` | Long-lived local daemon, container, or shared service. |
33
+ | **stdio** | `uvx flint-slating --transport stdio` | The standard MCP integration shape — drop into `claude_desktop_config.json` or any `mcp.json`. |
34
+
35
+ ## Run
36
+
37
+ ### As an HTTP daemon (default)
38
+
39
+ ```bash
40
+ uvx flint-slating # listens on PORT (default 35833)
41
+ curl http://127.0.0.1:35833/health
42
+ ```
43
+
44
+ Or pin it:
45
+
46
+ ```bash
47
+ uv tool install flint-slating
48
+ flint-slating
49
+ ```
50
+
51
+ ### As a stdio MCP server
52
+
53
+ ```bash
54
+ uvx flint-slating --transport stdio
55
+ ```
56
+
57
+ Wire into Claude Code's MCP config:
58
+
59
+ ```json
60
+ {
61
+ "mcpServers": {
62
+ "flint-slating": {
63
+ "command": "uvx",
64
+ "args": ["flint-slating", "--transport", "stdio"]
65
+ }
66
+ }
67
+ }
68
+ ```
69
+
70
+ ### Docker
71
+
72
+ ```bash
73
+ docker run --rm \
74
+ -p 35833:35833 \
75
+ -v "$(pwd)/pdfs:/pdfs:ro" \
76
+ -v flint-slating-data:/data \
77
+ ghcr.io/parkviewlab/flint-slating:latest
78
+ ```
79
+
80
+ Or use [`docker-compose.yml`](docker-compose.yml) for a persistent stack.
81
+
82
+ ## MCP tools
83
+
84
+ All PDF tools take a `source` argument with one of:
85
+
86
+ - `{"path": "/abs/path/to/file.pdf"}` — local file
87
+ - `{"url": "https://..."}` — streamed to a content-addressed cache
88
+ - `{"bytes_b64": "...", "filename": "x.pdf"}` — base64 upload (size-capped)
89
+
90
+ | Tool | What it does |
91
+ |---|---|
92
+ | `pdf_info` | `{page_count, metadata, is_encrypted, sha256}` |
93
+ | `pdf_toc` | flat outline `[{level, title, page}]` |
94
+ | `pdf_read_text` | plain text by page range (fast — pypdf, no ML) |
95
+ | `pdf_read_markdown` | high-quality Markdown via Docling (hybrid sync/async — see below) |
96
+ | `pdf_read_chunks` | per-page Markdown chunks with tables/images/toc_items (hybrid sync/async) |
97
+ | `pdf_list_images` | enumerate images: `[{page, index, name, width, height, ext}]` |
98
+ | `pdf_extract_image` | base64 bytes of one image |
99
+ | `pdf_find_tables` | per-page Markdown tables via pdfplumber |
100
+ | `get_job_status` | poll a background job |
101
+ | `get_job_result` | fetch a finished job's artifact |
102
+ | `cancel_job` | cancel a running job |
103
+
104
+ ### Hybrid sync/async
105
+
106
+ `pdf_read_markdown` and `pdf_read_chunks` run inline when
107
+ `page_count <= SYNC_PAGE_THRESHOLD` (default 20). For larger PDFs they
108
+ queue a background job and return a `job_id` — poll `get_job_status`
109
+ until `state=="done"`, then call `get_job_result` (or, in HTTP mode,
110
+ fetch `output_url` directly).
111
+
112
+ **stdio mode** transparently waits for the job inline — there's no HTTP
113
+ server to download from, so the originating tool call blocks until the
114
+ result is ready and returns it directly.
115
+
116
+ ## HTTP endpoints (HTTP mode only)
117
+
118
+ - `GET /health` — `{ok, version, uptime_seconds}`
119
+ - `GET /admin/version` — package and dependency versions, Docling model status
120
+ - `GET /admin/jobs` — recent job list
121
+ - `GET /outputs/{job_id}/result.md` — finished Markdown
122
+ - `GET /outputs/{job_id}/result.json` — finished chunked output
123
+ - `GET /outputs/{job_id}/log.jsonl` — append-only job log
124
+ - `POST /sse` — MCP Streamable-HTTP transport
125
+
126
+ ## Configuration
127
+
128
+ | Env var | Default (daemon) | Default (container) | Purpose |
129
+ |---|---|---|---|
130
+ | `PORT` | `35833` | `35833` | HTTP bind port |
131
+ | `HOST` | `0.0.0.0` | `0.0.0.0` | HTTP bind address |
132
+ | `OUTPUT_ROOT` | `./output` | `/data/output` | Per-job output dirs |
133
+ | `CACHE_ROOT` | `./cache` | `/data/cache` | Materialized URL / base64 PDFs |
134
+ | `OUTPUT_EXPIRY_DAYS` | `7` | `7` | Sweep finished jobs older than N days |
135
+ | `MAX_INLINE_PDF_BYTES` | `25 MB` | `25 MB` | Cap on base64 upload size |
136
+ | `MAX_URL_PDF_BYTES` | `200 MB` | `200 MB` | Cap on URL download size |
137
+ | `SYNC_PAGE_THRESHOLD` | `20` | `20` | Inline-vs-job cutoff for Markdown conversion |
138
+ | `DOCLING_ARTIFACTS_PATH` | `~/.cache/docling` | `/opt/docling-models` | Docling layout-model cache |
139
+ | `ENABLE_OCR` | `false` | `false` | Enable Docling OCR (Tesseract required) |
140
+ | `PUBLIC_BASE_URL` | `http://localhost:35833` | `http://localhost:35833` | Used to build `output_url` |
141
+
142
+ ## Resource notes
143
+
144
+ - Docling downloads a ~200–500 MB layout model on first use. The
145
+ container image pre-fetches it at build time; the daemon warms it at
146
+ startup (`stdio_entry` / HTTP lifespan).
147
+ - pypdf, pdfplumber, and the URL / base64 paths are fast and have no ML
148
+ overhead — use `pdf_info`, `pdf_toc`, `pdf_read_text`, and
149
+ `pdf_find_tables` whenever Markdown isn't strictly needed.
150
+
151
+ ## Releasing
152
+
153
+ Tag-driven CI publishes to both PyPI (`flint-slating`) and GHCR
154
+ (`ghcr.io/parkviewlab/flint-slating`):
155
+
156
+ ```bash
157
+ # Bump version in pyproject.toml first, then:
158
+ git tag v0.1.0
159
+ git push origin v0.1.0
160
+ ```
161
+
162
+ The release workflow refuses tags that don't match `pyproject.toml`'s
163
+ `version`, or that aren't on `origin/main`.
164
+
165
+ ## License
166
+
167
+ [MIT](LICENSE). flint-slating depends only on permissively licensed
168
+ libraries; the CI `license-check` job enforces this on every PR.
@@ -0,0 +1,54 @@
1
+ # flint-slating — example compose stack.
2
+ #
3
+ # Copy this file, edit the CHANGE-ME values, then:
4
+ # docker compose up -d
5
+ #
6
+ # To upgrade:
7
+ # docker compose pull && docker compose up -d
8
+
9
+ services:
10
+ flint-slating:
11
+ image: ghcr.io/parkviewlab/flint-slating:latest
12
+ container_name: flint-slating
13
+ restart: unless-stopped
14
+
15
+ ports:
16
+ - "35833:35833"
17
+
18
+ environment:
19
+ OUTPUT_ROOT: /data/output
20
+ CACHE_ROOT: /data/cache
21
+
22
+ # Auto-purge finished job dirs older than this many days. 0 disables.
23
+ OUTPUT_EXPIRY_DAYS: "7"
24
+
25
+ # CHANGE-ME: the externally-reachable URL of this daemon. Used to
26
+ # build absolute `output_url` values returned by the job tools.
27
+ # Examples:
28
+ # PUBLIC_BASE_URL: "http://192.168.1.50:35833"
29
+ # PUBLIC_BASE_URL: "https://flint.example.com"
30
+ PUBLIC_BASE_URL: "http://CHANGE-ME:35833"
31
+
32
+ # Docling layout model cache — baked into the image at build time.
33
+ DOCLING_ARTIFACTS_PATH: /opt/docling-models
34
+
35
+ # Set to "true" to enable Docling OCR (requires tesseract; not in
36
+ # this image — extend the Dockerfile or use a mount).
37
+ ENABLE_OCR: "false"
38
+
39
+ volumes:
40
+ # Default: a docker-managed named volume.
41
+ - flint-slating-data:/data
42
+
43
+ # OR — bind-mount a host directory:
44
+ # - /srv/flint-slating:/data
45
+
46
+ # If your PDFs live on the host filesystem and you want to feed
47
+ # them in via `source.path`, mount that read-only:
48
+ # - /srv/pdf-corpus:/pdfs:ro
49
+
50
+ extra_hosts:
51
+ - "host.docker.internal:host-gateway"
52
+
53
+ volumes:
54
+ flint-slating-data: