groundmark 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,33 @@
1
+ name: CI Test Build
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches:
7
+ - main
8
+
9
+ jobs:
10
+ build:
11
+ name: Build pure Python package
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - name: Install uv
17
+ uses: astral-sh/setup-uv@v5
18
+
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: '3.11'
23
+
24
+ - name: Update uv lock
25
+ run: uv lock
26
+
27
+ - name: Build package
28
+ run: uv build
29
+
30
+ - uses: actions/upload-artifact@v4
31
+ with:
32
+ name: dist
33
+ path: dist/*
@@ -0,0 +1,21 @@
1
+ name: Lint
2
+ on: [push]
3
+
4
+ jobs:
5
+ lint:
6
+ runs-on: ubuntu-latest
7
+ steps:
8
+ - uses: actions/checkout@v4
9
+
10
+ - uses: actions/setup-python@v5
11
+ with:
12
+ python-version: '3.11'
13
+
14
+ - name: Install uv
15
+ uses: astral-sh/setup-uv@v5
16
+
17
+ - name: Install dependencies
18
+ run: uv sync --group dev
19
+
20
+ - name: Run pre-commit
21
+ run: uv run pre-commit run --all-files
@@ -0,0 +1,68 @@
1
+ name: Release
2
+
3
+ on:
4
+ release:
5
+ types: [created]
6
+
7
+ jobs:
8
+ build:
9
+ name: Build distribution
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Install uv
15
+ uses: astral-sh/setup-uv@v5
16
+
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v5
19
+ with:
20
+ python-version: '3.11'
21
+
22
+ - name: Update uv lock
23
+ run: uv lock
24
+
25
+ - name: Build package
26
+ run: uv build
27
+
28
+ - uses: actions/upload-artifact@v4
29
+ with:
30
+ name: dist
31
+ path: dist/*
32
+
33
+ upload_release_assets:
34
+ name: Upload Assets to Release
35
+ runs-on: ubuntu-latest
36
+ needs: [build]
37
+ permissions:
38
+ contents: write
39
+ steps:
40
+ - name: Download all artifacts
41
+ uses: actions/download-artifact@v4
42
+ with:
43
+ path: artifacts
44
+ merge-multiple: true
45
+ - name: Upload Wheels and sdist
46
+ uses: softprops/action-gh-release@v2
47
+ with:
48
+ tag_name: ${{ github.event.release.tag_name }}
49
+ files: artifacts/*
50
+ overwrite_files: true
51
+
52
+ publish-to-pypi:
53
+ name: Publish to PyPI
54
+ runs-on: ubuntu-latest
55
+ needs: [build]
56
+ environment:
57
+ name: pypi
58
+ url: https://pypi.org/p/groundmark
59
+ permissions:
60
+ id-token: write # Required for trusted publishing
61
+ steps:
62
+ - name: Download all wheels and sdist
63
+ uses: actions/download-artifact@v4
64
+ with:
65
+ path: dist
66
+ merge-multiple: true
67
+ - name: Publish to PyPI
68
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,23 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
11
+
12
+ # Tool caches
13
+ .mypy_cache/
14
+ .pytest_cache/
15
+ .ruff_cache/
16
+
17
+ # Specs (local working docs)
18
+ specs/
19
+
20
+ # Output and Data
21
+ output.md
22
+ *.pdf
23
+ !tests/data/*.pdf
@@ -0,0 +1,5 @@
1
+ {
2
+ "MD033": false,
3
+ "MD013": false,
4
+ "MD041": false
5
+ }
@@ -0,0 +1,48 @@
1
+ default_language_version:
2
+ python: python3.11
3
+ repos:
4
+ - repo: https://github.com/pre-commit/pre-commit-hooks
5
+ rev: v6.0.0
6
+ hooks:
7
+ - id: check-yaml
8
+ exclude: '\.*conda/.*'
9
+ - id: end-of-file-fixer
10
+ exclude: 'tests/fixtures'
11
+ - id: trailing-whitespace
12
+ exclude: '\.txt$|\.tsv$'
13
+ - id: check-case-conflict
14
+ - id: check-merge-conflict
15
+ - id: detect-private-key
16
+ - id: debug-statements
17
+ - id: check-added-large-files
18
+
19
+ - repo: https://github.com/igorshubovych/markdownlint-cli
20
+ rev: v0.45.0
21
+ hooks:
22
+ - id: markdownlint
23
+ exclude: 'tests/fixtures'
24
+
25
+ - repo: https://github.com/astral-sh/ruff-pre-commit
26
+ # Ruff version.
27
+ rev: v0.14.1
28
+ hooks:
29
+ - id: ruff
30
+ args: ["--fix"]
31
+ - id: ruff-format
32
+
33
+ - repo: https://github.com/pre-commit/mirrors-mypy
34
+ rev: v1.18.2
35
+ hooks:
36
+ - id: mypy
37
+ exclude: "docs/"
38
+ args:
39
+ [
40
+ --pretty,
41
+ --show-error-codes,
42
+ --no-strict-optional,
43
+ --ignore-missing-imports,
44
+ --install-types,
45
+ --non-interactive,
46
+ --config-file=./pyproject.toml
47
+ ]
48
+ additional_dependencies: []
@@ -0,0 +1 @@
1
+ 3.11
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Centre for Population Genomics
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.4
2
+ Name: groundmark
3
+ Version: 0.1.0
4
+ Summary: PDF to grounded Markdown with bounding box annotations
5
+ Project-URL: Homepage, https://github.com/populationgenomics/groundmark
6
+ Project-URL: Bug Tracker, https://github.com/populationgenomics/groundmark/issues
7
+ Author-email: Tobias Sargeant <tobias.sargeant@gmail.com>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Python: >=3.11
17
+ Requires-Dist: anchorite>=0.1.1
18
+ Requires-Dist: pdfplumber>=0.11.9
19
+ Requires-Dist: pydantic-ai-slim[anthropic,bedrock,google,openai]>=1.67.0
20
+ Requires-Dist: pypdf>=6.8.0
21
+ Description-Content-Type: text/markdown
22
+
23
+ # groundmark
24
+
25
+ <img src="groundmark.webp" alt="groundmark" width="200">
26
+
27
+ ## Grounded Markdown for PDFs
28
+
29
+ **groundmark is a thin, batteries-included wrapper around [anchorite](https://github.com/populationgenomics/anchorite).** It provides concrete implementations of anchorite's provider protocols — [Pydantic AI](https://ai.pydantic.dev/) for LLM-based Markdown generation and [pdfplumber](https://github.com/jsvine/pdfplumber) for bounding box extraction — so you can go from PDF bytes to annotated Markdown in a single call. All the heavy lifting (Smith-Waterman alignment, annotation, stripping, quote resolution) lives in anchorite.
30
+
31
+ Give it a PDF and a model string, and get back Markdown with embedded bounding box coordinates that trace every text span back to its location in the source PDF.
32
+
33
+ ## Architecture
34
+
35
+ The library processes each document in two streams, which are then aligned and merged into annotated Markdown:
36
+
37
+ 1. **Semantic Stream**: The PDF is sent to an LLM (via Pydantic AI) to produce clean Markdown with `<!--page-->` markers between pages.
38
+ 2. **Positional Stream**: The PDF is parsed locally by pdfplumber to extract line-level text segments and their bounding boxes.
39
+ 3. **Alignment**: Smith-Waterman alignment (via anchorite) maps each parsed line to its position in the Markdown, constrained by page boundaries.
40
+ 4. **Annotation**: Bounding box coordinates are injected as HTML span attributes:
41
+
42
+ ```html
43
+ <span data-bbox="120,45,180,890" data-page="3">The patient presented with</span>
44
+ ```
45
+
46
+ ## Quick Start
47
+
48
+ ```python
49
+ import asyncio
50
+ import groundmark as gm
51
+
52
+ async def main():
53
+ pdf_bytes = open("document.pdf", "rb").read()
54
+
55
+ config = gm.Config(model="bedrock:au.anthropic.claude-sonnet-4-6")
56
+
57
+ # PDF -> annotated Markdown (one call)
58
+ result = await gm.process(pdf_bytes, config)
59
+ print(f"Coverage: {result.coverage_percent:.2%}")
60
+ print(result.annotated_markdown[:500])
61
+
62
+ # Strip for LLM consumption
63
+ stripped = gm.strip(result.annotated_markdown)
64
+ # stripped.plain_text: clean Markdown with spans removed
65
+ # stripped.validation_map: list of (start, end, Anchor) ranges
66
+
67
+ # Resolve verbatim quotes to PDF coordinates
68
+ resolved = gm.resolve(result.annotated_markdown, ["the patient presented with"])
69
+ # -> {"the patient presented with": [(page, BBox), ...]}
70
+
71
+ if __name__ == "__main__":
72
+ asyncio.run(main())
73
+ ```
74
+
75
+ ## Debug Visualizer
76
+
77
+ The included visualizer overlays extracted bounding boxes onto the source PDF, useful for diagnosing alignment issues. Blue highlights show raw extracted boxes from pdfplumber; red highlights show aligned boxes from the annotated Markdown.
78
+
79
+ ```bash
80
+ python -m groundmark.visualize input.pdf output.pdf --model "bedrock:au.anthropic.claude-sonnet-4-6"
81
+
82
+ # Or with cached Markdown:
83
+ python -m groundmark.visualize input.pdf output.pdf --markdown cached.md
84
+ ```
85
+
86
+ ![Visualizer output showing blue (raw) and red (aligned) bounding box overlays](visualize_example.jpg)
87
+
88
+ *Screenshot from Santoro et al., "Health outcomes and drug utilisation in children with Noonan syndrome: a European cohort study," Orphanet J Rare Dis 20:76 (2025). [doi:10.1186/s13023-025-03594-7](https://doi.org/10.1186/s13023-025-03594-7). CC-BY 4.0.*
89
+
90
+ ## Configuration
91
+
92
+ ### Timeouts
93
+
94
+ The LLM call for PDF-to-Markdown conversion can take several minutes for large documents, especially with Opus on Bedrock. Timeout defaults by provider:
95
+
96
+ | Provider | Default | Environment Variable |
97
+ |----------|---------|---------------------|
98
+ | Bedrock (boto3) | 300s | `AWS_READ_TIMEOUT` |
99
+ | Anthropic (httpx) | 600s | — (use `ModelSettings(timeout=...)`) |
100
+
101
+ For Bedrock with Opus, 300s may not be enough. Set a higher timeout:
102
+
103
+ ```bash
104
+ export AWS_READ_TIMEOUT=600
105
+ ```
106
+
107
+ ## License
108
+
109
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,87 @@
1
+ # groundmark
2
+
3
+ <img src="groundmark.webp" alt="groundmark" width="200">
4
+
5
+ ## Grounded Markdown for PDFs
6
+
7
+ **groundmark is a thin, batteries-included wrapper around [anchorite](https://github.com/populationgenomics/anchorite).** It provides concrete implementations of anchorite's provider protocols — [Pydantic AI](https://ai.pydantic.dev/) for LLM-based Markdown generation and [pdfplumber](https://github.com/jsvine/pdfplumber) for bounding box extraction — so you can go from PDF bytes to annotated Markdown in a single call. All the heavy lifting (Smith-Waterman alignment, annotation, stripping, quote resolution) lives in anchorite.
8
+
9
+ Give it a PDF and a model string, and get back Markdown with embedded bounding box coordinates that trace every text span back to its location in the source PDF.
10
+
11
+ ## Architecture
12
+
13
+ The library processes each document in two streams, which are then aligned and merged into annotated Markdown:
14
+
15
+ 1. **Semantic Stream**: The PDF is sent to an LLM (via Pydantic AI) to produce clean Markdown with `<!--page-->` markers between pages.
16
+ 2. **Positional Stream**: The PDF is parsed locally by pdfplumber to extract line-level text segments and their bounding boxes.
17
+ 3. **Alignment**: Smith-Waterman alignment (via anchorite) maps each parsed line to its position in the Markdown, constrained by page boundaries.
18
+ 4. **Annotation**: Bounding box coordinates are injected as HTML span attributes:
19
+
20
+ ```html
21
+ <span data-bbox="120,45,180,890" data-page="3">The patient presented with</span>
22
+ ```
23
+
24
+ ## Quick Start
25
+
26
+ ```python
27
+ import asyncio
28
+ import groundmark as gm
29
+
30
+ async def main():
31
+ pdf_bytes = open("document.pdf", "rb").read()
32
+
33
+ config = gm.Config(model="bedrock:au.anthropic.claude-sonnet-4-6")
34
+
35
+ # PDF -> annotated Markdown (one call)
36
+ result = await gm.process(pdf_bytes, config)
37
+ print(f"Coverage: {result.coverage_percent:.2%}")
38
+ print(result.annotated_markdown[:500])
39
+
40
+ # Strip for LLM consumption
41
+ stripped = gm.strip(result.annotated_markdown)
42
+ # stripped.plain_text: clean Markdown with spans removed
43
+ # stripped.validation_map: list of (start, end, Anchor) ranges
44
+
45
+ # Resolve verbatim quotes to PDF coordinates
46
+ resolved = gm.resolve(result.annotated_markdown, ["the patient presented with"])
47
+ # -> {"the patient presented with": [(page, BBox), ...]}
48
+
49
+ if __name__ == "__main__":
50
+ asyncio.run(main())
51
+ ```
52
+
53
+ ## Debug Visualizer
54
+
55
+ The included visualizer overlays extracted bounding boxes onto the source PDF, useful for diagnosing alignment issues. Blue highlights show raw extracted boxes from pdfplumber; red highlights show aligned boxes from the annotated Markdown.
56
+
57
+ ```bash
58
+ python -m groundmark.visualize input.pdf output.pdf --model "bedrock:au.anthropic.claude-sonnet-4-6"
59
+
60
+ # Or with cached Markdown:
61
+ python -m groundmark.visualize input.pdf output.pdf --markdown cached.md
62
+ ```
63
+
64
+ ![Visualizer output showing blue (raw) and red (aligned) bounding box overlays](visualize_example.jpg)
65
+
66
+ *Screenshot from Santoro et al., "Health outcomes and drug utilisation in children with Noonan syndrome: a European cohort study," Orphanet J Rare Dis 20:76 (2025). [doi:10.1186/s13023-025-03594-7](https://doi.org/10.1186/s13023-025-03594-7). CC-BY 4.0.*
67
+
68
+ ## Configuration
69
+
70
+ ### Timeouts
71
+
72
+ The LLM call for PDF-to-Markdown conversion can take several minutes for large documents, especially with Opus on Bedrock. Timeout defaults by provider:
73
+
74
+ | Provider | Default | Environment Variable |
75
+ |----------|---------|---------------------|
76
+ | Bedrock (boto3) | 300s | `AWS_READ_TIMEOUT` |
77
+ | Anthropic (httpx) | 600s | — (use `ModelSettings(timeout=...)`) |
78
+
79
+ For Bedrock with Opus, 300s may not be enough. Set a higher timeout:
80
+
81
+ ```bash
82
+ export AWS_READ_TIMEOUT=600
83
+ ```
84
+
85
+ ## License
86
+
87
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
Binary file
@@ -0,0 +1,75 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "groundmark"
7
+ version = "0.1.0"
8
+ authors = [
9
+ { name = "Tobias Sargeant", email = "tobias.sargeant@gmail.com" },
10
+ ]
11
+ description = "PDF to grounded Markdown with bounding box annotations"
12
+ readme = "README.md"
13
+ license = { text = "MIT" }
14
+ requires-python = ">=3.11"
15
+ classifiers = [
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.11",
18
+ "Programming Language :: Python :: 3.12",
19
+ "Programming Language :: Python :: 3.13",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Operating System :: OS Independent",
22
+ ]
23
+ dependencies = [
24
+ "anchorite>=0.1.1",
25
+ "pydantic-ai-slim[anthropic,bedrock,google,openai]>=1.67.0",
26
+ "pdfplumber>=0.11.9",
27
+ "pypdf>=6.8.0",
28
+ ]
29
+
30
+ [dependency-groups]
31
+ dev = [
32
+ "pre-commit>=4.5.1",
33
+ "pytest",
34
+ "pytest-asyncio",
35
+ "ruff>=0.14.6",
36
+ "typer>=0.24.1",
37
+ ]
38
+
39
+ [project.urls]
40
+ "Homepage" = "https://github.com/populationgenomics/groundmark"
41
+ "Bug Tracker" = "https://github.com/populationgenomics/groundmark/issues"
42
+
43
+ [tool.ruff]
44
+ line-length = 120
45
+ target-version = "py311"
46
+ indent-width = 4
47
+
48
+ [tool.ruff.lint]
49
+ select = ["A", "B", "C", "E", "F", "G", "I", "N", "Q", "S", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "ERA", "EXE", "ICN", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "UP", "YTT"]
50
+ ignore = [
51
+ "C901", # function complexity — too aggressive for straightforward extraction loops
52
+ "COM812", # trailing comma — conflicts with ruff formatter
53
+ "PD011", # pandas-use-of-dot-values (false positive)
54
+ "PLR0912", # too many branches — same as C901
55
+ "PLR0913", # too many arguments — sometimes unavoidable
56
+ ]
57
+ fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "ERA", "EXE", "FBT", "ICN", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "UP", "YTT"]
58
+
59
+ [tool.ruff.lint.isort]
60
+ known-first-party = ["groundmark"]
61
+
62
+ [tool.ruff.lint.per-file-ignores]
63
+ "src/groundmark/markdown.py" = [
64
+ "RUF001", # ambiguous unicode — intentional in LLM prompt about preserving unicode symbols
65
+ ]
66
+ "tests/*" = [
67
+ "ARG001", # unused function arguments (mock.patch positional injection).
68
+ "S101", # asserts.
69
+ "S102", # exec().
70
+ "PLR2004", # magic value comparisons.
71
+ "SLF001" # private method access.
72
+ ]
73
+
74
+ [tool.mypy]
75
+ python_version = "3.11"
@@ -0,0 +1,6 @@
1
+ from anchorite import Anchor, BBox, annotate, resolve, strip
2
+
3
+ from groundmark.markdown import PROMPT
4
+ from groundmark.process import Config, ProcessResult, process
5
+
6
+ __all__ = ["PROMPT", "Anchor", "BBox", "Config", "ProcessResult", "annotate", "process", "resolve", "strip"]
@@ -0,0 +1,104 @@
1
+ """PDF to Markdown conversion via Pydantic AI (any supported LLM)."""
2
+
3
+ import re
4
+ import unicodedata
5
+ from typing import Final
6
+
7
+ from anchorite.document import DocumentChunk
8
+ from pydantic_ai import Agent
9
+ from pydantic_ai.messages import BinaryContent
10
+
11
+ # Apparently, faithfully analyzing a PDF's complicated layout and transcribing
12
+ # it into well-structured Markdown isn't creative enough for Claude's content
13
+ # filter. Asking the model to add line numbers gives it something "original"
14
+ # to contribute, which satisfies the anti-regurgitation heuristic. We strip
15
+ # them right after.
16
+ # https://privacy.claude.com/en/articles/10023638-why-am-i-receiving-an-output-blocked-by-content-filtering-policy-error
17
+ _LINE_NUM_PREFIX = """
18
+ IMPORTANT: Prefix every output line with its line number followed by a
19
+ pipe character (no trailing space), e.g.:
20
+ 1|# Heading
21
+ 2|
22
+ 3|Some paragraph text here.
23
+ Start numbering at 1. This is required for all output.
24
+ """
25
+
26
# Matches a run of digits followed by a pipe at the start of any line
# (MULTILINE makes ``^`` anchor at every line start). This is the line-number
# prefix format that _LINE_NUM_PREFIX asks the model to emit.
_LINE_NUM_RE = re.compile(r"^\d+\|", re.MULTILINE)
27
+
28
+ PROMPT: Final[str] = (
29
+ """
30
+ Carefully transcribe the text for this pdf into a text file with
31
+ markdown annotations.
32
+ """
33
+ + _LINE_NUM_PREFIX
34
+ + """
35
+ **The final output must be formatted as text that visually
36
+ mimics in markdown the layout and hierarchy of the original PDF
37
+ when rendered (ignoring the line-number prefixes).**
38
+
39
+ * Do not include headers or footers that are repeated on each page.
40
+ * Do not include page numbers.
41
+ * Preserve the reading order of the text as it appears in the PDF.
42
+ * Remove hyphens that break words at the end of lines.
43
+ * e.g. "uti- lized" -> "utilized"
44
+ * Use Markdown headings (`#`, `##`, `###`) to reflect the size and
45
+ hierarchy of titles and subtitles in the PDF.
46
+ * Ensure that there are blank lines before and after headings, lists,
47
+ tables, and images.
48
+ * End each paragraph with a blank line.
49
+ * Do not break lines within paragraphs or headings.
50
+ * Render bullet points and numbered lettered lists as markdown lists.
51
+ * It is ok to remove brackets and other consistent punctuation around
52
+ list identifiers
53
+ * e.g. "a)" -> "a."
54
+ * Use blockquotes for any sidebars or highlighted text.
55
+ * Bold all words and phrases that appear bolded in the original
56
+ source material. Similarly, italicise all text in italics.
57
+ * Render tables as markdown, paying particular attention to copying
58
+ identifiers exactly.
59
+ * Break text into paragraphs and lists exactly as they appear in
60
+ the PDF.
61
+ * Preserve figure/chart captions verbatim — do not paraphrase, extend,
62
+ or interleave them with descriptions. If useful context is only visible
63
+ in the image (e.g. axis labels, legend entries, data values), add it
64
+ as a separate paragraph after the caption.
65
+ * Convert bar charts into markdown tables where possible.
66
+ * Convert tables contained in images into markdown.
67
+ * Keep mathematical expressions as close to the PDF's own characters as
68
+ possible — use the same Unicode symbols (×, ≥, α, β, etc.) rather than
69
+ converting to LaTeX commands.
70
+ * Only use LaTeX (`$...$` / `$$...$$`) for complex display equations
71
+ with fractions, integrals, summations, or multi-level notation that
72
+ cannot be represented legibly in plain text.
73
+ * Insert markers at the start of each page of the form `<!--page-->`
74
+ * Surround tables and figure descriptions with markers:
75
+ * `<!--table-->` ... `<!--end-->`
76
+ * `<!--figure-->` ... `<!--end-->`
77
+ """
78
+ )
79
+
80
# Module-level agent with plain-string output. No model is bound here; the
# model is supplied on each ``run`` call by PydanticAIMarkdownProvider.
_agent: Agent[None, str] = Agent(output_type=str)
81
+
82
+
83
class PydanticAIMarkdownProvider:
    """Markdown provider that transcribes document chunks with an LLM via Pydantic AI.

    Each chunk is sent to the module-level agent together with a transcription
    prompt; the model's text output is cleaned up and returned as Markdown.
    """

    def __init__(self, model: str, *, prompt: str | None = None) -> None:
        """Remember which model to call and which prompt to send.

        Args:
            model: Model identifier passed to the agent on every run.
            prompt: Custom transcription prompt. When ``None`` (or empty), the
                module-level ``PROMPT`` is used instead.
        """
        self.model = model
        self._prompt = prompt if prompt else PROMPT

    async def generate_markdown(self, chunk: DocumentChunk) -> str:
        """Transcribe one document chunk into Markdown.

        Args:
            chunk: The chunk to convert; its ``data`` and ``mime_type`` are
                forwarded to the model as binary content.

        Returns:
            The model output with line-number prefixes removed and NFKC
            normalization applied. With the default prompt the model is asked
            to emit ``<!--page-->`` markers at the start of each page.
        """
        document_part = BinaryContent(data=chunk.data, media_type=chunk.mime_type)
        run_result = await _agent.run([document_part, self._prompt], model=self.model)

        # The prompt asks for "<n>|" line-number prefixes (a workaround for
        # Claude's content filter); remove them before handing the text on.
        without_line_numbers = _LINE_NUM_RE.sub("", run_result.output)

        # NFKC-normalize so superscript digits, ligatures, etc. match the
        # NFKC-normalized anchor text from pdfplumber extraction.
        return unicodedata.normalize("NFKC", without_line_numbers)