groundmark 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groundmark-0.1.0/.github/workflows/build.yaml +33 -0
- groundmark-0.1.0/.github/workflows/lint.yaml +21 -0
- groundmark-0.1.0/.github/workflows/release.yaml +68 -0
- groundmark-0.1.0/.gitignore +23 -0
- groundmark-0.1.0/.markdownlint.json +5 -0
- groundmark-0.1.0/.pre-commit-config.yaml +48 -0
- groundmark-0.1.0/.python-version +1 -0
- groundmark-0.1.0/LICENSE +21 -0
- groundmark-0.1.0/PKG-INFO +109 -0
- groundmark-0.1.0/README.md +87 -0
- groundmark-0.1.0/groundmark.webp +0 -0
- groundmark-0.1.0/pyproject.toml +75 -0
- groundmark-0.1.0/src/groundmark/__init__.py +6 -0
- groundmark-0.1.0/src/groundmark/markdown.py +104 -0
- groundmark-0.1.0/src/groundmark/parse.py +95 -0
- groundmark-0.1.0/src/groundmark/process.py +82 -0
- groundmark-0.1.0/src/groundmark/visualize.py +201 -0
- groundmark-0.1.0/tests/data/hello_world.pdf +67 -0
- groundmark-0.1.0/tests/data/simple_hello.pdf +52 -0
- groundmark-0.1.0/tests/data/table_2x2.pdf +181 -0
- groundmark-0.1.0/tests/test_markdown.py +60 -0
- groundmark-0.1.0/tests/test_parse.py +38 -0
- groundmark-0.1.0/tests/test_process.py +42 -0
- groundmark-0.1.0/uv.lock +1745 -0
- groundmark-0.1.0/visualize_example.jpg +0 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: CI Test Build
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
push:
|
|
6
|
+
branches:
|
|
7
|
+
- main
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
build:
|
|
11
|
+
name: Build pure Python package
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Install uv
|
|
17
|
+
uses: astral-sh/setup-uv@v5
|
|
18
|
+
|
|
19
|
+
- name: Set up Python
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: '3.11'
|
|
23
|
+
|
|
24
|
+
- name: Update uv lock
|
|
25
|
+
run: uv lock
|
|
26
|
+
|
|
27
|
+
- name: Build package
|
|
28
|
+
run: uv build
|
|
29
|
+
|
|
30
|
+
- uses: actions/upload-artifact@v4
|
|
31
|
+
with:
|
|
32
|
+
name: dist
|
|
33
|
+
path: dist/*
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: Lint
|
|
2
|
+
on: [push]
|
|
3
|
+
|
|
4
|
+
jobs:
|
|
5
|
+
lint:
|
|
6
|
+
runs-on: ubuntu-latest
|
|
7
|
+
steps:
|
|
8
|
+
- uses: actions/checkout@v4
|
|
9
|
+
|
|
10
|
+
- uses: actions/setup-python@v5
|
|
11
|
+
with:
|
|
12
|
+
python-version: '3.11'
|
|
13
|
+
|
|
14
|
+
- name: Install uv
|
|
15
|
+
uses: astral-sh/setup-uv@v5
|
|
16
|
+
|
|
17
|
+
- name: Install dependencies
|
|
18
|
+
run: uv sync --group dev
|
|
19
|
+
|
|
20
|
+
- name: Run pre-commit
|
|
21
|
+
run: uv run pre-commit run --all-files
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [created]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
build:
|
|
9
|
+
name: Build distribution
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
|
|
14
|
+
- name: Install uv
|
|
15
|
+
uses: astral-sh/setup-uv@v5
|
|
16
|
+
|
|
17
|
+
- name: Set up Python
|
|
18
|
+
uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: '3.11'
|
|
21
|
+
|
|
22
|
+
- name: Update uv lock
|
|
23
|
+
run: uv lock
|
|
24
|
+
|
|
25
|
+
- name: Build package
|
|
26
|
+
run: uv build
|
|
27
|
+
|
|
28
|
+
- uses: actions/upload-artifact@v4
|
|
29
|
+
with:
|
|
30
|
+
name: dist
|
|
31
|
+
path: dist/*
|
|
32
|
+
|
|
33
|
+
upload_release_assets:
|
|
34
|
+
name: Upload Assets to Release
|
|
35
|
+
runs-on: ubuntu-latest
|
|
36
|
+
needs: [build]
|
|
37
|
+
permissions:
|
|
38
|
+
contents: write
|
|
39
|
+
steps:
|
|
40
|
+
- name: Download all artifacts
|
|
41
|
+
uses: actions/download-artifact@v4
|
|
42
|
+
with:
|
|
43
|
+
path: artifacts
|
|
44
|
+
merge-multiple: true
|
|
45
|
+
- name: Upload Wheels and sdist
|
|
46
|
+
uses: softprops/action-gh-release@v2
|
|
47
|
+
with:
|
|
48
|
+
tag_name: ${{ github.event.release.tag_name }}
|
|
49
|
+
files: artifacts/*
|
|
50
|
+
overwrite_files: true
|
|
51
|
+
|
|
52
|
+
publish-to-pypi:
|
|
53
|
+
name: Publish to PyPI
|
|
54
|
+
runs-on: ubuntu-latest
|
|
55
|
+
needs: [build]
|
|
56
|
+
environment:
|
|
57
|
+
name: pypi
|
|
58
|
+
url: https://pypi.org/p/groundmark
|
|
59
|
+
permissions:
|
|
60
|
+
id-token: write # Required for trusted publishing
|
|
61
|
+
steps:
|
|
62
|
+
- name: Download all wheels and sdist
|
|
63
|
+
uses: actions/download-artifact@v4
|
|
64
|
+
with:
|
|
65
|
+
path: dist
|
|
66
|
+
merge-multiple: true
|
|
67
|
+
- name: Publish to PyPI
|
|
68
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Python-generated files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[oc]
|
|
4
|
+
build/
|
|
5
|
+
dist/
|
|
6
|
+
wheels/
|
|
7
|
+
*.egg-info
|
|
8
|
+
|
|
9
|
+
# Virtual environments
|
|
10
|
+
.venv
|
|
11
|
+
|
|
12
|
+
# Tool caches
|
|
13
|
+
.mypy_cache/
|
|
14
|
+
.pytest_cache/
|
|
15
|
+
.ruff_cache/
|
|
16
|
+
|
|
17
|
+
# Specs (local working docs)
|
|
18
|
+
specs/
|
|
19
|
+
|
|
20
|
+
# Output and Data
|
|
21
|
+
output.md
|
|
22
|
+
*.pdf
|
|
23
|
+
!tests/data/*.pdf
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
default_language_version:
|
|
2
|
+
python: python3.11
|
|
3
|
+
repos:
|
|
4
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
5
|
+
rev: v6.0.0
|
|
6
|
+
hooks:
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
exclude: '\.*conda/.*'
|
|
9
|
+
- id: end-of-file-fixer
|
|
10
|
+
exclude: 'tests/fixtures'
|
|
11
|
+
- id: trailing-whitespace
|
|
12
|
+
exclude: '\.txt$|\.tsv$'
|
|
13
|
+
- id: check-case-conflict
|
|
14
|
+
- id: check-merge-conflict
|
|
15
|
+
- id: detect-private-key
|
|
16
|
+
- id: debug-statements
|
|
17
|
+
- id: check-added-large-files
|
|
18
|
+
|
|
19
|
+
- repo: https://github.com/igorshubovych/markdownlint-cli
|
|
20
|
+
rev: v0.45.0
|
|
21
|
+
hooks:
|
|
22
|
+
- id: markdownlint
|
|
23
|
+
exclude: 'tests/fixtures'
|
|
24
|
+
|
|
25
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
26
|
+
# Ruff version.
|
|
27
|
+
rev: v0.14.1
|
|
28
|
+
hooks:
|
|
29
|
+
- id: ruff
|
|
30
|
+
args: ["--fix"]
|
|
31
|
+
- id: ruff-format
|
|
32
|
+
|
|
33
|
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
|
34
|
+
rev: v1.18.2
|
|
35
|
+
hooks:
|
|
36
|
+
- id: mypy
|
|
37
|
+
exclude: "docs/"
|
|
38
|
+
args:
|
|
39
|
+
[
|
|
40
|
+
--pretty,
|
|
41
|
+
--show-error-codes,
|
|
42
|
+
--no-strict-optional,
|
|
43
|
+
--ignore-missing-imports,
|
|
44
|
+
--install-types,
|
|
45
|
+
--non-interactive,
|
|
46
|
+
--config-file=./pyproject.toml
|
|
47
|
+
]
|
|
48
|
+
additional_dependencies: []
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.11
|
groundmark-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Centre for Population Genomics
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: groundmark
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: PDF to grounded Markdown with bounding box annotations
|
|
5
|
+
Project-URL: Homepage, https://github.com/populationgenomics/groundmark
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/populationgenomics/groundmark/issues
|
|
7
|
+
Author-email: Tobias Sargeant <tobias.sargeant@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Requires-Dist: anchorite>=0.1.1
|
|
18
|
+
Requires-Dist: pdfplumber>=0.11.9
|
|
19
|
+
Requires-Dist: pydantic-ai-slim[anthropic,bedrock,google,openai]>=1.67.0
|
|
20
|
+
Requires-Dist: pypdf>=6.8.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# groundmark
|
|
24
|
+
|
|
25
|
+
<img src="groundmark.webp" alt="groundmark" width="200">
|
|
26
|
+
|
|
27
|
+
## Grounded Markdown for PDFs
|
|
28
|
+
|
|
29
|
+
**groundmark is a thin, batteries-included wrapper around [anchorite](https://github.com/populationgenomics/anchorite).** It provides concrete implementations of anchorite's provider protocols — [Pydantic AI](https://ai.pydantic.dev/) for LLM-based Markdown generation and [pdfplumber](https://github.com/jsvine/pdfplumber) for bounding box extraction — so you can go from PDF bytes to annotated Markdown in a single call. All the heavy lifting (Smith-Waterman alignment, annotation, stripping, quote resolution) lives in anchorite.
|
|
30
|
+
|
|
31
|
+
Give it a PDF and a model string, get back Markdown with embedded bounding box coordinates that trace every text span back to its location in the source PDF.
|
|
32
|
+
|
|
33
|
+
## Architecture
|
|
34
|
+
|
|
35
|
+
The library processes documents in two streams that are then merged:
|
|
36
|
+
|
|
37
|
+
1. **Semantic Stream**: The PDF is sent to an LLM (via Pydantic AI) to produce clean Markdown with `<!--page-->` markers between pages.
|
|
38
|
+
2. **Positional Stream**: The PDF is parsed locally by pdfplumber to extract line-level text segments and their bounding boxes.
|
|
39
|
+
3. **Alignment**: Smith-Waterman alignment (via anchorite) maps each parsed line to its position in the Markdown, constrained by page boundaries.
|
|
40
|
+
4. **Annotation**: Bounding box coordinates are injected as HTML span attributes:
|
|
41
|
+
|
|
42
|
+
```html
|
|
43
|
+
<span data-bbox="120,45,180,890" data-page="3">The patient presented with</span>
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Quick Start
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
import asyncio
|
|
50
|
+
import groundmark as gm
|
|
51
|
+
|
|
52
|
+
async def main():
|
|
53
|
+
pdf_bytes = open("document.pdf", "rb").read()
|
|
54
|
+
|
|
55
|
+
config = gm.Config(model="bedrock:au.anthropic.claude-sonnet-4-6")
|
|
56
|
+
|
|
57
|
+
# PDF -> annotated Markdown (one call)
|
|
58
|
+
result = await gm.process(pdf_bytes, config)
|
|
59
|
+
print(f"Coverage: {result.coverage_percent:.2%}")
|
|
60
|
+
print(result.annotated_markdown[:500])
|
|
61
|
+
|
|
62
|
+
# Strip for LLM consumption
|
|
63
|
+
stripped = gm.strip(result.annotated_markdown)
|
|
64
|
+
# stripped.plain_text: clean Markdown with spans removed
|
|
65
|
+
# stripped.validation_map: list of (start, end, Anchor) ranges
|
|
66
|
+
|
|
67
|
+
# Resolve verbatim quotes to PDF coordinates
|
|
68
|
+
resolved = gm.resolve(result.annotated_markdown, ["the patient presented with"])
|
|
69
|
+
# -> {"the patient presented with": [(page, BBox), ...]}
|
|
70
|
+
|
|
71
|
+
if __name__ == "__main__":
|
|
72
|
+
asyncio.run(main())
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Debug Visualizer
|
|
76
|
+
|
|
77
|
+
The included visualizer overlays extracted bounding boxes onto the source PDF, useful for diagnosing alignment issues. Blue highlights show raw extracted boxes from pdfplumber; red highlights show aligned boxes from the annotated Markdown.
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
python -m groundmark.visualize input.pdf output.pdf --model "bedrock:au.anthropic.claude-sonnet-4-6"
|
|
81
|
+
|
|
82
|
+
# Or with cached Markdown:
|
|
83
|
+
python -m groundmark.visualize input.pdf output.pdf --markdown cached.md
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+

|
|
87
|
+
|
|
88
|
+
*Screenshot from Santoro et al., "Health outcomes and drug utilisation in children with Noonan syndrome: a European cohort study," Orphanet J Rare Dis 20:76 (2025). [doi:10.1186/s13023-025-03594-7](https://doi.org/10.1186/s13023-025-03594-7). CC-BY 4.0.*
|
|
89
|
+
|
|
90
|
+
## Configuration
|
|
91
|
+
|
|
92
|
+
### Timeouts
|
|
93
|
+
|
|
94
|
+
The LLM call for PDF-to-Markdown conversion can take several minutes for large documents, especially with Opus on Bedrock. Timeout defaults by provider:
|
|
95
|
+
|
|
96
|
+
| Provider | Default | Environment Variable |
|
|
97
|
+
|----------|---------|---------------------|
|
|
98
|
+
| Bedrock (boto3) | 300s | `AWS_READ_TIMEOUT` |
|
|
99
|
+
| Anthropic (httpx) | 600s | — (use `ModelSettings(timeout=...)`) |
|
|
100
|
+
|
|
101
|
+
For Bedrock with Opus, 300s may not be enough. Set a higher timeout:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
export AWS_READ_TIMEOUT=600
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## License
|
|
108
|
+
|
|
109
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# groundmark
|
|
2
|
+
|
|
3
|
+
<img src="groundmark.webp" alt="groundmark" width="200">
|
|
4
|
+
|
|
5
|
+
## Grounded Markdown for PDFs
|
|
6
|
+
|
|
7
|
+
**groundmark is a thin, batteries-included wrapper around [anchorite](https://github.com/populationgenomics/anchorite).** It provides concrete implementations of anchorite's provider protocols — [Pydantic AI](https://ai.pydantic.dev/) for LLM-based Markdown generation and [pdfplumber](https://github.com/jsvine/pdfplumber) for bounding box extraction — so you can go from PDF bytes to annotated Markdown in a single call. All the heavy lifting (Smith-Waterman alignment, annotation, stripping, quote resolution) lives in anchorite.
|
|
8
|
+
|
|
9
|
+
Give it a PDF and a model string, and get back Markdown with embedded bounding box coordinates that trace every text span back to its location in the source PDF.
|
|
10
|
+
|
|
11
|
+
## Architecture
|
|
12
|
+
|
|
13
|
+
The library processes documents in two streams (steps 1–2), which are then aligned and merged (steps 3–4):
|
|
14
|
+
|
|
15
|
+
1. **Semantic Stream**: The PDF is sent to an LLM (via Pydantic AI) to produce clean Markdown with a `<!--page-->` marker at the start of each page.
|
|
16
|
+
2. **Positional Stream**: The PDF is parsed locally by pdfplumber to extract line-level text segments and their bounding boxes.
|
|
17
|
+
3. **Alignment**: Smith-Waterman alignment (via anchorite) maps each parsed line to its position in the Markdown, constrained by page boundaries.
|
|
18
|
+
4. **Annotation**: Bounding box coordinates are injected as HTML span attributes:
|
|
19
|
+
|
|
20
|
+
```html
|
|
21
|
+
<span data-bbox="120,45,180,890" data-page="3">The patient presented with</span>
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import asyncio
|
|
28
|
+
import groundmark as gm
|
|
29
|
+
|
|
30
|
+
async def main():
|
|
31
|
+
pdf_bytes = open("document.pdf", "rb").read()
|
|
32
|
+
|
|
33
|
+
config = gm.Config(model="bedrock:au.anthropic.claude-sonnet-4-6")
|
|
34
|
+
|
|
35
|
+
# PDF -> annotated Markdown (one call)
|
|
36
|
+
result = await gm.process(pdf_bytes, config)
|
|
37
|
+
print(f"Coverage: {result.coverage_percent:.2%}")
|
|
38
|
+
print(result.annotated_markdown[:500])
|
|
39
|
+
|
|
40
|
+
# Strip for LLM consumption
|
|
41
|
+
stripped = gm.strip(result.annotated_markdown)
|
|
42
|
+
# stripped.plain_text: clean Markdown with spans removed
|
|
43
|
+
# stripped.validation_map: list of (start, end, Anchor) ranges
|
|
44
|
+
|
|
45
|
+
# Resolve verbatim quotes to PDF coordinates
|
|
46
|
+
resolved = gm.resolve(result.annotated_markdown, ["the patient presented with"])
|
|
47
|
+
# -> {"the patient presented with": [(page, BBox), ...]}
|
|
48
|
+
|
|
49
|
+
if __name__ == "__main__":
|
|
50
|
+
asyncio.run(main())
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Debug Visualizer
|
|
54
|
+
|
|
55
|
+
The included visualizer overlays extracted bounding boxes onto the source PDF, useful for diagnosing alignment issues. Blue highlights show raw extracted boxes from pdfplumber; red highlights show aligned boxes from the annotated Markdown.
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
python -m groundmark.visualize input.pdf output.pdf --model "bedrock:au.anthropic.claude-sonnet-4-6"
|
|
59
|
+
|
|
60
|
+
# Or with cached Markdown:
|
|
61
|
+
python -m groundmark.visualize input.pdf output.pdf --markdown cached.md
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+

|
|
65
|
+
|
|
66
|
+
*Screenshot from Santoro et al., "Health outcomes and drug utilisation in children with Noonan syndrome: a European cohort study," Orphanet J Rare Dis 20:76 (2025). [doi:10.1186/s13023-025-03594-7](https://doi.org/10.1186/s13023-025-03594-7). CC-BY 4.0.*
|
|
67
|
+
|
|
68
|
+
## Configuration
|
|
69
|
+
|
|
70
|
+
### Timeouts
|
|
71
|
+
|
|
72
|
+
The LLM call for PDF-to-Markdown conversion can take several minutes for large documents, especially with Opus on Bedrock. Timeout defaults by provider:
|
|
73
|
+
|
|
74
|
+
| Provider | Default | Environment Variable |
|
|
75
|
+
|----------|---------|---------------------|
|
|
76
|
+
| Bedrock (boto3) | 300s | `AWS_READ_TIMEOUT` |
|
|
77
|
+
| Anthropic (httpx) | 600s | — (use `ModelSettings(timeout=...)`) |
|
|
78
|
+
|
|
79
|
+
For Bedrock with Opus, 300s may not be enough. Set a higher timeout:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
export AWS_READ_TIMEOUT=600
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## License
|
|
86
|
+
|
|
87
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
Binary file
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "groundmark"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name = "Tobias Sargeant", email = "tobias.sargeant@gmail.com" },
|
|
10
|
+
]
|
|
11
|
+
description = "PDF to grounded Markdown with bounding box annotations"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
license = { text = "MIT" }
|
|
14
|
+
requires-python = ">=3.11"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.11",
|
|
18
|
+
"Programming Language :: Python :: 3.12",
|
|
19
|
+
"Programming Language :: Python :: 3.13",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Operating System :: OS Independent",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"anchorite>=0.1.1",
|
|
25
|
+
"pydantic-ai-slim[anthropic,bedrock,google,openai]>=1.67.0",
|
|
26
|
+
"pdfplumber>=0.11.9",
|
|
27
|
+
"pypdf>=6.8.0",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[dependency-groups]
|
|
31
|
+
dev = [
|
|
32
|
+
"pre-commit>=4.5.1",
|
|
33
|
+
"pytest",
|
|
34
|
+
"pytest-asyncio",
|
|
35
|
+
"ruff>=0.14.6",
|
|
36
|
+
"typer>=0.24.1",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.urls]
|
|
40
|
+
"Homepage" = "https://github.com/populationgenomics/groundmark"
|
|
41
|
+
"Bug Tracker" = "https://github.com/populationgenomics/groundmark/issues"
|
|
42
|
+
|
|
43
|
+
[tool.ruff]
|
|
44
|
+
line-length = 120
|
|
45
|
+
target-version = "py311"
|
|
46
|
+
indent-width = 4
|
|
47
|
+
|
|
48
|
+
[tool.ruff.lint]
|
|
49
|
+
select = ["A", "B", "C", "E", "F", "G", "I", "N", "Q", "S", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "ERA", "EXE", "ICN", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "UP", "YTT"]
|
|
50
|
+
ignore = [
|
|
51
|
+
"C901", # function complexity — too aggressive for straightforward extraction loops
|
|
52
|
+
"COM812", # trailing comma — conflicts with ruff formatter
|
|
53
|
+
"PD011", # pandas-use-of-dot-values (false positive)
|
|
54
|
+
"PLR0912", # too many branches — same as C901
|
|
55
|
+
"PLR0913", # too many arguments — sometimes unavoidable
|
|
56
|
+
]
|
|
57
|
+
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "ERA", "EXE", "FBT", "ICN", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "UP", "YTT"]
|
|
58
|
+
|
|
59
|
+
[tool.ruff.lint.isort]
|
|
60
|
+
known-first-party = ["groundmark"]
|
|
61
|
+
|
|
62
|
+
[tool.ruff.lint.per-file-ignores]
|
|
63
|
+
"src/groundmark/markdown.py" = [
|
|
64
|
+
"RUF001", # ambiguous unicode — intentional in LLM prompt about preserving unicode symbols
|
|
65
|
+
]
|
|
66
|
+
"tests/*" = [
|
|
67
|
+
"ARG001", # unused function arguments (mock.patch positional injection).
|
|
68
|
+
"S101", # asserts.
|
|
69
|
+
"S102", # exec().
|
|
70
|
+
"PLR2004", # magic value comparisons.
|
|
71
|
+
"SLF001" # private method access.
|
|
72
|
+
]
|
|
73
|
+
|
|
74
|
+
[tool.mypy]
|
|
75
|
+
python_version = "3.11"
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
from anchorite import Anchor, BBox, annotate, resolve, strip
|
|
2
|
+
|
|
3
|
+
from groundmark.markdown import PROMPT
|
|
4
|
+
from groundmark.process import Config, ProcessResult, process
|
|
5
|
+
|
|
6
|
+
__all__ = ["PROMPT", "Anchor", "BBox", "Config", "ProcessResult", "annotate", "process", "resolve", "strip"]
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""PDF to Markdown conversion via Pydantic AI (any supported LLM)."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import unicodedata
|
|
5
|
+
from typing import Final
|
|
6
|
+
|
|
7
|
+
from anchorite.document import DocumentChunk
|
|
8
|
+
from pydantic_ai import Agent
|
|
9
|
+
from pydantic_ai.messages import BinaryContent
|
|
10
|
+
|
|
11
|
+
# Apparently, faithfully analyzing a PDF's complicated layout and transcribing
|
|
12
|
+
# it into well-structured Markdown isn't creative enough for Claude's content
|
|
13
|
+
# filter. Asking the model to add line numbers gives it something "original"
|
|
14
|
+
# to contribute, which satisfies the anti-regurgitation heuristic. We strip
|
|
15
|
+
# them right after.
|
|
16
|
+
# https://privacy.claude.com/en/articles/10023638-why-am-i-receiving-an-output-blocked-by-content-filtering-policy-error
|
|
17
|
+
_LINE_NUM_PREFIX = """
|
|
18
|
+
IMPORTANT: Prefix every output line with its line number followed by a
|
|
19
|
+
pipe character (no trailing space), e.g.:
|
|
20
|
+
1|# Heading
|
|
21
|
+
2|
|
|
22
|
+
3|Some paragraph text here.
|
|
23
|
+
Start numbering at 1. This is required for all output.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
# Matches a run of digits followed by "|" at the start of any line (MULTILINE),
# i.e. the "N|" prefix that _LINE_NUM_PREFIX asks the model to emit. Only the
# first such prefix on a line is matched because the pattern is anchored at "^".
# NOTE(review): a line the model left unnumbered whose own content begins with
# digits and a pipe would also match and lose that text — confirm acceptable.
_LINE_NUM_RE = re.compile(r"^\d+\|", re.MULTILINE)
|
|
27
|
+
|
|
28
|
+
PROMPT: Final[str] = (
|
|
29
|
+
"""
|
|
30
|
+
Carefully transcribe the text for this pdf into a text file with
|
|
31
|
+
markdown annotations.
|
|
32
|
+
"""
|
|
33
|
+
+ _LINE_NUM_PREFIX
|
|
34
|
+
+ """
|
|
35
|
+
**The final output must be formatted as text that visually
|
|
36
|
+
mimics in markdown the layout and hierarchy of the original PDF
|
|
37
|
+
when rendered (ignoring the line-number prefixes).**
|
|
38
|
+
|
|
39
|
+
* Do not include headers or footers that are repeated on each page.
|
|
40
|
+
* Do not include page numbers.
|
|
41
|
+
* Preserve the reading order of the text as it appears in the PDF.
|
|
42
|
+
* Remove hyphens that break words at the end of lines.
|
|
43
|
+
* e.g. "uti- lized" -> "utilized"
|
|
44
|
+
* Use Markdown headings (`#`, `##`, `###`) to reflect the size and
|
|
45
|
+
hierarchy of titles and subtitles in the PDF.
|
|
46
|
+
* Ensure that there are blank lines before and after headings, lists,
|
|
47
|
+
tables, and images.
|
|
48
|
+
* End each paragraph with a blank line.
|
|
49
|
+
* Do not break lines within paragraphs or headings.
|
|
50
|
+
* Render bullet points and numbered lettered lists as markdown lists.
|
|
51
|
+
* It is ok to remove brackets and other consistent punctuation around
|
|
52
|
+
list identifiers
|
|
53
|
+
* e.g. "a)" -> "a."
|
|
54
|
+
* Use blockquotes for any sidebars or highlighted text.
|
|
55
|
+
* Bold all words and phrases that appear bolded in the original
|
|
56
|
+
source material. Similarly, italicise all text in italics.
|
|
57
|
+
* Render tables as markdown, paying particular attention to copying
|
|
58
|
+
identifiers exactly.
|
|
59
|
+
* Break text into paragraphs and lists exactly as they appear in
|
|
60
|
+
the PDF.
|
|
61
|
+
* Preserve figure/chart captions verbatim — do not paraphrase, extend,
|
|
62
|
+
or interleave them with descriptions. If useful context is only visible
|
|
63
|
+
in the image (e.g. axis labels, legend entries, data values), add it
|
|
64
|
+
as a separate paragraph after the caption.
|
|
65
|
+
* Convert bar charts into markdown tables where possible.
|
|
66
|
+
* Convert tables contained in images into markdown.
|
|
67
|
+
* Keep mathematical expressions as close to the PDF's own characters as
|
|
68
|
+
possible — use the same Unicode symbols (×, ≥, α, β, etc.) rather than
|
|
69
|
+
converting to LaTeX commands.
|
|
70
|
+
* Only use LaTeX (`$...$` / `$$...$$`) for complex display equations
|
|
71
|
+
with fractions, integrals, summations, or multi-level notation that
|
|
72
|
+
cannot be represented legibly in plain text.
|
|
73
|
+
* Insert markers at the start of each page of the form `<!--page-->`
|
|
74
|
+
* Surround tables and figure descriptions with markers:
|
|
75
|
+
* `<!--table-->` ... `<!--end-->`
|
|
76
|
+
* `<!--figure-->` ... `<!--end-->`
|
|
77
|
+
"""
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
# Module-level agent used by PydanticAIMarkdownProvider. No model is bound
# here; the provider passes its model on each run() call.
_agent: Agent[None, str] = Agent(output_type=str)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class PydanticAIMarkdownProvider:
    """Markdown provider that transcribes PDF chunks with an LLM via Pydantic AI.

    Each chunk is sent to the configured model together with a transcription
    prompt; the reply is cleaned up and returned as a Markdown string.
    """

    def __init__(self, model: str, *, prompt: str | None = None) -> None:
        """Remember which model to call and which prompt to send.

        Args:
            model: Model identifier handed to the agent on every run.
            prompt: Replacement for the default ``PROMPT``. ``None`` or an
                empty string selects the default.
        """
        self.model = model
        self._prompt = prompt if prompt else PROMPT

    async def generate_markdown(self, chunk: DocumentChunk) -> str:
        """Transcribe one document chunk to Markdown.

        Args:
            chunk: Chunk whose ``data`` and ``mime_type`` are forwarded to the
                model as binary content, followed by the prompt text.

        Returns:
            The model output with leading ``N|`` line-number prefixes removed
            and NFKC normalization applied. With the default prompt the model
            is asked to put a ``<!--page-->`` marker at the start of each page.
        """
        user_content = [
            BinaryContent(data=chunk.data, media_type=chunk.mime_type),
            self._prompt,
        ]
        run_result = await _agent.run(user_content, model=self.model)

        # The prompt asks for "N|" prefixes on every line (a workaround for
        # Claude's content filter); drop them before handing the text on.
        unnumbered = _LINE_NUM_RE.sub("", run_result.output)

        # NFKC folds superscript digits, ligatures, etc. so the Markdown
        # matches the NFKC-normalized anchor text from pdfplumber extraction.
        return unicodedata.normalize("NFKC", unnumbered)
|