groundmark 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- groundmark/__init__.py +6 -0
- groundmark/markdown.py +104 -0
- groundmark/parse.py +95 -0
- groundmark/process.py +82 -0
- groundmark/visualize.py +201 -0
- groundmark-0.1.0.dist-info/METADATA +109 -0
- groundmark-0.1.0.dist-info/RECORD +9 -0
- groundmark-0.1.0.dist-info/WHEEL +4 -0
- groundmark-0.1.0.dist-info/licenses/LICENSE +21 -0
groundmark/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
from anchorite import Anchor, BBox, annotate, resolve, strip
|
|
2
|
+
|
|
3
|
+
from groundmark.markdown import PROMPT
|
|
4
|
+
from groundmark.process import Config, ProcessResult, process
|
|
5
|
+
|
|
6
|
+
__all__ = ["PROMPT", "Anchor", "BBox", "Config", "ProcessResult", "annotate", "process", "resolve", "strip"]
|
groundmark/markdown.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""PDF to Markdown conversion via Pydantic AI (any supported LLM)."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import unicodedata
|
|
5
|
+
from typing import Final
|
|
6
|
+
|
|
7
|
+
from anchorite.document import DocumentChunk
|
|
8
|
+
from pydantic_ai import Agent
|
|
9
|
+
from pydantic_ai.messages import BinaryContent
|
|
10
|
+
|
|
11
|
+
# Apparently, faithfully analyzing a PDF's complicated layout and transcribing
|
|
12
|
+
# it into well-structured Markdown isn't creative enough for Claude's content
|
|
13
|
+
# filter. Asking the model to add line numbers gives it something "original"
|
|
14
|
+
# to contribute, which satisfies the anti-regurgitation heuristic. We strip
|
|
15
|
+
# them right after.
|
|
16
|
+
# https://privacy.claude.com/en/articles/10023638-why-am-i-receiving-an-output-blocked-by-content-filtering-policy-error
|
|
17
|
+
_LINE_NUM_PREFIX = """
|
|
18
|
+
IMPORTANT: Prefix every output line with its line number followed by a
|
|
19
|
+
pipe character (no trailing space), e.g.:
|
|
20
|
+
1|# Heading
|
|
21
|
+
2|
|
|
22
|
+
3|Some paragraph text here.
|
|
23
|
+
Start numbering at 1. This is required for all output.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
_LINE_NUM_RE = re.compile(r"^\d+\|", re.MULTILINE)
|
|
27
|
+
|
|
28
|
+
PROMPT: Final[str] = (
|
|
29
|
+
"""
|
|
30
|
+
Carefully transcribe the text for this pdf into a text file with
|
|
31
|
+
markdown annotations.
|
|
32
|
+
"""
|
|
33
|
+
+ _LINE_NUM_PREFIX
|
|
34
|
+
+ """
|
|
35
|
+
**The final output must be formatted as text that visually
|
|
36
|
+
mimics in markdown the layout and hierarchy of the original PDF
|
|
37
|
+
when rendered (ignoring the line-number prefixes).**
|
|
38
|
+
|
|
39
|
+
* Do not include headers or footers that are repeated on each page.
|
|
40
|
+
* Do not include page numbers.
|
|
41
|
+
* Preserve the reading order of the text as it appears in the PDF.
|
|
42
|
+
* Remove hyphens that break words at the end of lines.
|
|
43
|
+
* e.g. "uti- lized" -> "utilized"
|
|
44
|
+
* Use Markdown headings (`#`, `##`, `###`) to reflect the size and
|
|
45
|
+
hierarchy of titles and subtitles in the PDF.
|
|
46
|
+
* Ensure that there are blank lines before and after headings, lists,
|
|
47
|
+
tables, and images.
|
|
48
|
+
* End each paragraph with a blank line.
|
|
49
|
+
* Do not break lines within paragraphs or headings.
|
|
50
|
+
* Render bullet points and numbered lettered lists as markdown lists.
|
|
51
|
+
* It is ok to remove brackets and other consistent punctuation around
|
|
52
|
+
list identifiers
|
|
53
|
+
* e.g. "a)" -> "a."
|
|
54
|
+
* Use blockquotes for any sidebars or highlighted text.
|
|
55
|
+
* Bold all words and phrases that appear bolded in the original
|
|
56
|
+
source material. Similarly, italicise all text in italics.
|
|
57
|
+
* Render tables as markdown, paying particular attention to copying
|
|
58
|
+
identifiers exactly.
|
|
59
|
+
* Break text into paragraphs and lists exactly as they appear in
|
|
60
|
+
the PDF.
|
|
61
|
+
* Preserve figure/chart captions verbatim — do not paraphrase, extend,
|
|
62
|
+
or interleave them with descriptions. If useful context is only visible
|
|
63
|
+
in the image (e.g. axis labels, legend entries, data values), add it
|
|
64
|
+
as a separate paragraph after the caption.
|
|
65
|
+
* Convert bar charts into markdown tables where possible.
|
|
66
|
+
* Convert tables contained in images into markdown.
|
|
67
|
+
* Keep mathematical expressions as close to the PDF's own characters as
|
|
68
|
+
possible — use the same Unicode symbols (×, ≥, α, β, etc.) rather than
|
|
69
|
+
converting to LaTeX commands.
|
|
70
|
+
* Only use LaTeX (`$...$` / `$$...$$`) for complex display equations
|
|
71
|
+
with fractions, integrals, summations, or multi-level notation that
|
|
72
|
+
cannot be represented legibly in plain text.
|
|
73
|
+
* Insert markers at the start of each page of the form `<!--page-->`
|
|
74
|
+
* Surround tables and figure descriptions with markers:
|
|
75
|
+
* `<!--table-->` ... `<!--end-->`
|
|
76
|
+
* `<!--figure-->` ... `<!--end-->`
|
|
77
|
+
"""
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
_agent: Agent[None, str] = Agent(output_type=str)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class PydanticAIMarkdownProvider:
    """Markdown provider backed by a vision-capable LLM through Pydantic AI.

    Each document chunk is sent to the model together with a transcription
    prompt; the reply is cleaned up and returned as Markdown.
    """

    def __init__(self, model: str, *, prompt: str | None = None) -> None:
        """Remember the model string and the prompt to send with every chunk.

        Args:
            model: Pydantic AI model string passed to the agent on each run.
            prompt: Custom prompt; a falsy value selects the built-in ``PROMPT``.
        """
        self.model = model
        self._prompt = prompt if prompt else PROMPT

    async def generate_markdown(self, chunk: DocumentChunk) -> str:
        """Transcribe one document chunk into Markdown.

        Returns:
            The model output with ``N|`` line-number prefixes removed and the
            text NFKC-normalized.
        """
        document_part = BinaryContent(data=chunk.data, media_type=chunk.mime_type)
        response = await _agent.run([document_part, self._prompt], model=self.model)
        # The prompt asks for "N|" prefixes on every line (content-filter
        # workaround); they carry no information, so drop them again.
        without_line_numbers = _LINE_NUM_RE.sub("", response.output)
        # Anchor text from pdfplumber is NFKC-normalized too; normalizing here
        # keeps superscript digits, ligatures, etc. comparable during alignment.
        return unicodedata.normalize("NFKC", without_line_numbers)
|
groundmark/parse.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""Bounding box extraction from PDFs using pdfplumber."""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import logging
|
|
5
|
+
import unicodedata
|
|
6
|
+
|
|
7
|
+
import pdfplumber
|
|
8
|
+
from anchorite import Anchor, BBox
|
|
9
|
+
from anchorite.document import DocumentChunk
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class PdfplumberAnchorProvider:
    """AnchorProvider that extracts line-level bounding boxes via pdfplumber.

    Every non-empty text line reported by ``extract_text_lines`` becomes one
    anchor. Table rows need no special handling: the extracted lines are
    already row-level, so a table row is emitted as a single anchor just like
    a body-text line.

    Coordinates are normalized to a 0-1000 scale with top-left origin.
    """

    async def generate_anchors(self, chunk: DocumentChunk) -> list[Anchor]:
        """Return one anchor per non-empty text line in ``chunk``.

        Args:
            chunk: Document chunk; ``chunk.data`` holds the PDF bytes and
                ``chunk.start_page`` is added to the in-chunk page offset to
                obtain each anchor's page number.

        Returns:
            Anchors in extraction order, with NFKC-normalized text.
        """
        anchors: list[Anchor] = []

        # The context manager closes the PDF even when extraction raises.
        with pdfplumber.open(io.BytesIO(chunk.data)) as pdf:
            for page_offset, page in enumerate(pdf.pages):
                page_num = chunk.start_page + page_offset
                pw, ph = page.width, page.height
                if pw <= 0 or ph <= 0:
                    logging.warning(
                        "Page %d has invalid dimensions (%s x %s), skipping",
                        page_num,
                        pw,
                        ph,
                    )
                    continue

                lines = page.extract_text_lines(return_chars=False, use_text_flow=True)

                for line in lines:
                    text = unicodedata.normalize("NFKC", line["text"].strip())
                    if not text:
                        continue

                    # pdfplumber's top starts at baseline minus font size,
                    # missing the ascender portion above the glyph. Pad upward
                    # by ~30% of line height to approximate full glyph bounds,
                    # but never past the top edge of the page.
                    line_h = line["bottom"] - line["top"]
                    y0 = max(0.0, line["top"] - 0.3 * line_h)

                    anchors.append(
                        Anchor(
                            page=page_num,
                            box=_normalize(line["x0"], y0, line["x1"], line["bottom"], pw, ph),
                            text=text,
                        )
                    )

        logging.debug("Generated %d bounding boxes", len(anchors))
        return anchors
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _rects_intersect(a: tuple[float, float, float, float], b: tuple[float, float, float, float]) -> bool:
    """Return True when two (x0, y0, x1, y1) rectangles share interior area.

    Rectangles that merely touch along an edge or corner do not count as
    intersecting, because all comparisons are strict.
    """
    (a_left, a_top, a_right, a_bottom), (b_left, b_top, b_right, b_bottom) = a, b

    # No horizontal overlap means no intersection, whatever the vertical extent.
    if not (a_left < b_right and a_right > b_left):
        return False
    return a_top < b_bottom and a_bottom > b_top
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _normalize(x0: float, y0: float, x1: float, y1: float, pw: float, ph: float) -> BBox:
    """Convert PDF coordinates to 0-1000 scale with top-left origin.

    Args:
        x0: Left edge in page units.
        y0: Top edge in page units.
        x1: Right edge in page units.
        y1: Bottom edge in page units.
        pw: Page width in the same units (must be non-zero).
        ph: Page height in the same units (must be non-zero).

    Returns:
        A ``BBox`` whose four edges are integers clamped to ``[0, 1000]``.
        Clamping keeps boxes that poke slightly outside the page (for example
        after upward padding of the top edge) inside the documented range, so
        the values are never negative.
    """

    def scale(value: float, extent: float) -> int:
        # Truncate toward zero first, then clamp into the 0-1000 range.
        return min(1000, max(0, int(value / extent * 1000)))

    return BBox(
        top=scale(y0, ph),
        left=scale(x0, pw),
        bottom=scale(y1, ph),
        right=scale(x1, pw),
    )
|
groundmark/process.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import dataclasses
|
|
2
|
+
|
|
3
|
+
from anchorite import process_document
|
|
4
|
+
from anchorite.document import DocumentChunk, chunks
|
|
5
|
+
|
|
6
|
+
from groundmark.markdown import PydanticAIMarkdownProvider
|
|
7
|
+
from groundmark.parse import PdfplumberAnchorProvider
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclasses.dataclass(frozen=True)
class Config:
    """Immutable configuration for the groundmark processing pipeline.

    ``process`` consumes these fields as follows: ``model`` and ``prompt``
    configure the LLM Markdown provider, ``page_count`` controls how the PDF
    is split into chunks, and ``uniqueness_threshold`` / ``min_overlap`` are
    forwarded to the alignment step.
    """

    model: str
    """Pydantic AI model string (e.g. "bedrock:au.anthropic.claude-sonnet-4-6")."""
    uniqueness_threshold: float = 0.5
    """Minimum score ratio between best and second-best alignment match."""
    min_overlap: float = 0.9
    """Minimum overlap fraction required for a valid alignment match."""
    page_count: int | None = None
    """Pages per chunk (None = whole PDF in one chunk)."""
    prompt: str | None = None
    """Custom LLM prompt (None = use built-in default)."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclasses.dataclass
class ProcessResult:
    """Result of processing a PDF into annotated Markdown.

    Both fields are copied from the alignment result returned by anchorite's
    ``process_document``.
    """

    annotated_markdown: str
    """Markdown with <span data-bbox="..." data-page="N"> tags."""
    # NOTE(review): documented as a fraction despite the ``_percent`` name;
    # the value comes straight from anchorite — confirm its scale there.
    coverage_percent: float
    """Fraction of markdown content covered by aligned bounding boxes."""
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
async def process(
    pdf_bytes: bytes,
    config: Config,
    *,
    markdown: str | None = None,
) -> ProcessResult:
    """Turn a PDF into Markdown annotated with bounding-box spans.

    Args:
        pdf_bytes: Raw PDF file bytes.
        config: Processing configuration.
        markdown: Markdown generated earlier for the whole PDF. When given,
            no LLM call is made and the PDF is not split by ``page_count``.

    Returns:
        ProcessResult carrying the annotated Markdown and the coverage figure.
    """
    provider: _CachedMarkdownProvider | PydanticAIMarkdownProvider
    if markdown is None:
        provider = PydanticAIMarkdownProvider(config.model, prompt=config.prompt)
        pieces = chunks(pdf_bytes, page_count=config.page_count)
    else:
        # Supplied Markdown describes the entire document, so keep one chunk.
        provider = _CachedMarkdownProvider(markdown)
        pieces = chunks(pdf_bytes)

    aligned = await process_document(
        pieces,
        markdown_provider=provider,
        anchor_provider=PdfplumberAnchorProvider(),
        alignment_uniqueness_threshold=config.uniqueness_threshold,
        alignment_min_overlap=config.min_overlap,
    )

    annotated = aligned.annotate()
    return ProcessResult(annotated_markdown=annotated, coverage_percent=aligned.coverage_percent)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class _CachedMarkdownProvider:
    """Markdown provider that hands back a fixed, pre-generated string."""

    def __init__(self, markdown: str) -> None:
        """Store the Markdown to return from every ``generate_markdown`` call."""
        self._cached_markdown = markdown

    async def generate_markdown(self, _chunk: DocumentChunk) -> str:
        """Return the stored Markdown; the chunk argument is not consulted."""
        return self._cached_markdown
|
groundmark/visualize.py
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
"""Debug visualizer: overlay bounding boxes from annotated Markdown onto the source PDF."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import io
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
import sys
|
|
8
|
+
import time
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Annotated
|
|
11
|
+
|
|
12
|
+
import typer
|
|
13
|
+
from anchorite import Anchor, align, annotate
|
|
14
|
+
from anchorite.document import chunks
|
|
15
|
+
from pypdf import PdfReader, PdfWriter
|
|
16
|
+
from pypdf.annotations import Highlight
|
|
17
|
+
from pypdf.generic import ArrayObject, FloatObject
|
|
18
|
+
|
|
19
|
+
from groundmark.markdown import PydanticAIMarkdownProvider
|
|
20
|
+
from groundmark.parse import PdfplumberAnchorProvider
|
|
21
|
+
|
|
22
|
+
_SPAN_RE = re.compile(
|
|
23
|
+
r'<span\s+data-bbox="(\d+),(\d+),(\d+),(\d+)"\s+data-page="(\d+)">(.*?)</span>',
|
|
24
|
+
re.DOTALL,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
app = typer.Typer()
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _log(msg: str) -> None:
    """Write a progress message, followed by a newline, to standard error."""
    sys.stderr.write(f"{msg}\n")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _add_highlight(
    writer: PdfWriter,
    page_num: int,
    bbox: tuple[int, int, int, int],
    page_width: float,
    page_height: float,
    *,
    color: tuple[float, float, float],
) -> None:
    """Attach one highlight annotation for ``bbox`` to a page of ``writer``.

    Args:
        writer: PDF writer that receives the annotation.
        page_num: Page number passed to ``writer.add_annotation``.
        bbox: ``(top, left, bottom, right)`` on the 0-1000 scale, top-left origin.
        page_width: Page width used to scale the horizontal edges.
        page_height: Page height used to scale and flip the vertical edges.
        color: RGB components in the 0-1 range.
    """
    top, left, bottom, right = bbox

    # Scale from 0-1000 to page units; PDF space has its origin at the
    # bottom-left, so the vertical edges are flipped against the page height.
    x_min, x_max = (edge / 1000 * page_width for edge in (left, right))
    y_min, y_max = (page_height - edge / 1000 * page_height for edge in (bottom, top))

    rgb_hex = "".join(format(int(channel * 255), "02x") for channel in color)

    # Corner order: top-left, top-right, bottom-left, bottom-right.
    corners = ((x_min, y_max), (x_max, y_max), (x_min, y_min), (x_max, y_min))
    quad = ArrayObject([FloatObject(coordinate) for corner in corners for coordinate in corner])

    writer.add_annotation(
        page_number=page_num,
        annotation=Highlight(
            rect=(x_min, y_min, x_max, y_max),
            quad_points=quad,
            highlight_color=rgb_hex,
        ),
    )
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _overlay_bboxes(
    pdf_bytes: bytes,
    annotated_markdown: str,
    raw_anchors: list[Anchor],
) -> bytes:
    """Return a copy of the PDF with bounding boxes drawn as highlights.

    Blue = raw extracted boxes, Red = aligned (from annotated Markdown).
    Boxes whose page number is not below the PDF's page count are skipped.
    """
    source = PdfReader(io.BytesIO(pdf_bytes))
    out = PdfWriter()
    out.append_pages_from_reader(source)
    total_pages = len(source.pages)

    def highlight(index: int, coords: tuple[int, int, int, int], tint: tuple[float, float, float]) -> None:
        # Page size comes from the source page's media box.
        media = source.pages[index].mediabox
        _add_highlight(out, index, coords, float(media.width), float(media.height), color=tint)

    # Blue: raw extracted boxes.
    for anchor in raw_anchors:
        if anchor.page < total_pages:
            rect = anchor.box
            highlight(anchor.page, (rect.top, rect.left, rect.bottom, rect.right), (0.8, 0.9, 1))

    # Red: aligned boxes parsed back out of the annotated Markdown spans.
    for found in _SPAN_RE.finditer(annotated_markdown):
        top, left, bottom, right, index = (int(found[group]) for group in range(1, 6))
        if index < total_pages:
            highlight(index, (top, left, bottom, right), (1, 0.85, 0.85))

    rendered = io.BytesIO()
    out.write(rendered)
    return rendered.getvalue()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@app.command()
def visualize(
    input_pdf: Annotated[Path, typer.Argument(help="Path to the source PDF.")],
    output_pdf: Annotated[Path, typer.Argument(help="Path for the output PDF with bbox overlays.")],
    model: Annotated[str, typer.Option(help="Pydantic AI model string.")] = "",
    markdown_file: Annotated[Path | None, typer.Option("--markdown", "-m", help="Cached Markdown file.")] = None,
    uniqueness_threshold: Annotated[float, typer.Option("--threshold", "-t")] = 0.5,
    min_overlap: Annotated[float, typer.Option("--overlap", "-o")] = 0.9,
    page_count: Annotated[int | None, typer.Option("--page-count", "-p", help="Pages per chunk.")] = None,
    verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable debug logging.")] = False,
) -> None:
    """Run the groundmark pipeline on a PDF and overlay bounding boxes."""
    if verbose:
        logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)

    # Markdown must come from somewhere: a cached file or a model to generate it.
    if not (markdown_file or model):
        _log("Error: provide --model or --markdown")
        raise SystemExit(1)

    pipeline = _run(
        input_pdf,
        output_pdf,
        model,
        markdown_file,
        uniqueness_threshold,
        min_overlap,
        page_count,
    )
    asyncio.run(pipeline)
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
async def _run(
    input_pdf: Path,
    output_pdf: Path,
    model: str,
    markdown_file: Path | None,
    uniqueness_threshold: float,
    min_overlap: float,
    page_count: int | None,
) -> None:
    """Extract anchors, obtain Markdown, align them and write the overlay PDF.

    Progress and timings are reported on standard error.

    Args:
        input_pdf: Source PDF to read.
        output_pdf: Destination for the PDF with highlight overlays. When
            Markdown is generated, it is also saved next to this path with a
            ``.md`` suffix.
        model: Pydantic AI model string, used only when no cached Markdown
            file is given.
        markdown_file: Cached Markdown to load instead of calling the model.
        uniqueness_threshold: Forwarded to ``align``.
        min_overlap: Forwarded to ``align``.
        page_count: Pages per chunk (None = whole PDF in one chunk).
    """
    pdf_bytes = input_pdf.read_bytes()
    doc_chunks = list(chunks(pdf_bytes, page_count=page_count))
    t0 = time.perf_counter()

    # Extract anchors (bounding boxes) from the PDF.
    _log("Extracting bounding boxes...")
    t = time.perf_counter()
    provider = PdfplumberAnchorProvider()
    all_anchors = await asyncio.gather(*(provider.generate_anchors(c) for c in doc_chunks))
    flat_anchors = [a for chunk_anchors in all_anchors for a in chunk_anchors]
    _log(f"  {len(flat_anchors)} bounding boxes ({time.perf_counter() - t:.1f}s)")

    # Generate or load Markdown. The text routinely contains non-ASCII
    # symbols, so files are read and written as UTF-8 rather than in the
    # platform's default encoding.
    if markdown_file:
        markdown = markdown_file.read_text(encoding="utf-8")
        _log(f"  Loaded cached Markdown from {markdown_file}")
    else:
        _log("Generating Markdown...")
        t = time.perf_counter()
        md_provider = PydanticAIMarkdownProvider(model)
        md_chunks = await asyncio.gather(*(md_provider.generate_markdown(c) for c in doc_chunks))
        markdown = "\n\n<!--page-->\n\n".join(md_chunks)
        _log(f"  {len(markdown)} chars ({time.perf_counter() - t:.1f}s)")

        # Write plain markdown before alignment so it's available even if alignment hangs.
        md_path = output_pdf.with_suffix(".md")
        md_path.write_text(markdown, encoding="utf-8")
        _log(f"  Markdown written to {md_path}")

    # Align anchors to Markdown.
    _log("Aligning...")
    t = time.perf_counter()
    alignment = align(
        flat_anchors,
        markdown,
        uniqueness_threshold=uniqueness_threshold,
        min_overlap=min_overlap,
    )
    annotated_markdown = annotate(markdown, alignment)
    # Share of Markdown characters covered by aligned (start, end) ranges.
    coverage = sum(e - s for s, e in alignment.values()) / len(markdown) if markdown else 0.0
    _log(f"  {coverage:.1%} coverage ({time.perf_counter() - t:.1f}s)")

    output_bytes = _overlay_bboxes(pdf_bytes, annotated_markdown, flat_anchors)
    output_pdf.write_bytes(output_bytes)
    _log(f"  Output written to {output_pdf}")
    _log(f"Total: {time.perf_counter() - t0:.1f}s")
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
if __name__ == "__main__":
|
|
201
|
+
app()
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: groundmark
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: PDF to grounded Markdown with bounding box annotations
|
|
5
|
+
Project-URL: Homepage, https://github.com/populationgenomics/groundmark
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/populationgenomics/groundmark/issues
|
|
7
|
+
Author-email: Tobias Sargeant <tobias.sargeant@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Requires-Dist: anchorite>=0.1.1
|
|
18
|
+
Requires-Dist: pdfplumber>=0.11.9
|
|
19
|
+
Requires-Dist: pydantic-ai-slim[anthropic,bedrock,google,openai]>=1.67.0
|
|
20
|
+
Requires-Dist: pypdf>=6.8.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# groundmark
|
|
24
|
+
|
|
25
|
+
<img src="groundmark.webp" alt="groundmark" width="200">
|
|
26
|
+
|
|
27
|
+
## Grounded Markdown for PDFs
|
|
28
|
+
|
|
29
|
+
**groundmark is a thin, batteries-included wrapper around [anchorite](https://github.com/populationgenomics/anchorite).** It provides concrete implementations of anchorite's provider protocols — [Pydantic AI](https://ai.pydantic.dev/) for LLM-based Markdown generation and [pdfplumber](https://github.com/jsvine/pdfplumber) for bounding box extraction — so you can go from PDF bytes to annotated Markdown in a single call. All the heavy lifting (Smith-Waterman alignment, annotation, stripping, quote resolution) lives in anchorite.
|
|
30
|
+
|
|
31
|
+
Give it a PDF and a model string, get back Markdown with embedded bounding box coordinates that trace every text span back to its location in the source PDF.
|
|
32
|
+
|
|
33
|
+
## Architecture
|
|
34
|
+
|
|
35
|
+
The library processes documents in two streams that are then merged:
|
|
36
|
+
|
|
37
|
+
1. **Semantic Stream**: The PDF is sent to an LLM (via Pydantic AI) to produce clean Markdown with `<!--page-->` markers between pages.
|
|
38
|
+
2. **Positional Stream**: The PDF is parsed locally by pdfplumber to extract line-level text segments and their bounding boxes.
|
|
39
|
+
3. **Alignment**: Smith-Waterman alignment (via anchorite) maps each parsed line to its position in the Markdown, constrained by page boundaries.
|
|
40
|
+
4. **Annotation**: Bounding box coordinates are injected as HTML span attributes:
|
|
41
|
+
|
|
42
|
+
```html
|
|
43
|
+
<span data-bbox="120,45,180,890" data-page="3">The patient presented with</span>
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Quick Start
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
import asyncio
|
|
50
|
+
import groundmark as gm
|
|
51
|
+
|
|
52
|
+
async def main():
|
|
53
|
+
pdf_bytes = open("document.pdf", "rb").read()
|
|
54
|
+
|
|
55
|
+
config = gm.Config(model="bedrock:au.anthropic.claude-sonnet-4-6")
|
|
56
|
+
|
|
57
|
+
# PDF -> annotated Markdown (one call)
|
|
58
|
+
result = await gm.process(pdf_bytes, config)
|
|
59
|
+
print(f"Coverage: {result.coverage_percent:.2%}")
|
|
60
|
+
print(result.annotated_markdown[:500])
|
|
61
|
+
|
|
62
|
+
# Strip for LLM consumption
|
|
63
|
+
stripped = gm.strip(result.annotated_markdown)
|
|
64
|
+
# stripped.plain_text: clean Markdown with spans removed
|
|
65
|
+
# stripped.validation_map: list of (start, end, Anchor) ranges
|
|
66
|
+
|
|
67
|
+
# Resolve verbatim quotes to PDF coordinates
|
|
68
|
+
resolved = gm.resolve(result.annotated_markdown, ["the patient presented with"])
|
|
69
|
+
# -> {"the patient presented with": [(page, BBox), ...]}
|
|
70
|
+
|
|
71
|
+
if __name__ == "__main__":
|
|
72
|
+
asyncio.run(main())
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Debug Visualizer
|
|
76
|
+
|
|
77
|
+
The included visualizer overlays extracted bounding boxes onto the source PDF, useful for diagnosing alignment issues. Blue highlights show raw extracted boxes from pdfplumber; red highlights show aligned boxes from the annotated Markdown.
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
python -m groundmark.visualize input.pdf output.pdf --model "bedrock:au.anthropic.claude-sonnet-4-6"
|
|
81
|
+
|
|
82
|
+
# Or with cached Markdown:
|
|
83
|
+
python -m groundmark.visualize input.pdf output.pdf --markdown cached.md
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+

|
|
87
|
+
|
|
88
|
+
*Screenshot from Santoro et al., "Health outcomes and drug utilisation in children with Noonan syndrome: a European cohort study," Orphanet J Rare Dis 20:76 (2025). [doi:10.1186/s13023-025-03594-7](https://doi.org/10.1186/s13023-025-03594-7). CC-BY 4.0.*
|
|
89
|
+
|
|
90
|
+
## Configuration
|
|
91
|
+
|
|
92
|
+
### Timeouts
|
|
93
|
+
|
|
94
|
+
The LLM call for PDF-to-Markdown conversion can take several minutes for large documents, especially with Opus on Bedrock. Timeout defaults by provider:
|
|
95
|
+
|
|
96
|
+
| Provider | Default | Environment Variable |
|
|
97
|
+
|----------|---------|---------------------|
|
|
98
|
+
| Bedrock (boto3) | 300s | `AWS_READ_TIMEOUT` |
|
|
99
|
+
| Anthropic (httpx) | 600s | — (use `ModelSettings(timeout=...)`) |
|
|
100
|
+
|
|
101
|
+
For Bedrock with Opus, 300s may not be enough. Set a higher timeout:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
export AWS_READ_TIMEOUT=600
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## License
|
|
108
|
+
|
|
109
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
groundmark/__init__.py,sha256=nuEYgTIqO-5cTZ7XpWzmnKt01EKpuRsqKC_eDHaolM0,273
|
|
2
|
+
groundmark/markdown.py,sha256=v_zBQyL7ArWqmdYrjBuj2nXlXMRHHEYm9kd9bfwC5IY,4407
|
|
3
|
+
groundmark/parse.py,sha256=sw91PbgskQiGzgU3A1N5HTly7yfEvBPqotazRnhwfD0,3460
|
|
4
|
+
groundmark/process.py,sha256=PHCNAwIyS6JgIYCQM094uJmYiOnGRODGEbAFXuiqb7w,2696
|
|
5
|
+
groundmark/visualize.py,sha256=BfvirOZyYSD__VqDuF_kZW9cAzT1AMjZ741gpJsFyOo,6755
|
|
6
|
+
groundmark-0.1.0.dist-info/METADATA,sha256=hIjcRwYGxhhVXav93h0bb3oi4M9Z35blwYkuMIiiNEQ,4690
|
|
7
|
+
groundmark-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
8
|
+
groundmark-0.1.0.dist-info/licenses/LICENSE,sha256=lwIanGu698z5WiBedbwEOdUJLQb_G4FOFgucqExVIS0,1087
|
|
9
|
+
groundmark-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Centre for Population Genomics
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|