groundmark 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
groundmark/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from anchorite import Anchor, BBox, annotate, resolve, strip
2
+
3
+ from groundmark.markdown import PROMPT
4
+ from groundmark.process import Config, ProcessResult, process
5
+
6
+ __all__ = ["PROMPT", "Anchor", "BBox", "Config", "ProcessResult", "annotate", "process", "resolve", "strip"]
groundmark/markdown.py ADDED
@@ -0,0 +1,104 @@
1
+ """PDF to Markdown conversion via Pydantic AI (any supported LLM)."""
2
+
3
+ import re
4
+ import unicodedata
5
+ from typing import Final
6
+
7
+ from anchorite.document import DocumentChunk
8
+ from pydantic_ai import Agent
9
+ from pydantic_ai.messages import BinaryContent
10
+
11
+ # Apparently, faithfully analyzing a PDF's complicated layout and transcribing
12
+ # it into well-structured Markdown isn't creative enough for Claude's content
13
+ # filter. Asking the model to add line numbers gives it something "original"
14
+ # to contribute, which satisfies the anti-regurgitation heuristic. We strip
15
+ # them right after.
16
+ # https://privacy.claude.com/en/articles/10023638-why-am-i-receiving-an-output-blocked-by-content-filtering-policy-error
17
+ _LINE_NUM_PREFIX = """
18
+ IMPORTANT: Prefix every output line with its line number followed by a
19
+ pipe character (no trailing space), e.g.:
20
+ 1|# Heading
21
+ 2|
22
+ 3|Some paragraph text here.
23
+ Start numbering at 1. This is required for all output.
24
+ """
25
+
26
+ _LINE_NUM_RE = re.compile(r"^\d+\|", re.MULTILINE)
27
+
28
+ PROMPT: Final[str] = (
29
+ """
30
+ Carefully transcribe the text for this pdf into a text file with
31
+ markdown annotations.
32
+ """
33
+ + _LINE_NUM_PREFIX
34
+ + """
35
+ **The final output must be formatted as text that visually
36
+ mimics in markdown the layout and hierarchy of the original PDF
37
+ when rendered (ignoring the line-number prefixes).**
38
+
39
+ * Do not include headers or footers that are repeated on each page.
40
+ * Do not include page numbers.
41
+ * Preserve the reading order of the text as it appears in the PDF.
42
+ * Remove hyphens that break words at the end of lines.
43
+ * e.g. "uti- lized" -> "utilized"
44
+ * Use Markdown headings (`#`, `##`, `###`) to reflect the size and
45
+ hierarchy of titles and subtitles in the PDF.
46
+ * Ensure that there are blank lines before and after headings, lists,
47
+ tables, and images.
48
+ * End each paragraph with a blank line.
49
+ * Do not break lines within paragraphs or headings.
50
+ * Render bullet points and numbered lettered lists as markdown lists.
51
+ * It is ok to remove brackets and other consistent punctuation around
52
+ list identifiers
53
+ * e.g. "a)" -> "a."
54
+ * Use blockquotes for any sidebars or highlighted text.
55
+ * Bold all words and phrases that appear bolded in the original
56
+ source material. Similarly, italicise all text in italics.
57
+ * Render tables as markdown, paying particular attention to copying
58
+ identifiers exactly.
59
+ * Break text into paragraphs and lists exactly as they appear in
60
+ the PDF.
61
+ * Preserve figure/chart captions verbatim — do not paraphrase, extend,
62
+ or interleave them with descriptions. If useful context is only visible
63
+ in the image (e.g. axis labels, legend entries, data values), add it
64
+ as a separate paragraph after the caption.
65
+ * Convert bar charts into markdown tables where possible.
66
+ * Convert tables contained in images into markdown.
67
+ * Keep mathematical expressions as close to the PDF's own characters as
68
+ possible — use the same Unicode symbols (×, ≥, α, β, etc.) rather than
69
+ converting to LaTeX commands.
70
+ * Only use LaTeX (`$...$` / `$$...$$`) for complex display equations
71
+ with fractions, integrals, summations, or multi-level notation that
72
+ cannot be represented legibly in plain text.
73
+ * Insert markers at the start of each page of the form `<!--page-->`
74
+ * Surround tables and figure descriptions with markers:
75
+ * `<!--table-->` ... `<!--end-->`
76
+ * `<!--figure-->` ... `<!--end-->`
77
+ """
78
+ )
79
+
80
+ _agent: Agent[None, str] = Agent(output_type=str)
81
+
82
+
83
class PydanticAIMarkdownProvider:
    """Markdown provider backed by a Pydantic AI model.

    Each document chunk is handed to the module-level agent together with the
    transcription prompt; the model's reply is cleaned up and returned.
    """

    def __init__(self, model: str, *, prompt: str | None = None) -> None:
        """Remember which model and prompt to use.

        Args:
            model: Pydantic AI model string forwarded to the agent on each run.
            prompt: Custom prompt. Any falsy value (``None`` or an empty
                string) selects the module-level ``PROMPT``.
        """
        self.model = model
        self._prompt = prompt if prompt else PROMPT

    async def generate_markdown(self, chunk: DocumentChunk) -> str:
        """Convert a document chunk to Markdown.

        Returns:
            The model output with leading ``N|`` line-number prefixes removed
            and NFKC normalization applied. The default prompt asks the model
            to emit ``<!--page-->`` markers at the start of each page.
        """
        document = BinaryContent(data=chunk.data, media_type=chunk.mime_type)
        reply = await _agent.run([document, self._prompt], model=self.model)

        # The default prompt requests "N|" line prefixes (see the comment next
        # to _LINE_NUM_PREFIX); drop them again before returning.
        unnumbered = _LINE_NUM_RE.sub("", reply.output)

        # NFKC-normalize so superscript digits, ligatures, etc. match the
        # NFKC-normalized anchor text from pdfplumber extraction.
        return unicodedata.normalize("NFKC", unnumbered)
groundmark/parse.py ADDED
@@ -0,0 +1,95 @@
1
+ """Bounding box extraction from PDFs using pdfplumber."""
2
+
3
+ import io
4
+ import logging
5
+ import unicodedata
6
+
7
+ import pdfplumber
8
+ from anchorite import Anchor, BBox
9
+ from anchorite.document import DocumentChunk
10
+
11
+
12
class PdfplumberAnchorProvider:
    """AnchorProvider that extracts line-level bounding boxes via pdfplumber.

    Every non-empty text line returned by ``page.extract_text_lines`` becomes
    one anchor. Table rows are not special-cased: the lines pdfplumber reports
    are used as-is for body text and tables alike.

    Coordinates are normalized to a 0-1000 scale with top-left origin.
    """

    async def generate_anchors(self, chunk: DocumentChunk) -> list[Anchor]:
        """Extract one anchor per text line of the chunk's PDF data.

        Args:
            chunk: Document chunk; ``chunk.data`` holds the PDF bytes and
                ``chunk.start_page`` the page number of its first page.

        Returns:
            Anchors in page order, each carrying the page number, the
            normalized bounding box and the NFKC-normalized line text.
        """
        anchors: list[Anchor] = []

        # The context manager closes the PDF even when extraction raises;
        # a manual open()/close() pair would leak the handle on error.
        with pdfplumber.open(io.BytesIO(chunk.data)) as pdf:
            for page_offset, page in enumerate(pdf.pages):
                page_num = chunk.start_page + page_offset
                pw, ph = page.width, page.height
                if pw <= 0 or ph <= 0:
                    logging.warning(
                        "Page %d has invalid dimensions (%s x %s), skipping",
                        page_num,
                        pw,
                        ph,
                    )
                    continue

                lines = page.extract_text_lines(return_chars=False, use_text_flow=True)

                for line in lines:
                    text = unicodedata.normalize("NFKC", line["text"].strip())
                    if not text:
                        continue

                    x0, x1 = line["x0"], line["x1"]
                    # pdfplumber's top starts at baseline minus font size,
                    # missing the ascender portion above the glyph. Pad upward
                    # by ~30% of line height to approximate full glyph bounds.
                    line_h = line["bottom"] - line["top"]
                    y0 = line["top"] - 0.3 * line_h
                    y1 = line["bottom"]

                    # Lines inside and outside table regions were handled
                    # identically, so no table detection is performed here.
                    anchors.append(
                        Anchor(
                            page=page_num,
                            box=_normalize(x0, y0, x1, y1, pw, ph),
                            text=text,
                        )
                    )

        logging.debug("Generated %d bounding boxes", len(anchors))
        return anchors
79
+
80
+
81
+ def _rects_intersect(a: tuple[float, float, float, float], b: tuple[float, float, float, float]) -> bool:
82
+ """Check if two (x0, y0, x1, y1) rectangles overlap."""
83
+ ax0, ay0, ax1, ay1 = a
84
+ bx0, by0, bx1, by1 = b
85
+ return ax0 < bx1 and ax1 > bx0 and ay0 < by1 and ay1 > by0
86
+
87
+
88
def _normalize(x0: float, y0: float, x1: float, y1: float, pw: float, ph: float) -> BBox:
    """Convert PDF coordinates to 0-1000 scale with top-left origin.

    Args:
        x0: Left edge in page units.
        y0: Top edge in page units (may be negative after upward padding).
        x1: Right edge in page units.
        y1: Bottom edge in page units.
        pw: Page width; must be positive.
        ph: Page height; must be positive.

    Returns:
        A BBox whose four values are integers clamped to the range 0..1000.
    """

    def _scale(value: float, extent: float) -> int:
        # Clamp so that boxes reaching past the page edge (e.g. a top-of-page
        # line whose top was padded upward) stay inside the documented scale
        # and never produce negative values.
        return min(1000, max(0, int(value / extent * 1000)))

    return BBox(
        top=_scale(y0, ph),
        left=_scale(x0, pw),
        bottom=_scale(y1, ph),
        right=_scale(x1, pw),
    )
groundmark/process.py ADDED
@@ -0,0 +1,82 @@
1
+ import dataclasses
2
+
3
+ from anchorite import process_document
4
+ from anchorite.document import DocumentChunk, chunks
5
+
6
+ from groundmark.markdown import PydanticAIMarkdownProvider
7
+ from groundmark.parse import PdfplumberAnchorProvider
8
+
9
+
10
+ @dataclasses.dataclass(frozen=True)
11
+ class Config:
12
+ """Configuration for the groundmark processing pipeline."""
13
+
14
+ model: str
15
+ """Pydantic AI model string (e.g. "bedrock:au.anthropic.claude-sonnet-4-6")."""
16
+ uniqueness_threshold: float = 0.5
17
+ """Minimum score ratio between best and second-best alignment match."""
18
+ min_overlap: float = 0.9
19
+ """Minimum overlap fraction required for a valid alignment match."""
20
+ page_count: int | None = None
21
+ """Pages per chunk (None = whole PDF in one chunk)."""
22
+ prompt: str | None = None
23
+ """Custom LLM prompt (None = use built-in default)."""
24
+
25
+
26
@dataclasses.dataclass
class ProcessResult:
    """Outcome of turning a PDF into annotated Markdown.

    Attributes:
        annotated_markdown: Markdown with
            <span data-bbox="..." data-page="N"> tags.
        coverage_percent: Fraction of markdown content covered by aligned
            bounding boxes.
    """

    annotated_markdown: str
    coverage_percent: float
34
+
35
+
36
async def process(
    pdf_bytes: bytes,
    config: Config,
    *,
    markdown: str | None = None,
) -> ProcessResult:
    """Turn a PDF into Markdown annotated with bounding box spans.

    Args:
        pdf_bytes: Raw PDF file bytes.
        config: Processing configuration.
        markdown: Optional pre-generated markdown; when given, no LLM
            provider is created and the PDF is not split by page count.

    Returns:
        ProcessResult with annotated markdown and coverage stats.
    """
    provider: _CachedMarkdownProvider | PydanticAIMarkdownProvider
    if markdown is None:
        provider = PydanticAIMarkdownProvider(config.model, prompt=config.prompt)
        doc_chunks = chunks(pdf_bytes, page_count=config.page_count)
    else:
        # Cached markdown corresponds to the whole PDF — don't chunk.
        provider = _CachedMarkdownProvider(markdown)
        doc_chunks = chunks(pdf_bytes)

    aligned = await process_document(
        doc_chunks,
        markdown_provider=provider,
        anchor_provider=PdfplumberAnchorProvider(),
        alignment_uniqueness_threshold=config.uniqueness_threshold,
        alignment_min_overlap=config.min_overlap,
    )

    annotated = aligned.annotate()
    return ProcessResult(annotated_markdown=annotated, coverage_percent=aligned.coverage_percent)
73
+
74
+
75
+ class _CachedMarkdownProvider:
76
+ """Returns pre-generated markdown, ignoring the chunk."""
77
+
78
+ def __init__(self, markdown: str) -> None:
79
+ self._markdown = markdown
80
+
81
+ async def generate_markdown(self, _chunk: DocumentChunk) -> str:
82
+ return self._markdown
@@ -0,0 +1,201 @@
1
+ """Debug visualizer: overlay bounding boxes from annotated Markdown onto the source PDF."""
2
+
3
+ import asyncio
4
+ import io
5
+ import logging
6
+ import re
7
+ import sys
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Annotated
11
+
12
+ import typer
13
+ from anchorite import Anchor, align, annotate
14
+ from anchorite.document import chunks
15
+ from pypdf import PdfReader, PdfWriter
16
+ from pypdf.annotations import Highlight
17
+ from pypdf.generic import ArrayObject, FloatObject
18
+
19
+ from groundmark.markdown import PydanticAIMarkdownProvider
20
+ from groundmark.parse import PdfplumberAnchorProvider
21
+
22
+ _SPAN_RE = re.compile(
23
+ r'<span\s+data-bbox="(\d+),(\d+),(\d+),(\d+)"\s+data-page="(\d+)">(.*?)</span>',
24
+ re.DOTALL,
25
+ )
26
+
27
+ app = typer.Typer()
28
+
29
+
30
+ def _log(msg: str) -> None:
31
+ print(msg, file=sys.stderr)
32
+
33
+
34
def _add_highlight(
    writer: PdfWriter,
    page_num: int,
    bbox: tuple[int, int, int, int],
    page_width: float,
    page_height: float,
    *,
    color: tuple[float, float, float],
) -> None:
    """Attach one highlight annotation for ``bbox`` to a page of ``writer``.

    Args:
        writer: PDF writer that receives the annotation.
        page_num: Page number passed to ``writer.add_annotation``.
        bbox: ``(top, left, bottom, right)`` on the 0-1000 scale.
        page_width: Page width in PDF units.
        page_height: Page height in PDF units.
        color: RGB components as floats; each is scaled by 255 and
            rendered as two hex digits.
    """
    top, left, bottom, right = bbox

    # Convert from 0-1000 scale to PDF coordinates (bottom-left origin).
    x_left = left / 1000 * page_width
    x_right = right / 1000 * page_width
    y_low = page_height - bottom / 1000 * page_height
    y_high = page_height - top / 1000 * page_height

    # Corner order: top-left, top-right, bottom-left, bottom-right.
    corners = (
        (x_left, y_high),
        (x_right, y_high),
        (x_left, y_low),
        (x_right, y_low),
    )
    quad_points = ArrayObject([FloatObject(coord) for corner in corners for coord in corner])

    hex_color = "".join(f"{int(channel * 255):02x}" for channel in color)

    highlight = Highlight(
        rect=(x_left, y_low, x_right, y_high),
        quad_points=quad_points,
        highlight_color=hex_color,
    )
    writer.add_annotation(page_number=page_num, annotation=highlight)
69
+
70
+
71
def _overlay_bboxes(
    pdf_bytes: bytes,
    annotated_markdown: str,
    raw_anchors: list[Anchor],
) -> bytes:
    """Return a copy of the PDF with bounding box highlights drawn on it.

    Blue = raw extracted boxes, Red = aligned (from annotated Markdown).
    Boxes whose page number is not below the PDF's page count are skipped.
    """
    reader = PdfReader(io.BytesIO(pdf_bytes))
    writer = PdfWriter()
    writer.append_pages_from_reader(reader)

    def _highlight(page_index: int, bbox: tuple[int, int, int, int], color: tuple[float, float, float]) -> None:
        # Shared by both passes: bounds check, page size lookup, annotation.
        if page_index >= len(reader.pages):
            return
        mediabox = reader.pages[page_index].mediabox
        _add_highlight(
            writer,
            page_index,
            bbox,
            float(mediabox.width),
            float(mediabox.height),
            color=color,
        )

    # Blue: raw extracted boxes.
    for anchor in raw_anchors:
        box = anchor.box
        _highlight(anchor.page, (box.top, box.left, box.bottom, box.right), (0.8, 0.9, 1))

    # Red: aligned boxes from annotated Markdown.
    for span in _SPAN_RE.finditer(annotated_markdown):
        top, left, bottom, right, page_index = (int(span[group]) for group in range(1, 6))
        _highlight(page_index, (top, left, bottom, right), (1, 0.85, 0.85))

    out = io.BytesIO()
    writer.write(out)
    return out.getvalue()
108
+
109
+
110
@app.command()
def visualize(
    input_pdf: Annotated[Path, typer.Argument(help="Path to the source PDF.")],
    output_pdf: Annotated[Path, typer.Argument(help="Path for the output PDF with bbox overlays.")],
    model: Annotated[str, typer.Option(help="Pydantic AI model string.")] = "",
    markdown_file: Annotated[Path | None, typer.Option("--markdown", "-m", help="Cached Markdown file.")] = None,
    uniqueness_threshold: Annotated[float, typer.Option("--threshold", "-t")] = 0.5,
    min_overlap: Annotated[float, typer.Option("--overlap", "-o")] = 0.9,
    page_count: Annotated[int | None, typer.Option("--page-count", "-p", help="Pages per chunk.")] = None,
    verbose: Annotated[bool, typer.Option("--verbose", "-v", help="Enable debug logging.")] = False,
) -> None:
    """Run the groundmark pipeline on a PDF and overlay bounding boxes."""
    # Debug logging goes to stderr, alongside the progress messages.
    if verbose:
        logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)

    # Markdown must come from somewhere: a cached file or a model to call.
    if not (markdown_file or model):
        _log("Error: provide --model or --markdown")
        raise SystemExit(1)

    pipeline = _run(
        input_pdf,
        output_pdf,
        model,
        markdown_file,
        uniqueness_threshold,
        min_overlap,
        page_count,
    )
    asyncio.run(pipeline)
140
+
141
+
142
async def _run(
    input_pdf: Path,
    output_pdf: Path,
    model: str,
    markdown_file: Path | None,
    uniqueness_threshold: float,
    min_overlap: float,
    page_count: int | None,
) -> None:
    """Run extraction, Markdown generation/loading, alignment and overlay.

    Progress and timings are reported on stderr via ``_log``.

    Args:
        input_pdf: Source PDF to read.
        output_pdf: Destination for the PDF with highlight overlays. When
            Markdown is generated, it is also written next to this file with
            a ``.md`` suffix.
        model: Pydantic AI model string; used only when no cached Markdown
            file is given.
        markdown_file: Cached Markdown to load instead of calling the model.
        uniqueness_threshold: Forwarded to ``align``.
        min_overlap: Forwarded to ``align``.
        page_count: Pages per chunk, forwarded to ``chunks``.
    """
    pdf_bytes = input_pdf.read_bytes()
    doc_chunks = list(chunks(pdf_bytes, page_count=page_count))
    t0 = time.perf_counter()

    # Extract anchors (bounding boxes) from the PDF.
    _log("Extracting bounding boxes...")
    t = time.perf_counter()
    provider = PdfplumberAnchorProvider()
    all_anchors = await asyncio.gather(*(provider.generate_anchors(c) for c in doc_chunks))
    flat_anchors = [a for chunk_anchors in all_anchors for a in chunk_anchors]
    _log(f" {len(flat_anchors)} bounding boxes ({time.perf_counter() - t:.1f}s)")

    # Generate or load Markdown. The text routinely contains non-ASCII
    # symbols, so files are read and written as UTF-8 explicitly instead of
    # relying on the platform's locale encoding.
    if markdown_file:
        markdown = markdown_file.read_text(encoding="utf-8")
        _log(f" Loaded cached Markdown from {markdown_file}")
    else:
        _log("Generating Markdown...")
        t = time.perf_counter()
        md_provider = PydanticAIMarkdownProvider(model)
        md_chunks = await asyncio.gather(*(md_provider.generate_markdown(c) for c in doc_chunks))
        markdown = "\n\n<!--page-->\n\n".join(md_chunks)
        _log(f" {len(markdown)} chars ({time.perf_counter() - t:.1f}s)")

        # Write plain markdown before alignment so it's available even if alignment hangs.
        md_path = output_pdf.with_suffix(".md")
        md_path.write_text(markdown, encoding="utf-8")
        _log(f" Markdown written to {md_path}")

    # Align anchors to Markdown.
    _log("Aligning...")
    t = time.perf_counter()
    alignment = align(
        flat_anchors,
        markdown,
        uniqueness_threshold=uniqueness_threshold,
        min_overlap=min_overlap,
    )
    annotated_markdown = annotate(markdown, alignment)
    # Share of Markdown characters covered by aligned spans; an empty
    # document counts as zero coverage rather than dividing by zero.
    coverage = sum(e - s for s, e in alignment.values()) / len(markdown) if markdown else 0.0
    _log(f" {coverage:.1%} coverage ({time.perf_counter() - t:.1f}s)")

    output_bytes = _overlay_bboxes(pdf_bytes, annotated_markdown, flat_anchors)
    output_pdf.write_bytes(output_bytes)
    _log(f" Output written to {output_pdf}")
    _log(f"Total: {time.perf_counter() - t0:.1f}s")
198
+
199
+
200
+ if __name__ == "__main__":
201
+ app()
@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.4
2
+ Name: groundmark
3
+ Version: 0.1.0
4
+ Summary: PDF to grounded Markdown with bounding box annotations
5
+ Project-URL: Homepage, https://github.com/populationgenomics/groundmark
6
+ Project-URL: Bug Tracker, https://github.com/populationgenomics/groundmark/issues
7
+ Author-email: Tobias Sargeant <tobias.sargeant@gmail.com>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Requires-Python: >=3.11
17
+ Requires-Dist: anchorite>=0.1.1
18
+ Requires-Dist: pdfplumber>=0.11.9
19
+ Requires-Dist: pydantic-ai-slim[anthropic,bedrock,google,openai]>=1.67.0
20
+ Requires-Dist: pypdf>=6.8.0
21
+ Description-Content-Type: text/markdown
22
+
23
+ # groundmark
24
+
25
+ <img src="groundmark.webp" alt="groundmark" width="200">
26
+
27
+ ## Grounded Markdown for PDFs
28
+
29
+ **groundmark is a thin, batteries-included wrapper around [anchorite](https://github.com/populationgenomics/anchorite).** It provides concrete implementations of anchorite's provider protocols — [Pydantic AI](https://ai.pydantic.dev/) for LLM-based Markdown generation and [pdfplumber](https://github.com/jsvine/pdfplumber) for bounding box extraction — so you can go from PDF bytes to annotated Markdown in a single call. All the heavy lifting (Smith-Waterman alignment, annotation, stripping, quote resolution) lives in anchorite.
30
+
31
+ Give it a PDF and a model string, get back Markdown with embedded bounding box coordinates that trace every text span back to its location in the source PDF.
32
+
33
+ ## Architecture
34
+
35
+ The library processes documents in two parallel streams (steps 1–2), whose outputs are then merged by alignment and annotation (steps 3–4):
36
+
37
+ 1. **Semantic Stream**: The PDF is sent to an LLM (via Pydantic AI) to produce clean Markdown with `<!--page-->` markers between pages.
38
+ 2. **Positional Stream**: The PDF is parsed locally by pdfplumber to extract line-level text segments and their bounding boxes.
39
+ 3. **Alignment**: Smith-Waterman alignment (via anchorite) maps each parsed line to its position in the Markdown, constrained by page boundaries.
40
+ 4. **Annotation**: Bounding box coordinates are injected as HTML span attributes:
41
+
42
+ ```html
43
+ <span data-bbox="120,45,180,890" data-page="3">The patient presented with</span>
44
+ ```
45
+
46
+ ## Quick Start
47
+
48
+ ```python
49
+ import asyncio
50
+ import groundmark as gm
51
+
52
+ async def main():
53
+ pdf_bytes = open("document.pdf", "rb").read()
54
+
55
+ config = gm.Config(model="bedrock:au.anthropic.claude-sonnet-4-6")
56
+
57
+ # PDF -> annotated Markdown (one call)
58
+ result = await gm.process(pdf_bytes, config)
59
+ print(f"Coverage: {result.coverage_percent:.2%}")
60
+ print(result.annotated_markdown[:500])
61
+
62
+ # Strip for LLM consumption
63
+ stripped = gm.strip(result.annotated_markdown)
64
+ # stripped.plain_text: clean Markdown with spans removed
65
+ # stripped.validation_map: list of (start, end, Anchor) ranges
66
+
67
+ # Resolve verbatim quotes to PDF coordinates
68
+ resolved = gm.resolve(result.annotated_markdown, ["the patient presented with"])
69
+ # -> {"the patient presented with": [(page, BBox), ...]}
70
+
71
+ if __name__ == "__main__":
72
+ asyncio.run(main())
73
+ ```
74
+
75
+ ## Debug Visualizer
76
+
77
+ The included visualizer overlays extracted bounding boxes onto the source PDF, useful for diagnosing alignment issues. Blue highlights show raw extracted boxes from pdfplumber; red highlights show aligned boxes from the annotated Markdown.
78
+
79
+ ```bash
80
+ python -m groundmark.visualize input.pdf output.pdf --model "bedrock:au.anthropic.claude-sonnet-4-6"
81
+
82
+ # Or with cached Markdown:
83
+ python -m groundmark.visualize input.pdf output.pdf --markdown cached.md
84
+ ```
85
+
86
+ ![Visualizer output showing blue (raw) and red (aligned) bounding box overlays](visualize_example.jpg)
87
+
88
+ *Screenshot from Santoro et al., "Health outcomes and drug utilisation in children with Noonan syndrome: a European cohort study," Orphanet J Rare Dis 20:76 (2025). [doi:10.1186/s13023-025-03594-7](https://doi.org/10.1186/s13023-025-03594-7). CC-BY 4.0.*
89
+
90
+ ## Configuration
91
+
92
+ ### Timeouts
93
+
94
+ The LLM call for PDF-to-Markdown conversion can take several minutes for large documents, especially with Opus on Bedrock. Timeout defaults by provider:
95
+
96
+ | Provider | Default | Environment Variable |
97
+ |----------|---------|---------------------|
98
+ | Bedrock (boto3) | 300s | `AWS_READ_TIMEOUT` |
99
+ | Anthropic (httpx) | 600s | — (use `ModelSettings(timeout=...)`) |
100
+
101
+ For Bedrock with Opus, 300s may not be enough. Set a higher timeout:
102
+
103
+ ```bash
104
+ export AWS_READ_TIMEOUT=600
105
+ ```
106
+
107
+ ## License
108
+
109
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,9 @@
1
+ groundmark/__init__.py,sha256=nuEYgTIqO-5cTZ7XpWzmnKt01EKpuRsqKC_eDHaolM0,273
2
+ groundmark/markdown.py,sha256=v_zBQyL7ArWqmdYrjBuj2nXlXMRHHEYm9kd9bfwC5IY,4407
3
+ groundmark/parse.py,sha256=sw91PbgskQiGzgU3A1N5HTly7yfEvBPqotazRnhwfD0,3460
4
+ groundmark/process.py,sha256=PHCNAwIyS6JgIYCQM094uJmYiOnGRODGEbAFXuiqb7w,2696
5
+ groundmark/visualize.py,sha256=BfvirOZyYSD__VqDuF_kZW9cAzT1AMjZ741gpJsFyOo,6755
6
+ groundmark-0.1.0.dist-info/METADATA,sha256=hIjcRwYGxhhVXav93h0bb3oi4M9Z35blwYkuMIiiNEQ,4690
7
+ groundmark-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
8
+ groundmark-0.1.0.dist-info/licenses/LICENSE,sha256=lwIanGu698z5WiBedbwEOdUJLQb_G4FOFgucqExVIS0,1087
9
+ groundmark-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Centre for Population Genomics
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.