anchorite 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. anchorite-0.1.0/.github/workflows/build.yaml +33 -0
  2. anchorite-0.1.0/.github/workflows/lint.yaml +25 -0
  3. anchorite-0.1.0/.github/workflows/release.yaml +68 -0
  4. anchorite-0.1.0/.gitignore +7 -0
  5. anchorite-0.1.0/.markdownlint.yaml +5 -0
  6. anchorite-0.1.0/.pre-commit-config.yaml +51 -0
  7. anchorite-0.1.0/.python-version +1 -0
  8. anchorite-0.1.0/.readthedocs.yaml +17 -0
  9. anchorite-0.1.0/LICENSE +21 -0
  10. anchorite-0.1.0/PKG-INFO +246 -0
  11. anchorite-0.1.0/README.md +227 -0
  12. anchorite-0.1.0/docs/Makefile +20 -0
  13. anchorite-0.1.0/docs/make.bat +35 -0
  14. anchorite-0.1.0/docs/requirements.txt +4 -0
  15. anchorite-0.1.0/docs/source/_static/anchorite.svg +1 -0
  16. anchorite-0.1.0/docs/source/api.rst +32 -0
  17. anchorite-0.1.0/docs/source/conf.py +21 -0
  18. anchorite-0.1.0/docs/source/index.rst +16 -0
  19. anchorite-0.1.0/docs/source/readme.md +4 -0
  20. anchorite-0.1.0/pyproject.toml +63 -0
  21. anchorite-0.1.0/src/anchorite/__init__.py +392 -0
  22. anchorite-0.1.0/src/anchorite/bbox_alignment.py +421 -0
  23. anchorite-0.1.0/src/anchorite/document.py +102 -0
  24. anchorite-0.1.0/src/anchorite/markdown.py +18 -0
  25. anchorite-0.1.0/src/anchorite/orchestrator.py +111 -0
  26. anchorite-0.1.0/src/anchorite/providers.py +16 -0
  27. anchorite-0.1.0/src/anchorite/range_ops.py +182 -0
  28. anchorite-0.1.0/src/anchorite/types.py +30 -0
  29. anchorite-0.1.0/tests/fixtures/hubble_anchors.json +5194 -0
  30. anchorite-0.1.0/tests/fixtures/hubble_golden.md +312 -0
  31. anchorite-0.1.0/tests/fixtures/hubble_markdown_chunks.json +1 -0
  32. anchorite-0.1.0/tests/test_anchorite.py +76 -0
  33. anchorite-0.1.0/tests/test_bbox_alignment.py +88 -0
  34. anchorite-0.1.0/tests/test_markdown.py +12 -0
  35. anchorite-0.1.0/tests/test_ocr_annotation.py +80 -0
  36. anchorite-0.1.0/tests/test_ocr_nesting.py +75 -0
  37. anchorite-0.1.0/tests/test_range_ops.py +178 -0
  38. anchorite-0.1.0/tests/test_regression.py +80 -0
  39. anchorite-0.1.0/uv.lock +272 -0
@@ -0,0 +1,33 @@
1
+ name: CI Test Build
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches:
7
+ - main
8
+
9
+ jobs:
10
+ build:
11
+ name: Build pure Python package
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - name: Install uv
17
+ uses: astral-sh/setup-uv@v5
18
+
19
+ - name: Set up Python
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: '3.11'
23
+
24
+ - name: Update uv lock
25
+ run: uv lock
26
+
27
+ - name: Build package
28
+ run: uv build
29
+
30
+ - uses: actions/upload-artifact@v4
31
+ with:
32
+ name: dist
33
+ path: dist/*
@@ -0,0 +1,25 @@
1
+ name: Lint
2
+ on: [push]
3
+
4
+ jobs:
5
+ lint:
6
+ runs-on: ubuntu-latest
7
+ defaults:
8
+ run:
9
+ shell: bash -l {0}
10
+
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - uses: actions/setup-python@v5
15
+ with:
16
+ python-version: '3.11'
17
+
18
+ - name: Install packages
19
+ run: pip install pre-commit
20
+
21
+ - name: Install pre-commit hooks
22
+ run: pre-commit install --install-hooks
23
+
24
+ - name: Run pre-commit
25
+ run: pre-commit run --all-files
@@ -0,0 +1,68 @@
1
+ name: Release
2
+
3
+ on:
4
+ release:
5
+ types: [created]
6
+
7
+ jobs:
8
+ build:
9
+ name: Build distribution
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Install uv
15
+ uses: astral-sh/setup-uv@v5
16
+
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v5
19
+ with:
20
+ python-version: '3.11'
21
+
22
+ - name: Update uv lock
23
+ run: uv lock
24
+
25
+ - name: Build package
26
+ run: uv build
27
+
28
+ - uses: actions/upload-artifact@v4
29
+ with:
30
+ name: dist
31
+ path: dist/*
32
+
33
+ upload_release_assets:
34
+ name: Upload Assets to Release
35
+ runs-on: ubuntu-latest
36
+ needs: [build]
37
+ permissions:
38
+ contents: write
39
+ steps:
40
+ - name: Download all artifacts
41
+ uses: actions/download-artifact@v4
42
+ with:
43
+ path: artifacts
44
+ merge-multiple: true
45
+ - name: Upload Wheels and sdist
46
+ uses: softprops/action-gh-release@v2
47
+ with:
48
+ tag_name: ${{ github.event.release.tag_name }}
49
+ files: artifacts/*
50
+ overwrite_files: true
51
+
52
+ publish-to-pypi:
53
+ name: Publish to PyPI
54
+ runs-on: ubuntu-latest
55
+ needs: [build]
56
+ environment:
57
+ name: pypi
58
+ url: https://pypi.org/p/anchorite
59
+ permissions:
60
+ id-token: write # Required for trusted publishing
61
+ steps:
62
+ - name: Download all wheels and sdist
63
+ uses: actions/download-artifact@v4
64
+ with:
65
+ path: dist
66
+ merge-multiple: true
67
+ - name: Publish to PyPI
68
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,7 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .venv/
7
+ docs/build/
@@ -0,0 +1,5 @@
1
+ default: true
2
+ MD013: false
3
+ MD033:
4
+ allowed_elements:
5
+ - img
@@ -0,0 +1,51 @@
1
+ default_language_version:
2
+ python: python3.11
3
+ repos:
4
+ - repo: https://github.com/pre-commit/pre-commit-hooks
5
+ rev: v6.0.0
6
+ hooks:
7
+ - id: check-yaml
8
+ - id: end-of-file-fixer
9
+ exclude: 'tests/fixtures'
10
+ - id: trailing-whitespace
11
+ exclude: '\.txt$|\.tsv$'
12
+ - id: check-case-conflict
13
+ - id: check-merge-conflict
14
+ - id: detect-private-key
15
+ - id: debug-statements
16
+ - id: check-added-large-files
17
+
18
+ - repo: https://github.com/igorshubovych/markdownlint-cli
19
+ rev: v0.45.0
20
+ hooks:
21
+ - id: markdownlint
22
+ exclude: 'tests/fixtures'
23
+
24
+ - repo: https://github.com/populationgenomics/pre-commits
25
+ rev: "e37928f761f17d54aca5cedf93848b40ec7cff26"
26
+ hooks:
27
+ - id: cpg-id-checker
28
+
29
+ - repo: https://github.com/astral-sh/ruff-pre-commit
30
+ rev: v0.14.1
31
+ hooks:
32
+ - id: ruff
33
+ args: ["--fix"]
34
+ - id: ruff-format
35
+
36
+ - repo: https://github.com/pre-commit/mirrors-mypy
37
+ rev: v1.18.2
38
+ hooks:
39
+ - id: mypy
40
+ exclude: "docs/"
41
+ args:
42
+ [
43
+ --pretty,
44
+ --show-error-codes,
45
+ --no-strict-optional,
46
+ --ignore-missing-imports,
47
+ --install-types,
48
+ --non-interactive,
49
+ --config-file=./pyproject.toml
50
+ ]
51
+ additional_dependencies: [types-PyYAML==6.0.4, types-toml]
@@ -0,0 +1 @@
1
+ 3.11
@@ -0,0 +1,17 @@
1
+ version: 2
2
+
3
+ build:
4
+ os: ubuntu-lts-latest
5
+ tools:
6
+ python: "3.11"
7
+
8
+ sphinx:
9
+ configuration: docs/source/conf.py
10
+
11
+ formats: all
12
+
13
+ python:
14
+ install:
15
+ - requirements: docs/requirements.txt
16
+ - method: pip
17
+ path: .
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Centre for Population Genomics
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,246 @@
1
+ Metadata-Version: 2.4
2
+ Name: anchorite
3
+ Version: 0.1.0
4
+ Summary: Spatial text alignment and resolution for document OCR
5
+ Author-email: Tobias Sargeant <tobias.sargeant@gmail.com>
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Requires-Python: >=3.11
15
+ Requires-Dist: fsspec
16
+ Requires-Dist: pymupdf
17
+ Requires-Dist: seq-smith>=0.5.1
18
+ Description-Content-Type: text/markdown
19
+
20
+ # anchorite
21
+
22
+ <img src="https://raw.githubusercontent.com/populationgenomics/anchorite/main/docs/source/_static/anchorite.svg" alt="anchorite" width="200">
23
+
24
+ **Spatial text alignment for document AI pipelines.**
25
+
26
+ `anchorite` aligns generated Markdown text back to the physical bounding boxes that an OCR engine found on the original document pages. It bridges the gap between generative AI (which produces high-quality, readable Markdown) and traditional OCR (which provides precise coordinates) by finding where each OCR word or phrase appears in the generated output.
27
+
28
+ ---
29
+
30
+ ## The problem
31
+
32
+ Modern document AI pipelines often combine two sources:
33
+
34
+ 1. **A generative model** (Gemini, Claude, GPT-4) that reads a page image and produces clean, well-structured Markdown.
35
+ 2. **An OCR engine** (Google Document AI, Tesseract, Docling) that identifies individual words and their bounding boxes on the page.
36
+
37
+ The generative model's output is readable and accurate but has no coordinates. The OCR output has precise coordinates but poor structure. `anchorite` fuses them: it takes the Markdown as the ground truth for text content and finds the corresponding bounding box for each OCR word or phrase within it.
38
+
39
+ ---
40
+
41
+ ## Installation
42
+
43
+ ```shell
44
+ pip install anchorite
45
+ ```
46
+
47
+ ---
48
+
49
+ ## Core concepts
50
+
51
+ **`Anchor`** — a piece of OCR text with its location: a `text` string, a `page` number (0-indexed), and a `BBox` (bounding box in 0–1000 normalised coordinates).
52
+
53
+ **`BBox`** — a bounding box `(top, left, bottom, right)`.
54
+
55
+ **`alignment`** — a `dict[Anchor, tuple[int, int]]` mapping each anchor to a `(start, end)` character span in the Markdown string.
56
+
57
+ ---
58
+
59
+ ## Workflows
60
+
61
+ ### 1. Align and annotate
62
+
63
+ The most common workflow: align OCR anchors to Markdown, then inject coordinate spans.
64
+
65
+ ```python
66
+ import anchorite
67
+
68
+ anchors = [
69
+ anchorite.Anchor(text="Observations of a Nebula", page=0, box=anchorite.BBox(52, 120, 68, 880)),
70
+ anchorite.Anchor(text="Edwin Hubble", page=0, box=anchorite.BBox(80, 340, 92, 660)),
71
+ ]
72
+
73
+ markdown = "# Observations of a Nebula\n\n*Edwin Hubble*, 1929"
74
+
75
+ alignment = anchorite.align(anchors, markdown)
76
+ annotated = anchorite.annotate(markdown, alignment)
77
+ # <span data-bbox="52,120,68,880" data-page="0">Observations of a Nebula</span>
78
+ # <span data-bbox="80,340,92,660" data-page="0">Edwin Hubble</span>
79
+ ```
80
+
81
+ The annotated Markdown is otherwise valid Markdown and can be rendered normally; the `<span>` tags carry coordinate metadata as HTML attributes.
82
+
83
+ ### 2. Resolve quotes to coordinates
84
+
85
+ Given annotated Markdown and a list of verbatim quotes (e.g. extracted by an LLM), find the bounding boxes that each quote covers. Useful for grounding LLM citations.
86
+
87
+ ```python
88
+ locations = anchorite.resolve(annotated, quotes=["Observations of a Nebula"])
89
+ # {"Observations of a Nebula": [(0, BBox(52, 120, 68, 880))]}
90
+ ```
91
+
92
+ `resolve` uses fuzzy iterative matching so it tolerates minor transcription differences. Each quote maps to a list of `(page, BBox)` pairs — one per distinct OCR anchor the quote overlaps.
93
+
94
+ ### 3. Strip annotations for downstream validation
95
+
96
+ `strip` is the inverse of `annotate`. It removes the `<span>` tags and returns a plain-text string alongside a validation map you can use to check whether a generated quote is grounded in the source document.
97
+
98
+ ```python
99
+ stripped = anchorite.strip(annotated)
100
+ # stripped.plain_text — Markdown with tags removed
101
+ # stripped.validation_map — list of (start, end, Anchor) in plain_text
102
+ ```
103
+
104
+ ### 4. Orchestrated multi-page processing
105
+
106
+ For pipelines that process multi-page documents, `process_document` handles parallelism, page-chunk assembly, and alignment in one call. You supply pre-chunked document data and implement two provider protocols.
107
+
108
+ ```python
109
+ import asyncio
110
+ import anchorite
111
+ from anchorite.document import DocumentChunk
112
+ from anchorite.providers import MarkdownProvider, AnchorProvider
113
+
114
+ class MyMarkdownProvider:
115
+ async def generate_markdown(self, chunk: DocumentChunk) -> str:
116
+ # Call your LLM or OCR layout model here
117
+ ...
118
+
119
+ class MyAnchorProvider:
120
+ async def generate_anchors(self, chunk: DocumentChunk) -> list[anchorite.Anchor]:
121
+ # Call your OCR engine here and return Anchor objects
122
+ ...
123
+
124
+ # Chunk the document yourself (e.g. 10 pages per chunk)
125
+ chunks = list(anchorite.document.chunks("paper.pdf", page_count=10))
126
+
127
+ result = asyncio.run(anchorite.process_document(
128
+ chunks,
129
+ MyMarkdownProvider(),
130
+ MyAnchorProvider(),
131
+ ))
132
+
133
+ print(result.coverage_percent) # fraction of Markdown covered by aligned anchors
134
+ annotated = result.annotate() # AlignmentResult.annotate() calls anchorite.annotate internally
135
+ ```
136
+
137
+ `process_document` runs the markdown and anchor providers concurrently across all chunks using `asyncio.gather`, then aligns the assembled full-document Markdown against the complete anchor set.
138
+
139
+ #### Provider protocols
140
+
141
+ ```python
142
+ class MarkdownProvider(Protocol):
143
+ async def generate_markdown(self, chunk: DocumentChunk) -> str: ...
144
+
145
+ class AnchorProvider(Protocol):
146
+ async def generate_anchors(self, chunk: DocumentChunk) -> list[Anchor]: ...
147
+ ```
148
+
149
+ Both are structural protocols — no inheritance required, duck typing works.
150
+
151
+ #### Document chunking
152
+
153
+ `anchorite.document.chunks(source, *, page_count, mime_type)` splits a PDF into sub-documents of `page_count` pages each. `source` can be a file path, URL, `bytes`, or a file-like object. Images (PNG, JPEG, WebP) are yielded as a single chunk unchanged.
154
+
155
+ You do not have to use `anchorite.document.chunks`. If your pipeline already produces chunks (for example, Docling's own document parser), create `DocumentChunk` objects directly:
156
+
157
+ ```python
158
+ from anchorite.document import DocumentChunk
159
+
160
+ chunk = DocumentChunk(
161
+ document_sha256="abc123...",
162
+ start_page=0,
163
+ end_page=10,
164
+ data=pdf_bytes,
165
+ mime_type="application/pdf",
166
+ )
167
+ ```
168
+
169
+ ---
170
+
171
+ ## API reference
172
+
173
+ ### `anchorite.align(anchors, markdown, *, uniqueness_threshold, min_overlap)`
174
+
175
+ Aligns a sequence of `Anchor` objects to a Markdown string. Returns `dict[Anchor, tuple[int, int]]`.
176
+
177
+ | Parameter | Default | Description |
178
+ |---|---|---|
179
+ | `uniqueness_threshold` | `0.5` | An anchor is accepted only if its best-match score exceeds this fraction of its second-best score. Higher values demand more unique matches. |
180
+ | `min_overlap` | `0.9` | Minimum fraction of the anchor's normalised length that must be covered by the alignment. |
181
+
182
+ ### `anchorite.annotate(markdown, alignment)`
183
+
184
+ Injects `<span data-bbox="t,l,b,r" data-page="N">` tags into Markdown at the positions given by `alignment`. Handles overlapping and nested spans. Math blocks (`$...$`, `$$...$$`) are detected and span boundaries are snapped to their edges so LaTeX is not broken.
185
+
186
+ ### `anchorite.strip(annotated_md)`
187
+
188
+ Removes `<span>` tags and returns a `StrippedMarkdown` with fields:
189
+
190
+ - `plain_text`: the Markdown with all tags removed
191
+ - `validation_map`: sorted list of `(start, end, Anchor)` tuples in `plain_text` coordinates
192
+
193
+ ### `anchorite.resolve(annotated_md, quotes)`
194
+
195
+ Resolves a list of verbatim quote strings to their bounding boxes using fuzzy iterative Smith-Waterman alignment against the stripped text. Returns `dict[str, list[tuple[int, BBox]]]` mapping each quote to a list of `(page, BBox)` pairs.
196
+
197
+ ### `anchorite.process_document(chunks, markdown_provider, anchor_provider, *, ...)`
198
+
199
+ Orchestrates multi-chunk document alignment. Returns `AlignmentResult`.
200
+
201
+ | Parameter | Default | Description |
202
+ |---|---|---|
203
+ | `alignment_uniqueness_threshold` | `0.5` | Passed to `align`. |
204
+ | `alignment_min_overlap` | `0.9` | Passed to `align`. |
205
+ | `renumber` | `True` | Renumber `<!--table-->` and `<!--figure-->` markers across chunks before joining. |
206
+
207
+ ---
208
+
209
+ ## Algorithm
210
+
211
+ ### Normalisation
212
+
213
+ Before any alignment, text is normalised to a reduced alphabet: letters are lowercased, all non-alphanumeric characters (punctuation, whitespace variants) are mapped to a single space, and consecutive spaces are collapsed to one. This makes the alignment robust to minor formatting differences between the OCR text and the generated Markdown (e.g. hyphenation, ligatures, smart quotes).
214
+
215
+ ### Document fragmentation
216
+
217
+ The Markdown is split at HTML comment markers (e.g. `<!--page-->`, `<!--table: 1-->`) into contiguous fragments. Each fragment inherits a page range from its position in the assembled document, which is used to restrict which anchors can match it — anchors are only compared against fragments whose page range includes the anchor's page number.
218
+
219
+ ### Iterative alignment
220
+
221
+ The core loop runs until all anchors are matched or no further progress is made.
222
+
223
+ **Pass 1 — ungapped alignment.** Each unmatched anchor is aligned against each compatible document fragment using ungapped Smith-Waterman local alignment (via `seq_smith.top_k_ungapped_local_align_many`, retrieving the top-2 scores per anchor per fragment). An anchor is promoted to a high-confidence candidate only if both conditions hold:
224
+
225
+ - *Overlap*: the best-match score covers at least `min_overlap` of the anchor's normalised length.
226
+ - *Uniqueness*: the best-match score exceeds `uniqueness_threshold` × the second-best score, ensuring the match is not ambiguous.
227
+
228
+ **Subsequent passes — gapped alignment.** The same candidate-selection logic is repeated using semi-global alignment (`seq_smith.local_global_align_many`), which allows gaps within the alignment. This recovers anchors that the LLM paraphrased or reformatted slightly.
229
+
230
+ ### Span assignment
231
+
232
+ Once a set of high-confidence candidates is identified for a fragment, each candidate is assigned a precise character range within the fragment. Candidates are processed in descending alignment score order and are accepted only if:
233
+
234
+ 1. At least 90% of the aligned positions are exact character matches (no-gap criterion within the assignment step).
235
+ 2. The proposed range is *page-consistent*: anchors from earlier pages must map to earlier positions in the Markdown than anchors from later pages.
236
+ 3. At least 90% of the proposed range is *new* coverage — not already claimed by a higher-scoring anchor in the same fragment.
237
+
238
+ The assigned range is mapped back from normalised-character coordinates to original Markdown character offsets via the `normalized_to_source` index.
239
+
240
+ ### Fragment splitting
241
+
242
+ After assignment, any portion of a document fragment not covered by any accepted anchor becomes a new sub-fragment for subsequent iterations. This allows later iterations to focus on progressively smaller uncovered regions, recovering matches that were hidden by initially ambiguous context.
243
+
244
+ ### Result
245
+
246
+ The final result is a `dict[Anchor, (start, end)]` giving the character span in the original Markdown for each successfully aligned anchor. Anchors that could not be matched with sufficient confidence are omitted.