inscriber 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inscriber-0.1.0/.gitignore +26 -0
- inscriber-0.1.0/AGENTS.md +151 -0
- inscriber-0.1.0/CLAUDE.md +7 -0
- inscriber-0.1.0/DESIGN.md +2681 -0
- inscriber-0.1.0/LICENSE +21 -0
- inscriber-0.1.0/PKG-INFO +313 -0
- inscriber-0.1.0/README.md +278 -0
- inscriber-0.1.0/TODO.md +129 -0
- inscriber-0.1.0/config.example.toml +167 -0
- inscriber-0.1.0/inscriber/__init__.py +6 -0
- inscriber-0.1.0/inscriber/__main__.py +8 -0
- inscriber-0.1.0/inscriber/bibtex/__init__.py +1 -0
- inscriber-0.1.0/inscriber/bibtex/arxiv.py +127 -0
- inscriber-0.1.0/inscriber/bibtex/chain.py +104 -0
- inscriber-0.1.0/inscriber/bibtex/local.py +39 -0
- inscriber-0.1.0/inscriber/bibtex/probe.py +125 -0
- inscriber-0.1.0/inscriber/bibtex/semantic_scholar.py +224 -0
- inscriber-0.1.0/inscriber/bundle.py +214 -0
- inscriber-0.1.0/inscriber/cache.py +390 -0
- inscriber-0.1.0/inscriber/cli.py +459 -0
- inscriber-0.1.0/inscriber/config.py +324 -0
- inscriber-0.1.0/inscriber/errors.py +16 -0
- inscriber-0.1.0/inscriber/input/__init__.py +1 -0
- inscriber-0.1.0/inscriber/input/domain_handlers.py +204 -0
- inscriber-0.1.0/inscriber/input/resolver.py +157 -0
- inscriber-0.1.0/inscriber/llama/__init__.py +1 -0
- inscriber-0.1.0/inscriber/llama/client.py +135 -0
- inscriber-0.1.0/inscriber/llama/server.py +412 -0
- inscriber-0.1.0/inscriber/logging.py +52 -0
- inscriber-0.1.0/inscriber/models.py +279 -0
- inscriber-0.1.0/inscriber/ocr/__init__.py +1 -0
- inscriber-0.1.0/inscriber/ocr/base.py +263 -0
- inscriber-0.1.0/inscriber/ocr/deepseek.py +212 -0
- inscriber-0.1.0/inscriber/ocr/glm.py +95 -0
- inscriber-0.1.0/inscriber/ocr/registry.py +33 -0
- inscriber-0.1.0/inscriber/output.py +133 -0
- inscriber-0.1.0/inscriber/pdf/__init__.py +1 -0
- inscriber-0.1.0/inscriber/pdf/crop.py +108 -0
- inscriber-0.1.0/inscriber/pdf/figures.py +83 -0
- inscriber-0.1.0/inscriber/pdf/rasterize.py +120 -0
- inscriber-0.1.0/inscriber/pipeline.py +1226 -0
- inscriber-0.1.0/inscriber/postprocess/__init__.py +1 -0
- inscriber-0.1.0/inscriber/postprocess/inject.py +90 -0
- inscriber-0.1.0/inscriber/postprocess/join.py +112 -0
- inscriber-0.1.0/inscriber/postprocess/notice.py +54 -0
- inscriber-0.1.0/inscriber/postprocess/prompt.py +107 -0
- inscriber-0.1.0/inscriber/postprocess/splitter.py +200 -0
- inscriber-0.1.0/inscriber/postprocess/stitch.py +201 -0
- inscriber-0.1.0/inscriber/postprocess/tables.py +278 -0
- inscriber-0.1.0/inscriber/serialize.py +62 -0
- inscriber-0.1.0/inscriber/setup.py +510 -0
- inscriber-0.1.0/inscriber/vlm/__init__.py +1 -0
- inscriber-0.1.0/inscriber/vlm/base.py +84 -0
- inscriber-0.1.0/inscriber/vlm/gemma.py +134 -0
- inscriber-0.1.0/inscriber/vlm/registry.py +21 -0
- inscriber-0.1.0/pyproject.toml +77 -0
- inscriber-0.1.0/tests/conftest.py +30 -0
- inscriber-0.1.0/tests/fixtures/bibtex_best_effort.txt +9 -0
- inscriber-0.1.0/tests/fixtures/bibtex_mock.txt +12 -0
- inscriber-0.1.0/tests/fixtures/calibration.json +136 -0
- inscriber-0.1.0/tests/fixtures/calibration.pdf +130 -0
- inscriber-0.1.0/tests/fixtures/deepseek_calibration_gundam2048_raw.txt +6 -0
- inscriber-0.1.0/tests/fixtures/deepseek_calibration_raw.txt +6 -0
- inscriber-0.1.0/tests/fixtures/deepseek_paper_p1_raw.txt +30 -0
- inscriber-0.1.0/tests/fixtures/deepseek_paper_table_p27_raw.txt +35 -0
- inscriber-0.1.0/tests/fixtures/make_calibration.py +150 -0
- inscriber-0.1.0/tests/fixtures/make_sample_paper.py +82 -0
- inscriber-0.1.0/tests/fixtures/sample_paper.pdf +0 -0
- inscriber-0.1.0/tests/test_bibtex.py +165 -0
- inscriber-0.1.0/tests/test_bibtex_chain.py +585 -0
- inscriber-0.1.0/tests/test_bibtex_probe.py +308 -0
- inscriber-0.1.0/tests/test_bundle_roundtrip.py +225 -0
- inscriber-0.1.0/tests/test_cache.py +204 -0
- inscriber-0.1.0/tests/test_cli.py +54 -0
- inscriber-0.1.0/tests/test_config.py +461 -0
- inscriber-0.1.0/tests/test_crop.py +57 -0
- inscriber-0.1.0/tests/test_deepseek_parser.py +171 -0
- inscriber-0.1.0/tests/test_domain_handlers.py +109 -0
- inscriber-0.1.0/tests/test_inject.py +77 -0
- inscriber-0.1.0/tests/test_join.py +208 -0
- inscriber-0.1.0/tests/test_llama_server.py +404 -0
- inscriber-0.1.0/tests/test_logging.py +33 -0
- inscriber-0.1.0/tests/test_notice.py +43 -0
- inscriber-0.1.0/tests/test_ocr_truncation.py +260 -0
- inscriber-0.1.0/tests/test_output.py +120 -0
- inscriber-0.1.0/tests/test_pdf_embedded_figures.py +75 -0
- inscriber-0.1.0/tests/test_pipeline_mocked.py +458 -0
- inscriber-0.1.0/tests/test_prompt.py +68 -0
- inscriber-0.1.0/tests/test_rasterize.py +112 -0
- inscriber-0.1.0/tests/test_resolver.py +119 -0
- inscriber-0.1.0/tests/test_setup.py +408 -0
- inscriber-0.1.0/tests/test_splitter.py +111 -0
- inscriber-0.1.0/tests/test_stitch.py +124 -0
- inscriber-0.1.0/tests/test_tables.py +927 -0
- inscriber-0.1.0/tests/test_vlm_truncation.py +96 -0
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
paper2llm/
|
|
2
|
+
|
|
3
|
+
# Python
|
|
4
|
+
__pycache__/
|
|
5
|
+
*.py[cod]
|
|
6
|
+
*.egg-info/
|
|
7
|
+
.eggs/
|
|
8
|
+
build/
|
|
9
|
+
dist/
|
|
10
|
+
.venv/
|
|
11
|
+
.venv*/
|
|
12
|
+
venv/
|
|
13
|
+
.env
|
|
14
|
+
|
|
15
|
+
# Tooling caches
|
|
16
|
+
.pytest_cache/
|
|
17
|
+
.ruff_cache/
|
|
18
|
+
.mypy_cache/
|
|
19
|
+
|
|
20
|
+
# inscriber runtime/test artifacts
|
|
21
|
+
config.toml
|
|
22
|
+
dev/benchmarks/**/*.pdf
|
|
23
|
+
*.inscriber-ocr/
|
|
24
|
+
/out*/
|
|
25
|
+
/out*.log
|
|
26
|
+
tmp-*/
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
## What this is
|
|
2
|
+
|
|
3
|
+
`inscriber` is a cross-platform CLI that converts academic PDFs into LLM-friendly
|
|
4
|
+
text-only Markdown **entirely locally** via llama.cpp (DeepSeek-OCR for text +
|
|
5
|
+
figure grounding; a Gemma 4 VLM for figure descriptions and table restructuring).
|
|
6
|
+
It is a Python port of the cloud web app `paper2llm`. No ML libraries in the
|
|
7
|
+
package — all inference is a llama.cpp subprocess driven over its
|
|
8
|
+
OpenAI-compatible HTTP API.
|
|
9
|
+
|
|
10
|
+
## Commands
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
python -m venv .venv
|
|
14
|
+
.venv/Scripts/activate # Windows; source .venv/bin/activate elsewhere
|
|
15
|
+
pip install -e ".[dev]"
|
|
16
|
+
|
|
17
|
+
pytest # full suite; mocked inference, no GPU/models needed
|
|
18
|
+
pytest tests/test_tables.py -k locator # single file / keyword
|
|
19
|
+
ruff check # lint (config in pyproject.toml)
|
|
20
|
+
mypy inscriber # type check (clean since 2026-06-11; keep it so)
|
|
21
|
+
|
|
22
|
+
python -m inscriber run paper.pdf -o out/ # real runs need llama.cpp + GGUFs
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Real runs read `./config.toml` (gitignored, machine-local) — `config.example.toml`
|
|
26
|
+
is the tracked template. The maintainer's setup is llama.cpp build 9587 on an
|
|
27
|
+
RTX 4060 8GB. Builds **older than 9587 are refused for OCR**
|
|
28
|
+
(`DeepSeekOcrBackend.min_server_build` — the grounding coordinate frame changed
|
|
29
|
+
upstream; DESIGN §2.2, `dev/notes/2026-06-10-build-9587-verification.md`).
|
|
30
|
+
|
|
31
|
+
## Where truth lives (read before changing behavior)
|
|
32
|
+
|
|
33
|
+
- **`DESIGN.md` is the authoritative, living spec** — it states the confirmed
|
|
34
|
+
model behavior directly (OCR grounding format/coordinate frame in §2.1–2.2 and
|
|
35
|
+
§8.3; the table pass and its pinned prompt in §9.7). Code comments cite its
|
|
36
|
+
sections (`§9.7` etc.). When you change behavior, update DESIGN.md, README.md,
|
|
37
|
+
and `config.example.toml` in the same change — this repo treats docs as
|
|
38
|
+
first-class.
|
|
39
|
+
- `dev/notes/` holds the **dated lab notes** (`YYYY-MM-DD-name.md`) — the
|
|
40
|
+
empirical evidence records behind those sections. Consult them before
|
|
41
|
+
changing model-facing behavior; when new real-hardware findings land, add a
|
|
42
|
+
new dated note (or an addendum/status line on an existing one) rather than
|
|
43
|
+
rewriting history.
|
|
44
|
+
- `TODO.md` tracks concrete pending items (real-hardware verifications, code
|
|
45
|
+
debts, blocked refinements) — add to it rather than burying TODOs in spec
|
|
46
|
+
prose; longer-horizon future work stays in DESIGN §22.
|
|
47
|
+
- `dev/` is developer-only material (never user-facing): scripts, the dated
|
|
48
|
+
lab notes above, `dev/plans/` — executed feature plans and build roadmaps,
|
|
49
|
+
archived as design records — and `dev/benchmarks/` — verified gold
|
|
50
|
+
transcriptions of real papers plus the error inventory of the run that
|
|
51
|
+
produced them, for comparing OCR/VLM approaches (see each folder's README;
|
|
52
|
+
source PDFs are gitignored, fetch them from the recorded URL).
|
|
53
|
+
|
|
54
|
+
## Architecture
|
|
55
|
+
|
|
56
|
+
Pipeline (`inscriber/pipeline.py` orchestrates; DESIGN §3): resolve input →
|
|
57
|
+
rasterize (PyMuPDF) → per-page OCR → figure crop → **VLM table restructuring →
|
|
58
|
+
VLM figure description → BibTeX citability probe** (in that order — figure
|
|
59
|
+
context must see clean tables; the text-only probe shares the open VLM
|
|
60
|
+
session) → stitch/clean → split (main/appendix/backmatter) → BibTeX (default
|
|
61
|
+
`auto`: provenance/probe citability → source chain; DESIGN §12) → write.
|
|
62
|
+
|
|
63
|
+
- **Five subcommands**: `run` (end-to-end), `ocr` (writes a portable _bundle_:
|
|
64
|
+
`manifest.json` + `figures/` crops + `pages/` rasters for table pages),
|
|
65
|
+
`describe` (bundle → VLM + assembly, no OCR model), `join` (rejoin
|
|
66
|
+
possibly hand-edited `{base}_main/_appendix/_backmatter.md` splits into
|
|
67
|
+
`{base}_full.md` — the §11 allparts form; pure text, no models needed —
|
|
68
|
+
though an existing config file is still structurally validated, per the
|
|
69
|
+
deliberate global policy of DESIGN §13.1), and `setup` (DESIGN §13.4:
|
|
70
|
+
downloads the recommended GGUFs and verifies them against
|
|
71
|
+
a **pinned sha256/size registry** in `inscriber/setup.py` and writes/updates
|
|
72
|
+
the platform `config.toml`; outside the pipeline — no RunConfig. The
|
|
73
|
+
registry pins and the README model table must change together). Output
|
|
74
|
+
base name (DESIGN §14): explicit `--name` > the BibTeX
|
|
75
|
+
citation key (`name_from_bibtex`, default on; e.g. `chang2025amortized`) >
|
|
76
|
+
source-derived.
|
|
77
|
+
`run` = `ocr` + `describe` sharing in-memory objects. The bundle's
|
|
78
|
+
`bundle_schema` int is the compatibility gate; new manifest fields must be
|
|
79
|
+
additive or bump it.
|
|
80
|
+
- **Sequential single-model-resident by default**: the OCR server is torn down
|
|
81
|
+
before the VLM server starts. Both VLM passes (tables, figures) share one
|
|
82
|
+
lazily-launched server (`_VlmSession`) that only starts on the first cache
|
|
83
|
+
miss. `--mode concurrent` pre-launches the VLM server instead.
|
|
84
|
+
- **Backends own their model's quirks**: `OcrBackend.ocr_page` owns the whole
|
|
85
|
+
per-page inference (prompt, calls, parsing, coordinate mapping into the
|
|
86
|
+
original-page `[0,1]` frame); `VlmBackend` owns `describe` and
|
|
87
|
+
`restructure_table`. Registries map names → classes; adding a backend must
|
|
88
|
+
require zero pipeline changes.
|
|
89
|
+
- **Caching is content-addressed and load-bearing** (`inscriber/cache.py`):
|
|
90
|
+
per-page OCR cache + shared VLM store (figure descriptions and restructured
|
|
91
|
+
tables, disjoint key payloads). Keys include model+mmproj _content_ identities,
|
|
92
|
+
the llama.cpp **build identity** (`llama_build_identity` in `llama/server.py`
|
|
93
|
+
— `llama-server --version`, or the endpoint's `/props` `build_info`), the
|
|
94
|
+
fully assembled prompt, sampling, and `chat_template_kwargs`. **Anything
|
|
95
|
+
that changes model output must become key material.** `--refresh` recomputes
|
|
96
|
+
and overwrites; `--no-cache` neither reads nor writes. Never cache a failed
|
|
97
|
+
result. One deliberate nuance: a *truncated* OCR page (repetition loop hit
|
|
98
|
+
the token cap) IS cached, flagged `truncated`, and re-warned on every hit —
|
|
99
|
+
its key pins every output-determining knob, so a recompute could only
|
|
100
|
+
reproduce the loop (DESIGN §8.6; the table pass differs because its key
|
|
101
|
+
excludes `ctx_size`).
|
|
102
|
+
|
|
103
|
+
## Invariants and gotchas
|
|
104
|
+
|
|
105
|
+
- `⟦INSCRIBER_FIG:{id}⟧` placeholders are the **only** anchors tying figures to
|
|
106
|
+
their position in page markdown (DeepSeek emits no `![]()`); never strip them
|
|
107
|
+
without injecting their replacement.
|
|
108
|
+
- **`ctx_size` is the single size knob.** VLM calls send no `max_tokens`
|
|
109
|
+
(generation bounded by the context window; truncation detected via
|
|
110
|
+
`finish_reason != "stop"`). The one exception is DeepSeek-OCR's internal 8192-token
|
|
111
|
+
cap — it is an anti-repetition-loop guard (llama.cpp lacks the model's n-gram
|
|
112
|
+
penalty), not a verbosity knob. Keep it. Also keep DeepSeek at BF16/F16
|
|
113
|
+
weights (the Q4_K_M quantization falls into repetition loops) and `temperature: 0` everywhere.
|
|
114
|
+
- Gemma 4 is a thinking model; thinking is explicitly activated per request via
|
|
115
|
+
`chat_template_kwargs: {"enable_thinking": true}` (verified to toggle on build
|
|
116
|
+
9028).
|
|
117
|
+
- The table pass falls back to the original `<table>` blob on **any** failure
|
|
118
|
+
(error, truncation, commentary, empty) — the blob still holds every value.
|
|
119
|
+
Bundle page rasters are written **verbatim** so `run` and `describe` share
|
|
120
|
+
table cache keys.
|
|
121
|
+
- **Many behaviors are deliberate verbatim ports from paper2llm** (the figure
|
|
122
|
+
prompt, `> **Image description.**` header strings, splitter regexes, BibTeX
|
|
123
|
+
mock/warning text, the allparts section reordering). DESIGN §23–24 maps each
|
|
124
|
+
to its TypeScript source — check there before "fixing" something that looks
|
|
125
|
+
odd.
|
|
126
|
+
- Cross-platform rules (DESIGN §15): `pathlib` everywhere, subprocess with list
|
|
127
|
+
args (never `shell=True`), `platformdirs` for config/cache dirs, `.exe`
|
|
128
|
+
suffixing via `config.binary_filename`, and **always write text files with
|
|
129
|
+
`encoding="utf-8", newline="\n"`**.
|
|
130
|
+
|
|
131
|
+
## Testing conventions
|
|
132
|
+
|
|
133
|
+
- Tests mock at the **chat-client boundary**: monkeypatch `ChatClient.chat_image`
|
|
134
|
+
AND `ChatClient.chat` (the text-only BibTeX probe lands on `chat`), and fake
|
|
135
|
+
`LlamaServerManager.serve` (see `tests/test_pipeline_mocked.py`,
|
|
136
|
+
`tests/test_tables.py`). Mock prompts are discriminated by content
|
|
137
|
+
(`"<|grounding|>"` / `"Convert the document to markdown"` for OCR,
|
|
138
|
+
`"reconstructing ONE table"` for tables, `"bibliographic metadata"` for the
|
|
139
|
+
BibTeX probe, else figure). Probe fakes default to `{"citable": false}` so
|
|
140
|
+
default-`auto` runs stay inert and network-free in tests.
|
|
141
|
+
- Use the shared `hermetic_cache` fixture (defined once in `tests/conftest.py`:
|
|
142
|
+
monkeypatches `cache.default_cache_dir` / `default_vlm_cache_dir` into tmp and
|
|
143
|
+
pins the llama.cpp build-identity probe) — never let tests touch the real
|
|
144
|
+
platformdirs cache. The near-duplicated per-file helpers (`_dummy_models`,
|
|
145
|
+
`_mock_inference`, cfg builders) deliberately stay file-local — see the
|
|
146
|
+
conftest docstring for why.
|
|
147
|
+
- `tests/fixtures/deepseek_paper_p1_raw.txt` is the golden real-output format the
|
|
148
|
+
DeepSeek parser is pinned to; extend it rather than inventing new shapes.
|
|
149
|
+
- Changes to llama.cpp-facing behavior (prompts, template kwargs, server flags)
|
|
150
|
+
cannot be proven by mocked tests — verify on real hardware (see
|
|
151
|
+
`dev/scripts/` for prior spike patterns) and record findings in `dev/notes/`.
|