inscriber 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. inscriber-0.1.0/.gitignore +26 -0
  2. inscriber-0.1.0/AGENTS.md +151 -0
  3. inscriber-0.1.0/CLAUDE.md +7 -0
  4. inscriber-0.1.0/DESIGN.md +2681 -0
  5. inscriber-0.1.0/LICENSE +21 -0
  6. inscriber-0.1.0/PKG-INFO +313 -0
  7. inscriber-0.1.0/README.md +278 -0
  8. inscriber-0.1.0/TODO.md +129 -0
  9. inscriber-0.1.0/config.example.toml +167 -0
  10. inscriber-0.1.0/inscriber/__init__.py +6 -0
  11. inscriber-0.1.0/inscriber/__main__.py +8 -0
  12. inscriber-0.1.0/inscriber/bibtex/__init__.py +1 -0
  13. inscriber-0.1.0/inscriber/bibtex/arxiv.py +127 -0
  14. inscriber-0.1.0/inscriber/bibtex/chain.py +104 -0
  15. inscriber-0.1.0/inscriber/bibtex/local.py +39 -0
  16. inscriber-0.1.0/inscriber/bibtex/probe.py +125 -0
  17. inscriber-0.1.0/inscriber/bibtex/semantic_scholar.py +224 -0
  18. inscriber-0.1.0/inscriber/bundle.py +214 -0
  19. inscriber-0.1.0/inscriber/cache.py +390 -0
  20. inscriber-0.1.0/inscriber/cli.py +459 -0
  21. inscriber-0.1.0/inscriber/config.py +324 -0
  22. inscriber-0.1.0/inscriber/errors.py +16 -0
  23. inscriber-0.1.0/inscriber/input/__init__.py +1 -0
  24. inscriber-0.1.0/inscriber/input/domain_handlers.py +204 -0
  25. inscriber-0.1.0/inscriber/input/resolver.py +157 -0
  26. inscriber-0.1.0/inscriber/llama/__init__.py +1 -0
  27. inscriber-0.1.0/inscriber/llama/client.py +135 -0
  28. inscriber-0.1.0/inscriber/llama/server.py +412 -0
  29. inscriber-0.1.0/inscriber/logging.py +52 -0
  30. inscriber-0.1.0/inscriber/models.py +279 -0
  31. inscriber-0.1.0/inscriber/ocr/__init__.py +1 -0
  32. inscriber-0.1.0/inscriber/ocr/base.py +263 -0
  33. inscriber-0.1.0/inscriber/ocr/deepseek.py +212 -0
  34. inscriber-0.1.0/inscriber/ocr/glm.py +95 -0
  35. inscriber-0.1.0/inscriber/ocr/registry.py +33 -0
  36. inscriber-0.1.0/inscriber/output.py +133 -0
  37. inscriber-0.1.0/inscriber/pdf/__init__.py +1 -0
  38. inscriber-0.1.0/inscriber/pdf/crop.py +108 -0
  39. inscriber-0.1.0/inscriber/pdf/figures.py +83 -0
  40. inscriber-0.1.0/inscriber/pdf/rasterize.py +120 -0
  41. inscriber-0.1.0/inscriber/pipeline.py +1226 -0
  42. inscriber-0.1.0/inscriber/postprocess/__init__.py +1 -0
  43. inscriber-0.1.0/inscriber/postprocess/inject.py +90 -0
  44. inscriber-0.1.0/inscriber/postprocess/join.py +112 -0
  45. inscriber-0.1.0/inscriber/postprocess/notice.py +54 -0
  46. inscriber-0.1.0/inscriber/postprocess/prompt.py +107 -0
  47. inscriber-0.1.0/inscriber/postprocess/splitter.py +200 -0
  48. inscriber-0.1.0/inscriber/postprocess/stitch.py +201 -0
  49. inscriber-0.1.0/inscriber/postprocess/tables.py +278 -0
  50. inscriber-0.1.0/inscriber/serialize.py +62 -0
  51. inscriber-0.1.0/inscriber/setup.py +510 -0
  52. inscriber-0.1.0/inscriber/vlm/__init__.py +1 -0
  53. inscriber-0.1.0/inscriber/vlm/base.py +84 -0
  54. inscriber-0.1.0/inscriber/vlm/gemma.py +134 -0
  55. inscriber-0.1.0/inscriber/vlm/registry.py +21 -0
  56. inscriber-0.1.0/pyproject.toml +77 -0
  57. inscriber-0.1.0/tests/conftest.py +30 -0
  58. inscriber-0.1.0/tests/fixtures/bibtex_best_effort.txt +9 -0
  59. inscriber-0.1.0/tests/fixtures/bibtex_mock.txt +12 -0
  60. inscriber-0.1.0/tests/fixtures/calibration.json +136 -0
  61. inscriber-0.1.0/tests/fixtures/calibration.pdf +130 -0
  62. inscriber-0.1.0/tests/fixtures/deepseek_calibration_gundam2048_raw.txt +6 -0
  63. inscriber-0.1.0/tests/fixtures/deepseek_calibration_raw.txt +6 -0
  64. inscriber-0.1.0/tests/fixtures/deepseek_paper_p1_raw.txt +30 -0
  65. inscriber-0.1.0/tests/fixtures/deepseek_paper_table_p27_raw.txt +35 -0
  66. inscriber-0.1.0/tests/fixtures/make_calibration.py +150 -0
  67. inscriber-0.1.0/tests/fixtures/make_sample_paper.py +82 -0
  68. inscriber-0.1.0/tests/fixtures/sample_paper.pdf +0 -0
  69. inscriber-0.1.0/tests/test_bibtex.py +165 -0
  70. inscriber-0.1.0/tests/test_bibtex_chain.py +585 -0
  71. inscriber-0.1.0/tests/test_bibtex_probe.py +308 -0
  72. inscriber-0.1.0/tests/test_bundle_roundtrip.py +225 -0
  73. inscriber-0.1.0/tests/test_cache.py +204 -0
  74. inscriber-0.1.0/tests/test_cli.py +54 -0
  75. inscriber-0.1.0/tests/test_config.py +461 -0
  76. inscriber-0.1.0/tests/test_crop.py +57 -0
  77. inscriber-0.1.0/tests/test_deepseek_parser.py +171 -0
  78. inscriber-0.1.0/tests/test_domain_handlers.py +109 -0
  79. inscriber-0.1.0/tests/test_inject.py +77 -0
  80. inscriber-0.1.0/tests/test_join.py +208 -0
  81. inscriber-0.1.0/tests/test_llama_server.py +404 -0
  82. inscriber-0.1.0/tests/test_logging.py +33 -0
  83. inscriber-0.1.0/tests/test_notice.py +43 -0
  84. inscriber-0.1.0/tests/test_ocr_truncation.py +260 -0
  85. inscriber-0.1.0/tests/test_output.py +120 -0
  86. inscriber-0.1.0/tests/test_pdf_embedded_figures.py +75 -0
  87. inscriber-0.1.0/tests/test_pipeline_mocked.py +458 -0
  88. inscriber-0.1.0/tests/test_prompt.py +68 -0
  89. inscriber-0.1.0/tests/test_rasterize.py +112 -0
  90. inscriber-0.1.0/tests/test_resolver.py +119 -0
  91. inscriber-0.1.0/tests/test_setup.py +408 -0
  92. inscriber-0.1.0/tests/test_splitter.py +111 -0
  93. inscriber-0.1.0/tests/test_stitch.py +124 -0
  94. inscriber-0.1.0/tests/test_tables.py +927 -0
  95. inscriber-0.1.0/tests/test_vlm_truncation.py +96 -0
@@ -0,0 +1,26 @@
1
+ paper2llm/
2
+
3
+ # Python
4
+ __pycache__/
5
+ *.py[cod]
6
+ *.egg-info/
7
+ .eggs/
8
+ build/
9
+ dist/
10
+ .venv/
11
+ .venv*/
12
+ venv/
13
+ .env
14
+
15
+ # Tooling caches
16
+ .pytest_cache/
17
+ .ruff_cache/
18
+ .mypy_cache/
19
+
20
+ # inscriber runtime/test artifacts
21
+ config.toml
22
+ dev/benchmarks/**/*.pdf
23
+ *.inscriber-ocr/
24
+ /out*/
25
+ /out*.log
26
+ tmp-*/
@@ -0,0 +1,151 @@
1
+ ## What this is
2
+
3
+ `inscriber` is a cross-platform CLI that converts academic PDFs into LLM-friendly
4
+ text-only Markdown **entirely locally** via llama.cpp (DeepSeek-OCR for text +
5
+ figure grounding; a Gemma 4 VLM for figure descriptions and table restructuring).
6
+ It is a Python port of the cloud web app `paper2llm`. No ML libraries in the
7
+ package — all inference is a llama.cpp subprocess driven over its
8
+ OpenAI-compatible HTTP API.
9
+
10
+ ## Commands
11
+
12
+ ```bash
13
+ python -m venv .venv
14
+ .venv/Scripts/activate # Windows; source .venv/bin/activate elsewhere
15
+ pip install -e ".[dev]"
16
+
17
+ pytest # full suite; mocked inference, no GPU/models needed
18
+ pytest tests/test_tables.py -k locator # single file / keyword
19
+ ruff check # lint (config in pyproject.toml)
20
+ mypy inscriber # type check (clean since 2026-06-11; keep it so)
21
+
22
+ python -m inscriber run paper.pdf -o out/ # real runs need llama.cpp + GGUFs
23
+ ```
24
+
25
+ Real runs read `./config.toml` (gitignored, machine-local) — `config.example.toml`
26
+ is the tracked template. The maintainer's setup is llama.cpp build 9587 on an
27
+ RTX 4060 8GB. Builds **older than 9587 are refused for OCR**
28
+ (`DeepSeekOcrBackend.min_server_build` — the grounding coordinate frame changed
29
+ upstream; DESIGN §2.2, `dev/notes/2026-06-10-build-9587-verification.md`).
30
+
31
+ ## Where truth lives (read before changing behavior)
32
+
33
+ - **`DESIGN.md` is the authoritative, living spec** — it states the confirmed
34
+ model behavior directly (OCR grounding format/coordinate frame in §2.1–2.2 and
35
+ §8.3; the table pass and its pinned prompt in §9.7). Code comments cite its
36
+ sections (`§9.7` etc.). When you change behavior, update DESIGN.md, README.md,
37
+ and `config.example.toml` in the same change — this repo treats docs as
38
+ first-class.
39
+ - `dev/notes/` holds the **dated lab notes** (`YYYY-MM-DD-name.md`) — the
40
+ empirical evidence records behind those sections. Consult them before
41
+ changing model-facing behavior; when new real-hardware findings land, add a
42
+ new dated note (or an addendum/status line on an existing one) rather than
43
+ rewriting history.
44
+ - `TODO.md` tracks concrete pending items (real-hardware verifications, code
45
+ debts, blocked refinements) — add to it rather than burying TODOs in spec
46
+ prose; longer-horizon future work stays in DESIGN §22.
47
+ - `dev/` is developer-only material (never user-facing): scripts, the dated
48
+ lab notes above, `dev/plans/` — executed feature plans and build roadmaps,
49
+ archived as design records — and `dev/benchmarks/` — verified gold
50
+ transcriptions of real papers plus the error inventory of the run that
51
+ produced them, for comparing OCR/VLM approaches (see each folder's README;
52
+ source PDFs are gitignored, fetch them from the recorded URL).
53
+
54
+ ## Architecture
55
+
56
+ Pipeline (`inscriber/pipeline.py` orchestrates; DESIGN §3): resolve input →
57
+ rasterize (PyMuPDF) → per-page OCR → figure crop → **VLM table restructuring →
58
+ VLM figure description → BibTeX citability probe** (in that order — figure
59
+ context must see clean tables; the text-only probe shares the open VLM
60
+ session) → stitch/clean → split (main/appendix/backmatter) → BibTeX (default
61
+ `auto`: provenance/probe citability → source chain; DESIGN §12) → write.
62
+
63
+ - **Five subcommands**: `run` (end-to-end), `ocr` (writes a portable _bundle_:
64
+ `manifest.json` + `figures/` crops + `pages/` rasters for table pages),
65
+ `describe` (bundle → VLM + assembly, no OCR model), `join` (rejoin
66
+ possibly hand-edited `{base}_main/_appendix/_backmatter.md` splits into
67
+ `{base}_full.md`, the §11 allparts form; pure text, no models needed,
68
+ though an existing config file is still structurally validated — the
69
+ deliberate global policy of DESIGN §13.1), and `setup` (DESIGN §13.4:
70
+ downloads and verifies the recommended GGUFs against
71
+ a **pinned sha256/size registry** in `inscriber/setup.py` and writes/updates
72
+ the platform `config.toml`; outside the pipeline — no RunConfig. The
73
+ registry pins and the README model table must change together). Output
74
+ base name precedence (DESIGN §14): explicit `--name` > the BibTeX
75
+ citation key (`name_from_bibtex`, default on; e.g. `chang2025amortized`) >
76
+ source-derived.
77
+ `run` = `ocr` + `describe` sharing in-memory objects. The bundle's
78
+ `bundle_schema` int is the compatibility gate; new manifest fields must be
79
+ additive or bump it.
80
+ - **Sequential single-model-resident by default**: the OCR server is torn down
81
+ before the VLM server starts. Both VLM passes (tables, figures) share one
82
+ lazily-launched server (`_VlmSession`) that only starts on the first cache
83
+ miss. `--mode concurrent` pre-launches the VLM server instead.
84
+ - **Backends own their model's quirks**: `OcrBackend.ocr_page` owns the whole
85
+ per-page inference (prompt, calls, parsing, coordinate mapping into the
86
+ original-page `[0,1]` frame); `VlmBackend` owns `describe` and
87
+ `restructure_table`. Registries map names → classes; adding a backend must
88
+ require zero pipeline changes.
89
+ - **Caching is content-addressed and load-bearing** (`inscriber/cache.py`):
90
+ per-page OCR cache + shared VLM store (figure descriptions and restructured
91
+ tables, disjoint key payloads). Keys include model+mmproj _content_ identities,
92
+ the llama.cpp **build identity** (`llama_build_identity` in `llama/server.py`
93
+ — `llama-server --version`, or the endpoint's `/props` `build_info`), the
94
+ fully assembled prompt, sampling, and `chat_template_kwargs`. **Anything
95
+ that changes model output must become key material.** `--refresh` recomputes
96
+ and overwrites; `--no-cache` neither reads nor writes. Never cache a failed
97
+ result. One deliberate nuance: a *truncated* OCR page (repetition loop hit
98
+ the token cap) IS cached, flagged `truncated`, and re-warned on every hit —
99
+ its key pins every output-determining knob, so a recompute could only
100
+ reproduce the loop (DESIGN §8.6; the table pass differs because its key
101
+ excludes `ctx_size`).
102
+
103
+ ## Invariants and gotchas
104
+
105
+ - `⟦INSCRIBER_FIG:{id}⟧` placeholders are the **only** anchors tying figures to
106
+ their position in page markdown (DeepSeek emits no `![]()`); never strip them
107
+ without injecting their replacement.
108
+ - **`ctx_size` is the single size knob.** VLM calls send no `max_tokens`
109
+ (generation bounded by the context window; truncation detected via
110
+ `finish_reason != "stop"`). The one exception is DeepSeek-OCR's internal 8192-token
111
+ cap — it is an anti-repetition-loop guard (llama.cpp lacks the model's n-gram
112
+ penalty), not a verbosity knob. Keep it. Also keep DeepSeek at BF16/F16
113
+ weights (the Q4_K_M quantization falls into repetition loops) and `temperature: 0` everywhere.
114
+ - Gemma 4 is a thinking model; thinking is explicitly activated per request via
115
+ `chat_template_kwargs: {"enable_thinking": true}` (verified to toggle on build
116
+ 9028).
117
+ - The table pass falls back to the original `<table>` blob on **any** failure
118
+ (error, truncation, commentary, empty) — the blob still holds every value.
119
+ Bundle page rasters are written **verbatim** so `run` and `describe` share
120
+ table cache keys.
121
+ - **Many behaviors are deliberate verbatim ports from paper2llm** (the figure
122
+ prompt, `> **Image description.**` header strings, splitter regexes, BibTeX
123
+ mock/warning text, the allparts section reordering). DESIGN §23–24 maps each
124
+ to its TypeScript source — check there before "fixing" something that looks
125
+ odd.
126
+ - Cross-platform rules (DESIGN §15): `pathlib` everywhere, subprocess with list
127
+ args (never `shell=True`), `platformdirs` for config/cache dirs, `.exe`
128
+ suffixing via `config.binary_filename`, and **always write text files with
129
+ `encoding="utf-8", newline="\n"`**.
130
+
131
+ ## Testing conventions
132
+
133
+ - Tests mock at the **chat-client boundary**: monkeypatch `ChatClient.chat_image`
134
+ AND `ChatClient.chat` (the text-only BibTeX probe lands on `chat`), and fake
135
+ `LlamaServerManager.serve` (see `tests/test_pipeline_mocked.py`,
136
+ `tests/test_tables.py`). Mock prompts are discriminated by content
137
+ (`"<|grounding|>"` / `"Convert the document to markdown"` for OCR,
138
+ `"reconstructing ONE table"` for tables, `"bibliographic metadata"` for the
139
+ BibTeX probe, else figure). Probe fakes default to `{"citable": false}` so
140
+ default-`auto` runs stay inert and network-free in tests.
141
+ - Use the shared `hermetic_cache` fixture (defined once in `tests/conftest.py`:
142
+ monkeypatches `cache.default_cache_dir` / `default_vlm_cache_dir` into tmp and
143
+ pins the llama.cpp build-identity probe) — never let tests touch the real
144
+ platformdirs cache. The near-duplicated per-file helpers (`_dummy_models`,
145
+ `_mock_inference`, cfg builders) deliberately stay file-local — see the
146
+ conftest docstring for why.
147
+ - `tests/fixtures/deepseek_paper_p1_raw.txt` is the golden real-output format the
148
+ DeepSeek parser is pinned to; extend it rather than inventing new shapes.
149
+ - Changes to llama.cpp-facing behavior (prompts, template kwargs, server flags)
150
+ cannot be proven by mocked tests — verify on real hardware (see
151
+ `dev/scripts/` for prior spike patterns) and record findings in `dev/notes/`.
@@ -0,0 +1,7 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ **Project information (expanded below):**
6
+
7
+ @AGENTS.md