paper-verify 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. paper_verify-0.1.0/LICENSE +21 -0
  2. paper_verify-0.1.0/MANIFEST.in +7 -0
  3. paper_verify-0.1.0/PKG-INFO +475 -0
  4. paper_verify-0.1.0/README.md +439 -0
  5. paper_verify-0.1.0/docs/RELEASING.md +38 -0
  6. paper_verify-0.1.0/docs/harness/2026-06-11_webchat-cli-accessibility-design.md +161 -0
  7. paper_verify-0.1.0/docs/harness/2026-06-11_webchat-cli-accessibility-plan.md +1591 -0
  8. paper_verify-0.1.0/docs/harness-strategy.md +47 -0
  9. paper_verify-0.1.0/docs/providers/claude-code.md +24 -0
  10. paper_verify-0.1.0/docs/providers/codex.md +24 -0
  11. paper_verify-0.1.0/docs/providers/cursor.md +22 -0
  12. paper_verify-0.1.0/docs/providers/gemini.md +26 -0
  13. paper_verify-0.1.0/docs/webchat/webchat-prompt.ko.md +85 -0
  14. paper_verify-0.1.0/docs/webchat/webchat-prompt.md +88 -0
  15. paper_verify-0.1.0/examples/evidence-sample.json +50 -0
  16. paper_verify-0.1.0/examples/sample.md +25 -0
  17. paper_verify-0.1.0/integrations/plugins/paper-verify/.claude-plugin/plugin.json +15 -0
  18. paper_verify-0.1.0/integrations/plugins/paper-verify/.codex-plugin/plugin.json +26 -0
  19. paper_verify-0.1.0/integrations/plugins/paper-verify/.mcp.json +9 -0
  20. paper_verify-0.1.0/integrations/plugins/paper-verify/skills/paper-verify/SKILL.md +48 -0
  21. paper_verify-0.1.0/integrations/plugins/paper-verify/skills/paper-verify/agents/openai.yaml +4 -0
  22. paper_verify-0.1.0/integrations/skills/paper-verify/SKILL.md +92 -0
  23. paper_verify-0.1.0/integrations/skills/paper-verify/agents/openai.yaml +4 -0
  24. paper_verify-0.1.0/integrations/skills/paper-verify-webchat/SKILL.md +86 -0
  25. paper_verify-0.1.0/paper_verify.egg-info/PKG-INFO +475 -0
  26. paper_verify-0.1.0/paper_verify.egg-info/SOURCES.txt +71 -0
  27. paper_verify-0.1.0/paper_verify.egg-info/dependency_links.txt +1 -0
  28. paper_verify-0.1.0/paper_verify.egg-info/entry_points.txt +3 -0
  29. paper_verify-0.1.0/paper_verify.egg-info/requires.txt +21 -0
  30. paper_verify-0.1.0/paper_verify.egg-info/top_level.txt +1 -0
  31. paper_verify-0.1.0/paperverify/__init__.py +24 -0
  32. paper_verify-0.1.0/paperverify/__main__.py +6 -0
  33. paper_verify-0.1.0/paperverify/cli.py +352 -0
  34. paper_verify-0.1.0/paperverify/extract.py +106 -0
  35. paper_verify-0.1.0/paperverify/fetch.py +543 -0
  36. paper_verify-0.1.0/paperverify/harness/__init__.py +10 -0
  37. paper_verify-0.1.0/paperverify/harness/base.py +154 -0
  38. paper_verify-0.1.0/paperverify/judge.py +458 -0
  39. paper_verify-0.1.0/paperverify/mcp_server.py +238 -0
  40. paper_verify-0.1.0/paperverify/models.py +233 -0
  41. paper_verify-0.1.0/paperverify/offline.py +101 -0
  42. paper_verify-0.1.0/paperverify/report.py +203 -0
  43. paper_verify-0.1.0/paperverify/score.py +218 -0
  44. paper_verify-0.1.0/paperverify/sources.py +329 -0
  45. paper_verify-0.1.0/pyproject.toml +43 -0
  46. paper_verify-0.1.0/setup.cfg +4 -0
  47. paper_verify-0.1.0/tests/test_author_year.py +45 -0
  48. paper_verify-0.1.0/tests/test_blog_sources.py +91 -0
  49. paper_verify-0.1.0/tests/test_cli_coverage.py +386 -0
  50. paper_verify-0.1.0/tests/test_consensus_display.py +47 -0
  51. paper_verify-0.1.0/tests/test_cross_check_substantive.py +62 -0
  52. paper_verify-0.1.0/tests/test_extract.py +59 -0
  53. paper_verify-0.1.0/tests/test_extract_dedup.py +58 -0
  54. paper_verify-0.1.0/tests/test_fetch_coverage.py +685 -0
  55. paper_verify-0.1.0/tests/test_fetch_metadata.py +39 -0
  56. paper_verify-0.1.0/tests/test_fetch_ssrf.py +64 -0
  57. paper_verify-0.1.0/tests/test_harness_profiles.py +155 -0
  58. paper_verify-0.1.0/tests/test_json.py +185 -0
  59. paper_verify-0.1.0/tests/test_judge.py +30 -0
  60. paper_verify-0.1.0/tests/test_judge_coverage.py +509 -0
  61. paper_verify-0.1.0/tests/test_keyword_contradiction.py +58 -0
  62. paper_verify-0.1.0/tests/test_mcp_coverage.py +185 -0
  63. paper_verify-0.1.0/tests/test_mcp_server.py +123 -0
  64. paper_verify-0.1.0/tests/test_offline.py +180 -0
  65. paper_verify-0.1.0/tests/test_p1_robustness.py +406 -0
  66. paper_verify-0.1.0/tests/test_report.py +158 -0
  67. paper_verify-0.1.0/tests/test_report_keyword_caveat.py +54 -0
  68. paper_verify-0.1.0/tests/test_score.py +144 -0
  69. paper_verify-0.1.0/tests/test_score_l1_landing.py +46 -0
  70. paper_verify-0.1.0/tests/test_sources.py +338 -0
  71. paper_verify-0.1.0/tests/test_sources_arxiv.py +41 -0
  72. paper_verify-0.1.0/tests/test_sources_coverage.py +379 -0
  73. paper_verify-0.1.0/tests/test_webchat_assets.py +59 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Duchan Jin (진두찬)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,7 @@
1
+ include LICENSE
2
+ include README.md
3
+ graft docs
4
+ graft examples
5
+ graft integrations
6
+ graft tests
7
+ global-exclude __pycache__ *.py[cod] .DS_Store
@@ -0,0 +1,475 @@
1
+ Metadata-Version: 2.4
2
+ Name: paper-verify
3
+ Version: 0.1.0
4
+ Summary: Citation fact-checking for documents — extract, fetch, judge, and score citations on a 100-point rubric.
5
+ Author-email: "Duchan Jin (진두찬)" <nolainjin@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/nolainjin/paper-verify
8
+ Project-URL: Repository, https://github.com/nolainjin/paper-verify
9
+ Keywords: citation,fact-check,fact-checking,research,doi,arxiv,llm,verification
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Topic :: Scientific/Engineering
16
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
17
+ Requires-Python: >=3.10
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Provides-Extra: anthropic
21
+ Requires-Dist: anthropic>=0.40; extra == "anthropic"
22
+ Provides-Extra: openai
23
+ Requires-Dist: openai>=1.40; extra == "openai"
24
+ Provides-Extra: gemini
25
+ Requires-Dist: google-genai>=0.3; extra == "gemini"
26
+ Provides-Extra: mcp
27
+ Requires-Dist: mcp>=1.2; extra == "mcp"
28
+ Provides-Extra: all
29
+ Requires-Dist: anthropic>=0.40; extra == "all"
30
+ Requires-Dist: openai>=1.40; extra == "all"
31
+ Requires-Dist: google-genai>=0.3; extra == "all"
32
+ Requires-Dist: mcp>=1.2; extra == "all"
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest>=7; extra == "dev"
35
+ Dynamic: license-file
36
+
37
+ # paper-verify
38
+
39
+ > **한국어 가이드** → [README.ko.md](README.ko.md)
40
+
41
+ **Fact-check the citations in any document.** `paper-verify` extracts every
42
+ reference (URL / DOI / PMC / PMID / arXiv) from a Markdown or text file,
43
+ fetches each source, asks one or more LLMs whether the cited *claim* is actually
44
+ supported by the source, scores each citation on a transparent 100-point rubric,
45
+ and writes a Markdown report that flags fabricated, misquoted, or dead-link
46
+ citations.
47
+
48
+ > Built for researchers, grad students, lecturers, and bloggers who need to
49
+ > trust their own footnotes — and to catch AI-hallucinated citations before
50
+ > they ship.
51
+
52
+ It is also a **review triage tool**: use it to reduce a long bibliography or
53
+ blog source list into the small set a human should actually inspect. The JSON
54
+ surface exposes `tier`, `consensus`, `effective_verdict`, judge disagreement,
55
+ `source`, `landing_status`, and `soft_404_suspect`, so agents can build a
56
+ shortlist such as **Must Review**, **Review If Important**, and **Probably
57
+ Safe** instead of asking a human to read every cited source.
58
+
59
+ ## No framework dependency
60
+
61
+ The core has **zero required third-party dependencies** (Python stdlib only) and
62
+ **no dependency on any agent-orchestration framework**. Parallel fetching uses
63
+ `concurrent.futures.ThreadPoolExecutor`. LLM providers (Anthropic / OpenAI /
64
+ Gemini) are *optional extras*, and a dependency-free keyword judge lets the tool
65
+ run end-to-end with **no API keys at all**.
66
+
67
+ ## What it does
68
+
69
+ 1. **Extract** — regex-match citations and capture ~100 chars of surrounding
70
+ context (the claim being made), with line numbers; deduped by `(type, ref)`.
71
+ 2. **Fetch** — resolve each reference and fetch its source through an
72
+ **explicit fallback chain** (see below). For academic identifiers
73
+ (DOI / arXiv / PMID / PMC) it first queries free official **metadata APIs**
74
+ (Crossref / arXiv / NCBI) — bypassing paywalls — then falls back to a direct
75
+ HTTP fetch (browser-like User-Agent, 10 s timeout, follows redirects, strips
76
+ HTML to text), then to the **Wayback Machine** (`web.archive.org`).
77
+ 3. **Judge** — one or more pluggable judges decide a verdict
78
+ (Match / Partial / Mismatch / Uncertain / Inaccessible) + a one-line reason.
79
+ Multiple judges = independent cross-check; an optional `--tiebreak` judge
80
+ resolves disagreements.
81
+ 4. **Score** — apply the 100-point rubric and assign a tier.
82
+ 5. **Report** — emit `<basename>_report.md` (+ `<basename>_claims.jsonl` for
83
+ reuse). Any tier-F citation raises a document-level warning banner.
84
+
85
+ ## Best fit
86
+
87
+ paper-verify works best as the first pass before human review:
88
+
89
+ - **Research papers / reports** — find the citations most likely to need manual
90
+ paper reading (`F`, `C`, `Uncertain`, judge disagreement, weak author/year
91
+ match, dead landing pages).
92
+ - **Blog posts / newsletters** — catch dead links, soft-404s, claim/source
93
+ mismatch, and source drift before publishing.
94
+ - **Lecture notes / public handouts** — separate probably-safe citations from
95
+ sources that need a human spot-check.
96
+ - **Agent workflows** — let Claude Code, Codex, Cursor, or Gemini parse JSON and
97
+ loop only over the risky citations.
98
+
99
+ It is not a replacement for final expert review. It is designed to make that
100
+ review smaller, faster, and better targeted.
101
+
102
+ ## Quickstart
103
+
104
+ Run it **without installing anything** (needs [uv](https://docs.astral.sh/uv/)):
105
+
106
+ ```bash
107
+ uvx --from git+https://github.com/nolainjin/paper-verify paper-verify yourdoc.md --level L2 --out /tmp/pv
108
+ ```
109
+
110
+ Runs with **no API keys** (keyword judge, low confidence). Output resembles:
111
+
112
+ ```
113
+ paper-verify: 5 citations, level L2, judges: keyword
114
+ Overall: <score>/100 <tier> [🟢A:<n> 🟡B:<n> 🟠C:<n> 🔴F:<n>]
115
+ ⚠️ Document contains tier-F citations — see report.
116
+ Report: /tmp/pv/yourdoc_report.md
117
+ Claims: /tmp/pv/yourdoc_claims.jsonl
118
+ ```
119
+
120
+ For a real fact-check, add an LLM judge:
121
+
122
+ ```bash
123
+ export ANTHROPIC_API_KEY=sk-...
124
+ uvx --from "git+https://github.com/nolainjin/paper-verify" --with anthropic \
125
+ paper-verify paper.md --level L2 --judge anthropic:claude-sonnet-4-6
126
+ ```
127
+
128
+ ## 💬 No terminal? Use it from a web chat
129
+
130
+ - **Any web chat** (Claude / ChatGPT / Gemini with browsing): copy-paste
131
+ [`docs/webchat/webchat-prompt.md`](docs/webchat/webchat-prompt.md)
132
+ (한국어: [`webchat-prompt.ko.md`](docs/webchat/webchat-prompt.ko.md)) — the
133
+ model fetches your sources and scores them with this same 100-point rubric.
134
+ - **claude.ai (skill upload)**: upload the web-chat skill zip — extraction and
135
+ scoring run as bundled code, fetching/judging use Claude's web tools, and the
136
+ score comes from the real rubric via `--from-evidence`. Get the zip from
137
+ [Releases](https://github.com/nolainjin/paper-verify/releases) or build it:
138
+ `python tools/build_webchat_skill.py`.
139
+
140
+ ## Install
141
+
142
+ > Not on PyPI yet — these install straight from GitHub. (The PyPI release
143
+ > workflow is ready; see `docs/RELEASING.md`.)
144
+
145
+ ```bash
146
+ pipx install git+https://github.com/nolainjin/paper-verify # isolated CLI
147
+ pip install "paper-verify @ git+https://github.com/nolainjin/paper-verify" # core, stdlib only
148
+ pip install "paper-verify[anthropic] @ git+https://github.com/nolainjin/paper-verify" # + Anthropic judge
149
+ # extras: [anthropic] [openai] [gemini] [mcp] [all] [dev]
150
+
151
+ # for development:
152
+ git clone https://github.com/nolainjin/paper-verify && cd paper-verify
153
+ pip install -e ".[dev]"
154
+ ```
155
+
156
+ Requires Python ≥ 3.10. From a clone you can also run without installing:
157
+ `python -m paperverify yourdoc.md --level L2`.
158
+
159
+ ## Verification levels
160
+
161
+ | Level | Depth | Network / cost |
162
+ |---|---|---|
163
+ | **L1** | URL alive (HTTP 2xx) only — fast dead-link sweep, scored 100 for reachable / 50 for soft-404-suspect / 0 for unreachable | network only, no LLM |
164
+ | **L2** | abstract / title vs. claim match (**default**) | network + ~1 LLM call per citation |
165
+ | **L3** | full content + claim/number alignment | network + several LLM calls per citation |
166
+
167
+ `L1` runs with no LLM at all. Pick the level with `--level L1|L2|L3`.
168
+
169
+ > **⚠️ L1 caveat.** L1 scores **reachability only** (HTTP-alive → 100,
170
+ > unreachable → 0). It does **not** verify that the page content supports the
171
+ > claim. Soft-404s (pages that return HTTP 200 with error / placeholder content)
172
+ > are now **detected heuristically** — a reachable-but-suspect page scores **50**
173
+ > (not a clean 100), with a `soft_404_suspect` flag in the output. The heuristic
174
+ > is **not perfect** (it checks error markers, deep-path→homepage redirects, and
175
+ > suspiciously tiny bodies); use **L2 / L3** for real content verification.
176
+
177
+ ### Academic metadata (paywall bypass)
178
+
179
+ For academic identifiers (DOI / arXiv / PMID / PMC, or URLs that clearly carry
180
+ one), paper-verify queries free official **metadata APIs** before scraping HTML:
181
+
182
+ | Source | API | Yields |
183
+ |---|---|---|
184
+ | Crossref | `api.crossref.org/works/{doi}` | title, authors, year, abstract |
185
+ | arXiv | `export.arxiv.org/api/query?id_list={id}` | title, authors, year, abstract |
186
+ | NCBI | E-utilities `esummary` (PubMed) / idconv (PMC) | title, authors, year |
187
+
188
+ This returns structured **title / authors / year / abstract even when the
189
+ publisher landing page is a paywall stub**, and makes the author/year rubric a
190
+ real comparison instead of a fuzzy HTML match.
191
+
192
+ **Explicit, observable fallback chain** (each step runs only if the previous one
193
+ failed; the path that actually served the data is always recorded in the `source`
194
+ field, so no fallback is silent — per the No-Silent-Fallback principle):
195
+
196
+ ```
197
+ metadata API (crossref|arxiv|ncbi) → HTTP fetch (http) → Wayback (archive) → none
198
+ ```
199
+
200
+ - A metadata call uses a short timeout (~8 s) + one retry/backoff on transient
201
+ errors (timeout / connection / HTTP 429 / 5xx); HTTP 404 means "not found"
202
+ (no retry).
203
+ - A failed metadata lookup **never crashes the run** — it falls through, and
204
+ `source` then reads `"http"` (so you can see the API did *not* serve it),
205
+ `"archive"`, or `"none"`.
206
+ - If the whole chain fails, the citation is `source="none"`, `status=0`, carries
207
+ an `error`, and is scored **Inaccessible** — no invented metadata, never scored
208
+ as alive.
209
+
210
+ The `source` field lets you see at a glance whether a citation was
211
+ **metadata-verified** (`crossref`/`arxiv`/`ncbi`), **HTML-scraped** (`http`),
212
+ served from **archive**, or **unverifiable** (`none`).
213
+
214
+ ## Judges & providers
215
+
216
+ Pass `--judge SPEC` (repeatable for cross-check). Spec forms:
217
+
218
+ | Spec | Judge | Requirement |
219
+ |---|---|---|
220
+ | `keyword` | token-overlap heuristic (**default**) | none — always available |
221
+ | `anthropic` / `anthropic:claude-sonnet-4-6` | Anthropic SDK | extras `[anthropic]` (see Install), `ANTHROPIC_API_KEY` |
222
+ | `openai` / `openai:gpt-4o-mini` | OpenAI SDK | extras `[openai]` (see Install), `OPENAI_API_KEY` |
223
+ | `gemini` / `gemini:gemini-2.0-flash` | google-genai SDK | extras `[gemini]` (see Install), `GEMINI_API_KEY` |
224
+ | `cli:gemini` / `cli:claude` / `cli:codex` | shells out to a locally-installed CLI | that CLI on `$PATH` |
225
+
226
+ The `keyword` judge is **clearly low-confidence** — it only measures lexical
227
+ overlap, not meaning. Use it to smoke-test the pipeline; use an LLM judge for
228
+ real verification.
229
+
230
+ ## Harness profiles (`--profile`)
231
+
232
+ A **harness profile** bundles the recommended judge order and frontend setup for
233
+ a given agent (Claude Code, Cursor, Codex, Gemini). Pass `--profile <key>` and,
234
+ **when you do not pass any explicit `--judge`**, paper-verify defaults the judges
235
+ to that profile's recommended list, trying them **in the listed order** —
236
+ it skips any judge whose SDK/CLI is not installed and uses the first available
237
+ one(s), falling back to `keyword` (with a one-line stderr note) if none are
238
+ available. An explicit `--judge` always wins; the profile is still recorded.
239
+
240
+ ```bash
241
+ paper-verify paper.md --profile claude-code --json
242
+ ```
243
+
244
+ Keys (aliases like `claude` → `claude-code` are accepted): `claude-code`,
245
+ `cursor`, `codex`, `gemini`. The active profile is recorded in the JSON output
246
+ under the top-level `"profile"` field (`null` when unset). List every profile as
247
+ JSON (no file argument needed):
248
+
249
+ ```bash
250
+ paper-verify --list-profiles | python -m json.tool
251
+ ```
252
+
253
+ See [`docs/harness-strategy.md`](docs/harness-strategy.md) for the full matrix.
254
+
255
+ ## Agent packaging phases
256
+
257
+ Treat this repository as a staged agent integration:
258
+
259
+ | Phase | Artifact | Path | Meaning |
260
+ |---|---|---|---|
261
+ | Phase 1 | Core package | `paperverify/`, CLI, JSON, MCP server | Provider-neutral citation verification engine. |
262
+ | Phase 2 | Codex skill | `integrations/skills/paper-verify/` | Local workflow instructions that teach an agent how to run paper-verify and triage risky sources. |
263
+ | Phase 3 | Codex plugin | `integrations/plugins/paper-verify/` | Installable plugin bundle with the skill plus MCP server registration metadata. |
264
+
265
+ Use Phase 2 when you want the workflow to be available inside an existing Codex
266
+ setup without packaging a full plugin. Use Phase 3 when you want a distributable
267
+ agent integration: plugin metadata, skill discovery, and MCP server wiring live
268
+ together.
269
+
270
+ ## How cross-check works
271
+
272
+ Supply two or more `--judge` flags. Each judge evaluates the same
273
+ `(claim_context, source_text)` independently. The verdict drives the claim-match
274
+ score; the **cross-check** rubric item awards 10 points only when judges **agree**
275
+ — so disagreement costs points and surfaces citations worth a human spot-check.
276
+
277
+ ```bash
278
+ paper-verify paper.md \
279
+ --judge anthropic:claude-sonnet-4-6 \
280
+ --judge gemini:gemini-2.0-flash
281
+ ```
282
+
283
+ **Tie-break (3rd judge).** When two or more judges disagree, pass an optional
284
+ `--tiebreak <spec>` judge. It runs **only on the split citations** and restores
285
+ the original 3-stage consensus spirit:
286
+
287
+ - on a genuine consensus (incl. **after** the tie-break) the citation keeps its
288
+ 10 cross-check points, using the **majority** verdict as consensus (the
289
+ tie-break judge arbitrates a genuine tie between distinct verdicts);
290
+ - if judges remain split with **no** `--tiebreak`, the effective verdict becomes
291
+ **Uncertain** (claim-match = 15) and cross-check = 0, flagging it for review.
292
+
293
+ ```bash
294
+ paper-verify paper.md \
295
+ --judge anthropic:claude-sonnet-4-6 \
296
+ --judge openai:gpt-4o-mini \
297
+ --tiebreak gemini:gemini-2.0-flash
298
+ ```
299
+
300
+ A judge may also answer **Uncertain** on its own when the source is insufficient
301
+ to decide — better than guessing. Uncertain citations are grouped in a
302
+ **"Needs re-check"** section of the Markdown report.
303
+
304
+ ## Using paper-verify from your own agent
305
+
306
+ paper-verify is **agent-callable** two ways. Both are provider-agnostic — pick
307
+ any judge (`keyword`, `anthropic`, `openai`, `gemini`, `cli:*`); the structured
308
+ output shape is identical regardless of judge.
309
+
310
+ Provider-specific harness profiles are documented in
311
+ [`docs/harness-strategy.md`](docs/harness-strategy.md), with frontend notes for
312
+ Claude Code, Cursor, Codex, and Gemini under [`docs/providers/`](docs/providers/).
313
+ The core pipeline stays shared; frontend differences live in profiles and docs
314
+ instead of long-lived provider branches.
315
+
316
+ ### (a) `--json` — capture structured output from stdout
317
+
318
+ Add `--json` and the CLI writes the **full result as JSON to stdout** while the
319
+ human summary goes to stderr, so an agent can capture and parse stdout directly:
320
+
321
+ ```bash
322
+ result=$(paper-verify paper.md --level L2 --judge keyword --json)
323
+ echo "$result" | python -m json.tool
324
+ # overall_score / overall_tier / has_failure live at the top level:
325
+ echo "$result" | python -c "import sys,json; d=json.load(sys.stdin); print(d['overall_score'], d['overall_tier'], d['has_failure'])"
326
+ ```
327
+
328
+ The JSON top-level keys are: `schema_version`, `source_file`, `level`,
329
+ `profile` (active harness profile key, or `null`), `judges`, `overall_score`,
330
+ `overall_tier`, `has_failure`, `tier_distribution` (counts per tier), and
331
+ `citations` (one object per citation: `citation`, `fetched`, `judgements`,
332
+ `consensus`, `effective_verdict`, `score`, `breakdown`, `tier`).
333
+
334
+ Each citation's `fetched` object carries (schema_version `"4"`): `status`,
335
+ `title`, `abstract`, `url_final`, `via_archive`, `error`, plus `authors` (list,
336
+ from metadata APIs), `year` (int or `null`), `source` (`crossref` | `arxiv` |
337
+ `ncbi` | `http` | `archive` | `none` — which path produced the data),
338
+ `landing_status` (HTTP status of the landing page), and `soft_404_suspect` (bool — a 2xx page that looks like an error/placeholder).
339
+
340
+ For triage automation, prioritize citations where any of these are true:
341
+
342
+ - `tier` is `F` or `C`.
343
+ - `consensus` / `effective_verdict` is `Uncertain`, `Mismatch`, or
344
+ `Inaccessible`.
345
+ - judges disagree in `judgements`.
346
+ - `fetched.soft_404_suspect` is `true`.
347
+ - `fetched.landing_status` is `403`, `404`, or another non-2xx status while
348
+ metadata still resolved the source.
349
+ - `fetched.source` is `archive` or `none`.
350
+
351
+ `--json` is **additive**: pass `--out DIR` to also write the `.md` / `.jsonl`
352
+ files. **Exit codes are stable for agents**: `0` = ran successfully *regardless
353
+ of grades* (a tier-F document still exits 0 — inspect `has_failure`); a nonzero
354
+ code (`2`) means a real error (file not found, bad judge spec).
355
+
356
+ ### (b) MCP server — register as a tool
357
+
358
+ `paper-verify[mcp]` ships an MCP server (stdio transport) exposing these tools:
359
+
360
+ | Tool | Purpose |
361
+ |---|---|
362
+ | `verify_file(path, level="L2", judges=["keyword"], workers=4, tiebreak=None)` | full pipeline on a file → structured dict |
363
+ | `verify_text(text, level="L2", judges=["keyword"], tiebreak=None)` | same, on raw document text |
364
+ | `extract_citations(text)` | extraction only — no network, no LLM |
365
+ | `list_profiles()` | list all harness profiles → list of dicts (self-discovery) |
366
+ | `get_profile(key)` | look up one profile by key/alias → dict (`{"error": ...}` if unknown) |
367
+
368
+ ```bash
369
+ pip install "paper-verify[mcp] @ git+https://github.com/nolainjin/paper-verify"
370
+ ```
371
+
372
+ Register it with an MCP client. For Claude Code:
373
+
374
+ ```bash
375
+ claude mcp add paper-verify -- paper-verify-mcp
376
+ ```
377
+
378
+ Or in an MCP client config (`mcpServers`):
379
+
380
+ ```json
381
+ {
382
+ "mcpServers": {
383
+ "paper-verify": {
384
+ "command": "paper-verify-mcp"
385
+ }
386
+ }
387
+ }
388
+ ```
389
+
390
+ The `mcp` package is an **optional extra** — the core tool and `--json` work
391
+ without `mcp` installed; importing the server without it fails with a clear
392
+ install hint for the `[mcp]` extra instead of an unexplained crash.
393
+
394
+ ### (c) `--from-evidence` — bring your own fetch/judge
395
+
396
+ If your agent (or a web chat) already fetched the sources and judged the
397
+ claims, hand paper-verify the evidence and let it apply the standard rubric —
398
+ identical scoring to a native run:
399
+
400
+ ```bash
401
+ paper-verify yourdoc.md --extract-only > citations.json # deterministic extraction
402
+ # …your agent fetches each citation and judges it, producing evidence.json…
403
+ paper-verify --from-evidence evidence.json --json --out out/
404
+ ```
405
+
406
+ The evidence shape is documented by example in
407
+ [`examples/evidence-sample.json`](examples/evidence-sample.json): per citation,
408
+ the `citation` object from `--extract-only` verbatim, a `fetched` object
409
+ (`status`, `title`, `abstract`, `authors`, `year`, `source`,
410
+ `soft_404_suspect`, …), and one or more `judgements`
411
+ (`{"judge", "verdict", "reason"}` — verdicts: Match | Partial | Mismatch |
412
+ Uncertain | Inaccessible). Malformed evidence exits 2 with the offending
413
+ citation index named. This is the engine behind the web-chat skill.
414
+
415
+ ## Scoring rubric (100 points)
416
+
417
+ | Item | Points | Criterion |
418
+ |---|---|---|
419
+ | URL accessible | 20 | source fetched with HTTP 2xx |
420
+ | Author / year match | 20 / 10 / 0 | author **and** year align = 20; only one = 10; neither = 0. When no metadata is available to compare, the slot is **neutral** (10, "metadata unavailable") rather than a misleading 0 |
421
+ | Claim match | 50 | Match = 50 · Partial = 25 · **Uncertain = 15** · Mismatch = 0 · Inaccessible = 10 |
422
+ | Cross-check agreement | 10 | judges agree (incl. after a `--tiebreak`); a split with no tie-break scores 0 and marks the citation **Uncertain** |
423
+
424
+ At **L1**, a reachable page scores 100, a **soft-404-suspect** reachable page
425
+ scores **50**, and an unreachable page scores 0.
426
+
427
+ **Document score** = average of per-citation scores. If **any** citation is
428
+ tier F, the whole document is flagged ⚠️.
429
+
430
+ ## Verdict & tier taxonomy
431
+
432
+ Verdicts (per judge):
433
+
434
+ | Verdict | Meaning |
435
+ |---|---|
436
+ | ✅ Match | claim is explicitly supported by the source |
437
+ | ⚠️ Partial | partially supported; numbers / year / nuance differ |
438
+ | ❌ Mismatch | absent from, or contradicted by, the source |
439
+ | ❓ Uncertain | source insufficient to decide (or judges split, no tie-break) — flagged for human review |
440
+ | ⚫ Inaccessible | paywall / 404 / timeout — could not verify |
441
+
442
+ Tiers (per citation, and document average):
443
+
444
+ | Tier | Score | Meaning |
445
+ |---|---|---|
446
+ | 🟢 A | 90–100 | citable in a thesis / formal report |
447
+ | 🟡 B | 70–89 | fine for a lecture / blog, minor fixes |
448
+ | 🟠 C | 50–69 | must be re-checked |
449
+ | 🔴 F | 0–49 | do not cite — replace the source |
450
+
451
+ ## Limitations
452
+
453
+ - **AI judgment reliability** — LLM judges can mis-read an abstract's meaning.
454
+ Cross-check with a second judge (+ `--tiebreak`) and spot-check important
455
+ citations by hand. A judge may answer **Uncertain** rather than guess.
456
+ - **Paywalls** — for DOI / arXiv / PMID / PMC the free metadata APIs return the
457
+ title / authors / year / abstract even behind a paywall. Full-text claims that
458
+ need the body (not just the abstract) may still land in tier C — expected.
459
+ - **Soft-404 detection is heuristic** — it catches common error markers,
460
+ deep-path→homepage redirects, and tiny bodies, but can miss disguised error pages.
461
+ - **Archive misses** — when a URL is dead *and* absent from the Wayback Machine,
462
+ the fallback fails and the citation is marked Inaccessible.
463
+ - **JavaScript-heavy pages** — SPA pages may return little text to the stripper.
464
+ - The **keyword** judge measures lexical overlap only and is not a substitute
465
+ for semantic verification.
466
+
467
+ ## 한국어 요약
468
+
469
+ 문서 안의 인용 출처(URL·DOI·PMC·PMID·arXiv)를 추출해 실제 원문과 대조하고 100점
470
+ 루브릭으로 채점하는 도구입니다. **전체 한국어 가이드: [README.ko.md](README.ko.md)**
471
+ (터미널 없이 웹챗에서 쓰는 방법 포함).
472
+
473
+ ## License
474
+
475
+ MIT © 2026 Duchan Jin (진두찬). See [LICENSE](LICENSE).