inscriber 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- inscriber/__init__.py +6 -0
- inscriber/__main__.py +8 -0
- inscriber/bibtex/__init__.py +1 -0
- inscriber/bibtex/arxiv.py +127 -0
- inscriber/bibtex/chain.py +104 -0
- inscriber/bibtex/local.py +39 -0
- inscriber/bibtex/probe.py +125 -0
- inscriber/bibtex/semantic_scholar.py +224 -0
- inscriber/bundle.py +214 -0
- inscriber/cache.py +390 -0
- inscriber/cli.py +459 -0
- inscriber/config.py +324 -0
- inscriber/errors.py +16 -0
- inscriber/input/__init__.py +1 -0
- inscriber/input/domain_handlers.py +204 -0
- inscriber/input/resolver.py +157 -0
- inscriber/llama/__init__.py +1 -0
- inscriber/llama/client.py +135 -0
- inscriber/llama/server.py +412 -0
- inscriber/logging.py +52 -0
- inscriber/models.py +279 -0
- inscriber/ocr/__init__.py +1 -0
- inscriber/ocr/base.py +263 -0
- inscriber/ocr/deepseek.py +212 -0
- inscriber/ocr/glm.py +95 -0
- inscriber/ocr/registry.py +33 -0
- inscriber/output.py +133 -0
- inscriber/pdf/__init__.py +1 -0
- inscriber/pdf/crop.py +108 -0
- inscriber/pdf/figures.py +83 -0
- inscriber/pdf/rasterize.py +120 -0
- inscriber/pipeline.py +1226 -0
- inscriber/postprocess/__init__.py +1 -0
- inscriber/postprocess/inject.py +90 -0
- inscriber/postprocess/join.py +112 -0
- inscriber/postprocess/notice.py +54 -0
- inscriber/postprocess/prompt.py +107 -0
- inscriber/postprocess/splitter.py +200 -0
- inscriber/postprocess/stitch.py +201 -0
- inscriber/postprocess/tables.py +278 -0
- inscriber/serialize.py +62 -0
- inscriber/setup.py +510 -0
- inscriber/vlm/__init__.py +1 -0
- inscriber/vlm/base.py +84 -0
- inscriber/vlm/gemma.py +134 -0
- inscriber/vlm/registry.py +21 -0
- inscriber-0.1.0.dist-info/METADATA +313 -0
- inscriber-0.1.0.dist-info/RECORD +51 -0
- inscriber-0.1.0.dist-info/WHEEL +4 -0
- inscriber-0.1.0.dist-info/entry_points.txt +2 -0
- inscriber-0.1.0.dist-info/licenses/LICENSE +21 -0
inscriber/__init__.py
ADDED
inscriber/__main__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Optional, online BibTeX generation via Semantic Scholar (DESIGN §12)."""
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""arXiv sources for BibTeX auto mode (DESIGN §12).
|
|
2
|
+
|
|
3
|
+
``arxiv_id_from_url`` extracts the arXiv ID from a source URL (provenance).
|
|
4
|
+
``arxiv_bibtex`` is the export-API **availability fallback**: Semantic Scholar
|
|
5
|
+
(which knows the *published* version of a preprint) is consulted first; the
|
|
6
|
+
export API is authoritative for identification but can never know about later
|
|
7
|
+
venue publication. ``format_arxiv_misc`` is the standard arXiv ``@misc`` +
|
|
8
|
+
``eprint`` shape, shared with the chain's S2-preprint path.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
from urllib.parse import urlparse
|
|
15
|
+
from xml.etree import ElementTree
|
|
16
|
+
|
|
17
|
+
import httpx
|
|
18
|
+
from defusedxml import DefusedXmlException
|
|
19
|
+
from defusedxml.ElementTree import fromstring as defused_fromstring
|
|
20
|
+
|
|
21
|
+
from inscriber.bibtex.semantic_scholar import (
|
|
22
|
+
USER_AGENT,
|
|
23
|
+
generate_citation_key,
|
|
24
|
+
sanitize_bibtex_text,
|
|
25
|
+
)
|
|
26
|
+
from inscriber.input.domain_handlers import host_matches
|
|
27
|
+
from inscriber.logging import get_logger
|
|
28
|
+
|
|
29
|
+
logger = get_logger()
|
|
30
|
+
|
|
31
|
+
# arXiv export API query endpoint; ``arxiv_bibtex`` sends the ID as ``id_list``.
API_URL = "https://export.arxiv.org/api/query"
# Namespace prefix map handed to ElementTree ``find``/``findall``/``findtext``
# so the Atom feed can be addressed as ``atom:...`` / ``arxiv:...``.
_ATOM = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
|
|
33
|
+
|
|
34
|
+
# The domain handler's FILENAME-RULE pattern shape (input/domain_handlers.py
|
|
35
|
+
# ``_arxiv()``): it preserves ``v2``-style version suffixes and old-style
|
|
36
|
+
# ``cs.AI/0301001`` IDs. (The handler's ``url_patterns`` detection regex is NOT
|
|
37
|
+
# reusable here — its ``\d+\.\d+`` stops before the version suffix.)
|
|
38
|
+
_ARXIV_ID_RE = re.compile(r"/(?:abs|pdf|html)/([\w.-]+/?\d+|\d+\.\d+)")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def arxiv_id_from_url(url: str | None) -> str | None:
    """Return the arXiv identifier embedded in an arxiv.org URL, or ``None``.

    The host is checked with ``host_matches`` against ``arxiv.org`` before the
    path is inspected, so a URL on any other host yields ``None`` even when its
    path looks like an arXiv path. The identifier is whatever ``_ARXIV_ID_RE``
    captures from the URL path.
    """
    if url:
        parts = urlparse(url)
        host = parts.hostname or ""
        if host_matches(host, "arxiv.org"):
            found = _ARXIV_ID_RE.search(parts.path)
            if found is not None:
                return found.group(1)
    return None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def format_arxiv_misc(
    title: str,
    authors: list[str],
    year: str | None,
    arxiv_id: str,
    *,
    primary_class: str | None = None,
) -> str:
    """Render an arXiv preprint as a ``@misc`` entry with ``eprint`` metadata.

    ``title`` and each author are passed through ``sanitize_bibtex_text``;
    ``author``, ``year`` and ``primaryClass`` are emitted only when a value is
    supplied. The citation key comes from ``generate_citation_key``.
    """

    def bib_field(name: str, value: object) -> str:
        # One " name={value}" line, without the trailing separator.
        return f" {name}={{{value}}}"

    citation_key = generate_citation_key(title, authors, year)

    lines = [bib_field("title", sanitize_bibtex_text(title))]
    if authors:
        joined = " and ".join(map(sanitize_bibtex_text, authors))
        lines.append(bib_field("author", joined))
    if year:
        lines.append(bib_field("year", year))
    lines.append(bib_field("eprint", arxiv_id))
    lines.append(bib_field("archivePrefix", "arXiv"))
    if primary_class:
        lines.append(bib_field("primaryClass", primary_class))
    lines.append(bib_field("url", f"https://arxiv.org/abs/{arxiv_id}"))

    body = ",\n".join(lines)
    return f"@misc{{{citation_key},\n{body}\n}}"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def arxiv_bibtex(arxiv_id: str, *, timeout: float = 30.0) -> str | None:
    """Look ``arxiv_id`` up on the arXiv export API and format it as ``@misc``.

    The Atom response is parsed with ``defusedxml`` (``forbid_dtd=True``), so a
    payload carrying a DTD is rejected. Returns ``None`` — after logging a
    warning where applicable — on a transport error, a non-200 status,
    unparseable XML, a missing ``<entry>``, an error entry, or an empty title.
    """
    try:
        response = httpx.get(
            API_URL,
            params={"id_list": arxiv_id},
            timeout=timeout,
            headers={"User-Agent": USER_AGENT},
        )
    except httpx.HTTPError as exc:
        logger.warning("arXiv API request failed: %s", exc)
        return None

    status = response.status_code
    if status != 200:
        logger.warning("arXiv API returned HTTP %d", status)
        return None

    try:
        # Atom needs no DTD; forbid_dtd rejects any payload that carries one.
        feed = defused_fromstring(response.text, forbid_dtd=True)
    except (ElementTree.ParseError, DefusedXmlException) as exc:
        logger.warning("arXiv API returned unparseable XML: %s", exc)
        return None

    record = feed.find("atom:entry", _ATOM)
    if record is None:
        return None

    def child_text(tag: str) -> str:
        # Text of a direct child of the entry, "" when the child is absent.
        return record.findtext(tag, default="", namespaces=_ATOM)

    # A bad ID comes back as an <entry> whose id points at api/errors.
    if "api/errors" in child_text("atom:id"):
        logger.warning("arXiv API has no record for id %s", arxiv_id)
        return None

    title = re.sub(r"\s+", " ", child_text("atom:title")).strip()
    if not title:
        return None

    stripped_names = (
        (node.findtext("atom:name", default="", namespaces=_ATOM) or "").strip()
        for node in record.findall("atom:author", _ATOM)
    )
    authors = [name for name in stripped_names if name]

    year_prefix = child_text("atom:published")[:4]
    year = year_prefix if year_prefix.isdigit() else None

    category = record.find("arxiv:primary_category", _ATOM)
    primary_class = None if category is None else category.get("term")

    return format_arxiv_misc(title, authors, year, arxiv_id, primary_class=primary_class)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""BibTeX auto-mode orchestration: citability → ordered source chain (DESIGN §12).
|
|
2
|
+
|
|
3
|
+
Citability: a URL matching **any of the seven recognized paper repositories**
|
|
4
|
+
settles it — the probe never vetoes provenance (a disagreement is logged,
|
|
5
|
+
nothing more). The probe governs only provenance-less documents, and it is
|
|
6
|
+
abstain-biased: with a default-on feature, a false positive (an unwanted
|
|
7
|
+
``.bib``) is worse than a false negative.
|
|
8
|
+
|
|
9
|
+
Sources, in order (preprint provenance ≠ preprint citation — many preprints are
|
|
10
|
+
later published at a venue): Semantic Scholar **by arXiv ID** (exact match;
|
|
11
|
+
prefers the published version when one exists) → arXiv export API (``@misc``
|
|
12
|
+
availability fallback) → Semantic Scholar title search → local best-effort.
|
|
13
|
+
Any failure falls through; this module never raises (DESIGN §16: BibTeX never
|
|
14
|
+
fails the run).
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from inscriber.bibtex.arxiv import arxiv_bibtex, arxiv_id_from_url, format_arxiv_misc
|
|
20
|
+
from inscriber.bibtex.local import best_effort_bibtex
|
|
21
|
+
from inscriber.bibtex.probe import ProbeResult
|
|
22
|
+
from inscriber.bibtex.semantic_scholar import (
|
|
23
|
+
_format_entry,
|
|
24
|
+
_mismatch_warning,
|
|
25
|
+
lookup_arxiv,
|
|
26
|
+
search_semantic_scholar,
|
|
27
|
+
)
|
|
28
|
+
from inscriber.input.domain_handlers import find_handler
|
|
29
|
+
from inscriber.logging import get_logger
|
|
30
|
+
|
|
31
|
+
logger = get_logger()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def citable_provenance(url: str | None) -> bool:
    """Report whether ``url`` is claimed by one of the domain handlers.

    A missing or empty URL is never citable provenance; otherwise the answer
    is whether ``find_handler`` returns a handler for it.
    """
    return bool(url) and find_handler(url) is not None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _is_preprint_venue(venue) -> bool:
|
|
43
|
+
return not venue or str(venue).strip().lower().startswith("arxiv")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _s2_arxiv_entry(paper: dict, arxiv_id: str) -> str:
    """Turn a Semantic Scholar by-ID record into BibTeX.

    A record whose venue is absent or arXiv-like becomes the preprint
    ``@misc`` + ``eprint`` entry; any other venue goes through
    ``_format_entry`` (the ``@article`` shape). The record's own title is used
    as the comparison title, and the match flag is discarded — the lookup was
    by exact ID.
    """
    title = paper.get("title", "") or ""

    if _is_preprint_venue(paper.get("venue")):
        names = []
        for author in paper.get("authors", []):
            if author.get("name"):
                names.append(author.get("name", ""))
        year = str(paper["year"]) if paper.get("year") else None
        return format_arxiv_misc(title, names, year, arxiv_id)

    published_entry, _title_matches = _format_entry(paper, title)
    return published_entry
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def generate_bibtex_auto(
    probe: ProbeResult | None,
    *,
    original_url: str | None,
    online_allowed: bool,
    fallback_title: str,
    timeout: float = 30.0,
) -> tuple[str | None, str]:
    """Run the auto-mode source chain and return ``(bibtex | None, label)``.

    On success ``label`` is one of ``s2-arxiv-id``, ``arxiv-export``,
    ``s2-title`` or ``best-effort``. When nothing is produced it is the reason:
    ``not-citable`` (the probe said no), ``unknown`` (no probe result), or
    ``no usable metadata``.

    Citability gate: a recognized repository URL always passes (a dissenting
    probe is only logged); without such a URL the probe must say citable.
    Online sources are tried only when ``online_allowed`` is true.
    """
    from_repository = citable_provenance(original_url)
    if from_repository:
        if probe is not None and not probe.citable:
            logger.info(
                "BibTeX (auto): probe judged the document not citable, but the source "
                "URL is a recognized paper repository; provenance wins"
            )
    elif probe is None:
        return None, "unknown"
    elif not probe.citable:
        return None, "not-citable"

    if online_allowed:
        arxiv_id = arxiv_id_from_url(original_url)
        if arxiv_id:
            record = lookup_arxiv(arxiv_id, timeout=timeout)
            if record:
                return _s2_arxiv_entry(record, arxiv_id), "s2-arxiv-id"
            exported = arxiv_bibtex(arxiv_id, timeout=timeout)
            if exported:
                return exported, "arxiv-export"

        # The same string serves as both the search query and the reference
        # for title validation, so a mangled heading cannot trigger a spurious
        # mismatch warning.
        query = fallback_title
        if probe and probe.title:
            query = probe.title
        hits = search_semantic_scholar(query, timeout=timeout)
        if hits:
            top = hits[0]
            candidate, title_ok = _format_entry(top, query)
            if not title_ok:
                candidate = _mismatch_warning(query, top.get("title", "")) + candidate
            return candidate, "s2-title"

    local_entry = best_effort_bibtex(probe)
    if not local_entry:
        return None, "no usable metadata"
    return local_entry, "best-effort"
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Local best-effort BibTeX entry from probe-extracted front matter (DESIGN §12).
|
|
2
|
+
|
|
3
|
+
The last link in the auto-mode source chain: a clearly-marked ``@misc`` entry
|
|
4
|
+
assembled purely from what the probe transcribed off the document's first page.
|
|
5
|
+
Fully offline — no citation database is consulted. Transcription, not recall
|
|
6
|
+
(decision 5): absent fields are absent, never ``Unknown Journal``-style filler;
|
|
7
|
+
the extracted venue goes in ``note``, not ``journal`` (decision 4 — no
|
|
8
|
+
venue-type guessing).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from inscriber.bibtex.probe import ProbeResult
|
|
14
|
+
from inscriber.bibtex.semantic_scholar import generate_citation_key, sanitize_bibtex_text
|
|
15
|
+
|
|
16
|
+
# Canonical header (pinned by tests/fixtures/bibtex_best_effort.txt).
|
|
17
|
+
BEST_EFFORT_HEADER = (
|
|
18
|
+
"% NOTE: Best-effort entry generated from the document's own front matter\n"
|
|
19
|
+
"% by inscriber (no citation database was consulted). Verify before use.\n"
|
|
20
|
+
"%\n"
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def best_effort_bibtex(probe: ProbeResult | None) -> str | None:
    """Assemble the marked best-effort ``@misc`` entry from probe metadata.

    Returns ``None`` when there is no probe result or it has no title (an
    entry without a title is noise, not a citation). Otherwise returns
    ``BEST_EFFORT_HEADER`` followed by the entry. ``author``, ``year`` and
    ``note`` (the extracted venue) are included only when present.

    Every probe-supplied value, including ``year``, is passed through
    ``sanitize_bibtex_text``: the year is free-form model output, so an
    unescaped brace or ``%`` in it would otherwise corrupt the entry.
    """
    if probe is None or not probe.title:
        return None

    key = generate_citation_key(probe.title, probe.authors, probe.year)
    fields = [f" title={{{sanitize_bibtex_text(probe.title)}}}"]
    if probe.authors:
        joined_authors = " and ".join(sanitize_bibtex_text(a) for a in probe.authors)
        fields.append(" author={" + joined_authors + "}")
    if probe.year:
        fields.append(f" year={{{sanitize_bibtex_text(probe.year)}}}")
    if probe.venue:
        # The venue goes in ``note``: no guessing whether it is a journal.
        fields.append(f" note={{{sanitize_bibtex_text(probe.venue)}}}")
    return BEST_EFFORT_HEADER + f"@misc{{{key},\n" + ",\n".join(fields) + "\n}"
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""BibTeX citability + metadata probe (DESIGN §12, auto mode).
|
|
2
|
+
|
|
3
|
+
One **text-only** VLM call per document: is this a citable scholarly work, and
|
|
4
|
+
what front-matter metadata is visible on its first page? The result drives the
|
|
5
|
+
auto-mode source chain (``chain.py``) and the local best-effort entry
|
|
6
|
+
(``local.py``).
|
|
7
|
+
|
|
8
|
+
The prompt is pinned, model-facing behavior (the table-pass discipline,
|
|
9
|
+
DESIGN §9.2): assembled exactly once per document via the backend
|
|
10
|
+
(``build_bibtex_probe_prompt``), used verbatim as cache-key material AND as the
|
|
11
|
+
request. Changes require re-validation on real hardware recorded in
|
|
12
|
+
``dev/notes/2026-06-10-bibtex-probe-findings.md``. The phrase "bibliographic metadata" is
|
|
13
|
+
the pinned mock-dispatch discriminator (AGENTS.md) and must survive any tuning.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import re
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
|
|
22
|
+
# Truncation cap on the page-1 text embedded in the prompt. Front matter
|
|
23
|
+
# (title/authors/venue) fits comfortably; deliberately its own constant, NOT
|
|
24
|
+
# the [figure].context_chars knob (that one is a figure-context setting).
|
|
25
|
+
PROBE_PAGE_CHARS = 3000
|
|
26
|
+
|
|
27
|
+
_CODE_FENCE_RE = re.compile(r"^```[A-Za-z]*\s*\n(?P<body>.*?)\n?```\s*$", re.DOTALL)
|
|
28
|
+
|
|
29
|
+
# The pinned probe prompt. Citability is abstain-biased (decision 1: with a
|
|
30
|
+
# default-on feature a false positive is worse than a false negative), and
|
|
31
|
+
# extraction is transcription-not-recall (decision 5: only fields visible in
|
|
32
|
+
# the supplied text; absent fields are omitted, never filled in).
|
|
33
|
+
PROBE_PROMPT_TEMPLATE = """You are extracting bibliographic metadata from the first page of a document.
|
|
34
|
+
|
|
35
|
+
First decide whether the document is CITABLE: a self-contained scholarly work \
|
|
36
|
+
— a research paper, preprint, thesis, or technical report — whose title and \
|
|
37
|
+
authors are identifiable in the text below. Slides, lecture notes, invoices, \
|
|
38
|
+
forms, manuals, web pages, and other non-scholarly documents are NOT citable. \
|
|
39
|
+
When unsure, answer "citable": false.
|
|
40
|
+
|
|
41
|
+
Then extract the metadata fields that are VISIBLE in the text. Copy them \
|
|
42
|
+
exactly as written; do not guess and do not recall from memory. Omit any field \
|
|
43
|
+
that is not visible in the text.
|
|
44
|
+
|
|
45
|
+
Answer with a single JSON object and nothing else, in this shape:
|
|
46
|
+
{{"citable": true, "title": "...", "authors": ["...", "..."], "year": "...", "venue": "..."}}
|
|
47
|
+
|
|
48
|
+
- "citable": required boolean.
|
|
49
|
+
- "title": the document's full title, when visible.
|
|
50
|
+
- "authors": the list of author names, when visible.
|
|
51
|
+
- "year": the publication year, when visible.
|
|
52
|
+
- "venue": the journal, conference, or repository name, when visible.
|
|
53
|
+
|
|
54
|
+
First page text:
|
|
55
|
+
<page_text>
|
|
56
|
+
{page_text}
|
|
57
|
+
</page_text>"""
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass
class ProbeResult:
    """The parsed probe answer (front-matter metadata + citability verdict).

    Built by ``parse_probe_response``; consumed by the auto-mode chain and the
    local best-effort entry builder. Only ``citable`` is required — every
    other field defaults to "absent".
    """

    # The probe's verdict on whether the document is a citable scholarly work.
    citable: bool
    # Document title, or None when the model reported none.
    title: str | None = None
    # Author names in the order the model listed them; empty when none.
    authors: list[str] = field(default_factory=list)
    # Publication year kept as a string; None when absent.
    year: str | None = None
    # Journal / conference / repository name; None when absent.
    venue: str | None = None
    # The model's JSON text after any wrapping code fence was removed.
    raw: str = ""  # kept for debugging + caching
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def format_probe_prompt(page_text: str) -> str:
    """Build the complete probe prompt for one document's first-page text.

    The text is stripped, and when it exceeds ``PROBE_PAGE_CHARS`` it is cut
    so that, with a trailing ``...``, it is exactly that long. The returned
    string doubles as the probe's cache-key material.
    """
    body = page_text.strip()
    if len(body) <= PROBE_PAGE_CHARS:
        excerpt = body
    else:
        excerpt = f"{body[:PROBE_PAGE_CHARS - 3]}..."
    return PROBE_PROMPT_TEMPLATE.format(page_text=excerpt)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def parse_probe_response(raw: str | None) -> ProbeResult | None:
    """Parse + type-check a probe response; ``None`` means "treat as unknown".

    Tolerates a wrapping code fence; otherwise strict: the payload must be a
    single JSON object with a boolean ``citable`` and correctly typed optional
    fields (``title``/``venue``: string, ``authors``: list of strings,
    ``year``: string or bare integer). Anything malformed returns ``None``,
    and the caller must NOT cache it.

    String fields are stripped; a field that is empty after stripping is
    treated as absent, and blank author names are dropped.
    """
    if not raw:
        return None
    text = raw.strip()
    fence = _CODE_FENCE_RE.match(text)
    if fence:
        text = fence.group("body").strip()
    try:
        data = json.loads(text)
    except ValueError:
        return None
    if not isinstance(data, dict) or not isinstance(data.get("citable"), bool):
        return None

    title = data.get("title")
    if title is not None and not isinstance(title, str):
        return None

    authors = data.get("authors", [])
    if authors is None:
        authors = []
    if not isinstance(authors, list) or not all(isinstance(a, str) for a in authors):
        return None

    year = data.get("year")
    # ``bool`` is a subclass of ``int``: without this guard a JSON ``true``
    # would pass the integer check below and become the year "True".
    if isinstance(year, bool):
        return None
    if isinstance(year, int):  # tolerate a bare-number year
        year = str(year)
    if year is not None and not isinstance(year, str):
        return None

    venue = data.get("venue")
    if venue is not None and not isinstance(venue, str):
        return None

    return ProbeResult(
        citable=data["citable"],
        title=(title.strip() or None) if title else None,
        authors=[a.strip() for a in authors if a.strip()],
        year=(year.strip() or None) if year else None,
        venue=(venue.strip() or None) if venue else None,
        raw=text,
    )
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""BibTeX generation via Semantic Scholar (DESIGN §12) — optional & online.
|
|
2
|
+
|
|
3
|
+
Ported from paper2llm ``core/utils/bibtex-generator.ts`` + ``content-utils.ts``:
|
|
4
|
+
title→entry lookup, citation key, title validation, and the mock fallback. The
|
|
5
|
+
inscriber **standardizes on the single 4-line mismatch warning** (DESIGN §12) and
|
|
6
|
+
**adds a clean 429 / network degrade path** (the source had none).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
from datetime import datetime, timezone
|
|
13
|
+
|
|
14
|
+
import httpx
|
|
15
|
+
|
|
16
|
+
from inscriber.logging import get_logger
|
|
17
|
+
|
|
18
|
+
logger = get_logger()
|
|
19
|
+
|
|
20
|
+
# Title-search endpoint used by ``search_semantic_scholar``.
API_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
# By-arXiv-ID endpoint; ``lookup_arxiv`` fills in ``{arxiv_id}`` with the
# version-stripped identifier.
LOOKUP_API_URL = "https://api.semanticscholar.org/graph/v1/paper/arXiv:{arxiv_id}"
# The ``fields`` request parameter sent to both endpoints.
FIELDS = "title,authors,venue,year,abstract,externalIds,url"
# User-Agent header sent with every request from this module (the arXiv export
# client imports it as well).
USER_AGENT = "inscriber/0.1 (+https://github.com/lacerbi/inscriber)"

# Title words skipped when choosing the title part of a citation key.
_SKIP_WORDS = {"a", "an", "the", "on", "in", "of", "for", "and", "or"}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _current_year() -> str:
|
|
29
|
+
return str(datetime.now(timezone.utc).year)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _today() -> str:
|
|
33
|
+
return datetime.now(timezone.utc).strftime("%Y-%m-%d")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def sanitize_bibtex_text(text: str) -> str:
    """Escape BibTeX special characters and map typographic punctuation.

    * ``& % $ # _ { } ~ ^ \\`` are prefixed with a backslash. Non-space
      whitespace (tab, newline, ...) is backslash-prefixed too; a plain space
      is left alone.
    * Curly quotes become the matching TeX ligatures: an opening double quote
      becomes two backticks, a closing double quote becomes two apostrophes,
      an opening single quote becomes one backtick, and a closing single
      quote / apostrophe becomes one apostrophe.
    * Em and en dashes become ``---`` and ``--``.

    Returns ``""`` for empty or ``None`` input.
    """
    if not text:
        return ""

    def _escape(match: re.Match) -> str:
        char = match.group()
        return char if char == " " else "\\" + char

    text = re.sub(r"[&%$#_{}~^\\\s]", _escape, text)

    # Opening and closing marks must map to *different* TeX sequences. Mapping
    # both single quotes to '' would turn every apostrophe into a double quote.
    replacements = (
        ("\u201c", "``"),  # left double quotation mark
        ("\u201d", "''"),  # right double quotation mark
        ("\u2018", "`"),  # left single quotation mark
        ("\u2019", "'"),  # right single quotation mark / apostrophe
        ("\u2014", "---"),  # em dash
        ("\u2013", "--"),  # en dash
    )
    for typographic, tex in replacements:
        text = text.replace(typographic, tex)
    return text
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def generate_citation_key(title: str, authors: list[str], year: str | None) -> str:
    """``{firstAuthorLastName}{year}{firstSubstantiveTitleWord}`` (DESIGN §12).

    * Author part: the last whitespace-separated token of the first author,
      lowercased, with characters that are illegal in a BibTeX key removed.
      It is ``Unknown`` when there is no author or nothing usable remains.
    * Year part: ``year`` with key-illegal characters removed, or the current
      UTC year when it is missing or empty.
    * Title part: the first title word that, reduced to ``[a-z0-9]``, is
      longer than two characters and not in ``_SKIP_WORDS``. If there is none,
      the reduced first word is used.
    """
    # Characters BibTeX does not accept inside a citation key (whitespace,
    # quotes, '#', '%', parentheses, ',', '=', backslash, braces, '~').
    illegal_in_key = r"[\s\"#%'(),=\\{}~]"

    author_part = "Unknown"
    if authors:
        # split() (not split(" ")) so trailing or doubled whitespace cannot
        # leave an empty last token.
        name_tokens = (authors[0] or "").split()
        if name_tokens:
            surname = re.sub(illegal_in_key, "", name_tokens[-1].lower())
            if surname:
                author_part = surname

    title_part = ""
    title_words = title.split(" ")
    for word in title_words:
        clean = re.sub(r"[^a-z0-9]", "", word.lower())
        if len(clean) > 2 and clean not in _SKIP_WORDS:
            title_part = clean
            break
    if not title_part and title_words:
        title_part = re.sub(r"[^a-z0-9]", "", title_words[0].lower())

    year_part = re.sub(illegal_in_key, "", str(year) if year else "") or _current_year()
    return f"{author_part}{year_part}{title_part}"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def normalize_title(title: str) -> str:
    """Reduce a title to lowercase letters separated by single spaces.

    Steps: lowercase, drop everything except ``[a-z\\s]``, collapse each run of
    whitespace to one space, trim. Whitespace survives the first step, so a
    tab or newline between two words still separates them afterwards. An empty
    or ``None`` title gives ``""``.
    """
    if title:
        letters_and_space = re.sub(r"[^a-z\s]", "", title.lower())
        collapsed = re.sub(r"\s+", " ", letters_and_space)
        return collapsed.strip()
    return ""
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def titles_match(original: str, bibtex_title: str) -> bool:
    """Decide whether a retrieved title plausibly names the same paper.

    Both titles are normalized with ``normalize_title``. If either normalized
    title is shorter than 10 characters they must be equal. Otherwise the
    score is the number of words of ``original`` that occur in the retrieved
    title, divided by the larger of the original's word count and the
    retrieved title's distinct-word count. The titles match when the score
    exceeds 0.75.
    """
    left = normalize_title(original)
    right = normalize_title(bibtex_title)
    if min(len(left), len(right)) < 10:
        # Short titles carry too little signal for a fuzzy comparison.
        return left == right

    left_words = left.split(" ")
    right_vocab = set(right.split(" "))
    shared = len([word for word in left_words if word in right_vocab])
    denominator = max(len(left_words), len(right_vocab))
    return shared / denominator > 0.75
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def search_semantic_scholar(title: str, *, limit: int = 3, timeout: float = 30.0) -> list[dict]:
    """Search Semantic Scholar by title and return the result records.

    Never raises for a bad response. Returns ``[]`` when:

    * ``title`` is empty or blank (no request is made),
    * the request fails, is rate-limited (429) or returns any non-200 status,
    * the body is not JSON, is not a JSON object, or its ``data`` is not a
      list.

    Items of ``data`` that are not JSON objects are dropped, so callers can
    rely on every returned element supporting ``.get``.
    """
    if not title or not title.strip():
        # Nothing to search for; skip the round trip.
        logger.info("Semantic Scholar search skipped: empty title")
        return []
    try:
        resp = httpx.get(
            API_URL,
            params={"query": title, "limit": limit, "fields": FIELDS},
            timeout=timeout,
            headers={"User-Agent": USER_AGENT},
        )
    except httpx.HTTPError as e:
        logger.warning("Semantic Scholar request failed: %s; using fallback citation", e)
        return []
    if resp.status_code == 429:
        logger.warning("Semantic Scholar rate-limited (HTTP 429); using fallback citation")
        return []
    if resp.status_code != 200:
        logger.warning("Semantic Scholar returned HTTP %d; using fallback citation", resp.status_code)
        return []
    try:
        payload = resp.json()
    except ValueError:
        logger.warning("Semantic Scholar returned non-JSON; using fallback citation")
        return []
    # Valid JSON is not necessarily the expected object: a bare list or string
    # has no ``.get`` and would otherwise raise AttributeError here.
    if not isinstance(payload, dict):
        logger.warning("Semantic Scholar returned an unexpected JSON shape; using fallback citation")
        return []
    data = payload.get("data")
    if not isinstance(data, list):
        return []
    return [item for item in data if isinstance(item, dict)]
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def strip_arxiv_version(arxiv_id: str) -> str:
    """Drop a trailing ``v<digits>`` version marker: ``2510.18234v2`` →
    ``2510.18234``. An ID without such a suffix is returned unchanged."""
    version_suffix = re.compile(r"v\d+$")
    return version_suffix.sub("", arxiv_id)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def lookup_arxiv(arxiv_id: str, *, timeout: float = 30.0) -> dict | None:
    """Fetch the Semantic Scholar record for an arXiv ID.

    The version suffix is stripped before the lookup. Returns the decoded
    record when the response is a 200 whose JSON body is an object with a
    non-empty ``title``. Every other outcome — transport error, 429, 404, any
    other status, non-JSON body, unexpected shape — returns ``None``, logged
    where the code below does so.
    """
    endpoint = LOOKUP_API_URL.format(arxiv_id=strip_arxiv_version(arxiv_id))
    try:
        response = httpx.get(
            endpoint,
            params={"fields": FIELDS},
            timeout=timeout,
            headers={"User-Agent": USER_AGENT},
        )
    except httpx.HTTPError as exc:
        logger.warning("Semantic Scholar arXiv lookup failed: %s", exc)
        return None

    status = response.status_code
    if status != 200:
        if status == 429:
            logger.warning("Semantic Scholar rate-limited (HTTP 429) on arXiv lookup")
        elif status == 404:
            # Not an error: the paper is simply not indexed.
            logger.info("Semantic Scholar has no record for arXiv:%s", arxiv_id)
        else:
            logger.warning("Semantic Scholar arXiv lookup returned HTTP %d", status)
        return None

    try:
        payload = response.json()
    except ValueError:
        logger.warning("Semantic Scholar arXiv lookup returned non-JSON")
        return None

    if isinstance(payload, dict) and payload.get("title"):
        return payload
    return None
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _format_entry(paper: dict, original_title: str) -> tuple[str, bool]:
    """Format a Semantic Scholar paper as BibTeX; return ``(bibtex, title_matches)``.

    The entry is an ``@article`` with ``title`` and ``author`` always present
    (``author={Unknown}`` when no usable name exists) and ``year``,
    ``journal`` (the record's venue), ``doi`` and ``url`` when the record has
    them. ``title_matches`` is ``titles_match(original_title, <record title>)``.

    Author entries without a usable name are skipped, and a null ``authors``
    value is treated as empty, so a sparse record cannot crash citation-key
    generation or produce an empty ``author={}`` field.
    """
    authors = [
        a["name"]
        for a in (paper.get("authors") or [])
        if isinstance(a, dict) and a.get("name")
    ]
    year = str(paper["year"]) if paper.get("year") else None
    bib_title = paper.get("title", "") or ""
    key = generate_citation_key(bib_title, authors, year)

    fields = [f" title={{{sanitize_bibtex_text(bib_title)}}}"]
    if authors:
        fields.append(" author={" + " and ".join(sanitize_bibtex_text(a) for a in authors) + "}")
    else:
        fields.append(" author={Unknown}")
    if year:
        fields.append(f" year={{{year}}}")
    if paper.get("venue"):
        fields.append(f" journal={{{sanitize_bibtex_text(paper['venue'])}}}")
    doi = (paper.get("externalIds") or {}).get("DOI")
    if doi:
        fields.append(f" doi={{{doi}}}")
    if paper.get("url"):
        fields.append(f" url={{{paper['url']}}}")

    bibtex = f"@article{{{key},\n" + ",\n".join(fields) + "\n}"
    return bibtex, titles_match(original_title, bib_title)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _mismatch_warning(original: str, bibtex_title: str) -> str:
|
|
184
|
+
# The standardized 4-line form (DESIGN §12); note the trailing "% " line.
|
|
185
|
+
return (
|
|
186
|
+
"% WARNING: The retrieved citation title may not match the paper title.\n"
|
|
187
|
+
f'% Paper title: "{original}"\n'
|
|
188
|
+
f'% Citation title: "{bibtex_title}"\n'
|
|
189
|
+
"% \n"
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def mock_bibtex(title: str, *, date: str | None = None, year: str | None = None) -> str:
    """The canonical fallback mock entry (DESIGN §12; review Fix 6).

    Produces a clearly marked placeholder ``@article`` for a paper that could
    not be found. ``date`` (default: today, UTC) is written into the
    ``% Generated:`` comment and ``year`` (default: the current UTC year) into
    the ``year`` field.

    The title is passed through ``sanitize_bibtex_text`` like every other
    emitted title, so characters such as ``&``, ``%``, ``_`` or a stray brace
    cannot break the entry.
    """
    return (
        "% WARNING: This is a fallback mock citation.\n"
        "% BibTeX generation failed to find this paper in academic databases.\n"
        "% Please replace with the correct citation if available.\n"
        "%\n"
        f"% Generated: {date or _today()}\n"
        "@article{unknownYear,\n"
        f" title={{{sanitize_bibtex_text(title)}}},\n"
        " author={Unknown Author},\n"
        " journal={Unknown Journal},\n"
        f" year={{{year or _current_year()}}},\n"
        " note={This is an automatically generated fallback citation}\n"
        "}"
    )
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def generate_bibtex(title: str, *, timeout: float = 30.0, date: str | None = None) -> str:
    """Produce a BibTeX string for ``title`` from a Semantic Scholar search.

    The top search hit is formatted as the entry, prefixed with the 4-line
    mismatch warning if its title fails validation against ``title``. If the
    search yields nothing (no result, API error, rate limit), the mock
    fallback entry is returned instead, with ``date`` forwarded to it.
    """
    hits = search_semantic_scholar(title, timeout=timeout)
    if not hits:
        return mock_bibtex(title, date=date)

    best = hits[0]
    entry, title_ok = _format_entry(best, title)
    if title_ok:
        return entry
    return _mismatch_warning(title, best.get("title", "")) + entry
|