inscriber 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. inscriber/__init__.py +6 -0
  2. inscriber/__main__.py +8 -0
  3. inscriber/bibtex/__init__.py +1 -0
  4. inscriber/bibtex/arxiv.py +127 -0
  5. inscriber/bibtex/chain.py +104 -0
  6. inscriber/bibtex/local.py +39 -0
  7. inscriber/bibtex/probe.py +125 -0
  8. inscriber/bibtex/semantic_scholar.py +224 -0
  9. inscriber/bundle.py +214 -0
  10. inscriber/cache.py +390 -0
  11. inscriber/cli.py +459 -0
  12. inscriber/config.py +324 -0
  13. inscriber/errors.py +16 -0
  14. inscriber/input/__init__.py +1 -0
  15. inscriber/input/domain_handlers.py +204 -0
  16. inscriber/input/resolver.py +157 -0
  17. inscriber/llama/__init__.py +1 -0
  18. inscriber/llama/client.py +135 -0
  19. inscriber/llama/server.py +412 -0
  20. inscriber/logging.py +52 -0
  21. inscriber/models.py +279 -0
  22. inscriber/ocr/__init__.py +1 -0
  23. inscriber/ocr/base.py +263 -0
  24. inscriber/ocr/deepseek.py +212 -0
  25. inscriber/ocr/glm.py +95 -0
  26. inscriber/ocr/registry.py +33 -0
  27. inscriber/output.py +133 -0
  28. inscriber/pdf/__init__.py +1 -0
  29. inscriber/pdf/crop.py +108 -0
  30. inscriber/pdf/figures.py +83 -0
  31. inscriber/pdf/rasterize.py +120 -0
  32. inscriber/pipeline.py +1226 -0
  33. inscriber/postprocess/__init__.py +1 -0
  34. inscriber/postprocess/inject.py +90 -0
  35. inscriber/postprocess/join.py +112 -0
  36. inscriber/postprocess/notice.py +54 -0
  37. inscriber/postprocess/prompt.py +107 -0
  38. inscriber/postprocess/splitter.py +200 -0
  39. inscriber/postprocess/stitch.py +201 -0
  40. inscriber/postprocess/tables.py +278 -0
  41. inscriber/serialize.py +62 -0
  42. inscriber/setup.py +510 -0
  43. inscriber/vlm/__init__.py +1 -0
  44. inscriber/vlm/base.py +84 -0
  45. inscriber/vlm/gemma.py +134 -0
  46. inscriber/vlm/registry.py +21 -0
  47. inscriber-0.1.0.dist-info/METADATA +313 -0
  48. inscriber-0.1.0.dist-info/RECORD +51 -0
  49. inscriber-0.1.0.dist-info/WHEEL +4 -0
  50. inscriber-0.1.0.dist-info/entry_points.txt +2 -0
  51. inscriber-0.1.0.dist-info/licenses/LICENSE +21 -0
inscriber/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """inscriber — local-first PDF → LLM-friendly Markdown via llama.cpp.
2
+
3
+ See DESIGN.md for the authoritative specification.
4
+ """
5
+
6
+ __version__ = "0.1.0"
inscriber/__main__.py ADDED
@@ -0,0 +1,8 @@
1
+ """Enables ``python -m inscriber``."""
2
+
3
+ import sys
4
+
5
+ from inscriber.cli import main
6
+
7
+ if __name__ == "__main__":
8
+ sys.exit(main())
@@ -0,0 +1 @@
1
+ """Optional, online BibTeX generation via Semantic Scholar (DESIGN §12)."""
@@ -0,0 +1,127 @@
1
+ """arXiv sources for BibTeX auto mode (DESIGN §12).
2
+
3
+ ``arxiv_id_from_url`` extracts the arXiv ID from a source URL (provenance).
4
+ ``arxiv_bibtex`` is the export-API **availability fallback**: Semantic Scholar
5
+ (which knows the *published* version of a preprint) is consulted first; the
6
+ export API is authoritative for identification but can never know about later
7
+ venue publication. ``format_arxiv_misc`` is the standard arXiv ``@misc`` +
8
+ ``eprint`` shape, shared with the chain's S2-preprint path.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import re
14
+ from urllib.parse import urlparse
15
+ from xml.etree import ElementTree
16
+
17
+ import httpx
18
+ from defusedxml import DefusedXmlException
19
+ from defusedxml.ElementTree import fromstring as defused_fromstring
20
+
21
+ from inscriber.bibtex.semantic_scholar import (
22
+ USER_AGENT,
23
+ generate_citation_key,
24
+ sanitize_bibtex_text,
25
+ )
26
+ from inscriber.input.domain_handlers import host_matches
27
+ from inscriber.logging import get_logger
28
+
29
+ logger = get_logger()
30
+
31
+ API_URL = "https://export.arxiv.org/api/query"
32
+ _ATOM = {"atom": "http://www.w3.org/2005/Atom", "arxiv": "http://arxiv.org/schemas/atom"}
33
+
34
+ # The domain handler's FILENAME-RULE pattern shape (input/domain_handlers.py
35
+ # ``_arxiv()``): it preserves ``v2``-style version suffixes and old-style
36
+ # ``cs.AI/0301001`` IDs. (The handler's ``url_patterns`` detection regex is NOT
37
+ # reusable here — its ``\d+\.\d+`` stops before the version suffix.)
38
+ _ARXIV_ID_RE = re.compile(r"/(?:abs|pdf|html)/([\w.-]+/?\d+|\d+\.\d+)")
39
+
40
+
41
def arxiv_id_from_url(url: str | None) -> str | None:
    """Extract the arXiv identifier from an ``arxiv.org`` URL.

    Returns the ID captured by ``_ARXIV_ID_RE`` from the URL path (an
    ``/abs/``, ``/pdf/`` or ``/html/`` segment), or ``None`` when ``url`` is
    empty, the host does not satisfy ``host_matches(host, "arxiv.org")``, or
    the path carries no recognizable ID.

    The host check matters because a matching URL is treated as arXiv
    *provenance*: a lookalike host must not qualify (DESIGN §6 / §12.1).
    """
    if not url:
        return None
    parts = urlparse(url)
    hostname = parts.hostname or ""
    if not host_matches(hostname, "arxiv.org"):
        return None
    found = _ARXIV_ID_RE.search(parts.path)
    if found is None:
        return None
    return found.group(1)
54
+
55
+
56
def format_arxiv_misc(
    title: str,
    authors: list[str],
    year: str | None,
    arxiv_id: str,
    *,
    primary_class: str | None = None,
) -> str:
    """Render the standard arXiv ``@misc`` + ``eprint`` BibTeX entry.

    ``title`` and each author name go through ``sanitize_bibtex_text``;
    ``year``, ``arxiv_id`` and ``primary_class`` are written verbatim.
    ``author``, ``year`` and ``primaryClass`` are emitted only when a value is
    supplied. The citation key comes from ``generate_citation_key``.
    """
    cite_key = generate_citation_key(title, authors, year)

    lines = [f" title={{{sanitize_bibtex_text(title)}}}"]
    if authors:
        joined = " and ".join(sanitize_bibtex_text(name) for name in authors)
        lines.append(" author={" + joined + "}")
    if year:
        lines.append(f" year={{{year}}}")
    lines.extend([f" eprint={{{arxiv_id}}}", " archivePrefix={arXiv}"])
    if primary_class:
        lines.append(f" primaryClass={{{primary_class}}}")
    lines.append(f" url={{https://arxiv.org/abs/{arxiv_id}}}")

    body = ",\n".join(lines)
    return f"@misc{{{cite_key},\n{body}\n}}"
79
+
80
+
81
def arxiv_bibtex(arxiv_id: str, *, timeout: float = 30.0) -> str | None:
    """Fetch the arXiv record for ``arxiv_id`` and format it as ``@misc``.

    The export API answers with Atom XML, parsed here with ``defusedxml``
    (the stdlib parser is documented as unsafe for malicious input;
    DESIGN §12.1). Returns ``None`` — after logging where appropriate — on a
    transport error, a non-200 status, unparseable XML, a missing or error
    ``<entry>``, or an empty title, so the caller can fall through to the
    next source.
    """
    try:
        response = httpx.get(
            API_URL,
            params={"id_list": arxiv_id},
            timeout=timeout,
            headers={"User-Agent": USER_AGENT},
        )
    except httpx.HTTPError as exc:
        logger.warning("arXiv API request failed: %s", exc)
        return None

    status = response.status_code
    if status != 200:
        logger.warning("arXiv API returned HTTP %d", status)
        return None

    try:
        # Atom needs no DTD, so on top of defusedxml's defaults a payload
        # carrying one is rejected outright.
        feed = defused_fromstring(response.text, forbid_dtd=True)
    except (ElementTree.ParseError, DefusedXmlException) as exc:
        logger.warning("arXiv API returned unparseable XML: %s", exc)
        return None

    entry = feed.find("atom:entry", _ATOM)
    if entry is None:
        return None

    def text_of(node, tag):
        return node.findtext(tag, default="", namespaces=_ATOM)

    # arXiv reports a bad ID as an <entry> whose id points at api/errors.
    if "api/errors" in text_of(entry, "atom:id"):
        logger.warning("arXiv API has no record for id %s", arxiv_id)
        return None

    title = re.sub(r"\s+", " ", text_of(entry, "atom:title")).strip()
    if not title:
        return None

    stripped_names = (
        (text_of(node, "atom:name") or "").strip()
        for node in entry.findall("atom:author", _ATOM)
    )
    authors = [name for name in stripped_names if name]

    year_prefix = text_of(entry, "atom:published")[:4]
    year = year_prefix if year_prefix.isdigit() else None

    category = entry.find("arxiv:primary_category", _ATOM)
    primary_class = None if category is None else category.get("term")

    return format_arxiv_misc(title, authors, year, arxiv_id, primary_class=primary_class)
@@ -0,0 +1,104 @@
1
+ """BibTeX auto-mode orchestration: citability → ordered source chain (DESIGN §12).
2
+
3
+ Citability: a URL matching **any of the seven recognized paper repositories**
4
+ settles it — the probe never vetoes provenance (a disagreement is logged,
5
+ nothing more). The probe governs only provenance-less documents, and it is
6
+ abstain-biased: with a default-on feature, a false positive (an unwanted
7
+ ``.bib``) is worse than a false negative.
8
+
9
+ Sources, in order (preprint provenance ≠ preprint citation — many preprints are
10
+ later published at a venue): Semantic Scholar **by arXiv ID** (exact match;
11
+ prefers the published version when one exists) → arXiv export API (``@misc``
12
+ availability fallback) → Semantic Scholar title search → local best-effort.
13
+ Any failure falls through; this module never raises (DESIGN §16: BibTeX never
14
+ fails the run).
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from inscriber.bibtex.arxiv import arxiv_bibtex, arxiv_id_from_url, format_arxiv_misc
20
+ from inscriber.bibtex.local import best_effort_bibtex
21
+ from inscriber.bibtex.probe import ProbeResult
22
+ from inscriber.bibtex.semantic_scholar import (
23
+ _format_entry,
24
+ _mismatch_warning,
25
+ lookup_arxiv,
26
+ search_semantic_scholar,
27
+ )
28
+ from inscriber.input.domain_handlers import find_handler
29
+ from inscriber.logging import get_logger
30
+
31
+ logger = get_logger()
32
+
33
+
34
def citable_provenance(url: str | None) -> bool:
    """Report whether ``url`` is matched by a domain handler.

    An empty or missing URL is never citable provenance; otherwise the answer
    is whether ``find_handler`` returns a handler for it (the recognized
    paper repositories, DESIGN §6).
    """
    return bool(url) and find_handler(url) is not None
40
+
41
+
42
+ def _is_preprint_venue(venue) -> bool:
43
+ return not venue or str(venue).strip().lower().startswith("arxiv")
44
+
45
+
46
def _s2_arxiv_entry(paper: dict, arxiv_id: str) -> str:
    """Format a Semantic Scholar by-ID record as BibTeX.

    A real publication venue yields the published ``@article`` shape (via
    ``_format_entry``, shared with the title-search path); no venue, or an
    "arXiv.org"-style one, yields the preprint ``@misc`` + ``eprint`` shape.
    The title is not validated on this path — the ID match is exact, so the
    match flag returned by ``_format_entry`` is discarded.

    The record is remote JSON, so a null ``authors`` value or a non-dict
    author entry is tolerated rather than allowed to raise (this module
    must never fail the run).
    """
    title = paper.get("title", "") or ""
    if not _is_preprint_venue(paper.get("venue")):
        bibtex, _ = _format_entry(paper, title)
        return bibtex
    authors = [
        author["name"]
        for author in paper.get("authors") or []
        if isinstance(author, dict) and author.get("name")
    ]
    year = str(paper["year"]) if paper.get("year") else None
    return format_arxiv_misc(title, authors, year, arxiv_id)
57
+
58
+
59
def generate_bibtex_auto(
    probe: ProbeResult | None,
    *,
    original_url: str | None,
    online_allowed: bool,
    fallback_title: str,
    timeout: float = 30.0,
) -> tuple[str | None, str]:
    """Walk the auto-mode chain; returns ``(bibtex | None, source_label)``.

    ``source_label`` ∈ {``s2-arxiv-id``, ``arxiv-export``, ``s2-title``,
    ``best-effort``} on success; on a skip it is the reason
    (``not-citable`` / ``unknown`` / ``no usable metadata``).

    Citability: recognized-repository provenance settles it (a dissenting
    probe is only logged); without provenance the probe must say citable.
    Online sources are consulted only when ``online_allowed`` is true, in the
    order: Semantic Scholar by arXiv ID → arXiv export API → Semantic Scholar
    title search. The local best-effort entry is the final fallback.
    """
    provenance = citable_provenance(original_url)
    if provenance and probe is not None and not probe.citable:
        logger.info(
            "BibTeX (auto): probe judged the document not citable, but the source "
            "URL is a recognized paper repository; provenance wins"
        )
    if not provenance and (probe is None or not probe.citable):
        return None, ("not-citable" if probe is not None else "unknown")

    if online_allowed:
        aid = arxiv_id_from_url(original_url)
        if aid:
            paper = lookup_arxiv(aid, timeout=timeout)
            if paper:
                return _s2_arxiv_entry(paper, aid), "s2-arxiv-id"
            entry = arxiv_bibtex(aid, timeout=timeout)
            if entry:
                return entry, "arxiv-export"
        # Title search; validation compares against the same string used as the
        # query (avoids a spurious % WARNING from a mangled OCR `# Title`).
        title = (probe.title if probe and probe.title else None) or fallback_title
        # An empty query cannot identify a paper: skip the request instead of
        # sending it and accepting whatever comes back.
        if title and title.strip():
            results = search_semantic_scholar(title, timeout=timeout)
            if results:
                best = results[0]
                bibtex, matches = _format_entry(best, title)
                if not matches:
                    # `or ""` keeps a JSON null title from printing as "None".
                    bibtex = _mismatch_warning(title, best.get("title") or "") + bibtex
                return bibtex, "s2-title"

    entry = best_effort_bibtex(probe)
    if entry:
        return entry, "best-effort"
    return None, "no usable metadata"
@@ -0,0 +1,39 @@
1
+ """Local best-effort BibTeX entry from probe-extracted front matter (DESIGN §12).
2
+
3
+ The last link in the auto-mode source chain: a clearly-marked ``@misc`` entry
4
+ assembled purely from what the probe transcribed off the document's first page.
5
+ Fully offline — no citation database is consulted. Transcription, not recall
6
+ (decision 5): absent fields are absent, never ``Unknown Journal``-style filler;
7
+ the extracted venue goes in ``note``, not ``journal`` (decision 4 — no
8
+ venue-type guessing).
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from inscriber.bibtex.probe import ProbeResult
14
+ from inscriber.bibtex.semantic_scholar import generate_citation_key, sanitize_bibtex_text
15
+
16
+ # Canonical header (pinned by tests/fixtures/bibtex_best_effort.txt).
17
+ BEST_EFFORT_HEADER = (
18
+ "% NOTE: Best-effort entry generated from the document's own front matter\n"
19
+ "% by inscriber (no citation database was consulted). Verify before use.\n"
20
+ "%\n"
21
+ )
22
+
23
+
24
def best_effort_bibtex(probe: ProbeResult | None) -> str | None:
    """Assemble the marked ``@misc`` entry from probe-extracted front matter.

    Returns ``None`` when there is no probe or no usable title (an entry
    without a title is noise, not a citation). Otherwise returns
    ``BEST_EFFORT_HEADER`` followed by an ``@misc`` entry containing only the
    fields the probe actually extracted; the venue goes in ``note``.

    Every field value is model-transcribed text, so all of them — including
    ``year`` — are passed through ``sanitize_bibtex_text``; a stray brace or
    ``%`` in any one of them would otherwise corrupt the entry.
    """
    if probe is None or not probe.title:
        return None
    key = generate_citation_key(probe.title, probe.authors, probe.year)
    fields = [f" title={{{sanitize_bibtex_text(probe.title)}}}"]
    if probe.authors:
        fields.append(
            " author={" + " and ".join(sanitize_bibtex_text(a) for a in probe.authors) + "}"
        )
    if probe.year:
        fields.append(f" year={{{sanitize_bibtex_text(probe.year)}}}")
    if probe.venue:
        fields.append(f" note={{{sanitize_bibtex_text(probe.venue)}}}")
    return BEST_EFFORT_HEADER + f"@misc{{{key},\n" + ",\n".join(fields) + "\n}"
@@ -0,0 +1,125 @@
1
+ """BibTeX citability + metadata probe (DESIGN §12, auto mode).
2
+
3
+ One **text-only** VLM call per document: is this a citable scholarly work, and
4
+ what front-matter metadata is visible on its first page? The result drives the
5
+ auto-mode source chain (``chain.py``) and the local best-effort entry
6
+ (``local.py``).
7
+
8
+ The prompt is pinned, model-facing behavior (the table-pass discipline,
9
+ DESIGN §9.2): assembled exactly once per document via the backend
10
+ (``build_bibtex_probe_prompt``), used verbatim as cache-key material AND as the
11
+ request. Changes require re-validation on real hardware recorded in
12
+ ``dev/notes/2026-06-10-bibtex-probe-findings.md``. The phrase "bibliographic metadata" is
13
+ the pinned mock-dispatch discriminator (AGENTS.md) and must survive any tuning.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import json
19
+ import re
20
+ from dataclasses import dataclass, field
21
+
22
+ # Truncation cap on the page-1 text embedded in the prompt. Front matter
23
+ # (title/authors/venue) fits comfortably; deliberately its own constant, NOT
24
+ # the [figure].context_chars knob (that one is a figure-context setting).
25
+ PROBE_PAGE_CHARS = 3000
26
+
27
+ _CODE_FENCE_RE = re.compile(r"^```[A-Za-z]*\s*\n(?P<body>.*?)\n?```\s*$", re.DOTALL)
28
+
29
+ # The pinned probe prompt. Citability is abstain-biased (decision 1: with a
30
+ # default-on feature a false positive is worse than a false negative), and
31
+ # extraction is transcription-not-recall (decision 5: only fields visible in
32
+ # the supplied text; absent fields are omitted, never filled in).
33
+ PROBE_PROMPT_TEMPLATE = """You are extracting bibliographic metadata from the first page of a document.
34
+
35
+ First decide whether the document is CITABLE: a self-contained scholarly work \
36
+ — a research paper, preprint, thesis, or technical report — whose title and \
37
+ authors are identifiable in the text below. Slides, lecture notes, invoices, \
38
+ forms, manuals, web pages, and other non-scholarly documents are NOT citable. \
39
+ When unsure, answer "citable": false.
40
+
41
+ Then extract the metadata fields that are VISIBLE in the text. Copy them \
42
+ exactly as written; do not guess and do not recall from memory. Omit any field \
43
+ that is not visible in the text.
44
+
45
+ Answer with a single JSON object and nothing else, in this shape:
46
+ {{"citable": true, "title": "...", "authors": ["...", "..."], "year": "...", "venue": "..."}}
47
+
48
+ - "citable": required boolean.
49
+ - "title": the document's full title, when visible.
50
+ - "authors": the list of author names, when visible.
51
+ - "year": the publication year, when visible.
52
+ - "venue": the journal, conference, or repository name, when visible.
53
+
54
+ First page text:
55
+ <page_text>
56
+ {page_text}
57
+ </page_text>"""
58
+
59
+
60
@dataclass
class ProbeResult:
    """Parsed answer of the BibTeX probe.

    Holds the citability verdict together with whatever front-matter
    metadata the model reported; every metadata field is optional.
    """

    # The model's verdict on whether the document is a citable work.
    citable: bool
    # Front-matter fields; left at their defaults when not reported.
    title: str | None = None
    authors: list[str] = field(default_factory=list)
    year: str | None = None
    venue: str | None = None
    # The model's JSON (fence-stripped), kept for debugging and caching.
    raw: str = ""
70
+
71
+
72
def format_probe_prompt(page_text: str) -> str:
    """Build the complete probe prompt for one document.

    The page text is stripped and, when longer than ``PROBE_PAGE_CHARS``,
    cut so that the text plus a trailing ``...`` is exactly that long. The
    result is substituted into ``PROBE_PROMPT_TEMPLATE``; the same string
    also serves as the probe's cache-key material.
    """
    excerpt = page_text.strip()
    if len(excerpt) > PROBE_PAGE_CHARS:
        keep = PROBE_PAGE_CHARS - 3  # leave room for the ellipsis
        excerpt = excerpt[:keep] + "..."
    return PROBE_PROMPT_TEMPLATE.format(page_text=excerpt)
78
+
79
+
80
def parse_probe_response(raw: str | None) -> ProbeResult | None:
    """Parse + type-check a probe response; ``None`` means "treat as unknown".

    Tolerates a wrapping code fence; otherwise strict: the payload must be a
    single JSON object with a boolean ``citable`` and correctly-typed optional
    fields (``title``/``venue``: string; ``authors``: list of strings;
    ``year``: string or bare integer). Anything malformed → ``None`` (and the
    caller must NOT cache it).

    String fields are stripped; a field that is blank after stripping is
    reported as absent.
    """
    if not raw:
        return None
    text = raw.strip()
    fence = _CODE_FENCE_RE.match(text)
    if fence:
        text = fence.group("body").strip()
    try:
        data = json.loads(text)
    except ValueError:
        return None
    if not isinstance(data, dict) or not isinstance(data.get("citable"), bool):
        return None

    title = data.get("title")
    if title is not None and not isinstance(title, str):
        return None

    authors = data.get("authors", [])
    if authors is None:
        authors = []
    if not isinstance(authors, list) or not all(isinstance(a, str) for a in authors):
        return None

    year = data.get("year")
    # bool is a subclass of int: without this guard `"year": true` would be
    # accepted and surface as the year "True".
    if isinstance(year, bool):
        return None
    if isinstance(year, int):  # tolerate a bare-number year
        year = str(year)
    if year is not None and not isinstance(year, str):
        return None

    venue = data.get("venue")
    if venue is not None and not isinstance(venue, str):
        return None

    return ProbeResult(
        citable=data["citable"],
        title=(title.strip() or None) if title else None,
        authors=[a.strip() for a in authors if a.strip()],
        year=(year.strip() or None) if year else None,
        venue=(venue.strip() or None) if venue else None,
        raw=text,
    )
@@ -0,0 +1,224 @@
1
+ """BibTeX generation via Semantic Scholar (DESIGN §12) — optional & online.
2
+
3
+ Ported from paper2llm ``core/utils/bibtex-generator.ts`` + ``content-utils.ts``:
4
+ title→entry lookup, citation key, title validation, and the mock fallback. The
5
+ inscriber **standardizes on the single 4-line mismatch warning** (DESIGN §12) and
6
+ **adds a clean 429 / network degrade path** (the source had none).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from datetime import datetime, timezone
13
+
14
+ import httpx
15
+
16
+ from inscriber.logging import get_logger
17
+
18
+ logger = get_logger()
19
+
20
+ API_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
21
+ LOOKUP_API_URL = "https://api.semanticscholar.org/graph/v1/paper/arXiv:{arxiv_id}"
22
+ FIELDS = "title,authors,venue,year,abstract,externalIds,url"
23
+ USER_AGENT = "inscriber/0.1 (+https://github.com/lacerbi/inscriber)"
24
+
25
+ _SKIP_WORDS = {"a", "an", "the", "on", "in", "of", "for", "and", "or"}
26
+
27
+
28
+ def _current_year() -> str:
29
+ return str(datetime.now(timezone.utc).year)
30
+
31
+
32
+ def _today() -> str:
33
+ return datetime.now(timezone.utc).strftime("%Y-%m-%d")
34
+
35
+
36
def sanitize_bibtex_text(text: str) -> str:
    """Escape BibTeX special characters (port of ``sanitizeBibTeXText``).

    - ``& % $ # _ { } ~ ^ \\`` are each prefixed with a backslash.
    - Whitespace is left untouched (a tab or newline is not a special
      character and must not gain a backslash).
    - Curly quotes become their LaTeX input forms: ``“`` → two backticks,
      ``”`` → two apostrophes, ``‘`` → one backtick, ``’`` → one apostrophe,
      so an apostrophe such as in "don’t" stays a single mark.
    - Em / en dashes become ``---`` / ``--``.

    Falsy input yields ``""``.
    """
    if not text:
        return ""
    # NOTE(review): in LaTeX, `\~` and `\^` are accent commands and `\\` is a
    # line break rather than literal characters; the backslash-prefix form is
    # kept for parity with the port — confirm whether that is intended.
    escaped = re.sub(r"[&%$#_{}~^\\]", lambda m: "\\" + m.group(), text)
    replacements = (
        ("\u201c", "``"),  # left double quotation mark
        ("\u201d", "''"),  # right double quotation mark
        ("\u2018", "`"),  # left single quotation mark
        ("\u2019", "'"),  # right single quotation mark / apostrophe
        ("\u2014", "---"),  # em dash
        ("\u2013", "--"),  # en dash
    )
    for source, target in replacements:
        escaped = escaped.replace(source, target)
    return escaped
45
+
46
+
47
def generate_citation_key(title: str, authors: list[str], year: str | None) -> str:
    """``{firstAuthorLastName}{year}{firstSubstantiveTitleWord}`` (DESIGN §12).

    - Author part: the last whitespace-separated token of the first author,
      lower-cased, with characters that cannot appear in a BibTeX key
      (whitespace and ``" # % ' ( ) , = { } \\``) removed; ``"Unknown"`` when
      there is no author or nothing usable remains.
    - Year part: ``year``, or the current year when it is missing.
    - Title part: the first title word that, reduced to ``[a-z0-9]``, is
      longer than two characters and not in ``_SKIP_WORDS``; failing that,
      the first title word reduced the same way.
    """
    author_part = "Unknown"
    if authors:
        # split() rather than split(" "): a trailing space or a tab must not
        # produce an empty "last name".
        name_tokens = (authors[0] or "").split()
        if name_tokens:
            surname = re.sub(r"""[\s"#%'(),={}\\]""", "", name_tokens[-1].lower())
            if surname:
                author_part = surname

    title_words = (title or "").split()
    title_part = ""
    for word in title_words:
        clean = re.sub(r"[^a-z0-9]", "", word.lower())
        if len(clean) > 2 and clean not in _SKIP_WORDS:
            title_part = clean
            break
    if not title_part and title_words:
        title_part = re.sub(r"[^a-z0-9]", "", title_words[0].lower())

    year_part = year or _current_year()
    return f"{author_part}{year_part}{title_part}"
65
+
66
+
67
def normalize_title(title: str) -> str:
    """Reduce a title to lower-case ASCII letters separated by single spaces.

    Steps (DESIGN §12): lower-case, drop every character that is neither
    ``a``–``z`` nor whitespace, then collapse whitespace runs and trim.
    Whitespace survives the first step on purpose, so an embedded tab or
    newline still separates words instead of fusing them. Falsy input
    yields ``""``.
    """
    if not title:
        return ""
    letters_and_gaps = re.sub(r"[^a-z\s]", "", title.lower())
    return " ".join(letters_and_gaps.split())
78
+
79
+
80
def titles_match(original: str, bibtex_title: str) -> bool:
    """Validate a retrieved title against the paper title (DESIGN §12).

    Both titles are normalized with ``normalize_title``. If either normalized
    title is shorter than 10 characters they must be identical. Otherwise the
    similarity is the number of original-title words that also occur in the
    retrieved title, divided by the word count of the longer title; the
    titles match when it exceeds 0.75.
    """
    no, nb = normalize_title(original), normalize_title(bibtex_title)
    if len(no) < 10 or len(nb) < 10:
        return no == nb  # short titles require exact normalized match
    orig_words = no.split(" ")
    bib_words = nb.split(" ")
    # The set is for membership tests only. The denominator must use the
    # full word counts on both sides: sizing the retrieved title by its
    # de-duplicated vocabulary would shrink it and inflate the score.
    bib_vocab = set(bib_words)
    common = sum(1 for w in orig_words if w in bib_vocab)
    similarity = common / max(len(orig_words), len(bib_words))
    return similarity > 0.75
90
+
91
+
92
def search_semantic_scholar(title: str, *, limit: int = 3, timeout: float = 30.0) -> list[dict]:
    """Query Semantic Scholar by title; return the ``data`` records.

    Returns ``[]`` — after a warning where appropriate — on a transport
    error, HTTP 429, any other non-200 status, a non-JSON body, or a JSON
    body that is not an object carrying a ``data`` list. Non-dict items in
    ``data`` are dropped, so callers can rely on every element being a dict.
    Never raises for a bad response.
    """
    try:
        resp = httpx.get(
            API_URL,
            params={"query": title, "limit": limit, "fields": FIELDS},
            timeout=timeout,
            headers={"User-Agent": USER_AGENT},
        )
    except httpx.HTTPError as e:
        logger.warning("Semantic Scholar request failed: %s; using fallback citation", e)
        return []
    if resp.status_code == 429:
        logger.warning("Semantic Scholar rate-limited (HTTP 429); using fallback citation")
        return []
    if resp.status_code != 200:
        logger.warning("Semantic Scholar returned HTTP %d; using fallback citation", resp.status_code)
        return []
    try:
        payload = resp.json()
    except ValueError:
        logger.warning("Semantic Scholar returned non-JSON; using fallback citation")
        return []
    # Valid JSON is not necessarily an object: a list or scalar payload has
    # no .get(), and would otherwise raise out of a never-raises function.
    data = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(data, list):
        return []
    return [item for item in data if isinstance(item, dict)]
115
+
116
+
117
def strip_arxiv_version(arxiv_id: str) -> str:
    """Drop a trailing ``vN`` version suffix: ``2510.18234v2`` → ``2510.18234``.

    Semantic Scholar indexes the base ID. An ID without such a suffix is
    returned unchanged.
    """
    version_suffix = re.compile(r"v\d+$")
    return version_suffix.sub("", arxiv_id)
120
+
121
+
122
def lookup_arxiv(arxiv_id: str, *, timeout: float = 30.0) -> dict | None:
    """Fetch the Semantic Scholar record for an arXiv ID.

    The version suffix is stripped before the ID is placed in the lookup URL.
    This is an exact-ID lookup (no title fuzziness); the record carries the
    publication venue when one exists.

    Returns the decoded JSON object when the response is HTTP 200 and the
    object has a non-empty ``title``; otherwise ``None`` (transport error,
    429, 404, any other status, non-JSON body, or an unusable payload).
    Never raises for a bad response.
    """
    endpoint = LOOKUP_API_URL.format(arxiv_id=strip_arxiv_version(arxiv_id))
    try:
        response = httpx.get(
            endpoint,
            params={"fields": FIELDS},
            timeout=timeout,
            headers={"User-Agent": USER_AGENT},
        )
    except httpx.HTTPError as exc:
        logger.warning("Semantic Scholar arXiv lookup failed: %s", exc)
        return None

    status = response.status_code
    if status != 200:
        if status == 429:
            logger.warning("Semantic Scholar rate-limited (HTTP 429) on arXiv lookup")
        elif status == 404:
            # A missing record is an expected outcome, hence info not warning.
            logger.info("Semantic Scholar has no record for arXiv:%s", arxiv_id)
        else:
            logger.warning("Semantic Scholar arXiv lookup returned HTTP %d", status)
        return None

    try:
        record = response.json()
    except ValueError:
        logger.warning("Semantic Scholar arXiv lookup returned non-JSON")
        return None

    if isinstance(record, dict) and record.get("title"):
        return record
    return None
155
+
156
+
157
def _format_entry(paper: dict, original_title: str) -> tuple[str, bool]:
    """Format a Semantic Scholar paper as BibTeX; return ``(bibtex, title_matches)``.

    The entry is an ``@article`` with the sanitized title, the authors (or
    ``Unknown`` when none are usable), and — when present — year, venue (as
    ``journal``), DOI and URL. ``title_matches`` is ``titles_match`` applied
    to ``original_title`` and the record's title.

    Author entries without a name are skipped so they cannot produce empty
    slots in the `` and ``-joined list, and a null ``authors`` value is
    treated as "no authors" rather than raising.
    """
    authors = [
        a["name"]
        for a in paper.get("authors") or []
        if isinstance(a, dict) and a.get("name")
    ]
    year = str(paper["year"]) if paper.get("year") else None
    bib_title = paper.get("title", "") or ""
    key = generate_citation_key(bib_title, authors, year)

    fields = [f" title={{{sanitize_bibtex_text(bib_title)}}}"]
    if authors:
        fields.append(" author={" + " and ".join(sanitize_bibtex_text(a) for a in authors) + "}")
    else:
        fields.append(" author={Unknown}")
    if year:
        fields.append(f" year={{{year}}}")
    if paper.get("venue"):
        fields.append(f" journal={{{sanitize_bibtex_text(paper['venue'])}}}")
    doi = (paper.get("externalIds") or {}).get("DOI")
    if doi:
        fields.append(f" doi={{{doi}}}")
    if paper.get("url"):
        fields.append(f" url={{{paper['url']}}}")

    bibtex = f"@article{{{key},\n" + ",\n".join(fields) + "\n}"
    return bibtex, titles_match(original_title, bib_title)
181
+
182
+
183
+ def _mismatch_warning(original: str, bibtex_title: str) -> str:
184
+ # The standardized 4-line form (DESIGN §12); note the trailing "% " line.
185
+ return (
186
+ "% WARNING: The retrieved citation title may not match the paper title.\n"
187
+ f'% Paper title: "{original}"\n'
188
+ f'% Citation title: "{bibtex_title}"\n'
189
+ "% \n"
190
+ )
191
+
192
+
193
def mock_bibtex(title: str, *, date: str | None = None, year: str | None = None) -> str:
    """The canonical fallback mock entry (DESIGN §12; review Fix 6).

    A warning comment block followed by an ``@article{unknownYear, ...}``
    entry with placeholder author and journal. ``date`` defaults to today's
    date and ``year`` to the current year.

    The title is passed through ``sanitize_bibtex_text`` like every other
    generated entry: a raw ``%``, ``&`` or unbalanced brace in the title
    would otherwise break the entry.
    """
    return (
        "% WARNING: This is a fallback mock citation.\n"
        "% BibTeX generation failed to find this paper in academic databases.\n"
        "% Please replace with the correct citation if available.\n"
        "%\n"
        f"% Generated: {date or _today()}\n"
        "@article{unknownYear,\n"
        f" title={{{sanitize_bibtex_text(title)}}},\n"
        " author={Unknown Author},\n"
        " journal={Unknown Journal},\n"
        f" year={{{year or _current_year()}}},\n"
        " note={This is an automatically generated fallback citation}\n"
        "}"
    )
209
+
210
+
211
def generate_bibtex(title: str, *, timeout: float = 30.0, date: str | None = None) -> str:
    """Produce a BibTeX string for ``title`` via a title search (DESIGN §12).

    - No search result (including API error / 429): the mock fallback entry.
    - Top result whose title validates: the formatted entry.
    - Top result whose title does not validate: the entry prefixed with the
      4-line mismatch warning.
    """
    hits = search_semantic_scholar(title, timeout=timeout)
    if not hits:
        return mock_bibtex(title, date=date)

    best = hits[0]
    entry, title_ok = _format_entry(best, title)
    if title_ok:
        return entry
    return _mismatch_warning(title, best.get("title", "")) + entry