deepresearch-flow 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/db.py +184 -0
- deepresearch_flow/paper/db_ops.py +1939 -0
- deepresearch_flow/paper/web/app.py +38 -3705
- deepresearch_flow/paper/web/constants.py +23 -0
- deepresearch_flow/paper/web/filters.py +255 -0
- deepresearch_flow/paper/web/handlers/__init__.py +14 -0
- deepresearch_flow/paper/web/handlers/api.py +217 -0
- deepresearch_flow/paper/web/handlers/pages.py +334 -0
- deepresearch_flow/paper/web/markdown.py +549 -0
- deepresearch_flow/paper/web/static/css/main.css +857 -0
- deepresearch_flow/paper/web/static/js/detail.js +406 -0
- deepresearch_flow/paper/web/static/js/index.js +266 -0
- deepresearch_flow/paper/web/static/js/outline.js +58 -0
- deepresearch_flow/paper/web/static/js/stats.js +39 -0
- deepresearch_flow/paper/web/templates/base.html +43 -0
- deepresearch_flow/paper/web/templates/detail.html +332 -0
- deepresearch_flow/paper/web/templates/index.html +114 -0
- deepresearch_flow/paper/web/templates/stats.html +29 -0
- deepresearch_flow/paper/web/templates.py +85 -0
- deepresearch_flow/paper/web/text.py +68 -0
- deepresearch_flow/recognize/cli.py +805 -26
- deepresearch_flow/recognize/katex_check.js +29 -0
- deepresearch_flow/recognize/math.py +719 -0
- deepresearch_flow/recognize/mermaid.py +690 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/METADATA +78 -4
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/RECORD +30 -9
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/top_level.txt +0 -0
|
@@ -1,3712 +1,35 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
import html
|
|
4
|
-
import json
|
|
5
3
|
import logging
|
|
6
|
-
import unicodedata
|
|
7
|
-
from dataclasses import dataclass
|
|
8
|
-
from html.parser import HTMLParser
|
|
9
4
|
from pathlib import Path
|
|
10
|
-
from typing import Any
|
|
11
|
-
import re
|
|
12
|
-
from urllib.parse import urlencode, quote
|
|
13
5
|
|
|
14
|
-
from markdown_it import MarkdownIt
|
|
15
|
-
try:
|
|
16
|
-
from mdit_py_plugins.footnote import footnote_plugin as footnote
|
|
17
|
-
except ImportError: # pragma: no cover - compatibility with older names
|
|
18
|
-
from mdit_py_plugins.footnote import footnote
|
|
19
6
|
from starlette.applications import Starlette
|
|
7
|
+
from starlette.middleware.base import BaseHTTPMiddleware
|
|
20
8
|
from starlette.requests import Request
|
|
21
|
-
from starlette.responses import FileResponse, HTMLResponse, JSONResponse, RedirectResponse, Response
|
|
22
9
|
from starlette.routing import Mount, Route
|
|
23
10
|
from starlette.staticfiles import StaticFiles
|
|
24
11
|
|
|
25
|
-
from deepresearch_flow.paper.
|
|
26
|
-
from deepresearch_flow.paper.
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
12
|
+
from deepresearch_flow.paper.db_ops import build_index, load_and_merge_papers
|
|
13
|
+
from deepresearch_flow.paper.web.constants import PDFJS_STATIC_DIR, STATIC_DIR
|
|
14
|
+
from deepresearch_flow.paper.web.handlers import (
|
|
15
|
+
api_papers,
|
|
16
|
+
api_pdf,
|
|
17
|
+
api_stats,
|
|
18
|
+
index_page,
|
|
19
|
+
paper_detail,
|
|
20
|
+
robots_txt,
|
|
21
|
+
stats_page,
|
|
30
22
|
)
|
|
31
|
-
from deepresearch_flow.paper.
|
|
32
|
-
from deepresearch_flow.paper.web.query import Query, QueryTerm, parse_query
|
|
33
|
-
|
|
34
|
-
try:
|
|
35
|
-
from pybtex.database import parse_file
|
|
36
|
-
PYBTEX_AVAILABLE = True
|
|
37
|
-
except Exception:
|
|
38
|
-
PYBTEX_AVAILABLE = False
|
|
39
|
-
|
|
40
|
-
try:
|
|
41
|
-
from pypdf import PdfReader
|
|
42
|
-
PYPDF_AVAILABLE = True
|
|
43
|
-
except Exception:
|
|
44
|
-
PYPDF_AVAILABLE = False
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
_CDN_ECHARTS = "https://cdn.jsdelivr.net/npm/echarts@5/dist/echarts.min.js"
|
|
48
|
-
_CDN_MERMAID = "https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"
|
|
49
|
-
_CDN_KATEX = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.css"
|
|
50
|
-
_CDN_KATEX_JS = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.js"
|
|
51
|
-
_CDN_KATEX_AUTO = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/contrib/auto-render.min.js"
|
|
52
|
-
# Use legacy builds to ensure `pdfjsLib` is available as a global.
|
|
53
|
-
_CDN_PDFJS = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.min.js"
|
|
54
|
-
_CDN_PDFJS_WORKER = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.worker.min.js"
|
|
55
|
-
_PDFJS_VIEWER_PATH = "/pdfjs/web/viewer.html"
|
|
56
|
-
_PDFJS_STATIC_DIR = Path(__file__).resolve().parent / "pdfjs"
|
|
23
|
+
from deepresearch_flow.paper.web.markdown import create_md_renderer
|
|
57
24
|
|
|
58
25
|
logger = logging.getLogger(__name__)
|
|
59
26
|
|
|
60
27
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
by_tag: dict[str, set[int]]
|
|
67
|
-
by_author: dict[str, set[int]]
|
|
68
|
-
by_year: dict[str, set[int]]
|
|
69
|
-
by_month: dict[str, set[int]]
|
|
70
|
-
by_venue: dict[str, set[int]]
|
|
71
|
-
stats: dict[str, Any]
|
|
72
|
-
md_path_by_hash: dict[str, Path]
|
|
73
|
-
translated_md_by_hash: dict[str, dict[str, Path]]
|
|
74
|
-
pdf_path_by_hash: dict[str, Path]
|
|
75
|
-
template_tags: list[str]
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
def _split_csv(values: list[str]) -> list[str]:
|
|
79
|
-
out: list[str] = []
|
|
80
|
-
for value in values:
|
|
81
|
-
for part in value.split(","):
|
|
82
|
-
part = part.strip()
|
|
83
|
-
if part:
|
|
84
|
-
out.append(part)
|
|
85
|
-
return out
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
def _normalize_key(value: str) -> str:
|
|
89
|
-
return value.strip().lower()
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
def _parse_year_month(date_str: str | None) -> tuple[str | None, str | None]:
|
|
93
|
-
if not date_str:
|
|
94
|
-
return None, None
|
|
95
|
-
text = str(date_str).strip()
|
|
96
|
-
year = None
|
|
97
|
-
month = None
|
|
98
|
-
|
|
99
|
-
year_match = re.search(r"(19|20)\d{2}", text)
|
|
100
|
-
if year_match:
|
|
101
|
-
year = year_match.group(0)
|
|
102
|
-
|
|
103
|
-
numeric_match = re.search(r"(19|20)\d{2}[-/](\d{1,2})", text)
|
|
104
|
-
if numeric_match:
|
|
105
|
-
m = int(numeric_match.group(2))
|
|
106
|
-
if 1 <= m <= 12:
|
|
107
|
-
month = f"{m:02d}"
|
|
108
|
-
return year, month
|
|
109
|
-
|
|
110
|
-
month_word = re.search(
|
|
111
|
-
r"(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|"
|
|
112
|
-
r"january|february|march|april|june|july|august|september|october|november|december)",
|
|
113
|
-
text.lower(),
|
|
114
|
-
)
|
|
115
|
-
if month_word:
|
|
116
|
-
lookup = {
|
|
117
|
-
"january": "01",
|
|
118
|
-
"february": "02",
|
|
119
|
-
"march": "03",
|
|
120
|
-
"april": "04",
|
|
121
|
-
"may": "05",
|
|
122
|
-
"june": "06",
|
|
123
|
-
"july": "07",
|
|
124
|
-
"august": "08",
|
|
125
|
-
"september": "09",
|
|
126
|
-
"october": "10",
|
|
127
|
-
"november": "11",
|
|
128
|
-
"december": "12",
|
|
129
|
-
"jan": "01",
|
|
130
|
-
"feb": "02",
|
|
131
|
-
"mar": "03",
|
|
132
|
-
"apr": "04",
|
|
133
|
-
"jun": "06",
|
|
134
|
-
"jul": "07",
|
|
135
|
-
"aug": "08",
|
|
136
|
-
"sep": "09",
|
|
137
|
-
"sept": "09",
|
|
138
|
-
"oct": "10",
|
|
139
|
-
"nov": "11",
|
|
140
|
-
"dec": "12",
|
|
141
|
-
}
|
|
142
|
-
month = lookup.get(month_word.group(0))
|
|
143
|
-
return year, month
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
def _normalize_month_token(value: str | int | None) -> str | None:
|
|
147
|
-
if value is None:
|
|
148
|
-
return None
|
|
149
|
-
if isinstance(value, int):
|
|
150
|
-
if 1 <= value <= 12:
|
|
151
|
-
return f"{value:02d}"
|
|
152
|
-
return None
|
|
153
|
-
raw = str(value).strip().lower()
|
|
154
|
-
if not raw:
|
|
155
|
-
return None
|
|
156
|
-
if raw.isdigit():
|
|
157
|
-
return _normalize_month_token(int(raw))
|
|
158
|
-
lookup = {
|
|
159
|
-
"january": "01",
|
|
160
|
-
"february": "02",
|
|
161
|
-
"march": "03",
|
|
162
|
-
"april": "04",
|
|
163
|
-
"may": "05",
|
|
164
|
-
"june": "06",
|
|
165
|
-
"july": "07",
|
|
166
|
-
"august": "08",
|
|
167
|
-
"september": "09",
|
|
168
|
-
"october": "10",
|
|
169
|
-
"november": "11",
|
|
170
|
-
"december": "12",
|
|
171
|
-
"jan": "01",
|
|
172
|
-
"feb": "02",
|
|
173
|
-
"mar": "03",
|
|
174
|
-
"apr": "04",
|
|
175
|
-
"jun": "06",
|
|
176
|
-
"jul": "07",
|
|
177
|
-
"aug": "08",
|
|
178
|
-
"sep": "09",
|
|
179
|
-
"sept": "09",
|
|
180
|
-
"oct": "10",
|
|
181
|
-
"nov": "11",
|
|
182
|
-
"dec": "12",
|
|
183
|
-
}
|
|
184
|
-
return lookup.get(raw)
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
def _extract_authors(paper: dict[str, Any]) -> list[str]:
|
|
188
|
-
value = paper.get("paper_authors")
|
|
189
|
-
if value is None:
|
|
190
|
-
return []
|
|
191
|
-
if isinstance(value, list):
|
|
192
|
-
return [str(item).strip() for item in value if str(item).strip()]
|
|
193
|
-
if isinstance(value, str):
|
|
194
|
-
return [part.strip() for part in value.split(",") if part.strip()]
|
|
195
|
-
return [str(value)]
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
def _extract_tags(paper: dict[str, Any]) -> list[str]:
|
|
199
|
-
tags = paper.get("ai_generated_tags") or []
|
|
200
|
-
if isinstance(tags, list):
|
|
201
|
-
return [str(tag).strip() for tag in tags if str(tag).strip()]
|
|
202
|
-
return []
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
def _extract_keywords(paper: dict[str, Any]) -> list[str]:
|
|
206
|
-
keywords = paper.get("keywords") or []
|
|
207
|
-
if isinstance(keywords, list):
|
|
208
|
-
return [str(keyword).strip() for keyword in keywords if str(keyword).strip()]
|
|
209
|
-
if isinstance(keywords, str):
|
|
210
|
-
parts = re.split(r"[;,]", keywords)
|
|
211
|
-
return [part.strip() for part in parts if part.strip()]
|
|
212
|
-
return []
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
_SUMMARY_FIELDS = (
|
|
216
|
-
"summary",
|
|
217
|
-
"abstract",
|
|
218
|
-
"keywords",
|
|
219
|
-
"question1",
|
|
220
|
-
"question2",
|
|
221
|
-
"question3",
|
|
222
|
-
"question4",
|
|
223
|
-
"question5",
|
|
224
|
-
"question6",
|
|
225
|
-
"question7",
|
|
226
|
-
"question8",
|
|
227
|
-
)
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
def _has_summary(paper: dict[str, Any], template_tags: list[str]) -> bool:
|
|
231
|
-
if template_tags:
|
|
232
|
-
return True
|
|
233
|
-
for key in _SUMMARY_FIELDS:
|
|
234
|
-
value = paper.get(key)
|
|
235
|
-
if isinstance(value, str) and value.strip():
|
|
236
|
-
return True
|
|
237
|
-
return False
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
def _extract_venue(paper: dict[str, Any]) -> str:
|
|
241
|
-
if isinstance(paper.get("bibtex"), dict):
|
|
242
|
-
bib = paper.get("bibtex") or {}
|
|
243
|
-
fields = bib.get("fields") or {}
|
|
244
|
-
bib_type = (bib.get("type") or "").lower()
|
|
245
|
-
if bib_type == "article" and fields.get("journal"):
|
|
246
|
-
return str(fields.get("journal"))
|
|
247
|
-
if bib_type in {"inproceedings", "conference", "proceedings"} and fields.get("booktitle"):
|
|
248
|
-
return str(fields.get("booktitle"))
|
|
249
|
-
return str(paper.get("publication_venue") or "")
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
def build_index(
|
|
253
|
-
papers: list[dict[str, Any]],
|
|
254
|
-
*,
|
|
255
|
-
md_roots: list[Path] | None = None,
|
|
256
|
-
md_translated_roots: list[Path] | None = None,
|
|
257
|
-
pdf_roots: list[Path] | None = None,
|
|
258
|
-
) -> PaperIndex:
|
|
259
|
-
id_by_hash: dict[str, int] = {}
|
|
260
|
-
by_tag: dict[str, set[int]] = {}
|
|
261
|
-
by_author: dict[str, set[int]] = {}
|
|
262
|
-
by_year: dict[str, set[int]] = {}
|
|
263
|
-
by_month: dict[str, set[int]] = {}
|
|
264
|
-
by_venue: dict[str, set[int]] = {}
|
|
265
|
-
|
|
266
|
-
md_path_by_hash: dict[str, Path] = {}
|
|
267
|
-
translated_md_by_hash: dict[str, dict[str, Path]] = {}
|
|
268
|
-
pdf_path_by_hash: dict[str, Path] = {}
|
|
269
|
-
|
|
270
|
-
md_file_index = _build_file_index(md_roots or [], suffixes={".md"})
|
|
271
|
-
translated_index = _build_translated_index(md_translated_roots or [])
|
|
272
|
-
pdf_file_index = _build_file_index(pdf_roots or [], suffixes={".pdf"})
|
|
273
|
-
|
|
274
|
-
year_counts: dict[str, int] = {}
|
|
275
|
-
month_counts: dict[str, int] = {}
|
|
276
|
-
tag_counts: dict[str, int] = {}
|
|
277
|
-
keyword_counts: dict[str, int] = {}
|
|
278
|
-
author_counts: dict[str, int] = {}
|
|
279
|
-
venue_counts: dict[str, int] = {}
|
|
280
|
-
template_tag_counts: dict[str, int] = {}
|
|
281
|
-
|
|
282
|
-
def add_index(index: dict[str, set[int]], key: str, idx: int) -> None:
|
|
283
|
-
index.setdefault(key, set()).add(idx)
|
|
284
|
-
|
|
285
|
-
for idx, paper in enumerate(papers):
|
|
286
|
-
is_pdf_only = bool(paper.get("_is_pdf_only"))
|
|
287
|
-
source_hash = paper.get("source_hash")
|
|
288
|
-
if not source_hash and paper.get("source_path"):
|
|
289
|
-
source_hash = stable_hash(str(paper.get("source_path")))
|
|
290
|
-
if source_hash:
|
|
291
|
-
id_by_hash[str(source_hash)] = idx
|
|
292
|
-
|
|
293
|
-
title = str(paper.get("paper_title") or "")
|
|
294
|
-
paper["_title_lc"] = title.lower()
|
|
295
|
-
|
|
296
|
-
bib_fields: dict[str, Any] = {}
|
|
297
|
-
if isinstance(paper.get("bibtex"), dict):
|
|
298
|
-
bib_fields = paper.get("bibtex", {}).get("fields", {}) or {}
|
|
299
|
-
|
|
300
|
-
year = None
|
|
301
|
-
if bib_fields.get("year") and str(bib_fields.get("year")).isdigit():
|
|
302
|
-
year = str(bib_fields.get("year"))
|
|
303
|
-
month = _normalize_month_token(bib_fields.get("month"))
|
|
304
|
-
if not year or not month:
|
|
305
|
-
parsed_year, parsed_month = _parse_year_month(str(paper.get("publication_date") or ""))
|
|
306
|
-
year = year or parsed_year
|
|
307
|
-
month = month or parsed_month
|
|
308
|
-
|
|
309
|
-
year_label = year or "Unknown"
|
|
310
|
-
month_label = month or "Unknown"
|
|
311
|
-
paper["_year"] = year_label
|
|
312
|
-
paper["_month"] = month_label
|
|
313
|
-
add_index(by_year, _normalize_key(year_label), idx)
|
|
314
|
-
add_index(by_month, _normalize_key(month_label), idx)
|
|
315
|
-
if not is_pdf_only:
|
|
316
|
-
year_counts[year_label] = year_counts.get(year_label, 0) + 1
|
|
317
|
-
month_counts[month_label] = month_counts.get(month_label, 0) + 1
|
|
318
|
-
|
|
319
|
-
venue = _extract_venue(paper).strip()
|
|
320
|
-
paper["_venue"] = venue
|
|
321
|
-
if venue:
|
|
322
|
-
add_index(by_venue, _normalize_key(venue), idx)
|
|
323
|
-
if not is_pdf_only:
|
|
324
|
-
venue_counts[venue] = venue_counts.get(venue, 0) + 1
|
|
325
|
-
else:
|
|
326
|
-
add_index(by_venue, "unknown", idx)
|
|
327
|
-
if not is_pdf_only:
|
|
328
|
-
venue_counts["Unknown"] = venue_counts.get("Unknown", 0) + 1
|
|
329
|
-
|
|
330
|
-
authors = _extract_authors(paper)
|
|
331
|
-
paper["_authors"] = authors
|
|
332
|
-
for author in authors:
|
|
333
|
-
key = _normalize_key(author)
|
|
334
|
-
add_index(by_author, key, idx)
|
|
335
|
-
if not is_pdf_only:
|
|
336
|
-
author_counts[author] = author_counts.get(author, 0) + 1
|
|
337
|
-
|
|
338
|
-
tags = _extract_tags(paper)
|
|
339
|
-
paper["_tags"] = tags
|
|
340
|
-
for tag in tags:
|
|
341
|
-
key = _normalize_key(tag)
|
|
342
|
-
add_index(by_tag, key, idx)
|
|
343
|
-
if not is_pdf_only:
|
|
344
|
-
tag_counts[tag] = tag_counts.get(tag, 0) + 1
|
|
345
|
-
|
|
346
|
-
keywords = _extract_keywords(paper)
|
|
347
|
-
paper["_keywords"] = keywords
|
|
348
|
-
for keyword in keywords:
|
|
349
|
-
if not is_pdf_only:
|
|
350
|
-
keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1
|
|
351
|
-
|
|
352
|
-
template_tags = _available_templates(paper)
|
|
353
|
-
if not template_tags:
|
|
354
|
-
fallback_tag = paper.get("template_tag") or paper.get("prompt_template")
|
|
355
|
-
if fallback_tag:
|
|
356
|
-
template_tags = [str(fallback_tag)]
|
|
357
|
-
paper["_template_tags"] = template_tags
|
|
358
|
-
paper["_template_tags_lc"] = [tag.lower() for tag in template_tags]
|
|
359
|
-
paper["_has_summary"] = _has_summary(paper, template_tags)
|
|
360
|
-
if not is_pdf_only:
|
|
361
|
-
for tag in template_tags:
|
|
362
|
-
template_tag_counts[tag] = template_tag_counts.get(tag, 0) + 1
|
|
363
|
-
|
|
364
|
-
search_parts = [title, venue, " ".join(authors), " ".join(tags)]
|
|
365
|
-
paper["_search_lc"] = " ".join(part for part in search_parts if part).lower()
|
|
366
|
-
|
|
367
|
-
source_hash_str = str(source_hash) if source_hash else str(idx)
|
|
368
|
-
md_path = _resolve_source_md(paper, md_file_index)
|
|
369
|
-
if md_path is not None:
|
|
370
|
-
md_path_by_hash[source_hash_str] = md_path
|
|
371
|
-
base_key = md_path.with_suffix("").name.lower()
|
|
372
|
-
translations = translated_index.get(base_key, {})
|
|
373
|
-
if translations:
|
|
374
|
-
translated_md_by_hash[source_hash_str] = translations
|
|
375
|
-
pdf_path = _resolve_pdf(paper, pdf_file_index)
|
|
376
|
-
if pdf_path is not None:
|
|
377
|
-
pdf_path_by_hash[source_hash_str] = pdf_path
|
|
378
|
-
|
|
379
|
-
def year_sort_key(item: tuple[int, dict[str, Any]]) -> tuple[int, int, str]:
|
|
380
|
-
idx, paper = item
|
|
381
|
-
year_label = str(paper.get("_year") or "Unknown")
|
|
382
|
-
title_label = str(paper.get("paper_title") or "")
|
|
383
|
-
if year_label.isdigit():
|
|
384
|
-
return (0, -int(year_label), title_label.lower())
|
|
385
|
-
return (1, 0, title_label.lower())
|
|
386
|
-
|
|
387
|
-
ordered_ids = [idx for idx, _ in sorted(enumerate(papers), key=year_sort_key)]
|
|
388
|
-
|
|
389
|
-
stats_total = sum(1 for paper in papers if not paper.get("_is_pdf_only"))
|
|
390
|
-
stats = {
|
|
391
|
-
"total": stats_total,
|
|
392
|
-
"years": _sorted_counts(year_counts, numeric_desc=True),
|
|
393
|
-
"months": _sorted_month_counts(month_counts),
|
|
394
|
-
"tags": _sorted_counts(tag_counts),
|
|
395
|
-
"keywords": _sorted_counts(keyword_counts),
|
|
396
|
-
"authors": _sorted_counts(author_counts),
|
|
397
|
-
"venues": _sorted_counts(venue_counts),
|
|
398
|
-
}
|
|
399
|
-
|
|
400
|
-
template_tags = sorted(template_tag_counts.keys(), key=lambda item: item.lower())
|
|
401
|
-
|
|
402
|
-
return PaperIndex(
|
|
403
|
-
papers=papers,
|
|
404
|
-
id_by_hash=id_by_hash,
|
|
405
|
-
ordered_ids=ordered_ids,
|
|
406
|
-
by_tag=by_tag,
|
|
407
|
-
by_author=by_author,
|
|
408
|
-
by_year=by_year,
|
|
409
|
-
by_month=by_month,
|
|
410
|
-
by_venue=by_venue,
|
|
411
|
-
stats=stats,
|
|
412
|
-
md_path_by_hash=md_path_by_hash,
|
|
413
|
-
translated_md_by_hash=translated_md_by_hash,
|
|
414
|
-
pdf_path_by_hash=pdf_path_by_hash,
|
|
415
|
-
template_tags=template_tags,
|
|
416
|
-
)
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
def _sorted_counts(counts: dict[str, int], *, numeric_desc: bool = False) -> list[dict[str, Any]]:
|
|
420
|
-
items = list(counts.items())
|
|
421
|
-
if numeric_desc:
|
|
422
|
-
def key(item: tuple[str, int]) -> tuple[int, int]:
|
|
423
|
-
label, count = item
|
|
424
|
-
if label.isdigit():
|
|
425
|
-
return (0, -int(label))
|
|
426
|
-
return (1, 0)
|
|
427
|
-
items.sort(key=key)
|
|
428
|
-
else:
|
|
429
|
-
items.sort(key=lambda item: item[1], reverse=True)
|
|
430
|
-
return [{"label": k, "count": v} for k, v in items]
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
def _sorted_month_counts(counts: dict[str, int]) -> list[dict[str, Any]]:
|
|
434
|
-
def month_sort(label: str) -> int:
|
|
435
|
-
if label == "Unknown":
|
|
436
|
-
return 99
|
|
437
|
-
if label.isdigit():
|
|
438
|
-
return int(label)
|
|
439
|
-
return 98
|
|
440
|
-
|
|
441
|
-
items = sorted(counts.items(), key=lambda item: month_sort(item[0]))
|
|
442
|
-
return [{"label": k, "count": v} for k, v in items]
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
_TEMPLATE_INFER_IGNORE_KEYS = {
|
|
446
|
-
"source_path",
|
|
447
|
-
"source_hash",
|
|
448
|
-
"provider",
|
|
449
|
-
"model",
|
|
450
|
-
"extracted_at",
|
|
451
|
-
"truncation",
|
|
452
|
-
"output_language",
|
|
453
|
-
"prompt_template",
|
|
454
|
-
}
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
def _load_paper_inputs(paths: list[Path]) -> list[dict[str, Any]]:
|
|
458
|
-
inputs: list[dict[str, Any]] = []
|
|
459
|
-
for path in paths:
|
|
460
|
-
payload = json.loads(path.read_text(encoding="utf-8"))
|
|
461
|
-
if isinstance(payload, list):
|
|
462
|
-
raise ValueError(
|
|
463
|
-
f"Input JSON must be an object with template_tag and papers (got array): {path}"
|
|
464
|
-
)
|
|
465
|
-
if not isinstance(payload, dict):
|
|
466
|
-
raise ValueError(f"Input JSON must be an object: {path}")
|
|
467
|
-
papers = payload.get("papers")
|
|
468
|
-
if not isinstance(papers, list):
|
|
469
|
-
raise ValueError(f"Input JSON missing papers list: {path}")
|
|
470
|
-
template_tag = payload.get("template_tag")
|
|
471
|
-
if not template_tag:
|
|
472
|
-
template_tag = _infer_template_tag(papers, path)
|
|
473
|
-
inputs.append({"template_tag": str(template_tag), "papers": papers})
|
|
474
|
-
return inputs
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
def _infer_template_tag(papers: list[dict[str, Any]], path: Path) -> str:
|
|
478
|
-
prompt_tags = {
|
|
479
|
-
str(paper.get("prompt_template"))
|
|
480
|
-
for paper in papers
|
|
481
|
-
if isinstance(paper, dict) and paper.get("prompt_template")
|
|
482
|
-
}
|
|
483
|
-
if len(prompt_tags) == 1:
|
|
484
|
-
return prompt_tags.pop()
|
|
485
|
-
|
|
486
|
-
sample = next((paper for paper in papers if isinstance(paper, dict)), None)
|
|
487
|
-
if sample is None:
|
|
488
|
-
raise ValueError(f"Input JSON has no paper objects to infer template_tag: {path}")
|
|
489
|
-
|
|
490
|
-
paper_keys = {key for key in sample.keys() if key not in _TEMPLATE_INFER_IGNORE_KEYS}
|
|
491
|
-
if not paper_keys:
|
|
492
|
-
raise ValueError(f"Input JSON papers have no keys to infer template_tag: {path}")
|
|
493
|
-
|
|
494
|
-
best_tag = None
|
|
495
|
-
best_score = -1
|
|
496
|
-
for name in list_template_names_in_registry_order():
|
|
497
|
-
schema = load_schema_for_template(name)
|
|
498
|
-
schema_keys = set((schema.get("properties") or {}).keys())
|
|
499
|
-
score = len(paper_keys & schema_keys)
|
|
500
|
-
if score > best_score:
|
|
501
|
-
best_score = score
|
|
502
|
-
best_tag = name
|
|
503
|
-
elif score == best_score:
|
|
504
|
-
if best_tag != "simple" and name == "simple":
|
|
505
|
-
best_tag = name
|
|
506
|
-
|
|
507
|
-
if not best_tag:
|
|
508
|
-
raise ValueError(f"Unable to infer template_tag from input JSON: {path}")
|
|
509
|
-
return best_tag
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
def _build_cache_meta(
|
|
513
|
-
db_paths: list[Path],
|
|
514
|
-
bibtex_path: Path | None,
|
|
515
|
-
pdf_roots_meta: list[dict[str, Any]] | None = None,
|
|
516
|
-
) -> dict[str, Any]:
|
|
517
|
-
def file_meta(path: Path) -> dict[str, Any]:
|
|
518
|
-
try:
|
|
519
|
-
stats = path.stat()
|
|
520
|
-
except OSError as exc:
|
|
521
|
-
raise ValueError(f"Failed to read input metadata for cache: {path}") from exc
|
|
522
|
-
return {"path": str(path), "mtime": stats.st_mtime, "size": stats.st_size}
|
|
523
|
-
|
|
524
|
-
meta = {
|
|
525
|
-
"version": 1,
|
|
526
|
-
"inputs": [file_meta(path) for path in db_paths],
|
|
527
|
-
"bibtex": file_meta(bibtex_path) if bibtex_path else None,
|
|
528
|
-
}
|
|
529
|
-
if pdf_roots_meta is not None:
|
|
530
|
-
meta["pdf_roots"] = pdf_roots_meta
|
|
531
|
-
return meta
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
def _load_cached_papers(cache_dir: Path, meta: dict[str, Any]) -> list[dict[str, Any]] | None:
|
|
535
|
-
meta_path = cache_dir / "db_serve_cache.meta.json"
|
|
536
|
-
data_path = cache_dir / "db_serve_cache.papers.json"
|
|
537
|
-
if not meta_path.exists() or not data_path.exists():
|
|
538
|
-
return None
|
|
539
|
-
try:
|
|
540
|
-
cached_meta = json.loads(meta_path.read_text(encoding="utf-8"))
|
|
541
|
-
if cached_meta != meta:
|
|
542
|
-
return None
|
|
543
|
-
cached_papers = json.loads(data_path.read_text(encoding="utf-8"))
|
|
544
|
-
if not isinstance(cached_papers, list):
|
|
545
|
-
return None
|
|
546
|
-
return cached_papers
|
|
547
|
-
except Exception:
|
|
548
|
-
return None
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
def _write_cached_papers(cache_dir: Path, meta: dict[str, Any], papers: list[dict[str, Any]]) -> None:
|
|
552
|
-
meta_path = cache_dir / "db_serve_cache.meta.json"
|
|
553
|
-
data_path = cache_dir / "db_serve_cache.papers.json"
|
|
554
|
-
meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
555
|
-
data_path.write_text(json.dumps(papers, ensure_ascii=False, indent=2), encoding="utf-8")
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
def _extract_year_for_matching(paper: dict[str, Any]) -> str | None:
|
|
559
|
-
if isinstance(paper.get("bibtex"), dict):
|
|
560
|
-
fields = paper.get("bibtex", {}).get("fields", {}) or {}
|
|
561
|
-
year = fields.get("year")
|
|
562
|
-
if year and str(year).isdigit():
|
|
563
|
-
return str(year)
|
|
564
|
-
parsed_year, _ = _parse_year_month(str(paper.get("publication_date") or ""))
|
|
565
|
-
return parsed_year
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
def _prepare_paper_matching_fields(paper: dict[str, Any]) -> None:
|
|
569
|
-
if "_authors" not in paper:
|
|
570
|
-
paper["_authors"] = _extract_authors(paper)
|
|
571
|
-
if "_year" not in paper:
|
|
572
|
-
paper["_year"] = _extract_year_for_matching(paper) or ""
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
def _build_pdf_only_entries(
|
|
576
|
-
papers: list[dict[str, Any]],
|
|
577
|
-
pdf_paths: list[Path],
|
|
578
|
-
pdf_index: dict[str, list[Path]],
|
|
579
|
-
) -> list[dict[str, Any]]:
|
|
580
|
-
matched: set[Path] = set()
|
|
581
|
-
for paper in papers:
|
|
582
|
-
_prepare_paper_matching_fields(paper)
|
|
583
|
-
pdf_path = _resolve_pdf(paper, pdf_index)
|
|
584
|
-
if pdf_path:
|
|
585
|
-
matched.add(pdf_path.resolve())
|
|
586
|
-
|
|
587
|
-
entries: list[dict[str, Any]] = []
|
|
588
|
-
for path in pdf_paths:
|
|
589
|
-
resolved = path.resolve()
|
|
590
|
-
if resolved in matched:
|
|
591
|
-
continue
|
|
592
|
-
title = _read_pdf_metadata_title(resolved) or _extract_title_from_filename(resolved.name)
|
|
593
|
-
if not title:
|
|
594
|
-
title = resolved.stem
|
|
595
|
-
year_hint, author_hint = _extract_year_author_from_filename(resolved.name)
|
|
596
|
-
entry: dict[str, Any] = {
|
|
597
|
-
"paper_title": title,
|
|
598
|
-
"paper_authors": [author_hint] if author_hint else [],
|
|
599
|
-
"publication_date": year_hint or "",
|
|
600
|
-
"source_hash": stable_hash(str(resolved)),
|
|
601
|
-
"source_path": str(resolved),
|
|
602
|
-
"_is_pdf_only": True,
|
|
603
|
-
}
|
|
604
|
-
entries.append(entry)
|
|
605
|
-
return entries
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
def _load_or_merge_papers(
|
|
609
|
-
db_paths: list[Path],
|
|
610
|
-
bibtex_path: Path | None,
|
|
611
|
-
cache_dir: Path | None,
|
|
612
|
-
use_cache: bool,
|
|
613
|
-
pdf_roots: list[Path] | None = None,
|
|
614
|
-
) -> list[dict[str, Any]]:
|
|
615
|
-
cache_meta = None
|
|
616
|
-
pdf_roots = pdf_roots or []
|
|
617
|
-
pdf_paths: list[Path] = []
|
|
618
|
-
pdf_roots_meta: list[dict[str, Any]] | None = None
|
|
619
|
-
if pdf_roots:
|
|
620
|
-
pdf_paths, pdf_roots_meta = _scan_pdf_roots(pdf_roots)
|
|
621
|
-
if cache_dir and use_cache:
|
|
622
|
-
cache_dir.mkdir(parents=True, exist_ok=True)
|
|
623
|
-
cache_meta = _build_cache_meta(db_paths, bibtex_path, pdf_roots_meta)
|
|
624
|
-
cached = _load_cached_papers(cache_dir, cache_meta)
|
|
625
|
-
if cached is not None:
|
|
626
|
-
return cached
|
|
627
|
-
|
|
628
|
-
inputs = _load_paper_inputs(db_paths)
|
|
629
|
-
if bibtex_path is not None:
|
|
630
|
-
for bundle in inputs:
|
|
631
|
-
enrich_with_bibtex(bundle["papers"], bibtex_path)
|
|
632
|
-
papers = _merge_paper_inputs(inputs)
|
|
633
|
-
if pdf_paths:
|
|
634
|
-
pdf_index = _build_file_index_from_paths(pdf_paths, suffixes={".pdf"})
|
|
635
|
-
papers.extend(_build_pdf_only_entries(papers, pdf_paths, pdf_index))
|
|
636
|
-
|
|
637
|
-
if cache_dir and use_cache and cache_meta is not None:
|
|
638
|
-
_write_cached_papers(cache_dir, cache_meta, papers)
|
|
639
|
-
return papers
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
def _md_renderer() -> MarkdownIt:
|
|
643
|
-
md = MarkdownIt("commonmark", {"html": False, "linkify": True})
|
|
644
|
-
md.use(footnote)
|
|
645
|
-
md.enable("table")
|
|
646
|
-
return md
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
def _strip_paragraph_wrapped_tables(text: str) -> str:
|
|
650
|
-
lines = text.splitlines()
|
|
651
|
-
for idx, line in enumerate(lines):
|
|
652
|
-
line = re.sub(r"^\s*<p>\s*\|", "|", line)
|
|
653
|
-
line = re.sub(r"\|\s*</p>\s*$", "|", line)
|
|
654
|
-
lines[idx] = line
|
|
655
|
-
return "\n".join(lines)
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
def _normalize_markdown_images(text: str) -> str:
|
|
659
|
-
lines = text.splitlines()
|
|
660
|
-
out: list[str] = []
|
|
661
|
-
in_fence = False
|
|
662
|
-
fence_char = ""
|
|
663
|
-
fence_len = 0
|
|
664
|
-
img_re = re.compile(r"!\[[^\]]*\]\((?:[^)\\]|\\.)*\)")
|
|
665
|
-
list_re = re.compile(r"^\s{0,3}(-|\*|\+|\d{1,9}\.)\s+")
|
|
666
|
-
|
|
667
|
-
for line in lines:
|
|
668
|
-
stripped = line.lstrip()
|
|
669
|
-
if stripped.startswith(("```", "~~~")):
|
|
670
|
-
run_len = 0
|
|
671
|
-
while run_len < len(stripped) and stripped[run_len] == stripped[0]:
|
|
672
|
-
run_len += 1
|
|
673
|
-
if not in_fence:
|
|
674
|
-
in_fence = True
|
|
675
|
-
fence_char = stripped[0]
|
|
676
|
-
fence_len = run_len
|
|
677
|
-
elif stripped[0] == fence_char and run_len >= fence_len:
|
|
678
|
-
in_fence = False
|
|
679
|
-
out.append(line)
|
|
680
|
-
continue
|
|
681
|
-
if in_fence:
|
|
682
|
-
out.append(line)
|
|
683
|
-
continue
|
|
684
|
-
match = img_re.search(line)
|
|
685
|
-
if not match:
|
|
686
|
-
out.append(line)
|
|
687
|
-
continue
|
|
688
|
-
if list_re.match(line) or (line.lstrip().startswith("|") and line.count("|") >= 2):
|
|
689
|
-
out.append(line)
|
|
690
|
-
continue
|
|
691
|
-
prefix = line[:match.start()]
|
|
692
|
-
if prefix.strip():
|
|
693
|
-
out.append(prefix.rstrip())
|
|
694
|
-
out.append("")
|
|
695
|
-
out.append(line[match.start():].lstrip())
|
|
696
|
-
continue
|
|
697
|
-
if out and out[-1].strip():
|
|
698
|
-
out.append("")
|
|
699
|
-
out.append(line)
|
|
700
|
-
return "\n".join(out)
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
def _normalize_merge_title(value: str | None) -> str | None:
|
|
704
|
-
if not value:
|
|
705
|
-
return None
|
|
706
|
-
return str(value).replace("{", "").replace("}", "").strip().lower()
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
def _extract_bibtex_title(paper: dict[str, Any]) -> str | None:
|
|
710
|
-
if not isinstance(paper.get("bibtex"), dict):
|
|
711
|
-
return None
|
|
712
|
-
fields = paper.get("bibtex", {}).get("fields", {}) or {}
|
|
713
|
-
return _normalize_merge_title(fields.get("title"))
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
def _extract_paper_title(paper: dict[str, Any]) -> str | None:
|
|
717
|
-
return _normalize_merge_title(paper.get("paper_title"))
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
def _available_templates(paper: dict[str, Any]) -> list[str]:
|
|
721
|
-
templates = paper.get("templates")
|
|
722
|
-
if not isinstance(templates, dict):
|
|
723
|
-
return []
|
|
724
|
-
order = paper.get("template_order") or list(templates.keys())
|
|
725
|
-
seen: set[str] = set()
|
|
726
|
-
available: list[str] = []
|
|
727
|
-
for tag in order:
|
|
728
|
-
if tag in templates and tag not in seen:
|
|
729
|
-
available.append(tag)
|
|
730
|
-
seen.add(tag)
|
|
731
|
-
for tag in templates:
|
|
732
|
-
if tag not in seen:
|
|
733
|
-
available.append(tag)
|
|
734
|
-
seen.add(tag)
|
|
735
|
-
return available
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
def _select_template_tag(
    paper: dict[str, Any], requested: str | None
) -> tuple[str | None, list[str]]:
    """Pick the template tag to render for *paper*.

    Returns ``(selected, available)``. *requested* wins when it is among
    the available tags; otherwise the paper's ``default_template`` is used,
    falling back to "simple" (if present) or the first available tag.
    """
    available = _available_templates(paper)
    if not available:
        return None, []
    default_tag = paper.get("default_template")
    if not default_tag:
        default_tag = "simple" if "simple" in available else available[0]
    if requested in available:
        return requested, available
    return default_tag, available
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
def _titles_match(group: dict[str, Any], paper: dict[str, Any], *, threshold: float) -> bool:
    """Decide whether *paper* belongs to *group* by title similarity.

    BibTeX titles take precedence: when both sides carry them, the match is
    decided on BibTeX titles alone (no fallback to paper titles).
    """
    bib_title = _extract_bibtex_title(paper)
    group_bib = group.get("_merge_bibtex_titles") or set()
    if bib_title and group_bib:
        return any(
            _title_similarity(bib_title, known) >= threshold for known in group_bib
        )

    paper_title = _extract_paper_title(paper)
    group_titles = group.get("_merge_paper_titles") or set()
    if paper_title and group_titles:
        return any(
            _title_similarity(paper_title, known) >= threshold
            for known in group_titles
        )
    return False
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
def _add_merge_titles(group: dict[str, Any], paper: dict[str, Any]) -> None:
    """Record *paper*'s normalized titles on *group* for future matching."""
    for key, title in (
        ("_merge_bibtex_titles", _extract_bibtex_title(paper)),
        ("_merge_paper_titles", _extract_paper_title(paper)),
    ):
        if title:
            group.setdefault(key, set()).add(title)
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Merge per-template paper bundles into one group per distinct paper.

    Papers from different template bundles are considered the same paper
    when their normalized titles are near-identical (similarity >= 0.95).
    Candidate groups are found via exact-title and 5-char-prefix indexes
    to avoid an O(n^2) all-pairs comparison.
    """
    merged: list[dict[str, Any]] = []
    threshold = 0.95  # similarity required for two titles to be "the same paper"
    prefix_len = 5
    # Indexes from normalized title (or its prefix) to group positions in `merged`.
    bibtex_exact: dict[str, set[int]] = {}
    bibtex_prefix: dict[str, set[int]] = {}
    paper_exact: dict[str, set[int]] = {}
    paper_prefix: dict[str, set[int]] = {}

    def prefix_key(value: str) -> str:
        return value[:prefix_len] if len(value) >= prefix_len else value

    def add_index(
        value: str,
        exact_index: dict[str, set[int]],
        prefix_index: dict[str, set[int]],
        idx: int,
    ) -> None:
        exact_index.setdefault(value, set()).add(idx)
        prefix_index.setdefault(prefix_key(value), set()).add(idx)

    def candidate_ids(bib_title: str | None, paper_title: str | None) -> list[int]:
        # Union of every group whose indexed titles could plausibly match.
        ids: set[int] = set()
        if bib_title:
            ids |= bibtex_exact.get(bib_title, set())
            ids |= bibtex_prefix.get(prefix_key(bib_title), set())
        if paper_title:
            ids |= paper_exact.get(paper_title, set())
            ids |= paper_prefix.get(prefix_key(paper_title), set())
        return sorted(ids)

    for bundle in inputs:
        template_tag = bundle.get("template_tag")
        papers = bundle.get("papers") or []
        for paper in papers:
            if not isinstance(paper, dict):
                raise ValueError("Input papers must be objects")
            bib_title = _extract_bibtex_title(paper)
            paper_title = _extract_paper_title(paper)
            match = None
            match_idx = None
            for idx in candidate_ids(bib_title, paper_title):
                candidate = merged[idx]
                if _titles_match(candidate, paper, threshold=threshold):
                    match = candidate
                    match_idx = idx
                    break
            if match is None:
                # First time we see this paper: start a new group.
                group = {
                    "templates": {template_tag: paper},
                    "template_order": [template_tag],
                }
                _add_merge_titles(group, paper)
                merged.append(group)
                group_idx = len(merged) - 1
                if bib_title:
                    add_index(bib_title, bibtex_exact, bibtex_prefix, group_idx)
                if paper_title:
                    add_index(paper_title, paper_exact, paper_prefix, group_idx)
            else:
                # Known paper: attach this template's rendition to the group.
                templates = match.setdefault("templates", {})
                templates[template_tag] = paper
                order = match.setdefault("template_order", [])
                if template_tag not in order:
                    order.append(template_tag)
                _add_merge_titles(match, paper)
                if match_idx is not None:
                    # Index this rendition's titles too, so later variants match.
                    if bib_title:
                        add_index(bib_title, bibtex_exact, bibtex_prefix, match_idx)
                    if paper_title:
                        add_index(paper_title, paper_exact, paper_prefix, match_idx)

    for group in merged:
        templates = group.get("templates") or {}
        order = group.get("template_order") or list(templates.keys())
        # Prefer the "simple" template as each group's top-level view.
        default_tag = "simple" if "simple" in order else (order[0] if order else None)
        group["default_template"] = default_tag
        if default_tag and default_tag in templates:
            base = templates[default_tag]
            for key, value in base.items():
                group[key] = value
        # Drop the bookkeeping sets; they are not part of the output schema.
        group.pop("_merge_bibtex_titles", None)
        group.pop("_merge_paper_titles", None)
    return merged
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
def _render_markdown_with_math_placeholders(md: MarkdownIt, text: str) -> str:
    """Render markdown while protecting math, raw <img> and <table> HTML.

    Each protected span is swapped for a placeholder token before
    ``md.render`` runs, then substituted back (sanitized or escaped)
    into the rendered HTML.
    """
    text = _strip_paragraph_wrapped_tables(text)
    rendered, table_placeholders = _extract_html_table_placeholders(text)
    rendered, img_placeholders = _extract_html_img_placeholders(rendered)
    rendered, placeholders = _extract_math_placeholders(rendered)
    html_out = md.render(rendered)
    # Math is re-inserted escaped so client-side math rendering sees raw TeX.
    for key, value in placeholders.items():
        html_out = html_out.replace(key, html.escape(value))
    for key, value in img_placeholders.items():
        # A lambda replacement avoids re.sub interpreting backslashes in `value`.
        html_out = re.sub(rf"<p>\s*{re.escape(key)}\s*</p>", lambda _: value, html_out)
        html_out = html_out.replace(key, value)
    for key, value in table_placeholders.items():
        safe_html = _sanitize_table_html(value)
        html_out = re.sub(rf"<p>\s*{re.escape(key)}\s*</p>", lambda _: safe_html, html_out)
    return html_out
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
def _extract_math_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Replace ``$...$`` / ``$$...$$`` math spans with placeholder tokens.

    Fenced code blocks and inline code spans are skipped so dollar signs
    inside code are left alone. Returns the rewritten text plus a
    placeholder -> original-math mapping.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    # Fence state: inside a ``` / ~~~ block, plus its opening char and run length.
    in_fence = False
    fence_char = ""
    fence_len = 0
    # Non-zero while inside an inline `code` span (length of its delimiter run).
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        key = f"@@MATH_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        if inline_delim_len == 0 and at_line_start:
            # At a line start: check whether this line opens/closes a code fence.
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            # CommonMark allows up to 3 leading spaces before a fence.
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            # Closing fence must match char and be at least as long.
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        out.append(line)
                        idx = line_end
                        continue

        if in_fence:
            # Copy fenced-code content verbatim, char by char.
            out.append(text[idx])
            idx += 1
            continue

        if inline_delim_len > 0:
            # Copy inline-code content until the matching backtick run.
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        ch = text[idx]
        if ch == "`":
            # A backtick run opens an inline code span.
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        # Block math: $$...$$ (can span lines)
        if text.startswith("$$", idx) and (idx == 0 or text[idx - 1] != "\\"):
            search_from = idx + 2
            end = text.find("$$", search_from)
            while end != -1 and text[end - 1] == "\\":
                # Skip backslash-escaped closers.
                search_from = end + 2
                end = text.find("$$", search_from)
            if end != -1:
                out.append(next_placeholder(text[idx : end + 2]))
                idx = end + 2
                continue

        # Inline math: $...$ (single-line)
        if ch == "$" and not text.startswith("$$", idx) and (idx == 0 or text[idx - 1] != "\\"):
            line_end = text.find("\n", idx + 1)
            if line_end == -1:
                line_end = len(text)
            search_from = idx + 1
            end = text.find("$", search_from, line_end)
            while end != -1 and text[end - 1] == "\\":
                search_from = end + 1
                end = text.find("$", search_from, line_end)
            if end != -1:
                out.append(next_placeholder(text[idx : end + 1]))
                idx = end + 1
                continue

        out.append(ch)
        idx += 1

    return "".join(out), placeholders
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
class _TableSanitizer(HTMLParser):
    """HTML parser that rebuilds table markup from a small tag whitelist.

    Tags outside the allowed table set are dropped entirely; text content
    is re-escaped; only colspan/rowspan/align survive on cells. An
    internal stack auto-closes tags the input left open.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._out: list[str] = []  # sanitized HTML fragments, joined at the end
        self._stack: list[str] = []  # currently-open (non-void) tags

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        t = tag.lower()
        # Whitelist: table structure tags plus <br> for in-cell line breaks.
        if t not in {
            "table",
            "thead",
            "tbody",
            "tfoot",
            "tr",
            "th",
            "td",
            "caption",
            "colgroup",
            "col",
            "br",
        }:
            return

        allowed: dict[str, str] = {}
        for name, value in attrs:
            if value is None:
                continue
            n = name.lower()
            v = value.strip()
            # Only numeric span attributes and a sane align value survive.
            if t in {"td", "th"} and n in {"colspan", "rowspan"} and v.isdigit():
                allowed[n] = v
            elif t in {"td", "th"} and n == "align" and v.lower() in {"left", "right", "center"}:
                allowed[n] = v.lower()

        attr_text = "".join(f' {k}="{html.escape(v, quote=True)}"' for k, v in allowed.items())
        self._out.append(f"<{t}{attr_text}>")
        if t not in {"br", "col"}:
            # Void elements never go on the stack (no closing tag expected).
            self._stack.append(t)

    def handle_endtag(self, tag: str) -> None:
        t = tag.lower()
        if t not in self._stack:
            return
        # Pop (and close) everything up to and including the matching tag.
        while self._stack:
            popped = self._stack.pop()
            self._out.append(f"</{popped}>")
            if popped == t:
                break

    def handle_data(self, data: str) -> None:
        # All text content is re-escaped on the way out.
        self._out.append(html.escape(data))

    def handle_entityref(self, name: str) -> None:
        # Pass named entities through unchanged (only reached if charref
        # conversion were disabled).
        self._out.append(f"&{name};")

    def handle_charref(self, name: str) -> None:
        self._out.append(f"&#{name};")

    def close(self) -> None:
        super().close()
        # Close any tags the input left open.
        while self._stack:
            self._out.append(f"</{self._stack.pop()}>")

    def get_html(self) -> str:
        """Return the sanitized HTML accumulated so far."""
        return "".join(self._out)
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
def _sanitize_table_html(raw: str) -> str:
    """Sanitize raw <table> HTML, falling back to an escaped code block."""
    sanitizer = _TableSanitizer()
    try:
        sanitizer.feed(raw)
        sanitizer.close()
    except Exception:
        # Malformed markup: show it verbatim rather than render it.
        return f"<pre><code>{html.escape(raw)}</code></pre>"
    return sanitizer.get_html()
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
def _sanitize_img_html(raw: str) -> str | None:
|
|
1056
|
-
attrs = {}
|
|
1057
|
-
for match in re.finditer(r"(\w+)\s*=\s*(\"[^\"]*\"|'[^']*'|[^\s>]+)", raw):
|
|
1058
|
-
name = match.group(1).lower()
|
|
1059
|
-
value = match.group(2).strip()
|
|
1060
|
-
if value and value[0] in {"\"", "'"} and value[-1] == value[0]:
|
|
1061
|
-
value = value[1:-1]
|
|
1062
|
-
attrs[name] = value
|
|
1063
|
-
|
|
1064
|
-
src = attrs.get("src", "")
|
|
1065
|
-
src_lower = src.lower()
|
|
1066
|
-
if not src_lower.startswith("data:image/") or ";base64," not in src_lower:
|
|
1067
|
-
return None
|
|
1068
|
-
|
|
1069
|
-
alt = attrs.get("alt", "")
|
|
1070
|
-
alt_attr = f' alt="{html.escape(alt, quote=True)}"' if alt else ""
|
|
1071
|
-
return f'<img src="{html.escape(src, quote=True)}"{alt_attr} />'
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
def _extract_html_img_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Replace raw ``<img ...>`` tags with placeholder tokens.

    Only tags that survive ``_sanitize_img_html`` (base64 data-URI sources)
    are extracted; fenced code blocks and inline code spans are skipped.
    Returns the rewritten text plus a placeholder -> sanitized-HTML mapping.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    # Fence state: inside a ``` / ~~~ block, plus its opening char and run length.
    in_fence = False
    fence_char = ""
    fence_len = 0
    # Non-zero while inside an inline `code` span (length of its delimiter run).
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        key = f"@@HTML_IMG_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    lower = text.lower()  # case-insensitive "<img" detection
    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        if inline_delim_len == 0 and at_line_start:
            # At a line start: check whether this line opens/closes a code fence.
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            # Closing fence must match char and be at least as long.
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        out.append(line)
                        idx = line_end
                        continue

        if in_fence:
            # Copy fenced-code content verbatim.
            out.append(text[idx])
            idx += 1
            continue

        if inline_delim_len > 0:
            # Copy inline-code content until the matching backtick run.
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        if text[idx] == "`":
            # A backtick run opens an inline code span.
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        if lower.startswith("<img", idx):
            end = text.find(">", idx)
            if end != -1:
                raw = text[idx : end + 1]
                safe_html = _sanitize_img_html(raw)
                # Unsanitizable tags fall through and are copied verbatim.
                if safe_html:
                    out.append(next_placeholder(safe_html))
                    idx = end + 1
                    continue

        out.append(text[idx])
        idx += 1

    return "".join(out), placeholders
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
def _extract_html_table_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Replace raw ``<table>...</table>`` spans with placeholder tokens.

    Fenced code blocks and inline code spans are skipped. Each placeholder
    is emitted on its own blank-line-separated line so the markdown parser
    treats it as a standalone paragraph. Returns the rewritten text plus a
    placeholder -> raw-table-HTML mapping (sanitized later).
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    # Fence state: inside a ``` / ~~~ block, plus its opening char and run length.
    in_fence = False
    fence_char = ""
    fence_len = 0
    # Non-zero while inside an inline `code` span (length of its delimiter run).
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        key = f"@@HTML_TABLE_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    lower = text.lower()  # case-insensitive "<table"/"</table>" detection
    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        if inline_delim_len == 0 and at_line_start:
            # At a line start: check whether this line opens/closes a code fence.
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            # Closing fence must match char and be at least as long.
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        out.append(line)
                        idx = line_end
                        continue

        if in_fence:
            # Copy fenced-code content verbatim.
            out.append(text[idx])
            idx += 1
            continue

        if inline_delim_len > 0:
            # Copy inline-code content until the matching backtick run.
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        if text[idx] == "`":
            # A backtick run opens an inline code span.
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        if lower.startswith("<table", idx):
            end = lower.find("</table>", idx)
            if end != -1:
                end += len("</table>")
                raw = text[idx:end]
                key = next_placeholder(raw)
                # Surround the placeholder with blank lines so it renders
                # as its own paragraph.
                if out and not out[-1].endswith("\n"):
                    out.append("\n\n")
                out.append(key)
                out.append("\n\n")
                idx = end
                continue

        out.append(text[idx])
        idx += 1

    return "".join(out), placeholders
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
def _render_paper_markdown(
    paper: dict[str, Any],
    fallback_language: str,
    *,
    template_tag: str | None = None,
) -> tuple[str, str, str | None]:
    """Render *paper* to markdown via its selected (or requested) template.

    Returns ``(markdown, template_name, warning)``; *warning* is set when
    the default template had to be used as a fallback.
    """
    selected_tag, _ = _select_template_tag(paper, template_tag)
    selected_paper = paper
    if selected_tag:
        # Use the per-template paper payload when one exists for the tag.
        selected_paper = (paper.get("templates") or {}).get(selected_tag, paper)

    template_name = selected_tag or selected_paper.get("prompt_template")
    warning = None
    if template_name:
        try:
            template = load_render_template(str(template_name))
        except Exception:
            # Named template unavailable: fall back rather than fail the page.
            template = load_default_template()
            warning = "Rendered using default template (missing template)."
            template_name = "default_paper"
    else:
        template = load_default_template()
        warning = "Rendered using default template (no template specified)."
        template_name = "default_paper"

    context = dict(selected_paper)
    if not context.get("output_language"):
        context["output_language"] = fallback_language
    return template.render(**context), str(template_name), warning
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
# Heuristic constants for title/filename matching (used by the
# _normalize_title_key / _title_prefix_key / _adaptive_similarity_match
# family of helpers below).
_TITLE_PREFIX_LEN = 16  # chars of the compacted title used as a prefix index key
_TITLE_MIN_CHARS = 24  # min length of the shorter title for overlap matching
_TITLE_MIN_TOKENS = 4  # min token count for overlap/prefix matching
_AUTHOR_YEAR_MIN_SIMILARITY = 0.8  # presumably the floor for author+year matching elsewhere — not used in this chunk
_LEADING_NUMERIC_MAX_LEN = 2  # strip leading numeric tokens of at most this many digits
_SIMILARITY_START = 0.95  # adaptive similarity search: initial threshold
_SIMILARITY_STEP = 0.05  # adaptive similarity search: decrement per relaxation step
_SIMILARITY_MAX_STEPS = 10  # adaptive similarity search: max relaxation/bisection steps
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
def _normalize_title_key(title: str) -> str:
|
|
1289
|
-
value = unicodedata.normalize("NFKD", title)
|
|
1290
|
-
greek_map = {
|
|
1291
|
-
"α": "alpha",
|
|
1292
|
-
"β": "beta",
|
|
1293
|
-
"γ": "gamma",
|
|
1294
|
-
"δ": "delta",
|
|
1295
|
-
"ε": "epsilon",
|
|
1296
|
-
"ζ": "zeta",
|
|
1297
|
-
"η": "eta",
|
|
1298
|
-
"θ": "theta",
|
|
1299
|
-
"ι": "iota",
|
|
1300
|
-
"κ": "kappa",
|
|
1301
|
-
"λ": "lambda",
|
|
1302
|
-
"μ": "mu",
|
|
1303
|
-
"ν": "nu",
|
|
1304
|
-
"ξ": "xi",
|
|
1305
|
-
"ο": "omicron",
|
|
1306
|
-
"π": "pi",
|
|
1307
|
-
"ρ": "rho",
|
|
1308
|
-
"σ": "sigma",
|
|
1309
|
-
"τ": "tau",
|
|
1310
|
-
"υ": "upsilon",
|
|
1311
|
-
"φ": "phi",
|
|
1312
|
-
"χ": "chi",
|
|
1313
|
-
"ψ": "psi",
|
|
1314
|
-
"ω": "omega",
|
|
1315
|
-
}
|
|
1316
|
-
for char, name in greek_map.items():
|
|
1317
|
-
value = value.replace(char, f" {name} ")
|
|
1318
|
-
value = re.sub(
|
|
1319
|
-
r"\\(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\b",
|
|
1320
|
-
r" \1 ",
|
|
1321
|
-
value,
|
|
1322
|
-
flags=re.IGNORECASE,
|
|
1323
|
-
)
|
|
1324
|
-
value = value.replace("{", "").replace("}", "")
|
|
1325
|
-
value = value.replace("_", " ")
|
|
1326
|
-
value = re.sub(r"([a-z])([0-9])", r"\1 \2", value, flags=re.IGNORECASE)
|
|
1327
|
-
value = re.sub(r"([0-9])([a-z])", r"\1 \2", value, flags=re.IGNORECASE)
|
|
1328
|
-
value = re.sub(r"[^a-z0-9]+", " ", value.lower())
|
|
1329
|
-
value = re.sub(r"\s+", " ", value).strip()
|
|
1330
|
-
tokens = value.split()
|
|
1331
|
-
if not tokens:
|
|
1332
|
-
return ""
|
|
1333
|
-
merged: list[str] = []
|
|
1334
|
-
idx = 0
|
|
1335
|
-
while idx < len(tokens):
|
|
1336
|
-
token = tokens[idx]
|
|
1337
|
-
if len(token) == 1 and idx + 1 < len(tokens):
|
|
1338
|
-
merged.append(token + tokens[idx + 1])
|
|
1339
|
-
idx += 2
|
|
1340
|
-
continue
|
|
1341
|
-
merged.append(token)
|
|
1342
|
-
idx += 1
|
|
1343
|
-
return " ".join(merged)
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
def _compact_title_key(title_key: str) -> str:
|
|
1347
|
-
return title_key.replace(" ", "")
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
def _strip_leading_numeric_tokens(title_key: str) -> str:
    """Drop short numeric tokens (section numbers etc.) from the key's front.

    Only tokens of at most ``_LEADING_NUMERIC_MAX_LEN`` digits are dropped;
    returns *title_key* unchanged when none lead the string.
    """
    tokens = title_key.split()
    skip = 0
    for token in tokens:
        if not (token.isdigit() and len(token) <= _LEADING_NUMERIC_MAX_LEN):
            break
        skip += 1
    if skip == 0:
        return title_key
    return " ".join(tokens[skip:])
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
def _strip_pdf_hash_suffix(name: str) -> str:
|
|
1365
|
-
return re.sub(r"(?i)(\.pdf)(?:-[0-9a-f\-]{8,})$", r"\1", name)
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
def _extract_title_from_filename(name: str) -> str:
|
|
1369
|
-
base = name
|
|
1370
|
-
lower = base.lower()
|
|
1371
|
-
if lower.endswith(".md"):
|
|
1372
|
-
base = base[:-3]
|
|
1373
|
-
lower = base.lower()
|
|
1374
|
-
if ".pdf-" in lower:
|
|
1375
|
-
base = _strip_pdf_hash_suffix(base)
|
|
1376
|
-
lower = base.lower()
|
|
1377
|
-
if lower.endswith(".pdf"):
|
|
1378
|
-
base = base[:-4]
|
|
1379
|
-
base = base.replace("_", " ").strip()
|
|
1380
|
-
match = re.match(r"\s*\d{4}\s*-\s*(.+)$", base)
|
|
1381
|
-
if match:
|
|
1382
|
-
return match.group(1).strip()
|
|
1383
|
-
match = re.match(r"\s*.+?\s*-\s*\d{4}\s*-\s*(.+)$", base)
|
|
1384
|
-
if match:
|
|
1385
|
-
return match.group(1).strip()
|
|
1386
|
-
return base.strip()
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
def _clean_pdf_metadata_title(value: str | None, path: Path) -> str | None:
|
|
1390
|
-
if not value:
|
|
1391
|
-
return None
|
|
1392
|
-
text = str(value).replace("\x00", "").strip()
|
|
1393
|
-
if not text:
|
|
1394
|
-
return None
|
|
1395
|
-
text = re.sub(r"(?i)^microsoft\\s+word\\s*-\\s*", "", text)
|
|
1396
|
-
text = re.sub(r"(?i)^pdf\\s*-\\s*", "", text)
|
|
1397
|
-
text = re.sub(r"(?i)^untitled\\b", "", text).strip()
|
|
1398
|
-
if text.lower().endswith(".pdf"):
|
|
1399
|
-
text = text[:-4].strip()
|
|
1400
|
-
if len(text) < 3:
|
|
1401
|
-
return None
|
|
1402
|
-
stem = path.stem.strip()
|
|
1403
|
-
if stem and text.lower() == stem.lower():
|
|
1404
|
-
return None
|
|
1405
|
-
return text
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
def _read_pdf_metadata_title(path: Path) -> str | None:
    """Read and clean the Title field of a PDF's metadata, if possible."""
    if not PYPDF_AVAILABLE:
        return None
    try:
        meta = PdfReader(str(path)).metadata
        raw_title = meta.title if meta else None
    except Exception:
        # Unreadable or corrupt PDF: treat as "no title".
        return None
    return _clean_pdf_metadata_title(raw_title, path)
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
def _is_pdf_like(path: Path) -> bool:
|
|
1421
|
-
suffix = path.suffix.lower()
|
|
1422
|
-
if suffix == ".pdf":
|
|
1423
|
-
return True
|
|
1424
|
-
name_lower = path.name.lower()
|
|
1425
|
-
return ".pdf-" in name_lower and not name_lower.endswith(".md")
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
def _scan_pdf_roots(roots: list[Path]) -> tuple[list[Path], list[dict[str, Any]]]:
    """Collect PDF-like files under *roots*, with per-root summary stats.

    Returns ``(paths, meta)`` where *meta* holds one dict per usable root
    with its file count, newest mtime, and total size. Filesystem errors
    are tolerated — the offending entry is simply skipped.
    """
    pdf_paths: list[Path] = []
    meta: list[dict[str, Any]] = []
    seen: set[Path] = set()  # resolved paths, to dedupe overlapping roots
    for root in roots:
        try:
            if not root.exists() or not root.is_dir():
                continue
        except OSError:
            continue
        files: list[Path] = []
        for path in root.rglob("*"):
            try:
                if not path.is_file():
                    continue
            except OSError:
                continue
            if not _is_pdf_like(path):
                continue
            resolved = path.resolve()
            if resolved in seen:
                continue
            seen.add(resolved)
            files.append(resolved)
        max_mtime = 0.0
        total_size = 0
        for path in files:
            try:
                stats = path.stat()
            except OSError:
                # File vanished between scan and stat: ignore it in the stats.
                continue
            max_mtime = max(max_mtime, stats.st_mtime)
            total_size += stats.st_size
        pdf_paths.extend(files)
        meta.append(
            {
                "path": str(root),
                "count": len(files),
                "max_mtime": max_mtime,
                "size": total_size,
            }
        )
    return pdf_paths, meta
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
def _extract_year_author_from_filename(name: str) -> tuple[str | None, str | None]:
|
|
1474
|
-
base = name
|
|
1475
|
-
lower = base.lower()
|
|
1476
|
-
if lower.endswith(".md"):
|
|
1477
|
-
base = base[:-3]
|
|
1478
|
-
lower = base.lower()
|
|
1479
|
-
if ".pdf-" in lower:
|
|
1480
|
-
base = _strip_pdf_hash_suffix(base)
|
|
1481
|
-
lower = base.lower()
|
|
1482
|
-
if lower.endswith(".pdf"):
|
|
1483
|
-
base = base[:-4]
|
|
1484
|
-
match = re.match(r"\s*(.+?)\s*-\s*((?:19|20)\d{2})\s*-\s*", base)
|
|
1485
|
-
if match:
|
|
1486
|
-
return match.group(2), match.group(1).strip()
|
|
1487
|
-
match = re.match(r"\s*((?:19|20)\d{2})\s*-\s*", base)
|
|
1488
|
-
if match:
|
|
1489
|
-
return match.group(1), None
|
|
1490
|
-
return None, None
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
def _normalize_author_key(name: str) -> str:
|
|
1494
|
-
raw = name.lower().strip()
|
|
1495
|
-
raw = raw.replace("et al.", "").replace("et al", "")
|
|
1496
|
-
if "," in raw:
|
|
1497
|
-
raw = raw.split(",", 1)[0]
|
|
1498
|
-
raw = re.sub(r"[^a-z0-9]+", " ", raw)
|
|
1499
|
-
raw = re.sub(r"\s+", " ", raw).strip()
|
|
1500
|
-
if not raw:
|
|
1501
|
-
return ""
|
|
1502
|
-
parts = raw.split()
|
|
1503
|
-
return parts[-1] if parts else raw
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
def _title_prefix_key(title_key: str) -> str | None:
    """Build a ``prefix:<chars>`` index key from a normalized title.

    Returns None when the title is too short (in tokens or compact
    characters) to yield a meaningful prefix.
    """
    if len(title_key.split()) < _TITLE_MIN_TOKENS:
        return None
    compact = _compact_title_key(title_key)
    if len(compact) < _TITLE_PREFIX_LEN:
        return None
    prefix = compact[:_TITLE_PREFIX_LEN]
    return f"prefix:{prefix}" if prefix else None
|
|
1516
|
-
|
|
1517
|
-
|
|
1518
|
-
def _title_overlap_match(a: str, b: str) -> bool:
    """True when one normalized title contains the other.

    Containment only counts when the shorter title is long enough (by
    characters or tokens) for the overlap to be meaningful.
    """
    if not a or not b:
        return False
    if a == b:
        return True
    shorter, longer = sorted((a, b), key=len)
    long_enough = (
        len(shorter) >= _TITLE_MIN_CHARS
        or len(shorter.split()) >= _TITLE_MIN_TOKENS
    )
    # `in` subsumes the original's separate startswith check.
    return long_enough and shorter in longer
|
|
1529
|
-
|
|
1530
|
-
|
|
1531
|
-
def _adaptive_similarity_match(title_key: str, candidates: list[Path]) -> Path | None:
    """Pick the candidate file whose name best matches *title_key*.

    Direct containment/overlap wins immediately; otherwise a similarity
    threshold is lowered stepwise until exactly one candidate qualifies,
    bisecting between a "none matched" and a "several matched" threshold
    when the count jumps past one. Returns None when no unique match can
    be isolated.
    """
    if not title_key:
        return None
    scored: list[tuple[Path, float]] = []
    for path in candidates:
        candidate_title = _normalize_title_key(_extract_title_from_filename(path.name))
        if not candidate_title:
            continue
        if _title_overlap_match(title_key, candidate_title):
            # Containment beats any similarity score.
            return path
        scored.append((path, _title_similarity(title_key, candidate_title)))
    if not scored:
        return None

    def matches_at(threshold: float) -> list[Path]:
        return [path for path, score in scored if score >= threshold]

    threshold = _SIMILARITY_START
    step = _SIMILARITY_STEP
    prev_threshold = None
    prev_count = None
    for _ in range(_SIMILARITY_MAX_STEPS):
        matches = matches_at(threshold)
        if len(matches) == 1:
            return matches[0]
        if len(matches) == 0:
            # Too strict: remember this empty threshold and relax.
            prev_threshold = threshold
            prev_count = 0
            threshold -= step
            continue
        if prev_count == 0 and prev_threshold is not None:
            # Jumped from zero matches straight to several: bisect between
            # the two thresholds looking for one that admits exactly one.
            low = threshold
            high = prev_threshold
            for _ in range(_SIMILARITY_MAX_STEPS):
                mid = (low + high) / 2
                mid_matches = matches_at(mid)
                if len(mid_matches) == 1:
                    return mid_matches[0]
                if len(mid_matches) == 0:
                    high = mid
                else:
                    low = mid
            return None
        prev_threshold = threshold
        prev_count = len(matches)
        threshold -= step
    return None
|
|
1578
|
-
|
|
1579
|
-
|
|
1580
|
-
def _resolve_by_title_and_meta(
    paper: dict[str, Any],
    file_index: dict[str, list[Path]],
) -> Path | None:
    """Resolve *paper* to an indexed file via its title, then year/author.

    Lookup cascade (first hit wins):
      1. normalized title key, then its ``compact:`` form;
      2. the same pair with leading numeric tokens stripped from the title;
      3. a title-prefix key (also tried on the stripped title), with the
         bucket disambiguated by ``_adaptive_similarity_match``;
      4. ``authoryear:<year>:<author>`` then ``year:<year>`` buckets,
         again disambiguated by adaptive similarity.

    Returns ``None`` when nothing matches unambiguously.
    """
    title = str(paper.get("paper_title") or "")
    title_key = _normalize_title_key(title)
    if not title_key:
        title_key = ""
    candidates = file_index.get(title_key, [])
    if candidates:
        return candidates[0]
    if title_key:
        compact_key = _compact_title_key(title_key)
        compact_candidates = file_index.get(f"compact:{compact_key}", [])
        if compact_candidates:
            return compact_candidates[0]
        stripped_key = _strip_leading_numeric_tokens(title_key)
        if stripped_key and stripped_key != title_key:
            stripped_candidates = file_index.get(stripped_key, [])
            if stripped_candidates:
                return stripped_candidates[0]
            stripped_compact = _compact_title_key(stripped_key)
            stripped_candidates = file_index.get(f"compact:{stripped_compact}", [])
            if stripped_candidates:
                return stripped_candidates[0]
    prefix_candidates: list[Path] = []
    prefix_key = _title_prefix_key(title_key)
    if prefix_key:
        prefix_candidates = file_index.get(prefix_key, [])
    if not prefix_candidates:
        stripped_key = _strip_leading_numeric_tokens(title_key)
        if stripped_key and stripped_key != title_key:
            prefix_key = _title_prefix_key(stripped_key)
            if prefix_key:
                prefix_candidates = file_index.get(prefix_key, [])
    if prefix_candidates:
        match = _adaptive_similarity_match(title_key, prefix_candidates)
        if match is not None:
            return match
    # Metadata fallback requires a plausible numeric year.
    year = str(paper.get("_year") or "").strip()
    if not year.isdigit():
        return None
    author_key = ""
    authors = paper.get("_authors") or []
    if authors:
        author_key = _normalize_author_key(str(authors[0]))
    candidates = []
    if author_key:
        candidates = file_index.get(f"authoryear:{year}:{author_key}", [])
    if not candidates:
        candidates = file_index.get(f"year:{year}", [])
    if not candidates:
        return None
    # A lone year-bucket entry is trusted only when there is no title to check.
    if len(candidates) == 1 and not title_key:
        return candidates[0]
    match = _adaptive_similarity_match(title_key, candidates)
    if match is not None:
        return match
    return None
|
|
1639
|
-
|
|
1640
|
-
|
|
1641
|
-
def _build_file_index(roots: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
    """Recursively index files under *roots* by several lookup keys.

    Each qualifying file is registered under: its lowercased filename, the
    normalized title extracted from the filename (plus ``compact:`` and
    title-prefix variants, with and without leading numeric tokens), and
    ``year:`` / ``authoryear:`` buckets when the filename carries those
    hints.  Unreadable roots and paths are skipped silently.
    """
    index: dict[str, list[Path]] = {}
    for root in roots:
        try:
            if not root.exists() or not root.is_dir():
                continue
        except OSError:
            continue
        for path in root.rglob("*"):
            try:
                if not path.is_file():
                    continue
            except OSError:
                continue
            suffix = path.suffix.lower()
            if suffix not in suffixes:
                name_lower = path.name.lower()
                # Special case: when indexing PDFs, also accept extraction
                # artifacts named like "<x>.pdf-<hash>" (but never .md files).
                if suffixes == {".pdf"} and ".pdf-" in name_lower and suffix != ".md":
                    pass
                else:
                    continue
            resolved = path.resolve()
            name_key = path.name.lower()
            index.setdefault(name_key, []).append(resolved)
            title_candidate = _extract_title_from_filename(path.name)
            title_key = _normalize_title_key(title_candidate)
            if title_key:
                if title_key != name_key:
                    index.setdefault(title_key, []).append(resolved)
                compact_key = _compact_title_key(title_key)
                if compact_key:
                    index.setdefault(f"compact:{compact_key}", []).append(resolved)
                prefix_key = _title_prefix_key(title_key)
                if prefix_key:
                    index.setdefault(prefix_key, []).append(resolved)
                stripped_key = _strip_leading_numeric_tokens(title_key)
                if stripped_key and stripped_key != title_key:
                    index.setdefault(stripped_key, []).append(resolved)
                    stripped_compact = _compact_title_key(stripped_key)
                    if stripped_compact:
                        index.setdefault(f"compact:{stripped_compact}", []).append(resolved)
                    stripped_prefix = _title_prefix_key(stripped_key)
                    if stripped_prefix:
                        index.setdefault(stripped_prefix, []).append(resolved)
            year_hint, author_hint = _extract_year_author_from_filename(path.name)
            if year_hint:
                index.setdefault(f"year:{year_hint}", []).append(resolved)
                if author_hint:
                    author_key = _normalize_author_key(author_hint)
                    if author_key:
                        index.setdefault(f"authoryear:{year_hint}:{author_key}", []).append(resolved)
    return index
|
|
1693
|
-
|
|
1694
|
-
|
|
1695
|
-
def _build_file_index_from_paths(paths: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
    """Index an explicit list of files by the same title keys as _build_file_index.

    Unlike ``_build_file_index`` this variant takes concrete paths (no
    directory walk) and registers no year/author buckets.  Non-files and
    unreadable paths are skipped silently.
    """
    index: dict[str, list[Path]] = {}
    for path in paths:
        try:
            if not path.is_file():
                continue
        except OSError:
            continue
        suffix = path.suffix.lower()
        if suffix not in suffixes:
            name_lower = path.name.lower()
            # Same PDF-artifact exception as in _build_file_index.
            if suffixes == {".pdf"} and ".pdf-" in name_lower and suffix != ".md":
                pass
            else:
                continue
        resolved = path.resolve()
        name_key = path.name.lower()
        index.setdefault(name_key, []).append(resolved)
        title_candidate = _extract_title_from_filename(path.name)
        title_key = _normalize_title_key(title_candidate)
        if title_key:
            if title_key != name_key:
                index.setdefault(title_key, []).append(resolved)
            compact_key = _compact_title_key(title_key)
            if compact_key:
                index.setdefault(f"compact:{compact_key}", []).append(resolved)
            prefix_key = _title_prefix_key(title_key)
            if prefix_key:
                index.setdefault(prefix_key, []).append(resolved)
            stripped_key = _strip_leading_numeric_tokens(title_key)
            if stripped_key and stripped_key != title_key:
                index.setdefault(stripped_key, []).append(resolved)
                stripped_compact = _compact_title_key(stripped_key)
                if stripped_compact:
                    index.setdefault(f"compact:{stripped_compact}", []).append(resolved)
                stripped_prefix = _title_prefix_key(stripped_key)
                if stripped_prefix:
                    index.setdefault(stripped_prefix, []).append(resolved)
    return index
|
|
1734
|
-
|
|
1735
|
-
|
|
1736
|
-
def _resolve_source_md(paper: dict[str, Any], md_index: dict[str, list[Path]]) -> Path | None:
    """Locate the markdown source file for *paper* using *md_index*.

    An exact (case-insensitive) filename hit on the recorded source path
    wins outright; otherwise fall back to fuzzy title/metadata resolution.
    """
    recorded = paper.get("source_path") or ""
    if recorded:
        hits = md_index.get(Path(str(recorded)).name.lower())
        if hits:
            return hits[0]
    return _resolve_by_title_and_meta(paper, md_index)
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
def _build_translated_index(roots: list[Path]) -> dict[str, dict[str, Path]]:
|
|
1749
|
-
index: dict[str, dict[str, Path]] = {}
|
|
1750
|
-
candidates: list[Path] = []
|
|
1751
|
-
for root in roots:
|
|
1752
|
-
try:
|
|
1753
|
-
if not root.exists() or not root.is_dir():
|
|
1754
|
-
continue
|
|
1755
|
-
except OSError:
|
|
1756
|
-
continue
|
|
1757
|
-
try:
|
|
1758
|
-
candidates.extend(root.rglob("*.md"))
|
|
1759
|
-
except OSError:
|
|
1760
|
-
continue
|
|
1761
|
-
for path in sorted(candidates, key=lambda item: str(item)):
|
|
1762
|
-
try:
|
|
1763
|
-
if not path.is_file():
|
|
1764
|
-
continue
|
|
1765
|
-
except OSError:
|
|
1766
|
-
continue
|
|
1767
|
-
name = path.name
|
|
1768
|
-
match = re.match(r"^(.+)\.([^.]+)\.md$", name, flags=re.IGNORECASE)
|
|
1769
|
-
if not match:
|
|
1770
|
-
continue
|
|
1771
|
-
base_name = match.group(1).strip()
|
|
1772
|
-
lang = match.group(2).strip()
|
|
1773
|
-
if not base_name or not lang:
|
|
1774
|
-
continue
|
|
1775
|
-
base_key = base_name.lower()
|
|
1776
|
-
lang_key = lang.lower()
|
|
1777
|
-
index.setdefault(base_key, {}).setdefault(lang_key, path.resolve())
|
|
1778
|
-
return index
|
|
1779
|
-
|
|
1780
|
-
|
|
1781
|
-
def _guess_pdf_names(paper: dict[str, Any]) -> list[str]:
|
|
1782
|
-
source_path = paper.get("source_path")
|
|
1783
|
-
if not source_path:
|
|
1784
|
-
return []
|
|
1785
|
-
name = Path(str(source_path)).name
|
|
1786
|
-
match = re.match(r"(?i)(.+\\.pdf)(?:-[0-9a-f\\-]{8,})?\\.md$", name)
|
|
1787
|
-
if match:
|
|
1788
|
-
return [Path(match.group(1)).name]
|
|
1789
|
-
if ".pdf-" in name.lower():
|
|
1790
|
-
base = name[: name.lower().rfind(".pdf-") + 4]
|
|
1791
|
-
return [Path(base).name]
|
|
1792
|
-
if name.lower().endswith(".pdf"):
|
|
1793
|
-
return [name]
|
|
1794
|
-
if name.lower().endswith(".pdf.md"):
|
|
1795
|
-
return [name[:-3]]
|
|
1796
|
-
return []
|
|
1797
|
-
|
|
1798
|
-
|
|
1799
|
-
def _resolve_pdf(paper: dict[str, Any], pdf_index: dict[str, list[Path]]) -> Path | None:
    """Find the PDF for *paper*: try guessed filenames, then title metadata."""
    for guess in _guess_pdf_names(paper):
        hits = pdf_index.get(guess.lower())
        if hits:
            return hits[0]
    return _resolve_by_title_and_meta(paper, pdf_index)
|
|
1805
|
-
|
|
1806
|
-
|
|
1807
|
-
def _ensure_under_roots(path: Path, roots: list[Path]) -> bool:
|
|
1808
|
-
resolved = path.resolve()
|
|
1809
|
-
for root in roots:
|
|
1810
|
-
try:
|
|
1811
|
-
resolved.relative_to(root.resolve())
|
|
1812
|
-
return True
|
|
1813
|
-
except Exception:
|
|
1814
|
-
continue
|
|
1815
|
-
return False
|
|
1816
|
-
|
|
1817
|
-
|
|
1818
|
-
# Accepted spellings for boolean presence filters (e.g. "pdf:with"),
# matched case-insensitively by _normalize_presence_value.
_BOOL_TRUE = {"1", "true", "yes", "with", "has"}
_BOOL_FALSE = {"0", "false", "no", "without"}
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
def _tokenize_filter_query(text: str) -> list[str]:
|
|
1823
|
-
out: list[str] = []
|
|
1824
|
-
buf: list[str] = []
|
|
1825
|
-
in_quote = False
|
|
1826
|
-
|
|
1827
|
-
for ch in text:
|
|
1828
|
-
if ch == '"':
|
|
1829
|
-
in_quote = not in_quote
|
|
1830
|
-
continue
|
|
1831
|
-
if not in_quote and ch.isspace():
|
|
1832
|
-
token = "".join(buf).strip()
|
|
1833
|
-
if token:
|
|
1834
|
-
out.append(token)
|
|
1835
|
-
buf = []
|
|
1836
|
-
continue
|
|
1837
|
-
buf.append(ch)
|
|
1838
|
-
|
|
1839
|
-
token = "".join(buf).strip()
|
|
1840
|
-
if token:
|
|
1841
|
-
out.append(token)
|
|
1842
|
-
return out
|
|
1843
|
-
|
|
1844
|
-
|
|
1845
|
-
def _normalize_presence_value(value: str) -> str | None:
    """Map a user-supplied boolean-ish token to "with"/"without" (or None)."""
    token = value.strip().lower()
    if token in _BOOL_TRUE:
        return "with"
    return "without" if token in _BOOL_FALSE else None
|
|
1852
|
-
|
|
1853
|
-
|
|
1854
|
-
def _parse_filter_query(text: str) -> dict[str, set[str]]:
    """Parse an advanced filter string into presence/template constraint sets.

    Supported syntaxes: ``pdf:with``, ``summary:no``, ``has:pdf,summary``,
    ``no:translated`` and ``template:tag1,tag2`` (alias ``tmpl:``).
    Unknown keys and malformed values are silently ignored.
    """
    presence_fields = ("pdf", "source", "summary", "translated")
    parsed: dict[str, set[str]] = {field: set() for field in presence_fields}
    parsed["template"] = set()
    for token in _tokenize_filter_query(text):
        key, sep, raw_value = token.partition(":")
        if not sep:
            continue
        key = key.strip().lower()
        raw_value = raw_value.strip()
        if not raw_value:
            continue
        if key in {"tmpl", "template"}:
            parsed["template"].update(
                part.strip().lower() for part in raw_value.split(",") if part.strip()
            )
        elif key in presence_fields:
            for part in raw_value.split(","):
                normalized = _normalize_presence_value(part)
                if normalized:
                    parsed[key].add(normalized)
        elif key in {"has", "no"}:
            marker = "with" if key == "has" else "without"
            for target in raw_value.split(","):
                target = target.strip().lower()
                if target in presence_fields:
                    parsed[target].add(marker)
    return parsed
|
|
1889
|
-
|
|
1890
|
-
|
|
1891
|
-
def _presence_filter(values: list[str]) -> set[str] | None:
    """Collapse raw query values into a presence constraint set.

    Returns None when the values impose no constraint: nothing was
    recognized, or both "with" and "without" were requested.
    """
    recognized = {token for token in map(_normalize_presence_value, values) if token}
    if not recognized or recognized == {"with", "without"}:
        return None
    return recognized
|
|
1900
|
-
|
|
1901
|
-
|
|
1902
|
-
def _merge_filter_set(primary: set[str] | None, secondary: set[str] | None) -> set[str] | None:
|
|
1903
|
-
if not primary:
|
|
1904
|
-
return secondary
|
|
1905
|
-
if not secondary:
|
|
1906
|
-
return primary
|
|
1907
|
-
return primary & secondary
|
|
1908
|
-
|
|
1909
|
-
|
|
1910
|
-
def _matches_presence(allowed: set[str] | None, has_value: bool) -> bool:
|
|
1911
|
-
if not allowed:
|
|
1912
|
-
return True
|
|
1913
|
-
if has_value and "with" in allowed:
|
|
1914
|
-
return True
|
|
1915
|
-
if not has_value and "without" in allowed:
|
|
1916
|
-
return True
|
|
1917
|
-
return False
|
|
1918
|
-
|
|
1919
|
-
|
|
1920
|
-
def _template_tag_map(index: PaperIndex) -> dict[str, str]:
    """Map lowercased template tags back to their display spelling."""
    return {name.lower(): name for name in index.template_tags}
|
|
1922
|
-
|
|
1923
|
-
|
|
1924
|
-
def _compute_counts(index: PaperIndex, ids: set[int]) -> dict[str, Any]:
    """Aggregate presence and template-tag counts for the papers in *ids*.

    PDF-only placeholder entries (``_is_pdf_only``) are excluded from every
    count.  Returns totals plus per-template counts; ``template_order``
    preserves the index's declared tag order for stable display.
    """
    template_order = list(index.template_tags)
    template_counts = {tag: 0 for tag in template_order}
    pdf_count = 0
    source_count = 0
    summary_count = 0
    translated_count = 0
    total_count = 0
    tag_map = _template_tag_map(index)

    for idx in ids:
        paper = index.papers[idx]
        if paper.get("_is_pdf_only"):
            continue
        total_count += 1
        # Fall back to hashing the source path (or the id itself) when the
        # record carries no precomputed source_hash.
        source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))
        has_source = source_hash in index.md_path_by_hash
        has_pdf = source_hash in index.pdf_path_by_hash
        has_summary = bool(paper.get("_has_summary"))
        has_translated = bool(index.translated_md_by_hash.get(source_hash))
        if has_source:
            source_count += 1
        if has_pdf:
            pdf_count += 1
        if has_summary:
            summary_count += 1
        if has_translated:
            translated_count += 1
        # Only tags known to the index get counted (others are ignored).
        for tag_lc in paper.get("_template_tags_lc") or []:
            display = tag_map.get(tag_lc)
            if display:
                template_counts[display] = template_counts.get(display, 0) + 1

    return {
        "total": total_count,
        "pdf": pdf_count,
        "source": source_count,
        "summary": summary_count,
        "translated": translated_count,
        "templates": template_counts,
        "template_order": template_order,
    }
|
|
1966
|
-
|
|
1967
|
-
|
|
1968
|
-
def _apply_query(index: PaperIndex, query: Query) -> set[int]:
    """Evaluate *query* against *index* and return the matching paper ids.

    Semantics: groups are OR-ed together; terms within a group are AND-ed,
    with negated terms subtracting their matches.  Field terms use the
    index's exact-match buckets when available and fall back to substring
    scans over per-paper fields otherwise.
    """
    all_ids = set(index.ordered_ids)

    def ids_for_term(term: QueryTerm, base: set[int]) -> set[int]:
        # Resolve one term to the subset of `base` it matches.
        value_lc = term.value.lower()
        if term.field is None:
            # Free-text term: substring search over the precomputed blob.
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_search_lc") or "")}
        if term.field == "title":
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_title_lc") or "")}
        if term.field == "venue":
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_venue") or "").lower()}
        if term.field == "tag":
            exact = index.by_tag.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if any(value_lc in t.lower() for t in (index.papers[idx].get("_tags") or []))}
        if term.field == "author":
            exact = index.by_author.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if any(value_lc in a.lower() for a in (index.papers[idx].get("_authors") or []))}
        if term.field == "month":
            exact = index.by_month.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if value_lc == str(index.papers[idx].get("_month") or "").lower()}
        if term.field == "year":
            # "2019..2022" style inclusive ranges are supported.
            if ".." in term.value:
                start_str, end_str = term.value.split("..", 1)
                if start_str.strip().isdigit() and end_str.strip().isdigit():
                    start = int(start_str.strip())
                    end = int(end_str.strip())
                    ids: set[int] = set()
                    for y in range(min(start, end), max(start, end) + 1):
                        ids |= index.by_year.get(str(y), set())
                    return ids & base
            exact = index.by_year.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_year") or "").lower()}
        # Unknown field: matches nothing.
        return set()

    result: set[int] = set()
    for group in query.groups:
        group_ids = set(all_ids)
        for term in group:
            # Negated terms are matched against the FULL id space so their
            # exclusion is independent of the group's current narrowing.
            matched = ids_for_term(term, group_ids if not term.negated else all_ids)
            if term.negated:
                group_ids -= matched
            else:
                group_ids &= matched
        result |= group_ids

    return result
|
|
2022
|
-
|
|
2023
|
-
|
|
2024
|
-
def _page_shell(
    title: str,
    body_html: str,
    extra_head: str = "",
    extra_scripts: str = "",
    header_title: str | None = None,
) -> str:
    """Wrap *body_html* in the full site page chrome (header, CSS, container).

    When *header_title* is given the sticky header switches to the detail
    layout (back link / centered, escaped title / stats link).  *extra_head*
    and *extra_scripts* are injected verbatim into <head> and before </body>.
    NOTE(review): this block was reconstructed from a whitespace-mangled
    rendering; the indentation inside the HTML/CSS template strings is
    cosmetic and may differ from the original source — verify against the
    released wheel if byte-exact output matters.
    """
    header_html = """
<header>
  <a href="/">Papers</a>
  <a href="/stats">Stats</a>
</header>
"""
    if header_title:
        safe_title = html.escape(header_title)
        header_html = f"""
<header class="detail-header">
  <div class="header-row">
    <a class="header-back" href="/">← Papers</a>
    <span class="header-title" title="{safe_title}">{safe_title}</span>
    <a class="header-link" href="/stats">Stats</a>
  </div>
</header>
"""
    # Doubled braces ({{ }}) below escape literal CSS braces inside the f-string.
    return f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>{html.escape(title)}</title>
<style>
body {{ font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Arial; margin: 0; }}
header {{ position: sticky; top: 0; background: #0b1220; color: #fff; padding: 12px 16px; z-index: 10; }}
header a {{ color: #cfe3ff; text-decoration: none; margin-right: 12px; }}
.detail-header .header-row {{ display: grid; grid-template-columns: auto minmax(0, 1fr) auto; align-items: center; gap: 12px; }}
.detail-header .header-title {{ text-align: center; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; }}
.detail-header .header-back {{ margin-right: 0; }}
.detail-header .header-link {{ margin-right: 0; }}
.container {{ max-width: 1100px; margin: 0 auto; padding: 16px; }}
.filters {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(160px, 1fr)); gap: 8px; margin: 12px 0 16px; }}
.filters input {{ width: 100%; padding: 8px; border: 1px solid #d0d7de; border-radius: 6px; }}
.filters select {{ width: 100%; border: 1px solid #d0d7de; border-radius: 6px; background: #fff; font-size: 13px; }}
.filters select:not([multiple]) {{ padding: 6px 8px; }}
.filters select[multiple] {{ padding: 2px; line-height: 1.25; min-height: 72px; font-size: 13px; }}
.filters select[multiple] option {{ padding: 2px 6px; line-height: 1.25; }}
.filters label {{ font-size: 12px; color: #57606a; }}
.filter-group {{ display: flex; flex-direction: column; gap: 4px; }}
.card {{ border: 1px solid #d0d7de; border-radius: 10px; padding: 12px; margin: 10px 0; }}
.muted {{ color: #57606a; font-size: 13px; }}
.pill {{ display: inline-block; padding: 2px 8px; border-radius: 999px; border: 1px solid #d0d7de; margin-right: 6px; font-size: 12px; }}
.pill.template {{ border-color: #8a92a5; color: #243b53; background: #f6f8fa; }}
.pill.pdf-only {{ border-color: #c8a951; background: #fff8dc; color: #5b4a00; }}
.warning {{ background: #fff4ce; border: 1px solid #ffd089; padding: 10px; border-radius: 10px; margin: 12px 0; }}
.tabs {{ display: flex; gap: 8px; flex-wrap: wrap; }}
.tab {{ display: inline-block; padding: 6px 12px; border-radius: 999px; border: 1px solid #d0d7de; background: #f6f8fa; color: #0969da; text-decoration: none; font-size: 13px; }}
.tab:hover {{ background: #eef1f4; }}
.tab.active {{ background: #0969da; border-color: #0969da; color: #fff; }}
.detail-shell {{ display: flex; flex-direction: column; gap: 12px; min-height: calc(100vh - 120px); }}
.detail-toolbar {{ display: flex; flex-wrap: wrap; align-items: center; justify-content: flex-start; gap: 12px; padding: 6px 8px 10px; border-bottom: 1px solid #e5e7eb; box-sizing: border-box; }}
.detail-toolbar .tabs {{ margin: 0; }}
.toolbar-actions {{ display: flex; flex-wrap: wrap; align-items: center; gap: 10px; margin-left: auto; padding-right: 16px; }}
.search-row {{ display: flex; flex-wrap: wrap; gap: 8px; margin-top: 8px; align-items: stretch; }}
.search-row input {{ flex: 1 1 320px; min-width: 0; padding: 10px; border: 1px solid #d0d7de; border-radius: 8px; }}
.search-row select {{ flex: 0 1 220px; min-width: 0; max-width: 100%; padding: 10px; border: 1px solid #d0d7de; border-radius: 8px; background: #fff; }}
.filter-row {{ display: flex; flex-wrap: wrap; gap: 8px; align-items: center; margin-top: 8px; }}
.filter-row input {{ flex: 1 1 320px; min-width: 0; padding: 10px; border: 1px solid #d0d7de; border-radius: 8px; }}
.filter-row .help-icon {{ flex: 0 0 auto; }}
.adv-actions {{ display: flex; gap: 8px; align-items: center; margin-top: 8px; flex-wrap: wrap; }}
.split-inline {{ display: flex; flex-wrap: wrap; align-items: center; gap: 6px; }}
.split-inline select {{ padding: 6px 8px; border-radius: 8px; border: 1px solid #d0d7de; background: #fff; min-width: 140px; }}
.split-actions {{ display: flex; align-items: center; justify-content: center; gap: 8px; }}
.split-actions button {{ padding: 6px 10px; border-radius: 999px; border: 1px solid #d0d7de; background: #f6f8fa; cursor: pointer; min-width: 36px; }}
.lang-select {{ display: flex; align-items: center; gap: 6px; }}
.lang-select label {{ color: #57606a; font-size: 13px; }}
.lang-select select {{ padding: 6px 8px; border-radius: 8px; border: 1px solid #d0d7de; background: #fff; min-width: 120px; }}
.fullscreen-actions {{ display: flex; align-items: center; gap: 6px; }}
.fullscreen-actions button {{ padding: 6px 10px; border-radius: 8px; border: 1px solid #d0d7de; background: #f6f8fa; cursor: pointer; }}
.fullscreen-exit {{ display: none; }}
body.detail-fullscreen {{ overflow: hidden; --outline-top: 16px; }}
body.detail-fullscreen header {{ display: none; }}
body.detail-fullscreen .container {{ max-width: 100%; padding: 0; }}
body.detail-fullscreen .detail-shell {{
  position: fixed;
  inset: 0;
  padding: 12px 16px;
  background: #fff;
  z-index: 40;
  overflow: auto;
}}
body.detail-fullscreen .detail-toolbar {{ position: sticky; top: 0; background: #fff; z-index: 41; }}
body.detail-fullscreen .fullscreen-enter {{ display: none; }}
body.detail-fullscreen .fullscreen-exit {{ display: inline-flex; }}
.detail-body {{ display: flex; flex-direction: column; gap: 8px; flex: 1; min-height: 0; }}
.help-icon {{ display: inline-flex; align-items: center; justify-content: center; width: 18px; height: 18px; border-radius: 50%; border: 1px solid #d0d7de; color: #57606a; font-size: 12px; cursor: default; position: relative; }}
.help-icon::after {{ content: attr(data-tip); display: none; position: absolute; top: 24px; right: 0; background: #0b1220; color: #e6edf3; padding: 8px 10px; border-radius: 8px; font-size: 12px; white-space: pre-line; width: 260px; z-index: 20; }}
.help-icon:hover::after {{ display: block; }}
.stats {{ margin: 12px 0 6px; }}
.stats-row {{ display: flex; flex-wrap: wrap; gap: 6px; align-items: center; }}
.stats-label {{ font-weight: 600; color: #0b1220; margin-right: 4px; }}
.pill.stat {{ background: #f6f8fa; border-color: #c7d2e0; color: #1f2a37; }}
.footnotes {{ border-top: 1px solid #e5e7eb; margin-top: 16px; padding-top: 12px; color: #57606a; }}
.footnotes ol {{ padding-left: 20px; }}
.footnotes li {{ margin-bottom: 6px; }}
.footnote-ref {{ font-size: 0.85em; }}
.footnote-tip {{ position: relative; display: inline-block; }}
.footnote-tip::after {{
  content: attr(data-footnote);
  position: absolute;
  left: 50%;
  bottom: 130%;
  transform: translateX(-50%);
  width: min(320px, 70vw);
  padding: 8px 10px;
  border-radius: 8px;
  background: #0b1220;
  color: #e6edf3;
  font-size: 12px;
  line-height: 1.35;
  white-space: pre-line;
  box-shadow: 0 10px 24px rgba(0, 0, 0, 0.18);
  opacity: 0;
  pointer-events: none;
  z-index: 30;
  transition: opacity 0.12s ease-in-out;
}}
.footnote-tip:hover::after,
.footnote-tip:focus::after {{
  opacity: 1;
}}
pre {{ overflow: auto; padding: 10px; background: #0b1220; color: #e6edf3; border-radius: 10px; }}
code {{ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace; }}
a {{ color: #0969da; }}
@media (max-width: 640px) {{
  .search-row {{
    flex-direction: column;
  }}
  .search-row input,
  .search-row select {{
    width: 100%;
  }}
  .filter-row {{
    flex-direction: column;
    align-items: stretch;
  }}
  .filter-row .help-icon {{
    align-self: flex-end;
  }}
  .adv-actions {{
    flex-direction: column;
    align-items: stretch;
  }}
  .detail-toolbar {{
    flex-wrap: nowrap;
    overflow-x: auto;
    padding-bottom: 8px;
  }}
  .detail-toolbar::-webkit-scrollbar {{ height: 6px; }}
  .detail-toolbar::-webkit-scrollbar-thumb {{ background: #c7d2e0; border-radius: 999px; }}
  .detail-toolbar .tabs,
  .toolbar-actions {{
    flex: 0 0 auto;
  }}
}}
</style>
{extra_head}
</head>
<body>
{header_html}
<div class="container">
{body_html}
</div>
{extra_scripts}
</body>
</html>"""
|
|
2198
|
-
|
|
2199
|
-
|
|
2200
|
-
def _embed_shell(title: str, body_html: str, extra_head: str = "", extra_scripts: str = "") -> str:
    """Wrap *body_html* in a minimal, chrome-less HTML page (for embeds/iframes).

    Unlike _page_shell there is no header, container, or navigation — just
    base typography plus the shared muted/warning/pre/code styles.
    NOTE(review): reconstructed from a whitespace-mangled rendering; the
    indentation inside the template string may differ from the original.
    """
    return f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>{html.escape(title)}</title>
<style>
body {{ font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Arial; margin: 0; padding: 16px; }}
h1, h2, h3, h4 {{ margin-top: 1.2em; }}
.muted {{ color: #57606a; font-size: 13px; }}
.warning {{ background: #fff4ce; border: 1px solid #ffd089; padding: 10px; border-radius: 10px; margin: 12px 0; }}
pre {{ overflow: auto; padding: 10px; background: #0b1220; color: #e6edf3; border-radius: 10px; }}
code {{ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace; }}
a {{ color: #0969da; }}
</style>
{extra_head}
</head>
<body>
{body_html}
{extra_scripts}
</body>
</html>"""
|
|
2223
|
-
|
|
2224
|
-
|
|
2225
|
-
def _build_pdfjs_viewer_url(pdf_url: str) -> str:
    """Build a pdf.js viewer URL that opens *pdf_url* (fully percent-encoded)."""
    return f"{_PDFJS_VIEWER_PATH}?file={quote(pdf_url, safe='')}"
|
|
2228
|
-
|
|
2229
|
-
|
|
2230
|
-
def _outline_assets(outline_top: str) -> tuple[str, str, str]:
|
|
2231
|
-
outline_html = """
|
|
2232
|
-
<button id="outlineToggle" class="outline-toggle" title="Toggle outline">☰</button>
|
|
2233
|
-
<div id="outlinePanel" class="outline-panel collapsed">
|
|
2234
|
-
<div class="outline-title">Outline</div>
|
|
2235
|
-
<div id="outlineList" class="outline-list"></div>
|
|
2236
|
-
</div>
|
|
2237
|
-
<button id="backToTop" class="back-to-top" title="Back to top">↑</button>
|
|
2238
|
-
"""
|
|
2239
|
-
outline_css = f"""
|
|
2240
|
-
<style>
|
|
2241
|
-
:root {{
|
|
2242
|
-
--outline-top: {outline_top};
|
|
2243
|
-
}}
|
|
2244
|
-
.outline-toggle {{
|
|
2245
|
-
position: fixed;
|
|
2246
|
-
top: var(--outline-top);
|
|
2247
|
-
left: 16px;
|
|
2248
|
-
z-index: 20;
|
|
2249
|
-
padding: 6px 10px;
|
|
2250
|
-
border-radius: 8px;
|
|
2251
|
-
border: 1px solid #d0d7de;
|
|
2252
|
-
background: #f6f8fa;
|
|
2253
|
-
cursor: pointer;
|
|
2254
|
-
}}
|
|
2255
|
-
.outline-panel {{
|
|
2256
|
-
position: fixed;
|
|
2257
|
-
top: calc(var(--outline-top) + 42px);
|
|
2258
|
-
left: 16px;
|
|
2259
|
-
width: 240px;
|
|
2260
|
-
max-height: 60vh;
|
|
2261
|
-
overflow: auto;
|
|
2262
|
-
border: 1px solid #d0d7de;
|
|
2263
|
-
border-radius: 10px;
|
|
2264
|
-
background: #ffffff;
|
|
2265
|
-
padding: 10px;
|
|
2266
|
-
z-index: 20;
|
|
2267
|
-
box-shadow: 0 6px 18px rgba(0, 0, 0, 0.08);
|
|
2268
|
-
}}
|
|
2269
|
-
.outline-panel.collapsed {{
|
|
2270
|
-
display: none;
|
|
2271
|
-
}}
|
|
2272
|
-
.outline-title {{
|
|
2273
|
-
font-size: 12px;
|
|
2274
|
-
text-transform: uppercase;
|
|
2275
|
-
letter-spacing: 0.08em;
|
|
2276
|
-
color: #57606a;
|
|
2277
|
-
margin-bottom: 8px;
|
|
2278
|
-
}}
|
|
2279
|
-
.outline-list a {{
|
|
2280
|
-
display: block;
|
|
2281
|
-
color: #0969da;
|
|
2282
|
-
text-decoration: none;
|
|
2283
|
-
padding: 4px 0;
|
|
2284
|
-
}}
|
|
2285
|
-
.outline-list a:hover {{
|
|
2286
|
-
text-decoration: underline;
|
|
2287
|
-
}}
|
|
2288
|
-
.back-to-top {{
|
|
2289
|
-
position: fixed;
|
|
2290
|
-
left: 16px;
|
|
2291
|
-
bottom: 16px;
|
|
2292
|
-
padding: 6px 10px;
|
|
2293
|
-
border-radius: 999px;
|
|
2294
|
-
border: 1px solid #d0d7de;
|
|
2295
|
-
background: #ffffff;
|
|
2296
|
-
cursor: pointer;
|
|
2297
|
-
opacity: 0;
|
|
2298
|
-
pointer-events: none;
|
|
2299
|
-
transition: opacity 0.2s ease;
|
|
2300
|
-
z-index: 20;
|
|
2301
|
-
}}
|
|
2302
|
-
.back-to-top.visible {{
|
|
2303
|
-
opacity: 1;
|
|
2304
|
-
pointer-events: auto;
|
|
2305
|
-
}}
|
|
2306
|
-
@media (max-width: 900px) {{
|
|
2307
|
-
.outline-panel {{
|
|
2308
|
-
width: 200px;
|
|
2309
|
-
}}
|
|
2310
|
-
}}
|
|
2311
|
-
</style>
|
|
2312
|
-
"""
|
|
2313
|
-
outline_js = """
|
|
2314
|
-
const outlineToggle = document.getElementById('outlineToggle');
|
|
2315
|
-
const outlinePanel = document.getElementById('outlinePanel');
|
|
2316
|
-
const outlineList = document.getElementById('outlineList');
|
|
2317
|
-
const backToTop = document.getElementById('backToTop');
|
|
2318
|
-
|
|
2319
|
-
function slugify(text) {
|
|
2320
|
-
return text.toLowerCase().trim()
|
|
2321
|
-
.replace(/[^a-z0-9\\s-]/g, '')
|
|
2322
|
-
.replace(/\\s+/g, '-')
|
|
2323
|
-
.replace(/-+/g, '-');
|
|
2324
|
-
}
|
|
2325
|
-
|
|
2326
|
-
function buildOutline() {
|
|
2327
|
-
if (!outlineList) return;
|
|
2328
|
-
const content = document.getElementById('content');
|
|
2329
|
-
if (!content) return;
|
|
2330
|
-
const headings = content.querySelectorAll('h1, h2, h3, h4');
|
|
2331
|
-
if (!headings.length) {
|
|
2332
|
-
outlineList.innerHTML = '<div class="muted">No headings</div>';
|
|
2333
|
-
return;
|
|
2334
|
-
}
|
|
2335
|
-
const used = new Set();
|
|
2336
|
-
outlineList.innerHTML = '';
|
|
2337
|
-
headings.forEach((heading) => {
|
|
2338
|
-
let id = heading.id;
|
|
2339
|
-
if (!id) {
|
|
2340
|
-
const base = slugify(heading.textContent || 'section') || 'section';
|
|
2341
|
-
id = base;
|
|
2342
|
-
let i = 1;
|
|
2343
|
-
while (used.has(id) || document.getElementById(id)) {
|
|
2344
|
-
id = `${base}-${i++}`;
|
|
2345
|
-
}
|
|
2346
|
-
heading.id = id;
|
|
2347
|
-
}
|
|
2348
|
-
used.add(id);
|
|
2349
|
-
const level = parseInt(heading.tagName.slice(1), 10) || 1;
|
|
2350
|
-
const link = document.createElement('a');
|
|
2351
|
-
link.href = `#${id}`;
|
|
2352
|
-
link.textContent = heading.textContent || '';
|
|
2353
|
-
link.style.paddingLeft = `${(level - 1) * 12}px`;
|
|
2354
|
-
outlineList.appendChild(link);
|
|
2355
|
-
});
|
|
2356
|
-
}
|
|
2357
|
-
|
|
2358
|
-
function toggleBackToTop() {
|
|
2359
|
-
if (!backToTop) return;
|
|
2360
|
-
if (window.scrollY > 300) {
|
|
2361
|
-
backToTop.classList.add('visible');
|
|
2362
|
-
} else {
|
|
2363
|
-
backToTop.classList.remove('visible');
|
|
2364
|
-
}
|
|
2365
|
-
}
|
|
2366
|
-
|
|
2367
|
-
if (outlineToggle && outlinePanel) {
|
|
2368
|
-
outlineToggle.addEventListener('click', () => {
|
|
2369
|
-
outlinePanel.classList.toggle('collapsed');
|
|
2370
|
-
});
|
|
2371
|
-
}
|
|
2372
|
-
|
|
2373
|
-
if (backToTop) {
|
|
2374
|
-
backToTop.addEventListener('click', () => {
|
|
2375
|
-
window.scrollTo({ top: 0, behavior: 'smooth' });
|
|
2376
|
-
});
|
|
2377
|
-
}
|
|
2378
|
-
|
|
2379
|
-
buildOutline();
|
|
2380
|
-
window.addEventListener('scroll', toggleBackToTop);
|
|
2381
|
-
toggleBackToTop();
|
|
2382
|
-
"""
|
|
2383
|
-
return outline_html, outline_css, outline_js
|
|
2384
|
-
|
|
2385
|
-
|
|
2386
|
-
async def _index_page(request: Request) -> HTMLResponse:
|
|
2387
|
-
index: PaperIndex = request.app.state.index
|
|
2388
|
-
template_options = "".join(
|
|
2389
|
-
f'<option value="{html.escape(tag)}">{html.escape(tag)}</option>'
|
|
2390
|
-
for tag in index.template_tags
|
|
2391
|
-
)
|
|
2392
|
-
if not template_options:
|
|
2393
|
-
template_options = '<option value="" disabled>(no templates)</option>'
|
|
2394
|
-
filter_help = (
|
|
2395
|
-
"Filters syntax:\\n"
|
|
2396
|
-
"pdf:yes|no source:yes|no translated:yes|no summary:yes|no\\n"
|
|
2397
|
-
"tmpl:<tag> or template:<tag>\\n"
|
|
2398
|
-
"has:pdf / no:source aliases\\n"
|
|
2399
|
-
"Content tags still use the search box (tag:fpga)."
|
|
2400
|
-
)
|
|
2401
|
-
filter_help_attr = html.escape(filter_help).replace("\n", " ")
|
|
2402
|
-
body_html = """
|
|
2403
|
-
<h2>Paper Database</h2>
|
|
2404
|
-
<div class="card">
|
|
2405
|
-
<div class="muted">Search (Scholar-style): <code>tag:fpga year:2023..2025 -survey</code> · Use quotes for phrases and <code>OR</code> for alternatives.</div>
|
|
2406
|
-
<div class="search-row">
|
|
2407
|
-
<input id="query" placeholder='Search... e.g. title:"nearest neighbor" tag:fpga year:2023..2025' />
|
|
2408
|
-
<select id="openView">
|
|
2409
|
-
<option value="summary" selected>Open: Summary</option>
|
|
2410
|
-
<option value="source">Open: Source</option>
|
|
2411
|
-
<option value="translated">Open: Translated</option>
|
|
2412
|
-
<option value="pdf">Open: PDF</option>
|
|
2413
|
-
<option value="pdfjs">Open: PDF Viewer</option>
|
|
2414
|
-
<option value="split">Open: Split</option>
|
|
2415
|
-
</select>
|
|
2416
|
-
</div>
|
|
2417
|
-
<div class="filters" style="margin-top:10px;">
|
|
2418
|
-
<div class="filter-group">
|
|
2419
|
-
<label>PDF</label>
|
|
2420
|
-
<select id="filterPdf" multiple size="2">
|
|
2421
|
-
<option value="with">With</option>
|
|
2422
|
-
<option value="without">Without</option>
|
|
2423
|
-
</select>
|
|
2424
|
-
</div>
|
|
2425
|
-
<div class="filter-group">
|
|
2426
|
-
<label>Source</label>
|
|
2427
|
-
<select id="filterSource" multiple size="2">
|
|
2428
|
-
<option value="with">With</option>
|
|
2429
|
-
<option value="without">Without</option>
|
|
2430
|
-
</select>
|
|
2431
|
-
</div>
|
|
2432
|
-
<div class="filter-group">
|
|
2433
|
-
<label>Translated</label>
|
|
2434
|
-
<select id="filterTranslated" multiple size="2">
|
|
2435
|
-
<option value="with">With</option>
|
|
2436
|
-
<option value="without">Without</option>
|
|
2437
|
-
</select>
|
|
2438
|
-
</div>
|
|
2439
|
-
<div class="filter-group">
|
|
2440
|
-
<label>Summary</label>
|
|
2441
|
-
<select id="filterSummary" multiple size="2">
|
|
2442
|
-
<option value="with">With</option>
|
|
2443
|
-
<option value="without">Without</option>
|
|
2444
|
-
</select>
|
|
2445
|
-
</div>
|
|
2446
|
-
<div class="filter-group">
|
|
2447
|
-
<label>Template</label>
|
|
2448
|
-
<select id="filterTemplate" multiple size="4">
|
|
2449
|
-
__TEMPLATE_OPTIONS__
|
|
2450
|
-
</select>
|
|
2451
|
-
</div>
|
|
2452
|
-
</div>
|
|
2453
|
-
<div class="filter-row">
|
|
2454
|
-
<input id="filterQuery" placeholder='Filters... e.g. pdf:yes tmpl:simple' />
|
|
2455
|
-
<span class="help-icon" data-tip="__FILTER_HELP__">?</span>
|
|
2456
|
-
</div>
|
|
2457
|
-
<details style="margin-top:10px;">
|
|
2458
|
-
<summary>Advanced search</summary>
|
|
2459
|
-
<div style="margin-top:10px;" class="muted">Build a query:</div>
|
|
2460
|
-
<div class="filters">
|
|
2461
|
-
<input id="advTitle" placeholder="title contains..." />
|
|
2462
|
-
<input id="advAuthor" placeholder="author contains..." />
|
|
2463
|
-
<input id="advTag" placeholder="tag (comma separated)" />
|
|
2464
|
-
<input id="advYear" placeholder="year (e.g. 2020..2024)" />
|
|
2465
|
-
<input id="advMonth" placeholder="month (01-12)" />
|
|
2466
|
-
<input id="advVenue" placeholder="venue contains..." />
|
|
2467
|
-
</div>
|
|
2468
|
-
<div class="adv-actions">
|
|
2469
|
-
<button id="buildQuery" style="padding:8px 12px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">Build</button>
|
|
2470
|
-
<div class="muted">Generated: <code id="generated"></code></div>
|
|
2471
|
-
</div>
|
|
2472
|
-
</details>
|
|
2473
|
-
</div>
|
|
2474
|
-
<div id="stats" class="stats">
|
|
2475
|
-
<div id="statsTotal" class="stats-row"></div>
|
|
2476
|
-
<div id="statsFiltered" class="stats-row" style="margin-top:6px;"></div>
|
|
2477
|
-
</div>
|
|
2478
|
-
<div id="results"></div>
|
|
2479
|
-
<div id="loading" class="muted">Loading...</div>
|
|
2480
|
-
<script>
|
|
2481
|
-
let page = 1;
|
|
2482
|
-
let loading = false;
|
|
2483
|
-
let done = false;
|
|
2484
|
-
|
|
2485
|
-
function currentParams(nextPage) {
|
|
2486
|
-
const params = new URLSearchParams();
|
|
2487
|
-
params.set("page", String(nextPage));
|
|
2488
|
-
params.set("page_size", "30");
|
|
2489
|
-
const q = document.getElementById("query").value.trim();
|
|
2490
|
-
if (q) params.set("q", q);
|
|
2491
|
-
const fq = document.getElementById("filterQuery").value.trim();
|
|
2492
|
-
if (fq) params.set("fq", fq);
|
|
2493
|
-
function addMulti(id, key) {
|
|
2494
|
-
const el = document.getElementById(id);
|
|
2495
|
-
const values = Array.from(el.selectedOptions).map(opt => opt.value).filter(Boolean);
|
|
2496
|
-
for (const value of values) {
|
|
2497
|
-
params.append(key, value);
|
|
2498
|
-
}
|
|
2499
|
-
}
|
|
2500
|
-
addMulti("filterPdf", "pdf");
|
|
2501
|
-
addMulti("filterSource", "source");
|
|
2502
|
-
addMulti("filterTranslated", "translated");
|
|
2503
|
-
addMulti("filterSummary", "summary");
|
|
2504
|
-
addMulti("filterTemplate", "template");
|
|
2505
|
-
return params;
|
|
2506
|
-
}
|
|
2507
|
-
|
|
2508
|
-
function escapeHtml(text) {
|
|
2509
|
-
const div = document.createElement("div");
|
|
2510
|
-
div.textContent = text;
|
|
2511
|
-
return div.innerHTML;
|
|
2512
|
-
}
|
|
2513
|
-
|
|
2514
|
-
function viewSuffixForItem(item) {
|
|
2515
|
-
let view = document.getElementById("openView").value;
|
|
2516
|
-
const isPdfOnly = item.is_pdf_only;
|
|
2517
|
-
const pdfFallback = item.has_pdf ? "pdfjs" : "pdf";
|
|
2518
|
-
if (isPdfOnly && (view === "summary" || view === "source" || view === "translated")) {
|
|
2519
|
-
view = pdfFallback;
|
|
2520
|
-
}
|
|
2521
|
-
if (!view || view === "summary") return "";
|
|
2522
|
-
const params = new URLSearchParams();
|
|
2523
|
-
params.set("view", view);
|
|
2524
|
-
if (view === "split") {
|
|
2525
|
-
if (isPdfOnly) {
|
|
2526
|
-
params.set("left", pdfFallback);
|
|
2527
|
-
params.set("right", pdfFallback);
|
|
2528
|
-
} else {
|
|
2529
|
-
params.set("left", "summary");
|
|
2530
|
-
if (item.has_pdf) {
|
|
2531
|
-
params.set("right", "pdfjs");
|
|
2532
|
-
} else if (item.has_source) {
|
|
2533
|
-
params.set("right", "source");
|
|
2534
|
-
} else {
|
|
2535
|
-
params.set("right", "summary");
|
|
2536
|
-
}
|
|
2537
|
-
}
|
|
2538
|
-
}
|
|
2539
|
-
return `?${params.toString()}`;
|
|
2540
|
-
}
|
|
2541
|
-
|
|
2542
|
-
function renderItem(item) {
|
|
2543
|
-
const tags = (item.tags || []).map(t => `<span class="pill">${escapeHtml(t)}</span>`).join("");
|
|
2544
|
-
const templateTags = (item.template_tags || []).map(t => `<span class="pill template">tmpl:${escapeHtml(t)}</span>`).join("");
|
|
2545
|
-
const authors = (item.authors || []).slice(0, 6).map(a => escapeHtml(a)).join(", ");
|
|
2546
|
-
const meta = `${escapeHtml(item.year || "")}-${escapeHtml(item.month || "")} · ${escapeHtml(item.venue || "")}`;
|
|
2547
|
-
const viewSuffix = viewSuffixForItem(item);
|
|
2548
|
-
const badges = [
|
|
2549
|
-
item.has_source ? `<span class="pill">source</span>` : "",
|
|
2550
|
-
item.has_translation ? `<span class="pill">translated</span>` : "",
|
|
2551
|
-
item.has_pdf ? `<span class="pill">pdf</span>` : "",
|
|
2552
|
-
item.is_pdf_only ? `<span class="pill pdf-only">pdf-only</span>` : "",
|
|
2553
|
-
].join("");
|
|
2554
|
-
return `
|
|
2555
|
-
<div class="card">
|
|
2556
|
-
<div><a href="/paper/${encodeURIComponent(item.source_hash)}${viewSuffix}">${escapeHtml(item.title || "")}</a></div>
|
|
2557
|
-
<div class="muted">${authors}</div>
|
|
2558
|
-
<div class="muted">${meta}</div>
|
|
2559
|
-
<div style="margin-top:6px">${badges} ${templateTags} ${tags}</div>
|
|
2560
|
-
</div>
|
|
2561
|
-
`;
|
|
2562
|
-
}
|
|
2563
|
-
|
|
2564
|
-
function renderStatsRow(targetId, label, counts) {
|
|
2565
|
-
const row = document.getElementById(targetId);
|
|
2566
|
-
if (!row || !counts) return;
|
|
2567
|
-
const pills = [];
|
|
2568
|
-
pills.push(`<span class="stats-label">${escapeHtml(label)}</span>`);
|
|
2569
|
-
pills.push(`<span class="pill stat">Count ${counts.total}</span>`);
|
|
2570
|
-
pills.push(`<span class="pill stat">PDF ${counts.pdf}</span>`);
|
|
2571
|
-
pills.push(`<span class="pill stat">Source ${counts.source}</span>`);
|
|
2572
|
-
pills.push(`<span class="pill stat">Translated ${counts.translated || 0}</span>`);
|
|
2573
|
-
pills.push(`<span class="pill stat">Summary ${counts.summary}</span>`);
|
|
2574
|
-
const order = counts.template_order || Object.keys(counts.templates || {});
|
|
2575
|
-
for (const tag of order) {
|
|
2576
|
-
const count = (counts.templates && counts.templates[tag]) || 0;
|
|
2577
|
-
pills.push(`<span class="pill stat">tmpl:${escapeHtml(tag)} ${count}</span>`);
|
|
2578
|
-
}
|
|
2579
|
-
row.innerHTML = pills.join("");
|
|
2580
|
-
}
|
|
2581
|
-
|
|
2582
|
-
function updateStats(stats) {
|
|
2583
|
-
if (!stats) return;
|
|
2584
|
-
renderStatsRow("statsTotal", "Total", stats.all);
|
|
2585
|
-
renderStatsRow("statsFiltered", "Filtered", stats.filtered);
|
|
2586
|
-
}
|
|
2587
|
-
|
|
2588
|
-
async function loadMore() {
|
|
2589
|
-
if (loading || done) return;
|
|
2590
|
-
loading = true;
|
|
2591
|
-
document.getElementById("loading").textContent = "Loading...";
|
|
2592
|
-
const res = await fetch(`/api/papers?${currentParams(page).toString()}`);
|
|
2593
|
-
const data = await res.json();
|
|
2594
|
-
if (data.stats) {
|
|
2595
|
-
updateStats(data.stats);
|
|
2596
|
-
}
|
|
2597
|
-
const results = document.getElementById("results");
|
|
2598
|
-
for (const item of data.items) {
|
|
2599
|
-
results.insertAdjacentHTML("beforeend", renderItem(item));
|
|
2600
|
-
}
|
|
2601
|
-
if (!data.has_more) {
|
|
2602
|
-
done = true;
|
|
2603
|
-
document.getElementById("loading").textContent = "End.";
|
|
2604
|
-
} else {
|
|
2605
|
-
page += 1;
|
|
2606
|
-
document.getElementById("loading").textContent = "Scroll to load more...";
|
|
2607
|
-
}
|
|
2608
|
-
loading = false;
|
|
2609
|
-
}
|
|
2610
|
-
|
|
2611
|
-
function resetAndLoad() {
|
|
2612
|
-
page = 1;
|
|
2613
|
-
done = false;
|
|
2614
|
-
document.getElementById("results").innerHTML = "";
|
|
2615
|
-
loadMore();
|
|
2616
|
-
}
|
|
2617
|
-
|
|
2618
|
-
document.getElementById("query").addEventListener("change", resetAndLoad);
|
|
2619
|
-
document.getElementById("openView").addEventListener("change", resetAndLoad);
|
|
2620
|
-
document.getElementById("filterQuery").addEventListener("change", resetAndLoad);
|
|
2621
|
-
document.getElementById("filterPdf").addEventListener("change", resetAndLoad);
|
|
2622
|
-
document.getElementById("filterSource").addEventListener("change", resetAndLoad);
|
|
2623
|
-
document.getElementById("filterTranslated").addEventListener("change", resetAndLoad);
|
|
2624
|
-
document.getElementById("filterSummary").addEventListener("change", resetAndLoad);
|
|
2625
|
-
document.getElementById("filterTemplate").addEventListener("change", resetAndLoad);
|
|
2626
|
-
|
|
2627
|
-
document.getElementById("buildQuery").addEventListener("click", () => {
|
|
2628
|
-
function add(field, value) {
|
|
2629
|
-
value = value.trim();
|
|
2630
|
-
if (!value) return "";
|
|
2631
|
-
if (value.includes(" ")) return `${field}:"${value}"`;
|
|
2632
|
-
return `${field}:${value}`;
|
|
2633
|
-
}
|
|
2634
|
-
const parts = [];
|
|
2635
|
-
const t = document.getElementById("advTitle").value.trim();
|
|
2636
|
-
const a = document.getElementById("advAuthor").value.trim();
|
|
2637
|
-
const tag = document.getElementById("advTag").value.trim();
|
|
2638
|
-
const y = document.getElementById("advYear").value.trim();
|
|
2639
|
-
const m = document.getElementById("advMonth").value.trim();
|
|
2640
|
-
const v = document.getElementById("advVenue").value.trim();
|
|
2641
|
-
if (t) parts.push(add("title", t));
|
|
2642
|
-
if (a) parts.push(add("author", a));
|
|
2643
|
-
if (tag) {
|
|
2644
|
-
for (const item of tag.split(",")) {
|
|
2645
|
-
const val = item.trim();
|
|
2646
|
-
if (val) parts.push(add("tag", val));
|
|
2647
|
-
}
|
|
2648
|
-
}
|
|
2649
|
-
if (y) parts.push(add("year", y));
|
|
2650
|
-
if (m) parts.push(add("month", m));
|
|
2651
|
-
if (v) parts.push(add("venue", v));
|
|
2652
|
-
const q = parts.join(" ");
|
|
2653
|
-
document.getElementById("generated").textContent = q;
|
|
2654
|
-
document.getElementById("query").value = q;
|
|
2655
|
-
resetAndLoad();
|
|
2656
|
-
});
|
|
2657
|
-
|
|
2658
|
-
window.addEventListener("scroll", () => {
|
|
2659
|
-
if ((window.innerHeight + window.scrollY) >= (document.body.offsetHeight - 600)) {
|
|
2660
|
-
loadMore();
|
|
2661
|
-
}
|
|
2662
|
-
});
|
|
2663
|
-
|
|
2664
|
-
loadMore();
|
|
2665
|
-
</script>
|
|
2666
|
-
"""
|
|
2667
|
-
body_html = body_html.replace("__TEMPLATE_OPTIONS__", template_options)
|
|
2668
|
-
body_html = body_html.replace("__FILTER_HELP__", filter_help_attr)
|
|
2669
|
-
return HTMLResponse(_page_shell("Paper DB", body_html))
|
|
2670
|
-
|
|
2671
|
-
|
|
2672
|
-
def _parse_filters(request: Request) -> dict[str, list[str] | str | int]:
|
|
2673
|
-
qp = request.query_params
|
|
2674
|
-
page = int(qp.get("page", "1"))
|
|
2675
|
-
page_size = int(qp.get("page_size", "30"))
|
|
2676
|
-
page = max(1, page)
|
|
2677
|
-
page_size = min(max(1, page_size), 200)
|
|
2678
|
-
|
|
2679
|
-
q = qp.get("q", "").strip()
|
|
2680
|
-
filter_query = qp.get("fq", "").strip()
|
|
2681
|
-
pdf_filters = [item for item in qp.getlist("pdf") if item]
|
|
2682
|
-
source_filters = [item for item in qp.getlist("source") if item]
|
|
2683
|
-
summary_filters = [item for item in qp.getlist("summary") if item]
|
|
2684
|
-
translated_filters = [item for item in qp.getlist("translated") if item]
|
|
2685
|
-
template_filters = [item for item in qp.getlist("template") if item]
|
|
2686
|
-
|
|
2687
|
-
return {
|
|
2688
|
-
"page": page,
|
|
2689
|
-
"page_size": page_size,
|
|
2690
|
-
"q": q,
|
|
2691
|
-
"filter_query": filter_query,
|
|
2692
|
-
"pdf": pdf_filters,
|
|
2693
|
-
"source": source_filters,
|
|
2694
|
-
"summary": summary_filters,
|
|
2695
|
-
"translated": translated_filters,
|
|
2696
|
-
"template": template_filters,
|
|
2697
|
-
}
|
|
2698
|
-
|
|
2699
|
-
|
|
2700
|
-
async def _api_papers(request: Request) -> JSONResponse:
|
|
2701
|
-
index: PaperIndex = request.app.state.index
|
|
2702
|
-
filters = _parse_filters(request)
|
|
2703
|
-
page = int(filters["page"])
|
|
2704
|
-
page_size = int(filters["page_size"])
|
|
2705
|
-
q = str(filters["q"])
|
|
2706
|
-
filter_query = str(filters["filter_query"])
|
|
2707
|
-
query = parse_query(q)
|
|
2708
|
-
candidate = _apply_query(index, query)
|
|
2709
|
-
filter_terms = _parse_filter_query(filter_query)
|
|
2710
|
-
pdf_filter = _merge_filter_set(_presence_filter(filters["pdf"]), _presence_filter(list(filter_terms["pdf"])))
|
|
2711
|
-
source_filter = _merge_filter_set(
|
|
2712
|
-
_presence_filter(filters["source"]), _presence_filter(list(filter_terms["source"]))
|
|
2713
|
-
)
|
|
2714
|
-
summary_filter = _merge_filter_set(
|
|
2715
|
-
_presence_filter(filters["summary"]), _presence_filter(list(filter_terms["summary"]))
|
|
2716
|
-
)
|
|
2717
|
-
translated_filter = _merge_filter_set(
|
|
2718
|
-
_presence_filter(filters["translated"]), _presence_filter(list(filter_terms["translated"]))
|
|
2719
|
-
)
|
|
2720
|
-
template_selected = {item.lower() for item in filters["template"] if item}
|
|
2721
|
-
template_filter = _merge_filter_set(
|
|
2722
|
-
template_selected or None,
|
|
2723
|
-
filter_terms["template"] or None,
|
|
2724
|
-
)
|
|
2725
|
-
|
|
2726
|
-
if candidate:
|
|
2727
|
-
filtered: set[int] = set()
|
|
2728
|
-
for idx in candidate:
|
|
2729
|
-
paper = index.papers[idx]
|
|
2730
|
-
source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))
|
|
2731
|
-
has_source = source_hash in index.md_path_by_hash
|
|
2732
|
-
has_pdf = source_hash in index.pdf_path_by_hash
|
|
2733
|
-
has_summary = bool(paper.get("_has_summary"))
|
|
2734
|
-
has_translated = bool(index.translated_md_by_hash.get(source_hash))
|
|
2735
|
-
if not _matches_presence(pdf_filter, has_pdf):
|
|
2736
|
-
continue
|
|
2737
|
-
if not _matches_presence(source_filter, has_source):
|
|
2738
|
-
continue
|
|
2739
|
-
if not _matches_presence(summary_filter, has_summary):
|
|
2740
|
-
continue
|
|
2741
|
-
if not _matches_presence(translated_filter, has_translated):
|
|
2742
|
-
continue
|
|
2743
|
-
if template_filter:
|
|
2744
|
-
tags = paper.get("_template_tags_lc") or []
|
|
2745
|
-
if not any(tag in template_filter for tag in tags):
|
|
2746
|
-
continue
|
|
2747
|
-
filtered.add(idx)
|
|
2748
|
-
candidate = filtered
|
|
2749
|
-
ordered = [idx for idx in index.ordered_ids if idx in candidate]
|
|
2750
|
-
total = len(ordered)
|
|
2751
|
-
start = (page - 1) * page_size
|
|
2752
|
-
end = min(start + page_size, total)
|
|
2753
|
-
page_ids = ordered[start:end]
|
|
2754
|
-
stats_payload = None
|
|
2755
|
-
if page == 1:
|
|
2756
|
-
all_ids = set(index.ordered_ids)
|
|
2757
|
-
stats_payload = {
|
|
2758
|
-
"all": _compute_counts(index, all_ids),
|
|
2759
|
-
"filtered": _compute_counts(index, candidate),
|
|
2760
|
-
}
|
|
2761
|
-
|
|
2762
|
-
items: list[dict[str, Any]] = []
|
|
2763
|
-
for idx in page_ids:
|
|
2764
|
-
paper = index.papers[idx]
|
|
2765
|
-
source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))
|
|
2766
|
-
translations = index.translated_md_by_hash.get(source_hash, {})
|
|
2767
|
-
translation_languages = sorted(translations.keys(), key=str.lower)
|
|
2768
|
-
items.append(
|
|
2769
|
-
{
|
|
2770
|
-
"source_hash": source_hash,
|
|
2771
|
-
"title": paper.get("paper_title") or "",
|
|
2772
|
-
"authors": paper.get("_authors") or [],
|
|
2773
|
-
"year": paper.get("_year") or "",
|
|
2774
|
-
"month": paper.get("_month") or "",
|
|
2775
|
-
"venue": paper.get("_venue") or "",
|
|
2776
|
-
"tags": paper.get("_tags") or [],
|
|
2777
|
-
"template_tags": paper.get("_template_tags") or [],
|
|
2778
|
-
"has_source": source_hash in index.md_path_by_hash,
|
|
2779
|
-
"has_translation": bool(translation_languages),
|
|
2780
|
-
"has_pdf": source_hash in index.pdf_path_by_hash,
|
|
2781
|
-
"has_summary": bool(paper.get("_has_summary")),
|
|
2782
|
-
"is_pdf_only": bool(paper.get("_is_pdf_only")),
|
|
2783
|
-
"translation_languages": translation_languages,
|
|
2784
|
-
}
|
|
2785
|
-
)
|
|
2786
|
-
|
|
2787
|
-
return JSONResponse(
|
|
2788
|
-
{
|
|
2789
|
-
"page": page,
|
|
2790
|
-
"page_size": page_size,
|
|
2791
|
-
"total": total,
|
|
2792
|
-
"has_more": end < total,
|
|
2793
|
-
"items": items,
|
|
2794
|
-
"stats": stats_payload,
|
|
2795
|
-
}
|
|
2796
|
-
)
|
|
2797
|
-
|
|
2798
|
-
|
|
2799
|
-
async def _paper_detail(request: Request) -> HTMLResponse:
|
|
2800
|
-
index: PaperIndex = request.app.state.index
|
|
2801
|
-
md = request.app.state.md
|
|
2802
|
-
source_hash = request.path_params["source_hash"]
|
|
2803
|
-
idx = index.id_by_hash.get(source_hash)
|
|
2804
|
-
if idx is None:
|
|
2805
|
-
return RedirectResponse("/")
|
|
2806
|
-
paper = index.papers[idx]
|
|
2807
|
-
is_pdf_only = bool(paper.get("_is_pdf_only"))
|
|
2808
|
-
page_title = str(paper.get("paper_title") or "Paper")
|
|
2809
|
-
view = request.query_params.get("view")
|
|
2810
|
-
template_param = request.query_params.get("template")
|
|
2811
|
-
embed = request.query_params.get("embed") == "1"
|
|
2812
|
-
|
|
2813
|
-
pdf_path = index.pdf_path_by_hash.get(source_hash)
|
|
2814
|
-
pdf_url = f"/api/pdf/{source_hash}"
|
|
2815
|
-
source_available = source_hash in index.md_path_by_hash
|
|
2816
|
-
translations = index.translated_md_by_hash.get(source_hash, {})
|
|
2817
|
-
translation_langs = sorted(translations.keys(), key=str.lower)
|
|
2818
|
-
lang_param = request.query_params.get("lang")
|
|
2819
|
-
normalized_lang = lang_param.lower() if lang_param else None
|
|
2820
|
-
selected_lang = None
|
|
2821
|
-
if translation_langs:
|
|
2822
|
-
if normalized_lang and normalized_lang in translations:
|
|
2823
|
-
selected_lang = normalized_lang
|
|
2824
|
-
elif "zh" in translations:
|
|
2825
|
-
selected_lang = "zh"
|
|
2826
|
-
else:
|
|
2827
|
-
selected_lang = translation_langs[0]
|
|
2828
|
-
allowed_views = {"summary", "source", "translated", "pdf", "pdfjs", "split"}
|
|
2829
|
-
if is_pdf_only:
|
|
2830
|
-
allowed_views = {"pdf", "pdfjs", "split"}
|
|
2831
|
-
|
|
2832
|
-
def normalize_view(value: str | None, default: str) -> str:
|
|
2833
|
-
if value in allowed_views:
|
|
2834
|
-
return value
|
|
2835
|
-
return default
|
|
2836
|
-
|
|
2837
|
-
preferred_pdf_view = "pdfjs" if pdf_path else "pdf"
|
|
2838
|
-
default_view = preferred_pdf_view if is_pdf_only else "summary"
|
|
2839
|
-
view = normalize_view(view, default_view)
|
|
2840
|
-
if view == "split":
|
|
2841
|
-
embed = False
|
|
2842
|
-
if is_pdf_only:
|
|
2843
|
-
left_param = request.query_params.get("left")
|
|
2844
|
-
right_param = request.query_params.get("right")
|
|
2845
|
-
left = normalize_view(left_param, preferred_pdf_view) if left_param else preferred_pdf_view
|
|
2846
|
-
right = normalize_view(right_param, preferred_pdf_view) if right_param else preferred_pdf_view
|
|
2847
|
-
else:
|
|
2848
|
-
default_left = preferred_pdf_view if pdf_path else ("source" if source_available else "summary")
|
|
2849
|
-
default_right = "summary"
|
|
2850
|
-
left_param = request.query_params.get("left")
|
|
2851
|
-
right_param = request.query_params.get("right")
|
|
2852
|
-
left = normalize_view(left_param, default_left) if left_param else default_left
|
|
2853
|
-
right = normalize_view(right_param, default_right) if right_param else default_right
|
|
2854
|
-
|
|
2855
|
-
def render_page(title: str, body: str, extra_head: str = "", extra_scripts: str = "") -> HTMLResponse:
|
|
2856
|
-
if embed:
|
|
2857
|
-
return HTMLResponse(_embed_shell(title, body, extra_head, extra_scripts))
|
|
2858
|
-
return HTMLResponse(_page_shell(title, body, extra_head, extra_scripts, header_title=page_title))
|
|
2859
|
-
|
|
2860
|
-
def nav_link(label: str, v: str) -> str:
|
|
2861
|
-
active = " active" if view == v else ""
|
|
2862
|
-
params: dict[str, str] = {"view": v}
|
|
2863
|
-
if v == "summary" and template_param:
|
|
2864
|
-
params["template"] = str(template_param)
|
|
2865
|
-
if v == "translated" and selected_lang:
|
|
2866
|
-
params["lang"] = selected_lang
|
|
2867
|
-
if v == "split":
|
|
2868
|
-
params["left"] = left
|
|
2869
|
-
params["right"] = right
|
|
2870
|
-
href = f"/paper/{source_hash}?{urlencode(params)}"
|
|
2871
|
-
return f'<a class="tab{active}" href="{html.escape(href)}">{html.escape(label)}</a>'
|
|
2872
|
-
|
|
2873
|
-
tab_defs = [
|
|
2874
|
-
("Summary", "summary"),
|
|
2875
|
-
("Source", "source"),
|
|
2876
|
-
("Translated", "translated"),
|
|
2877
|
-
("PDF", "pdf"),
|
|
2878
|
-
("PDF Viewer", "pdfjs"),
|
|
2879
|
-
("Split", "split"),
|
|
2880
|
-
]
|
|
2881
|
-
if is_pdf_only:
|
|
2882
|
-
tab_defs = [
|
|
2883
|
-
("PDF", "pdf"),
|
|
2884
|
-
("PDF Viewer", "pdfjs"),
|
|
2885
|
-
("Split", "split"),
|
|
2886
|
-
]
|
|
2887
|
-
tabs_html = '<div class="tabs">' + "".join(nav_link(label, v) for label, v in tab_defs) + "</div>"
|
|
2888
|
-
fullscreen_controls = """
|
|
2889
|
-
<div class="fullscreen-actions">
|
|
2890
|
-
<button id="fullscreenEnter" class="fullscreen-enter" type="button" title="Enter fullscreen">Fullscreen</button>
|
|
2891
|
-
<button id="fullscreenExit" class="fullscreen-exit" type="button" title="Exit fullscreen">Exit Fullscreen</button>
|
|
2892
|
-
</div>
|
|
2893
|
-
"""
|
|
2894
|
-
|
|
2895
|
-
def detail_toolbar(extra_controls: str = "") -> str:
|
|
2896
|
-
if embed:
|
|
2897
|
-
return ""
|
|
2898
|
-
controls = extra_controls.strip()
|
|
2899
|
-
toolbar_controls = f"{controls}{fullscreen_controls}" if controls else fullscreen_controls
|
|
2900
|
-
return f"""
|
|
2901
|
-
<div class="detail-toolbar">
|
|
2902
|
-
{tabs_html}
|
|
2903
|
-
<div class="toolbar-actions">
|
|
2904
|
-
{toolbar_controls}
|
|
2905
|
-
</div>
|
|
2906
|
-
</div>
|
|
2907
|
-
"""
|
|
2908
|
-
|
|
2909
|
-
def wrap_detail(content: str, toolbar_html: str | None = None) -> str:
|
|
2910
|
-
if embed:
|
|
2911
|
-
return content
|
|
2912
|
-
toolbar = detail_toolbar() if toolbar_html is None else toolbar_html
|
|
2913
|
-
return f"""
|
|
2914
|
-
<div class="detail-shell">
|
|
2915
|
-
{toolbar}
|
|
2916
|
-
<div class="detail-body">
|
|
2917
|
-
{content}
|
|
2918
|
-
</div>
|
|
2919
|
-
</div>
|
|
2920
|
-
"""
|
|
2921
|
-
|
|
2922
|
-
fullscreen_script = ""
|
|
2923
|
-
if not embed:
|
|
2924
|
-
fullscreen_script = """
|
|
2925
|
-
<script>
|
|
2926
|
-
const fullscreenEnter = document.getElementById('fullscreenEnter');
|
|
2927
|
-
const fullscreenExit = document.getElementById('fullscreenExit');
|
|
2928
|
-
function setFullscreen(enable) {
|
|
2929
|
-
document.body.classList.toggle('detail-fullscreen', enable);
|
|
2930
|
-
}
|
|
2931
|
-
if (fullscreenEnter) {
|
|
2932
|
-
fullscreenEnter.addEventListener('click', () => setFullscreen(true));
|
|
2933
|
-
}
|
|
2934
|
-
if (fullscreenExit) {
|
|
2935
|
-
fullscreenExit.addEventListener('click', () => setFullscreen(false));
|
|
2936
|
-
}
|
|
2937
|
-
document.addEventListener('keydown', (event) => {
|
|
2938
|
-
if (event.key === 'Escape' && document.body.classList.contains('detail-fullscreen')) {
|
|
2939
|
-
setFullscreen(false);
|
|
2940
|
-
}
|
|
2941
|
-
});
|
|
2942
|
-
</script>
|
|
2943
|
-
"""
|
|
2944
|
-
pdf_only_warning_html = ""
|
|
2945
|
-
if is_pdf_only:
|
|
2946
|
-
pdf_only_warning_html = (
|
|
2947
|
-
'<div class="warning">PDF-only entry: summary and source views are unavailable.</div>'
|
|
2948
|
-
)
|
|
2949
|
-
outline_top = "72px" if not embed else "16px"
|
|
2950
|
-
outline_html, outline_css, outline_js = _outline_assets(outline_top)
|
|
2951
|
-
|
|
2952
|
-
if view == "split":
|
|
2953
|
-
def pane_src(pane_view: str) -> str:
|
|
2954
|
-
if pane_view == "pdfjs" and pdf_path:
|
|
2955
|
-
return _build_pdfjs_viewer_url(pdf_url)
|
|
2956
|
-
params: dict[str, str] = {"view": pane_view, "embed": "1"}
|
|
2957
|
-
if pane_view == "summary" and template_param:
|
|
2958
|
-
params["template"] = str(template_param)
|
|
2959
|
-
if pane_view == "translated" and selected_lang:
|
|
2960
|
-
params["lang"] = selected_lang
|
|
2961
|
-
return f"/paper/{source_hash}?{urlencode(params)}"
|
|
2962
|
-
|
|
2963
|
-
left_src = pane_src(left)
|
|
2964
|
-
right_src = pane_src(right)
|
|
2965
|
-
options = [
|
|
2966
|
-
("summary", "Summary"),
|
|
2967
|
-
("source", "Source"),
|
|
2968
|
-
("translated", "Translated"),
|
|
2969
|
-
("pdf", "PDF"),
|
|
2970
|
-
("pdfjs", "PDF Viewer"),
|
|
2971
|
-
]
|
|
2972
|
-
if is_pdf_only:
|
|
2973
|
-
options = [
|
|
2974
|
-
("pdf", "PDF"),
|
|
2975
|
-
("pdfjs", "PDF Viewer"),
|
|
2976
|
-
]
|
|
2977
|
-
if translation_langs:
|
|
2978
|
-
lang_options = "\n".join(
|
|
2979
|
-
f'<option value="{html.escape(lang)}"{" selected" if lang == selected_lang else ""}>'
|
|
2980
|
-
f'{html.escape(lang)}</option>'
|
|
2981
|
-
for lang in translation_langs
|
|
2982
|
-
)
|
|
2983
|
-
lang_disabled = ""
|
|
2984
|
-
else:
|
|
2985
|
-
lang_options = '<option value="" selected>(no translations)</option>'
|
|
2986
|
-
lang_disabled = " disabled"
|
|
2987
|
-
left_options = "\n".join(
|
|
2988
|
-
f'<option value="{value}"{" selected" if value == left else ""}>{label}</option>'
|
|
2989
|
-
for value, label in options
|
|
2990
|
-
)
|
|
2991
|
-
right_options = "\n".join(
|
|
2992
|
-
f'<option value="{value}"{" selected" if value == right else ""}>{label}</option>'
|
|
2993
|
-
for value, label in options
|
|
2994
|
-
)
|
|
2995
|
-
split_controls = f"""
|
|
2996
|
-
<div class="split-inline">
|
|
2997
|
-
<span class="muted">Left</span>
|
|
2998
|
-
<select id="splitLeft">
|
|
2999
|
-
{left_options}
|
|
3000
|
-
</select>
|
|
3001
|
-
<div class="split-actions">
|
|
3002
|
-
<button id="splitTighten" type="button" title="Tighten width">-</button>
|
|
3003
|
-
<button id="splitSwap" type="button" title="Swap panes">⇄</button>
|
|
3004
|
-
<button id="splitWiden" type="button" title="Widen width">+</button>
|
|
3005
|
-
</div>
|
|
3006
|
-
<span class="muted">Right</span>
|
|
3007
|
-
<select id="splitRight">
|
|
3008
|
-
{right_options}
|
|
3009
|
-
</select>
|
|
3010
|
-
<span class="muted">Lang</span>
|
|
3011
|
-
<select id="splitLang"{lang_disabled}>
|
|
3012
|
-
{lang_options}
|
|
3013
|
-
</select>
|
|
3014
|
-
</div>
|
|
3015
|
-
"""
|
|
3016
|
-
toolbar_html = detail_toolbar(split_controls)
|
|
3017
|
-
split_layout = f"""
|
|
3018
|
-
{pdf_only_warning_html}
|
|
3019
|
-
<div class="split-layout">
|
|
3020
|
-
<div class="split-pane">
|
|
3021
|
-
<iframe id="leftPane" src="{html.escape(left_src)}" title="Left pane"></iframe>
|
|
3022
|
-
</div>
|
|
3023
|
-
<div class="split-pane">
|
|
3024
|
-
<iframe id="rightPane" src="{html.escape(right_src)}" title="Right pane"></iframe>
|
|
3025
|
-
</div>
|
|
3026
|
-
</div>
|
|
3027
|
-
"""
|
|
3028
|
-
body = wrap_detail(split_layout, toolbar_html=toolbar_html)
|
|
3029
|
-
extra_head = """
|
|
3030
|
-
<style>
|
|
3031
|
-
.container {
|
|
3032
|
-
max-width: 100%;
|
|
3033
|
-
width: 100%;
|
|
3034
|
-
margin: 0 auto;
|
|
3035
|
-
}
|
|
3036
|
-
.split-layout {
|
|
3037
|
-
display: flex;
|
|
3038
|
-
gap: 12px;
|
|
3039
|
-
width: 100%;
|
|
3040
|
-
max-width: var(--split-max-width, 100%);
|
|
3041
|
-
margin: 0 auto;
|
|
3042
|
-
flex: 1;
|
|
3043
|
-
min-height: 440px;
|
|
3044
|
-
}
|
|
3045
|
-
.split-pane {
|
|
3046
|
-
flex: 1;
|
|
3047
|
-
border: 1px solid #d0d7de;
|
|
3048
|
-
border-radius: 10px;
|
|
3049
|
-
overflow: hidden;
|
|
3050
|
-
background: #fff;
|
|
3051
|
-
}
|
|
3052
|
-
.split-pane iframe {
|
|
3053
|
-
width: 100%;
|
|
3054
|
-
height: 100%;
|
|
3055
|
-
border: 0;
|
|
3056
|
-
}
|
|
3057
|
-
@media (max-width: 900px) {
|
|
3058
|
-
.split-layout {
|
|
3059
|
-
flex-direction: column;
|
|
3060
|
-
min-height: 0;
|
|
3061
|
-
}
|
|
3062
|
-
.split-pane {
|
|
3063
|
-
height: 70vh;
|
|
3064
|
-
}
|
|
3065
|
-
}
|
|
3066
|
-
</style>
|
|
3067
|
-
"""
|
|
3068
|
-
extra_scripts = """
|
|
3069
|
-
<script>
|
|
3070
|
-
const leftSelect = document.getElementById('splitLeft');
|
|
3071
|
-
const rightSelect = document.getElementById('splitRight');
|
|
3072
|
-
const langSelect = document.getElementById('splitLang');
|
|
3073
|
-
const swapButton = document.getElementById('splitSwap');
|
|
3074
|
-
const tightenButton = document.getElementById('splitTighten');
|
|
3075
|
-
const widenButton = document.getElementById('splitWiden');
|
|
3076
|
-
function updateSplit() {
|
|
3077
|
-
const params = new URLSearchParams(window.location.search);
|
|
3078
|
-
params.set('view', 'split');
|
|
3079
|
-
params.set('left', leftSelect.value);
|
|
3080
|
-
params.set('right', rightSelect.value);
|
|
3081
|
-
if (langSelect && langSelect.value) {
|
|
3082
|
-
params.set('lang', langSelect.value);
|
|
3083
|
-
}
|
|
3084
|
-
window.location.search = params.toString();
|
|
3085
|
-
}
|
|
3086
|
-
leftSelect.addEventListener('change', updateSplit);
|
|
3087
|
-
rightSelect.addEventListener('change', updateSplit);
|
|
3088
|
-
if (langSelect) {
|
|
3089
|
-
langSelect.addEventListener('change', updateSplit);
|
|
3090
|
-
}
|
|
3091
|
-
swapButton.addEventListener('click', () => {
|
|
3092
|
-
const leftValue = leftSelect.value;
|
|
3093
|
-
leftSelect.value = rightSelect.value;
|
|
3094
|
-
rightSelect.value = leftValue;
|
|
3095
|
-
updateSplit();
|
|
3096
|
-
});
|
|
3097
|
-
const widthSteps = ["1200px", "1400px", "1600px", "1800px", "2000px", "100%"];
|
|
3098
|
-
let widthIndex = widthSteps.length - 1;
|
|
3099
|
-
try {
|
|
3100
|
-
const stored = localStorage.getItem('splitWidthIndex');
|
|
3101
|
-
if (stored !== null) {
|
|
3102
|
-
const parsed = Number.parseInt(stored, 10);
|
|
3103
|
-
if (!Number.isNaN(parsed)) {
|
|
3104
|
-
widthIndex = Math.max(0, Math.min(widthSteps.length - 1, parsed));
|
|
3105
|
-
}
|
|
3106
|
-
}
|
|
3107
|
-
} catch (err) {
|
|
3108
|
-
// Ignore storage errors (e.g. private mode)
|
|
3109
|
-
}
|
|
3110
|
-
|
|
3111
|
-
function applySplitWidth() {
|
|
3112
|
-
const value = widthSteps[widthIndex];
|
|
3113
|
-
document.documentElement.style.setProperty('--split-max-width', value);
|
|
3114
|
-
try {
|
|
3115
|
-
localStorage.setItem('splitWidthIndex', String(widthIndex));
|
|
3116
|
-
} catch (err) {
|
|
3117
|
-
// Ignore storage errors
|
|
3118
|
-
}
|
|
3119
|
-
}
|
|
3120
|
-
|
|
3121
|
-
tightenButton.addEventListener('click', () => {
|
|
3122
|
-
widthIndex = Math.max(0, widthIndex - 1);
|
|
3123
|
-
applySplitWidth();
|
|
3124
|
-
});
|
|
3125
|
-
widenButton.addEventListener('click', () => {
|
|
3126
|
-
widthIndex = Math.min(widthSteps.length - 1, widthIndex + 1);
|
|
3127
|
-
applySplitWidth();
|
|
3128
|
-
});
|
|
3129
|
-
applySplitWidth();
|
|
3130
|
-
</script>
|
|
3131
|
-
"""
|
|
3132
|
-
return render_page(
|
|
3133
|
-
"Split View",
|
|
3134
|
-
body,
|
|
3135
|
-
extra_head=extra_head,
|
|
3136
|
-
extra_scripts=extra_scripts + fullscreen_script,
|
|
3137
|
-
)
|
|
3138
|
-
|
|
3139
|
-
if view == "translated":
|
|
3140
|
-
if translation_langs:
|
|
3141
|
-
lang_options = "\n".join(
|
|
3142
|
-
f'<option value="{html.escape(lang)}"{" selected" if lang == selected_lang else ""}>'
|
|
3143
|
-
f'{html.escape(lang)}</option>'
|
|
3144
|
-
for lang in translation_langs
|
|
3145
|
-
)
|
|
3146
|
-
disabled_attr = ""
|
|
3147
|
-
else:
|
|
3148
|
-
lang_options = '<option value="" selected>(no translations)</option>'
|
|
3149
|
-
disabled_attr = " disabled"
|
|
3150
|
-
lang_controls = f"""
|
|
3151
|
-
<div class="lang-select">
|
|
3152
|
-
<label for="translationLang">Language</label>
|
|
3153
|
-
<select id="translationLang"{disabled_attr}>
|
|
3154
|
-
{lang_options}
|
|
3155
|
-
</select>
|
|
3156
|
-
</div>
|
|
3157
|
-
"""
|
|
3158
|
-
toolbar_html = detail_toolbar(lang_controls)
|
|
3159
|
-
if not translation_langs or not selected_lang:
|
|
3160
|
-
body = wrap_detail(
|
|
3161
|
-
'<div class="warning">No translated markdown found. '
|
|
3162
|
-
'Provide <code>--md-translated-root</code> and place '
|
|
3163
|
-
'<code><base>.<lang>.md</code> under that root.</div>',
|
|
3164
|
-
toolbar_html=toolbar_html,
|
|
3165
|
-
)
|
|
3166
|
-
return render_page("Translated", body, extra_scripts=fullscreen_script)
|
|
3167
|
-
translated_path = translations.get(selected_lang)
|
|
3168
|
-
if not translated_path:
|
|
3169
|
-
body = wrap_detail(
|
|
3170
|
-
'<div class="warning">Translated markdown not found for the selected language.</div>',
|
|
3171
|
-
toolbar_html=toolbar_html,
|
|
3172
|
-
)
|
|
3173
|
-
return render_page("Translated", body, extra_scripts=fullscreen_script)
|
|
3174
|
-
try:
|
|
3175
|
-
raw = translated_path.read_text(encoding="utf-8")
|
|
3176
|
-
except UnicodeDecodeError:
|
|
3177
|
-
raw = translated_path.read_text(encoding="latin-1")
|
|
3178
|
-
raw = _normalize_markdown_images(raw)
|
|
3179
|
-
rendered = _render_markdown_with_math_placeholders(md, raw)
|
|
3180
|
-
body = wrap_detail(
|
|
3181
|
-
f"""
|
|
3182
|
-
<div class="muted">Language: {html.escape(selected_lang)}</div>
|
|
3183
|
-
<div class="muted">{html.escape(str(translated_path))}</div>
|
|
3184
|
-
<div class="muted" style="margin-top:10px;">Rendered from translated markdown:</div>
|
|
3185
|
-
{outline_html}
|
|
3186
|
-
<div id="content">{rendered}</div>
|
|
3187
|
-
<details style="margin-top:12px;"><summary>Raw markdown</summary>
|
|
3188
|
-
<pre><code>{html.escape(raw)}</code></pre>
|
|
3189
|
-
</details>
|
|
3190
|
-
""",
|
|
3191
|
-
toolbar_html=toolbar_html,
|
|
3192
|
-
)
|
|
3193
|
-
extra_head = f"""
|
|
3194
|
-
<link rel="stylesheet" href="{_CDN_KATEX}" />
|
|
3195
|
-
{outline_css}
|
|
3196
|
-
<style>
|
|
3197
|
-
#content img {{
|
|
3198
|
-
max-width: 100%;
|
|
3199
|
-
height: auto;
|
|
3200
|
-
}}
|
|
3201
|
-
</style>
|
|
3202
|
-
"""
|
|
3203
|
-
extra_scripts = f"""
|
|
3204
|
-
<script src="{_CDN_MERMAID}"></script>
|
|
3205
|
-
<script src="{_CDN_KATEX_JS}"></script>
|
|
3206
|
-
<script src="{_CDN_KATEX_AUTO}"></script>
|
|
3207
|
-
<script>
|
|
3208
|
-
const translationSelect = document.getElementById('translationLang');
|
|
3209
|
-
if (translationSelect) {{
|
|
3210
|
-
translationSelect.addEventListener('change', () => {{
|
|
3211
|
-
const params = new URLSearchParams(window.location.search);
|
|
3212
|
-
params.set('view', 'translated');
|
|
3213
|
-
params.set('lang', translationSelect.value);
|
|
3214
|
-
window.location.search = params.toString();
|
|
3215
|
-
}});
|
|
3216
|
-
}}
|
|
3217
|
-
document.querySelectorAll('code.language-mermaid').forEach((code) => {{
|
|
3218
|
-
const pre = code.parentElement;
|
|
3219
|
-
const div = document.createElement('div');
|
|
3220
|
-
div.className = 'mermaid';
|
|
3221
|
-
div.textContent = code.textContent;
|
|
3222
|
-
pre.replaceWith(div);
|
|
3223
|
-
}});
|
|
3224
|
-
if (window.mermaid) {{
|
|
3225
|
-
mermaid.initialize({{ startOnLoad: false }});
|
|
3226
|
-
mermaid.run();
|
|
3227
|
-
}}
|
|
3228
|
-
if (window.renderMathInElement) {{
|
|
3229
|
-
renderMathInElement(document.getElementById('content'), {{
|
|
3230
|
-
delimiters: [
|
|
3231
|
-
{{left: '$$', right: '$$', display: true}},
|
|
3232
|
-
{{left: '$', right: '$', display: false}},
|
|
3233
|
-
{{left: '\\\\(', right: '\\\\)', display: false}},
|
|
3234
|
-
{{left: '\\\\[', right: '\\\\]', display: true}}
|
|
3235
|
-
],
|
|
3236
|
-
throwOnError: false
|
|
3237
|
-
}});
|
|
3238
|
-
}}
|
|
3239
|
-
if (document.querySelector('.footnotes')) {{
|
|
3240
|
-
const notes = {{}};
|
|
3241
|
-
document.querySelectorAll('.footnotes li[id]').forEach((li) => {{
|
|
3242
|
-
const id = li.getAttribute('id');
|
|
3243
|
-
if (!id) return;
|
|
3244
|
-
const clone = li.cloneNode(true);
|
|
3245
|
-
clone.querySelectorAll('a.footnote-backref').forEach((el) => el.remove());
|
|
3246
|
-
const text = (clone.textContent || '').replace(/\\s+/g, ' ').trim();
|
|
3247
|
-
if (text) notes['#' + id] = text.length > 400 ? text.slice(0, 397) + '…' : text;
|
|
3248
|
-
}});
|
|
3249
|
-
document.querySelectorAll('.footnote-ref a[href^="#fn"]').forEach((link) => {{
|
|
3250
|
-
const ref = link.getAttribute('href');
|
|
3251
|
-
const text = notes[ref];
|
|
3252
|
-
if (!text) return;
|
|
3253
|
-
link.dataset.footnote = text;
|
|
3254
|
-
link.classList.add('footnote-tip');
|
|
3255
|
-
}});
|
|
3256
|
-
}}
|
|
3257
|
-
{outline_js}
|
|
3258
|
-
</script>
|
|
3259
|
-
"""
|
|
3260
|
-
return render_page(
|
|
3261
|
-
"Translated",
|
|
3262
|
-
body,
|
|
3263
|
-
extra_head=extra_head,
|
|
3264
|
-
extra_scripts=extra_scripts + fullscreen_script,
|
|
3265
|
-
)
|
|
3266
|
-
|
|
3267
|
-
if view == "source":
|
|
3268
|
-
source_path = index.md_path_by_hash.get(source_hash)
|
|
3269
|
-
if not source_path:
|
|
3270
|
-
body = wrap_detail(
|
|
3271
|
-
'<div class="warning">Source markdown not found. Provide --md-root to enable source viewing.</div>'
|
|
3272
|
-
)
|
|
3273
|
-
return render_page("Source", body, extra_scripts=fullscreen_script)
|
|
3274
|
-
try:
|
|
3275
|
-
raw = source_path.read_text(encoding="utf-8")
|
|
3276
|
-
except UnicodeDecodeError:
|
|
3277
|
-
raw = source_path.read_text(encoding="latin-1")
|
|
3278
|
-
rendered = _render_markdown_with_math_placeholders(md, raw)
|
|
3279
|
-
body = wrap_detail(
|
|
3280
|
-
f"""
|
|
3281
|
-
<div class="muted">{html.escape(str(source_path))}</div>
|
|
3282
|
-
<div class="muted" style="margin-top:10px;">Rendered from source markdown:</div>
|
|
3283
|
-
{outline_html}
|
|
3284
|
-
<div id="content">{rendered}</div>
|
|
3285
|
-
<details style="margin-top:12px;"><summary>Raw markdown</summary>
|
|
3286
|
-
<pre><code>{html.escape(raw)}</code></pre>
|
|
3287
|
-
</details>
|
|
3288
|
-
"""
|
|
3289
|
-
)
|
|
3290
|
-
extra_head = f"""
|
|
3291
|
-
<link rel="stylesheet" href="{_CDN_KATEX}" />
|
|
3292
|
-
{outline_css}
|
|
3293
|
-
<style>
|
|
3294
|
-
#content img {{
|
|
3295
|
-
max-width: 100%;
|
|
3296
|
-
height: auto;
|
|
3297
|
-
}}
|
|
3298
|
-
</style>
|
|
3299
|
-
"""
|
|
3300
|
-
extra_scripts = f"""
|
|
3301
|
-
<script src="{_CDN_MERMAID}"></script>
|
|
3302
|
-
<script src="{_CDN_KATEX_JS}"></script>
|
|
3303
|
-
<script src="{_CDN_KATEX_AUTO}"></script>
|
|
3304
|
-
<script>
|
|
3305
|
-
document.querySelectorAll('code.language-mermaid').forEach((code) => {{
|
|
3306
|
-
const pre = code.parentElement;
|
|
3307
|
-
const div = document.createElement('div');
|
|
3308
|
-
div.className = 'mermaid';
|
|
3309
|
-
div.textContent = code.textContent;
|
|
3310
|
-
pre.replaceWith(div);
|
|
3311
|
-
}});
|
|
3312
|
-
if (window.mermaid) {{
|
|
3313
|
-
mermaid.initialize({{ startOnLoad: false }});
|
|
3314
|
-
mermaid.run();
|
|
3315
|
-
}}
|
|
3316
|
-
if (window.renderMathInElement) {{
|
|
3317
|
-
renderMathInElement(document.getElementById('content'), {{
|
|
3318
|
-
delimiters: [
|
|
3319
|
-
{{left: '$$', right: '$$', display: true}},
|
|
3320
|
-
{{left: '$', right: '$', display: false}},
|
|
3321
|
-
{{left: '\\\\(', right: '\\\\)', display: false}},
|
|
3322
|
-
{{left: '\\\\[', right: '\\\\]', display: true}}
|
|
3323
|
-
],
|
|
3324
|
-
throwOnError: false
|
|
3325
|
-
}});
|
|
3326
|
-
}}
|
|
3327
|
-
if (document.querySelector('.footnotes')) {{
|
|
3328
|
-
const notes = {{}};
|
|
3329
|
-
document.querySelectorAll('.footnotes li[id]').forEach((li) => {{
|
|
3330
|
-
const id = li.getAttribute('id');
|
|
3331
|
-
if (!id) return;
|
|
3332
|
-
const clone = li.cloneNode(true);
|
|
3333
|
-
clone.querySelectorAll('a.footnote-backref').forEach((el) => el.remove());
|
|
3334
|
-
const text = (clone.textContent || '').replace(/\\s+/g, ' ').trim();
|
|
3335
|
-
if (text) notes['#' + id] = text.length > 400 ? text.slice(0, 397) + '…' : text;
|
|
3336
|
-
}});
|
|
3337
|
-
document.querySelectorAll('.footnote-ref a[href^="#fn"]').forEach((link) => {{
|
|
3338
|
-
const ref = link.getAttribute('href');
|
|
3339
|
-
const text = notes[ref];
|
|
3340
|
-
if (!text) return;
|
|
3341
|
-
link.dataset.footnote = text;
|
|
3342
|
-
link.classList.add('footnote-tip');
|
|
3343
|
-
}});
|
|
3344
|
-
}}
|
|
3345
|
-
{outline_js}
|
|
3346
|
-
</script>
|
|
3347
|
-
"""
|
|
3348
|
-
return render_page("Source", body, extra_head=extra_head, extra_scripts=extra_scripts + fullscreen_script)
|
|
3349
|
-
|
|
3350
|
-
if view == "pdf":
|
|
3351
|
-
if not pdf_path:
|
|
3352
|
-
body = wrap_detail('<div class="warning">PDF not found. Provide --pdf-root to enable PDF viewing.</div>')
|
|
3353
|
-
return render_page("PDF", body, extra_scripts=fullscreen_script)
|
|
3354
|
-
body = wrap_detail(
|
|
3355
|
-
f"""
|
|
3356
|
-
{pdf_only_warning_html}
|
|
3357
|
-
<div class="muted">{html.escape(str(pdf_path.name))}</div>
|
|
3358
|
-
<div style="display:flex; gap:8px; align-items:center; margin: 10px 0;">
|
|
3359
|
-
<button id="prev" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">Prev</button>
|
|
3360
|
-
<button id="next" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">Next</button>
|
|
3361
|
-
<span class="muted">Page <span id="page_num">1</span> / <span id="page_count">?</span></span>
|
|
3362
|
-
<span style="flex:1"></span>
|
|
3363
|
-
<button id="zoomOut" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">-</button>
|
|
3364
|
-
<button id="zoomIn" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">+</button>
|
|
3365
|
-
</div>
|
|
3366
|
-
<canvas id="the-canvas" style="width: 100%; border: 1px solid #d0d7de; border-radius: 10px;"></canvas>
|
|
3367
|
-
"""
|
|
3368
|
-
)
|
|
3369
|
-
extra_scripts = f"""
|
|
3370
|
-
<script src="{_CDN_PDFJS}"></script>
|
|
3371
|
-
<script>
|
|
3372
|
-
const url = {json.dumps(pdf_url)};
|
|
3373
|
-
pdfjsLib.GlobalWorkerOptions.workerSrc = {json.dumps(_CDN_PDFJS_WORKER)};
|
|
3374
|
-
let pdfDoc = null;
|
|
3375
|
-
let pageNum = 1;
|
|
3376
|
-
let pageRendering = false;
|
|
3377
|
-
let pageNumPending = null;
|
|
3378
|
-
let zoomLevel = 1.0;
|
|
3379
|
-
const canvas = document.getElementById('the-canvas');
|
|
3380
|
-
const ctx = canvas.getContext('2d');
|
|
3381
|
-
|
|
3382
|
-
function renderPage(num) {{
|
|
3383
|
-
pageRendering = true;
|
|
3384
|
-
pdfDoc.getPage(num).then((page) => {{
|
|
3385
|
-
const baseViewport = page.getViewport({{scale: 1}});
|
|
3386
|
-
const containerWidth = canvas.clientWidth || baseViewport.width;
|
|
3387
|
-
const fitScale = containerWidth / baseViewport.width;
|
|
3388
|
-
const scale = fitScale * zoomLevel;
|
|
3389
|
-
|
|
3390
|
-
const viewport = page.getViewport({{scale}});
|
|
3391
|
-
const outputScale = window.devicePixelRatio || 1;
|
|
3392
|
-
|
|
3393
|
-
canvas.width = Math.floor(viewport.width * outputScale);
|
|
3394
|
-
canvas.height = Math.floor(viewport.height * outputScale);
|
|
3395
|
-
canvas.style.width = Math.floor(viewport.width) + 'px';
|
|
3396
|
-
canvas.style.height = Math.floor(viewport.height) + 'px';
|
|
3397
|
-
|
|
3398
|
-
const transform = outputScale !== 1 ? [outputScale, 0, 0, outputScale, 0, 0] : null;
|
|
3399
|
-
const renderContext = {{ canvasContext: ctx, viewport, transform }};
|
|
3400
|
-
const renderTask = page.render(renderContext);
|
|
3401
|
-
renderTask.promise.then(() => {{
|
|
3402
|
-
pageRendering = false;
|
|
3403
|
-
document.getElementById('page_num').textContent = String(pageNum);
|
|
3404
|
-
if (pageNumPending !== null) {{
|
|
3405
|
-
const next = pageNumPending;
|
|
3406
|
-
pageNumPending = null;
|
|
3407
|
-
renderPage(next);
|
|
3408
|
-
}}
|
|
3409
|
-
}});
|
|
3410
|
-
}});
|
|
3411
|
-
}}
|
|
3412
|
-
|
|
3413
|
-
function queueRenderPage(num) {{
|
|
3414
|
-
if (pageRendering) {{
|
|
3415
|
-
pageNumPending = num;
|
|
3416
|
-
}} else {{
|
|
3417
|
-
renderPage(num);
|
|
3418
|
-
}}
|
|
3419
|
-
}}
|
|
3420
|
-
|
|
3421
|
-
function onPrevPage() {{
|
|
3422
|
-
if (pageNum <= 1) return;
|
|
3423
|
-
pageNum--;
|
|
3424
|
-
queueRenderPage(pageNum);
|
|
3425
|
-
}}
|
|
3426
|
-
|
|
3427
|
-
function onNextPage() {{
|
|
3428
|
-
if (pageNum >= pdfDoc.numPages) return;
|
|
3429
|
-
pageNum++;
|
|
3430
|
-
queueRenderPage(pageNum);
|
|
3431
|
-
}}
|
|
3432
|
-
|
|
3433
|
-
function adjustZoom(delta) {{
|
|
3434
|
-
zoomLevel = Math.max(0.5, Math.min(3.0, zoomLevel + delta));
|
|
3435
|
-
queueRenderPage(pageNum);
|
|
3436
|
-
}}
|
|
3437
|
-
|
|
3438
|
-
document.getElementById('prev').addEventListener('click', onPrevPage);
|
|
3439
|
-
document.getElementById('next').addEventListener('click', onNextPage);
|
|
3440
|
-
document.getElementById('zoomOut').addEventListener('click', () => adjustZoom(-0.1));
|
|
3441
|
-
document.getElementById('zoomIn').addEventListener('click', () => adjustZoom(0.1));
|
|
3442
|
-
|
|
3443
|
-
pdfjsLib.getDocument(url).promise.then((pdfDoc_) => {{
|
|
3444
|
-
pdfDoc = pdfDoc_;
|
|
3445
|
-
document.getElementById('page_count').textContent = String(pdfDoc.numPages);
|
|
3446
|
-
renderPage(pageNum);
|
|
3447
|
-
}});
|
|
3448
|
-
|
|
3449
|
-
let resizeTimer = null;
|
|
3450
|
-
window.addEventListener('resize', () => {{
|
|
3451
|
-
if (!pdfDoc) return;
|
|
3452
|
-
if (resizeTimer) clearTimeout(resizeTimer);
|
|
3453
|
-
resizeTimer = setTimeout(() => queueRenderPage(pageNum), 150);
|
|
3454
|
-
}});
|
|
3455
|
-
</script>
|
|
3456
|
-
"""
|
|
3457
|
-
return render_page("PDF", body, extra_scripts=extra_scripts + fullscreen_script)
|
|
3458
|
-
|
|
3459
|
-
if view == "pdfjs":
|
|
3460
|
-
if not pdf_path:
|
|
3461
|
-
body = wrap_detail('<div class="warning">PDF not found. Provide --pdf-root to enable PDF viewing.</div>')
|
|
3462
|
-
return render_page("PDF Viewer", body, extra_scripts=fullscreen_script)
|
|
3463
|
-
viewer_url = _build_pdfjs_viewer_url(pdf_url)
|
|
3464
|
-
frame_height = "calc(100vh - 32px)" if embed else "100%"
|
|
3465
|
-
body = wrap_detail(
|
|
3466
|
-
f"""
|
|
3467
|
-
{pdf_only_warning_html}
|
|
3468
|
-
<div class="muted">{html.escape(str(pdf_path.name))}</div>
|
|
3469
|
-
<iframe class="pdfjs-frame" src="{html.escape(viewer_url)}" title="PDF.js Viewer"></iframe>
|
|
3470
|
-
"""
|
|
3471
|
-
)
|
|
3472
|
-
extra_head = f"""
|
|
3473
|
-
<style>
|
|
3474
|
-
.pdfjs-frame {{
|
|
3475
|
-
width: 100%;
|
|
3476
|
-
height: {frame_height};
|
|
3477
|
-
border: 1px solid #d0d7de;
|
|
3478
|
-
border-radius: 10px;
|
|
3479
|
-
flex: 1;
|
|
3480
|
-
}}
|
|
3481
|
-
</style>
|
|
3482
|
-
"""
|
|
3483
|
-
return render_page("PDF Viewer", body, extra_head=extra_head, extra_scripts=fullscreen_script)
|
|
3484
|
-
|
|
3485
|
-
selected_tag, available_templates = _select_template_tag(paper, template_param)
|
|
3486
|
-
markdown, template_name, warning = _render_paper_markdown(
|
|
3487
|
-
paper,
|
|
3488
|
-
request.app.state.fallback_language,
|
|
3489
|
-
template_tag=selected_tag,
|
|
3490
|
-
)
|
|
3491
|
-
rendered_html = _render_markdown_with_math_placeholders(md, markdown)
|
|
3492
|
-
|
|
3493
|
-
warning_html = f'<div class="warning">{html.escape(warning)}</div>' if warning else ""
|
|
3494
|
-
template_controls = f'<div class="muted">Template: {html.escape(template_name)}</div>'
|
|
3495
|
-
if available_templates:
|
|
3496
|
-
options = "\n".join(
|
|
3497
|
-
f'<option value="{html.escape(tag)}"{" selected" if tag == selected_tag else ""}>{html.escape(tag)}</option>'
|
|
3498
|
-
for tag in available_templates
|
|
3499
|
-
)
|
|
3500
|
-
template_controls = f"""
|
|
3501
|
-
<div class="muted" style="margin: 6px 0;">
|
|
3502
|
-
Template:
|
|
3503
|
-
<select id="templateSelect" style="padding:6px 8px; border:1px solid #d0d7de; border-radius:6px;">
|
|
3504
|
-
{options}
|
|
3505
|
-
</select>
|
|
3506
|
-
</div>
|
|
3507
|
-
<script>
|
|
3508
|
-
const templateSelect = document.getElementById('templateSelect');
|
|
3509
|
-
if (templateSelect) {{
|
|
3510
|
-
templateSelect.addEventListener('change', () => {{
|
|
3511
|
-
const params = new URLSearchParams(window.location.search);
|
|
3512
|
-
params.set('view', 'summary');
|
|
3513
|
-
params.set('template', templateSelect.value);
|
|
3514
|
-
window.location.search = params.toString();
|
|
3515
|
-
}});
|
|
3516
|
-
}}
|
|
3517
|
-
</script>
|
|
3518
|
-
"""
|
|
3519
|
-
content_html = f"""
|
|
3520
|
-
{template_controls}
|
|
3521
|
-
{warning_html}
|
|
3522
|
-
{outline_html}
|
|
3523
|
-
<div id="content">{rendered_html}</div>
|
|
3524
|
-
"""
|
|
3525
|
-
body = wrap_detail(content_html)
|
|
3526
|
-
|
|
3527
|
-
extra_head = f"""
|
|
3528
|
-
<link rel="stylesheet" href="{_CDN_KATEX}" />
|
|
3529
|
-
{outline_css}
|
|
3530
|
-
"""
|
|
3531
|
-
extra_scripts = f"""
|
|
3532
|
-
<script src="{_CDN_MERMAID}"></script>
|
|
3533
|
-
<script src="{_CDN_KATEX_JS}"></script>
|
|
3534
|
-
<script src="{_CDN_KATEX_AUTO}"></script>
|
|
3535
|
-
<script>
|
|
3536
|
-
// Mermaid: convert fenced code blocks to mermaid divs
|
|
3537
|
-
document.querySelectorAll('code.language-mermaid').forEach((code) => {{
|
|
3538
|
-
const pre = code.parentElement;
|
|
3539
|
-
const div = document.createElement('div');
|
|
3540
|
-
div.className = 'mermaid';
|
|
3541
|
-
div.textContent = code.textContent;
|
|
3542
|
-
pre.replaceWith(div);
|
|
3543
|
-
}});
|
|
3544
|
-
if (window.mermaid) {{
|
|
3545
|
-
mermaid.initialize({{ startOnLoad: false }});
|
|
3546
|
-
mermaid.run();
|
|
3547
|
-
}}
|
|
3548
|
-
if (window.renderMathInElement) {{
|
|
3549
|
-
renderMathInElement(document.getElementById('content'), {{
|
|
3550
|
-
delimiters: [
|
|
3551
|
-
{{left: '$$', right: '$$', display: true}},
|
|
3552
|
-
{{left: '$', right: '$', display: false}},
|
|
3553
|
-
{{left: '\\\\(', right: '\\\\)', display: false}},
|
|
3554
|
-
{{left: '\\\\[', right: '\\\\]', display: true}}
|
|
3555
|
-
],
|
|
3556
|
-
throwOnError: false
|
|
3557
|
-
}});
|
|
3558
|
-
}}
|
|
3559
|
-
if (document.querySelector('.footnotes')) {{
|
|
3560
|
-
const notes = {{}};
|
|
3561
|
-
document.querySelectorAll('.footnotes li[id]').forEach((li) => {{
|
|
3562
|
-
const id = li.getAttribute('id');
|
|
3563
|
-
if (!id) return;
|
|
3564
|
-
const clone = li.cloneNode(true);
|
|
3565
|
-
clone.querySelectorAll('a.footnote-backref').forEach((el) => el.remove());
|
|
3566
|
-
const text = (clone.textContent || '').replace(/\\s+/g, ' ').trim();
|
|
3567
|
-
if (text) notes['#' + id] = text.length > 400 ? text.slice(0, 397) + '…' : text;
|
|
3568
|
-
}});
|
|
3569
|
-
document.querySelectorAll('.footnote-ref a[href^="#fn"]').forEach((link) => {{
|
|
3570
|
-
const ref = link.getAttribute('href');
|
|
3571
|
-
const text = notes[ref];
|
|
3572
|
-
if (!text) return;
|
|
3573
|
-
link.dataset.footnote = text;
|
|
3574
|
-
link.classList.add('footnote-tip');
|
|
3575
|
-
}});
|
|
3576
|
-
}}
|
|
3577
|
-
{outline_js}
|
|
3578
|
-
</script>
|
|
3579
|
-
"""
|
|
3580
|
-
return render_page(page_title, body, extra_head=extra_head, extra_scripts=extra_scripts + fullscreen_script)
|
|
3581
|
-
|
|
3582
|
-
|
|
3583
|
-
async def _api_stats(request: Request) -> JSONResponse:
|
|
3584
|
-
index: PaperIndex = request.app.state.index
|
|
3585
|
-
return JSONResponse(index.stats)
|
|
3586
|
-
|
|
3587
|
-
|
|
3588
|
-
async def _api_pdf(request: Request) -> Response:
|
|
3589
|
-
index: PaperIndex = request.app.state.index
|
|
3590
|
-
source_hash = request.path_params["source_hash"]
|
|
3591
|
-
pdf_path = index.pdf_path_by_hash.get(source_hash)
|
|
3592
|
-
if not pdf_path:
|
|
3593
|
-
return Response("PDF not found", status_code=404)
|
|
3594
|
-
allowed_roots: list[Path] = request.app.state.pdf_roots
|
|
3595
|
-
if allowed_roots and not _ensure_under_roots(pdf_path, allowed_roots):
|
|
3596
|
-
return Response("Forbidden", status_code=403)
|
|
3597
|
-
return FileResponse(pdf_path)
|
|
3598
|
-
|
|
3599
|
-
|
|
3600
|
-
async def _stats_page(request: Request) -> HTMLResponse:
|
|
3601
|
-
body = """
|
|
3602
|
-
<h2>Stats</h2>
|
|
3603
|
-
<div class="muted">Charts are rendered with ECharts (CDN).</div>
|
|
3604
|
-
<div id="year" style="width:100%;height:360px"></div>
|
|
3605
|
-
<div id="month" style="width:100%;height:360px"></div>
|
|
3606
|
-
<div id="tags" style="width:100%;height:420px"></div>
|
|
3607
|
-
<div id="keywords" style="width:100%;height:420px"></div>
|
|
3608
|
-
<div id="authors" style="width:100%;height:420px"></div>
|
|
3609
|
-
<div id="venues" style="width:100%;height:420px"></div>
|
|
3610
|
-
"""
|
|
3611
|
-
scripts = f"""
|
|
3612
|
-
<script src="{_CDN_ECHARTS}"></script>
|
|
3613
|
-
<script>
|
|
3614
|
-
async function main() {{
|
|
3615
|
-
const res = await fetch('/api/stats');
|
|
3616
|
-
const data = await res.json();
|
|
3617
|
-
|
|
3618
|
-
function bar(el, title, items) {{
|
|
3619
|
-
const chart = echarts.init(document.getElementById(el));
|
|
3620
|
-
const labels = items.map(x => x.label);
|
|
3621
|
-
const counts = items.map(x => x.count);
|
|
3622
|
-
chart.setOption({{
|
|
3623
|
-
title: {{ text: title }},
|
|
3624
|
-
tooltip: {{ trigger: 'axis' }},
|
|
3625
|
-
xAxis: {{ type: 'category', data: labels }},
|
|
3626
|
-
yAxis: {{ type: 'value' }},
|
|
3627
|
-
series: [{{ type: 'bar', data: counts }}]
|
|
3628
|
-
}});
|
|
3629
|
-
}}
|
|
3630
|
-
|
|
3631
|
-
bar('year', 'Publication Year', data.years || []);
|
|
3632
|
-
bar('month', 'Publication Month', data.months || []);
|
|
3633
|
-
bar('tags', 'Top Tags', (data.tags || []).slice(0, 20));
|
|
3634
|
-
bar('keywords', 'Top Keywords', (data.keywords || []).slice(0, 20));
|
|
3635
|
-
bar('authors', 'Top Authors', (data.authors || []).slice(0, 20));
|
|
3636
|
-
bar('venues', 'Top Venues', (data.venues || []).slice(0, 20));
|
|
3637
|
-
}}
|
|
3638
|
-
main();
|
|
3639
|
-
</script>
|
|
3640
|
-
"""
|
|
3641
|
-
return HTMLResponse(_page_shell("Stats", body, extra_scripts=scripts))
|
|
3642
|
-
|
|
3643
|
-
|
|
3644
|
-
def _normalize_bibtex_title(title: str) -> str:
|
|
3645
|
-
value = title.replace("{", "").replace("}", "")
|
|
3646
|
-
value = re.sub(r"[^a-z0-9]+", " ", value.lower())
|
|
3647
|
-
return re.sub(r"\\s+", " ", value).strip()
|
|
3648
|
-
|
|
3649
|
-
|
|
3650
|
-
def _title_similarity(a: str, b: str) -> float:
|
|
3651
|
-
import difflib
|
|
3652
|
-
|
|
3653
|
-
if not a or not b:
|
|
3654
|
-
return 0.0
|
|
3655
|
-
return difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()
|
|
3656
|
-
|
|
3657
|
-
|
|
3658
|
-
def enrich_with_bibtex(papers: list[dict[str, Any]], bibtex_path: Path) -> None:
|
|
3659
|
-
if not PYBTEX_AVAILABLE:
|
|
3660
|
-
raise RuntimeError("pybtex is required for --bibtex support")
|
|
3661
|
-
|
|
3662
|
-
bib_data = parse_file(str(bibtex_path))
|
|
3663
|
-
entries: list[dict[str, Any]] = []
|
|
3664
|
-
by_prefix: dict[str, list[int]] = {}
|
|
3665
|
-
for key, entry in bib_data.entries.items():
|
|
3666
|
-
fields = dict(entry.fields)
|
|
3667
|
-
title = str(fields.get("title") or "").strip()
|
|
3668
|
-
title_norm = _normalize_bibtex_title(title)
|
|
3669
|
-
if not title_norm:
|
|
3670
|
-
continue
|
|
3671
|
-
record = {
|
|
3672
|
-
"key": key,
|
|
3673
|
-
"type": entry.type,
|
|
3674
|
-
"fields": fields,
|
|
3675
|
-
"persons": {role: [str(p) for p in persons] for role, persons in entry.persons.items()},
|
|
3676
|
-
"_title_norm": title_norm,
|
|
3677
|
-
}
|
|
3678
|
-
idx = len(entries)
|
|
3679
|
-
entries.append(record)
|
|
3680
|
-
prefix = title_norm[:16]
|
|
3681
|
-
by_prefix.setdefault(prefix, []).append(idx)
|
|
3682
|
-
|
|
3683
|
-
for paper in papers:
|
|
3684
|
-
if isinstance(paper.get("bibtex"), dict):
|
|
3685
|
-
continue
|
|
3686
|
-
title = str(paper.get("paper_title") or "").strip()
|
|
3687
|
-
if not title:
|
|
3688
|
-
continue
|
|
3689
|
-
norm = _normalize_bibtex_title(title)
|
|
3690
|
-
if not norm:
|
|
3691
|
-
continue
|
|
3692
|
-
|
|
3693
|
-
candidates = []
|
|
3694
|
-
prefix = norm[:16]
|
|
3695
|
-
for cand_idx in by_prefix.get(prefix, []):
|
|
3696
|
-
candidates.append(entries[cand_idx])
|
|
3697
|
-
if not candidates:
|
|
3698
|
-
candidates = entries
|
|
3699
|
-
|
|
3700
|
-
best = None
|
|
3701
|
-
best_score = 0.0
|
|
3702
|
-
for entry in candidates:
|
|
3703
|
-
score = _title_similarity(norm, entry["_title_norm"])
|
|
3704
|
-
if score > best_score:
|
|
3705
|
-
best_score = score
|
|
3706
|
-
best = entry
|
|
3707
|
-
|
|
3708
|
-
if best is not None and best_score >= 0.9:
|
|
3709
|
-
paper["bibtex"] = {k: v for k, v in best.items() if not k.startswith("_")}
|
|
28
|
+
class _NoIndexMiddleware(BaseHTTPMiddleware):
|
|
29
|
+
async def dispatch(self, request: Request, call_next): # type: ignore[override]
|
|
30
|
+
response = await call_next(request)
|
|
31
|
+
response.headers["X-Robots-Tag"] = "noindex, nofollow, noarchive, nosnippet, noai, noimageai"
|
|
32
|
+
return response
|
|
3710
33
|
|
|
3711
34
|
|
|
3712
35
|
def create_app(
|
|
@@ -3720,7 +43,7 @@ def create_app(
|
|
|
3720
43
|
cache_dir: Path | None = None,
|
|
3721
44
|
use_cache: bool = True,
|
|
3722
45
|
) -> Starlette:
|
|
3723
|
-
papers =
|
|
46
|
+
papers = load_and_merge_papers(db_paths, bibtex_path, cache_dir, use_cache, pdf_roots=pdf_roots)
|
|
3724
47
|
|
|
3725
48
|
md_roots = md_roots or []
|
|
3726
49
|
md_translated_roots = md_translated_roots or []
|
|
@@ -3731,29 +54,39 @@ def create_app(
|
|
|
3731
54
|
md_translated_roots=md_translated_roots,
|
|
3732
55
|
pdf_roots=pdf_roots,
|
|
3733
56
|
)
|
|
3734
|
-
md =
|
|
57
|
+
md = create_md_renderer()
|
|
3735
58
|
routes = [
|
|
3736
|
-
Route("/",
|
|
3737
|
-
Route("/
|
|
3738
|
-
Route("/
|
|
3739
|
-
Route("/
|
|
3740
|
-
Route("/api/
|
|
3741
|
-
Route("/api/
|
|
59
|
+
Route("/", index_page, methods=["GET"]),
|
|
60
|
+
Route("/robots.txt", robots_txt, methods=["GET"]),
|
|
61
|
+
Route("/stats", stats_page, methods=["GET"]),
|
|
62
|
+
Route("/paper/{source_hash:str}", paper_detail, methods=["GET"]),
|
|
63
|
+
Route("/api/papers", api_papers, methods=["GET"]),
|
|
64
|
+
Route("/api/stats", api_stats, methods=["GET"]),
|
|
65
|
+
Route("/api/pdf/{source_hash:str}", api_pdf, methods=["GET"]),
|
|
3742
66
|
]
|
|
3743
|
-
if
|
|
67
|
+
if PDFJS_STATIC_DIR.exists():
|
|
3744
68
|
routes.append(
|
|
3745
69
|
Mount(
|
|
3746
70
|
"/pdfjs",
|
|
3747
|
-
app=StaticFiles(directory=str(
|
|
71
|
+
app=StaticFiles(directory=str(PDFJS_STATIC_DIR), html=True),
|
|
3748
72
|
name="pdfjs",
|
|
3749
73
|
)
|
|
3750
74
|
)
|
|
3751
75
|
elif pdf_roots:
|
|
3752
76
|
logger.warning(
|
|
3753
77
|
"PDF.js viewer assets not found at %s; PDF Viewer mode will be unavailable.",
|
|
3754
|
-
|
|
78
|
+
PDFJS_STATIC_DIR,
|
|
79
|
+
)
|
|
80
|
+
if STATIC_DIR.exists():
|
|
81
|
+
routes.append(
|
|
82
|
+
Mount(
|
|
83
|
+
"/static",
|
|
84
|
+
app=StaticFiles(directory=str(STATIC_DIR)),
|
|
85
|
+
name="static",
|
|
86
|
+
)
|
|
3755
87
|
)
|
|
3756
88
|
app = Starlette(routes=routes)
|
|
89
|
+
app.add_middleware(_NoIndexMiddleware)
|
|
3757
90
|
app.state.index = index
|
|
3758
91
|
app.state.md = md
|
|
3759
92
|
app.state.fallback_language = fallback_language
|