deepresearch-flow 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (30)
  1. deepresearch_flow/paper/db.py +184 -0
  2. deepresearch_flow/paper/db_ops.py +1939 -0
  3. deepresearch_flow/paper/web/app.py +38 -3705
  4. deepresearch_flow/paper/web/constants.py +23 -0
  5. deepresearch_flow/paper/web/filters.py +255 -0
  6. deepresearch_flow/paper/web/handlers/__init__.py +14 -0
  7. deepresearch_flow/paper/web/handlers/api.py +217 -0
  8. deepresearch_flow/paper/web/handlers/pages.py +334 -0
  9. deepresearch_flow/paper/web/markdown.py +549 -0
  10. deepresearch_flow/paper/web/static/css/main.css +857 -0
  11. deepresearch_flow/paper/web/static/js/detail.js +406 -0
  12. deepresearch_flow/paper/web/static/js/index.js +266 -0
  13. deepresearch_flow/paper/web/static/js/outline.js +58 -0
  14. deepresearch_flow/paper/web/static/js/stats.js +39 -0
  15. deepresearch_flow/paper/web/templates/base.html +43 -0
  16. deepresearch_flow/paper/web/templates/detail.html +332 -0
  17. deepresearch_flow/paper/web/templates/index.html +114 -0
  18. deepresearch_flow/paper/web/templates/stats.html +29 -0
  19. deepresearch_flow/paper/web/templates.py +85 -0
  20. deepresearch_flow/paper/web/text.py +68 -0
  21. deepresearch_flow/recognize/cli.py +805 -26
  22. deepresearch_flow/recognize/katex_check.js +29 -0
  23. deepresearch_flow/recognize/math.py +719 -0
  24. deepresearch_flow/recognize/mermaid.py +690 -0
  25. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/METADATA +78 -4
  26. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/RECORD +30 -9
  27. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/WHEEL +0 -0
  28. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/entry_points.txt +0 -0
  29. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/licenses/LICENSE +0 -0
  30. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/top_level.txt +0 -0
@@ -1,3712 +1,35 @@
1
1
  from __future__ import annotations
2
2
 
3
- import html
4
- import json
5
3
  import logging
6
- import unicodedata
7
- from dataclasses import dataclass
8
- from html.parser import HTMLParser
9
4
  from pathlib import Path
10
- from typing import Any
11
- import re
12
- from urllib.parse import urlencode, quote
13
5
 
14
- from markdown_it import MarkdownIt
15
- try:
16
- from mdit_py_plugins.footnote import footnote_plugin as footnote
17
- except ImportError: # pragma: no cover - compatibility with older names
18
- from mdit_py_plugins.footnote import footnote
19
6
  from starlette.applications import Starlette
7
+ from starlette.middleware.base import BaseHTTPMiddleware
20
8
  from starlette.requests import Request
21
- from starlette.responses import FileResponse, HTMLResponse, JSONResponse, RedirectResponse, Response
22
9
  from starlette.routing import Mount, Route
23
10
  from starlette.staticfiles import StaticFiles
24
11
 
25
- from deepresearch_flow.paper.render import load_default_template
26
- from deepresearch_flow.paper.template_registry import (
27
- list_template_names_in_registry_order,
28
- load_render_template,
29
- load_schema_for_template,
12
+ from deepresearch_flow.paper.db_ops import build_index, load_and_merge_papers
13
+ from deepresearch_flow.paper.web.constants import PDFJS_STATIC_DIR, STATIC_DIR
14
+ from deepresearch_flow.paper.web.handlers import (
15
+ api_papers,
16
+ api_pdf,
17
+ api_stats,
18
+ index_page,
19
+ paper_detail,
20
+ robots_txt,
21
+ stats_page,
30
22
  )
31
- from deepresearch_flow.paper.utils import stable_hash
32
- from deepresearch_flow.paper.web.query import Query, QueryTerm, parse_query
33
-
34
- try:
35
- from pybtex.database import parse_file
36
- PYBTEX_AVAILABLE = True
37
- except Exception:
38
- PYBTEX_AVAILABLE = False
39
-
40
- try:
41
- from pypdf import PdfReader
42
- PYPDF_AVAILABLE = True
43
- except Exception:
44
- PYPDF_AVAILABLE = False
45
-
46
-
47
- _CDN_ECHARTS = "https://cdn.jsdelivr.net/npm/echarts@5/dist/echarts.min.js"
48
- _CDN_MERMAID = "https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"
49
- _CDN_KATEX = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.css"
50
- _CDN_KATEX_JS = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.js"
51
- _CDN_KATEX_AUTO = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/contrib/auto-render.min.js"
52
- # Use legacy builds to ensure `pdfjsLib` is available as a global.
53
- _CDN_PDFJS = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.min.js"
54
- _CDN_PDFJS_WORKER = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.worker.min.js"
55
- _PDFJS_VIEWER_PATH = "/pdfjs/web/viewer.html"
56
- _PDFJS_STATIC_DIR = Path(__file__).resolve().parent / "pdfjs"
23
+ from deepresearch_flow.paper.web.markdown import create_md_renderer
57
24
 
58
25
  logger = logging.getLogger(__name__)
59
26
 
60
27
 
61
- @dataclass(frozen=True)
62
- class PaperIndex:
63
- papers: list[dict[str, Any]]
64
- id_by_hash: dict[str, int]
65
- ordered_ids: list[int]
66
- by_tag: dict[str, set[int]]
67
- by_author: dict[str, set[int]]
68
- by_year: dict[str, set[int]]
69
- by_month: dict[str, set[int]]
70
- by_venue: dict[str, set[int]]
71
- stats: dict[str, Any]
72
- md_path_by_hash: dict[str, Path]
73
- translated_md_by_hash: dict[str, dict[str, Path]]
74
- pdf_path_by_hash: dict[str, Path]
75
- template_tags: list[str]
76
-
77
-
78
- def _split_csv(values: list[str]) -> list[str]:
79
- out: list[str] = []
80
- for value in values:
81
- for part in value.split(","):
82
- part = part.strip()
83
- if part:
84
- out.append(part)
85
- return out
86
-
87
-
88
- def _normalize_key(value: str) -> str:
89
- return value.strip().lower()
90
-
91
-
92
- def _parse_year_month(date_str: str | None) -> tuple[str | None, str | None]:
93
- if not date_str:
94
- return None, None
95
- text = str(date_str).strip()
96
- year = None
97
- month = None
98
-
99
- year_match = re.search(r"(19|20)\d{2}", text)
100
- if year_match:
101
- year = year_match.group(0)
102
-
103
- numeric_match = re.search(r"(19|20)\d{2}[-/](\d{1,2})", text)
104
- if numeric_match:
105
- m = int(numeric_match.group(2))
106
- if 1 <= m <= 12:
107
- month = f"{m:02d}"
108
- return year, month
109
-
110
- month_word = re.search(
111
- r"(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|"
112
- r"january|february|march|april|june|july|august|september|october|november|december)",
113
- text.lower(),
114
- )
115
- if month_word:
116
- lookup = {
117
- "january": "01",
118
- "february": "02",
119
- "march": "03",
120
- "april": "04",
121
- "may": "05",
122
- "june": "06",
123
- "july": "07",
124
- "august": "08",
125
- "september": "09",
126
- "october": "10",
127
- "november": "11",
128
- "december": "12",
129
- "jan": "01",
130
- "feb": "02",
131
- "mar": "03",
132
- "apr": "04",
133
- "jun": "06",
134
- "jul": "07",
135
- "aug": "08",
136
- "sep": "09",
137
- "sept": "09",
138
- "oct": "10",
139
- "nov": "11",
140
- "dec": "12",
141
- }
142
- month = lookup.get(month_word.group(0))
143
- return year, month
144
-
145
-
146
- def _normalize_month_token(value: str | int | None) -> str | None:
147
- if value is None:
148
- return None
149
- if isinstance(value, int):
150
- if 1 <= value <= 12:
151
- return f"{value:02d}"
152
- return None
153
- raw = str(value).strip().lower()
154
- if not raw:
155
- return None
156
- if raw.isdigit():
157
- return _normalize_month_token(int(raw))
158
- lookup = {
159
- "january": "01",
160
- "february": "02",
161
- "march": "03",
162
- "april": "04",
163
- "may": "05",
164
- "june": "06",
165
- "july": "07",
166
- "august": "08",
167
- "september": "09",
168
- "october": "10",
169
- "november": "11",
170
- "december": "12",
171
- "jan": "01",
172
- "feb": "02",
173
- "mar": "03",
174
- "apr": "04",
175
- "jun": "06",
176
- "jul": "07",
177
- "aug": "08",
178
- "sep": "09",
179
- "sept": "09",
180
- "oct": "10",
181
- "nov": "11",
182
- "dec": "12",
183
- }
184
- return lookup.get(raw)
185
-
186
-
187
- def _extract_authors(paper: dict[str, Any]) -> list[str]:
188
- value = paper.get("paper_authors")
189
- if value is None:
190
- return []
191
- if isinstance(value, list):
192
- return [str(item).strip() for item in value if str(item).strip()]
193
- if isinstance(value, str):
194
- return [part.strip() for part in value.split(",") if part.strip()]
195
- return [str(value)]
196
-
197
-
198
- def _extract_tags(paper: dict[str, Any]) -> list[str]:
199
- tags = paper.get("ai_generated_tags") or []
200
- if isinstance(tags, list):
201
- return [str(tag).strip() for tag in tags if str(tag).strip()]
202
- return []
203
-
204
-
205
- def _extract_keywords(paper: dict[str, Any]) -> list[str]:
206
- keywords = paper.get("keywords") or []
207
- if isinstance(keywords, list):
208
- return [str(keyword).strip() for keyword in keywords if str(keyword).strip()]
209
- if isinstance(keywords, str):
210
- parts = re.split(r"[;,]", keywords)
211
- return [part.strip() for part in parts if part.strip()]
212
- return []
213
-
214
-
215
- _SUMMARY_FIELDS = (
216
- "summary",
217
- "abstract",
218
- "keywords",
219
- "question1",
220
- "question2",
221
- "question3",
222
- "question4",
223
- "question5",
224
- "question6",
225
- "question7",
226
- "question8",
227
- )
228
-
229
-
230
- def _has_summary(paper: dict[str, Any], template_tags: list[str]) -> bool:
231
- if template_tags:
232
- return True
233
- for key in _SUMMARY_FIELDS:
234
- value = paper.get(key)
235
- if isinstance(value, str) and value.strip():
236
- return True
237
- return False
238
-
239
-
240
- def _extract_venue(paper: dict[str, Any]) -> str:
241
- if isinstance(paper.get("bibtex"), dict):
242
- bib = paper.get("bibtex") or {}
243
- fields = bib.get("fields") or {}
244
- bib_type = (bib.get("type") or "").lower()
245
- if bib_type == "article" and fields.get("journal"):
246
- return str(fields.get("journal"))
247
- if bib_type in {"inproceedings", "conference", "proceedings"} and fields.get("booktitle"):
248
- return str(fields.get("booktitle"))
249
- return str(paper.get("publication_venue") or "")
250
-
251
-
252
- def build_index(
253
- papers: list[dict[str, Any]],
254
- *,
255
- md_roots: list[Path] | None = None,
256
- md_translated_roots: list[Path] | None = None,
257
- pdf_roots: list[Path] | None = None,
258
- ) -> PaperIndex:
259
- id_by_hash: dict[str, int] = {}
260
- by_tag: dict[str, set[int]] = {}
261
- by_author: dict[str, set[int]] = {}
262
- by_year: dict[str, set[int]] = {}
263
- by_month: dict[str, set[int]] = {}
264
- by_venue: dict[str, set[int]] = {}
265
-
266
- md_path_by_hash: dict[str, Path] = {}
267
- translated_md_by_hash: dict[str, dict[str, Path]] = {}
268
- pdf_path_by_hash: dict[str, Path] = {}
269
-
270
- md_file_index = _build_file_index(md_roots or [], suffixes={".md"})
271
- translated_index = _build_translated_index(md_translated_roots or [])
272
- pdf_file_index = _build_file_index(pdf_roots or [], suffixes={".pdf"})
273
-
274
- year_counts: dict[str, int] = {}
275
- month_counts: dict[str, int] = {}
276
- tag_counts: dict[str, int] = {}
277
- keyword_counts: dict[str, int] = {}
278
- author_counts: dict[str, int] = {}
279
- venue_counts: dict[str, int] = {}
280
- template_tag_counts: dict[str, int] = {}
281
-
282
- def add_index(index: dict[str, set[int]], key: str, idx: int) -> None:
283
- index.setdefault(key, set()).add(idx)
284
-
285
- for idx, paper in enumerate(papers):
286
- is_pdf_only = bool(paper.get("_is_pdf_only"))
287
- source_hash = paper.get("source_hash")
288
- if not source_hash and paper.get("source_path"):
289
- source_hash = stable_hash(str(paper.get("source_path")))
290
- if source_hash:
291
- id_by_hash[str(source_hash)] = idx
292
-
293
- title = str(paper.get("paper_title") or "")
294
- paper["_title_lc"] = title.lower()
295
-
296
- bib_fields: dict[str, Any] = {}
297
- if isinstance(paper.get("bibtex"), dict):
298
- bib_fields = paper.get("bibtex", {}).get("fields", {}) or {}
299
-
300
- year = None
301
- if bib_fields.get("year") and str(bib_fields.get("year")).isdigit():
302
- year = str(bib_fields.get("year"))
303
- month = _normalize_month_token(bib_fields.get("month"))
304
- if not year or not month:
305
- parsed_year, parsed_month = _parse_year_month(str(paper.get("publication_date") or ""))
306
- year = year or parsed_year
307
- month = month or parsed_month
308
-
309
- year_label = year or "Unknown"
310
- month_label = month or "Unknown"
311
- paper["_year"] = year_label
312
- paper["_month"] = month_label
313
- add_index(by_year, _normalize_key(year_label), idx)
314
- add_index(by_month, _normalize_key(month_label), idx)
315
- if not is_pdf_only:
316
- year_counts[year_label] = year_counts.get(year_label, 0) + 1
317
- month_counts[month_label] = month_counts.get(month_label, 0) + 1
318
-
319
- venue = _extract_venue(paper).strip()
320
- paper["_venue"] = venue
321
- if venue:
322
- add_index(by_venue, _normalize_key(venue), idx)
323
- if not is_pdf_only:
324
- venue_counts[venue] = venue_counts.get(venue, 0) + 1
325
- else:
326
- add_index(by_venue, "unknown", idx)
327
- if not is_pdf_only:
328
- venue_counts["Unknown"] = venue_counts.get("Unknown", 0) + 1
329
-
330
- authors = _extract_authors(paper)
331
- paper["_authors"] = authors
332
- for author in authors:
333
- key = _normalize_key(author)
334
- add_index(by_author, key, idx)
335
- if not is_pdf_only:
336
- author_counts[author] = author_counts.get(author, 0) + 1
337
-
338
- tags = _extract_tags(paper)
339
- paper["_tags"] = tags
340
- for tag in tags:
341
- key = _normalize_key(tag)
342
- add_index(by_tag, key, idx)
343
- if not is_pdf_only:
344
- tag_counts[tag] = tag_counts.get(tag, 0) + 1
345
-
346
- keywords = _extract_keywords(paper)
347
- paper["_keywords"] = keywords
348
- for keyword in keywords:
349
- if not is_pdf_only:
350
- keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1
351
-
352
- template_tags = _available_templates(paper)
353
- if not template_tags:
354
- fallback_tag = paper.get("template_tag") or paper.get("prompt_template")
355
- if fallback_tag:
356
- template_tags = [str(fallback_tag)]
357
- paper["_template_tags"] = template_tags
358
- paper["_template_tags_lc"] = [tag.lower() for tag in template_tags]
359
- paper["_has_summary"] = _has_summary(paper, template_tags)
360
- if not is_pdf_only:
361
- for tag in template_tags:
362
- template_tag_counts[tag] = template_tag_counts.get(tag, 0) + 1
363
-
364
- search_parts = [title, venue, " ".join(authors), " ".join(tags)]
365
- paper["_search_lc"] = " ".join(part for part in search_parts if part).lower()
366
-
367
- source_hash_str = str(source_hash) if source_hash else str(idx)
368
- md_path = _resolve_source_md(paper, md_file_index)
369
- if md_path is not None:
370
- md_path_by_hash[source_hash_str] = md_path
371
- base_key = md_path.with_suffix("").name.lower()
372
- translations = translated_index.get(base_key, {})
373
- if translations:
374
- translated_md_by_hash[source_hash_str] = translations
375
- pdf_path = _resolve_pdf(paper, pdf_file_index)
376
- if pdf_path is not None:
377
- pdf_path_by_hash[source_hash_str] = pdf_path
378
-
379
- def year_sort_key(item: tuple[int, dict[str, Any]]) -> tuple[int, int, str]:
380
- idx, paper = item
381
- year_label = str(paper.get("_year") or "Unknown")
382
- title_label = str(paper.get("paper_title") or "")
383
- if year_label.isdigit():
384
- return (0, -int(year_label), title_label.lower())
385
- return (1, 0, title_label.lower())
386
-
387
- ordered_ids = [idx for idx, _ in sorted(enumerate(papers), key=year_sort_key)]
388
-
389
- stats_total = sum(1 for paper in papers if not paper.get("_is_pdf_only"))
390
- stats = {
391
- "total": stats_total,
392
- "years": _sorted_counts(year_counts, numeric_desc=True),
393
- "months": _sorted_month_counts(month_counts),
394
- "tags": _sorted_counts(tag_counts),
395
- "keywords": _sorted_counts(keyword_counts),
396
- "authors": _sorted_counts(author_counts),
397
- "venues": _sorted_counts(venue_counts),
398
- }
399
-
400
- template_tags = sorted(template_tag_counts.keys(), key=lambda item: item.lower())
401
-
402
- return PaperIndex(
403
- papers=papers,
404
- id_by_hash=id_by_hash,
405
- ordered_ids=ordered_ids,
406
- by_tag=by_tag,
407
- by_author=by_author,
408
- by_year=by_year,
409
- by_month=by_month,
410
- by_venue=by_venue,
411
- stats=stats,
412
- md_path_by_hash=md_path_by_hash,
413
- translated_md_by_hash=translated_md_by_hash,
414
- pdf_path_by_hash=pdf_path_by_hash,
415
- template_tags=template_tags,
416
- )
417
-
418
-
419
- def _sorted_counts(counts: dict[str, int], *, numeric_desc: bool = False) -> list[dict[str, Any]]:
420
- items = list(counts.items())
421
- if numeric_desc:
422
- def key(item: tuple[str, int]) -> tuple[int, int]:
423
- label, count = item
424
- if label.isdigit():
425
- return (0, -int(label))
426
- return (1, 0)
427
- items.sort(key=key)
428
- else:
429
- items.sort(key=lambda item: item[1], reverse=True)
430
- return [{"label": k, "count": v} for k, v in items]
431
-
432
-
433
- def _sorted_month_counts(counts: dict[str, int]) -> list[dict[str, Any]]:
434
- def month_sort(label: str) -> int:
435
- if label == "Unknown":
436
- return 99
437
- if label.isdigit():
438
- return int(label)
439
- return 98
440
-
441
- items = sorted(counts.items(), key=lambda item: month_sort(item[0]))
442
- return [{"label": k, "count": v} for k, v in items]
443
-
444
-
445
- _TEMPLATE_INFER_IGNORE_KEYS = {
446
- "source_path",
447
- "source_hash",
448
- "provider",
449
- "model",
450
- "extracted_at",
451
- "truncation",
452
- "output_language",
453
- "prompt_template",
454
- }
455
-
456
-
457
- def _load_paper_inputs(paths: list[Path]) -> list[dict[str, Any]]:
458
- inputs: list[dict[str, Any]] = []
459
- for path in paths:
460
- payload = json.loads(path.read_text(encoding="utf-8"))
461
- if isinstance(payload, list):
462
- raise ValueError(
463
- f"Input JSON must be an object with template_tag and papers (got array): {path}"
464
- )
465
- if not isinstance(payload, dict):
466
- raise ValueError(f"Input JSON must be an object: {path}")
467
- papers = payload.get("papers")
468
- if not isinstance(papers, list):
469
- raise ValueError(f"Input JSON missing papers list: {path}")
470
- template_tag = payload.get("template_tag")
471
- if not template_tag:
472
- template_tag = _infer_template_tag(papers, path)
473
- inputs.append({"template_tag": str(template_tag), "papers": papers})
474
- return inputs
475
-
476
-
477
- def _infer_template_tag(papers: list[dict[str, Any]], path: Path) -> str:
478
- prompt_tags = {
479
- str(paper.get("prompt_template"))
480
- for paper in papers
481
- if isinstance(paper, dict) and paper.get("prompt_template")
482
- }
483
- if len(prompt_tags) == 1:
484
- return prompt_tags.pop()
485
-
486
- sample = next((paper for paper in papers if isinstance(paper, dict)), None)
487
- if sample is None:
488
- raise ValueError(f"Input JSON has no paper objects to infer template_tag: {path}")
489
-
490
- paper_keys = {key for key in sample.keys() if key not in _TEMPLATE_INFER_IGNORE_KEYS}
491
- if not paper_keys:
492
- raise ValueError(f"Input JSON papers have no keys to infer template_tag: {path}")
493
-
494
- best_tag = None
495
- best_score = -1
496
- for name in list_template_names_in_registry_order():
497
- schema = load_schema_for_template(name)
498
- schema_keys = set((schema.get("properties") or {}).keys())
499
- score = len(paper_keys & schema_keys)
500
- if score > best_score:
501
- best_score = score
502
- best_tag = name
503
- elif score == best_score:
504
- if best_tag != "simple" and name == "simple":
505
- best_tag = name
506
-
507
- if not best_tag:
508
- raise ValueError(f"Unable to infer template_tag from input JSON: {path}")
509
- return best_tag
510
-
511
-
512
- def _build_cache_meta(
513
- db_paths: list[Path],
514
- bibtex_path: Path | None,
515
- pdf_roots_meta: list[dict[str, Any]] | None = None,
516
- ) -> dict[str, Any]:
517
- def file_meta(path: Path) -> dict[str, Any]:
518
- try:
519
- stats = path.stat()
520
- except OSError as exc:
521
- raise ValueError(f"Failed to read input metadata for cache: {path}") from exc
522
- return {"path": str(path), "mtime": stats.st_mtime, "size": stats.st_size}
523
-
524
- meta = {
525
- "version": 1,
526
- "inputs": [file_meta(path) for path in db_paths],
527
- "bibtex": file_meta(bibtex_path) if bibtex_path else None,
528
- }
529
- if pdf_roots_meta is not None:
530
- meta["pdf_roots"] = pdf_roots_meta
531
- return meta
532
-
533
-
534
- def _load_cached_papers(cache_dir: Path, meta: dict[str, Any]) -> list[dict[str, Any]] | None:
535
- meta_path = cache_dir / "db_serve_cache.meta.json"
536
- data_path = cache_dir / "db_serve_cache.papers.json"
537
- if not meta_path.exists() or not data_path.exists():
538
- return None
539
- try:
540
- cached_meta = json.loads(meta_path.read_text(encoding="utf-8"))
541
- if cached_meta != meta:
542
- return None
543
- cached_papers = json.loads(data_path.read_text(encoding="utf-8"))
544
- if not isinstance(cached_papers, list):
545
- return None
546
- return cached_papers
547
- except Exception:
548
- return None
549
-
550
-
551
- def _write_cached_papers(cache_dir: Path, meta: dict[str, Any], papers: list[dict[str, Any]]) -> None:
552
- meta_path = cache_dir / "db_serve_cache.meta.json"
553
- data_path = cache_dir / "db_serve_cache.papers.json"
554
- meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
555
- data_path.write_text(json.dumps(papers, ensure_ascii=False, indent=2), encoding="utf-8")
556
-
557
-
558
- def _extract_year_for_matching(paper: dict[str, Any]) -> str | None:
559
- if isinstance(paper.get("bibtex"), dict):
560
- fields = paper.get("bibtex", {}).get("fields", {}) or {}
561
- year = fields.get("year")
562
- if year and str(year).isdigit():
563
- return str(year)
564
- parsed_year, _ = _parse_year_month(str(paper.get("publication_date") or ""))
565
- return parsed_year
566
-
567
-
568
- def _prepare_paper_matching_fields(paper: dict[str, Any]) -> None:
569
- if "_authors" not in paper:
570
- paper["_authors"] = _extract_authors(paper)
571
- if "_year" not in paper:
572
- paper["_year"] = _extract_year_for_matching(paper) or ""
573
-
574
-
575
- def _build_pdf_only_entries(
576
- papers: list[dict[str, Any]],
577
- pdf_paths: list[Path],
578
- pdf_index: dict[str, list[Path]],
579
- ) -> list[dict[str, Any]]:
580
- matched: set[Path] = set()
581
- for paper in papers:
582
- _prepare_paper_matching_fields(paper)
583
- pdf_path = _resolve_pdf(paper, pdf_index)
584
- if pdf_path:
585
- matched.add(pdf_path.resolve())
586
-
587
- entries: list[dict[str, Any]] = []
588
- for path in pdf_paths:
589
- resolved = path.resolve()
590
- if resolved in matched:
591
- continue
592
- title = _read_pdf_metadata_title(resolved) or _extract_title_from_filename(resolved.name)
593
- if not title:
594
- title = resolved.stem
595
- year_hint, author_hint = _extract_year_author_from_filename(resolved.name)
596
- entry: dict[str, Any] = {
597
- "paper_title": title,
598
- "paper_authors": [author_hint] if author_hint else [],
599
- "publication_date": year_hint or "",
600
- "source_hash": stable_hash(str(resolved)),
601
- "source_path": str(resolved),
602
- "_is_pdf_only": True,
603
- }
604
- entries.append(entry)
605
- return entries
606
-
607
-
608
- def _load_or_merge_papers(
609
- db_paths: list[Path],
610
- bibtex_path: Path | None,
611
- cache_dir: Path | None,
612
- use_cache: bool,
613
- pdf_roots: list[Path] | None = None,
614
- ) -> list[dict[str, Any]]:
615
- cache_meta = None
616
- pdf_roots = pdf_roots or []
617
- pdf_paths: list[Path] = []
618
- pdf_roots_meta: list[dict[str, Any]] | None = None
619
- if pdf_roots:
620
- pdf_paths, pdf_roots_meta = _scan_pdf_roots(pdf_roots)
621
- if cache_dir and use_cache:
622
- cache_dir.mkdir(parents=True, exist_ok=True)
623
- cache_meta = _build_cache_meta(db_paths, bibtex_path, pdf_roots_meta)
624
- cached = _load_cached_papers(cache_dir, cache_meta)
625
- if cached is not None:
626
- return cached
627
-
628
- inputs = _load_paper_inputs(db_paths)
629
- if bibtex_path is not None:
630
- for bundle in inputs:
631
- enrich_with_bibtex(bundle["papers"], bibtex_path)
632
- papers = _merge_paper_inputs(inputs)
633
- if pdf_paths:
634
- pdf_index = _build_file_index_from_paths(pdf_paths, suffixes={".pdf"})
635
- papers.extend(_build_pdf_only_entries(papers, pdf_paths, pdf_index))
636
-
637
- if cache_dir and use_cache and cache_meta is not None:
638
- _write_cached_papers(cache_dir, cache_meta, papers)
639
- return papers
640
-
641
-
642
- def _md_renderer() -> MarkdownIt:
643
- md = MarkdownIt("commonmark", {"html": False, "linkify": True})
644
- md.use(footnote)
645
- md.enable("table")
646
- return md
647
-
648
-
649
- def _strip_paragraph_wrapped_tables(text: str) -> str:
650
- lines = text.splitlines()
651
- for idx, line in enumerate(lines):
652
- line = re.sub(r"^\s*<p>\s*\|", "|", line)
653
- line = re.sub(r"\|\s*</p>\s*$", "|", line)
654
- lines[idx] = line
655
- return "\n".join(lines)
656
-
657
-
658
- def _normalize_markdown_images(text: str) -> str:
659
- lines = text.splitlines()
660
- out: list[str] = []
661
- in_fence = False
662
- fence_char = ""
663
- fence_len = 0
664
- img_re = re.compile(r"!\[[^\]]*\]\((?:[^)\\]|\\.)*\)")
665
- list_re = re.compile(r"^\s{0,3}(-|\*|\+|\d{1,9}\.)\s+")
666
-
667
- for line in lines:
668
- stripped = line.lstrip()
669
- if stripped.startswith(("```", "~~~")):
670
- run_len = 0
671
- while run_len < len(stripped) and stripped[run_len] == stripped[0]:
672
- run_len += 1
673
- if not in_fence:
674
- in_fence = True
675
- fence_char = stripped[0]
676
- fence_len = run_len
677
- elif stripped[0] == fence_char and run_len >= fence_len:
678
- in_fence = False
679
- out.append(line)
680
- continue
681
- if in_fence:
682
- out.append(line)
683
- continue
684
- match = img_re.search(line)
685
- if not match:
686
- out.append(line)
687
- continue
688
- if list_re.match(line) or (line.lstrip().startswith("|") and line.count("|") >= 2):
689
- out.append(line)
690
- continue
691
- prefix = line[:match.start()]
692
- if prefix.strip():
693
- out.append(prefix.rstrip())
694
- out.append("")
695
- out.append(line[match.start():].lstrip())
696
- continue
697
- if out and out[-1].strip():
698
- out.append("")
699
- out.append(line)
700
- return "\n".join(out)
701
-
702
-
703
- def _normalize_merge_title(value: str | None) -> str | None:
704
- if not value:
705
- return None
706
- return str(value).replace("{", "").replace("}", "").strip().lower()
707
-
708
-
709
- def _extract_bibtex_title(paper: dict[str, Any]) -> str | None:
710
- if not isinstance(paper.get("bibtex"), dict):
711
- return None
712
- fields = paper.get("bibtex", {}).get("fields", {}) or {}
713
- return _normalize_merge_title(fields.get("title"))
714
-
715
-
716
- def _extract_paper_title(paper: dict[str, Any]) -> str | None:
717
- return _normalize_merge_title(paper.get("paper_title"))
718
-
719
-
720
- def _available_templates(paper: dict[str, Any]) -> list[str]:
721
- templates = paper.get("templates")
722
- if not isinstance(templates, dict):
723
- return []
724
- order = paper.get("template_order") or list(templates.keys())
725
- seen: set[str] = set()
726
- available: list[str] = []
727
- for tag in order:
728
- if tag in templates and tag not in seen:
729
- available.append(tag)
730
- seen.add(tag)
731
- for tag in templates:
732
- if tag not in seen:
733
- available.append(tag)
734
- seen.add(tag)
735
- return available
736
-
737
-
738
- def _select_template_tag(
739
- paper: dict[str, Any], requested: str | None
740
- ) -> tuple[str | None, list[str]]:
741
- available = _available_templates(paper)
742
- if not available:
743
- return None, []
744
- default_tag = paper.get("default_template")
745
- if not default_tag:
746
- default_tag = "simple" if "simple" in available else available[0]
747
- selected = requested if requested in available else default_tag
748
- return selected, available
749
-
750
-
751
- def _titles_match(group: dict[str, Any], paper: dict[str, Any], *, threshold: float) -> bool:
752
- bib_title = _extract_bibtex_title(paper)
753
- group_bib = group.get("_merge_bibtex_titles") or set()
754
- if bib_title and group_bib:
755
- return any(_title_similarity(bib_title, existing) >= threshold for existing in group_bib)
756
-
757
- paper_title = _extract_paper_title(paper)
758
- group_titles = group.get("_merge_paper_titles") or set()
759
- if paper_title and group_titles:
760
- return any(_title_similarity(paper_title, existing) >= threshold for existing in group_titles)
761
- return False
762
-
763
-
764
- def _add_merge_titles(group: dict[str, Any], paper: dict[str, Any]) -> None:
765
- bib_title = _extract_bibtex_title(paper)
766
- if bib_title:
767
- group.setdefault("_merge_bibtex_titles", set()).add(bib_title)
768
- paper_title = _extract_paper_title(paper)
769
- if paper_title:
770
- group.setdefault("_merge_paper_titles", set()).add(paper_title)
771
-
772
-
773
- def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
774
- merged: list[dict[str, Any]] = []
775
- threshold = 0.95
776
- prefix_len = 5
777
- bibtex_exact: dict[str, set[int]] = {}
778
- bibtex_prefix: dict[str, set[int]] = {}
779
- paper_exact: dict[str, set[int]] = {}
780
- paper_prefix: dict[str, set[int]] = {}
781
-
782
- def prefix_key(value: str) -> str:
783
- return value[:prefix_len] if len(value) >= prefix_len else value
784
-
785
- def add_index(
786
- value: str,
787
- exact_index: dict[str, set[int]],
788
- prefix_index: dict[str, set[int]],
789
- idx: int,
790
- ) -> None:
791
- exact_index.setdefault(value, set()).add(idx)
792
- prefix_index.setdefault(prefix_key(value), set()).add(idx)
793
-
794
- def candidate_ids(bib_title: str | None, paper_title: str | None) -> list[int]:
795
- ids: set[int] = set()
796
- if bib_title:
797
- ids |= bibtex_exact.get(bib_title, set())
798
- ids |= bibtex_prefix.get(prefix_key(bib_title), set())
799
- if paper_title:
800
- ids |= paper_exact.get(paper_title, set())
801
- ids |= paper_prefix.get(prefix_key(paper_title), set())
802
- return sorted(ids)
803
-
804
- for bundle in inputs:
805
- template_tag = bundle.get("template_tag")
806
- papers = bundle.get("papers") or []
807
- for paper in papers:
808
- if not isinstance(paper, dict):
809
- raise ValueError("Input papers must be objects")
810
- bib_title = _extract_bibtex_title(paper)
811
- paper_title = _extract_paper_title(paper)
812
- match = None
813
- match_idx = None
814
- for idx in candidate_ids(bib_title, paper_title):
815
- candidate = merged[idx]
816
- if _titles_match(candidate, paper, threshold=threshold):
817
- match = candidate
818
- match_idx = idx
819
- break
820
- if match is None:
821
- group = {
822
- "templates": {template_tag: paper},
823
- "template_order": [template_tag],
824
- }
825
- _add_merge_titles(group, paper)
826
- merged.append(group)
827
- group_idx = len(merged) - 1
828
- if bib_title:
829
- add_index(bib_title, bibtex_exact, bibtex_prefix, group_idx)
830
- if paper_title:
831
- add_index(paper_title, paper_exact, paper_prefix, group_idx)
832
- else:
833
- templates = match.setdefault("templates", {})
834
- templates[template_tag] = paper
835
- order = match.setdefault("template_order", [])
836
- if template_tag not in order:
837
- order.append(template_tag)
838
- _add_merge_titles(match, paper)
839
- if match_idx is not None:
840
- if bib_title:
841
- add_index(bib_title, bibtex_exact, bibtex_prefix, match_idx)
842
- if paper_title:
843
- add_index(paper_title, paper_exact, paper_prefix, match_idx)
844
-
845
- for group in merged:
846
- templates = group.get("templates") or {}
847
- order = group.get("template_order") or list(templates.keys())
848
- default_tag = "simple" if "simple" in order else (order[0] if order else None)
849
- group["default_template"] = default_tag
850
- if default_tag and default_tag in templates:
851
- base = templates[default_tag]
852
- for key, value in base.items():
853
- group[key] = value
854
- group.pop("_merge_bibtex_titles", None)
855
- group.pop("_merge_paper_titles", None)
856
- return merged
857
-
858
-
859
def _render_markdown_with_math_placeholders(md: MarkdownIt, text: str) -> str:
    """Render Markdown to HTML while shielding tables, images and math.

    HTML tables, ``<img>`` tags and ``$``-delimited math are swapped for
    ``@@...@@`` placeholder tokens before ``md.render`` runs, then restored
    in the rendered HTML:

    * math is restored HTML-escaped,
    * images are restored as the already-sanitized tag,
    * tables are restored after passing through ``_sanitize_table_html``.

    Args:
        md: Markdown renderer exposing ``render(str) -> str``.
        text: Markdown source.

    Returns:
        The rendered HTML string.
    """
    text = _strip_paragraph_wrapped_tables(text)
    rendered, table_placeholders = _extract_html_table_placeholders(text)
    rendered, img_placeholders = _extract_html_img_placeholders(rendered)
    rendered, placeholders = _extract_math_placeholders(rendered)
    html_out = md.render(rendered)
    for key, value in placeholders.items():
        html_out = html_out.replace(key, html.escape(value))
    for key, value in img_placeholders.items():
        # A callable replacement keeps backslashes in the value literal.
        html_out = re.sub(
            rf"<p>\s*{re.escape(key)}\s*</p>",
            lambda _match, repl=value: repl,
            html_out,
        )
        html_out = html_out.replace(key, value)
    for key, value in table_placeholders.items():
        safe_html = _sanitize_table_html(value)
        # Preferred case: the placeholder became its own paragraph, so the
        # wrapping <p> is dropped together with the token.
        html_out = re.sub(
            rf"<p>\s*{re.escape(key)}\s*</p>",
            lambda _match, repl=safe_html: repl,
            html_out,
        )
        # Fallback: the placeholder ended up somewhere else (for example
        # inside a raw HTML block such as <div>...</div>). Without this the
        # literal token would leak into the page, unlike the image case above.
        html_out = html_out.replace(key, safe_html)
    return html_out
874
-
875
-
876
def _extract_math_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Replace ``$...$`` and ``$$...$$`` spans with ``@@MATH_n@@`` tokens.

    The text is scanned once, character by character. Fenced code blocks
    (lines opening with three or more backticks or tildes, indented at most
    three spaces) and backtick-delimited inline code are copied through
    untouched, so dollar signs inside code are never treated as math.

    Returns:
        A tuple ``(rewritten_text, placeholders)`` where ``placeholders`` maps
        each token to the original math source including its delimiters.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    in_fence = False
    fence_char = ""
    fence_len = 0
    # Length of the backtick run that opened the current inline code span
    # (0 when not inside inline code).
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        # Tokens are numbered in order of appearance.
        key = f"@@MATH_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        # Fence detection only happens at the start of a line and never
        # while an inline code span is still open.
        if inline_delim_len == 0 and at_line_start:
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            # Closing fence: same character, at least as long
                            # as the opening run.
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        # The fence line itself is copied verbatim.
                        out.append(line)
                        idx = line_end
                        continue

        if in_fence:
            out.append(text[idx])
            idx += 1
            continue

        if inline_delim_len > 0:
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        ch = text[idx]
        if ch == "`":
            # Opening of an inline code span: remember the run length so the
            # matching closer can be recognised.
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        # Block math: $$...$$ (can span lines)
        if text.startswith("$$", idx) and (idx == 0 or text[idx - 1] != "\\"):
            search_from = idx + 2
            end = text.find("$$", search_from)
            # Skip closers that are preceded by a backslash.
            while end != -1 and text[end - 1] == "\\":
                search_from = end + 2
                end = text.find("$$", search_from)
            if end != -1:
                out.append(next_placeholder(text[idx : end + 2]))
                idx = end + 2
                continue

        # Inline math: $...$ (single-line)
        if ch == "$" and not text.startswith("$$", idx) and (idx == 0 or text[idx - 1] != "\\"):
            line_end = text.find("\n", idx + 1)
            if line_end == -1:
                line_end = len(text)
            search_from = idx + 1
            end = text.find("$", search_from, line_end)
            while end != -1 and text[end - 1] == "\\":
                search_from = end + 1
                end = text.find("$", search_from, line_end)
            if end != -1:
                out.append(next_placeholder(text[idx : end + 1]))
                idx = end + 1
                continue

        # Anything else (including an unmatched "$") is copied as-is.
        out.append(ch)
        idx += 1

    return "".join(out), placeholders
976
-
977
-
978
- class _TableSanitizer(HTMLParser):
979
- def __init__(self) -> None:
980
- super().__init__(convert_charrefs=True)
981
- self._out: list[str] = []
982
- self._stack: list[str] = []
983
-
984
- def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
985
- t = tag.lower()
986
- if t not in {
987
- "table",
988
- "thead",
989
- "tbody",
990
- "tfoot",
991
- "tr",
992
- "th",
993
- "td",
994
- "caption",
995
- "colgroup",
996
- "col",
997
- "br",
998
- }:
999
- return
1000
-
1001
- allowed: dict[str, str] = {}
1002
- for name, value in attrs:
1003
- if value is None:
1004
- continue
1005
- n = name.lower()
1006
- v = value.strip()
1007
- if t in {"td", "th"} and n in {"colspan", "rowspan"} and v.isdigit():
1008
- allowed[n] = v
1009
- elif t in {"td", "th"} and n == "align" and v.lower() in {"left", "right", "center"}:
1010
- allowed[n] = v.lower()
1011
-
1012
- attr_text = "".join(f' {k}="{html.escape(v, quote=True)}"' for k, v in allowed.items())
1013
- self._out.append(f"<{t}{attr_text}>")
1014
- if t not in {"br", "col"}:
1015
- self._stack.append(t)
1016
-
1017
- def handle_endtag(self, tag: str) -> None:
1018
- t = tag.lower()
1019
- if t not in self._stack:
1020
- return
1021
- while self._stack:
1022
- popped = self._stack.pop()
1023
- self._out.append(f"</{popped}>")
1024
- if popped == t:
1025
- break
1026
-
1027
- def handle_data(self, data: str) -> None:
1028
- self._out.append(html.escape(data))
1029
-
1030
- def handle_entityref(self, name: str) -> None:
1031
- self._out.append(f"&{name};")
1032
-
1033
- def handle_charref(self, name: str) -> None:
1034
- self._out.append(f"&#{name};")
1035
-
1036
- def close(self) -> None:
1037
- super().close()
1038
- while self._stack:
1039
- self._out.append(f"</{self._stack.pop()}>")
1040
-
1041
- def get_html(self) -> str:
1042
- return "".join(self._out)
1043
-
1044
-
1045
def _sanitize_table_html(raw: str) -> str:
    """Return ``raw`` reduced to whitelisted table markup.

    If parsing raises, the input is returned fully escaped inside
    ``<pre><code>`` so that nothing unsanitized reaches the page.
    """
    sanitizer = _TableSanitizer()
    try:
        sanitizer.feed(raw)
        sanitizer.close()
    except Exception:
        escaped = html.escape(raw)
        return f"<pre><code>{escaped}</code></pre>"
    else:
        return sanitizer.get_html()
1053
-
1054
-
1055
- def _sanitize_img_html(raw: str) -> str | None:
1056
- attrs = {}
1057
- for match in re.finditer(r"(\w+)\s*=\s*(\"[^\"]*\"|'[^']*'|[^\s>]+)", raw):
1058
- name = match.group(1).lower()
1059
- value = match.group(2).strip()
1060
- if value and value[0] in {"\"", "'"} and value[-1] == value[0]:
1061
- value = value[1:-1]
1062
- attrs[name] = value
1063
-
1064
- src = attrs.get("src", "")
1065
- src_lower = src.lower()
1066
- if not src_lower.startswith("data:image/") or ";base64," not in src_lower:
1067
- return None
1068
-
1069
- alt = attrs.get("alt", "")
1070
- alt_attr = f' alt="{html.escape(alt, quote=True)}"' if alt else ""
1071
- return f'<img src="{html.escape(src, quote=True)}"{alt_attr} />'
1072
-
1073
-
1074
def _extract_html_img_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Replace acceptable ``<img ...>`` tags with ``@@HTML_IMG_n@@`` tokens.

    Fenced code blocks and backtick inline code are copied through
    untouched. An ``<img`` tag is only replaced when ``_sanitize_img_html``
    returns a value for it; otherwise the tag text stays in the output.

    Returns:
        A tuple ``(rewritten_text, placeholders)`` where ``placeholders`` maps
        each token to the sanitized ``<img>`` markup.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    in_fence = False
    fence_char = ""
    fence_len = 0
    # Length of the backtick run that opened the current inline code span
    # (0 when not inside inline code).
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        key = f"@@HTML_IMG_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    # Lower-cased copy used for case-insensitive tag detection.
    # NOTE(review): indices into `lower` are reused on `text`; this assumes
    # lower-casing does not change the string length — confirm for
    # non-ASCII input.
    lower = text.lower()
    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        # Fence detection only happens at the start of a line and never
        # while an inline code span is still open.
        if inline_delim_len == 0 and at_line_start:
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        # The fence line itself is copied verbatim.
                        out.append(line)
                        idx = line_end
                        continue

        if in_fence:
            out.append(text[idx])
            idx += 1
            continue

        if inline_delim_len > 0:
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        if text[idx] == "`":
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        if lower.startswith("<img", idx):
            # The tag is taken to end at the next ">".
            end = text.find(">", idx)
            if end != -1:
                raw = text[idx : end + 1]
                safe_html = _sanitize_img_html(raw)
                if safe_html:
                    out.append(next_placeholder(safe_html))
                    idx = end + 1
                    continue

        out.append(text[idx])
        idx += 1

    return "".join(out), placeholders
1157
-
1158
-
1159
def _extract_html_table_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Replace ``<table>...</table>`` spans with ``@@HTML_TABLE_n@@`` tokens.

    Fenced code blocks and backtick inline code are copied through
    untouched. Each token is followed by a blank line, and preceded by one
    unless the previous output chunk already ends with a newline.

    Returns:
        A tuple ``(rewritten_text, placeholders)`` where ``placeholders`` maps
        each token to the raw (unsanitized) table markup.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    in_fence = False
    fence_char = ""
    fence_len = 0
    # Length of the backtick run that opened the current inline code span
    # (0 when not inside inline code).
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        key = f"@@HTML_TABLE_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    # Lower-cased copy used for case-insensitive tag detection.
    # NOTE(review): indices into `lower` are reused on `text`; this assumes
    # lower-casing does not change the string length — confirm for
    # non-ASCII input.
    lower = text.lower()
    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        # Fence detection only happens at the start of a line and never
        # while an inline code span is still open.
        if inline_delim_len == 0 and at_line_start:
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        # The fence line itself is copied verbatim.
                        out.append(line)
                        idx = line_end
                        continue

        if in_fence:
            out.append(text[idx])
            idx += 1
            continue

        if inline_delim_len > 0:
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        if text[idx] == "`":
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        if lower.startswith("<table", idx):
            # The span ends at the first closing tag found, so a nested
            # table would be cut at the inner "</table>".
            end = lower.find("</table>", idx)
            if end != -1:
                end += len("</table>")
                raw = text[idx:end]
                key = next_placeholder(raw)
                # Surround the token with blank lines so it sits apart from
                # the neighbouring text.
                if out and not out[-1].endswith("\n"):
                    out.append("\n\n")
                out.append(key)
                out.append("\n\n")
                idx = end
                continue

        out.append(text[idx])
        idx += 1

    return "".join(out), placeholders
1245
-
1246
-
1247
def _render_paper_markdown(
    paper: dict[str, Any],
    fallback_language: str,
    *,
    template_tag: str | None = None,
) -> tuple[str, str, str | None]:
    """Render a paper record to Markdown with the most specific template.

    The template tag chosen by ``_select_template_tag`` wins; otherwise the
    record's ``prompt_template`` is used. When no template is named, or
    loading it fails, the default template is used and a warning is returned.

    Returns:
        ``(markdown, template_name, warning)``; ``warning`` is ``None`` when
        the requested template was used.
    """
    chosen_tag, _ = _select_template_tag(paper, template_tag)
    if chosen_tag:
        source = (paper.get("templates") or {}).get(chosen_tag, paper)
    else:
        source = paper

    requested = chosen_tag or source.get("prompt_template")
    note: str | None = None
    if not requested:
        renderer = load_default_template()
        note = "Rendered using default template (no template specified)."
        label = "default_paper"
    else:
        label = requested
        try:
            renderer = load_render_template(str(requested))
        except Exception:
            renderer = load_default_template()
            note = "Rendered using default template (missing template)."
            label = "default_paper"

    variables = dict(source)
    if not variables.get("output_language"):
        variables["output_language"] = fallback_language
    return renderer.render(**variables), str(label), note
1276
-
1277
-
1278
# Number of leading characters of the space-free title key used to build
# "prefix:" index keys.
_TITLE_PREFIX_LEN = 16
# Containment-based title matching only applies when the shorter title has at
# least this many characters or at least _TITLE_MIN_TOKENS tokens;
# _TITLE_MIN_TOKENS is also the minimum token count for building a prefix key.
_TITLE_MIN_CHARS = 24
_TITLE_MIN_TOKENS = 4
# NOTE(review): not referenced in this part of the file; presumably a
# similarity floor for author/year based matching — confirm at its use site.
_AUTHOR_YEAR_MIN_SIMILARITY = 0.8
# Leading all-digit tokens of at most this many characters are stripped from
# title keys.
_LEADING_NUMERIC_MAX_LEN = 2
# Adaptive similarity search: starting threshold, amount it is lowered per
# step, and the maximum number of steps.
_SIMILARITY_START = 0.95
_SIMILARITY_STEP = 0.05
_SIMILARITY_MAX_STEPS = 10
1286
-
1287
-
1288
- def _normalize_title_key(title: str) -> str:
1289
- value = unicodedata.normalize("NFKD", title)
1290
- greek_map = {
1291
- "α": "alpha",
1292
- "β": "beta",
1293
- "γ": "gamma",
1294
- "δ": "delta",
1295
- "ε": "epsilon",
1296
- "ζ": "zeta",
1297
- "η": "eta",
1298
- "θ": "theta",
1299
- "ι": "iota",
1300
- "κ": "kappa",
1301
- "λ": "lambda",
1302
- "μ": "mu",
1303
- "ν": "nu",
1304
- "ξ": "xi",
1305
- "ο": "omicron",
1306
- "π": "pi",
1307
- "ρ": "rho",
1308
- "σ": "sigma",
1309
- "τ": "tau",
1310
- "υ": "upsilon",
1311
- "φ": "phi",
1312
- "χ": "chi",
1313
- "ψ": "psi",
1314
- "ω": "omega",
1315
- }
1316
- for char, name in greek_map.items():
1317
- value = value.replace(char, f" {name} ")
1318
- value = re.sub(
1319
- r"\\(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\b",
1320
- r" \1 ",
1321
- value,
1322
- flags=re.IGNORECASE,
1323
- )
1324
- value = value.replace("{", "").replace("}", "")
1325
- value = value.replace("_", " ")
1326
- value = re.sub(r"([a-z])([0-9])", r"\1 \2", value, flags=re.IGNORECASE)
1327
- value = re.sub(r"([0-9])([a-z])", r"\1 \2", value, flags=re.IGNORECASE)
1328
- value = re.sub(r"[^a-z0-9]+", " ", value.lower())
1329
- value = re.sub(r"\s+", " ", value).strip()
1330
- tokens = value.split()
1331
- if not tokens:
1332
- return ""
1333
- merged: list[str] = []
1334
- idx = 0
1335
- while idx < len(tokens):
1336
- token = tokens[idx]
1337
- if len(token) == 1 and idx + 1 < len(tokens):
1338
- merged.append(token + tokens[idx + 1])
1339
- idx += 2
1340
- continue
1341
- merged.append(token)
1342
- idx += 1
1343
- return " ".join(merged)
1344
-
1345
-
1346
- def _compact_title_key(title_key: str) -> str:
1347
- return title_key.replace(" ", "")
1348
-
1349
-
1350
def _strip_leading_numeric_tokens(title_key: str) -> str:
    """Drop short all-digit tokens from the front of a title key.

    A token is dropped while it is all digits and no longer than
    ``_LEADING_NUMERIC_MAX_LEN``. When nothing is dropped the input is
    returned unchanged; otherwise the remaining tokens are re-joined with
    single spaces.
    """
    words = title_key.split()
    skipped = 0
    for word in words:
        if not (word.isdigit() and len(word) <= _LEADING_NUMERIC_MAX_LEN):
            break
        skipped += 1
    return title_key if skipped == 0 else " ".join(words[skipped:])
1362
-
1363
-
1364
- def _strip_pdf_hash_suffix(name: str) -> str:
1365
- return re.sub(r"(?i)(\.pdf)(?:-[0-9a-f\-]{8,})$", r"\1", name)
1366
-
1367
-
1368
- def _extract_title_from_filename(name: str) -> str:
1369
- base = name
1370
- lower = base.lower()
1371
- if lower.endswith(".md"):
1372
- base = base[:-3]
1373
- lower = base.lower()
1374
- if ".pdf-" in lower:
1375
- base = _strip_pdf_hash_suffix(base)
1376
- lower = base.lower()
1377
- if lower.endswith(".pdf"):
1378
- base = base[:-4]
1379
- base = base.replace("_", " ").strip()
1380
- match = re.match(r"\s*\d{4}\s*-\s*(.+)$", base)
1381
- if match:
1382
- return match.group(1).strip()
1383
- match = re.match(r"\s*.+?\s*-\s*\d{4}\s*-\s*(.+)$", base)
1384
- if match:
1385
- return match.group(1).strip()
1386
- return base.strip()
1387
-
1388
-
1389
- def _clean_pdf_metadata_title(value: str | None, path: Path) -> str | None:
1390
- if not value:
1391
- return None
1392
- text = str(value).replace("\x00", "").strip()
1393
- if not text:
1394
- return None
1395
- text = re.sub(r"(?i)^microsoft\\s+word\\s*-\\s*", "", text)
1396
- text = re.sub(r"(?i)^pdf\\s*-\\s*", "", text)
1397
- text = re.sub(r"(?i)^untitled\\b", "", text).strip()
1398
- if text.lower().endswith(".pdf"):
1399
- text = text[:-4].strip()
1400
- if len(text) < 3:
1401
- return None
1402
- stem = path.stem.strip()
1403
- if stem and text.lower() == stem.lower():
1404
- return None
1405
- return text
1406
-
1407
-
1408
def _read_pdf_metadata_title(path: Path) -> str | None:
    """Read and clean the title stored in a PDF's metadata.

    Returns ``None`` when ``PYPDF_AVAILABLE`` is false, when reading the file
    raises, or when the cleaned title is unusable.
    """
    if not PYPDF_AVAILABLE:
        return None
    try:
        info = PdfReader(str(path)).metadata
        raw_title = info.title if info else None
    except Exception:
        # Best effort: an unreadable PDF simply yields no title.
        return None
    return _clean_pdf_metadata_title(raw_title, path)
1418
-
1419
-
1420
- def _is_pdf_like(path: Path) -> bool:
1421
- suffix = path.suffix.lower()
1422
- if suffix == ".pdf":
1423
- return True
1424
- name_lower = path.name.lower()
1425
- return ".pdf-" in name_lower and not name_lower.endswith(".md")
1426
-
1427
-
1428
def _scan_pdf_roots(roots: list[Path]) -> tuple[list[Path], list[dict[str, Any]]]:
    """Collect PDF-like files under each root plus per-root summary data.

    Roots that do not exist, are not directories, or cannot be inspected are
    skipped. A file reachable from several roots is reported once, under the
    first root that yields it.

    Returns:
        ``(paths, meta)`` where ``paths`` are resolved file paths and ``meta``
        holds one dict per scanned root with ``path``, ``count``,
        ``max_mtime`` and ``size`` (total bytes).
    """
    collected: list[Path] = []
    summaries: list[dict[str, Any]] = []
    visited: set[Path] = set()
    for root in roots:
        try:
            usable = root.exists() and root.is_dir()
        except OSError:
            usable = False
        if not usable:
            continue

        found: list[Path] = []
        for entry in root.rglob("*"):
            try:
                regular = entry.is_file()
            except OSError:
                regular = False
            if not regular or not _is_pdf_like(entry):
                continue
            real = entry.resolve()
            if real in visited:
                continue
            visited.add(real)
            found.append(real)

        newest = 0.0
        byte_total = 0
        for real in found:
            try:
                info = real.stat()
            except OSError:
                # Files that cannot be stat'ed still count, but add no
                # size or mtime.
                continue
            newest = max(newest, info.st_mtime)
            byte_total += info.st_size

        collected.extend(found)
        summaries.append(
            {
                "path": str(root),
                "count": len(found),
                "max_mtime": newest,
                "size": byte_total,
            }
        )
    return collected, summaries
1471
-
1472
-
1473
- def _extract_year_author_from_filename(name: str) -> tuple[str | None, str | None]:
1474
- base = name
1475
- lower = base.lower()
1476
- if lower.endswith(".md"):
1477
- base = base[:-3]
1478
- lower = base.lower()
1479
- if ".pdf-" in lower:
1480
- base = _strip_pdf_hash_suffix(base)
1481
- lower = base.lower()
1482
- if lower.endswith(".pdf"):
1483
- base = base[:-4]
1484
- match = re.match(r"\s*(.+?)\s*-\s*((?:19|20)\d{2})\s*-\s*", base)
1485
- if match:
1486
- return match.group(2), match.group(1).strip()
1487
- match = re.match(r"\s*((?:19|20)\d{2})\s*-\s*", base)
1488
- if match:
1489
- return match.group(1), None
1490
- return None, None
1491
-
1492
-
1493
- def _normalize_author_key(name: str) -> str:
1494
- raw = name.lower().strip()
1495
- raw = raw.replace("et al.", "").replace("et al", "")
1496
- if "," in raw:
1497
- raw = raw.split(",", 1)[0]
1498
- raw = re.sub(r"[^a-z0-9]+", " ", raw)
1499
- raw = re.sub(r"\s+", " ", raw).strip()
1500
- if not raw:
1501
- return ""
1502
- parts = raw.split()
1503
- return parts[-1] if parts else raw
1504
-
1505
-
1506
def _title_prefix_key(title_key: str) -> str | None:
    """Build the ``prefix:<chars>`` index key for a title key.

    Returns ``None`` when the key has fewer than ``_TITLE_MIN_TOKENS`` tokens
    or its space-free form is shorter than ``_TITLE_PREFIX_LEN``.
    """
    enough_tokens = len(title_key.split()) >= _TITLE_MIN_TOKENS
    if not enough_tokens:
        return None
    squeezed = _compact_title_key(title_key)
    head = squeezed[:_TITLE_PREFIX_LEN] if len(squeezed) >= _TITLE_PREFIX_LEN else ""
    return f"prefix:{head}" if head else None
1516
-
1517
-
1518
def _title_overlap_match(a: str, b: str) -> bool:
    """Tell whether two title keys are equal or one contains the other.

    Containment only counts when the shorter key has at least
    ``_TITLE_MIN_CHARS`` characters or ``_TITLE_MIN_TOKENS`` tokens. Empty
    keys never match.
    """
    if not (a and b):
        return False
    if a == b:
        return True
    if len(a) <= len(b):
        small, large = a, b
    else:
        small, large = b, a
    substantial = len(small) >= _TITLE_MIN_CHARS or len(small.split()) >= _TITLE_MIN_TOKENS
    # A prefix is a special case of containment, so one check covers both.
    return substantial and small in large
1529
-
1530
-
1531
def _adaptive_similarity_match(title_key: str, candidates: list[Path]) -> Path | None:
    """Pick the single candidate file whose title best matches ``title_key``.

    Each candidate's title is derived from its file name. A candidate that
    passes ``_title_overlap_match`` is returned immediately. Otherwise every
    candidate is scored with ``_title_similarity`` and the acceptance
    threshold is lowered step by step, starting at ``_SIMILARITY_START``,
    until exactly one candidate qualifies.

    Returns:
        The matching path, or ``None`` when ``title_key`` is empty, no
        candidate has a usable title, or no threshold isolates exactly one
        candidate.
    """
    if not title_key:
        return None
    scored: list[tuple[Path, float]] = []
    for path in candidates:
        candidate_title = _normalize_title_key(_extract_title_from_filename(path.name))
        if not candidate_title:
            continue
        if _title_overlap_match(title_key, candidate_title):
            return path
        scored.append((path, _title_similarity(title_key, candidate_title)))
    if not scored:
        return None

    def matches_at(threshold: float) -> list[Path]:
        # Candidates whose score reaches the given threshold.
        return [path for path, score in scored if score >= threshold]

    threshold = _SIMILARITY_START
    step = _SIMILARITY_STEP
    prev_threshold = None
    prev_count = None
    for _ in range(_SIMILARITY_MAX_STEPS):
        matches = matches_at(threshold)
        if len(matches) == 1:
            return matches[0]
        if len(matches) == 0:
            # Too strict: remember this threshold and relax.
            prev_threshold = threshold
            prev_count = 0
            threshold -= step
            continue
        # More than one match from here on.
        if prev_count == 0 and prev_threshold is not None:
            # The previous step had no match and this one has several, so
            # bisect between the two thresholds looking for a value that
            # admits exactly one candidate.
            low = threshold
            high = prev_threshold
            for _ in range(_SIMILARITY_MAX_STEPS):
                mid = (low + high) / 2
                mid_matches = matches_at(mid)
                if len(mid_matches) == 1:
                    return mid_matches[0]
                if len(mid_matches) == 0:
                    high = mid
                else:
                    low = mid
            return None
        prev_threshold = threshold
        prev_count = len(matches)
        threshold -= step
    return None
1578
-
1579
-
1580
def _resolve_by_title_and_meta(
    paper: dict[str, Any],
    file_index: dict[str, list[Path]],
) -> Path | None:
    """Find the indexed file that belongs to ``paper`` using title and metadata.

    Lookups are tried from most to least specific; the first hit wins:

    1. exact normalized title key,
    2. ``compact:`` key (title without spaces),
    3. the same two lookups after stripping leading numeric tokens,
    4. ``prefix:`` bucket resolved via ``_adaptive_similarity_match``,
    5. ``authoryear:`` / ``year:`` buckets (needs a numeric ``_year``),
       also resolved via ``_adaptive_similarity_match``.

    Args:
        paper: Paper record; reads ``paper_title``, ``_year`` and ``_authors``.
        file_index: Mapping from index key to candidate paths.

    Returns:
        The resolved path, or ``None`` when nothing matches.
    """
    title = str(paper.get("paper_title") or "")
    title_key = _normalize_title_key(title)
    if not title_key:
        title_key = ""
    candidates = file_index.get(title_key, [])
    if candidates:
        return candidates[0]
    if title_key:
        compact_key = _compact_title_key(title_key)
        compact_candidates = file_index.get(f"compact:{compact_key}", [])
        if compact_candidates:
            return compact_candidates[0]
        stripped_key = _strip_leading_numeric_tokens(title_key)
        if stripped_key and stripped_key != title_key:
            stripped_candidates = file_index.get(stripped_key, [])
            if stripped_candidates:
                return stripped_candidates[0]
            stripped_compact = _compact_title_key(stripped_key)
            stripped_candidates = file_index.get(f"compact:{stripped_compact}", [])
            if stripped_candidates:
                return stripped_candidates[0]
    # Prefix bucket: first with the full key, then with leading numbers removed.
    prefix_candidates: list[Path] = []
    prefix_key = _title_prefix_key(title_key)
    if prefix_key:
        prefix_candidates = file_index.get(prefix_key, [])
    if not prefix_candidates:
        stripped_key = _strip_leading_numeric_tokens(title_key)
        if stripped_key and stripped_key != title_key:
            prefix_key = _title_prefix_key(stripped_key)
            if prefix_key:
                prefix_candidates = file_index.get(prefix_key, [])
    if prefix_candidates:
        match = _adaptive_similarity_match(title_key, prefix_candidates)
        if match is not None:
            return match
    # Metadata fallback: requires a purely numeric year.
    year = str(paper.get("_year") or "").strip()
    if not year.isdigit():
        return None
    author_key = ""
    authors = paper.get("_authors") or []
    if authors:
        # Only the first listed author is used.
        author_key = _normalize_author_key(str(authors[0]))
    candidates = []
    if author_key:
        candidates = file_index.get(f"authoryear:{year}:{author_key}", [])
    if not candidates:
        candidates = file_index.get(f"year:{year}", [])
    if not candidates:
        return None
    # With no title to compare, a lone candidate is accepted as-is.
    if len(candidates) == 1 and not title_key:
        return candidates[0]
    match = _adaptive_similarity_match(title_key, candidates)
    if match is not None:
        return match
    return None
1639
-
1640
-
1641
def _build_file_index(roots: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
    """Index files under ``roots`` by name, title-derived keys and year/author.

    A file is indexed when its lowercase suffix is in ``suffixes``; when
    ``suffixes`` is exactly ``{".pdf"}``, names containing ``.pdf-`` (other
    than ``.md`` files) are accepted too. Each file is registered under its
    lowercase name, its normalized title, ``compact:`` and ``prefix:``
    variants (also with leading numeric tokens stripped), and ``year:`` /
    ``authoryear:`` keys taken from the file name.

    Returns:
        Mapping from index key to resolved paths, in discovery order.
    """
    index: dict[str, list[Path]] = {}
    for root in roots:
        try:
            usable = root.exists() and root.is_dir()
        except OSError:
            usable = False
        if not usable:
            continue

        for path in root.rglob("*"):
            try:
                regular = path.is_file()
            except OSError:
                regular = False
            if not regular:
                continue

            suffix = path.suffix.lower()
            if suffix not in suffixes:
                hashed_pdf = (
                    suffixes == {".pdf"}
                    and ".pdf-" in path.name.lower()
                    and suffix != ".md"
                )
                if not hashed_pdf:
                    continue

            resolved = path.resolve()
            name_key = path.name.lower()
            # Keys are gathered in registration order and applied at the end.
            keys: list[str] = [name_key]

            title_key = _normalize_title_key(_extract_title_from_filename(path.name))
            if title_key:
                if title_key != name_key:
                    keys.append(title_key)
                compact_key = _compact_title_key(title_key)
                if compact_key:
                    keys.append(f"compact:{compact_key}")
                prefix_key = _title_prefix_key(title_key)
                if prefix_key:
                    keys.append(prefix_key)
                stripped_key = _strip_leading_numeric_tokens(title_key)
                if stripped_key and stripped_key != title_key:
                    keys.append(stripped_key)
                    stripped_compact = _compact_title_key(stripped_key)
                    if stripped_compact:
                        keys.append(f"compact:{stripped_compact}")
                    stripped_prefix = _title_prefix_key(stripped_key)
                    if stripped_prefix:
                        keys.append(stripped_prefix)

            year_hint, author_hint = _extract_year_author_from_filename(path.name)
            if year_hint:
                keys.append(f"year:{year_hint}")
                if author_hint:
                    author_key = _normalize_author_key(author_hint)
                    if author_key:
                        keys.append(f"authoryear:{year_hint}:{author_key}")

            for key in keys:
                index.setdefault(key, []).append(resolved)
    return index
1693
-
1694
-
1695
def _build_file_index_from_paths(paths: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
    """Index an explicit list of files by name and title-derived keys.

    A file is indexed when its lowercase suffix is in ``suffixes``; when
    ``suffixes`` is exactly ``{".pdf"}``, names containing ``.pdf-`` (other
    than ``.md`` files) are accepted too. Each file is registered under its
    lowercase name, its normalized title, and ``compact:`` / ``prefix:``
    variants (also with leading numeric tokens stripped). No year or author
    keys are produced here.

    Returns:
        Mapping from index key to resolved paths, in input order.
    """
    index: dict[str, list[Path]] = {}
    for path in paths:
        try:
            regular = path.is_file()
        except OSError:
            regular = False
        if not regular:
            continue

        suffix = path.suffix.lower()
        if suffix not in suffixes:
            hashed_pdf = (
                suffixes == {".pdf"}
                and ".pdf-" in path.name.lower()
                and suffix != ".md"
            )
            if not hashed_pdf:
                continue

        resolved = path.resolve()
        name_key = path.name.lower()
        # Keys are gathered in registration order and applied at the end.
        keys: list[str] = [name_key]

        title_key = _normalize_title_key(_extract_title_from_filename(path.name))
        if title_key:
            if title_key != name_key:
                keys.append(title_key)
            compact_key = _compact_title_key(title_key)
            if compact_key:
                keys.append(f"compact:{compact_key}")
            prefix_key = _title_prefix_key(title_key)
            if prefix_key:
                keys.append(prefix_key)
            stripped_key = _strip_leading_numeric_tokens(title_key)
            if stripped_key and stripped_key != title_key:
                keys.append(stripped_key)
                stripped_compact = _compact_title_key(stripped_key)
                if stripped_compact:
                    keys.append(f"compact:{stripped_compact}")
                stripped_prefix = _title_prefix_key(stripped_key)
                if stripped_prefix:
                    keys.append(stripped_prefix)

        for key in keys:
            index.setdefault(key, []).append(resolved)
    return index
1734
-
1735
-
1736
- def _resolve_source_md(paper: dict[str, Any], md_index: dict[str, list[Path]]) -> Path | None:
1737
- source_path = paper.get("source_path")
1738
- if not source_path:
1739
- source_path = ""
1740
- if source_path:
1741
- name = Path(str(source_path)).name.lower()
1742
- candidates = md_index.get(name, [])
1743
- if candidates:
1744
- return candidates[0]
1745
- return _resolve_by_title_and_meta(paper, md_index)
1746
-
1747
-
1748
- def _build_translated_index(roots: list[Path]) -> dict[str, dict[str, Path]]:
1749
- index: dict[str, dict[str, Path]] = {}
1750
- candidates: list[Path] = []
1751
- for root in roots:
1752
- try:
1753
- if not root.exists() or not root.is_dir():
1754
- continue
1755
- except OSError:
1756
- continue
1757
- try:
1758
- candidates.extend(root.rglob("*.md"))
1759
- except OSError:
1760
- continue
1761
- for path in sorted(candidates, key=lambda item: str(item)):
1762
- try:
1763
- if not path.is_file():
1764
- continue
1765
- except OSError:
1766
- continue
1767
- name = path.name
1768
- match = re.match(r"^(.+)\.([^.]+)\.md$", name, flags=re.IGNORECASE)
1769
- if not match:
1770
- continue
1771
- base_name = match.group(1).strip()
1772
- lang = match.group(2).strip()
1773
- if not base_name or not lang:
1774
- continue
1775
- base_key = base_name.lower()
1776
- lang_key = lang.lower()
1777
- index.setdefault(base_key, {}).setdefault(lang_key, path.resolve())
1778
- return index
1779
-
1780
-
1781
- def _guess_pdf_names(paper: dict[str, Any]) -> list[str]:
1782
- source_path = paper.get("source_path")
1783
- if not source_path:
1784
- return []
1785
- name = Path(str(source_path)).name
1786
- match = re.match(r"(?i)(.+\\.pdf)(?:-[0-9a-f\\-]{8,})?\\.md$", name)
1787
- if match:
1788
- return [Path(match.group(1)).name]
1789
- if ".pdf-" in name.lower():
1790
- base = name[: name.lower().rfind(".pdf-") + 4]
1791
- return [Path(base).name]
1792
- if name.lower().endswith(".pdf"):
1793
- return [name]
1794
- if name.lower().endswith(".pdf.md"):
1795
- return [name[:-3]]
1796
- return []
1797
-
1798
-
1799
def _resolve_pdf(paper: dict[str, Any], pdf_index: dict[str, list[Path]]) -> Path | None:
    """Locate the PDF for ``paper``.

    File names guessed from ``source_path`` are looked up (lower-cased)
    first; if none is indexed, title/metadata based resolution is used.
    """
    for guess in _guess_pdf_names(paper):
        hits = pdf_index.get(guess.lower(), [])
        if hits:
            return hits[0]
    return _resolve_by_title_and_meta(paper, pdf_index)
1805
-
1806
-
1807
- def _ensure_under_roots(path: Path, roots: list[Path]) -> bool:
1808
- resolved = path.resolve()
1809
- for root in roots:
1810
- try:
1811
- resolved.relative_to(root.resolve())
1812
- return True
1813
- except Exception:
1814
- continue
1815
- return False
1816
-
1817
-
1818
# Lower-case filter values that _normalize_presence_value maps to "with" ...
_BOOL_TRUE = {"1", "true", "yes", "with", "has"}
# ... and to "without".
_BOOL_FALSE = {"0", "false", "no", "without"}
1820
-
1821
-
1822
- def _tokenize_filter_query(text: str) -> list[str]:
1823
- out: list[str] = []
1824
- buf: list[str] = []
1825
- in_quote = False
1826
-
1827
- for ch in text:
1828
- if ch == '"':
1829
- in_quote = not in_quote
1830
- continue
1831
- if not in_quote and ch.isspace():
1832
- token = "".join(buf).strip()
1833
- if token:
1834
- out.append(token)
1835
- buf = []
1836
- continue
1837
- buf.append(ch)
1838
-
1839
- token = "".join(buf).strip()
1840
- if token:
1841
- out.append(token)
1842
- return out
1843
-
1844
-
1845
def _normalize_presence_value(value: str) -> str | None:
    """Map a truthy/falsy filter word to ``"with"`` / ``"without"``.

    Matching is case-insensitive and ignores surrounding whitespace.
    Unrecognised words yield ``None``.
    """
    lowered = value.strip().lower()
    for label, vocabulary in (("with", _BOOL_TRUE), ("without", _BOOL_FALSE)):
        if lowered in vocabulary:
            return label
    return None
1852
-
1853
-
1854
def _parse_filter_query(text: str) -> dict[str, set[str]]:
    """Parse the free-text filter box into per-facet value sets.

    Recognised ``key:value`` tokens (values may be comma separated):

    * ``tmpl:`` / ``template:`` -- lower-cased template tags.
    * ``pdf:`` / ``source:`` / ``summary:`` / ``translated:`` -- presence
      values normalised through ``_normalize_presence_value``.
    * ``has:<facet>`` / ``no:<facet>`` -- shorthand for ``<facet>:with`` and
      ``<facet>:without``.

    Tokens without a colon, with an empty value, or with an unknown key are
    ignored.  The result always contains the keys ``pdf``, ``source``,
    ``summary``, ``translated`` and ``template``.
    """
    presence_keys = ("pdf", "source", "summary", "translated")
    parsed: dict[str, set[str]] = {name: set() for name in (*presence_keys, "template")}

    for token in _tokenize_filter_query(text):
        key, separator, raw_value = token.partition(":")
        if not separator:
            continue
        key = key.strip().lower()
        raw_value = raw_value.strip()
        if not raw_value:
            continue
        pieces = [piece.strip() for piece in raw_value.split(",")]

        if key in ("tmpl", "template"):
            parsed["template"].update(piece.lower() for piece in pieces if piece)
        elif key in presence_keys:
            for piece in pieces:
                state = _normalize_presence_value(piece)
                if state:
                    parsed[key].add(state)
        elif key in ("has", "no"):
            state = "with" if key == "has" else "without"
            for piece in pieces:
                facet = piece.lower()
                if facet in presence_keys:
                    parsed[facet].add(state)
    return parsed
1889
-
1890
-
1891
def _presence_filter(values: list[str]) -> set[str] | None:
    """Collapse raw presence values into an effective filter set.

    Values are normalised with ``_normalize_presence_value``; unrecognised
    ones are dropped.  ``None`` is returned when nothing usable remains or
    when both ``"with"`` and ``"without"`` were requested, since that
    combination does not restrict anything.
    """
    states = {state for state in map(_normalize_presence_value, values) if state}
    if states in (set(), {"with", "without"}):
        return None
    return states
1900
-
1901
-
1902
- def _merge_filter_set(primary: set[str] | None, secondary: set[str] | None) -> set[str] | None:
1903
- if not primary:
1904
- return secondary
1905
- if not secondary:
1906
- return primary
1907
- return primary & secondary
1908
-
1909
-
1910
- def _matches_presence(allowed: set[str] | None, has_value: bool) -> bool:
1911
- if not allowed:
1912
- return True
1913
- if has_value and "with" in allowed:
1914
- return True
1915
- if not has_value and "without" in allowed:
1916
- return True
1917
- return False
1918
-
1919
-
1920
def _template_tag_map(index: PaperIndex) -> dict[str, str]:
    """Map each lower-cased template tag of ``index`` to its display form."""
    display_by_lc: dict[str, str] = {}
    for display in index.template_tags:
        display_by_lc[display.lower()] = display
    return display_by_lc
1922
-
1923
-
1924
def _compute_counts(index: PaperIndex, ids: set[int]) -> dict[str, Any]:
    """Tally availability statistics for the papers selected by ``ids``.

    Papers flagged ``_is_pdf_only`` are excluded from every counter.  For the
    remaining papers the function counts how many have a source markdown, a
    PDF, a summary and a translation, plus a per-template-tag count.

    Returns a dict with the keys ``total``, ``pdf``, ``source``, ``summary``,
    ``translated``, ``templates`` (display tag -> count) and
    ``template_order`` (the tags in ``index.template_tags`` order).
    """
    template_order = list(index.template_tags)
    per_template = dict.fromkeys(template_order, 0)
    display_by_lc = _template_tag_map(index)
    tallies = {"total": 0, "pdf": 0, "source": 0, "summary": 0, "translated": 0}

    for paper_id in ids:
        paper = index.papers[paper_id]
        if paper.get("_is_pdf_only"):
            continue
        tallies["total"] += 1

        # Same key derivation as the original: stored hash, else a stable
        # hash of the source path (or of the numeric id as last resort).
        fallback_seed = str(paper.get("source_path") or paper_id)
        source_hash = str(paper.get("source_hash") or stable_hash(fallback_seed))

        availability = {
            "source": source_hash in index.md_path_by_hash,
            "pdf": source_hash in index.pdf_path_by_hash,
            "summary": bool(paper.get("_has_summary")),
            "translated": bool(index.translated_md_by_hash.get(source_hash)),
        }
        for facet, present in availability.items():
            if present:
                tallies[facet] += 1

        for tag_lc in paper.get("_template_tags_lc") or []:
            display = display_by_lc.get(tag_lc)
            if display:
                per_template[display] = per_template.get(display, 0) + 1

    return {
        **tallies,
        "templates": per_template,
        "template_order": template_order,
    }
1966
-
1967
-
1968
def _apply_query(index: PaperIndex, query: Query) -> set[int]:
    """Evaluate a parsed search ``query`` against ``index``.

    ``query.groups`` is a list of term groups.  Within a group every
    non-negated term narrows the candidate set and every negated term removes
    its matches; the results of all groups are unioned.

    Supported ``term.field`` values: ``None`` (free text over ``_search_lc``),
    ``title``, ``venue``, ``tag``, ``author``, ``month`` and ``year``.  For
    ``tag``/``author``/``month``/``year`` an exact hit in the corresponding
    ``index.by_*`` mapping is preferred over a scan.  ``year`` also accepts a
    ``start..end`` range (inclusive, order-insensitive).  Unknown fields
    match nothing.

    Returns the set of matching paper ids.
    """
    all_ids = set(index.ordered_ids)

    def parse_year(text: str) -> int | None:
        # isdecimal() (unlike isdigit()) only accepts characters int() can
        # parse, so inputs such as "²" no longer raise ValueError.
        text = text.strip()
        if not text.isdecimal():
            return None
        try:
            return int(text)
        except ValueError:  # e.g. absurdly long digit strings
            return None

    def ids_in_year_range(first: int, last: int) -> set[int]:
        # Walk the indexed years instead of every integer in the requested
        # range: the range comes from user input and may be astronomically
        # large (year:0..99999999999), while the index is small.
        # assumes index.by_year is a mapping of year string -> set of ids
        matched: set[int] = set()
        for key, members in index.by_year.items():
            try:
                year = int(key)
            except (TypeError, ValueError):
                continue
            # str(year) == key keeps the original exact-key semantics
            # (the old loop looked up str(y), i.e. canonical decimals only).
            if first <= year <= last and str(year) == key:
                matched |= members
        return matched

    def ids_for_term(term: QueryTerm, base: set[int]) -> set[int]:
        value_lc = term.value.lower()
        if term.field is None:
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_search_lc") or "")}
        if term.field == "title":
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_title_lc") or "")}
        if term.field == "venue":
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_venue") or "").lower()}
        if term.field == "tag":
            exact = index.by_tag.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if any(value_lc in t.lower() for t in (index.papers[idx].get("_tags") or []))}
        if term.field == "author":
            exact = index.by_author.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if any(value_lc in a.lower() for a in (index.papers[idx].get("_authors") or []))}
        if term.field == "month":
            exact = index.by_month.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if value_lc == str(index.papers[idx].get("_month") or "").lower()}
        if term.field == "year":
            if ".." in term.value:
                start_str, end_str = term.value.split("..", 1)
                start = parse_year(start_str)
                end = parse_year(end_str)
                if start is not None and end is not None:
                    return ids_in_year_range(min(start, end), max(start, end)) & base
            # Not a valid range: fall back to exact key, then substring match.
            exact = index.by_year.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_year") or "").lower()}
        return set()

    result: set[int] = set()
    for group in query.groups:
        group_ids = set(all_ids)
        for term in group:
            # Negated terms are matched against the full id set and then
            # subtracted; positive terms only need to look at survivors.
            matched = ids_for_term(term, group_ids if not term.negated else all_ids)
            if term.negated:
                group_ids -= matched
            else:
                group_ids &= matched
        result |= group_ids

    return result
2022
-
2023
-
2024
def _page_shell(
    title: str,
    body_html: str,
    extra_head: str = "",
    extra_scripts: str = "",
    header_title: str | None = None,
) -> str:
    """Wrap ``body_html`` in the complete HTML document used by the web UI.

    Args:
        title: Text for the ``<title>`` element; HTML-escaped here.
        body_html: Markup placed inside ``<div class="container">``.  It is
            interpolated as-is, so callers must pass trusted/escaped HTML.
        extra_head: Raw markup appended to ``<head>`` after the base styles.
        extra_scripts: Raw markup appended just before ``</body>``.
        header_title: When given (and non-empty), the default two-link
            navigation header is replaced by the "detail" header that shows
            a back link, this title (HTML-escaped) and a Stats link.

    Returns:
        The full HTML page as a string.
    """
    # Default header: plain navigation links.
    header_html = """
    <header>
      <a href="/">Papers</a>
      <a href="/stats">Stats</a>
    </header>
    """
    if header_title:
        # Escaped once and reused for both the title attribute and the text.
        safe_title = html.escape(header_title)
        header_html = f"""
    <header class="detail-header">
      <div class="header-row">
        <a class="header-back" href="/">← Papers</a>
        <span class="header-title" title="{safe_title}">{safe_title}</span>
        <a class="header-link" href="/stats">Stats</a>
      </div>
    </header>
    """
    # NOTE: this is an f-string, so literal CSS braces are doubled ({{ }}).
    return f"""<!doctype html>
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>{html.escape(title)}</title>
    <style>
      body {{ font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Arial; margin: 0; }}
      header {{ position: sticky; top: 0; background: #0b1220; color: #fff; padding: 12px 16px; z-index: 10; }}
      header a {{ color: #cfe3ff; text-decoration: none; margin-right: 12px; }}
      .detail-header .header-row {{ display: grid; grid-template-columns: auto minmax(0, 1fr) auto; align-items: center; gap: 12px; }}
      .detail-header .header-title {{ text-align: center; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; }}
      .detail-header .header-back {{ margin-right: 0; }}
      .detail-header .header-link {{ margin-right: 0; }}
      .container {{ max-width: 1100px; margin: 0 auto; padding: 16px; }}
      .filters {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(160px, 1fr)); gap: 8px; margin: 12px 0 16px; }}
      .filters input {{ width: 100%; padding: 8px; border: 1px solid #d0d7de; border-radius: 6px; }}
      .filters select {{ width: 100%; border: 1px solid #d0d7de; border-radius: 6px; background: #fff; font-size: 13px; }}
      .filters select:not([multiple]) {{ padding: 6px 8px; }}
      .filters select[multiple] {{ padding: 2px; line-height: 1.25; min-height: 72px; font-size: 13px; }}
      .filters select[multiple] option {{ padding: 2px 6px; line-height: 1.25; }}
      .filters label {{ font-size: 12px; color: #57606a; }}
      .filter-group {{ display: flex; flex-direction: column; gap: 4px; }}
      .card {{ border: 1px solid #d0d7de; border-radius: 10px; padding: 12px; margin: 10px 0; }}
      .muted {{ color: #57606a; font-size: 13px; }}
      .pill {{ display: inline-block; padding: 2px 8px; border-radius: 999px; border: 1px solid #d0d7de; margin-right: 6px; font-size: 12px; }}
      .pill.template {{ border-color: #8a92a5; color: #243b53; background: #f6f8fa; }}
      .pill.pdf-only {{ border-color: #c8a951; background: #fff8dc; color: #5b4a00; }}
      .warning {{ background: #fff4ce; border: 1px solid #ffd089; padding: 10px; border-radius: 10px; margin: 12px 0; }}
      .tabs {{ display: flex; gap: 8px; flex-wrap: wrap; }}
      .tab {{ display: inline-block; padding: 6px 12px; border-radius: 999px; border: 1px solid #d0d7de; background: #f6f8fa; color: #0969da; text-decoration: none; font-size: 13px; }}
      .tab:hover {{ background: #eef1f4; }}
      .tab.active {{ background: #0969da; border-color: #0969da; color: #fff; }}
      .detail-shell {{ display: flex; flex-direction: column; gap: 12px; min-height: calc(100vh - 120px); }}
      .detail-toolbar {{ display: flex; flex-wrap: wrap; align-items: center; justify-content: flex-start; gap: 12px; padding: 6px 8px 10px; border-bottom: 1px solid #e5e7eb; box-sizing: border-box; }}
      .detail-toolbar .tabs {{ margin: 0; }}
      .toolbar-actions {{ display: flex; flex-wrap: wrap; align-items: center; gap: 10px; margin-left: auto; padding-right: 16px; }}
      .search-row {{ display: flex; flex-wrap: wrap; gap: 8px; margin-top: 8px; align-items: stretch; }}
      .search-row input {{ flex: 1 1 320px; min-width: 0; padding: 10px; border: 1px solid #d0d7de; border-radius: 8px; }}
      .search-row select {{ flex: 0 1 220px; min-width: 0; max-width: 100%; padding: 10px; border: 1px solid #d0d7de; border-radius: 8px; background: #fff; }}
      .filter-row {{ display: flex; flex-wrap: wrap; gap: 8px; align-items: center; margin-top: 8px; }}
      .filter-row input {{ flex: 1 1 320px; min-width: 0; padding: 10px; border: 1px solid #d0d7de; border-radius: 8px; }}
      .filter-row .help-icon {{ flex: 0 0 auto; }}
      .adv-actions {{ display: flex; gap: 8px; align-items: center; margin-top: 8px; flex-wrap: wrap; }}
      .split-inline {{ display: flex; flex-wrap: wrap; align-items: center; gap: 6px; }}
      .split-inline select {{ padding: 6px 8px; border-radius: 8px; border: 1px solid #d0d7de; background: #fff; min-width: 140px; }}
      .split-actions {{ display: flex; align-items: center; justify-content: center; gap: 8px; }}
      .split-actions button {{ padding: 6px 10px; border-radius: 999px; border: 1px solid #d0d7de; background: #f6f8fa; cursor: pointer; min-width: 36px; }}
      .lang-select {{ display: flex; align-items: center; gap: 6px; }}
      .lang-select label {{ color: #57606a; font-size: 13px; }}
      .lang-select select {{ padding: 6px 8px; border-radius: 8px; border: 1px solid #d0d7de; background: #fff; min-width: 120px; }}
      .fullscreen-actions {{ display: flex; align-items: center; gap: 6px; }}
      .fullscreen-actions button {{ padding: 6px 10px; border-radius: 8px; border: 1px solid #d0d7de; background: #f6f8fa; cursor: pointer; }}
      .fullscreen-exit {{ display: none; }}
      body.detail-fullscreen {{ overflow: hidden; --outline-top: 16px; }}
      body.detail-fullscreen header {{ display: none; }}
      body.detail-fullscreen .container {{ max-width: 100%; padding: 0; }}
      body.detail-fullscreen .detail-shell {{
        position: fixed;
        inset: 0;
        padding: 12px 16px;
        background: #fff;
        z-index: 40;
        overflow: auto;
      }}
      body.detail-fullscreen .detail-toolbar {{ position: sticky; top: 0; background: #fff; z-index: 41; }}
      body.detail-fullscreen .fullscreen-enter {{ display: none; }}
      body.detail-fullscreen .fullscreen-exit {{ display: inline-flex; }}
      .detail-body {{ display: flex; flex-direction: column; gap: 8px; flex: 1; min-height: 0; }}
      .help-icon {{ display: inline-flex; align-items: center; justify-content: center; width: 18px; height: 18px; border-radius: 50%; border: 1px solid #d0d7de; color: #57606a; font-size: 12px; cursor: default; position: relative; }}
      .help-icon::after {{ content: attr(data-tip); display: none; position: absolute; top: 24px; right: 0; background: #0b1220; color: #e6edf3; padding: 8px 10px; border-radius: 8px; font-size: 12px; white-space: pre-line; width: 260px; z-index: 20; }}
      .help-icon:hover::after {{ display: block; }}
      .stats {{ margin: 12px 0 6px; }}
      .stats-row {{ display: flex; flex-wrap: wrap; gap: 6px; align-items: center; }}
      .stats-label {{ font-weight: 600; color: #0b1220; margin-right: 4px; }}
      .pill.stat {{ background: #f6f8fa; border-color: #c7d2e0; color: #1f2a37; }}
      .footnotes {{ border-top: 1px solid #e5e7eb; margin-top: 16px; padding-top: 12px; color: #57606a; }}
      .footnotes ol {{ padding-left: 20px; }}
      .footnotes li {{ margin-bottom: 6px; }}
      .footnote-ref {{ font-size: 0.85em; }}
      .footnote-tip {{ position: relative; display: inline-block; }}
      .footnote-tip::after {{
        content: attr(data-footnote);
        position: absolute;
        left: 50%;
        bottom: 130%;
        transform: translateX(-50%);
        width: min(320px, 70vw);
        padding: 8px 10px;
        border-radius: 8px;
        background: #0b1220;
        color: #e6edf3;
        font-size: 12px;
        line-height: 1.35;
        white-space: pre-line;
        box-shadow: 0 10px 24px rgba(0, 0, 0, 0.18);
        opacity: 0;
        pointer-events: none;
        z-index: 30;
        transition: opacity 0.12s ease-in-out;
      }}
      .footnote-tip:hover::after,
      .footnote-tip:focus::after {{
        opacity: 1;
      }}
      pre {{ overflow: auto; padding: 10px; background: #0b1220; color: #e6edf3; border-radius: 10px; }}
      code {{ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace; }}
      a {{ color: #0969da; }}
      @media (max-width: 640px) {{
        .search-row {{
          flex-direction: column;
        }}
        .search-row input,
        .search-row select {{
          width: 100%;
        }}
        .filter-row {{
          flex-direction: column;
          align-items: stretch;
        }}
        .filter-row .help-icon {{
          align-self: flex-end;
        }}
        .adv-actions {{
          flex-direction: column;
          align-items: stretch;
        }}
        .detail-toolbar {{
          flex-wrap: nowrap;
          overflow-x: auto;
          padding-bottom: 8px;
        }}
        .detail-toolbar::-webkit-scrollbar {{ height: 6px; }}
        .detail-toolbar::-webkit-scrollbar-thumb {{ background: #c7d2e0; border-radius: 999px; }}
        .detail-toolbar .tabs,
        .toolbar-actions {{
          flex: 0 0 auto;
        }}
      }}
    </style>
    {extra_head}
  </head>
  <body>
    {header_html}
    <div class="container">
      {body_html}
    </div>
    {extra_scripts}
  </body>
</html>"""
2198
-
2199
-
2200
def _embed_shell(title: str, body_html: str, extra_head: str = "", extra_scripts: str = "") -> str:
    """Build a minimal standalone HTML page around ``body_html``.

    Unlike ``_page_shell`` this variant emits no site header and no
    ``.container`` wrapper, only a small base stylesheet.

    Args:
        title: Text for the ``<title>`` element; HTML-escaped here.
        body_html: Markup placed directly inside ``<body>`` (not escaped).
        extra_head: Raw markup appended to ``<head>`` after the base styles.
        extra_scripts: Raw markup appended just before ``</body>``.

    Returns:
        The full HTML page as a string.
    """
    # NOTE: this is an f-string, so literal CSS braces are doubled ({{ }}).
    return f"""<!doctype html>
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>{html.escape(title)}</title>
    <style>
      body {{ font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Arial; margin: 0; padding: 16px; }}
      h1, h2, h3, h4 {{ margin-top: 1.2em; }}
      .muted {{ color: #57606a; font-size: 13px; }}
      .warning {{ background: #fff4ce; border: 1px solid #ffd089; padding: 10px; border-radius: 10px; margin: 12px 0; }}
      pre {{ overflow: auto; padding: 10px; background: #0b1220; color: #e6edf3; border-radius: 10px; }}
      code {{ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace; }}
      a {{ color: #0969da; }}
    </style>
    {extra_head}
  </head>
  <body>
    {body_html}
    {extra_scripts}
  </body>
</html>"""
2223
-
2224
-
2225
def _build_pdfjs_viewer_url(pdf_url: str) -> str:
    """Return the PDF.js viewer URL that opens ``pdf_url``.

    The PDF URL is fully percent-encoded (no character is treated as safe)
    and passed to the viewer at ``_PDFJS_VIEWER_PATH`` via its ``file`` query
    parameter.
    """
    return "{}?file={}".format(_PDFJS_VIEWER_PATH, quote(pdf_url, safe=""))
2228
-
2229
-
2230
def _outline_assets(outline_top: str) -> tuple[str, str, str]:
    """Return the markup, styles and script for the floating outline widget.

    Args:
        outline_top: CSS length interpolated (unescaped) into the
            ``--outline-top`` custom property, which positions the toggle
            button and the outline panel.

    Returns:
        A ``(outline_html, outline_css, outline_js)`` tuple:

        * ``outline_html`` -- toggle button, collapsed outline panel and a
          back-to-top button.
        * ``outline_css`` -- a complete ``<style>`` element.
        * ``outline_js`` -- bare JavaScript (no ``<script>`` wrapper) that
          builds the outline from the ``h1``-``h4`` headings found inside the
          element with id ``content`` and wires up both buttons.
    """
    outline_html = """
    <button id="outlineToggle" class="outline-toggle" title="Toggle outline">☰</button>
    <div id="outlinePanel" class="outline-panel collapsed">
      <div class="outline-title">Outline</div>
      <div id="outlineList" class="outline-list"></div>
    </div>
    <button id="backToTop" class="back-to-top" title="Back to top">↑</button>
    """
    # f-string: only {outline_top} is substituted, CSS braces are doubled.
    outline_css = f"""
    <style>
      :root {{
        --outline-top: {outline_top};
      }}
      .outline-toggle {{
        position: fixed;
        top: var(--outline-top);
        left: 16px;
        z-index: 20;
        padding: 6px 10px;
        border-radius: 8px;
        border: 1px solid #d0d7de;
        background: #f6f8fa;
        cursor: pointer;
      }}
      .outline-panel {{
        position: fixed;
        top: calc(var(--outline-top) + 42px);
        left: 16px;
        width: 240px;
        max-height: 60vh;
        overflow: auto;
        border: 1px solid #d0d7de;
        border-radius: 10px;
        background: #ffffff;
        padding: 10px;
        z-index: 20;
        box-shadow: 0 6px 18px rgba(0, 0, 0, 0.08);
      }}
      .outline-panel.collapsed {{
        display: none;
      }}
      .outline-title {{
        font-size: 12px;
        text-transform: uppercase;
        letter-spacing: 0.08em;
        color: #57606a;
        margin-bottom: 8px;
      }}
      .outline-list a {{
        display: block;
        color: #0969da;
        text-decoration: none;
        padding: 4px 0;
      }}
      .outline-list a:hover {{
        text-decoration: underline;
      }}
      .back-to-top {{
        position: fixed;
        left: 16px;
        bottom: 16px;
        padding: 6px 10px;
        border-radius: 999px;
        border: 1px solid #d0d7de;
        background: #ffffff;
        cursor: pointer;
        opacity: 0;
        pointer-events: none;
        transition: opacity 0.2s ease;
        z-index: 20;
      }}
      .back-to-top.visible {{
        opacity: 1;
        pointer-events: auto;
      }}
      @media (max-width: 900px) {{
        .outline-panel {{
          width: 200px;
        }}
      }}
    </style>
    """
    # Plain (non-f) string: JS braces and ${...} stay literal; the doubled
    # backslashes yield single backslashes in the emitted regexes.
    outline_js = """
    const outlineToggle = document.getElementById('outlineToggle');
    const outlinePanel = document.getElementById('outlinePanel');
    const outlineList = document.getElementById('outlineList');
    const backToTop = document.getElementById('backToTop');

    function slugify(text) {
      return text.toLowerCase().trim()
        .replace(/[^a-z0-9\\s-]/g, '')
        .replace(/\\s+/g, '-')
        .replace(/-+/g, '-');
    }

    function buildOutline() {
      if (!outlineList) return;
      const content = document.getElementById('content');
      if (!content) return;
      const headings = content.querySelectorAll('h1, h2, h3, h4');
      if (!headings.length) {
        outlineList.innerHTML = '<div class="muted">No headings</div>';
        return;
      }
      const used = new Set();
      outlineList.innerHTML = '';
      headings.forEach((heading) => {
        let id = heading.id;
        if (!id) {
          const base = slugify(heading.textContent || 'section') || 'section';
          id = base;
          let i = 1;
          while (used.has(id) || document.getElementById(id)) {
            id = `${base}-${i++}`;
          }
          heading.id = id;
        }
        used.add(id);
        const level = parseInt(heading.tagName.slice(1), 10) || 1;
        const link = document.createElement('a');
        link.href = `#${id}`;
        link.textContent = heading.textContent || '';
        link.style.paddingLeft = `${(level - 1) * 12}px`;
        outlineList.appendChild(link);
      });
    }

    function toggleBackToTop() {
      if (!backToTop) return;
      if (window.scrollY > 300) {
        backToTop.classList.add('visible');
      } else {
        backToTop.classList.remove('visible');
      }
    }

    if (outlineToggle && outlinePanel) {
      outlineToggle.addEventListener('click', () => {
        outlinePanel.classList.toggle('collapsed');
      });
    }

    if (backToTop) {
      backToTop.addEventListener('click', () => {
        window.scrollTo({ top: 0, behavior: 'smooth' });
      });
    }

    buildOutline();
    window.addEventListener('scroll', toggleBackToTop);
    toggleBackToTop();
    """
    return outline_html, outline_css, outline_js
2384
-
2385
-
2386
async def _index_page(request: Request) -> HTMLResponse:
    """Render the paper list page (search box, filters and infinite scroll).

    The page itself is static apart from two server-side substitutions: the
    ``<option>`` list for the template filter (taken from
    ``request.app.state.index.template_tags``) and the help tooltip for the
    filter box.  Results are fetched client-side from ``/api/papers``.

    Args:
        request: Incoming request; ``request.app.state.index`` must hold the
            loaded ``PaperIndex``.

    Returns:
        The rendered page wrapped by ``_page_shell``.
    """
    index: PaperIndex = request.app.state.index
    template_options = "".join(
        f'<option value="{html.escape(tag)}">{html.escape(tag)}</option>'
        for tag in index.template_tags
    )
    if not template_options:
        template_options = '<option value="" disabled>(no templates)</option>'
    # Real newlines here: they are converted to "&#10;" below so the tooltip
    # (rendered via CSS attr(data-tip) with white-space: pre-line) shows one
    # hint per line.  The previous "\\n" literals were never matched by the
    # replace() and appeared verbatim as backslash-n in the tooltip.
    filter_help = (
        "Filters syntax:\n"
        "pdf:yes|no source:yes|no translated:yes|no summary:yes|no\n"
        "tmpl:<tag> or template:<tag>\n"
        "has:pdf / no:source aliases\n"
        "Content tags still use the search box (tag:fpga)."
    )
    filter_help_attr = html.escape(filter_help).replace("\n", "&#10;")
    # Plain (non-f) string: the two __PLACEHOLDER__ markers are substituted
    # below, everything else (including JS ${...}) is emitted literally.
    body_html = """
    <h2>Paper Database</h2>
    <div class="card">
      <div class="muted">Search (Scholar-style): <code>tag:fpga year:2023..2025 -survey</code> · Use quotes for phrases and <code>OR</code> for alternatives.</div>
      <div class="search-row">
        <input id="query" placeholder='Search... e.g. title:"nearest neighbor" tag:fpga year:2023..2025' />
        <select id="openView">
          <option value="summary" selected>Open: Summary</option>
          <option value="source">Open: Source</option>
          <option value="translated">Open: Translated</option>
          <option value="pdf">Open: PDF</option>
          <option value="pdfjs">Open: PDF Viewer</option>
          <option value="split">Open: Split</option>
        </select>
      </div>
      <div class="filters" style="margin-top:10px;">
        <div class="filter-group">
          <label>PDF</label>
          <select id="filterPdf" multiple size="2">
            <option value="with">With</option>
            <option value="without">Without</option>
          </select>
        </div>
        <div class="filter-group">
          <label>Source</label>
          <select id="filterSource" multiple size="2">
            <option value="with">With</option>
            <option value="without">Without</option>
          </select>
        </div>
        <div class="filter-group">
          <label>Translated</label>
          <select id="filterTranslated" multiple size="2">
            <option value="with">With</option>
            <option value="without">Without</option>
          </select>
        </div>
        <div class="filter-group">
          <label>Summary</label>
          <select id="filterSummary" multiple size="2">
            <option value="with">With</option>
            <option value="without">Without</option>
          </select>
        </div>
        <div class="filter-group">
          <label>Template</label>
          <select id="filterTemplate" multiple size="4">
            __TEMPLATE_OPTIONS__
          </select>
        </div>
      </div>
      <div class="filter-row">
        <input id="filterQuery" placeholder='Filters... e.g. pdf:yes tmpl:simple' />
        <span class="help-icon" data-tip="__FILTER_HELP__">?</span>
      </div>
      <details style="margin-top:10px;">
        <summary>Advanced search</summary>
        <div style="margin-top:10px;" class="muted">Build a query:</div>
        <div class="filters">
          <input id="advTitle" placeholder="title contains..." />
          <input id="advAuthor" placeholder="author contains..." />
          <input id="advTag" placeholder="tag (comma separated)" />
          <input id="advYear" placeholder="year (e.g. 2020..2024)" />
          <input id="advMonth" placeholder="month (01-12)" />
          <input id="advVenue" placeholder="venue contains..." />
        </div>
        <div class="adv-actions">
          <button id="buildQuery" style="padding:8px 12px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">Build</button>
          <div class="muted">Generated: <code id="generated"></code></div>
        </div>
      </details>
    </div>
    <div id="stats" class="stats">
      <div id="statsTotal" class="stats-row"></div>
      <div id="statsFiltered" class="stats-row" style="margin-top:6px;"></div>
    </div>
    <div id="results"></div>
    <div id="loading" class="muted">Loading...</div>
    <script>
    let page = 1;
    let loading = false;
    let done = false;

    function currentParams(nextPage) {
      const params = new URLSearchParams();
      params.set("page", String(nextPage));
      params.set("page_size", "30");
      const q = document.getElementById("query").value.trim();
      if (q) params.set("q", q);
      const fq = document.getElementById("filterQuery").value.trim();
      if (fq) params.set("fq", fq);
      function addMulti(id, key) {
        const el = document.getElementById(id);
        const values = Array.from(el.selectedOptions).map(opt => opt.value).filter(Boolean);
        for (const value of values) {
          params.append(key, value);
        }
      }
      addMulti("filterPdf", "pdf");
      addMulti("filterSource", "source");
      addMulti("filterTranslated", "translated");
      addMulti("filterSummary", "summary");
      addMulti("filterTemplate", "template");
      return params;
    }

    function escapeHtml(text) {
      const div = document.createElement("div");
      div.textContent = text;
      return div.innerHTML;
    }

    function viewSuffixForItem(item) {
      let view = document.getElementById("openView").value;
      const isPdfOnly = item.is_pdf_only;
      const pdfFallback = item.has_pdf ? "pdfjs" : "pdf";
      if (isPdfOnly && (view === "summary" || view === "source" || view === "translated")) {
        view = pdfFallback;
      }
      if (!view || view === "summary") return "";
      const params = new URLSearchParams();
      params.set("view", view);
      if (view === "split") {
        if (isPdfOnly) {
          params.set("left", pdfFallback);
          params.set("right", pdfFallback);
        } else {
          params.set("left", "summary");
          if (item.has_pdf) {
            params.set("right", "pdfjs");
          } else if (item.has_source) {
            params.set("right", "source");
          } else {
            params.set("right", "summary");
          }
        }
      }
      return `?${params.toString()}`;
    }

    function renderItem(item) {
      const tags = (item.tags || []).map(t => `<span class="pill">${escapeHtml(t)}</span>`).join("");
      const templateTags = (item.template_tags || []).map(t => `<span class="pill template">tmpl:${escapeHtml(t)}</span>`).join("");
      const authors = (item.authors || []).slice(0, 6).map(a => escapeHtml(a)).join(", ");
      const meta = `${escapeHtml(item.year || "")}-${escapeHtml(item.month || "")} · ${escapeHtml(item.venue || "")}`;
      const viewSuffix = viewSuffixForItem(item);
      const badges = [
        item.has_source ? `<span class="pill">source</span>` : "",
        item.has_translation ? `<span class="pill">translated</span>` : "",
        item.has_pdf ? `<span class="pill">pdf</span>` : "",
        item.is_pdf_only ? `<span class="pill pdf-only">pdf-only</span>` : "",
      ].join("");
      return `
        <div class="card">
          <div><a href="/paper/${encodeURIComponent(item.source_hash)}${viewSuffix}">${escapeHtml(item.title || "")}</a></div>
          <div class="muted">${authors}</div>
          <div class="muted">${meta}</div>
          <div style="margin-top:6px">${badges} ${templateTags} ${tags}</div>
        </div>
      `;
    }

    function renderStatsRow(targetId, label, counts) {
      const row = document.getElementById(targetId);
      if (!row || !counts) return;
      const pills = [];
      pills.push(`<span class="stats-label">${escapeHtml(label)}</span>`);
      pills.push(`<span class="pill stat">Count ${counts.total}</span>`);
      pills.push(`<span class="pill stat">PDF ${counts.pdf}</span>`);
      pills.push(`<span class="pill stat">Source ${counts.source}</span>`);
      pills.push(`<span class="pill stat">Translated ${counts.translated || 0}</span>`);
      pills.push(`<span class="pill stat">Summary ${counts.summary}</span>`);
      const order = counts.template_order || Object.keys(counts.templates || {});
      for (const tag of order) {
        const count = (counts.templates && counts.templates[tag]) || 0;
        pills.push(`<span class="pill stat">tmpl:${escapeHtml(tag)} ${count}</span>`);
      }
      row.innerHTML = pills.join("");
    }

    function updateStats(stats) {
      if (!stats) return;
      renderStatsRow("statsTotal", "Total", stats.all);
      renderStatsRow("statsFiltered", "Filtered", stats.filtered);
    }

    async function loadMore() {
      if (loading || done) return;
      loading = true;
      const status = document.getElementById("loading");
      status.textContent = "Loading...";
      try {
        const res = await fetch(`/api/papers?${currentParams(page).toString()}`);
        if (!res.ok) throw new Error(`HTTP ${res.status}`);
        const data = await res.json();
        if (data.stats) {
          updateStats(data.stats);
        }
        const results = document.getElementById("results");
        for (const item of data.items) {
          results.insertAdjacentHTML("beforeend", renderItem(item));
        }
        if (!data.has_more) {
          done = true;
          status.textContent = "End.";
        } else {
          page += 1;
          status.textContent = "Scroll to load more...";
        }
      } catch (err) {
        status.textContent = "Failed to load. Scroll to retry.";
      } finally {
        // Always release the guard, otherwise one failed request would
        // block every later page load.
        loading = false;
      }
    }

    function resetAndLoad() {
      page = 1;
      done = false;
      document.getElementById("results").innerHTML = "";
      loadMore();
    }

    document.getElementById("query").addEventListener("change", resetAndLoad);
    document.getElementById("openView").addEventListener("change", resetAndLoad);
    document.getElementById("filterQuery").addEventListener("change", resetAndLoad);
    document.getElementById("filterPdf").addEventListener("change", resetAndLoad);
    document.getElementById("filterSource").addEventListener("change", resetAndLoad);
    document.getElementById("filterTranslated").addEventListener("change", resetAndLoad);
    document.getElementById("filterSummary").addEventListener("change", resetAndLoad);
    document.getElementById("filterTemplate").addEventListener("change", resetAndLoad);

    document.getElementById("buildQuery").addEventListener("click", () => {
      function add(field, value) {
        value = value.trim();
        if (!value) return "";
        if (value.includes(" ")) return `${field}:"${value}"`;
        return `${field}:${value}`;
      }
      const parts = [];
      const t = document.getElementById("advTitle").value.trim();
      const a = document.getElementById("advAuthor").value.trim();
      const tag = document.getElementById("advTag").value.trim();
      const y = document.getElementById("advYear").value.trim();
      const m = document.getElementById("advMonth").value.trim();
      const v = document.getElementById("advVenue").value.trim();
      if (t) parts.push(add("title", t));
      if (a) parts.push(add("author", a));
      if (tag) {
        for (const item of tag.split(",")) {
          const val = item.trim();
          if (val) parts.push(add("tag", val));
        }
      }
      if (y) parts.push(add("year", y));
      if (m) parts.push(add("month", m));
      if (v) parts.push(add("venue", v));
      const q = parts.join(" ");
      document.getElementById("generated").textContent = q;
      document.getElementById("query").value = q;
      resetAndLoad();
    });

    window.addEventListener("scroll", () => {
      if ((window.innerHeight + window.scrollY) >= (document.body.offsetHeight - 600)) {
        loadMore();
      }
    });

    loadMore();
    </script>
    """
    body_html = body_html.replace("__TEMPLATE_OPTIONS__", template_options)
    body_html = body_html.replace("__FILTER_HELP__", filter_help_attr)
    return HTMLResponse(_page_shell("Paper DB", body_html))
2670
-
2671
-
2672
def _parse_filters(request: Request) -> dict[str, list[str] | str | int]:
    """Extract paging, search and facet filter values from the query string.

    Reads ``page``, ``page_size``, ``q``, ``fq`` and the repeatable ``pdf``,
    ``source``, ``summary``, ``translated`` and ``template`` parameters from
    ``request.query_params``.

    ``page`` is clamped to at least 1 and ``page_size`` to the range 1..200.
    A missing or non-numeric value for either falls back to its default
    (1 and 30) instead of raising ``ValueError``, so a malformed URL cannot
    crash the handler.

    Returns:
        A dict with the keys ``page``, ``page_size``, ``q``, ``filter_query``,
        ``pdf``, ``source``, ``summary``, ``translated`` and ``template``.
        The two text values are stripped of surrounding whitespace; the list
        values have empty entries removed.
    """
    qp = request.query_params

    def _int_param(name: str, default: int) -> int:
        # Query values are user-controlled text; treat anything that is not
        # an integer the same way as an absent parameter.
        raw = qp.get(name)
        if raw is None:
            return default
        try:
            return int(raw)
        except (TypeError, ValueError):
            return default

    def _non_empty(name: str) -> list[str]:
        # Repeated parameters such as ``?pdf=with&pdf=`` drop the blank entry.
        return [item for item in qp.getlist(name) if item]

    page = max(1, _int_param("page", 1))
    page_size = min(max(1, _int_param("page_size", 30)), 200)

    return {
        "page": page,
        "page_size": page_size,
        "q": qp.get("q", "").strip(),
        "filter_query": qp.get("fq", "").strip(),
        "pdf": _non_empty("pdf"),
        "source": _non_empty("source"),
        "summary": _non_empty("summary"),
        "translated": _non_empty("translated"),
        "template": _non_empty("template"),
    }
2698
-
2699
-
2700
async def _api_papers(request: Request) -> JSONResponse:
    """Return one page of papers matching the search query and filters as JSON.

    The free-text query (``q``) selects the candidate papers. The candidates
    are then narrowed by the presence filters (pdf / source / summary /
    translated) and the template filter, each built from both the dedicated
    query parameters and the ``fq`` filter query. Results follow
    ``index.ordered_ids``. Corpus-wide and filtered counts are attached only
    when the first page is requested; on later pages ``stats`` is ``None``.
    """
    paper_index: PaperIndex = request.app.state.index
    params = _parse_filters(request)
    page_no = int(params["page"])
    per_page = int(params["page_size"])

    matched = _apply_query(paper_index, parse_query(str(params["q"])))
    fq_terms = _parse_filter_query(str(params["filter_query"]))

    # One merged presence filter per facet, combining the dedicated query
    # parameter with the matching term from the ``fq`` expression.
    presence = {
        key: _merge_filter_set(
            _presence_filter(params[key]),
            _presence_filter(list(fq_terms[key])),
        )
        for key in ("pdf", "source", "summary", "translated")
    }
    chosen_templates = {name.lower() for name in params["template"] if name}
    template_filter = _merge_filter_set(
        chosen_templates or None,
        fq_terms["template"] or None,
    )

    def hash_of(position: int, record) -> str:
        # Prefer the stored hash; otherwise derive one from the source path,
        # or from the index position when no path is recorded either.
        return str(record.get("source_hash") or stable_hash(str(record.get("source_path") or position)))

    def survives(position: int) -> bool:
        record = paper_index.papers[position]
        digest = hash_of(position, record)
        in_md = digest in paper_index.md_path_by_hash
        in_pdf = digest in paper_index.pdf_path_by_hash
        summarized = bool(record.get("_has_summary"))
        translated = bool(paper_index.translated_md_by_hash.get(digest))
        if not (
            _matches_presence(presence["pdf"], in_pdf)
            and _matches_presence(presence["source"], in_md)
            and _matches_presence(presence["summary"], summarized)
            and _matches_presence(presence["translated"], translated)
        ):
            return False
        if not template_filter:
            return True
        lowered_tags = record.get("_template_tags_lc") or []
        return any(tag in template_filter for tag in lowered_tags)

    # An empty candidate set is left untouched; there is nothing to narrow.
    if matched:
        matched = {position for position in matched if survives(position)}

    in_order = [position for position in paper_index.ordered_ids if position in matched]
    total = len(in_order)
    first = (page_no - 1) * per_page
    last = min(first + per_page, total)

    stats_payload = None
    if page_no == 1:
        stats_payload = {
            "all": _compute_counts(paper_index, set(paper_index.ordered_ids)),
            "filtered": _compute_counts(paper_index, matched),
        }

    def describe(position: int) -> dict[str, Any]:
        record = paper_index.papers[position]
        digest = hash_of(position, record)
        translations = paper_index.translated_md_by_hash.get(digest, {})
        languages = sorted(translations.keys(), key=str.lower)
        return {
            "source_hash": digest,
            "title": record.get("paper_title") or "",
            "authors": record.get("_authors") or [],
            "year": record.get("_year") or "",
            "month": record.get("_month") or "",
            "venue": record.get("_venue") or "",
            "tags": record.get("_tags") or [],
            "template_tags": record.get("_template_tags") or [],
            "has_source": digest in paper_index.md_path_by_hash,
            "has_translation": bool(languages),
            "has_pdf": digest in paper_index.pdf_path_by_hash,
            "has_summary": bool(record.get("_has_summary")),
            "is_pdf_only": bool(record.get("_is_pdf_only")),
            "translation_languages": languages,
        }

    items: list[dict[str, Any]] = [describe(position) for position in in_order[first:last]]

    return JSONResponse(
        {
            "page": page_no,
            "page_size": per_page,
            "total": total,
            "has_more": last < total,
            "items": items,
            "stats": stats_payload,
        }
    )
2797
-
2798
-
2799
- async def _paper_detail(request: Request) -> HTMLResponse:
2800
- index: PaperIndex = request.app.state.index
2801
- md = request.app.state.md
2802
- source_hash = request.path_params["source_hash"]
2803
- idx = index.id_by_hash.get(source_hash)
2804
- if idx is None:
2805
- return RedirectResponse("/")
2806
- paper = index.papers[idx]
2807
- is_pdf_only = bool(paper.get("_is_pdf_only"))
2808
- page_title = str(paper.get("paper_title") or "Paper")
2809
- view = request.query_params.get("view")
2810
- template_param = request.query_params.get("template")
2811
- embed = request.query_params.get("embed") == "1"
2812
-
2813
- pdf_path = index.pdf_path_by_hash.get(source_hash)
2814
- pdf_url = f"/api/pdf/{source_hash}"
2815
- source_available = source_hash in index.md_path_by_hash
2816
- translations = index.translated_md_by_hash.get(source_hash, {})
2817
- translation_langs = sorted(translations.keys(), key=str.lower)
2818
- lang_param = request.query_params.get("lang")
2819
- normalized_lang = lang_param.lower() if lang_param else None
2820
- selected_lang = None
2821
- if translation_langs:
2822
- if normalized_lang and normalized_lang in translations:
2823
- selected_lang = normalized_lang
2824
- elif "zh" in translations:
2825
- selected_lang = "zh"
2826
- else:
2827
- selected_lang = translation_langs[0]
2828
- allowed_views = {"summary", "source", "translated", "pdf", "pdfjs", "split"}
2829
- if is_pdf_only:
2830
- allowed_views = {"pdf", "pdfjs", "split"}
2831
-
2832
- def normalize_view(value: str | None, default: str) -> str:
2833
- if value in allowed_views:
2834
- return value
2835
- return default
2836
-
2837
- preferred_pdf_view = "pdfjs" if pdf_path else "pdf"
2838
- default_view = preferred_pdf_view if is_pdf_only else "summary"
2839
- view = normalize_view(view, default_view)
2840
- if view == "split":
2841
- embed = False
2842
- if is_pdf_only:
2843
- left_param = request.query_params.get("left")
2844
- right_param = request.query_params.get("right")
2845
- left = normalize_view(left_param, preferred_pdf_view) if left_param else preferred_pdf_view
2846
- right = normalize_view(right_param, preferred_pdf_view) if right_param else preferred_pdf_view
2847
- else:
2848
- default_left = preferred_pdf_view if pdf_path else ("source" if source_available else "summary")
2849
- default_right = "summary"
2850
- left_param = request.query_params.get("left")
2851
- right_param = request.query_params.get("right")
2852
- left = normalize_view(left_param, default_left) if left_param else default_left
2853
- right = normalize_view(right_param, default_right) if right_param else default_right
2854
-
2855
- def render_page(title: str, body: str, extra_head: str = "", extra_scripts: str = "") -> HTMLResponse:
2856
- if embed:
2857
- return HTMLResponse(_embed_shell(title, body, extra_head, extra_scripts))
2858
- return HTMLResponse(_page_shell(title, body, extra_head, extra_scripts, header_title=page_title))
2859
-
2860
- def nav_link(label: str, v: str) -> str:
2861
- active = " active" if view == v else ""
2862
- params: dict[str, str] = {"view": v}
2863
- if v == "summary" and template_param:
2864
- params["template"] = str(template_param)
2865
- if v == "translated" and selected_lang:
2866
- params["lang"] = selected_lang
2867
- if v == "split":
2868
- params["left"] = left
2869
- params["right"] = right
2870
- href = f"/paper/{source_hash}?{urlencode(params)}"
2871
- return f'<a class="tab{active}" href="{html.escape(href)}">{html.escape(label)}</a>'
2872
-
2873
- tab_defs = [
2874
- ("Summary", "summary"),
2875
- ("Source", "source"),
2876
- ("Translated", "translated"),
2877
- ("PDF", "pdf"),
2878
- ("PDF Viewer", "pdfjs"),
2879
- ("Split", "split"),
2880
- ]
2881
- if is_pdf_only:
2882
- tab_defs = [
2883
- ("PDF", "pdf"),
2884
- ("PDF Viewer", "pdfjs"),
2885
- ("Split", "split"),
2886
- ]
2887
- tabs_html = '<div class="tabs">' + "".join(nav_link(label, v) for label, v in tab_defs) + "</div>"
2888
- fullscreen_controls = """
2889
- <div class="fullscreen-actions">
2890
- <button id="fullscreenEnter" class="fullscreen-enter" type="button" title="Enter fullscreen">Fullscreen</button>
2891
- <button id="fullscreenExit" class="fullscreen-exit" type="button" title="Exit fullscreen">Exit Fullscreen</button>
2892
- </div>
2893
- """
2894
-
2895
- def detail_toolbar(extra_controls: str = "") -> str:
2896
- if embed:
2897
- return ""
2898
- controls = extra_controls.strip()
2899
- toolbar_controls = f"{controls}{fullscreen_controls}" if controls else fullscreen_controls
2900
- return f"""
2901
- <div class="detail-toolbar">
2902
- {tabs_html}
2903
- <div class="toolbar-actions">
2904
- {toolbar_controls}
2905
- </div>
2906
- </div>
2907
- """
2908
-
2909
- def wrap_detail(content: str, toolbar_html: str | None = None) -> str:
2910
- if embed:
2911
- return content
2912
- toolbar = detail_toolbar() if toolbar_html is None else toolbar_html
2913
- return f"""
2914
- <div class="detail-shell">
2915
- {toolbar}
2916
- <div class="detail-body">
2917
- {content}
2918
- </div>
2919
- </div>
2920
- """
2921
-
2922
- fullscreen_script = ""
2923
- if not embed:
2924
- fullscreen_script = """
2925
- <script>
2926
- const fullscreenEnter = document.getElementById('fullscreenEnter');
2927
- const fullscreenExit = document.getElementById('fullscreenExit');
2928
- function setFullscreen(enable) {
2929
- document.body.classList.toggle('detail-fullscreen', enable);
2930
- }
2931
- if (fullscreenEnter) {
2932
- fullscreenEnter.addEventListener('click', () => setFullscreen(true));
2933
- }
2934
- if (fullscreenExit) {
2935
- fullscreenExit.addEventListener('click', () => setFullscreen(false));
2936
- }
2937
- document.addEventListener('keydown', (event) => {
2938
- if (event.key === 'Escape' && document.body.classList.contains('detail-fullscreen')) {
2939
- setFullscreen(false);
2940
- }
2941
- });
2942
- </script>
2943
- """
2944
- pdf_only_warning_html = ""
2945
- if is_pdf_only:
2946
- pdf_only_warning_html = (
2947
- '<div class="warning">PDF-only entry: summary and source views are unavailable.</div>'
2948
- )
2949
- outline_top = "72px" if not embed else "16px"
2950
- outline_html, outline_css, outline_js = _outline_assets(outline_top)
2951
-
2952
- if view == "split":
2953
- def pane_src(pane_view: str) -> str:
2954
- if pane_view == "pdfjs" and pdf_path:
2955
- return _build_pdfjs_viewer_url(pdf_url)
2956
- params: dict[str, str] = {"view": pane_view, "embed": "1"}
2957
- if pane_view == "summary" and template_param:
2958
- params["template"] = str(template_param)
2959
- if pane_view == "translated" and selected_lang:
2960
- params["lang"] = selected_lang
2961
- return f"/paper/{source_hash}?{urlencode(params)}"
2962
-
2963
- left_src = pane_src(left)
2964
- right_src = pane_src(right)
2965
- options = [
2966
- ("summary", "Summary"),
2967
- ("source", "Source"),
2968
- ("translated", "Translated"),
2969
- ("pdf", "PDF"),
2970
- ("pdfjs", "PDF Viewer"),
2971
- ]
2972
- if is_pdf_only:
2973
- options = [
2974
- ("pdf", "PDF"),
2975
- ("pdfjs", "PDF Viewer"),
2976
- ]
2977
- if translation_langs:
2978
- lang_options = "\n".join(
2979
- f'<option value="{html.escape(lang)}"{" selected" if lang == selected_lang else ""}>'
2980
- f'{html.escape(lang)}</option>'
2981
- for lang in translation_langs
2982
- )
2983
- lang_disabled = ""
2984
- else:
2985
- lang_options = '<option value="" selected>(no translations)</option>'
2986
- lang_disabled = " disabled"
2987
- left_options = "\n".join(
2988
- f'<option value="{value}"{" selected" if value == left else ""}>{label}</option>'
2989
- for value, label in options
2990
- )
2991
- right_options = "\n".join(
2992
- f'<option value="{value}"{" selected" if value == right else ""}>{label}</option>'
2993
- for value, label in options
2994
- )
2995
- split_controls = f"""
2996
- <div class="split-inline">
2997
- <span class="muted">Left</span>
2998
- <select id="splitLeft">
2999
- {left_options}
3000
- </select>
3001
- <div class="split-actions">
3002
- <button id="splitTighten" type="button" title="Tighten width">-</button>
3003
- <button id="splitSwap" type="button" title="Swap panes">⇄</button>
3004
- <button id="splitWiden" type="button" title="Widen width">+</button>
3005
- </div>
3006
- <span class="muted">Right</span>
3007
- <select id="splitRight">
3008
- {right_options}
3009
- </select>
3010
- <span class="muted">Lang</span>
3011
- <select id="splitLang"{lang_disabled}>
3012
- {lang_options}
3013
- </select>
3014
- </div>
3015
- """
3016
- toolbar_html = detail_toolbar(split_controls)
3017
- split_layout = f"""
3018
- {pdf_only_warning_html}
3019
- <div class="split-layout">
3020
- <div class="split-pane">
3021
- <iframe id="leftPane" src="{html.escape(left_src)}" title="Left pane"></iframe>
3022
- </div>
3023
- <div class="split-pane">
3024
- <iframe id="rightPane" src="{html.escape(right_src)}" title="Right pane"></iframe>
3025
- </div>
3026
- </div>
3027
- """
3028
- body = wrap_detail(split_layout, toolbar_html=toolbar_html)
3029
- extra_head = """
3030
- <style>
3031
- .container {
3032
- max-width: 100%;
3033
- width: 100%;
3034
- margin: 0 auto;
3035
- }
3036
- .split-layout {
3037
- display: flex;
3038
- gap: 12px;
3039
- width: 100%;
3040
- max-width: var(--split-max-width, 100%);
3041
- margin: 0 auto;
3042
- flex: 1;
3043
- min-height: 440px;
3044
- }
3045
- .split-pane {
3046
- flex: 1;
3047
- border: 1px solid #d0d7de;
3048
- border-radius: 10px;
3049
- overflow: hidden;
3050
- background: #fff;
3051
- }
3052
- .split-pane iframe {
3053
- width: 100%;
3054
- height: 100%;
3055
- border: 0;
3056
- }
3057
- @media (max-width: 900px) {
3058
- .split-layout {
3059
- flex-direction: column;
3060
- min-height: 0;
3061
- }
3062
- .split-pane {
3063
- height: 70vh;
3064
- }
3065
- }
3066
- </style>
3067
- """
3068
- extra_scripts = """
3069
- <script>
3070
- const leftSelect = document.getElementById('splitLeft');
3071
- const rightSelect = document.getElementById('splitRight');
3072
- const langSelect = document.getElementById('splitLang');
3073
- const swapButton = document.getElementById('splitSwap');
3074
- const tightenButton = document.getElementById('splitTighten');
3075
- const widenButton = document.getElementById('splitWiden');
3076
- function updateSplit() {
3077
- const params = new URLSearchParams(window.location.search);
3078
- params.set('view', 'split');
3079
- params.set('left', leftSelect.value);
3080
- params.set('right', rightSelect.value);
3081
- if (langSelect && langSelect.value) {
3082
- params.set('lang', langSelect.value);
3083
- }
3084
- window.location.search = params.toString();
3085
- }
3086
- leftSelect.addEventListener('change', updateSplit);
3087
- rightSelect.addEventListener('change', updateSplit);
3088
- if (langSelect) {
3089
- langSelect.addEventListener('change', updateSplit);
3090
- }
3091
- swapButton.addEventListener('click', () => {
3092
- const leftValue = leftSelect.value;
3093
- leftSelect.value = rightSelect.value;
3094
- rightSelect.value = leftValue;
3095
- updateSplit();
3096
- });
3097
- const widthSteps = ["1200px", "1400px", "1600px", "1800px", "2000px", "100%"];
3098
- let widthIndex = widthSteps.length - 1;
3099
- try {
3100
- const stored = localStorage.getItem('splitWidthIndex');
3101
- if (stored !== null) {
3102
- const parsed = Number.parseInt(stored, 10);
3103
- if (!Number.isNaN(parsed)) {
3104
- widthIndex = Math.max(0, Math.min(widthSteps.length - 1, parsed));
3105
- }
3106
- }
3107
- } catch (err) {
3108
- // Ignore storage errors (e.g. private mode)
3109
- }
3110
-
3111
- function applySplitWidth() {
3112
- const value = widthSteps[widthIndex];
3113
- document.documentElement.style.setProperty('--split-max-width', value);
3114
- try {
3115
- localStorage.setItem('splitWidthIndex', String(widthIndex));
3116
- } catch (err) {
3117
- // Ignore storage errors
3118
- }
3119
- }
3120
-
3121
- tightenButton.addEventListener('click', () => {
3122
- widthIndex = Math.max(0, widthIndex - 1);
3123
- applySplitWidth();
3124
- });
3125
- widenButton.addEventListener('click', () => {
3126
- widthIndex = Math.min(widthSteps.length - 1, widthIndex + 1);
3127
- applySplitWidth();
3128
- });
3129
- applySplitWidth();
3130
- </script>
3131
- """
3132
- return render_page(
3133
- "Split View",
3134
- body,
3135
- extra_head=extra_head,
3136
- extra_scripts=extra_scripts + fullscreen_script,
3137
- )
3138
-
3139
- if view == "translated":
3140
- if translation_langs:
3141
- lang_options = "\n".join(
3142
- f'<option value="{html.escape(lang)}"{" selected" if lang == selected_lang else ""}>'
3143
- f'{html.escape(lang)}</option>'
3144
- for lang in translation_langs
3145
- )
3146
- disabled_attr = ""
3147
- else:
3148
- lang_options = '<option value="" selected>(no translations)</option>'
3149
- disabled_attr = " disabled"
3150
- lang_controls = f"""
3151
- <div class="lang-select">
3152
- <label for="translationLang">Language</label>
3153
- <select id="translationLang"{disabled_attr}>
3154
- {lang_options}
3155
- </select>
3156
- </div>
3157
- """
3158
- toolbar_html = detail_toolbar(lang_controls)
3159
- if not translation_langs or not selected_lang:
3160
- body = wrap_detail(
3161
- '<div class="warning">No translated markdown found. '
3162
- 'Provide <code>--md-translated-root</code> and place '
3163
- '<code>&lt;base&gt;.&lt;lang&gt;.md</code> under that root.</div>',
3164
- toolbar_html=toolbar_html,
3165
- )
3166
- return render_page("Translated", body, extra_scripts=fullscreen_script)
3167
- translated_path = translations.get(selected_lang)
3168
- if not translated_path:
3169
- body = wrap_detail(
3170
- '<div class="warning">Translated markdown not found for the selected language.</div>',
3171
- toolbar_html=toolbar_html,
3172
- )
3173
- return render_page("Translated", body, extra_scripts=fullscreen_script)
3174
- try:
3175
- raw = translated_path.read_text(encoding="utf-8")
3176
- except UnicodeDecodeError:
3177
- raw = translated_path.read_text(encoding="latin-1")
3178
- raw = _normalize_markdown_images(raw)
3179
- rendered = _render_markdown_with_math_placeholders(md, raw)
3180
- body = wrap_detail(
3181
- f"""
3182
- <div class="muted">Language: {html.escape(selected_lang)}</div>
3183
- <div class="muted">{html.escape(str(translated_path))}</div>
3184
- <div class="muted" style="margin-top:10px;">Rendered from translated markdown:</div>
3185
- {outline_html}
3186
- <div id="content">{rendered}</div>
3187
- <details style="margin-top:12px;"><summary>Raw markdown</summary>
3188
- <pre><code>{html.escape(raw)}</code></pre>
3189
- </details>
3190
- """,
3191
- toolbar_html=toolbar_html,
3192
- )
3193
- extra_head = f"""
3194
- <link rel="stylesheet" href="{_CDN_KATEX}" />
3195
- {outline_css}
3196
- <style>
3197
- #content img {{
3198
- max-width: 100%;
3199
- height: auto;
3200
- }}
3201
- </style>
3202
- """
3203
- extra_scripts = f"""
3204
- <script src="{_CDN_MERMAID}"></script>
3205
- <script src="{_CDN_KATEX_JS}"></script>
3206
- <script src="{_CDN_KATEX_AUTO}"></script>
3207
- <script>
3208
- const translationSelect = document.getElementById('translationLang');
3209
- if (translationSelect) {{
3210
- translationSelect.addEventListener('change', () => {{
3211
- const params = new URLSearchParams(window.location.search);
3212
- params.set('view', 'translated');
3213
- params.set('lang', translationSelect.value);
3214
- window.location.search = params.toString();
3215
- }});
3216
- }}
3217
- document.querySelectorAll('code.language-mermaid').forEach((code) => {{
3218
- const pre = code.parentElement;
3219
- const div = document.createElement('div');
3220
- div.className = 'mermaid';
3221
- div.textContent = code.textContent;
3222
- pre.replaceWith(div);
3223
- }});
3224
- if (window.mermaid) {{
3225
- mermaid.initialize({{ startOnLoad: false }});
3226
- mermaid.run();
3227
- }}
3228
- if (window.renderMathInElement) {{
3229
- renderMathInElement(document.getElementById('content'), {{
3230
- delimiters: [
3231
- {{left: '$$', right: '$$', display: true}},
3232
- {{left: '$', right: '$', display: false}},
3233
- {{left: '\\\\(', right: '\\\\)', display: false}},
3234
- {{left: '\\\\[', right: '\\\\]', display: true}}
3235
- ],
3236
- throwOnError: false
3237
- }});
3238
- }}
3239
- if (document.querySelector('.footnotes')) {{
3240
- const notes = {{}};
3241
- document.querySelectorAll('.footnotes li[id]').forEach((li) => {{
3242
- const id = li.getAttribute('id');
3243
- if (!id) return;
3244
- const clone = li.cloneNode(true);
3245
- clone.querySelectorAll('a.footnote-backref').forEach((el) => el.remove());
3246
- const text = (clone.textContent || '').replace(/\\s+/g, ' ').trim();
3247
- if (text) notes['#' + id] = text.length > 400 ? text.slice(0, 397) + '…' : text;
3248
- }});
3249
- document.querySelectorAll('.footnote-ref a[href^="#fn"]').forEach((link) => {{
3250
- const ref = link.getAttribute('href');
3251
- const text = notes[ref];
3252
- if (!text) return;
3253
- link.dataset.footnote = text;
3254
- link.classList.add('footnote-tip');
3255
- }});
3256
- }}
3257
- {outline_js}
3258
- </script>
3259
- """
3260
- return render_page(
3261
- "Translated",
3262
- body,
3263
- extra_head=extra_head,
3264
- extra_scripts=extra_scripts + fullscreen_script,
3265
- )
3266
-
3267
- if view == "source":
3268
- source_path = index.md_path_by_hash.get(source_hash)
3269
- if not source_path:
3270
- body = wrap_detail(
3271
- '<div class="warning">Source markdown not found. Provide --md-root to enable source viewing.</div>'
3272
- )
3273
- return render_page("Source", body, extra_scripts=fullscreen_script)
3274
- try:
3275
- raw = source_path.read_text(encoding="utf-8")
3276
- except UnicodeDecodeError:
3277
- raw = source_path.read_text(encoding="latin-1")
3278
- rendered = _render_markdown_with_math_placeholders(md, raw)
3279
- body = wrap_detail(
3280
- f"""
3281
- <div class="muted">{html.escape(str(source_path))}</div>
3282
- <div class="muted" style="margin-top:10px;">Rendered from source markdown:</div>
3283
- {outline_html}
3284
- <div id="content">{rendered}</div>
3285
- <details style="margin-top:12px;"><summary>Raw markdown</summary>
3286
- <pre><code>{html.escape(raw)}</code></pre>
3287
- </details>
3288
- """
3289
- )
3290
- extra_head = f"""
3291
- <link rel="stylesheet" href="{_CDN_KATEX}" />
3292
- {outline_css}
3293
- <style>
3294
- #content img {{
3295
- max-width: 100%;
3296
- height: auto;
3297
- }}
3298
- </style>
3299
- """
3300
- extra_scripts = f"""
3301
- <script src="{_CDN_MERMAID}"></script>
3302
- <script src="{_CDN_KATEX_JS}"></script>
3303
- <script src="{_CDN_KATEX_AUTO}"></script>
3304
- <script>
3305
- document.querySelectorAll('code.language-mermaid').forEach((code) => {{
3306
- const pre = code.parentElement;
3307
- const div = document.createElement('div');
3308
- div.className = 'mermaid';
3309
- div.textContent = code.textContent;
3310
- pre.replaceWith(div);
3311
- }});
3312
- if (window.mermaid) {{
3313
- mermaid.initialize({{ startOnLoad: false }});
3314
- mermaid.run();
3315
- }}
3316
- if (window.renderMathInElement) {{
3317
- renderMathInElement(document.getElementById('content'), {{
3318
- delimiters: [
3319
- {{left: '$$', right: '$$', display: true}},
3320
- {{left: '$', right: '$', display: false}},
3321
- {{left: '\\\\(', right: '\\\\)', display: false}},
3322
- {{left: '\\\\[', right: '\\\\]', display: true}}
3323
- ],
3324
- throwOnError: false
3325
- }});
3326
- }}
3327
- if (document.querySelector('.footnotes')) {{
3328
- const notes = {{}};
3329
- document.querySelectorAll('.footnotes li[id]').forEach((li) => {{
3330
- const id = li.getAttribute('id');
3331
- if (!id) return;
3332
- const clone = li.cloneNode(true);
3333
- clone.querySelectorAll('a.footnote-backref').forEach((el) => el.remove());
3334
- const text = (clone.textContent || '').replace(/\\s+/g, ' ').trim();
3335
- if (text) notes['#' + id] = text.length > 400 ? text.slice(0, 397) + '…' : text;
3336
- }});
3337
- document.querySelectorAll('.footnote-ref a[href^="#fn"]').forEach((link) => {{
3338
- const ref = link.getAttribute('href');
3339
- const text = notes[ref];
3340
- if (!text) return;
3341
- link.dataset.footnote = text;
3342
- link.classList.add('footnote-tip');
3343
- }});
3344
- }}
3345
- {outline_js}
3346
- </script>
3347
- """
3348
- return render_page("Source", body, extra_head=extra_head, extra_scripts=extra_scripts + fullscreen_script)
3349
-
3350
- if view == "pdf":
3351
- if not pdf_path:
3352
- body = wrap_detail('<div class="warning">PDF not found. Provide --pdf-root to enable PDF viewing.</div>')
3353
- return render_page("PDF", body, extra_scripts=fullscreen_script)
3354
- body = wrap_detail(
3355
- f"""
3356
- {pdf_only_warning_html}
3357
- <div class="muted">{html.escape(str(pdf_path.name))}</div>
3358
- <div style="display:flex; gap:8px; align-items:center; margin: 10px 0;">
3359
- <button id="prev" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">Prev</button>
3360
- <button id="next" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">Next</button>
3361
- <span class="muted">Page <span id="page_num">1</span> / <span id="page_count">?</span></span>
3362
- <span style="flex:1"></span>
3363
- <button id="zoomOut" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">-</button>
3364
- <button id="zoomIn" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">+</button>
3365
- </div>
3366
- <canvas id="the-canvas" style="width: 100%; border: 1px solid #d0d7de; border-radius: 10px;"></canvas>
3367
- """
3368
- )
3369
- extra_scripts = f"""
3370
- <script src="{_CDN_PDFJS}"></script>
3371
- <script>
3372
- const url = {json.dumps(pdf_url)};
3373
- pdfjsLib.GlobalWorkerOptions.workerSrc = {json.dumps(_CDN_PDFJS_WORKER)};
3374
- let pdfDoc = null;
3375
- let pageNum = 1;
3376
- let pageRendering = false;
3377
- let pageNumPending = null;
3378
- let zoomLevel = 1.0;
3379
- const canvas = document.getElementById('the-canvas');
3380
- const ctx = canvas.getContext('2d');
3381
-
3382
- function renderPage(num) {{
3383
- pageRendering = true;
3384
- pdfDoc.getPage(num).then((page) => {{
3385
- const baseViewport = page.getViewport({{scale: 1}});
3386
- const containerWidth = canvas.clientWidth || baseViewport.width;
3387
- const fitScale = containerWidth / baseViewport.width;
3388
- const scale = fitScale * zoomLevel;
3389
-
3390
- const viewport = page.getViewport({{scale}});
3391
- const outputScale = window.devicePixelRatio || 1;
3392
-
3393
- canvas.width = Math.floor(viewport.width * outputScale);
3394
- canvas.height = Math.floor(viewport.height * outputScale);
3395
- canvas.style.width = Math.floor(viewport.width) + 'px';
3396
- canvas.style.height = Math.floor(viewport.height) + 'px';
3397
-
3398
- const transform = outputScale !== 1 ? [outputScale, 0, 0, outputScale, 0, 0] : null;
3399
- const renderContext = {{ canvasContext: ctx, viewport, transform }};
3400
- const renderTask = page.render(renderContext);
3401
- renderTask.promise.then(() => {{
3402
- pageRendering = false;
3403
- document.getElementById('page_num').textContent = String(pageNum);
3404
- if (pageNumPending !== null) {{
3405
- const next = pageNumPending;
3406
- pageNumPending = null;
3407
- renderPage(next);
3408
- }}
3409
- }});
3410
- }});
3411
- }}
3412
-
3413
- function queueRenderPage(num) {{
3414
- if (pageRendering) {{
3415
- pageNumPending = num;
3416
- }} else {{
3417
- renderPage(num);
3418
- }}
3419
- }}
3420
-
3421
- function onPrevPage() {{
3422
- if (pageNum <= 1) return;
3423
- pageNum--;
3424
- queueRenderPage(pageNum);
3425
- }}
3426
-
3427
- function onNextPage() {{
3428
- if (pageNum >= pdfDoc.numPages) return;
3429
- pageNum++;
3430
- queueRenderPage(pageNum);
3431
- }}
3432
-
3433
- function adjustZoom(delta) {{
3434
- zoomLevel = Math.max(0.5, Math.min(3.0, zoomLevel + delta));
3435
- queueRenderPage(pageNum);
3436
- }}
3437
-
3438
- document.getElementById('prev').addEventListener('click', onPrevPage);
3439
- document.getElementById('next').addEventListener('click', onNextPage);
3440
- document.getElementById('zoomOut').addEventListener('click', () => adjustZoom(-0.1));
3441
- document.getElementById('zoomIn').addEventListener('click', () => adjustZoom(0.1));
3442
-
3443
- pdfjsLib.getDocument(url).promise.then((pdfDoc_) => {{
3444
- pdfDoc = pdfDoc_;
3445
- document.getElementById('page_count').textContent = String(pdfDoc.numPages);
3446
- renderPage(pageNum);
3447
- }});
3448
-
3449
- let resizeTimer = null;
3450
- window.addEventListener('resize', () => {{
3451
- if (!pdfDoc) return;
3452
- if (resizeTimer) clearTimeout(resizeTimer);
3453
- resizeTimer = setTimeout(() => queueRenderPage(pageNum), 150);
3454
- }});
3455
- </script>
3456
- """
3457
- return render_page("PDF", body, extra_scripts=extra_scripts + fullscreen_script)
3458
-
3459
- if view == "pdfjs":
3460
- if not pdf_path:
3461
- body = wrap_detail('<div class="warning">PDF not found. Provide --pdf-root to enable PDF viewing.</div>')
3462
- return render_page("PDF Viewer", body, extra_scripts=fullscreen_script)
3463
- viewer_url = _build_pdfjs_viewer_url(pdf_url)
3464
- frame_height = "calc(100vh - 32px)" if embed else "100%"
3465
- body = wrap_detail(
3466
- f"""
3467
- {pdf_only_warning_html}
3468
- <div class="muted">{html.escape(str(pdf_path.name))}</div>
3469
- <iframe class="pdfjs-frame" src="{html.escape(viewer_url)}" title="PDF.js Viewer"></iframe>
3470
- """
3471
- )
3472
- extra_head = f"""
3473
- <style>
3474
- .pdfjs-frame {{
3475
- width: 100%;
3476
- height: {frame_height};
3477
- border: 1px solid #d0d7de;
3478
- border-radius: 10px;
3479
- flex: 1;
3480
- }}
3481
- </style>
3482
- """
3483
- return render_page("PDF Viewer", body, extra_head=extra_head, extra_scripts=fullscreen_script)
3484
-
3485
- selected_tag, available_templates = _select_template_tag(paper, template_param)
3486
- markdown, template_name, warning = _render_paper_markdown(
3487
- paper,
3488
- request.app.state.fallback_language,
3489
- template_tag=selected_tag,
3490
- )
3491
- rendered_html = _render_markdown_with_math_placeholders(md, markdown)
3492
-
3493
- warning_html = f'<div class="warning">{html.escape(warning)}</div>' if warning else ""
3494
- template_controls = f'<div class="muted">Template: {html.escape(template_name)}</div>'
3495
- if available_templates:
3496
- options = "\n".join(
3497
- f'<option value="{html.escape(tag)}"{" selected" if tag == selected_tag else ""}>{html.escape(tag)}</option>'
3498
- for tag in available_templates
3499
- )
3500
- template_controls = f"""
3501
- <div class="muted" style="margin: 6px 0;">
3502
- Template:
3503
- <select id="templateSelect" style="padding:6px 8px; border:1px solid #d0d7de; border-radius:6px;">
3504
- {options}
3505
- </select>
3506
- </div>
3507
- <script>
3508
- const templateSelect = document.getElementById('templateSelect');
3509
- if (templateSelect) {{
3510
- templateSelect.addEventListener('change', () => {{
3511
- const params = new URLSearchParams(window.location.search);
3512
- params.set('view', 'summary');
3513
- params.set('template', templateSelect.value);
3514
- window.location.search = params.toString();
3515
- }});
3516
- }}
3517
- </script>
3518
- """
3519
- content_html = f"""
3520
- {template_controls}
3521
- {warning_html}
3522
- {outline_html}
3523
- <div id="content">{rendered_html}</div>
3524
- """
3525
- body = wrap_detail(content_html)
3526
-
3527
- extra_head = f"""
3528
- <link rel="stylesheet" href="{_CDN_KATEX}" />
3529
- {outline_css}
3530
- """
3531
- extra_scripts = f"""
3532
- <script src="{_CDN_MERMAID}"></script>
3533
- <script src="{_CDN_KATEX_JS}"></script>
3534
- <script src="{_CDN_KATEX_AUTO}"></script>
3535
- <script>
3536
- // Mermaid: convert fenced code blocks to mermaid divs
3537
- document.querySelectorAll('code.language-mermaid').forEach((code) => {{
3538
- const pre = code.parentElement;
3539
- const div = document.createElement('div');
3540
- div.className = 'mermaid';
3541
- div.textContent = code.textContent;
3542
- pre.replaceWith(div);
3543
- }});
3544
- if (window.mermaid) {{
3545
- mermaid.initialize({{ startOnLoad: false }});
3546
- mermaid.run();
3547
- }}
3548
- if (window.renderMathInElement) {{
3549
- renderMathInElement(document.getElementById('content'), {{
3550
- delimiters: [
3551
- {{left: '$$', right: '$$', display: true}},
3552
- {{left: '$', right: '$', display: false}},
3553
- {{left: '\\\\(', right: '\\\\)', display: false}},
3554
- {{left: '\\\\[', right: '\\\\]', display: true}}
3555
- ],
3556
- throwOnError: false
3557
- }});
3558
- }}
3559
- if (document.querySelector('.footnotes')) {{
3560
- const notes = {{}};
3561
- document.querySelectorAll('.footnotes li[id]').forEach((li) => {{
3562
- const id = li.getAttribute('id');
3563
- if (!id) return;
3564
- const clone = li.cloneNode(true);
3565
- clone.querySelectorAll('a.footnote-backref').forEach((el) => el.remove());
3566
- const text = (clone.textContent || '').replace(/\\s+/g, ' ').trim();
3567
- if (text) notes['#' + id] = text.length > 400 ? text.slice(0, 397) + '…' : text;
3568
- }});
3569
- document.querySelectorAll('.footnote-ref a[href^="#fn"]').forEach((link) => {{
3570
- const ref = link.getAttribute('href');
3571
- const text = notes[ref];
3572
- if (!text) return;
3573
- link.dataset.footnote = text;
3574
- link.classList.add('footnote-tip');
3575
- }});
3576
- }}
3577
- {outline_js}
3578
- </script>
3579
- """
3580
- return render_page(page_title, body, extra_head=extra_head, extra_scripts=extra_scripts + fullscreen_script)
3581
-
3582
-
3583
- async def _api_stats(request: Request) -> JSONResponse:
3584
- index: PaperIndex = request.app.state.index
3585
- return JSONResponse(index.stats)
3586
-
3587
-
3588
- async def _api_pdf(request: Request) -> Response:
3589
- index: PaperIndex = request.app.state.index
3590
- source_hash = request.path_params["source_hash"]
3591
- pdf_path = index.pdf_path_by_hash.get(source_hash)
3592
- if not pdf_path:
3593
- return Response("PDF not found", status_code=404)
3594
- allowed_roots: list[Path] = request.app.state.pdf_roots
3595
- if allowed_roots and not _ensure_under_roots(pdf_path, allowed_roots):
3596
- return Response("Forbidden", status_code=403)
3597
- return FileResponse(pdf_path)
3598
-
3599
-
3600
- async def _stats_page(request: Request) -> HTMLResponse:
3601
- body = """
3602
- <h2>Stats</h2>
3603
- <div class="muted">Charts are rendered with ECharts (CDN).</div>
3604
- <div id="year" style="width:100%;height:360px"></div>
3605
- <div id="month" style="width:100%;height:360px"></div>
3606
- <div id="tags" style="width:100%;height:420px"></div>
3607
- <div id="keywords" style="width:100%;height:420px"></div>
3608
- <div id="authors" style="width:100%;height:420px"></div>
3609
- <div id="venues" style="width:100%;height:420px"></div>
3610
- """
3611
- scripts = f"""
3612
- <script src="{_CDN_ECHARTS}"></script>
3613
- <script>
3614
- async function main() {{
3615
- const res = await fetch('/api/stats');
3616
- const data = await res.json();
3617
-
3618
- function bar(el, title, items) {{
3619
- const chart = echarts.init(document.getElementById(el));
3620
- const labels = items.map(x => x.label);
3621
- const counts = items.map(x => x.count);
3622
- chart.setOption({{
3623
- title: {{ text: title }},
3624
- tooltip: {{ trigger: 'axis' }},
3625
- xAxis: {{ type: 'category', data: labels }},
3626
- yAxis: {{ type: 'value' }},
3627
- series: [{{ type: 'bar', data: counts }}]
3628
- }});
3629
- }}
3630
-
3631
- bar('year', 'Publication Year', data.years || []);
3632
- bar('month', 'Publication Month', data.months || []);
3633
- bar('tags', 'Top Tags', (data.tags || []).slice(0, 20));
3634
- bar('keywords', 'Top Keywords', (data.keywords || []).slice(0, 20));
3635
- bar('authors', 'Top Authors', (data.authors || []).slice(0, 20));
3636
- bar('venues', 'Top Venues', (data.venues || []).slice(0, 20));
3637
- }}
3638
- main();
3639
- </script>
3640
- """
3641
- return HTMLResponse(_page_shell("Stats", body, extra_scripts=scripts))
3642
-
3643
-
3644
- def _normalize_bibtex_title(title: str) -> str:
3645
- value = title.replace("{", "").replace("}", "")
3646
- value = re.sub(r"[^a-z0-9]+", " ", value.lower())
3647
- return re.sub(r"\\s+", " ", value).strip()
3648
-
3649
-
3650
- def _title_similarity(a: str, b: str) -> float:
3651
- import difflib
3652
-
3653
- if not a or not b:
3654
- return 0.0
3655
- return difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()
3656
-
3657
-
3658
- def enrich_with_bibtex(papers: list[dict[str, Any]], bibtex_path: Path) -> None:
3659
- if not PYBTEX_AVAILABLE:
3660
- raise RuntimeError("pybtex is required for --bibtex support")
3661
-
3662
- bib_data = parse_file(str(bibtex_path))
3663
- entries: list[dict[str, Any]] = []
3664
- by_prefix: dict[str, list[int]] = {}
3665
- for key, entry in bib_data.entries.items():
3666
- fields = dict(entry.fields)
3667
- title = str(fields.get("title") or "").strip()
3668
- title_norm = _normalize_bibtex_title(title)
3669
- if not title_norm:
3670
- continue
3671
- record = {
3672
- "key": key,
3673
- "type": entry.type,
3674
- "fields": fields,
3675
- "persons": {role: [str(p) for p in persons] for role, persons in entry.persons.items()},
3676
- "_title_norm": title_norm,
3677
- }
3678
- idx = len(entries)
3679
- entries.append(record)
3680
- prefix = title_norm[:16]
3681
- by_prefix.setdefault(prefix, []).append(idx)
3682
-
3683
- for paper in papers:
3684
- if isinstance(paper.get("bibtex"), dict):
3685
- continue
3686
- title = str(paper.get("paper_title") or "").strip()
3687
- if not title:
3688
- continue
3689
- norm = _normalize_bibtex_title(title)
3690
- if not norm:
3691
- continue
3692
-
3693
- candidates = []
3694
- prefix = norm[:16]
3695
- for cand_idx in by_prefix.get(prefix, []):
3696
- candidates.append(entries[cand_idx])
3697
- if not candidates:
3698
- candidates = entries
3699
-
3700
- best = None
3701
- best_score = 0.0
3702
- for entry in candidates:
3703
- score = _title_similarity(norm, entry["_title_norm"])
3704
- if score > best_score:
3705
- best_score = score
3706
- best = entry
3707
-
3708
- if best is not None and best_score >= 0.9:
3709
- paper["bibtex"] = {k: v for k, v in best.items() if not k.startswith("_")}
28
+ class _NoIndexMiddleware(BaseHTTPMiddleware):
29
+ async def dispatch(self, request: Request, call_next): # type: ignore[override]
30
+ response = await call_next(request)
31
+ response.headers["X-Robots-Tag"] = "noindex, nofollow, noarchive, nosnippet, noai, noimageai"
32
+ return response
3710
33
 
3711
34
 
3712
35
  def create_app(
@@ -3720,7 +43,7 @@ def create_app(
3720
43
  cache_dir: Path | None = None,
3721
44
  use_cache: bool = True,
3722
45
  ) -> Starlette:
3723
- papers = _load_or_merge_papers(db_paths, bibtex_path, cache_dir, use_cache, pdf_roots=pdf_roots)
46
+ papers = load_and_merge_papers(db_paths, bibtex_path, cache_dir, use_cache, pdf_roots=pdf_roots)
3724
47
 
3725
48
  md_roots = md_roots or []
3726
49
  md_translated_roots = md_translated_roots or []
@@ -3731,29 +54,39 @@ def create_app(
3731
54
  md_translated_roots=md_translated_roots,
3732
55
  pdf_roots=pdf_roots,
3733
56
  )
3734
- md = _md_renderer()
57
+ md = create_md_renderer()
3735
58
  routes = [
3736
- Route("/", _index_page, methods=["GET"]),
3737
- Route("/stats", _stats_page, methods=["GET"]),
3738
- Route("/paper/{source_hash:str}", _paper_detail, methods=["GET"]),
3739
- Route("/api/papers", _api_papers, methods=["GET"]),
3740
- Route("/api/stats", _api_stats, methods=["GET"]),
3741
- Route("/api/pdf/{source_hash:str}", _api_pdf, methods=["GET"]),
59
+ Route("/", index_page, methods=["GET"]),
60
+ Route("/robots.txt", robots_txt, methods=["GET"]),
61
+ Route("/stats", stats_page, methods=["GET"]),
62
+ Route("/paper/{source_hash:str}", paper_detail, methods=["GET"]),
63
+ Route("/api/papers", api_papers, methods=["GET"]),
64
+ Route("/api/stats", api_stats, methods=["GET"]),
65
+ Route("/api/pdf/{source_hash:str}", api_pdf, methods=["GET"]),
3742
66
  ]
3743
- if _PDFJS_STATIC_DIR.exists():
67
+ if PDFJS_STATIC_DIR.exists():
3744
68
  routes.append(
3745
69
  Mount(
3746
70
  "/pdfjs",
3747
- app=StaticFiles(directory=str(_PDFJS_STATIC_DIR), html=True),
71
+ app=StaticFiles(directory=str(PDFJS_STATIC_DIR), html=True),
3748
72
  name="pdfjs",
3749
73
  )
3750
74
  )
3751
75
  elif pdf_roots:
3752
76
  logger.warning(
3753
77
  "PDF.js viewer assets not found at %s; PDF Viewer mode will be unavailable.",
3754
- _PDFJS_STATIC_DIR,
78
+ PDFJS_STATIC_DIR,
79
+ )
80
+ if STATIC_DIR.exists():
81
+ routes.append(
82
+ Mount(
83
+ "/static",
84
+ app=StaticFiles(directory=str(STATIC_DIR)),
85
+ name="static",
86
+ )
3755
87
  )
3756
88
  app = Starlette(routes=routes)
89
+ app.add_middleware(_NoIndexMiddleware)
3757
90
  app.state.index = index
3758
91
  app.state.md = md
3759
92
  app.state.fallback_language = fallback_language