deepresearch-flow 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. deepresearch_flow/cli.py +2 -0
  2. deepresearch_flow/paper/config.py +15 -0
  3. deepresearch_flow/paper/db.py +193 -0
  4. deepresearch_flow/paper/db_ops.py +1939 -0
  5. deepresearch_flow/paper/llm.py +2 -0
  6. deepresearch_flow/paper/web/app.py +46 -3320
  7. deepresearch_flow/paper/web/constants.py +23 -0
  8. deepresearch_flow/paper/web/filters.py +255 -0
  9. deepresearch_flow/paper/web/handlers/__init__.py +14 -0
  10. deepresearch_flow/paper/web/handlers/api.py +217 -0
  11. deepresearch_flow/paper/web/handlers/pages.py +334 -0
  12. deepresearch_flow/paper/web/markdown.py +549 -0
  13. deepresearch_flow/paper/web/static/css/main.css +857 -0
  14. deepresearch_flow/paper/web/static/js/detail.js +406 -0
  15. deepresearch_flow/paper/web/static/js/index.js +266 -0
  16. deepresearch_flow/paper/web/static/js/outline.js +58 -0
  17. deepresearch_flow/paper/web/static/js/stats.js +39 -0
  18. deepresearch_flow/paper/web/templates/base.html +43 -0
  19. deepresearch_flow/paper/web/templates/detail.html +332 -0
  20. deepresearch_flow/paper/web/templates/index.html +114 -0
  21. deepresearch_flow/paper/web/templates/stats.html +29 -0
  22. deepresearch_flow/paper/web/templates.py +85 -0
  23. deepresearch_flow/paper/web/text.py +68 -0
  24. deepresearch_flow/recognize/cli.py +157 -3
  25. deepresearch_flow/recognize/organize.py +58 -0
  26. deepresearch_flow/translator/__init__.py +1 -0
  27. deepresearch_flow/translator/cli.py +451 -0
  28. deepresearch_flow/translator/config.py +19 -0
  29. deepresearch_flow/translator/engine.py +959 -0
  30. deepresearch_flow/translator/fixers.py +451 -0
  31. deepresearch_flow/translator/placeholder.py +62 -0
  32. deepresearch_flow/translator/prompts.py +116 -0
  33. deepresearch_flow/translator/protector.py +291 -0
  34. deepresearch_flow/translator/segment.py +180 -0
  35. deepresearch_flow-0.4.0.dist-info/METADATA +327 -0
  36. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/RECORD +40 -13
  37. deepresearch_flow-0.2.1.dist-info/METADATA +0 -424
  38. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/WHEEL +0 -0
  39. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/entry_points.txt +0 -0
  40. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
  41. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/top_level.txt +0 -0
@@ -1,3326 +1,35 @@
1
1
  from __future__ import annotations
2
2
 
3
- import html
4
- import json
5
3
  import logging
6
- import unicodedata
7
- from dataclasses import dataclass
8
- from html.parser import HTMLParser
9
4
  from pathlib import Path
10
- from typing import Any
11
- import re
12
- from urllib.parse import urlencode, quote
13
5
 
14
- from markdown_it import MarkdownIt
15
6
  from starlette.applications import Starlette
7
+ from starlette.middleware.base import BaseHTTPMiddleware
16
8
  from starlette.requests import Request
17
- from starlette.responses import FileResponse, HTMLResponse, JSONResponse, RedirectResponse, Response
18
9
  from starlette.routing import Mount, Route
19
10
  from starlette.staticfiles import StaticFiles
20
11
 
21
- from deepresearch_flow.paper.render import load_default_template
22
- from deepresearch_flow.paper.template_registry import (
23
- list_template_names_in_registry_order,
24
- load_render_template,
25
- load_schema_for_template,
12
+ from deepresearch_flow.paper.db_ops import build_index, load_and_merge_papers
13
+ from deepresearch_flow.paper.web.constants import PDFJS_STATIC_DIR, STATIC_DIR
14
+ from deepresearch_flow.paper.web.handlers import (
15
+ api_papers,
16
+ api_pdf,
17
+ api_stats,
18
+ index_page,
19
+ paper_detail,
20
+ robots_txt,
21
+ stats_page,
26
22
  )
27
- from deepresearch_flow.paper.utils import stable_hash
28
- from deepresearch_flow.paper.web.query import Query, QueryTerm, parse_query
29
-
30
- try:
31
- from pybtex.database import parse_file
32
- PYBTEX_AVAILABLE = True
33
- except Exception:
34
- PYBTEX_AVAILABLE = False
35
-
36
- try:
37
- from pypdf import PdfReader
38
- PYPDF_AVAILABLE = True
39
- except Exception:
40
- PYPDF_AVAILABLE = False
41
-
42
-
43
- _CDN_ECHARTS = "https://cdn.jsdelivr.net/npm/echarts@5/dist/echarts.min.js"
44
- _CDN_MERMAID = "https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"
45
- _CDN_KATEX = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.css"
46
- _CDN_KATEX_JS = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.js"
47
- _CDN_KATEX_AUTO = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/contrib/auto-render.min.js"
48
- # Use legacy builds to ensure `pdfjsLib` is available as a global.
49
- _CDN_PDFJS = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.min.js"
50
- _CDN_PDFJS_WORKER = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.worker.min.js"
51
- _PDFJS_VIEWER_PATH = "/pdfjs/web/viewer.html"
52
- _PDFJS_STATIC_DIR = Path(__file__).resolve().parent / "pdfjs"
23
+ from deepresearch_flow.paper.web.markdown import create_md_renderer
53
24
 
54
25
  logger = logging.getLogger(__name__)
55
26
 
56
27
 
57
- @dataclass(frozen=True)
58
- class PaperIndex:
59
- papers: list[dict[str, Any]]
60
- id_by_hash: dict[str, int]
61
- ordered_ids: list[int]
62
- by_tag: dict[str, set[int]]
63
- by_author: dict[str, set[int]]
64
- by_year: dict[str, set[int]]
65
- by_month: dict[str, set[int]]
66
- by_venue: dict[str, set[int]]
67
- stats: dict[str, Any]
68
- md_path_by_hash: dict[str, Path]
69
- pdf_path_by_hash: dict[str, Path]
70
- template_tags: list[str]
71
-
72
-
73
- def _split_csv(values: list[str]) -> list[str]:
74
- out: list[str] = []
75
- for value in values:
76
- for part in value.split(","):
77
- part = part.strip()
78
- if part:
79
- out.append(part)
80
- return out
81
-
82
-
83
- def _normalize_key(value: str) -> str:
84
- return value.strip().lower()
85
-
86
-
87
- def _parse_year_month(date_str: str | None) -> tuple[str | None, str | None]:
88
- if not date_str:
89
- return None, None
90
- text = str(date_str).strip()
91
- year = None
92
- month = None
93
-
94
- year_match = re.search(r"(19|20)\d{2}", text)
95
- if year_match:
96
- year = year_match.group(0)
97
-
98
- numeric_match = re.search(r"(19|20)\d{2}[-/](\d{1,2})", text)
99
- if numeric_match:
100
- m = int(numeric_match.group(2))
101
- if 1 <= m <= 12:
102
- month = f"{m:02d}"
103
- return year, month
104
-
105
- month_word = re.search(
106
- r"(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec|"
107
- r"january|february|march|april|june|july|august|september|october|november|december)",
108
- text.lower(),
109
- )
110
- if month_word:
111
- lookup = {
112
- "january": "01",
113
- "february": "02",
114
- "march": "03",
115
- "april": "04",
116
- "may": "05",
117
- "june": "06",
118
- "july": "07",
119
- "august": "08",
120
- "september": "09",
121
- "october": "10",
122
- "november": "11",
123
- "december": "12",
124
- "jan": "01",
125
- "feb": "02",
126
- "mar": "03",
127
- "apr": "04",
128
- "jun": "06",
129
- "jul": "07",
130
- "aug": "08",
131
- "sep": "09",
132
- "sept": "09",
133
- "oct": "10",
134
- "nov": "11",
135
- "dec": "12",
136
- }
137
- month = lookup.get(month_word.group(0))
138
- return year, month
139
-
140
-
141
- def _normalize_month_token(value: str | int | None) -> str | None:
142
- if value is None:
143
- return None
144
- if isinstance(value, int):
145
- if 1 <= value <= 12:
146
- return f"{value:02d}"
147
- return None
148
- raw = str(value).strip().lower()
149
- if not raw:
150
- return None
151
- if raw.isdigit():
152
- return _normalize_month_token(int(raw))
153
- lookup = {
154
- "january": "01",
155
- "february": "02",
156
- "march": "03",
157
- "april": "04",
158
- "may": "05",
159
- "june": "06",
160
- "july": "07",
161
- "august": "08",
162
- "september": "09",
163
- "october": "10",
164
- "november": "11",
165
- "december": "12",
166
- "jan": "01",
167
- "feb": "02",
168
- "mar": "03",
169
- "apr": "04",
170
- "jun": "06",
171
- "jul": "07",
172
- "aug": "08",
173
- "sep": "09",
174
- "sept": "09",
175
- "oct": "10",
176
- "nov": "11",
177
- "dec": "12",
178
- }
179
- return lookup.get(raw)
180
-
181
-
182
- def _extract_authors(paper: dict[str, Any]) -> list[str]:
183
- value = paper.get("paper_authors")
184
- if value is None:
185
- return []
186
- if isinstance(value, list):
187
- return [str(item).strip() for item in value if str(item).strip()]
188
- if isinstance(value, str):
189
- return [part.strip() for part in value.split(",") if part.strip()]
190
- return [str(value)]
191
-
192
-
193
- def _extract_tags(paper: dict[str, Any]) -> list[str]:
194
- tags = paper.get("ai_generated_tags") or []
195
- if isinstance(tags, list):
196
- return [str(tag).strip() for tag in tags if str(tag).strip()]
197
- return []
198
-
199
-
200
- def _extract_keywords(paper: dict[str, Any]) -> list[str]:
201
- keywords = paper.get("keywords") or []
202
- if isinstance(keywords, list):
203
- return [str(keyword).strip() for keyword in keywords if str(keyword).strip()]
204
- if isinstance(keywords, str):
205
- parts = re.split(r"[;,]", keywords)
206
- return [part.strip() for part in parts if part.strip()]
207
- return []
208
-
209
-
210
- _SUMMARY_FIELDS = (
211
- "summary",
212
- "abstract",
213
- "keywords",
214
- "question1",
215
- "question2",
216
- "question3",
217
- "question4",
218
- "question5",
219
- "question6",
220
- "question7",
221
- "question8",
222
- )
223
-
224
-
225
- def _has_summary(paper: dict[str, Any], template_tags: list[str]) -> bool:
226
- if template_tags:
227
- return True
228
- for key in _SUMMARY_FIELDS:
229
- value = paper.get(key)
230
- if isinstance(value, str) and value.strip():
231
- return True
232
- return False
233
-
234
-
235
- def _extract_venue(paper: dict[str, Any]) -> str:
236
- if isinstance(paper.get("bibtex"), dict):
237
- bib = paper.get("bibtex") or {}
238
- fields = bib.get("fields") or {}
239
- bib_type = (bib.get("type") or "").lower()
240
- if bib_type == "article" and fields.get("journal"):
241
- return str(fields.get("journal"))
242
- if bib_type in {"inproceedings", "conference", "proceedings"} and fields.get("booktitle"):
243
- return str(fields.get("booktitle"))
244
- return str(paper.get("publication_venue") or "")
245
-
246
-
247
- def build_index(
248
- papers: list[dict[str, Any]],
249
- *,
250
- md_roots: list[Path] | None = None,
251
- pdf_roots: list[Path] | None = None,
252
- ) -> PaperIndex:
253
- id_by_hash: dict[str, int] = {}
254
- by_tag: dict[str, set[int]] = {}
255
- by_author: dict[str, set[int]] = {}
256
- by_year: dict[str, set[int]] = {}
257
- by_month: dict[str, set[int]] = {}
258
- by_venue: dict[str, set[int]] = {}
259
-
260
- md_path_by_hash: dict[str, Path] = {}
261
- pdf_path_by_hash: dict[str, Path] = {}
262
-
263
- md_file_index = _build_file_index(md_roots or [], suffixes={".md"})
264
- pdf_file_index = _build_file_index(pdf_roots or [], suffixes={".pdf"})
265
-
266
- year_counts: dict[str, int] = {}
267
- month_counts: dict[str, int] = {}
268
- tag_counts: dict[str, int] = {}
269
- keyword_counts: dict[str, int] = {}
270
- author_counts: dict[str, int] = {}
271
- venue_counts: dict[str, int] = {}
272
- template_tag_counts: dict[str, int] = {}
273
-
274
- def add_index(index: dict[str, set[int]], key: str, idx: int) -> None:
275
- index.setdefault(key, set()).add(idx)
276
-
277
- for idx, paper in enumerate(papers):
278
- is_pdf_only = bool(paper.get("_is_pdf_only"))
279
- source_hash = paper.get("source_hash")
280
- if not source_hash and paper.get("source_path"):
281
- source_hash = stable_hash(str(paper.get("source_path")))
282
- if source_hash:
283
- id_by_hash[str(source_hash)] = idx
284
-
285
- title = str(paper.get("paper_title") or "")
286
- paper["_title_lc"] = title.lower()
287
-
288
- bib_fields: dict[str, Any] = {}
289
- if isinstance(paper.get("bibtex"), dict):
290
- bib_fields = paper.get("bibtex", {}).get("fields", {}) or {}
291
-
292
- year = None
293
- if bib_fields.get("year") and str(bib_fields.get("year")).isdigit():
294
- year = str(bib_fields.get("year"))
295
- month = _normalize_month_token(bib_fields.get("month"))
296
- if not year or not month:
297
- parsed_year, parsed_month = _parse_year_month(str(paper.get("publication_date") or ""))
298
- year = year or parsed_year
299
- month = month or parsed_month
300
-
301
- year_label = year or "Unknown"
302
- month_label = month or "Unknown"
303
- paper["_year"] = year_label
304
- paper["_month"] = month_label
305
- add_index(by_year, _normalize_key(year_label), idx)
306
- add_index(by_month, _normalize_key(month_label), idx)
307
- if not is_pdf_only:
308
- year_counts[year_label] = year_counts.get(year_label, 0) + 1
309
- month_counts[month_label] = month_counts.get(month_label, 0) + 1
310
-
311
- venue = _extract_venue(paper).strip()
312
- paper["_venue"] = venue
313
- if venue:
314
- add_index(by_venue, _normalize_key(venue), idx)
315
- if not is_pdf_only:
316
- venue_counts[venue] = venue_counts.get(venue, 0) + 1
317
- else:
318
- add_index(by_venue, "unknown", idx)
319
- if not is_pdf_only:
320
- venue_counts["Unknown"] = venue_counts.get("Unknown", 0) + 1
321
-
322
- authors = _extract_authors(paper)
323
- paper["_authors"] = authors
324
- for author in authors:
325
- key = _normalize_key(author)
326
- add_index(by_author, key, idx)
327
- if not is_pdf_only:
328
- author_counts[author] = author_counts.get(author, 0) + 1
329
-
330
- tags = _extract_tags(paper)
331
- paper["_tags"] = tags
332
- for tag in tags:
333
- key = _normalize_key(tag)
334
- add_index(by_tag, key, idx)
335
- if not is_pdf_only:
336
- tag_counts[tag] = tag_counts.get(tag, 0) + 1
337
-
338
- keywords = _extract_keywords(paper)
339
- paper["_keywords"] = keywords
340
- for keyword in keywords:
341
- if not is_pdf_only:
342
- keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1
343
-
344
- template_tags = _available_templates(paper)
345
- if not template_tags:
346
- fallback_tag = paper.get("template_tag") or paper.get("prompt_template")
347
- if fallback_tag:
348
- template_tags = [str(fallback_tag)]
349
- paper["_template_tags"] = template_tags
350
- paper["_template_tags_lc"] = [tag.lower() for tag in template_tags]
351
- paper["_has_summary"] = _has_summary(paper, template_tags)
352
- if not is_pdf_only:
353
- for tag in template_tags:
354
- template_tag_counts[tag] = template_tag_counts.get(tag, 0) + 1
355
-
356
- search_parts = [title, venue, " ".join(authors), " ".join(tags)]
357
- paper["_search_lc"] = " ".join(part for part in search_parts if part).lower()
358
-
359
- source_hash_str = str(source_hash) if source_hash else str(idx)
360
- md_path = _resolve_source_md(paper, md_file_index)
361
- if md_path is not None:
362
- md_path_by_hash[source_hash_str] = md_path
363
- pdf_path = _resolve_pdf(paper, pdf_file_index)
364
- if pdf_path is not None:
365
- pdf_path_by_hash[source_hash_str] = pdf_path
366
-
367
- def year_sort_key(item: tuple[int, dict[str, Any]]) -> tuple[int, int, str]:
368
- idx, paper = item
369
- year_label = str(paper.get("_year") or "Unknown")
370
- title_label = str(paper.get("paper_title") or "")
371
- if year_label.isdigit():
372
- return (0, -int(year_label), title_label.lower())
373
- return (1, 0, title_label.lower())
374
-
375
- ordered_ids = [idx for idx, _ in sorted(enumerate(papers), key=year_sort_key)]
376
-
377
- stats_total = sum(1 for paper in papers if not paper.get("_is_pdf_only"))
378
- stats = {
379
- "total": stats_total,
380
- "years": _sorted_counts(year_counts, numeric_desc=True),
381
- "months": _sorted_month_counts(month_counts),
382
- "tags": _sorted_counts(tag_counts),
383
- "keywords": _sorted_counts(keyword_counts),
384
- "authors": _sorted_counts(author_counts),
385
- "venues": _sorted_counts(venue_counts),
386
- }
387
-
388
- template_tags = sorted(template_tag_counts.keys(), key=lambda item: item.lower())
389
-
390
- return PaperIndex(
391
- papers=papers,
392
- id_by_hash=id_by_hash,
393
- ordered_ids=ordered_ids,
394
- by_tag=by_tag,
395
- by_author=by_author,
396
- by_year=by_year,
397
- by_month=by_month,
398
- by_venue=by_venue,
399
- stats=stats,
400
- md_path_by_hash=md_path_by_hash,
401
- pdf_path_by_hash=pdf_path_by_hash,
402
- template_tags=template_tags,
403
- )
404
-
405
-
406
- def _sorted_counts(counts: dict[str, int], *, numeric_desc: bool = False) -> list[dict[str, Any]]:
407
- items = list(counts.items())
408
- if numeric_desc:
409
- def key(item: tuple[str, int]) -> tuple[int, int]:
410
- label, count = item
411
- if label.isdigit():
412
- return (0, -int(label))
413
- return (1, 0)
414
- items.sort(key=key)
415
- else:
416
- items.sort(key=lambda item: item[1], reverse=True)
417
- return [{"label": k, "count": v} for k, v in items]
418
-
419
-
420
- def _sorted_month_counts(counts: dict[str, int]) -> list[dict[str, Any]]:
421
- def month_sort(label: str) -> int:
422
- if label == "Unknown":
423
- return 99
424
- if label.isdigit():
425
- return int(label)
426
- return 98
427
-
428
- items = sorted(counts.items(), key=lambda item: month_sort(item[0]))
429
- return [{"label": k, "count": v} for k, v in items]
430
-
431
-
432
- _TEMPLATE_INFER_IGNORE_KEYS = {
433
- "source_path",
434
- "source_hash",
435
- "provider",
436
- "model",
437
- "extracted_at",
438
- "truncation",
439
- "output_language",
440
- "prompt_template",
441
- }
442
-
443
-
444
- def _load_paper_inputs(paths: list[Path]) -> list[dict[str, Any]]:
445
- inputs: list[dict[str, Any]] = []
446
- for path in paths:
447
- payload = json.loads(path.read_text(encoding="utf-8"))
448
- if isinstance(payload, list):
449
- raise ValueError(
450
- f"Input JSON must be an object with template_tag and papers (got array): {path}"
451
- )
452
- if not isinstance(payload, dict):
453
- raise ValueError(f"Input JSON must be an object: {path}")
454
- papers = payload.get("papers")
455
- if not isinstance(papers, list):
456
- raise ValueError(f"Input JSON missing papers list: {path}")
457
- template_tag = payload.get("template_tag")
458
- if not template_tag:
459
- template_tag = _infer_template_tag(papers, path)
460
- inputs.append({"template_tag": str(template_tag), "papers": papers})
461
- return inputs
462
-
463
-
464
- def _infer_template_tag(papers: list[dict[str, Any]], path: Path) -> str:
465
- prompt_tags = {
466
- str(paper.get("prompt_template"))
467
- for paper in papers
468
- if isinstance(paper, dict) and paper.get("prompt_template")
469
- }
470
- if len(prompt_tags) == 1:
471
- return prompt_tags.pop()
472
-
473
- sample = next((paper for paper in papers if isinstance(paper, dict)), None)
474
- if sample is None:
475
- raise ValueError(f"Input JSON has no paper objects to infer template_tag: {path}")
476
-
477
- paper_keys = {key for key in sample.keys() if key not in _TEMPLATE_INFER_IGNORE_KEYS}
478
- if not paper_keys:
479
- raise ValueError(f"Input JSON papers have no keys to infer template_tag: {path}")
480
-
481
- best_tag = None
482
- best_score = -1
483
- for name in list_template_names_in_registry_order():
484
- schema = load_schema_for_template(name)
485
- schema_keys = set((schema.get("properties") or {}).keys())
486
- score = len(paper_keys & schema_keys)
487
- if score > best_score:
488
- best_score = score
489
- best_tag = name
490
- elif score == best_score:
491
- if best_tag != "simple" and name == "simple":
492
- best_tag = name
493
-
494
- if not best_tag:
495
- raise ValueError(f"Unable to infer template_tag from input JSON: {path}")
496
- return best_tag
497
-
498
-
499
- def _build_cache_meta(
500
- db_paths: list[Path],
501
- bibtex_path: Path | None,
502
- pdf_roots_meta: list[dict[str, Any]] | None = None,
503
- ) -> dict[str, Any]:
504
- def file_meta(path: Path) -> dict[str, Any]:
505
- try:
506
- stats = path.stat()
507
- except OSError as exc:
508
- raise ValueError(f"Failed to read input metadata for cache: {path}") from exc
509
- return {"path": str(path), "mtime": stats.st_mtime, "size": stats.st_size}
510
-
511
- meta = {
512
- "version": 1,
513
- "inputs": [file_meta(path) for path in db_paths],
514
- "bibtex": file_meta(bibtex_path) if bibtex_path else None,
515
- }
516
- if pdf_roots_meta is not None:
517
- meta["pdf_roots"] = pdf_roots_meta
518
- return meta
519
-
520
-
521
- def _load_cached_papers(cache_dir: Path, meta: dict[str, Any]) -> list[dict[str, Any]] | None:
522
- meta_path = cache_dir / "db_serve_cache.meta.json"
523
- data_path = cache_dir / "db_serve_cache.papers.json"
524
- if not meta_path.exists() or not data_path.exists():
525
- return None
526
- try:
527
- cached_meta = json.loads(meta_path.read_text(encoding="utf-8"))
528
- if cached_meta != meta:
529
- return None
530
- cached_papers = json.loads(data_path.read_text(encoding="utf-8"))
531
- if not isinstance(cached_papers, list):
532
- return None
533
- return cached_papers
534
- except Exception:
535
- return None
536
-
537
-
538
- def _write_cached_papers(cache_dir: Path, meta: dict[str, Any], papers: list[dict[str, Any]]) -> None:
539
- meta_path = cache_dir / "db_serve_cache.meta.json"
540
- data_path = cache_dir / "db_serve_cache.papers.json"
541
- meta_path.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")
542
- data_path.write_text(json.dumps(papers, ensure_ascii=False, indent=2), encoding="utf-8")
543
-
544
-
545
- def _extract_year_for_matching(paper: dict[str, Any]) -> str | None:
546
- if isinstance(paper.get("bibtex"), dict):
547
- fields = paper.get("bibtex", {}).get("fields", {}) or {}
548
- year = fields.get("year")
549
- if year and str(year).isdigit():
550
- return str(year)
551
- parsed_year, _ = _parse_year_month(str(paper.get("publication_date") or ""))
552
- return parsed_year
553
-
554
-
555
- def _prepare_paper_matching_fields(paper: dict[str, Any]) -> None:
556
- if "_authors" not in paper:
557
- paper["_authors"] = _extract_authors(paper)
558
- if "_year" not in paper:
559
- paper["_year"] = _extract_year_for_matching(paper) or ""
560
-
561
-
562
- def _build_pdf_only_entries(
563
- papers: list[dict[str, Any]],
564
- pdf_paths: list[Path],
565
- pdf_index: dict[str, list[Path]],
566
- ) -> list[dict[str, Any]]:
567
- matched: set[Path] = set()
568
- for paper in papers:
569
- _prepare_paper_matching_fields(paper)
570
- pdf_path = _resolve_pdf(paper, pdf_index)
571
- if pdf_path:
572
- matched.add(pdf_path.resolve())
573
-
574
- entries: list[dict[str, Any]] = []
575
- for path in pdf_paths:
576
- resolved = path.resolve()
577
- if resolved in matched:
578
- continue
579
- title = _read_pdf_metadata_title(resolved) or _extract_title_from_filename(resolved.name)
580
- if not title:
581
- title = resolved.stem
582
- year_hint, author_hint = _extract_year_author_from_filename(resolved.name)
583
- entry: dict[str, Any] = {
584
- "paper_title": title,
585
- "paper_authors": [author_hint] if author_hint else [],
586
- "publication_date": year_hint or "",
587
- "source_hash": stable_hash(str(resolved)),
588
- "source_path": str(resolved),
589
- "_is_pdf_only": True,
590
- }
591
- entries.append(entry)
592
- return entries
593
-
594
-
595
- def _load_or_merge_papers(
596
- db_paths: list[Path],
597
- bibtex_path: Path | None,
598
- cache_dir: Path | None,
599
- use_cache: bool,
600
- pdf_roots: list[Path] | None = None,
601
- ) -> list[dict[str, Any]]:
602
- cache_meta = None
603
- pdf_roots = pdf_roots or []
604
- pdf_paths: list[Path] = []
605
- pdf_roots_meta: list[dict[str, Any]] | None = None
606
- if pdf_roots:
607
- pdf_paths, pdf_roots_meta = _scan_pdf_roots(pdf_roots)
608
- if cache_dir and use_cache:
609
- cache_dir.mkdir(parents=True, exist_ok=True)
610
- cache_meta = _build_cache_meta(db_paths, bibtex_path, pdf_roots_meta)
611
- cached = _load_cached_papers(cache_dir, cache_meta)
612
- if cached is not None:
613
- return cached
614
-
615
- inputs = _load_paper_inputs(db_paths)
616
- if bibtex_path is not None:
617
- for bundle in inputs:
618
- enrich_with_bibtex(bundle["papers"], bibtex_path)
619
- papers = _merge_paper_inputs(inputs)
620
- if pdf_paths:
621
- pdf_index = _build_file_index_from_paths(pdf_paths, suffixes={".pdf"})
622
- papers.extend(_build_pdf_only_entries(papers, pdf_paths, pdf_index))
623
-
624
- if cache_dir and use_cache and cache_meta is not None:
625
- _write_cached_papers(cache_dir, cache_meta, papers)
626
- return papers
627
-
628
-
629
- def _md_renderer() -> MarkdownIt:
630
- md = MarkdownIt("commonmark", {"html": False, "linkify": True})
631
- md.enable("table")
632
- return md
633
-
634
-
635
- def _strip_paragraph_wrapped_tables(text: str) -> str:
636
- lines = text.splitlines()
637
- for idx, line in enumerate(lines):
638
- line = re.sub(r"^\s*<p>\s*\|", "|", line)
639
- line = re.sub(r"\|\s*</p>\s*$", "|", line)
640
- lines[idx] = line
641
- return "\n".join(lines)
642
-
643
-
644
- def _normalize_merge_title(value: str | None) -> str | None:
645
- if not value:
646
- return None
647
- return str(value).replace("{", "").replace("}", "").strip().lower()
648
-
649
-
650
- def _extract_bibtex_title(paper: dict[str, Any]) -> str | None:
651
- if not isinstance(paper.get("bibtex"), dict):
652
- return None
653
- fields = paper.get("bibtex", {}).get("fields", {}) or {}
654
- return _normalize_merge_title(fields.get("title"))
655
-
656
-
657
- def _extract_paper_title(paper: dict[str, Any]) -> str | None:
658
- return _normalize_merge_title(paper.get("paper_title"))
659
-
660
-
661
- def _available_templates(paper: dict[str, Any]) -> list[str]:
662
- templates = paper.get("templates")
663
- if not isinstance(templates, dict):
664
- return []
665
- order = paper.get("template_order") or list(templates.keys())
666
- seen: set[str] = set()
667
- available: list[str] = []
668
- for tag in order:
669
- if tag in templates and tag not in seen:
670
- available.append(tag)
671
- seen.add(tag)
672
- for tag in templates:
673
- if tag not in seen:
674
- available.append(tag)
675
- seen.add(tag)
676
- return available
677
-
678
-
679
- def _select_template_tag(
680
- paper: dict[str, Any], requested: str | None
681
- ) -> tuple[str | None, list[str]]:
682
- available = _available_templates(paper)
683
- if not available:
684
- return None, []
685
- default_tag = paper.get("default_template")
686
- if not default_tag:
687
- default_tag = "simple" if "simple" in available else available[0]
688
- selected = requested if requested in available else default_tag
689
- return selected, available
690
-
691
-
692
- def _titles_match(group: dict[str, Any], paper: dict[str, Any], *, threshold: float) -> bool:
693
- bib_title = _extract_bibtex_title(paper)
694
- group_bib = group.get("_merge_bibtex_titles") or set()
695
- if bib_title and group_bib:
696
- return any(_title_similarity(bib_title, existing) >= threshold for existing in group_bib)
697
-
698
- paper_title = _extract_paper_title(paper)
699
- group_titles = group.get("_merge_paper_titles") or set()
700
- if paper_title and group_titles:
701
- return any(_title_similarity(paper_title, existing) >= threshold for existing in group_titles)
702
- return False
703
-
704
-
705
- def _add_merge_titles(group: dict[str, Any], paper: dict[str, Any]) -> None:
706
- bib_title = _extract_bibtex_title(paper)
707
- if bib_title:
708
- group.setdefault("_merge_bibtex_titles", set()).add(bib_title)
709
- paper_title = _extract_paper_title(paper)
710
- if paper_title:
711
- group.setdefault("_merge_paper_titles", set()).add(paper_title)
712
-
713
-
714
- def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
715
- merged: list[dict[str, Any]] = []
716
- threshold = 0.95
717
- prefix_len = 5
718
- bibtex_exact: dict[str, set[int]] = {}
719
- bibtex_prefix: dict[str, set[int]] = {}
720
- paper_exact: dict[str, set[int]] = {}
721
- paper_prefix: dict[str, set[int]] = {}
722
-
723
- def prefix_key(value: str) -> str:
724
- return value[:prefix_len] if len(value) >= prefix_len else value
725
-
726
- def add_index(
727
- value: str,
728
- exact_index: dict[str, set[int]],
729
- prefix_index: dict[str, set[int]],
730
- idx: int,
731
- ) -> None:
732
- exact_index.setdefault(value, set()).add(idx)
733
- prefix_index.setdefault(prefix_key(value), set()).add(idx)
734
-
735
- def candidate_ids(bib_title: str | None, paper_title: str | None) -> list[int]:
736
- ids: set[int] = set()
737
- if bib_title:
738
- ids |= bibtex_exact.get(bib_title, set())
739
- ids |= bibtex_prefix.get(prefix_key(bib_title), set())
740
- if paper_title:
741
- ids |= paper_exact.get(paper_title, set())
742
- ids |= paper_prefix.get(prefix_key(paper_title), set())
743
- return sorted(ids)
744
-
745
- for bundle in inputs:
746
- template_tag = bundle.get("template_tag")
747
- papers = bundle.get("papers") or []
748
- for paper in papers:
749
- if not isinstance(paper, dict):
750
- raise ValueError("Input papers must be objects")
751
- bib_title = _extract_bibtex_title(paper)
752
- paper_title = _extract_paper_title(paper)
753
- match = None
754
- match_idx = None
755
- for idx in candidate_ids(bib_title, paper_title):
756
- candidate = merged[idx]
757
- if _titles_match(candidate, paper, threshold=threshold):
758
- match = candidate
759
- match_idx = idx
760
- break
761
- if match is None:
762
- group = {
763
- "templates": {template_tag: paper},
764
- "template_order": [template_tag],
765
- }
766
- _add_merge_titles(group, paper)
767
- merged.append(group)
768
- group_idx = len(merged) - 1
769
- if bib_title:
770
- add_index(bib_title, bibtex_exact, bibtex_prefix, group_idx)
771
- if paper_title:
772
- add_index(paper_title, paper_exact, paper_prefix, group_idx)
773
- else:
774
- templates = match.setdefault("templates", {})
775
- templates[template_tag] = paper
776
- order = match.setdefault("template_order", [])
777
- if template_tag not in order:
778
- order.append(template_tag)
779
- _add_merge_titles(match, paper)
780
- if match_idx is not None:
781
- if bib_title:
782
- add_index(bib_title, bibtex_exact, bibtex_prefix, match_idx)
783
- if paper_title:
784
- add_index(paper_title, paper_exact, paper_prefix, match_idx)
785
-
786
- for group in merged:
787
- templates = group.get("templates") or {}
788
- order = group.get("template_order") or list(templates.keys())
789
- default_tag = "simple" if "simple" in order else (order[0] if order else None)
790
- group["default_template"] = default_tag
791
- if default_tag and default_tag in templates:
792
- base = templates[default_tag]
793
- for key, value in base.items():
794
- group[key] = value
795
- group.pop("_merge_bibtex_titles", None)
796
- group.pop("_merge_paper_titles", None)
797
- return merged
798
-
799
-
800
- def _render_markdown_with_math_placeholders(md: MarkdownIt, text: str) -> str:
801
- text = _strip_paragraph_wrapped_tables(text)
802
- rendered, table_placeholders = _extract_html_table_placeholders(text)
803
- rendered, img_placeholders = _extract_html_img_placeholders(rendered)
804
- rendered, placeholders = _extract_math_placeholders(rendered)
805
- html_out = md.render(rendered)
806
- for key, value in placeholders.items():
807
- html_out = html_out.replace(key, html.escape(value))
808
- for key, value in img_placeholders.items():
809
- html_out = re.sub(rf"<p>\s*{re.escape(key)}\s*</p>", lambda _: value, html_out)
810
- html_out = html_out.replace(key, value)
811
- for key, value in table_placeholders.items():
812
- safe_html = _sanitize_table_html(value)
813
- html_out = re.sub(rf"<p>\s*{re.escape(key)}\s*</p>", lambda _: safe_html, html_out)
814
- return html_out
815
-
816
-
817
def _extract_math_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Replace $...$ / $$...$$ math spans with @@MATH_n@@ placeholder tokens.

    Returns (text with placeholders substituted, mapping of token -> original
    math source). Math inside fenced code blocks or inline backtick spans is
    left untouched; a ``\\$`` escape before a delimiter prevents matching.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    # Fence tracking: inside ``` / ~~~ blocks nothing is extracted.
    in_fence = False
    fence_char = ""
    fence_len = 0
    # Non-zero while inside an inline backtick span of that delimiter length.
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        key = f"@@MATH_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        # At a line start (and not inside an inline code span) check whether
        # the line opens or closes a code fence (<=3 leading spaces allowed).
        if inline_delim_len == 0 and at_line_start:
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            # Closing fence must match char and be at least as long.
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        out.append(line)
                        idx = line_end
                        continue

        if in_fence:
            out.append(text[idx])
            idx += 1
            continue

        if inline_delim_len > 0:
            # Copy characters until the matching backtick run closes the span.
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        ch = text[idx]
        if ch == "`":
            # Opening backtick run starts an inline code span.
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        # Block math: $$...$$ (can span lines)
        if text.startswith("$$", idx) and (idx == 0 or text[idx - 1] != "\\"):
            search_from = idx + 2
            end = text.find("$$", search_from)
            while end != -1 and text[end - 1] == "\\":
                # Skip escaped \$$ terminators.
                search_from = end + 2
                end = text.find("$$", search_from)
            if end != -1:
                out.append(next_placeholder(text[idx : end + 2]))
                idx = end + 2
                continue

        # Inline math: $...$ (single-line)
        if ch == "$" and not text.startswith("$$", idx) and (idx == 0 or text[idx - 1] != "\\"):
            search_from = idx + 1
            end = text.find("$", search_from)
            while end != -1 and text[end - 1] == "\\":
                search_from = end + 1
                end = text.find("$", search_from)
            if end != -1:
                out.append(next_placeholder(text[idx : end + 1]))
                idx = end + 1
                continue

        out.append(ch)
        idx += 1

    return "".join(out), placeholders
914
-
915
-
916
class _TableSanitizer(HTMLParser):
    """Whitelisting HTML sanitizer for embedded <table> markup.

    Only table-structure tags are emitted; all other tags are dropped while
    their text content is kept (re-escaped). Cell attributes are limited to
    numeric colspan/rowspan and a left/right/center align.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._out: list[str] = []    # sanitized HTML fragments, joined by get_html()
        self._stack: list[str] = []  # currently open allowed tags

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        t = tag.lower()
        # Anything outside the table vocabulary is silently dropped.
        if t not in {
            "table",
            "thead",
            "tbody",
            "tfoot",
            "tr",
            "th",
            "td",
            "caption",
            "colgroup",
            "col",
            "br",
        }:
            return

        # Keep only a safe attribute subset, and only on cells.
        allowed: dict[str, str] = {}
        for name, value in attrs:
            if value is None:
                continue
            n = name.lower()
            v = value.strip()
            if t in {"td", "th"} and n in {"colspan", "rowspan"} and v.isdigit():
                allowed[n] = v
            elif t in {"td", "th"} and n == "align" and v.lower() in {"left", "right", "center"}:
                allowed[n] = v.lower()

        attr_text = "".join(f' {k}="{html.escape(v, quote=True)}"' for k, v in allowed.items())
        self._out.append(f"<{t}{attr_text}>")
        # Void elements are never pushed, so they are never force-closed.
        if t not in {"br", "col"}:
            self._stack.append(t)

    def handle_endtag(self, tag: str) -> None:
        t = tag.lower()
        if t not in self._stack:
            return
        # Close any intermediate unclosed tags up to and including t.
        while self._stack:
            popped = self._stack.pop()
            self._out.append(f"</{popped}>")
            if popped == t:
                break

    def handle_data(self, data: str) -> None:
        # Re-escape all text content.
        self._out.append(html.escape(data))

    def handle_entityref(self, name: str) -> None:
        self._out.append(f"&{name};")

    def handle_charref(self, name: str) -> None:
        self._out.append(f"&#{name};")

    def close(self) -> None:
        super().close()
        # Force-close anything left open so the fragment is balanced.
        while self._stack:
            self._out.append(f"</{self._stack.pop()}>")

    def get_html(self) -> str:
        """Return the accumulated sanitized HTML fragment."""
        return "".join(self._out)
981
-
982
-
983
def _sanitize_table_html(raw: str) -> str:
    """Sanitize raw <table> HTML, falling back to an escaped code block on error."""
    sanitizer = _TableSanitizer()
    try:
        sanitizer.feed(raw)
        sanitizer.close()
    except Exception:
        # Malformed markup: show it verbatim instead of rendering it.
        return f"<pre><code>{html.escape(raw)}</code></pre>"
    return sanitizer.get_html()
991
-
992
-
993
- def _sanitize_img_html(raw: str) -> str | None:
994
- attrs = {}
995
- for match in re.finditer(r"(\w+)\s*=\s*(\"[^\"]*\"|'[^']*'|[^\s>]+)", raw):
996
- name = match.group(1).lower()
997
- value = match.group(2).strip()
998
- if value and value[0] in {"\"", "'"} and value[-1] == value[0]:
999
- value = value[1:-1]
1000
- attrs[name] = value
1001
-
1002
- src = attrs.get("src", "")
1003
- src_lower = src.lower()
1004
- if not src_lower.startswith("data:image/") or ";base64," not in src_lower:
1005
- return None
1006
-
1007
- alt = attrs.get("alt", "")
1008
- alt_attr = f' alt="{html.escape(alt, quote=True)}"' if alt else ""
1009
- return f'<img src="{html.escape(src, quote=True)}"{alt_attr} />'
1010
-
1011
-
1012
def _extract_html_img_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Replace sanitizable raw <img> tags with @@HTML_IMG_n@@ placeholder tokens.

    Returns (text with placeholders substituted, mapping of token -> sanitized
    <img> HTML). Tags inside fenced code blocks or inline backtick spans are
    left untouched; tags _sanitize_img_html rejects stay as-is.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    # Fence / inline-code tracking mirrors _extract_math_placeholders.
    in_fence = False
    fence_char = ""
    fence_len = 0
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        key = f"@@HTML_IMG_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    lower = text.lower()
    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        # Detect code-fence open/close lines (<=3 leading spaces allowed).
        if inline_delim_len == 0 and at_line_start:
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        out.append(line)
                        idx = line_end
                        continue

        if in_fence:
            out.append(text[idx])
            idx += 1
            continue

        if inline_delim_len > 0:
            # Copy verbatim until the matching backtick run closes the span.
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        if text[idx] == "`":
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        if lower.startswith("<img", idx):
            end = text.find(">", idx)
            if end != -1:
                raw = text[idx : end + 1]
                safe_html = _sanitize_img_html(raw)
                if safe_html:
                    out.append(next_placeholder(safe_html))
                    idx = end + 1
                    continue

        out.append(text[idx])
        idx += 1

    return "".join(out), placeholders
1095
-
1096
-
1097
def _extract_html_table_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Replace raw <table>...</table> spans with @@HTML_TABLE_n@@ placeholders.

    Returns (text with placeholders substituted, mapping of token -> raw table
    HTML). Tables inside fenced code blocks or inline backtick spans are left
    untouched. Placeholders are emitted on their own paragraph (surrounded by
    blank lines) so the markdown renderer treats them as block-level.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    # Fence / inline-code tracking mirrors _extract_math_placeholders.
    in_fence = False
    fence_char = ""
    fence_len = 0
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        key = f"@@HTML_TABLE_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    lower = text.lower()
    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        # Detect code-fence open/close lines (<=3 leading spaces allowed).
        if inline_delim_len == 0 and at_line_start:
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        out.append(line)
                        idx = line_end
                        continue

        if in_fence:
            out.append(text[idx])
            idx += 1
            continue

        if inline_delim_len > 0:
            # Copy verbatim until the matching backtick run closes the span.
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        if text[idx] == "`":
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        if lower.startswith("<table", idx):
            end = lower.find("</table>", idx)
            if end != -1:
                end += len("</table>")
                raw = text[idx:end]
                key = next_placeholder(raw)
                # Ensure the placeholder sits in its own paragraph.
                if out and not out[-1].endswith("\n"):
                    out.append("\n\n")
                out.append(key)
                out.append("\n\n")
                idx = end
                continue

        out.append(text[idx])
        idx += 1

    return "".join(out), placeholders
1183
-
1184
-
1185
def _render_paper_markdown(
    paper: dict[str, Any],
    fallback_language: str,
    *,
    template_tag: str | None = None,
) -> tuple[str, str, str | None]:
    """Render a paper dict to markdown through its selected template.

    Returns (rendered markdown, template name actually used, warning or None).
    Falls back to the default template — with a warning — when no template is
    specified or the named one cannot be loaded.
    """
    selected_tag, _ = _select_template_tag(paper, template_tag)
    selected_paper = paper
    if selected_tag:
        # Merged entries keep one payload per template tag under "templates".
        selected_paper = (paper.get("templates") or {}).get(selected_tag, paper)

    template_name = selected_tag or selected_paper.get("prompt_template")
    warning = None
    if template_name:
        try:
            template = load_render_template(str(template_name))
        except Exception:
            template = load_default_template()
            warning = "Rendered using default template (missing template)."
            template_name = "default_paper"
    else:
        template = load_default_template()
        warning = "Rendered using default template (no template specified)."
        template_name = "default_paper"

    context = dict(selected_paper)
    if not context.get("output_language"):
        context["output_language"] = fallback_language
    return template.render(**context), str(template_name), warning
1214
-
1215
-
1216
# Tuning knobs for the title/file matching helpers below.
_TITLE_PREFIX_LEN = 16  # length of the compacted-title prefix used as an index key
_TITLE_MIN_CHARS = 24  # shorter titles are not trusted for containment matches
_TITLE_MIN_TOKENS = 4  # minimum token count for prefix/containment matching
_AUTHOR_YEAR_MIN_SIMILARITY = 0.8  # NOTE(review): not referenced in this chunk
_LEADING_NUMERIC_MAX_LEN = 2  # numeric lead tokens up to this length get stripped
_SIMILARITY_START = 0.95  # starting threshold for adaptive similarity matching
_SIMILARITY_STEP = 0.05  # threshold decrement per relaxation round
_SIMILARITY_MAX_STEPS = 10  # cap on relaxation/bisection iterations
1224
-
1225
-
1226
- def _normalize_title_key(title: str) -> str:
1227
- value = unicodedata.normalize("NFKD", title)
1228
- greek_map = {
1229
- "α": "alpha",
1230
- "β": "beta",
1231
- "γ": "gamma",
1232
- "δ": "delta",
1233
- "ε": "epsilon",
1234
- "ζ": "zeta",
1235
- "η": "eta",
1236
- "θ": "theta",
1237
- "ι": "iota",
1238
- "κ": "kappa",
1239
- "λ": "lambda",
1240
- "μ": "mu",
1241
- "ν": "nu",
1242
- "ξ": "xi",
1243
- "ο": "omicron",
1244
- "π": "pi",
1245
- "ρ": "rho",
1246
- "σ": "sigma",
1247
- "τ": "tau",
1248
- "υ": "upsilon",
1249
- "φ": "phi",
1250
- "χ": "chi",
1251
- "ψ": "psi",
1252
- "ω": "omega",
1253
- }
1254
- for char, name in greek_map.items():
1255
- value = value.replace(char, f" {name} ")
1256
- value = re.sub(
1257
- r"\\(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\b",
1258
- r" \1 ",
1259
- value,
1260
- flags=re.IGNORECASE,
1261
- )
1262
- value = value.replace("{", "").replace("}", "")
1263
- value = value.replace("_", " ")
1264
- value = re.sub(r"([a-z])([0-9])", r"\1 \2", value, flags=re.IGNORECASE)
1265
- value = re.sub(r"([0-9])([a-z])", r"\1 \2", value, flags=re.IGNORECASE)
1266
- value = re.sub(r"[^a-z0-9]+", " ", value.lower())
1267
- value = re.sub(r"\s+", " ", value).strip()
1268
- tokens = value.split()
1269
- if not tokens:
1270
- return ""
1271
- merged: list[str] = []
1272
- idx = 0
1273
- while idx < len(tokens):
1274
- token = tokens[idx]
1275
- if len(token) == 1 and idx + 1 < len(tokens):
1276
- merged.append(token + tokens[idx + 1])
1277
- idx += 2
1278
- continue
1279
- merged.append(token)
1280
- idx += 1
1281
- return " ".join(merged)
1282
-
1283
-
1284
- def _compact_title_key(title_key: str) -> str:
1285
- return title_key.replace(" ", "")
1286
-
1287
-
1288
def _strip_leading_numeric_tokens(title_key: str) -> str:
    """Drop short numeric tokens (e.g. section numbers) from a title key's start."""
    tokens = title_key.split()
    skip = 0
    for token in tokens:
        if not (token.isdigit() and len(token) <= _LEADING_NUMERIC_MAX_LEN):
            break
        skip += 1
    # Unchanged input is returned as-is (preserves original spacing).
    return title_key if skip == 0 else " ".join(tokens[skip:])
1300
-
1301
-
1302
- def _strip_pdf_hash_suffix(name: str) -> str:
1303
- return re.sub(r"(?i)(\.pdf)(?:-[0-9a-f\-]{8,})$", r"\1", name)
1304
-
1305
-
1306
def _extract_title_from_filename(name: str) -> str:
    """Best-effort recovery of a paper title from a markdown/PDF file name."""
    base = name
    if base.lower().endswith(".md"):
        base = base[:-3]
    if ".pdf-" in base.lower():
        base = _strip_pdf_hash_suffix(base)
    if base.lower().endswith(".pdf"):
        base = base[:-4]
    base = base.replace("_", " ").strip()
    # "2021 - Title" first, then "Author - 2021 - Title".
    for pattern in (r"\s*\d{4}\s*-\s*(.+)$", r"\s*.+?\s*-\s*\d{4}\s*-\s*(.+)$"):
        found = re.match(pattern, base)
        if found:
            return found.group(1).strip()
    return base.strip()
1325
-
1326
-
1327
- def _clean_pdf_metadata_title(value: str | None, path: Path) -> str | None:
1328
- if not value:
1329
- return None
1330
- text = str(value).replace("\x00", "").strip()
1331
- if not text:
1332
- return None
1333
- text = re.sub(r"(?i)^microsoft\\s+word\\s*-\\s*", "", text)
1334
- text = re.sub(r"(?i)^pdf\\s*-\\s*", "", text)
1335
- text = re.sub(r"(?i)^untitled\\b", "", text).strip()
1336
- if text.lower().endswith(".pdf"):
1337
- text = text[:-4].strip()
1338
- if len(text) < 3:
1339
- return None
1340
- stem = path.stem.strip()
1341
- if stem and text.lower() == stem.lower():
1342
- return None
1343
- return text
1344
-
1345
-
1346
def _read_pdf_metadata_title(path: Path) -> str | None:
    """Return a cleaned title from the PDF's metadata, or None on any failure."""
    if not PYPDF_AVAILABLE:
        return None
    try:
        document = PdfReader(str(path))
        metadata = document.metadata
        raw_title = metadata.title if metadata else None
    except Exception:
        # Unreadable / corrupt PDF: treat as "no title".
        return None
    return _clean_pdf_metadata_title(raw_title, path)
1356
-
1357
-
1358
- def _is_pdf_like(path: Path) -> bool:
1359
- suffix = path.suffix.lower()
1360
- if suffix == ".pdf":
1361
- return True
1362
- name_lower = path.name.lower()
1363
- return ".pdf-" in name_lower and not name_lower.endswith(".md")
1364
-
1365
-
1366
def _scan_pdf_roots(roots: list[Path]) -> tuple[list[Path], list[dict[str, Any]]]:
    """Recursively collect unique PDF-like files under each of *roots*.

    Returns (all resolved file paths, one metadata dict per root with keys
    "path", "count", "max_mtime", "size"). Unreadable roots/files are skipped.
    """
    pdf_paths: list[Path] = []
    meta: list[dict[str, Any]] = []
    seen: set[Path] = set()  # de-duplicates files reachable from multiple roots
    for root in roots:
        try:
            if not root.exists() or not root.is_dir():
                continue
        except OSError:
            continue
        files: list[Path] = []
        for path in root.rglob("*"):
            try:
                if not path.is_file():
                    continue
            except OSError:
                continue
            if not _is_pdf_like(path):
                continue
            resolved = path.resolve()
            if resolved in seen:
                continue
            seen.add(resolved)
            files.append(resolved)
        # Aggregate per-root stats (newest mtime, total byte size).
        max_mtime = 0.0
        total_size = 0
        for path in files:
            try:
                stats = path.stat()
            except OSError:
                continue
            max_mtime = max(max_mtime, stats.st_mtime)
            total_size += stats.st_size
        pdf_paths.extend(files)
        meta.append(
            {
                "path": str(root),
                "count": len(files),
                "max_mtime": max_mtime,
                "size": total_size,
            }
        )
    return pdf_paths, meta
1409
-
1410
-
1411
def _extract_year_author_from_filename(name: str) -> tuple[str | None, str | None]:
    """Pull (year, author) hints from "Author - 2020 - ..." style file names.

    Either element may be None when the corresponding pattern is absent.
    """
    base = name
    if base.lower().endswith(".md"):
        base = base[:-3]
    if ".pdf-" in base.lower():
        base = _strip_pdf_hash_suffix(base)
    if base.lower().endswith(".pdf"):
        base = base[:-4]
    found = re.match(r"\s*(.+?)\s*-\s*((?:19|20)\d{2})\s*-\s*", base)
    if found:
        return found.group(2), found.group(1).strip()
    found = re.match(r"\s*((?:19|20)\d{2})\s*-\s*", base)
    if found:
        return found.group(1), None
    return None, None
1429
-
1430
-
1431
- def _normalize_author_key(name: str) -> str:
1432
- raw = name.lower().strip()
1433
- raw = raw.replace("et al.", "").replace("et al", "")
1434
- if "," in raw:
1435
- raw = raw.split(",", 1)[0]
1436
- raw = re.sub(r"[^a-z0-9]+", " ", raw)
1437
- raw = re.sub(r"\s+", " ", raw).strip()
1438
- if not raw:
1439
- return ""
1440
- parts = raw.split()
1441
- return parts[-1] if parts else raw
1442
-
1443
-
1444
def _title_prefix_key(title_key: str) -> str | None:
    """Index key from the first _TITLE_PREFIX_LEN chars of the compacted title.

    Returns None for titles too short (in tokens or compacted characters)
    to make a trustworthy prefix.
    """
    if len(title_key.split()) < _TITLE_MIN_TOKENS:
        return None
    compact = _compact_title_key(title_key)
    if len(compact) < _TITLE_PREFIX_LEN:
        return None
    prefix = compact[:_TITLE_PREFIX_LEN]
    return f"prefix:{prefix}" if prefix else None
1454
-
1455
-
1456
def _title_overlap_match(a: str, b: str) -> bool:
    """True when one normalized title contains the other (long titles only)."""
    if not a or not b:
        return False
    if a == b:
        return True
    shorter, longer = (a, b) if len(a) <= len(b) else (b, a)
    # Containment is only trusted when the shorter title is substantial.
    substantial = len(shorter) >= _TITLE_MIN_CHARS or len(shorter.split()) >= _TITLE_MIN_TOKENS
    return substantial and shorter in longer
1467
-
1468
-
1469
def _adaptive_similarity_match(title_key: str, candidates: list[Path]) -> Path | None:
    """Pick the single candidate whose filename title best matches *title_key*.

    Direct containment (via _title_overlap_match) wins immediately. Otherwise
    each candidate is scored with _title_similarity and a threshold is lowered
    step by step until exactly one candidate passes; if a step jumps from zero
    matches to several, the interval is bisected looking for a threshold with
    exactly one match. Returns None when no unambiguous winner is found.
    """
    if not title_key:
        return None
    scored: list[tuple[Path, float]] = []
    for path in candidates:
        candidate_title = _normalize_title_key(_extract_title_from_filename(path.name))
        if not candidate_title:
            continue
        if _title_overlap_match(title_key, candidate_title):
            return path
        scored.append((path, _title_similarity(title_key, candidate_title)))
    if not scored:
        return None

    def matches_at(threshold: float) -> list[Path]:
        return [path for path, score in scored if score >= threshold]

    threshold = _SIMILARITY_START
    step = _SIMILARITY_STEP
    prev_threshold = None
    prev_count = None
    for _ in range(_SIMILARITY_MAX_STEPS):
        matches = matches_at(threshold)
        if len(matches) == 1:
            return matches[0]
        if len(matches) == 0:
            # Too strict: remember this threshold and relax.
            prev_threshold = threshold
            prev_count = 0
            threshold -= step
            continue
        if prev_count == 0 and prev_threshold is not None:
            # Went from 0 matches to >1: bisect between the two thresholds
            # hoping to find one that admits exactly a single candidate.
            low = threshold
            high = prev_threshold
            for _ in range(_SIMILARITY_MAX_STEPS):
                mid = (low + high) / 2
                mid_matches = matches_at(mid)
                if len(mid_matches) == 1:
                    return mid_matches[0]
                if len(mid_matches) == 0:
                    high = mid
                else:
                    low = mid
            return None
        prev_threshold = threshold
        prev_count = len(matches)
        threshold -= step
    return None
1516
-
1517
-
1518
def _resolve_by_title_and_meta(
    paper: dict[str, Any],
    file_index: dict[str, list[Path]],
) -> Path | None:
    """Resolve *paper* to a file via a cascade of increasingly fuzzy lookups.

    Order: exact normalized-title key, "compact:" key, leading-numeric-stripped
    variants, "prefix:" key plus adaptive similarity, and finally year /
    author+year buckets plus adaptive similarity. Returns None when nothing
    matches unambiguously.
    """
    title = str(paper.get("paper_title") or "")
    title_key = _normalize_title_key(title)
    if not title_key:
        title_key = ""
    candidates = file_index.get(title_key, [])
    if candidates:
        return candidates[0]
    if title_key:
        compact_key = _compact_title_key(title_key)
        compact_candidates = file_index.get(f"compact:{compact_key}", [])
        if compact_candidates:
            return compact_candidates[0]
        stripped_key = _strip_leading_numeric_tokens(title_key)
        if stripped_key and stripped_key != title_key:
            stripped_candidates = file_index.get(stripped_key, [])
            if stripped_candidates:
                return stripped_candidates[0]
            stripped_compact = _compact_title_key(stripped_key)
            stripped_candidates = file_index.get(f"compact:{stripped_compact}", [])
            if stripped_candidates:
                return stripped_candidates[0]
    # Prefix bucket, falling back to the numeric-stripped prefix bucket.
    prefix_candidates: list[Path] = []
    prefix_key = _title_prefix_key(title_key)
    if prefix_key:
        prefix_candidates = file_index.get(prefix_key, [])
    if not prefix_candidates:
        stripped_key = _strip_leading_numeric_tokens(title_key)
        if stripped_key and stripped_key != title_key:
            prefix_key = _title_prefix_key(stripped_key)
            if prefix_key:
                prefix_candidates = file_index.get(prefix_key, [])
    if prefix_candidates:
        match = _adaptive_similarity_match(title_key, prefix_candidates)
        if match is not None:
            return match
    # Last resort: year / author+year buckets.
    year = str(paper.get("_year") or "").strip()
    if not year.isdigit():
        return None
    author_key = ""
    authors = paper.get("_authors") or []
    if authors:
        author_key = _normalize_author_key(str(authors[0]))
    candidates = []
    if author_key:
        candidates = file_index.get(f"authoryear:{year}:{author_key}", [])
    if not candidates:
        candidates = file_index.get(f"year:{year}", [])
    if not candidates:
        return None
    # Without a usable title, accept a unique year/author bucket hit.
    if len(candidates) == 1 and not title_key:
        return candidates[0]
    match = _adaptive_similarity_match(title_key, candidates)
    if match is not None:
        return match
    return None
1577
-
1578
-
1579
def _build_file_index(roots: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
    """Walk *roots* and index matching files under several alternate lookup keys.

    Per file the index gains: the lowercased file name, the normalized title
    key, "compact:"/"prefix:" variants, the same variants with leading numeric
    tokens stripped, plus "year:<y>" and "authoryear:<y>:<surname>" hints
    parsed from the file name. Unreadable roots/files are skipped.
    """
    index: dict[str, list[Path]] = {}
    for root in roots:
        try:
            if not root.exists() or not root.is_dir():
                continue
        except OSError:
            continue
        for path in root.rglob("*"):
            try:
                if not path.is_file():
                    continue
            except OSError:
                continue
            suffix = path.suffix.lower()
            if suffix not in suffixes:
                name_lower = path.name.lower()
                # When indexing PDFs, "x.pdf-<hash>" artifacts also count.
                if suffixes == {".pdf"} and ".pdf-" in name_lower and suffix != ".md":
                    pass
                else:
                    continue
            resolved = path.resolve()
            name_key = path.name.lower()
            index.setdefault(name_key, []).append(resolved)
            title_candidate = _extract_title_from_filename(path.name)
            title_key = _normalize_title_key(title_candidate)
            if title_key:
                if title_key != name_key:
                    index.setdefault(title_key, []).append(resolved)
                compact_key = _compact_title_key(title_key)
                if compact_key:
                    index.setdefault(f"compact:{compact_key}", []).append(resolved)
                prefix_key = _title_prefix_key(title_key)
                if prefix_key:
                    index.setdefault(prefix_key, []).append(resolved)
                stripped_key = _strip_leading_numeric_tokens(title_key)
                if stripped_key and stripped_key != title_key:
                    index.setdefault(stripped_key, []).append(resolved)
                    stripped_compact = _compact_title_key(stripped_key)
                    if stripped_compact:
                        index.setdefault(f"compact:{stripped_compact}", []).append(resolved)
                    stripped_prefix = _title_prefix_key(stripped_key)
                    if stripped_prefix:
                        index.setdefault(stripped_prefix, []).append(resolved)
            year_hint, author_hint = _extract_year_author_from_filename(path.name)
            if year_hint:
                index.setdefault(f"year:{year_hint}", []).append(resolved)
                if author_hint:
                    author_key = _normalize_author_key(author_hint)
                    if author_key:
                        index.setdefault(f"authoryear:{year_hint}:{author_key}", []).append(resolved)
    return index
1631
-
1632
-
1633
def _build_file_index_from_paths(paths: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
    """Index an explicit list of files under the same title keys as _build_file_index.

    Unlike _build_file_index this takes pre-collected paths and does not emit
    the "year:"/"authoryear:" hint keys.
    """
    index: dict[str, list[Path]] = {}
    for path in paths:
        try:
            if not path.is_file():
                continue
        except OSError:
            continue
        suffix = path.suffix.lower()
        if suffix not in suffixes:
            name_lower = path.name.lower()
            # When indexing PDFs, "x.pdf-<hash>" artifacts also count.
            if suffixes == {".pdf"} and ".pdf-" in name_lower and suffix != ".md":
                pass
            else:
                continue
        resolved = path.resolve()
        name_key = path.name.lower()
        index.setdefault(name_key, []).append(resolved)
        title_candidate = _extract_title_from_filename(path.name)
        title_key = _normalize_title_key(title_candidate)
        if title_key:
            if title_key != name_key:
                index.setdefault(title_key, []).append(resolved)
            compact_key = _compact_title_key(title_key)
            if compact_key:
                index.setdefault(f"compact:{compact_key}", []).append(resolved)
            prefix_key = _title_prefix_key(title_key)
            if prefix_key:
                index.setdefault(prefix_key, []).append(resolved)
            stripped_key = _strip_leading_numeric_tokens(title_key)
            if stripped_key and stripped_key != title_key:
                index.setdefault(stripped_key, []).append(resolved)
                stripped_compact = _compact_title_key(stripped_key)
                if stripped_compact:
                    index.setdefault(f"compact:{stripped_compact}", []).append(resolved)
                stripped_prefix = _title_prefix_key(stripped_key)
                if stripped_prefix:
                    index.setdefault(stripped_prefix, []).append(resolved)
    return index
1672
-
1673
-
1674
- def _resolve_source_md(paper: dict[str, Any], md_index: dict[str, list[Path]]) -> Path | None:
1675
- source_path = paper.get("source_path")
1676
- if not source_path:
1677
- source_path = ""
1678
- if source_path:
1679
- name = Path(str(source_path)).name.lower()
1680
- candidates = md_index.get(name, [])
1681
- if candidates:
1682
- return candidates[0]
1683
- return _resolve_by_title_and_meta(paper, md_index)
1684
-
1685
-
1686
- def _guess_pdf_names(paper: dict[str, Any]) -> list[str]:
1687
- source_path = paper.get("source_path")
1688
- if not source_path:
1689
- return []
1690
- name = Path(str(source_path)).name
1691
- match = re.match(r"(?i)(.+\\.pdf)(?:-[0-9a-f\\-]{8,})?\\.md$", name)
1692
- if match:
1693
- return [Path(match.group(1)).name]
1694
- if ".pdf-" in name.lower():
1695
- base = name[: name.lower().rfind(".pdf-") + 4]
1696
- return [Path(base).name]
1697
- if name.lower().endswith(".pdf"):
1698
- return [name]
1699
- if name.lower().endswith(".pdf.md"):
1700
- return [name[:-3]]
1701
- return []
1702
-
1703
-
1704
def _resolve_pdf(paper: dict[str, Any], pdf_index: dict[str, list[Path]]) -> Path | None:
    """Find the PDF backing *paper*: by guessed file name first, then metadata."""
    for candidate_name in _guess_pdf_names(paper):
        hits = pdf_index.get(candidate_name.lower(), [])
        if hits:
            return hits[0]
    return _resolve_by_title_and_meta(paper, pdf_index)
1710
-
1711
-
1712
- def _ensure_under_roots(path: Path, roots: list[Path]) -> bool:
1713
- resolved = path.resolve()
1714
- for root in roots:
1715
- try:
1716
- resolved.relative_to(root.resolve())
1717
- return True
1718
- except Exception:
1719
- continue
1720
- return False
1721
-
1722
-
1723
# Tokens accepted as boolean-ish values in filter queries; normalized to
# "with"/"without" by _normalize_presence_value.
_BOOL_TRUE = {"1", "true", "yes", "with", "has"}
_BOOL_FALSE = {"0", "false", "no", "without"}
1725
-
1726
-
1727
- def _tokenize_filter_query(text: str) -> list[str]:
1728
- out: list[str] = []
1729
- buf: list[str] = []
1730
- in_quote = False
1731
-
1732
- for ch in text:
1733
- if ch == '"':
1734
- in_quote = not in_quote
1735
- continue
1736
- if not in_quote and ch.isspace():
1737
- token = "".join(buf).strip()
1738
- if token:
1739
- out.append(token)
1740
- buf = []
1741
- continue
1742
- buf.append(ch)
1743
-
1744
- token = "".join(buf).strip()
1745
- if token:
1746
- out.append(token)
1747
- return out
1748
-
1749
-
1750
def _normalize_presence_value(value: str) -> str | None:
    """Map a boolean-ish token to "with"/"without"; None when unrecognized."""
    token = value.strip().lower()
    if token in _BOOL_TRUE:
        return "with"
    return "without" if token in _BOOL_FALSE else None
1757
-
1758
-
1759
def _parse_filter_query(text: str) -> dict[str, set[str]]:
    """Parse key:value tokens of a filter query into normalized filter sets.

    Recognized keys: "pdf"/"source"/"summary" (presence values normalized to
    "with"/"without"), "template"/"tmpl" (lowercased tag names), and the
    "has:"/"no:" shorthands that target pdf/source/summary. Values may be
    comma-separated; unknown keys and malformed tokens are ignored.
    """
    parsed = {
        "pdf": set(),
        "source": set(),
        "summary": set(),
        "template": set(),
    }
    for token in _tokenize_filter_query(text):
        if ":" not in token:
            continue
        key, raw_value = token.split(":", 1)
        key = key.strip().lower()
        raw_value = raw_value.strip()
        if not raw_value:
            continue
        if key in {"tmpl", "template"}:
            for part in raw_value.split(","):
                tag = part.strip()
                if tag:
                    parsed["template"].add(tag.lower())
            continue
        if key in {"pdf", "source", "summary"}:
            for part in raw_value.split(","):
                normalized = _normalize_presence_value(part)
                if normalized:
                    parsed[key].add(normalized)
            continue
        if key in {"has", "no"}:
            # "has:pdf,summary" / "no:source" expand to presence filters.
            targets = [part.strip().lower() for part in raw_value.split(",") if part.strip()]
            for target in targets:
                if target not in {"pdf", "source", "summary"}:
                    continue
                parsed[target].add("with" if key == "has" else "without")
    return parsed
1793
-
1794
-
1795
def _presence_filter(values: list[str]) -> set[str] | None:
    """Collapse raw values into a presence-filter set; None means no constraint.

    Both an empty result and the full {"with", "without"} pair are treated as
    "no constraint".
    """
    normalized = {
        token
        for token in (_normalize_presence_value(value) for value in values)
        if token
    }
    if not normalized or normalized == {"with", "without"}:
        return None
    return normalized
1804
-
1805
-
1806
- def _merge_filter_set(primary: set[str] | None, secondary: set[str] | None) -> set[str] | None:
1807
- if not primary:
1808
- return secondary
1809
- if not secondary:
1810
- return primary
1811
- return primary & secondary
1812
-
1813
-
1814
- def _matches_presence(allowed: set[str] | None, has_value: bool) -> bool:
1815
- if not allowed:
1816
- return True
1817
- if has_value and "with" in allowed:
1818
- return True
1819
- if not has_value and "without" in allowed:
1820
- return True
1821
- return False
1822
-
1823
-
1824
def _template_tag_map(index: PaperIndex) -> dict[str, str]:
    """Map lowercased template tags back to their display form."""
    mapping: dict[str, str] = {}
    for tag in index.template_tags:
        mapping[tag.lower()] = tag
    return mapping
1826
-
1827
-
1828
def _compute_counts(index: PaperIndex, ids: set[int]) -> dict[str, Any]:
    """Aggregate availability and template counts for the papers in *ids*.

    Returns a dict with "total", "pdf", "source", "summary" counts, a
    per-template count map, and the template display order. Entries flagged
    "_is_pdf_only" are excluded from every count.
    """
    template_order = list(index.template_tags)
    template_counts = {tag: 0 for tag in template_order}
    pdf_count = 0
    source_count = 0
    summary_count = 0
    total_count = 0
    tag_map = _template_tag_map(index)

    for idx in ids:
        paper = index.papers[idx]
        if paper.get("_is_pdf_only"):
            continue
        total_count += 1
        # Fall back to hashing source_path (or the id) when no hash is stored.
        source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))
        has_source = source_hash in index.md_path_by_hash
        has_pdf = source_hash in index.pdf_path_by_hash
        has_summary = bool(paper.get("_has_summary"))
        if has_source:
            source_count += 1
        if has_pdf:
            pdf_count += 1
        if has_summary:
            summary_count += 1
        for tag_lc in paper.get("_template_tags_lc") or []:
            # Count only tags that map back to a known display tag.
            display = tag_map.get(tag_lc)
            if display:
                template_counts[display] = template_counts.get(display, 0) + 1

    return {
        "total": total_count,
        "pdf": pdf_count,
        "source": source_count,
        "summary": summary_count,
        "templates": template_counts,
        "template_order": template_order,
    }
1865
-
1866
-
1867
def _apply_query(index: PaperIndex, query: Query) -> set[int]:
    """Evaluate a parsed search query against the in-memory index.

    Groups in ``query.groups`` are OR-ed together; terms inside a group
    are AND-ed.  Negated terms are matched against the full id set and
    then subtracted from the group.  Returns the set of matching paper
    ids.
    """
    all_ids = set(index.ordered_ids)

    def ids_for_term(term: QueryTerm, base: set[int]) -> set[int]:
        # All matching is case-insensitive.
        value_lc = term.value.lower()
        if term.field is None:
            # Bare term: substring match over the precomputed search blob.
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_search_lc") or "")}
        if term.field == "title":
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_title_lc") or "")}
        if term.field == "venue":
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_venue") or "").lower()}
        if term.field == "tag":
            # Prefer the exact inverted-index lookup; fall back to a
            # substring scan over each paper's tag list.
            exact = index.by_tag.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if any(value_lc in t.lower() for t in (index.papers[idx].get("_tags") or []))}
        if term.field == "author":
            exact = index.by_author.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if any(value_lc in a.lower() for a in (index.papers[idx].get("_authors") or []))}
        if term.field == "month":
            exact = index.by_month.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if value_lc == str(index.papers[idx].get("_month") or "").lower()}
        if term.field == "year":
            # "2020..2024"-style inclusive range; order of endpoints does
            # not matter.
            if ".." in term.value:
                start_str, end_str = term.value.split("..", 1)
                if start_str.strip().isdigit() and end_str.strip().isdigit():
                    start = int(start_str.strip())
                    end = int(end_str.strip())
                    ids: set[int] = set()
                    for y in range(min(start, end), max(start, end) + 1):
                        ids |= index.by_year.get(str(y), set())
                    return ids & base
            exact = index.by_year.get(value_lc)
            if exact is not None:
                return exact & base
            return {idx for idx in base if value_lc in str(index.papers[idx].get("_year") or "").lower()}
        # Unknown field: matches nothing.
        return set()

    result: set[int] = set()
    for group in query.groups:
        group_ids = set(all_ids)
        for term in group:
            # Negated terms are evaluated against the full corpus so the
            # subtraction is independent of earlier terms in the group.
            matched = ids_for_term(term, group_ids if not term.negated else all_ids)
            if term.negated:
                group_ids -= matched
            else:
                group_ids &= matched
        result |= group_ids

    return result
1921
-
1922
-
1923
def _page_shell(
    title: str,
    body_html: str,
    extra_head: str = "",
    extra_scripts: str = "",
    header_title: str | None = None,
) -> str:
    """Wrap *body_html* in the full-page chrome (header, base CSS, container).

    The default header shows Papers/Stats navigation links; when
    *header_title* is provided a detail-style three-column header with a
    centered, HTML-escaped title is rendered instead.  *extra_head* and
    *extra_scripts* are injected verbatim into <head> and before </body>.
    """
    # Default header: plain navigation links.
    header_html = """
<header>
<a href="/">Papers</a>
<a href="/stats">Stats</a>
</header>
"""
    if header_title:
        # Detail pages: back link, truncating centered title, Stats link.
        safe_title = html.escape(header_title)
        header_html = f"""
<header class="detail-header">
<div class="header-row">
<a class="header-back" href="/">← Papers</a>
<span class="header-title" title="{safe_title}">{safe_title}</span>
<a class="header-link" href="/stats">Stats</a>
</div>
</header>
"""
    # Doubled braces ({{ }}) below are literal CSS braces inside the f-string.
    return f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>{html.escape(title)}</title>
<style>
body {{ font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Arial; margin: 0; }}
header {{ position: sticky; top: 0; background: #0b1220; color: #fff; padding: 12px 16px; z-index: 10; }}
header a {{ color: #cfe3ff; text-decoration: none; margin-right: 12px; }}
.detail-header .header-row {{ display: grid; grid-template-columns: auto minmax(0, 1fr) auto; align-items: center; gap: 12px; }}
.detail-header .header-title {{ text-align: center; white-space: nowrap; overflow: hidden; text-overflow: ellipsis; }}
.detail-header .header-back {{ margin-right: 0; }}
.detail-header .header-link {{ margin-right: 0; }}
.container {{ max-width: 1100px; margin: 0 auto; padding: 16px; }}
.filters {{ display: grid; grid-template-columns: repeat(6, 1fr); gap: 8px; margin: 12px 0 16px; }}
.filters input {{ width: 100%; padding: 8px; border: 1px solid #d0d7de; border-radius: 6px; }}
.filters select {{ width: 100%; border: 1px solid #d0d7de; border-radius: 6px; background: #fff; font-size: 13px; }}
.filters select:not([multiple]) {{ padding: 6px 8px; }}
.filters select[multiple] {{ padding: 2px; line-height: 1.25; min-height: 72px; font-size: 13px; }}
.filters select[multiple] option {{ padding: 2px 6px; line-height: 1.25; }}
.filters label {{ font-size: 12px; color: #57606a; }}
.filter-group {{ display: flex; flex-direction: column; gap: 4px; }}
.card {{ border: 1px solid #d0d7de; border-radius: 10px; padding: 12px; margin: 10px 0; }}
.muted {{ color: #57606a; font-size: 13px; }}
.pill {{ display: inline-block; padding: 2px 8px; border-radius: 999px; border: 1px solid #d0d7de; margin-right: 6px; font-size: 12px; }}
.pill.template {{ border-color: #8a92a5; color: #243b53; background: #f6f8fa; }}
.pill.pdf-only {{ border-color: #c8a951; background: #fff8dc; color: #5b4a00; }}
.warning {{ background: #fff4ce; border: 1px solid #ffd089; padding: 10px; border-radius: 10px; margin: 12px 0; }}
.tabs {{ display: flex; gap: 8px; flex-wrap: wrap; }}
.tab {{ display: inline-block; padding: 6px 12px; border-radius: 999px; border: 1px solid #d0d7de; background: #f6f8fa; color: #0969da; text-decoration: none; font-size: 13px; }}
.tab:hover {{ background: #eef1f4; }}
.tab.active {{ background: #0969da; border-color: #0969da; color: #fff; }}
.detail-shell {{ display: flex; flex-direction: column; gap: 12px; min-height: calc(100vh - 120px); }}
.detail-toolbar {{ display: flex; flex-wrap: wrap; align-items: center; justify-content: flex-start; gap: 12px; padding: 6px 8px 10px; border-bottom: 1px solid #e5e7eb; box-sizing: border-box; }}
.detail-toolbar .tabs {{ margin: 0; }}
.toolbar-actions {{ display: flex; flex-wrap: wrap; align-items: center; gap: 10px; margin-left: auto; padding-right: 16px; }}
.split-inline {{ display: flex; flex-wrap: wrap; align-items: center; gap: 6px; }}
.split-inline select {{ padding: 6px 8px; border-radius: 8px; border: 1px solid #d0d7de; background: #fff; min-width: 140px; }}
.split-actions {{ display: flex; align-items: center; justify-content: center; gap: 8px; }}
.split-actions button {{ padding: 6px 10px; border-radius: 999px; border: 1px solid #d0d7de; background: #f6f8fa; cursor: pointer; min-width: 36px; }}
.fullscreen-actions {{ display: flex; align-items: center; gap: 6px; }}
.fullscreen-actions button {{ padding: 6px 10px; border-radius: 8px; border: 1px solid #d0d7de; background: #f6f8fa; cursor: pointer; }}
.fullscreen-exit {{ display: none; }}
body.detail-fullscreen {{ overflow: hidden; --outline-top: 16px; }}
body.detail-fullscreen header {{ display: none; }}
body.detail-fullscreen .container {{ max-width: 100%; padding: 0; }}
body.detail-fullscreen .detail-shell {{
position: fixed;
inset: 0;
padding: 12px 16px;
background: #fff;
z-index: 40;
overflow: auto;
}}
body.detail-fullscreen .detail-toolbar {{ position: sticky; top: 0; background: #fff; z-index: 41; }}
body.detail-fullscreen .fullscreen-enter {{ display: none; }}
body.detail-fullscreen .fullscreen-exit {{ display: inline-flex; }}
.detail-body {{ display: flex; flex-direction: column; gap: 8px; flex: 1; min-height: 0; }}
.help-icon {{ display: inline-flex; align-items: center; justify-content: center; width: 18px; height: 18px; border-radius: 50%; border: 1px solid #d0d7de; color: #57606a; font-size: 12px; cursor: default; position: relative; }}
.help-icon::after {{ content: attr(data-tip); display: none; position: absolute; top: 24px; right: 0; background: #0b1220; color: #e6edf3; padding: 8px 10px; border-radius: 8px; font-size: 12px; white-space: pre-line; width: 260px; z-index: 20; }}
.help-icon:hover::after {{ display: block; }}
.stats {{ margin: 12px 0 6px; }}
.stats-row {{ display: flex; flex-wrap: wrap; gap: 6px; align-items: center; }}
.stats-label {{ font-weight: 600; color: #0b1220; margin-right: 4px; }}
.pill.stat {{ background: #f6f8fa; border-color: #c7d2e0; color: #1f2a37; }}
pre {{ overflow: auto; padding: 10px; background: #0b1220; color: #e6edf3; border-radius: 10px; }}
code {{ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace; }}
a {{ color: #0969da; }}
@media (max-width: 768px) {{
.detail-toolbar {{
flex-wrap: nowrap;
overflow-x: auto;
padding-bottom: 8px;
}}
.detail-toolbar::-webkit-scrollbar {{ height: 6px; }}
.detail-toolbar::-webkit-scrollbar-thumb {{ background: #c7d2e0; border-radius: 999px; }}
.detail-toolbar .tabs,
.toolbar-actions {{
flex: 0 0 auto;
}}
}}
</style>
{extra_head}
</head>
<body>
{header_html}
<div class="container">
{body_html}
</div>
{extra_scripts}
</body>
</html>"""
2040
-
2041
-
2042
def _embed_shell(title: str, body_html: str, extra_head: str = "", extra_scripts: str = "") -> str:
    """Render a minimal HTML document for iframe-embedded views.

    Unlike ``_page_shell`` there is no site header or content container;
    *body_html* is emitted directly with a small base stylesheet.
    *extra_head* and *extra_scripts* are injected verbatim.
    """
    # Doubled braces ({{ }}) are literal CSS braces inside the f-string.
    return f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>{html.escape(title)}</title>
<style>
body {{ font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Arial; margin: 0; padding: 16px; }}
h1, h2, h3, h4 {{ margin-top: 1.2em; }}
.muted {{ color: #57606a; font-size: 13px; }}
.warning {{ background: #fff4ce; border: 1px solid #ffd089; padding: 10px; border-radius: 10px; margin: 12px 0; }}
pre {{ overflow: auto; padding: 10px; background: #0b1220; color: #e6edf3; border-radius: 10px; }}
code {{ font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace; }}
a {{ color: #0969da; }}
</style>
{extra_head}
</head>
<body>
{body_html}
{extra_scripts}
</body>
</html>"""
2065
-
2066
-
2067
def _build_pdfjs_viewer_url(pdf_url: str) -> str:
    """Return the pdf.js viewer URL that loads *pdf_url* via its ``file`` query param."""
    # quote(..., safe="") percent-encodes every reserved character so the
    # whole URL survives as a single query-parameter value.
    return f"{_PDFJS_VIEWER_PATH}?file={quote(pdf_url, safe='')}"
2070
-
2071
-
2072
def _outline_assets(outline_top: str) -> tuple[str, str, str]:
    """Build the floating outline widget assets for content pages.

    *outline_top* is a CSS length used as the top offset of the outline
    toggle/panel (exposed as the ``--outline-top`` custom property).

    Returns ``(outline_html, outline_css, outline_js)``: the widget
    markup, a <style> block, and a script body that scans ``#content``
    for h1-h4 headings, assigns unique slug ids, and wires the toggle
    and back-to-top buttons.
    """
    outline_html = """
<button id="outlineToggle" class="outline-toggle" title="Toggle outline">☰</button>
<div id="outlinePanel" class="outline-panel collapsed">
<div class="outline-title">Outline</div>
<div id="outlineList" class="outline-list"></div>
</div>
<button id="backToTop" class="back-to-top" title="Back to top">↑</button>
"""
    # f-string: doubled braces are literal CSS braces; {outline_top} is
    # the only interpolation.
    outline_css = f"""
<style>
:root {{
--outline-top: {outline_top};
}}
.outline-toggle {{
position: fixed;
top: var(--outline-top);
left: 16px;
z-index: 20;
padding: 6px 10px;
border-radius: 8px;
border: 1px solid #d0d7de;
background: #f6f8fa;
cursor: pointer;
}}
.outline-panel {{
position: fixed;
top: calc(var(--outline-top) + 42px);
left: 16px;
width: 240px;
max-height: 60vh;
overflow: auto;
border: 1px solid #d0d7de;
border-radius: 10px;
background: #ffffff;
padding: 10px;
z-index: 20;
box-shadow: 0 6px 18px rgba(0, 0, 0, 0.08);
}}
.outline-panel.collapsed {{
display: none;
}}
.outline-title {{
font-size: 12px;
text-transform: uppercase;
letter-spacing: 0.08em;
color: #57606a;
margin-bottom: 8px;
}}
.outline-list a {{
display: block;
color: #0969da;
text-decoration: none;
padding: 4px 0;
}}
.outline-list a:hover {{
text-decoration: underline;
}}
.back-to-top {{
position: fixed;
left: 16px;
bottom: 16px;
padding: 6px 10px;
border-radius: 999px;
border: 1px solid #d0d7de;
background: #ffffff;
cursor: pointer;
opacity: 0;
pointer-events: none;
transition: opacity 0.2s ease;
z-index: 20;
}}
.back-to-top.visible {{
opacity: 1;
pointer-events: auto;
}}
@media (max-width: 900px) {{
.outline-panel {{
width: 200px;
}}
}}
</style>
"""
    # Plain (non-f) string: JS braces need no escaping; "\\s" etc. emit a
    # single backslash into the JavaScript regex source.
    outline_js = """
const outlineToggle = document.getElementById('outlineToggle');
const outlinePanel = document.getElementById('outlinePanel');
const outlineList = document.getElementById('outlineList');
const backToTop = document.getElementById('backToTop');

function slugify(text) {
  return text.toLowerCase().trim()
    .replace(/[^a-z0-9\\s-]/g, '')
    .replace(/\\s+/g, '-')
    .replace(/-+/g, '-');
}

function buildOutline() {
  if (!outlineList) return;
  const content = document.getElementById('content');
  if (!content) return;
  const headings = content.querySelectorAll('h1, h2, h3, h4');
  if (!headings.length) {
    outlineList.innerHTML = '<div class="muted">No headings</div>';
    return;
  }
  const used = new Set();
  outlineList.innerHTML = '';
  headings.forEach((heading) => {
    let id = heading.id;
    if (!id) {
      const base = slugify(heading.textContent || 'section') || 'section';
      id = base;
      let i = 1;
      while (used.has(id) || document.getElementById(id)) {
        id = `${base}-${i++}`;
      }
      heading.id = id;
    }
    used.add(id);
    const level = parseInt(heading.tagName.slice(1), 10) || 1;
    const link = document.createElement('a');
    link.href = `#${id}`;
    link.textContent = heading.textContent || '';
    link.style.paddingLeft = `${(level - 1) * 12}px`;
    outlineList.appendChild(link);
  });
}

function toggleBackToTop() {
  if (!backToTop) return;
  if (window.scrollY > 300) {
    backToTop.classList.add('visible');
  } else {
    backToTop.classList.remove('visible');
  }
}

if (outlineToggle && outlinePanel) {
  outlineToggle.addEventListener('click', () => {
    outlinePanel.classList.toggle('collapsed');
  });
}

if (backToTop) {
  backToTop.addEventListener('click', () => {
    window.scrollTo({ top: 0, behavior: 'smooth' });
  });
}

buildOutline();
window.addEventListener('scroll', toggleBackToTop);
toggleBackToTop();
"""
    return outline_html, outline_css, outline_js
2226
-
2227
-
2228
async def _index_page(request: Request) -> HTMLResponse:
    """Render the paper list page with search, filters, and infinite scroll.

    Template-tag <select> options and the filter-help tooltip text are
    substituted into the static page body via the ``__TEMPLATE_OPTIONS__``
    and ``__FILTER_HELP__`` placeholder tokens; the body is a plain string
    (not an f-string) because it embeds brace-heavy JavaScript.
    """
    index: PaperIndex = request.app.state.index
    template_options = "".join(
        f'<option value="{html.escape(tag)}">{html.escape(tag)}</option>'
        for tag in index.template_tags
    )
    if not template_options:
        template_options = '<option value="" disabled>(no templates)</option>'
    # NOTE(review): the doubled backslashes put a literal "\n" sequence into
    # the help text, while the .replace below rewrites only real newlines —
    # confirm which rendering of the tooltip line breaks was intended.
    filter_help = (
        "Filters syntax:\\n"
        "pdf:yes|no source:yes|no summary:yes|no\\n"
        "tmpl:<tag> or template:<tag>\\n"
        "has:pdf / no:source aliases\\n"
        "Content tags still use the search box (tag:fpga)."
    )
    filter_help_attr = html.escape(filter_help).replace("\n", "&#10;")
    body_html = """
<h2>Paper Database</h2>
<div class="card">
<div class="muted">Search (Scholar-style): <code>tag:fpga year:2023..2025 -survey</code> · Use quotes for phrases and <code>OR</code> for alternatives.</div>
<div style="display:flex; gap:8px; margin-top:8px;">
<input id="query" placeholder='Search... e.g. title:"nearest neighbor" tag:fpga year:2023..2025' style="flex:1; padding:10px; border:1px solid #d0d7de; border-radius:8px;" />
<select id="openView" style="padding:10px; border:1px solid #d0d7de; border-radius:8px;">
<option value="summary" selected>Open: Summary</option>
<option value="source">Open: Source</option>
<option value="pdf">Open: PDF</option>
<option value="pdfjs">Open: PDF Viewer</option>
<option value="split">Open: Split</option>
</select>
</div>
<div class="filters" style="grid-template-columns: repeat(4, 1fr); margin-top:10px;">
<div class="filter-group">
<label>PDF</label>
<select id="filterPdf" multiple size="2">
<option value="with">With</option>
<option value="without">Without</option>
</select>
</div>
<div class="filter-group">
<label>Source</label>
<select id="filterSource" multiple size="2">
<option value="with">With</option>
<option value="without">Without</option>
</select>
</div>
<div class="filter-group">
<label>Summary</label>
<select id="filterSummary" multiple size="2">
<option value="with">With</option>
<option value="without">Without</option>
</select>
</div>
<div class="filter-group">
<label>Template</label>
<select id="filterTemplate" multiple size="4">
__TEMPLATE_OPTIONS__
</select>
</div>
</div>
<div style="display:flex; gap:8px; align-items:center; margin-top:8px;">
<input id="filterQuery" placeholder='Filters... e.g. pdf:yes tmpl:simple' style="flex:1; padding:10px; border:1px solid #d0d7de; border-radius:8px;" />
<span class="help-icon" data-tip="__FILTER_HELP__">?</span>
</div>
<details style="margin-top:10px;">
<summary>Advanced search</summary>
<div style="margin-top:10px;" class="muted">Build a query:</div>
<div class="filters" style="grid-template-columns: repeat(3, 1fr);">
<input id="advTitle" placeholder="title contains..." />
<input id="advAuthor" placeholder="author contains..." />
<input id="advTag" placeholder="tag (comma separated)" />
<input id="advYear" placeholder="year (e.g. 2020..2024)" />
<input id="advMonth" placeholder="month (01-12)" />
<input id="advVenue" placeholder="venue contains..." />
</div>
<div style="display:flex; gap:8px; align-items:center; margin-top:8px;">
<button id="buildQuery" style="padding:8px 12px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">Build</button>
<div class="muted">Generated: <code id="generated"></code></div>
</div>
</details>
</div>
<div id="stats" class="stats">
<div id="statsTotal" class="stats-row"></div>
<div id="statsFiltered" class="stats-row" style="margin-top:6px;"></div>
</div>
<div id="results"></div>
<div id="loading" class="muted">Loading...</div>
<script>
let page = 1;
let loading = false;
let done = false;

function currentParams(nextPage) {
  const params = new URLSearchParams();
  params.set("page", String(nextPage));
  params.set("page_size", "30");
  const q = document.getElementById("query").value.trim();
  if (q) params.set("q", q);
  const fq = document.getElementById("filterQuery").value.trim();
  if (fq) params.set("fq", fq);
  function addMulti(id, key) {
    const el = document.getElementById(id);
    const values = Array.from(el.selectedOptions).map(opt => opt.value).filter(Boolean);
    for (const value of values) {
      params.append(key, value);
    }
  }
  addMulti("filterPdf", "pdf");
  addMulti("filterSource", "source");
  addMulti("filterSummary", "summary");
  addMulti("filterTemplate", "template");
  return params;
}

function escapeHtml(text) {
  const div = document.createElement("div");
  div.textContent = text;
  return div.innerHTML;
}

function viewSuffixForItem(item) {
  let view = document.getElementById("openView").value;
  const isPdfOnly = item.is_pdf_only;
  const pdfFallback = item.has_pdf ? "pdfjs" : "pdf";
  if (isPdfOnly && (view === "summary" || view === "source")) {
    view = pdfFallback;
  }
  if (!view || view === "summary") return "";
  const params = new URLSearchParams();
  params.set("view", view);
  if (view === "split") {
    if (isPdfOnly) {
      params.set("left", pdfFallback);
      params.set("right", pdfFallback);
    } else {
      params.set("left", "summary");
      if (item.has_pdf) {
        params.set("right", "pdfjs");
      } else if (item.has_source) {
        params.set("right", "source");
      } else {
        params.set("right", "summary");
      }
    }
  }
  return `?${params.toString()}`;
}

function renderItem(item) {
  const tags = (item.tags || []).map(t => `<span class="pill">${escapeHtml(t)}</span>`).join("");
  const templateTags = (item.template_tags || []).map(t => `<span class="pill template">tmpl:${escapeHtml(t)}</span>`).join("");
  const authors = (item.authors || []).slice(0, 6).map(a => escapeHtml(a)).join(", ");
  const meta = `${escapeHtml(item.year || "")}-${escapeHtml(item.month || "")} · ${escapeHtml(item.venue || "")}`;
  const viewSuffix = viewSuffixForItem(item);
  const badges = [
    item.has_source ? `<span class="pill">source</span>` : "",
    item.has_pdf ? `<span class="pill">pdf</span>` : "",
    item.is_pdf_only ? `<span class="pill pdf-only">pdf-only</span>` : "",
  ].join("");
  return `
    <div class="card">
      <div><a href="/paper/${encodeURIComponent(item.source_hash)}${viewSuffix}">${escapeHtml(item.title || "")}</a></div>
      <div class="muted">${authors}</div>
      <div class="muted">${meta}</div>
      <div style="margin-top:6px">${badges} ${templateTags} ${tags}</div>
    </div>
  `;
}

function renderStatsRow(targetId, label, counts) {
  const row = document.getElementById(targetId);
  if (!row || !counts) return;
  const pills = [];
  pills.push(`<span class="stats-label">${escapeHtml(label)}</span>`);
  pills.push(`<span class="pill stat">Count ${counts.total}</span>`);
  pills.push(`<span class="pill stat">PDF ${counts.pdf}</span>`);
  pills.push(`<span class="pill stat">Source ${counts.source}</span>`);
  pills.push(`<span class="pill stat">Summary ${counts.summary}</span>`);
  const order = counts.template_order || Object.keys(counts.templates || {});
  for (const tag of order) {
    const count = (counts.templates && counts.templates[tag]) || 0;
    pills.push(`<span class="pill stat">tmpl:${escapeHtml(tag)} ${count}</span>`);
  }
  row.innerHTML = pills.join("");
}

function updateStats(stats) {
  if (!stats) return;
  renderStatsRow("statsTotal", "Total", stats.all);
  renderStatsRow("statsFiltered", "Filtered", stats.filtered);
}

async function loadMore() {
  if (loading || done) return;
  loading = true;
  document.getElementById("loading").textContent = "Loading...";
  const res = await fetch(`/api/papers?${currentParams(page).toString()}`);
  const data = await res.json();
  if (data.stats) {
    updateStats(data.stats);
  }
  const results = document.getElementById("results");
  for (const item of data.items) {
    results.insertAdjacentHTML("beforeend", renderItem(item));
  }
  if (!data.has_more) {
    done = true;
    document.getElementById("loading").textContent = "End.";
  } else {
    page += 1;
    document.getElementById("loading").textContent = "Scroll to load more...";
  }
  loading = false;
}

function resetAndLoad() {
  page = 1;
  done = false;
  document.getElementById("results").innerHTML = "";
  loadMore();
}

document.getElementById("query").addEventListener("change", resetAndLoad);
document.getElementById("openView").addEventListener("change", resetAndLoad);
document.getElementById("filterQuery").addEventListener("change", resetAndLoad);
document.getElementById("filterPdf").addEventListener("change", resetAndLoad);
document.getElementById("filterSource").addEventListener("change", resetAndLoad);
document.getElementById("filterSummary").addEventListener("change", resetAndLoad);
document.getElementById("filterTemplate").addEventListener("change", resetAndLoad);

document.getElementById("buildQuery").addEventListener("click", () => {
  function add(field, value) {
    value = value.trim();
    if (!value) return "";
    if (value.includes(" ")) return `${field}:"${value}"`;
    return `${field}:${value}`;
  }
  const parts = [];
  const t = document.getElementById("advTitle").value.trim();
  const a = document.getElementById("advAuthor").value.trim();
  const tag = document.getElementById("advTag").value.trim();
  const y = document.getElementById("advYear").value.trim();
  const m = document.getElementById("advMonth").value.trim();
  const v = document.getElementById("advVenue").value.trim();
  if (t) parts.push(add("title", t));
  if (a) parts.push(add("author", a));
  if (tag) {
    for (const item of tag.split(",")) {
      const val = item.trim();
      if (val) parts.push(add("tag", val));
    }
  }
  if (y) parts.push(add("year", y));
  if (m) parts.push(add("month", m));
  if (v) parts.push(add("venue", v));
  const q = parts.join(" ");
  document.getElementById("generated").textContent = q;
  document.getElementById("query").value = q;
  resetAndLoad();
});

window.addEventListener("scroll", () => {
  if ((window.innerHeight + window.scrollY) >= (document.body.offsetHeight - 600)) {
    loadMore();
  }
});

loadMore();
</script>
"""
    # Substitute the server-computed fragments into the static template.
    body_html = body_html.replace("__TEMPLATE_OPTIONS__", template_options)
    body_html = body_html.replace("__FILTER_HELP__", filter_help_attr)
    return HTMLResponse(_page_shell("Paper DB", body_html))
2500
-
2501
-
2502
def _parse_filters(request: Request) -> dict[str, list[str] | str | int]:
    """Extract and sanitize list/search parameters from the request.

    Pagination is clamped (page >= 1, 1 <= page_size <= 200), the text
    inputs are whitespace-stripped, and empty values in the multi-select
    parameters are discarded.
    """
    params = request.query_params

    def multi(key: str) -> list[str]:
        # Drop empty strings submitted by blank <option> entries.
        return [value for value in params.getlist(key) if value]

    page = max(1, int(params.get("page", "1")))
    page_size = min(max(1, int(params.get("page_size", "30"))), 200)

    return {
        "page": page,
        "page_size": page_size,
        "q": params.get("q", "").strip(),
        "filter_query": params.get("fq", "").strip(),
        "pdf": multi("pdf"),
        "source": multi("source"),
        "summary": multi("summary"),
        "template": multi("template"),
    }
2526
-
2527
-
2528
async def _api_papers(request: Request) -> JSONResponse:
    """JSON endpoint backing the index page's infinite scroll.

    Combines the Scholar-style search query (``q``) with presence and
    template filters coming from both the multi-selects and the textual
    filter query (``fq``), then returns one page of results in the
    index's canonical order.  Aggregate stats are computed only for
    page 1 to keep subsequent page fetches cheap.
    """
    index: PaperIndex = request.app.state.index
    filters = _parse_filters(request)
    page = int(filters["page"])
    page_size = int(filters["page_size"])
    q = str(filters["q"])
    filter_query = str(filters["filter_query"])
    query = parse_query(q)
    candidate = _apply_query(index, query)
    filter_terms = _parse_filter_query(filter_query)
    # Widget selections and textual fq terms are intersected per facet.
    pdf_filter = _merge_filter_set(_presence_filter(filters["pdf"]), _presence_filter(list(filter_terms["pdf"])))
    source_filter = _merge_filter_set(
        _presence_filter(filters["source"]), _presence_filter(list(filter_terms["source"]))
    )
    summary_filter = _merge_filter_set(
        _presence_filter(filters["summary"]), _presence_filter(list(filter_terms["summary"]))
    )
    template_selected = {item.lower() for item in filters["template"] if item}
    template_filter = _merge_filter_set(
        template_selected or None,
        filter_terms["template"] or None,
    )

    if candidate:
        # Apply presence/template facets to the query matches.
        filtered: set[int] = set()
        for idx in candidate:
            paper = index.papers[idx]
            source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))
            has_source = source_hash in index.md_path_by_hash
            has_pdf = source_hash in index.pdf_path_by_hash
            has_summary = bool(paper.get("_has_summary"))
            if not _matches_presence(pdf_filter, has_pdf):
                continue
            if not _matches_presence(source_filter, has_source):
                continue
            if not _matches_presence(summary_filter, has_summary):
                continue
            if template_filter:
                tags = paper.get("_template_tags_lc") or []
                if not any(tag in template_filter for tag in tags):
                    continue
            filtered.add(idx)
        candidate = filtered
    # Re-impose canonical ordering for stable pagination.
    ordered = [idx for idx in index.ordered_ids if idx in candidate]
    total = len(ordered)
    start = (page - 1) * page_size
    end = min(start + page_size, total)
    page_ids = ordered[start:end]
    stats_payload = None
    if page == 1:
        all_ids = set(index.ordered_ids)
        stats_payload = {
            "all": _compute_counts(index, all_ids),
            "filtered": _compute_counts(index, candidate),
        }

    items: list[dict[str, Any]] = []
    for idx in page_ids:
        paper = index.papers[idx]
        source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))
        items.append(
            {
                "source_hash": source_hash,
                "title": paper.get("paper_title") or "",
                "authors": paper.get("_authors") or [],
                "year": paper.get("_year") or "",
                "month": paper.get("_month") or "",
                "venue": paper.get("_venue") or "",
                "tags": paper.get("_tags") or [],
                "template_tags": paper.get("_template_tags") or [],
                "has_source": source_hash in index.md_path_by_hash,
                "has_pdf": source_hash in index.pdf_path_by_hash,
                "has_summary": bool(paper.get("_has_summary")),
                "is_pdf_only": bool(paper.get("_is_pdf_only")),
            }
        )

    return JSONResponse(
        {
            "page": page,
            "page_size": page_size,
            "total": total,
            "has_more": end < total,
            "items": items,
            "stats": stats_payload,
        }
    )
2615
-
2616
-
2617
- async def _paper_detail(request: Request) -> HTMLResponse:
2618
- index: PaperIndex = request.app.state.index
2619
- md = request.app.state.md
2620
- source_hash = request.path_params["source_hash"]
2621
- idx = index.id_by_hash.get(source_hash)
2622
- if idx is None:
2623
- return RedirectResponse("/")
2624
- paper = index.papers[idx]
2625
- is_pdf_only = bool(paper.get("_is_pdf_only"))
2626
- page_title = str(paper.get("paper_title") or "Paper")
2627
- view = request.query_params.get("view")
2628
- template_param = request.query_params.get("template")
2629
- embed = request.query_params.get("embed") == "1"
2630
-
2631
- pdf_path = index.pdf_path_by_hash.get(source_hash)
2632
- pdf_url = f"/api/pdf/{source_hash}"
2633
- source_available = source_hash in index.md_path_by_hash
2634
- allowed_views = {"summary", "source", "pdf", "pdfjs", "split"}
2635
- if is_pdf_only:
2636
- allowed_views = {"pdf", "pdfjs", "split"}
2637
-
2638
- def normalize_view(value: str | None, default: str) -> str:
2639
- if value in allowed_views:
2640
- return value
2641
- return default
2642
-
2643
- preferred_pdf_view = "pdfjs" if pdf_path else "pdf"
2644
- default_view = preferred_pdf_view if is_pdf_only else "summary"
2645
- view = normalize_view(view, default_view)
2646
- if view == "split":
2647
- embed = False
2648
- if is_pdf_only:
2649
- left_param = request.query_params.get("left")
2650
- right_param = request.query_params.get("right")
2651
- left = normalize_view(left_param, preferred_pdf_view) if left_param else preferred_pdf_view
2652
- right = normalize_view(right_param, preferred_pdf_view) if right_param else preferred_pdf_view
2653
- else:
2654
- default_right = "pdfjs" if pdf_path else ("source" if source_available else "summary")
2655
- left_param = request.query_params.get("left")
2656
- right_param = request.query_params.get("right")
2657
- left = normalize_view(left_param, "summary") if left_param else "summary"
2658
- right = normalize_view(right_param, default_right) if right_param else default_right
2659
-
2660
- def render_page(title: str, body: str, extra_head: str = "", extra_scripts: str = "") -> HTMLResponse:
2661
- if embed:
2662
- return HTMLResponse(_embed_shell(title, body, extra_head, extra_scripts))
2663
- return HTMLResponse(_page_shell(title, body, extra_head, extra_scripts, header_title=page_title))
2664
-
2665
- def nav_link(label: str, v: str) -> str:
2666
- active = " active" if view == v else ""
2667
- params: dict[str, str] = {"view": v}
2668
- if v == "summary" and template_param:
2669
- params["template"] = str(template_param)
2670
- if v == "split":
2671
- params["left"] = left
2672
- params["right"] = right
2673
- href = f"/paper/{source_hash}?{urlencode(params)}"
2674
- return f'<a class="tab{active}" href="{html.escape(href)}">{html.escape(label)}</a>'
2675
-
2676
- tab_defs = [
2677
- ("Summary", "summary"),
2678
- ("Source", "source"),
2679
- ("PDF", "pdf"),
2680
- ("PDF Viewer", "pdfjs"),
2681
- ("Split", "split"),
2682
- ]
2683
- if is_pdf_only:
2684
- tab_defs = [
2685
- ("PDF", "pdf"),
2686
- ("PDF Viewer", "pdfjs"),
2687
- ("Split", "split"),
2688
- ]
2689
- tabs_html = '<div class="tabs">' + "".join(nav_link(label, v) for label, v in tab_defs) + "</div>"
2690
- fullscreen_controls = """
2691
- <div class="fullscreen-actions">
2692
- <button id="fullscreenEnter" class="fullscreen-enter" type="button" title="Enter fullscreen">Fullscreen</button>
2693
- <button id="fullscreenExit" class="fullscreen-exit" type="button" title="Exit fullscreen">Exit Fullscreen</button>
2694
- </div>
2695
- """
2696
-
2697
- def detail_toolbar(extra_controls: str = "") -> str:
2698
- if embed:
2699
- return ""
2700
- controls = extra_controls.strip()
2701
- toolbar_controls = f"{controls}{fullscreen_controls}" if controls else fullscreen_controls
2702
- return f"""
2703
- <div class="detail-toolbar">
2704
- {tabs_html}
2705
- <div class="toolbar-actions">
2706
- {toolbar_controls}
2707
- </div>
2708
- </div>
2709
- """
2710
-
2711
- def wrap_detail(content: str, toolbar_html: str | None = None) -> str:
2712
- if embed:
2713
- return content
2714
- toolbar = detail_toolbar() if toolbar_html is None else toolbar_html
2715
- return f"""
2716
- <div class="detail-shell">
2717
- {toolbar}
2718
- <div class="detail-body">
2719
- {content}
2720
- </div>
2721
- </div>
2722
- """
2723
-
2724
- fullscreen_script = ""
2725
- if not embed:
2726
- fullscreen_script = """
2727
- <script>
2728
- const fullscreenEnter = document.getElementById('fullscreenEnter');
2729
- const fullscreenExit = document.getElementById('fullscreenExit');
2730
- function setFullscreen(enable) {
2731
- document.body.classList.toggle('detail-fullscreen', enable);
2732
- }
2733
- if (fullscreenEnter) {
2734
- fullscreenEnter.addEventListener('click', () => setFullscreen(true));
2735
- }
2736
- if (fullscreenExit) {
2737
- fullscreenExit.addEventListener('click', () => setFullscreen(false));
2738
- }
2739
- document.addEventListener('keydown', (event) => {
2740
- if (event.key === 'Escape' && document.body.classList.contains('detail-fullscreen')) {
2741
- setFullscreen(false);
2742
- }
2743
- });
2744
- </script>
2745
- """
2746
- pdf_only_warning_html = ""
2747
- if is_pdf_only:
2748
- pdf_only_warning_html = (
2749
- '<div class="warning">PDF-only entry: summary and source views are unavailable.</div>'
2750
- )
2751
- outline_top = "72px" if not embed else "16px"
2752
- outline_html, outline_css, outline_js = _outline_assets(outline_top)
2753
-
2754
- if view == "split":
2755
- def pane_src(pane_view: str) -> str:
2756
- if pane_view == "pdfjs" and pdf_path:
2757
- return _build_pdfjs_viewer_url(pdf_url)
2758
- params: dict[str, str] = {"view": pane_view, "embed": "1"}
2759
- if pane_view == "summary" and template_param:
2760
- params["template"] = str(template_param)
2761
- return f"/paper/{source_hash}?{urlencode(params)}"
2762
-
2763
- left_src = pane_src(left)
2764
- right_src = pane_src(right)
2765
- options = [
2766
- ("summary", "Summary"),
2767
- ("source", "Source"),
2768
- ("pdf", "PDF"),
2769
- ("pdfjs", "PDF Viewer"),
2770
- ]
2771
- if is_pdf_only:
2772
- options = [
2773
- ("pdf", "PDF"),
2774
- ("pdfjs", "PDF Viewer"),
2775
- ]
2776
- left_options = "\n".join(
2777
- f'<option value="{value}"{" selected" if value == left else ""}>{label}</option>'
2778
- for value, label in options
2779
- )
2780
- right_options = "\n".join(
2781
- f'<option value="{value}"{" selected" if value == right else ""}>{label}</option>'
2782
- for value, label in options
2783
- )
2784
- split_controls = f"""
2785
- <div class="split-inline">
2786
- <span class="muted">Left</span>
2787
- <select id="splitLeft">
2788
- {left_options}
2789
- </select>
2790
- <div class="split-actions">
2791
- <button id="splitTighten" type="button" title="Tighten width">-</button>
2792
- <button id="splitSwap" type="button" title="Swap panes">⇄</button>
2793
- <button id="splitWiden" type="button" title="Widen width">+</button>
2794
- </div>
2795
- <span class="muted">Right</span>
2796
- <select id="splitRight">
2797
- {right_options}
2798
- </select>
2799
- </div>
2800
- """
2801
- toolbar_html = detail_toolbar(split_controls)
2802
- split_layout = f"""
2803
- {pdf_only_warning_html}
2804
- <div class="split-layout">
2805
- <div class="split-pane">
2806
- <iframe id="leftPane" src="{html.escape(left_src)}" title="Left pane"></iframe>
2807
- </div>
2808
- <div class="split-pane">
2809
- <iframe id="rightPane" src="{html.escape(right_src)}" title="Right pane"></iframe>
2810
- </div>
2811
- </div>
2812
- """
2813
- body = wrap_detail(split_layout, toolbar_html=toolbar_html)
2814
- extra_head = """
2815
- <style>
2816
- .container {
2817
- max-width: 100%;
2818
- width: 100%;
2819
- margin: 0 auto;
2820
- }
2821
- .split-layout {
2822
- display: flex;
2823
- gap: 12px;
2824
- width: 100%;
2825
- max-width: var(--split-max-width, 100%);
2826
- margin: 0 auto;
2827
- flex: 1;
2828
- min-height: 440px;
2829
- }
2830
- .split-pane {
2831
- flex: 1;
2832
- border: 1px solid #d0d7de;
2833
- border-radius: 10px;
2834
- overflow: hidden;
2835
- background: #fff;
2836
- }
2837
- .split-pane iframe {
2838
- width: 100%;
2839
- height: 100%;
2840
- border: 0;
2841
- }
2842
- @media (max-width: 900px) {
2843
- .split-layout {
2844
- flex-direction: column;
2845
- min-height: 0;
2846
- }
2847
- .split-pane {
2848
- height: 70vh;
2849
- }
2850
- }
2851
- </style>
2852
- """
2853
- extra_scripts = """
2854
- <script>
2855
- const leftSelect = document.getElementById('splitLeft');
2856
- const rightSelect = document.getElementById('splitRight');
2857
- const swapButton = document.getElementById('splitSwap');
2858
- const tightenButton = document.getElementById('splitTighten');
2859
- const widenButton = document.getElementById('splitWiden');
2860
- function updateSplit() {
2861
- const params = new URLSearchParams(window.location.search);
2862
- params.set('view', 'split');
2863
- params.set('left', leftSelect.value);
2864
- params.set('right', rightSelect.value);
2865
- window.location.search = params.toString();
2866
- }
2867
- leftSelect.addEventListener('change', updateSplit);
2868
- rightSelect.addEventListener('change', updateSplit);
2869
- swapButton.addEventListener('click', () => {
2870
- const leftValue = leftSelect.value;
2871
- leftSelect.value = rightSelect.value;
2872
- rightSelect.value = leftValue;
2873
- updateSplit();
2874
- });
2875
- const widthSteps = ["1200px", "1400px", "1600px", "1800px", "2000px", "100%"];
2876
- let widthIndex = widthSteps.length - 1;
2877
- try {
2878
- const stored = localStorage.getItem('splitWidthIndex');
2879
- if (stored !== null) {
2880
- const parsed = Number.parseInt(stored, 10);
2881
- if (!Number.isNaN(parsed)) {
2882
- widthIndex = Math.max(0, Math.min(widthSteps.length - 1, parsed));
2883
- }
2884
- }
2885
- } catch (err) {
2886
- // Ignore storage errors (e.g. private mode)
2887
- }
2888
-
2889
- function applySplitWidth() {
2890
- const value = widthSteps[widthIndex];
2891
- document.documentElement.style.setProperty('--split-max-width', value);
2892
- try {
2893
- localStorage.setItem('splitWidthIndex', String(widthIndex));
2894
- } catch (err) {
2895
- // Ignore storage errors
2896
- }
2897
- }
2898
-
2899
- tightenButton.addEventListener('click', () => {
2900
- widthIndex = Math.max(0, widthIndex - 1);
2901
- applySplitWidth();
2902
- });
2903
- widenButton.addEventListener('click', () => {
2904
- widthIndex = Math.min(widthSteps.length - 1, widthIndex + 1);
2905
- applySplitWidth();
2906
- });
2907
- applySplitWidth();
2908
- </script>
2909
- """
2910
- return render_page(
2911
- "Split View",
2912
- body,
2913
- extra_head=extra_head,
2914
- extra_scripts=extra_scripts + fullscreen_script,
2915
- )
2916
-
2917
- if view == "source":
2918
- source_path = index.md_path_by_hash.get(source_hash)
2919
- if not source_path:
2920
- body = wrap_detail(
2921
- '<div class="warning">Source markdown not found. Provide --md-root to enable source viewing.</div>'
2922
- )
2923
- return render_page("Source", body, extra_scripts=fullscreen_script)
2924
- try:
2925
- raw = source_path.read_text(encoding="utf-8")
2926
- except UnicodeDecodeError:
2927
- raw = source_path.read_text(encoding="latin-1")
2928
- rendered = _render_markdown_with_math_placeholders(md, raw)
2929
- body = wrap_detail(
2930
- f"""
2931
- <div class="muted">{html.escape(str(source_path))}</div>
2932
- <div class="muted" style="margin-top:10px;">Rendered from source markdown:</div>
2933
- {outline_html}
2934
- <div id="content">{rendered}</div>
2935
- <details style="margin-top:12px;"><summary>Raw markdown</summary>
2936
- <pre><code>{html.escape(raw)}</code></pre>
2937
- </details>
2938
- """
2939
- )
2940
- extra_head = f"""
2941
- <link rel="stylesheet" href="{_CDN_KATEX}" />
2942
- {outline_css}
2943
- <style>
2944
- #content img {{
2945
- max-width: 100%;
2946
- height: auto;
2947
- }}
2948
- </style>
2949
- """
2950
- extra_scripts = f"""
2951
- <script src="{_CDN_MERMAID}"></script>
2952
- <script src="{_CDN_KATEX_JS}"></script>
2953
- <script src="{_CDN_KATEX_AUTO}"></script>
2954
- <script>
2955
- document.querySelectorAll('code.language-mermaid').forEach((code) => {{
2956
- const pre = code.parentElement;
2957
- const div = document.createElement('div');
2958
- div.className = 'mermaid';
2959
- div.textContent = code.textContent;
2960
- pre.replaceWith(div);
2961
- }});
2962
- if (window.mermaid) {{
2963
- mermaid.initialize({{ startOnLoad: false }});
2964
- mermaid.run();
2965
- }}
2966
- if (window.renderMathInElement) {{
2967
- renderMathInElement(document.getElementById('content'), {{
2968
- delimiters: [
2969
- {{left: '$$', right: '$$', display: true}},
2970
- {{left: '$', right: '$', display: false}},
2971
- {{left: '\\\\(', right: '\\\\)', display: false}},
2972
- {{left: '\\\\[', right: '\\\\]', display: true}}
2973
- ],
2974
- throwOnError: false
2975
- }});
2976
- }}
2977
- {outline_js}
2978
- </script>
2979
- """
2980
- return render_page("Source", body, extra_head=extra_head, extra_scripts=extra_scripts + fullscreen_script)
2981
-
2982
- if view == "pdf":
2983
- if not pdf_path:
2984
- body = wrap_detail('<div class="warning">PDF not found. Provide --pdf-root to enable PDF viewing.</div>')
2985
- return render_page("PDF", body, extra_scripts=fullscreen_script)
2986
- body = wrap_detail(
2987
- f"""
2988
- {pdf_only_warning_html}
2989
- <div class="muted">{html.escape(str(pdf_path.name))}</div>
2990
- <div style="display:flex; gap:8px; align-items:center; margin: 10px 0;">
2991
- <button id="prev" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">Prev</button>
2992
- <button id="next" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">Next</button>
2993
- <span class="muted">Page <span id="page_num">1</span> / <span id="page_count">?</span></span>
2994
- <span style="flex:1"></span>
2995
- <button id="zoomOut" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">-</button>
2996
- <button id="zoomIn" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">+</button>
2997
- </div>
2998
- <canvas id="the-canvas" style="width: 100%; border: 1px solid #d0d7de; border-radius: 10px;"></canvas>
2999
- """
3000
- )
3001
- extra_scripts = f"""
3002
- <script src="{_CDN_PDFJS}"></script>
3003
- <script>
3004
- const url = {json.dumps(pdf_url)};
3005
- pdfjsLib.GlobalWorkerOptions.workerSrc = {json.dumps(_CDN_PDFJS_WORKER)};
3006
- let pdfDoc = null;
3007
- let pageNum = 1;
3008
- let pageRendering = false;
3009
- let pageNumPending = null;
3010
- let zoomLevel = 1.0;
3011
- const canvas = document.getElementById('the-canvas');
3012
- const ctx = canvas.getContext('2d');
3013
-
3014
- function renderPage(num) {{
3015
- pageRendering = true;
3016
- pdfDoc.getPage(num).then((page) => {{
3017
- const baseViewport = page.getViewport({{scale: 1}});
3018
- const containerWidth = canvas.clientWidth || baseViewport.width;
3019
- const fitScale = containerWidth / baseViewport.width;
3020
- const scale = fitScale * zoomLevel;
3021
-
3022
- const viewport = page.getViewport({{scale}});
3023
- const outputScale = window.devicePixelRatio || 1;
3024
-
3025
- canvas.width = Math.floor(viewport.width * outputScale);
3026
- canvas.height = Math.floor(viewport.height * outputScale);
3027
- canvas.style.width = Math.floor(viewport.width) + 'px';
3028
- canvas.style.height = Math.floor(viewport.height) + 'px';
3029
-
3030
- const transform = outputScale !== 1 ? [outputScale, 0, 0, outputScale, 0, 0] : null;
3031
- const renderContext = {{ canvasContext: ctx, viewport, transform }};
3032
- const renderTask = page.render(renderContext);
3033
- renderTask.promise.then(() => {{
3034
- pageRendering = false;
3035
- document.getElementById('page_num').textContent = String(pageNum);
3036
- if (pageNumPending !== null) {{
3037
- const next = pageNumPending;
3038
- pageNumPending = null;
3039
- renderPage(next);
3040
- }}
3041
- }});
3042
- }});
3043
- }}
3044
-
3045
- function queueRenderPage(num) {{
3046
- if (pageRendering) {{
3047
- pageNumPending = num;
3048
- }} else {{
3049
- renderPage(num);
3050
- }}
3051
- }}
3052
-
3053
- function onPrevPage() {{
3054
- if (pageNum <= 1) return;
3055
- pageNum--;
3056
- queueRenderPage(pageNum);
3057
- }}
3058
-
3059
- function onNextPage() {{
3060
- if (pageNum >= pdfDoc.numPages) return;
3061
- pageNum++;
3062
- queueRenderPage(pageNum);
3063
- }}
3064
-
3065
- function adjustZoom(delta) {{
3066
- zoomLevel = Math.max(0.5, Math.min(3.0, zoomLevel + delta));
3067
- queueRenderPage(pageNum);
3068
- }}
3069
-
3070
- document.getElementById('prev').addEventListener('click', onPrevPage);
3071
- document.getElementById('next').addEventListener('click', onNextPage);
3072
- document.getElementById('zoomOut').addEventListener('click', () => adjustZoom(-0.1));
3073
- document.getElementById('zoomIn').addEventListener('click', () => adjustZoom(0.1));
3074
-
3075
- pdfjsLib.getDocument(url).promise.then((pdfDoc_) => {{
3076
- pdfDoc = pdfDoc_;
3077
- document.getElementById('page_count').textContent = String(pdfDoc.numPages);
3078
- renderPage(pageNum);
3079
- }});
3080
-
3081
- let resizeTimer = null;
3082
- window.addEventListener('resize', () => {{
3083
- if (!pdfDoc) return;
3084
- if (resizeTimer) clearTimeout(resizeTimer);
3085
- resizeTimer = setTimeout(() => queueRenderPage(pageNum), 150);
3086
- }});
3087
- </script>
3088
- """
3089
- return render_page("PDF", body, extra_scripts=extra_scripts + fullscreen_script)
3090
-
3091
- if view == "pdfjs":
3092
- if not pdf_path:
3093
- body = wrap_detail('<div class="warning">PDF not found. Provide --pdf-root to enable PDF viewing.</div>')
3094
- return render_page("PDF Viewer", body, extra_scripts=fullscreen_script)
3095
- viewer_url = _build_pdfjs_viewer_url(pdf_url)
3096
- frame_height = "calc(100vh - 32px)" if embed else "100%"
3097
- body = wrap_detail(
3098
- f"""
3099
- {pdf_only_warning_html}
3100
- <div class="muted">{html.escape(str(pdf_path.name))}</div>
3101
- <iframe class="pdfjs-frame" src="{html.escape(viewer_url)}" title="PDF.js Viewer"></iframe>
3102
- """
3103
- )
3104
- extra_head = f"""
3105
- <style>
3106
- .pdfjs-frame {{
3107
- width: 100%;
3108
- height: {frame_height};
3109
- border: 1px solid #d0d7de;
3110
- border-radius: 10px;
3111
- flex: 1;
3112
- }}
3113
- </style>
3114
- """
3115
- return render_page("PDF Viewer", body, extra_head=extra_head, extra_scripts=fullscreen_script)
3116
-
3117
- selected_tag, available_templates = _select_template_tag(paper, template_param)
3118
- markdown, template_name, warning = _render_paper_markdown(
3119
- paper,
3120
- request.app.state.fallback_language,
3121
- template_tag=selected_tag,
3122
- )
3123
- rendered_html = _render_markdown_with_math_placeholders(md, markdown)
3124
-
3125
- warning_html = f'<div class="warning">{html.escape(warning)}</div>' if warning else ""
3126
- template_controls = f'<div class="muted">Template: {html.escape(template_name)}</div>'
3127
- if available_templates:
3128
- options = "\n".join(
3129
- f'<option value="{html.escape(tag)}"{" selected" if tag == selected_tag else ""}>{html.escape(tag)}</option>'
3130
- for tag in available_templates
3131
- )
3132
- template_controls = f"""
3133
- <div class="muted" style="margin: 6px 0;">
3134
- Template:
3135
- <select id="templateSelect" style="padding:6px 8px; border:1px solid #d0d7de; border-radius:6px;">
3136
- {options}
3137
- </select>
3138
- </div>
3139
- <script>
3140
- const templateSelect = document.getElementById('templateSelect');
3141
- if (templateSelect) {{
3142
- templateSelect.addEventListener('change', () => {{
3143
- const params = new URLSearchParams(window.location.search);
3144
- params.set('view', 'summary');
3145
- params.set('template', templateSelect.value);
3146
- window.location.search = params.toString();
3147
- }});
3148
- }}
3149
- </script>
3150
- """
3151
- content_html = f"""
3152
- {template_controls}
3153
- {warning_html}
3154
- {outline_html}
3155
- <div id="content">{rendered_html}</div>
3156
- """
3157
- body = wrap_detail(content_html)
3158
-
3159
- extra_head = f"""
3160
- <link rel="stylesheet" href="{_CDN_KATEX}" />
3161
- {outline_css}
3162
- """
3163
- extra_scripts = f"""
3164
- <script src="{_CDN_MERMAID}"></script>
3165
- <script src="{_CDN_KATEX_JS}"></script>
3166
- <script src="{_CDN_KATEX_AUTO}"></script>
3167
- <script>
3168
- // Mermaid: convert fenced code blocks to mermaid divs
3169
- document.querySelectorAll('code.language-mermaid').forEach((code) => {{
3170
- const pre = code.parentElement;
3171
- const div = document.createElement('div');
3172
- div.className = 'mermaid';
3173
- div.textContent = code.textContent;
3174
- pre.replaceWith(div);
3175
- }});
3176
- if (window.mermaid) {{
3177
- mermaid.initialize({{ startOnLoad: false }});
3178
- mermaid.run();
3179
- }}
3180
- if (window.renderMathInElement) {{
3181
- renderMathInElement(document.getElementById('content'), {{
3182
- delimiters: [
3183
- {{left: '$$', right: '$$', display: true}},
3184
- {{left: '$', right: '$', display: false}},
3185
- {{left: '\\\\(', right: '\\\\)', display: false}},
3186
- {{left: '\\\\[', right: '\\\\]', display: true}}
3187
- ],
3188
- throwOnError: false
3189
- }});
3190
- }}
3191
- {outline_js}
3192
- </script>
3193
- """
3194
- return render_page(page_title, body, extra_head=extra_head, extra_scripts=extra_scripts + fullscreen_script)
3195
-
3196
-
3197
- async def _api_stats(request: Request) -> JSONResponse:
3198
- index: PaperIndex = request.app.state.index
3199
- return JSONResponse(index.stats)
3200
-
3201
-
3202
- async def _api_pdf(request: Request) -> Response:
3203
- index: PaperIndex = request.app.state.index
3204
- source_hash = request.path_params["source_hash"]
3205
- pdf_path = index.pdf_path_by_hash.get(source_hash)
3206
- if not pdf_path:
3207
- return Response("PDF not found", status_code=404)
3208
- allowed_roots: list[Path] = request.app.state.pdf_roots
3209
- if allowed_roots and not _ensure_under_roots(pdf_path, allowed_roots):
3210
- return Response("Forbidden", status_code=403)
3211
- return FileResponse(pdf_path)
3212
-
3213
-
3214
- async def _stats_page(request: Request) -> HTMLResponse:
3215
- body = """
3216
- <h2>Stats</h2>
3217
- <div class="muted">Charts are rendered with ECharts (CDN).</div>
3218
- <div id="year" style="width:100%;height:360px"></div>
3219
- <div id="month" style="width:100%;height:360px"></div>
3220
- <div id="tags" style="width:100%;height:420px"></div>
3221
- <div id="keywords" style="width:100%;height:420px"></div>
3222
- <div id="authors" style="width:100%;height:420px"></div>
3223
- <div id="venues" style="width:100%;height:420px"></div>
3224
- """
3225
- scripts = f"""
3226
- <script src="{_CDN_ECHARTS}"></script>
3227
- <script>
3228
- async function main() {{
3229
- const res = await fetch('/api/stats');
3230
- const data = await res.json();
3231
-
3232
- function bar(el, title, items) {{
3233
- const chart = echarts.init(document.getElementById(el));
3234
- const labels = items.map(x => x.label);
3235
- const counts = items.map(x => x.count);
3236
- chart.setOption({{
3237
- title: {{ text: title }},
3238
- tooltip: {{ trigger: 'axis' }},
3239
- xAxis: {{ type: 'category', data: labels }},
3240
- yAxis: {{ type: 'value' }},
3241
- series: [{{ type: 'bar', data: counts }}]
3242
- }});
3243
- }}
3244
-
3245
- bar('year', 'Publication Year', data.years || []);
3246
- bar('month', 'Publication Month', data.months || []);
3247
- bar('tags', 'Top Tags', (data.tags || []).slice(0, 20));
3248
- bar('keywords', 'Top Keywords', (data.keywords || []).slice(0, 20));
3249
- bar('authors', 'Top Authors', (data.authors || []).slice(0, 20));
3250
- bar('venues', 'Top Venues', (data.venues || []).slice(0, 20));
3251
- }}
3252
- main();
3253
- </script>
3254
- """
3255
- return HTMLResponse(_page_shell("Stats", body, extra_scripts=scripts))
3256
-
3257
-
3258
- def _normalize_bibtex_title(title: str) -> str:
3259
- value = title.replace("{", "").replace("}", "")
3260
- value = re.sub(r"[^a-z0-9]+", " ", value.lower())
3261
- return re.sub(r"\\s+", " ", value).strip()
3262
-
3263
-
3264
- def _title_similarity(a: str, b: str) -> float:
3265
- import difflib
3266
-
3267
- if not a or not b:
3268
- return 0.0
3269
- return difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()
3270
-
3271
-
3272
- def enrich_with_bibtex(papers: list[dict[str, Any]], bibtex_path: Path) -> None:
3273
- if not PYBTEX_AVAILABLE:
3274
- raise RuntimeError("pybtex is required for --bibtex support")
3275
-
3276
- bib_data = parse_file(str(bibtex_path))
3277
- entries: list[dict[str, Any]] = []
3278
- by_prefix: dict[str, list[int]] = {}
3279
- for key, entry in bib_data.entries.items():
3280
- fields = dict(entry.fields)
3281
- title = str(fields.get("title") or "").strip()
3282
- title_norm = _normalize_bibtex_title(title)
3283
- if not title_norm:
3284
- continue
3285
- record = {
3286
- "key": key,
3287
- "type": entry.type,
3288
- "fields": fields,
3289
- "persons": {role: [str(p) for p in persons] for role, persons in entry.persons.items()},
3290
- "_title_norm": title_norm,
3291
- }
3292
- idx = len(entries)
3293
- entries.append(record)
3294
- prefix = title_norm[:16]
3295
- by_prefix.setdefault(prefix, []).append(idx)
3296
-
3297
- for paper in papers:
3298
- if isinstance(paper.get("bibtex"), dict):
3299
- continue
3300
- title = str(paper.get("paper_title") or "").strip()
3301
- if not title:
3302
- continue
3303
- norm = _normalize_bibtex_title(title)
3304
- if not norm:
3305
- continue
3306
-
3307
- candidates = []
3308
- prefix = norm[:16]
3309
- for cand_idx in by_prefix.get(prefix, []):
3310
- candidates.append(entries[cand_idx])
3311
- if not candidates:
3312
- candidates = entries
3313
-
3314
- best = None
3315
- best_score = 0.0
3316
- for entry in candidates:
3317
- score = _title_similarity(norm, entry["_title_norm"])
3318
- if score > best_score:
3319
- best_score = score
3320
- best = entry
3321
-
3322
- if best is not None and best_score >= 0.9:
3323
- paper["bibtex"] = {k: v for k, v in best.items() if not k.startswith("_")}
28
+ class _NoIndexMiddleware(BaseHTTPMiddleware):
29
+ async def dispatch(self, request: Request, call_next): # type: ignore[override]
30
+ response = await call_next(request)
31
+ response.headers["X-Robots-Tag"] = "noindex, nofollow, noarchive, nosnippet, noai, noimageai"
32
+ return response
3324
33
 
3325
34
 
3326
35
  def create_app(
@@ -3329,38 +38,55 @@ def create_app(
3329
38
  fallback_language: str = "en",
3330
39
  bibtex_path: Path | None = None,
3331
40
  md_roots: list[Path] | None = None,
41
+ md_translated_roots: list[Path] | None = None,
3332
42
  pdf_roots: list[Path] | None = None,
3333
43
  cache_dir: Path | None = None,
3334
44
  use_cache: bool = True,
3335
45
  ) -> Starlette:
3336
- papers = _load_or_merge_papers(db_paths, bibtex_path, cache_dir, use_cache, pdf_roots=pdf_roots)
46
+ papers = load_and_merge_papers(db_paths, bibtex_path, cache_dir, use_cache, pdf_roots=pdf_roots)
3337
47
 
3338
48
  md_roots = md_roots or []
49
+ md_translated_roots = md_translated_roots or []
3339
50
  pdf_roots = pdf_roots or []
3340
- index = build_index(papers, md_roots=md_roots, pdf_roots=pdf_roots)
3341
- md = _md_renderer()
51
+ index = build_index(
52
+ papers,
53
+ md_roots=md_roots,
54
+ md_translated_roots=md_translated_roots,
55
+ pdf_roots=pdf_roots,
56
+ )
57
+ md = create_md_renderer()
3342
58
  routes = [
3343
- Route("/", _index_page, methods=["GET"]),
3344
- Route("/stats", _stats_page, methods=["GET"]),
3345
- Route("/paper/{source_hash:str}", _paper_detail, methods=["GET"]),
3346
- Route("/api/papers", _api_papers, methods=["GET"]),
3347
- Route("/api/stats", _api_stats, methods=["GET"]),
3348
- Route("/api/pdf/{source_hash:str}", _api_pdf, methods=["GET"]),
59
+ Route("/", index_page, methods=["GET"]),
60
+ Route("/robots.txt", robots_txt, methods=["GET"]),
61
+ Route("/stats", stats_page, methods=["GET"]),
62
+ Route("/paper/{source_hash:str}", paper_detail, methods=["GET"]),
63
+ Route("/api/papers", api_papers, methods=["GET"]),
64
+ Route("/api/stats", api_stats, methods=["GET"]),
65
+ Route("/api/pdf/{source_hash:str}", api_pdf, methods=["GET"]),
3349
66
  ]
3350
- if _PDFJS_STATIC_DIR.exists():
67
+ if PDFJS_STATIC_DIR.exists():
3351
68
  routes.append(
3352
69
  Mount(
3353
70
  "/pdfjs",
3354
- app=StaticFiles(directory=str(_PDFJS_STATIC_DIR), html=True),
71
+ app=StaticFiles(directory=str(PDFJS_STATIC_DIR), html=True),
3355
72
  name="pdfjs",
3356
73
  )
3357
74
  )
3358
75
  elif pdf_roots:
3359
76
  logger.warning(
3360
77
  "PDF.js viewer assets not found at %s; PDF Viewer mode will be unavailable.",
3361
- _PDFJS_STATIC_DIR,
78
+ PDFJS_STATIC_DIR,
79
+ )
80
+ if STATIC_DIR.exists():
81
+ routes.append(
82
+ Mount(
83
+ "/static",
84
+ app=StaticFiles(directory=str(STATIC_DIR)),
85
+ name="static",
86
+ )
3362
87
  )
3363
88
  app = Starlette(routes=routes)
89
+ app.add_middleware(_NoIndexMiddleware)
3364
90
  app.state.index = index
3365
91
  app.state.md = md
3366
92
  app.state.fallback_language = fallback_language