deepresearch-flow 0.5.1__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. deepresearch_flow/paper/cli.py +63 -0
  2. deepresearch_flow/paper/config.py +87 -12
  3. deepresearch_flow/paper/db.py +1154 -35
  4. deepresearch_flow/paper/db_ops.py +124 -19
  5. deepresearch_flow/paper/extract.py +1546 -152
  6. deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +2 -0
  7. deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +5 -0
  8. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
  9. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
  10. deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +1 -0
  11. deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +2 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
  14. deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +2 -0
  15. deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
  16. deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
  17. deepresearch_flow/paper/providers/azure_openai.py +45 -3
  18. deepresearch_flow/paper/providers/openai_compatible.py +45 -3
  19. deepresearch_flow/paper/schemas/deep_read_phi_schema.json +1 -0
  20. deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
  21. deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
  22. deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
  23. deepresearch_flow/paper/snapshot/__init__.py +4 -0
  24. deepresearch_flow/paper/snapshot/api.py +941 -0
  25. deepresearch_flow/paper/snapshot/builder.py +965 -0
  26. deepresearch_flow/paper/snapshot/identity.py +239 -0
  27. deepresearch_flow/paper/snapshot/schema.py +245 -0
  28. deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
  29. deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
  30. deepresearch_flow/paper/snapshot/text.py +154 -0
  31. deepresearch_flow/paper/template_registry.py +1 -0
  32. deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
  33. deepresearch_flow/paper/templates/deep_read_phi.md.j2 +4 -0
  34. deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
  35. deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
  36. deepresearch_flow/paper/web/app.py +10 -3
  37. deepresearch_flow/recognize/cli.py +380 -103
  38. deepresearch_flow/recognize/markdown.py +31 -7
  39. deepresearch_flow/recognize/math.py +47 -12
  40. deepresearch_flow/recognize/mermaid.py +320 -10
  41. deepresearch_flow/recognize/organize.py +29 -7
  42. deepresearch_flow/translator/cli.py +71 -20
  43. deepresearch_flow/translator/engine.py +220 -81
  44. deepresearch_flow/translator/prompts.py +19 -2
  45. deepresearch_flow/translator/protector.py +15 -3
  46. deepresearch_flow-0.6.1.dist-info/METADATA +849 -0
  47. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/RECORD +51 -43
  48. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/WHEEL +1 -1
  49. deepresearch_flow-0.5.1.dist-info/METADATA +0 -440
  50. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/entry_points.txt +0 -0
  51. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/licenses/LICENSE +0 -0
  52. {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,941 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ import sqlite3
5
+ from pathlib import Path
6
+ import re
7
+ from typing import Any
8
+ from urllib.parse import quote
9
+
10
+ from starlette.applications import Starlette
11
+ from starlette.middleware.cors import CORSMiddleware
12
+ from starlette.requests import Request
13
+ from starlette.responses import JSONResponse, Response
14
+ from starlette.routing import Route
15
+
16
+ from deepresearch_flow.paper.snapshot.text import merge_adjacent_markers, remove_cjk_spaces, rewrite_search_query
17
+
18
# Matches any run of whitespace characters; used to collapse such runs into a
# single space when normalizing facet values.
_WHITESPACE_RE = re.compile(r"\s+")
19
+
20
+
21
def _normalize_facet_value(value: str) -> str:
    """Return the canonical lookup form of a facet value.

    The input is coerced to ``str`` (any falsy input becomes ``""``), stripped
    of surrounding whitespace, lower-cased, and every internal whitespace run
    is collapsed to a single space.
    """
    text = str(value or "")
    return _WHITESPACE_RE.sub(" ", text.strip().lower())
25
+
26
+
27
# Maps the facet name used in a URL path (singular, plural, and a few short
# aliases such as "templates"/"translations") to the facet_type value stored
# in the facet_node table. Names missing from this table are answered with a
# 404 by the handlers that consult it.
_FACET_TYPE_BY_NAME = {
    "author": "author",
    "authors": "author",
    "institution": "institution",
    "institutions": "institution",
    "venue": "venue",
    "venues": "venue",
    "keyword": "keyword",
    "keywords": "keyword",
    "tag": "tag",
    "tags": "tag",
    "year": "year",
    "years": "year",
    "month": "month",
    "months": "month",
    "summary_template": "summary_template",
    "summary_templates": "summary_template",
    "templates": "summary_template",
    "output_language": "output_language",
    "output_languages": "output_language",
    "provider": "provider",
    "providers": "provider",
    "model": "model",
    "models": "model",
    "prompt_template": "prompt_template",
    "prompt_templates": "prompt_template",
    "translation_lang": "translation_lang",
    "translation_langs": "translation_lang",
    "translations": "translation_lang",
}
57
+
58
# Whitelist of accepted ``sort`` values (besides "relevance") mapped to the
# ORDER BY fragment each one stands for. The fragments are interpolated
# directly into SQL by the search handler, so only keys of this dict may ever
# reach a query. The year sorts place rows whose year is exactly four digits
# ahead of everything else before comparing the year numerically.
_SEARCH_SORTS = {
    "year_desc": (
        "CASE WHEN p.year GLOB '[0-9][0-9][0-9][0-9]' THEN 0 ELSE 1 END, "
        "CAST(p.year AS INT) DESC, LOWER(p.title) ASC"
    ),
    "year_asc": (
        "CASE WHEN p.year GLOB '[0-9][0-9][0-9][0-9]' THEN 0 ELSE 1 END, "
        "CAST(p.year AS INT) ASC, LOWER(p.title) ASC"
    ),
    "title_asc": "LOWER(p.title) ASC",
    "title_desc": "LOWER(p.title) DESC",
    "venue_asc": "LOWER(p.venue) ASC, LOWER(p.title) ASC",
    "venue_desc": "LOWER(p.venue) DESC, LOWER(p.title) ASC",
}
72
+
73
# Maps a facet_type stored in facet_node to the plural key under which related
# facets are grouped in the facet statistics payload. The insertion order of
# this dict determines the key order of that payload's "related" object.
_FACET_TYPE_TO_KEY = {
    "author": "authors",
    "institution": "institutions",
    "venue": "venues",
    "keyword": "keywords",
    "tag": "tags",
    "year": "years",
    "month": "months",
    "summary_template": "summary_templates",
    "output_language": "output_languages",
    "provider": "providers",
    "model": "models",
    "prompt_template": "prompt_templates",
    "translation_lang": "translation_langs",
}
88
+
89
+
90
@dataclass(frozen=True)
class ApiLimits:
    """Immutable request limits enforced by the API handlers."""

    # Longest accepted ``q`` search string, in characters.
    max_query_length: int = 500
    # Largest accepted ``page_size``.
    max_page_size: int = 100
    # Upper bound on how deep a client may paginate.
    max_pagination_offset: int = 10_000  # page * page_size
95
+
96
+
97
@dataclass(frozen=True)
class SnapshotApiConfig:
    """Immutable configuration read by handlers from ``request.app.state.cfg``."""

    # SQLite snapshot database; handlers open it read-only on every request.
    snapshot_db: Path
    # Prefix used to build the URLs of static assets (pdf, md, summaries, ...).
    static_base_url: str
    # NOTE(review): presumably handed to CORSMiddleware when the app is built;
    # that wiring is outside this view — confirm.
    cors_allowed_origins: list[str]
    # Pagination and query-length limits applied to incoming requests.
    limits: ApiLimits
103
+
104
+
105
def _normalize_base_url(value: str) -> str:
    """Return *value* without trailing slashes; falsy input yields ``""``."""
    if not value:
        return ""
    return value.rstrip("/")
107
+
108
+
109
def _json_error(status_code: int, *, error: str, detail: str) -> JSONResponse:
    """Build the API's JSON error response.

    The body is ``{"error": <error>, "detail": <detail>}`` and the HTTP status
    is *status_code*.
    """
    payload = {"error": error, "detail": detail}
    return JSONResponse(payload, status_code=status_code)
111
+
112
+
113
def _open_ro_conn(db_path: Path) -> sqlite3.Connection:
    """Open the snapshot database read-only.

    The connection is opened through a ``file:`` URI with ``mode=ro``, uses
    ``sqlite3.Row`` as row factory, and has ``PRAGMA query_only`` switched on.

    Args:
        db_path: Location of the SQLite database file.

    Returns:
        An open connection; the caller is responsible for closing it.

    Raises:
        sqlite3.Error: If the database cannot be opened or configured.
    """
    # '?', '#' and '%' have special meaning inside a URI. Left unescaped in the
    # path they would truncate it (and could drop the mode=ro parameter), so
    # the path is percent-encoded; '/' and ':' are kept readable.
    encoded_path = quote(db_path.as_posix(), safe="/:")
    conn = sqlite3.connect(f"file:{encoded_path}?mode=ro", uri=True)
    try:
        conn.row_factory = sqlite3.Row
        conn.execute("PRAGMA query_only=ON;")
    except sqlite3.Error:
        # Do not leak the handle when configuration fails.
        conn.close()
        raise
    return conn
119
+
120
+
121
def _snapshot_build_id(conn: sqlite3.Connection) -> str:
    """Return the snapshot build id stored in ``snapshot_meta``.

    Args:
        conn: Open connection whose row factory supports access by column name.

    Returns:
        The value stored under the ``snapshot_build_id`` key, or ``""`` when
        the key is absent or its value is NULL.
    """
    row = conn.execute(
        "SELECT value FROM snapshot_meta WHERE key = 'snapshot_build_id' LIMIT 1"
    ).fetchone()
    # A NULL value must not be stringified to "None": callers append the id to
    # asset URLs as a cache-busting "?v=" parameter.
    if row is None or row["value"] is None:
        return ""
    return str(row["value"])
126
+
127
+
128
def _asset_urls(
    *,
    static_base_url: str,
    snapshot_build_id: str,
    paper_id: str,
    pdf_hash: str | None,
    source_md_hash: str | None,
    translated: dict[str, str],
) -> dict[str, Any]:
    """Build the static asset URLs for one paper.

    Args:
        static_base_url: Prefix of the static site; trailing slashes are ignored.
        snapshot_build_id: When non-empty, appended as ``?v=<id>`` to the
            summary and manifest URLs.
        paper_id: Identifier used in the summary and manifest file names.
        pdf_hash: Content hash of the PDF, or ``None`` when there is no PDF.
        source_md_hash: Content hash of the source markdown, or ``None``.
        translated: Mapping of language code to translated-markdown hash.

    Returns:
        A dict with the keys ``static_base_url``, ``pdf_url``,
        ``source_md_url``, ``translated_md_urls``, ``images_base_url``,
        ``summary_url`` and ``manifest_url``. ``pdf_url`` and
        ``source_md_url`` are ``None`` when the matching hash is missing.
    """
    base = _normalize_base_url(static_base_url)
    version_suffix = f"?v={snapshot_build_id}" if snapshot_build_id else ""
    return {
        "static_base_url": base,
        "pdf_url": f"{base}/pdf/{pdf_hash}.pdf" if pdf_hash else None,
        "source_md_url": f"{base}/md/{source_md_hash}.md" if source_md_hash else None,
        "translated_md_urls": {
            # The language code is a path segment, so it is percent-encoded the
            # same way summary template tags are; the dict key stays raw.
            lang: f"{base}/md_translate/{quote(lang, safe='')}/{md_hash}.md"
            for lang, md_hash in translated.items()
        },
        "images_base_url": f"{base}/images" if base else "",
        "summary_url": f"{base}/summary/{paper_id}.json{version_suffix}",
        "manifest_url": f"{base}/manifest/{paper_id}.json{version_suffix}",
    }
155
+
156
+
157
def _summary_urls(
    *,
    static_base_url: str,
    snapshot_build_id: str,
    paper_id: str,
    template_tags: list[str],
) -> dict[str, str]:
    """Map every summary template tag to the URL of its summary JSON.

    The tag is percent-encoded inside the URL (no characters are treated as
    safe) while the returned dict is keyed by the raw tag. A non-empty
    *snapshot_build_id* is appended as ``?v=<id>``.
    """
    base = _normalize_base_url(static_base_url)
    version_suffix = f"?v={snapshot_build_id}" if snapshot_build_id else ""
    return {
        tag: f"{base}/summary/{paper_id}/{quote(tag, safe='')}.json{version_suffix}"
        for tag in template_tags
    }
173
+
174
+
175
def _list_facet_values(
    conn: sqlite3.Connection,
    *,
    paper_id: str,
    join_table: str,
    facet_table: str,
    facet_id: str,
) -> list[str]:
    """Return the facet values linked to a paper, sorted ascending by value.

    *join_table*, *facet_table* and *facet_id* are interpolated into the SQL
    text as identifiers; only *paper_id* is bound as a parameter.
    """
    query = f"""
        SELECT f.value
        FROM {join_table} j
        JOIN {facet_table} f ON f.{facet_id} = j.{facet_id}
        WHERE j.paper_id = ?
        ORDER BY f.value ASC
        """
    cursor = conn.execute(query, (paper_id,))
    return [str(record["value"]) for record in cursor.fetchall()]
194
+
195
+
196
def _parse_pagination(request: Request, limits: ApiLimits) -> tuple[int, int] | JSONResponse:
    """Read and validate the ``page`` / ``page_size`` query parameters.

    Defaults are page 1 and page size 20.

    Returns:
        ``(page, page_size)`` on success, otherwise a 400 JSON error response
        when a value is not an integer, is not positive, exceeds
        ``limits.max_page_size``, or when ``page * page_size`` exceeds
        ``limits.max_pagination_offset``.
    """
    params = request.query_params
    raw_page = params.get("page", "1")
    raw_page_size = params.get("page_size", "20")
    try:
        page, page_size = int(raw_page), int(raw_page_size)
    except ValueError:
        return _json_error(400, error="invalid_pagination", detail="page and page_size must be integers")

    if page < 1 or page_size < 1:
        return _json_error(400, error="invalid_pagination", detail="page and page_size must be positive")

    if page_size > limits.max_page_size:
        return _json_error(
            400,
            error="page_size_too_large",
            detail=f"page_size must not exceed {limits.max_page_size}",
        )

    if page * page_size > limits.max_pagination_offset:
        return _json_error(
            400,
            error="pagination_too_deep",
            detail="pagination depth exceeds limit",
        )

    return page, page_size
219
+
220
+
221
async def _api_search(request: Request) -> Response:
    """Search papers by full-text query, or browse all papers when ``q`` is empty.

    Query parameters: ``q`` (search text), ``sort`` (``relevance`` or a key of
    ``_SEARCH_SORTS``), ``page`` and ``page_size``. Without an explicit sort,
    searches are ordered by relevance and browsing by ``year_desc``;
    ``relevance`` without a query also falls back to ``year_desc``.

    Returns a JSON page ``{page, page_size, total, has_more, items}``; search
    hits additionally carry a ``snippet_markdown`` field. Invalid pagination,
    an over-long query or an unsupported sort produce a 400 JSON error.
    """
    cfg: SnapshotApiConfig = request.app.state.cfg
    parsed = _parse_pagination(request, cfg.limits)
    if isinstance(parsed, JSONResponse):
        return parsed
    page, page_size = parsed

    params = request.query_params
    q = (params.get("q") or "").strip()
    sort_raw = (params.get("sort") or "").strip().lower()
    if len(q) > cfg.limits.max_query_length:
        return _json_error(
            400,
            error="query_too_long",
            detail=f"q must not exceed {cfg.limits.max_query_length} characters",
        )
    if sort_raw and sort_raw != "relevance" and sort_raw not in _SEARCH_SORTS:
        return _json_error(400, error="invalid_sort", detail="unsupported sort value")

    # Relevance only makes sense when there is a query to rank against.
    sort_key = sort_raw or ("relevance" if q else "year_desc")
    if sort_key == "relevance" and not q:
        sort_key = "year_desc"

    offset = (page - 1) * page_size

    conn = _open_ro_conn(cfg.snapshot_db)
    try:
        build_id = _snapshot_build_id(conn)

        if q:
            match_expr = rewrite_search_query(q)
            if not match_expr:
                # Nothing searchable is left after rewriting the query.
                return JSONResponse({"page": page, "page_size": page_size, "total": 0, "has_more": False, "items": []})

            count_row = conn.execute(
                "SELECT COUNT(*) AS c FROM paper_fts WHERE paper_fts MATCH ?",
                (match_expr,),
            ).fetchone()
            if sort_key == "relevance":
                order_by = "rank"
            else:
                order_by = _SEARCH_SORTS.get(sort_key, _SEARCH_SORTS["year_desc"])
            rows = conn.execute(
                f"""
                SELECT
                    p.paper_id,
                    p.title,
                    p.year,
                    p.venue,
                    p.preferred_summary_template,
                    p.summary_preview,
                    p.paper_index,
                    p.pdf_content_hash,
                    p.source_md_content_hash,
                    snippet(paper_fts, -1, '[[[', ']]]', '…', 30) AS snippet_markdown,
                    bm25(paper_fts, 5.0, 3.0, 1.0, 1.0, 2.0) AS rank
                FROM paper_fts
                JOIN paper p ON p.paper_id = paper_fts.paper_id
                WHERE paper_fts MATCH ?
                ORDER BY {order_by}
                LIMIT ? OFFSET ?
                """,
                (match_expr, page_size, offset),
            ).fetchall()
        else:
            count_row = conn.execute("SELECT COUNT(*) AS c FROM paper").fetchone()
            order_by = _SEARCH_SORTS.get(sort_key, _SEARCH_SORTS["year_desc"])
            rows = conn.execute(
                f"""
                SELECT p.paper_id, p.title, p.year, p.venue, p.preferred_summary_template, p.summary_preview, p.paper_index,
                       p.pdf_content_hash, p.source_md_content_hash
                FROM paper p
                ORDER BY {order_by}
                LIMIT ? OFFSET ?
                """,
                (page_size, offset),
            ).fetchall()

        total = int(count_row["c"]) if count_row else 0

        items: list[dict[str, Any]] = []
        for row in rows:
            paper_id = str(row["paper_id"])
            entry: dict[str, Any] = {
                "paper_id": paper_id,
                "title": str(row["title"]),
                "year": str(row["year"]),
                "venue": str(row["venue"]),
            }
            if q:
                # Only search hits have a snippet column; clean up its markers.
                highlighted = remove_cjk_spaces(str(row["snippet_markdown"] or ""))
                entry["snippet_markdown"] = merge_adjacent_markers(highlighted)

            translation_rows = conn.execute(
                "SELECT lang, md_content_hash FROM paper_translation WHERE paper_id = ?",
                (paper_id,),
            ).fetchall()
            translated = {str(t["lang"]): str(t["md_content_hash"]) for t in translation_rows}
            authors = _list_facet_values(
                conn,
                paper_id=paper_id,
                join_table="paper_author",
                facet_table="author",
                facet_id="author_id",
            )
            pdf_hash = row["pdf_content_hash"]
            source_hash = row["source_md_content_hash"]
            entry.update(
                {
                    "summary_preview": str(row["summary_preview"] or ""),
                    "paper_index": int(row["paper_index"] or 0),
                    "authors": authors,
                    "preferred_summary_template": str(row["preferred_summary_template"] or ""),
                    "has_pdf": bool(pdf_hash),
                    "has_source": bool(source_hash),
                    "has_translated": bool(translated),
                }
            )
            entry.update(
                _asset_urls(
                    static_base_url=cfg.static_base_url,
                    snapshot_build_id=build_id,
                    paper_id=paper_id,
                    pdf_hash=str(pdf_hash) if pdf_hash else None,
                    source_md_hash=str(source_hash) if source_hash else None,
                    translated=translated,
                )
            )
            items.append(entry)

        has_more = bool(items) and (page * page_size) < total
        return JSONResponse({"page": page, "page_size": page_size, "total": total, "has_more": has_more, "items": items})
    finally:
        conn.close()
374
+
375
+
376
async def _api_paper_detail(request: Request) -> Response:
    """Return the metadata, facet values and asset URLs of one paper.

    The paper is selected by the ``paper_id`` path parameter. Responds with a
    404 JSON error when no such paper exists.
    """
    cfg: SnapshotApiConfig = request.app.state.cfg
    paper_id = str(request.path_params["paper_id"])

    # (response key, join table, facet table, id column), in query order.
    facet_specs = (
        ("authors", "paper_author", "author", "author_id"),
        ("keywords", "paper_keyword", "keyword", "keyword_id"),
        ("institutions", "paper_institution", "institution", "institution_id"),
        ("tags", "paper_tag", "tag", "tag_id"),
    )

    conn = _open_ro_conn(cfg.snapshot_db)
    try:
        build_id = _snapshot_build_id(conn)
        paper = conn.execute(
            """
            SELECT paper_id, title, year, venue, preferred_summary_template,
                   output_language, provider, model, prompt_template,
                   pdf_content_hash, source_md_content_hash
            FROM paper
            WHERE paper_id = ?
            """,
            (paper_id,),
        ).fetchone()
        if not paper:
            return _json_error(404, error="not_found", detail="paper not found")

        translation_rows = conn.execute(
            "SELECT lang, md_content_hash FROM paper_translation WHERE paper_id = ?",
            (paper_id,),
        ).fetchall()
        translated = {str(t["lang"]): str(t["md_content_hash"]) for t in translation_rows}
        pdf_hash = paper["pdf_content_hash"]
        source_hash = paper["source_md_content_hash"]
        assets = _asset_urls(
            static_base_url=cfg.static_base_url,
            snapshot_build_id=build_id,
            paper_id=paper_id,
            pdf_hash=str(pdf_hash) if pdf_hash else None,
            source_md_hash=str(source_hash) if source_hash else None,
            translated=translated,
        )

        tag_rows = conn.execute(
            "SELECT template_tag FROM paper_summary WHERE paper_id = ? ORDER BY LOWER(template_tag) ASC",
            (paper_id,),
        ).fetchall()
        summary_urls = _summary_urls(
            static_base_url=cfg.static_base_url,
            snapshot_build_id=build_id,
            paper_id=paper_id,
            template_tags=[str(t["template_tag"]) for t in tag_rows],
        )

        facets = {
            key: _list_facet_values(
                conn,
                paper_id=paper_id,
                join_table=join_table,
                facet_table=facet_table,
                facet_id=id_col,
            )
            for key, join_table, facet_table, id_col in facet_specs
        }

        body: dict[str, Any] = {
            "paper_id": paper_id,
            "title": str(paper["title"]),
            "year": str(paper["year"]),
            "venue": str(paper["venue"]),
            "authors": facets["authors"],
            "keywords": facets["keywords"],
            "institutions": facets["institutions"],
            "tags": facets["tags"],
            "output_language": str(paper["output_language"] or ""),
            "provider": str(paper["provider"] or ""),
            "model": str(paper["model"] or ""),
            "prompt_template": str(paper["prompt_template"] or ""),
            "preferred_summary_template": str(paper["preferred_summary_template"] or ""),
            "summary_urls": summary_urls,
        }
        body.update(assets)
        return JSONResponse(body)
    finally:
        conn.close()
448
+
449
+
450
async def _api_facet_list(request: Request) -> Response:
    """List the values of one facet together with their paper counts.

    ``years`` and ``months`` are read from their count tables; ``authors``,
    ``keywords``, ``institutions``, ``tags`` and ``venues`` from their own
    tables (integer ids); any other name known to ``_FACET_TYPE_BY_NAME`` from
    ``facet_node`` (the value doubles as the id). Unknown facet names produce
    a 404 JSON error. The response is a page
    ``{page, page_size, total, has_more, items}``.
    """
    cfg: SnapshotApiConfig = request.app.state.cfg
    facet = str(request.path_params["facet"])
    parsed = _parse_pagination(request, cfg.limits)
    if isinstance(parsed, JSONResponse):
        return parsed
    page, page_size = parsed
    offset = (page - 1) * page_size

    dedicated_tables = {
        "authors": ("author", "author_id"),
        "keywords": ("keyword", "keyword_id"),
        "institutions": ("institution", "institution_id"),
        "tags": ("tag", "tag_id"),
        "venues": ("venue", "venue_id"),
    }

    def _count_of(count_row: Any) -> int:
        # A missing count row is reported as zero.
        return int(count_row["c"]) if count_row else 0

    conn = _open_ro_conn(cfg.snapshot_db)
    try:
        if facet == "years":
            total = _count_of(conn.execute("SELECT COUNT(*) AS c FROM year_count").fetchone())
            rows = conn.execute(
                """
                SELECT year AS id, year AS value, paper_count
                FROM year_count
                ORDER BY
                    CASE WHEN year GLOB '[0-9][0-9][0-9][0-9]' THEN 0 ELSE 1 END,
                    CAST(year AS INT) DESC,
                    year ASC
                LIMIT ? OFFSET ?
                """,
                (page_size, offset),
            ).fetchall()
            items = [
                {"id": str(rec["id"]), "value": str(rec["value"]), "paper_count": int(rec["paper_count"])}
                for rec in rows
            ]
        elif facet == "months":
            total = _count_of(conn.execute("SELECT COUNT(*) AS c FROM month_count").fetchone())
            rows = conn.execute(
                """
                SELECT month AS id, month AS value, paper_count
                FROM month_count
                ORDER BY
                    CASE WHEN month GLOB '[0-1][0-9]' THEN 0 ELSE 1 END,
                    CAST(month AS INT) ASC,
                    month ASC
                LIMIT ? OFFSET ?
                """,
                (page_size, offset),
            ).fetchall()
            items = [
                {"id": str(rec["id"]), "value": str(rec["value"]), "paper_count": int(rec["paper_count"])}
                for rec in rows
            ]
        elif facet in dedicated_tables:
            table, id_col = dedicated_tables[facet]
            total = _count_of(conn.execute(f"SELECT COUNT(*) AS c FROM {table}").fetchone())
            rows = conn.execute(
                f"""
                SELECT {id_col} AS id, value, paper_count
                FROM {table}
                ORDER BY paper_count DESC, value ASC
                LIMIT ? OFFSET ?
                """,
                (page_size, offset),
            ).fetchall()
            items = [
                {"id": int(rec["id"]), "value": str(rec["value"]), "paper_count": int(rec["paper_count"])}
                for rec in rows
            ]
        else:
            facet_type = _FACET_TYPE_BY_NAME.get(facet)
            if not facet_type:
                return _json_error(404, error="not_found", detail="facet not found")
            total = _count_of(
                conn.execute(
                    "SELECT COUNT(*) AS c FROM facet_node WHERE facet_type = ?",
                    (facet_type,),
                ).fetchone()
            )
            rows = conn.execute(
                """
                SELECT value, paper_count
                FROM facet_node
                WHERE facet_type = ?
                ORDER BY paper_count DESC, value ASC
                LIMIT ? OFFSET ?
                """,
                (facet_type, page_size, offset),
            ).fetchall()
            # Generic facet nodes are addressed by value, so it serves as id too.
            items = [
                {"id": str(rec["value"]), "value": str(rec["value"]), "paper_count": int(rec["paper_count"])}
                for rec in rows
            ]

        has_more = bool(items) and (page * page_size) < total
        return JSONResponse({"page": page, "page_size": page_size, "total": total, "has_more": has_more, "items": items})
    finally:
        conn.close()
545
+
546
+
547
async def _api_facet_papers(request: Request) -> Response:
    """List the papers attached to one facet entry, addressed by facet id.

    For ``years`` and ``months`` the ``facet_id`` path parameter is the year or
    month value itself; for ``authors``, ``keywords``, ``institutions``,
    ``tags`` and ``venues`` it is the id column of the matching join table.
    Any other facet name produces a 404 JSON error. The response is a page
    ``{page, page_size, total, has_more, items}``.
    """
    cfg: SnapshotApiConfig = request.app.state.cfg
    facet = str(request.path_params["facet"])
    facet_id = str(request.path_params["facet_id"])
    parsed = _parse_pagination(request, cfg.limits)
    if isinstance(parsed, JSONResponse):
        return parsed
    page, page_size = parsed
    offset = (page - 1) * page_size

    join_tables = {
        "authors": ("paper_author", "author_id"),
        "keywords": ("paper_keyword", "keyword_id"),
        "institutions": ("paper_institution", "institution_id"),
        "tags": ("paper_tag", "tag_id"),
        "venues": ("paper_venue", "venue_id"),
    }

    conn = _open_ro_conn(cfg.snapshot_db)
    try:
        if facet == "years":
            count_row = conn.execute("SELECT paper_count AS c FROM year_count WHERE year = ?", (facet_id,)).fetchone()
            rows = conn.execute(
                """
                SELECT paper_id, title, year, venue, summary_preview, pdf_content_hash, source_md_content_hash
                FROM paper
                WHERE year = ?
                ORDER BY LOWER(title) ASC
                LIMIT ? OFFSET ?
                """,
                (facet_id, page_size, offset),
            ).fetchall()
        elif facet == "months":
            count_row = conn.execute(
                "SELECT paper_count AS c FROM month_count WHERE month = ?",
                (facet_id,),
            ).fetchone()
            rows = conn.execute(
                """
                SELECT paper_id, title, year, venue, summary_preview, pdf_content_hash, source_md_content_hash
                FROM paper
                WHERE month = ?
                ORDER BY
                    CASE WHEN year GLOB '[0-9][0-9][0-9][0-9]' THEN 0 ELSE 1 END,
                    CAST(year AS INT) DESC,
                    LOWER(title) ASC
                LIMIT ? OFFSET ?
                """,
                (facet_id, page_size, offset),
            ).fetchall()
        elif facet in join_tables:
            join_table, id_col = join_tables[facet]
            count_row = conn.execute(
                f"SELECT COUNT(*) AS c FROM {join_table} WHERE {id_col} = ?",
                (facet_id,),
            ).fetchone()
            rows = conn.execute(
                f"""
                SELECT p.paper_id, p.title, p.year, p.venue, p.summary_preview, p.pdf_content_hash, p.source_md_content_hash
                FROM {join_table} j
                JOIN paper p ON p.paper_id = j.paper_id
                WHERE j.{id_col} = ?
                ORDER BY
                    CASE WHEN p.year GLOB '[0-9][0-9][0-9][0-9]' THEN 0 ELSE 1 END,
                    CAST(p.year AS INT) DESC,
                    LOWER(p.title) ASC
                LIMIT ? OFFSET ?
                """,
                (facet_id, page_size, offset),
            ).fetchall()
        else:
            return _json_error(404, error="not_found", detail="facet not found")

        total = int(count_row["c"]) if count_row else 0

        build_id = _snapshot_build_id(conn)
        items: list[dict[str, Any]] = []
        for row in rows:
            paper_id = str(row["paper_id"])
            translation_rows = conn.execute(
                "SELECT lang, md_content_hash FROM paper_translation WHERE paper_id = ?",
                (paper_id,),
            ).fetchall()
            translated = {str(t["lang"]): str(t["md_content_hash"]) for t in translation_rows}
            authors = _list_facet_values(
                conn,
                paper_id=paper_id,
                join_table="paper_author",
                facet_table="author",
                facet_id="author_id",
            )
            pdf_hash = row["pdf_content_hash"]
            source_hash = row["source_md_content_hash"]
            entry: dict[str, Any] = {
                "paper_id": paper_id,
                "title": str(row["title"]),
                "year": str(row["year"]),
                "venue": str(row["venue"]),
                "summary_preview": str(row["summary_preview"] or ""),
                "authors": authors,
                "has_pdf": bool(pdf_hash),
                "has_source": bool(source_hash),
                "has_translated": bool(translated),
            }
            entry.update(
                _asset_urls(
                    static_base_url=cfg.static_base_url,
                    snapshot_build_id=build_id,
                    paper_id=paper_id,
                    pdf_hash=str(pdf_hash) if pdf_hash else None,
                    source_md_hash=str(source_hash) if source_hash else None,
                    translated=translated,
                )
            )
            items.append(entry)

        has_more = bool(items) and (page * page_size) < total
        return JSONResponse({"page": page, "page_size": page_size, "total": total, "has_more": has_more, "items": items})
    finally:
        conn.close()
659
+
660
+
661
def _facet_node_id(conn: sqlite3.Connection, facet_type: str, value: str) -> int | None:
    """Look up the ``facet_node`` id for a facet type and value.

    *value* is normalized before the lookup. Returns ``None`` when the
    normalized value is empty or no matching node exists.
    """
    normalized = _normalize_facet_value(value)
    if normalized:
        match = conn.execute(
            "SELECT node_id FROM facet_node WHERE facet_type = ? AND value = ?",
            (facet_type, normalized),
        ).fetchone()
        if match:
            return int(match["node_id"])
    return None
670
+
671
+
672
def _facet_stats_for_node(conn: sqlite3.Connection, *, facet_type: str, value: str) -> dict[str, Any]:
    """Return the paper count of a facet value and the facets related to it.

    Returns:
        ``{"facet_type", "value", "total", "related"}`` where ``value`` is the
        normalized input and ``related`` maps every key of
        ``_FACET_TYPE_TO_KEY`` to a list of ``{"value", "paper_count"}``
        entries taken from ``facet_edge``. When the node is unknown, ``total``
        is 0 and every related list is empty.
    """
    normalized = _normalize_facet_value(value)
    related: dict[str, list[dict[str, Any]]] = {group: [] for group in _FACET_TYPE_TO_KEY.values()}
    total = 0

    node_id = _facet_node_id(conn, facet_type, value)
    if node_id is not None:
        count_row = conn.execute(
            "SELECT paper_count FROM facet_node WHERE node_id = ?",
            (node_id,),
        ).fetchone()
        if count_row:
            total = int(count_row["paper_count"])

        # An edge stores the node on either side, so join on "the other end".
        neighbours = conn.execute(
            """
            SELECT n.facet_type AS facet_type, n.value AS value, e.paper_count AS paper_count
            FROM facet_edge e
            JOIN facet_node n
                ON n.node_id = CASE WHEN e.node_id_a = ? THEN e.node_id_b ELSE e.node_id_a END
            WHERE e.node_id_a = ? OR e.node_id_b = ?
            ORDER BY e.paper_count DESC, n.value ASC
            """,
            (node_id, node_id, node_id),
        ).fetchall()
        for neighbour in neighbours:
            group = _FACET_TYPE_TO_KEY.get(str(neighbour["facet_type"]))
            if group:
                related[group].append(
                    {"value": str(neighbour["value"]), "paper_count": int(neighbour["paper_count"])}
                )

    return {"facet_type": facet_type, "value": normalized, "total": total, "related": related}
709
+
710
+
711
async def _api_facet_by_value_papers(request: Request) -> Response:
    """List the papers attached to a facet entry addressed by its value.

    The ``facet`` path parameter must be a key of ``_FACET_TYPE_BY_NAME``
    (otherwise a 404 JSON error is returned). A ``value`` that matches no
    facet node yields an empty page rather than an error. The response is a
    page ``{page, page_size, total, has_more, items}``.
    """
    cfg: SnapshotApiConfig = request.app.state.cfg
    facet = str(request.path_params["facet"])
    raw_value = str(request.path_params["value"])
    parsed = _parse_pagination(request, cfg.limits)
    if isinstance(parsed, JSONResponse):
        return parsed
    page, page_size = parsed
    offset = (page - 1) * page_size

    facet_type = _FACET_TYPE_BY_NAME.get(facet)
    if not facet_type:
        return _json_error(404, error="not_found", detail="facet not found")

    conn = _open_ro_conn(cfg.snapshot_db)
    try:
        node_id = _facet_node_id(conn, facet_type, raw_value)
        if node_id is None:
            return JSONResponse({"page": page, "page_size": page_size, "total": 0, "has_more": False, "items": []})

        count_row = conn.execute(
            "SELECT paper_count FROM facet_node WHERE node_id = ?",
            (node_id,),
        ).fetchone()
        total = int(count_row["paper_count"]) if count_row else 0

        rows = conn.execute(
            """
            SELECT p.paper_id, p.title, p.year, p.venue, p.summary_preview, p.pdf_content_hash, p.source_md_content_hash
            FROM paper_facet pf
            JOIN paper p ON p.paper_id = pf.paper_id
            WHERE pf.node_id = ?
            ORDER BY
                CASE WHEN p.year GLOB '[0-9][0-9][0-9][0-9]' THEN 0 ELSE 1 END,
                CAST(p.year AS INT) DESC,
                LOWER(p.title) ASC
            LIMIT ? OFFSET ?
            """,
            (node_id, page_size, offset),
        ).fetchall()

        build_id = _snapshot_build_id(conn)
        items: list[dict[str, Any]] = []
        for row in rows:
            paper_id = str(row["paper_id"])
            translation_rows = conn.execute(
                "SELECT lang, md_content_hash FROM paper_translation WHERE paper_id = ?",
                (paper_id,),
            ).fetchall()
            translated = {str(t["lang"]): str(t["md_content_hash"]) for t in translation_rows}
            authors = _list_facet_values(
                conn,
                paper_id=paper_id,
                join_table="paper_author",
                facet_table="author",
                facet_id="author_id",
            )
            pdf_hash = row["pdf_content_hash"]
            source_hash = row["source_md_content_hash"]
            entry: dict[str, Any] = {
                "paper_id": paper_id,
                "title": str(row["title"]),
                "year": str(row["year"]),
                "venue": str(row["venue"]),
                "summary_preview": str(row["summary_preview"] or ""),
                "authors": authors,
                "has_pdf": bool(pdf_hash),
                "has_source": bool(source_hash),
                "has_translated": bool(translated),
            }
            entry.update(
                _asset_urls(
                    static_base_url=cfg.static_base_url,
                    snapshot_build_id=build_id,
                    paper_id=paper_id,
                    pdf_hash=str(pdf_hash) if pdf_hash else None,
                    source_md_hash=str(source_hash) if source_hash else None,
                    translated=translated,
                )
            )
            items.append(entry)

        has_more = bool(items) and (page * page_size) < total
        return JSONResponse({"page": page, "page_size": page_size, "total": total, "has_more": has_more, "items": items})
    finally:
        conn.close()
789
+
790
+
791
async def _api_facet_by_value_stats(request: Request) -> Response:
    """Serve statistics for a facet node addressed by its raw value.

    Reads the ``facet`` and ``value`` path parameters. When the facet name has
    no entry in ``_FACET_TYPE_BY_NAME`` a 404 JSON error is returned; otherwise
    the result of ``_facet_stats_for_node`` is returned as JSON.
    """
    cfg: SnapshotApiConfig = request.app.state.cfg
    path_params = request.path_params
    facet_name = str(path_params["facet"])
    facet_value = str(path_params["value"])

    resolved_type = _FACET_TYPE_BY_NAME.get(facet_name)
    if resolved_type:
        db = _open_ro_conn(cfg.snapshot_db)
        try:
            payload = _facet_stats_for_node(db, facet_type=resolved_type, value=facet_value)
            return JSONResponse(payload)
        finally:
            # Close the connection whether or not the stats lookup succeeded.
            db.close()

    return _json_error(404, error="not_found", detail="facet not found")
804
+
805
+
806
async def _api_facet_stats(request: Request) -> Response:
    """Serve statistics for a facet node addressed by its id.

    Reads the ``facet`` and ``facet_id`` path parameters. Unknown facet names
    (no entry in ``_FACET_TYPE_BY_NAME``) yield a 404 JSON error. For facets
    backed by a lookup table the id is translated to the stored ``value``;
    in every other case (``years``/``months``, a facet without a lookup table,
    an id with no matching row, or an empty stored value) the id itself is
    used as the value passed to ``_facet_stats_for_node``.
    """
    cfg: SnapshotApiConfig = request.app.state.cfg
    facet_name = str(request.path_params["facet"])
    facet_id = str(request.path_params["facet_id"])

    resolved_type = _FACET_TYPE_BY_NAME.get(facet_name)
    if not resolved_type:
        return _json_error(404, error="not_found", detail="facet not found")

    # facet name -> (table holding the values, its id column)
    lookup_tables = {
        "authors": ("author", "author_id"),
        "keywords": ("keyword", "keyword_id"),
        "institutions": ("institution", "institution_id"),
        "tags": ("tag", "tag_id"),
        "venues": ("venue", "venue_id"),
    }

    db = _open_ro_conn(cfg.snapshot_db)
    try:
        resolved_value = ""
        if facet_name not in ("years", "months") and facet_name in lookup_tables:
            table_name, id_column = lookup_tables[facet_name]
            # Identifiers come from the fixed table above; only the id is bound.
            hit = db.execute(
                f"SELECT value FROM {table_name} WHERE {id_column} = ?",
                (facet_id,),
            ).fetchone()
            if hit:
                resolved_value = str(hit["value"])

        # An empty/unresolved value falls back to the id taken from the path.
        stats = _facet_stats_for_node(
            db,
            facet_type=resolved_type,
            value=resolved_value or facet_id,
        )
        return JSONResponse(stats)
    finally:
        db.close()
842
+
843
+
844
async def _api_stats(request: Request) -> Response:
    """Serve snapshot-wide statistics as JSON.

    The payload contains the total paper count, per-year counts (at most 50
    rows, four-digit years first in descending order), per-month counts
    (two-digit months first in ascending order) and the 20 highest
    ``paper_count`` entries for authors, venues, institutions, keywords and
    tags.
    """
    cfg: SnapshotApiConfig = request.app.state.cfg
    db = _open_ro_conn(cfg.snapshot_db)
    try:

        def as_counts(rows) -> list[dict[str, Any]]:
            # Convert result rows into JSON-serialisable {value, paper_count} dicts.
            return [
                {"value": str(r["value"]), "paper_count": int(r["paper_count"])}
                for r in rows
            ]

        def top(table: str, *, limit: int = 20) -> list[dict[str, Any]]:
            # `table` is only ever one of the fixed names used below.
            ranked = db.execute(
                f"""
                SELECT value, paper_count
                FROM {table}
                ORDER BY paper_count DESC, value ASC
                LIMIT ?
                """,
                (limit,),
            ).fetchall()
            return as_counts(ranked)

        count_row = db.execute("SELECT COUNT(*) AS c FROM paper").fetchone()
        total = int(count_row["c"]) if count_row else 0

        year_rows = db.execute(
            """
            SELECT year AS value, paper_count
            FROM year_count
            ORDER BY
            CASE WHEN year GLOB '[0-9][0-9][0-9][0-9]' THEN 0 ELSE 1 END,
            CAST(year AS INT) DESC,
            year ASC
            LIMIT 50
            """
        ).fetchall()
        month_rows = db.execute(
            """
            SELECT month AS value, paper_count
            FROM month_count
            ORDER BY
            CASE WHEN month GLOB '[0-1][0-9]' THEN 0 ELSE 1 END,
            CAST(month AS INT) ASC,
            month ASC
            """
        ).fetchall()

        payload: dict[str, Any] = {
            "total": total,
            "years": as_counts(year_rows),
            "months": as_counts(month_rows),
        }
        # Response key -> backing table, in the order the keys are emitted.
        for key, table in (
            ("authors", "author"),
            ("venues", "venue"),
            ("institutions", "institution"),
            ("keywords", "keyword"),
            ("tags", "tag"),
        ):
            payload[key] = top(table)

        return JSONResponse(payload)
    finally:
        db.close()
899
+
900
+
901
async def _api_config(request: Request) -> Response:
    """Return the configured ``static_base_url`` as a JSON object."""
    cfg: SnapshotApiConfig = request.app.state.cfg
    payload = {"static_base_url": cfg.static_base_url}
    return JSONResponse(payload)
904
+
905
+
906
def create_app(
    *,
    snapshot_db: Path,
    static_base_url: str,
    cors_allowed_origins: list[str] | None = None,
    limits: ApiLimits | None = None,
) -> Starlette:
    """Build the Starlette application that serves the snapshot API.

    Args:
        snapshot_db: Path stored on the config as ``snapshot_db``; handlers
            pass it to ``_open_ro_conn``.
        static_base_url: Base URL, passed through ``_normalize_base_url``
            before being stored.
        cors_allowed_origins: Origins handed to ``CORSMiddleware``; a missing
            or empty list is replaced by ``["*"]``.
        limits: API limits; a default ``ApiLimits()`` is used when omitted.

    Returns:
        The application with all ``/api/v1`` GET routes registered and the
        config available as ``app.state.cfg``.
    """
    origins = cors_allowed_origins or ["*"]
    api_limits = limits or ApiLimits()
    cfg = SnapshotApiConfig(
        snapshot_db=snapshot_db,
        static_base_url=_normalize_base_url(static_base_url),
        cors_allowed_origins=origins,
        limits=api_limits,
    )

    # (path, handler) pairs; every endpoint is GET-only.
    endpoints = (
        ("/api/v1/config", _api_config),
        ("/api/v1/search", _api_search),
        ("/api/v1/stats", _api_stats),
        ("/api/v1/papers/{paper_id:str}", _api_paper_detail),
        ("/api/v1/facets/{facet:str}", _api_facet_list),
        ("/api/v1/facets/{facet:str}/{facet_id:str}/papers", _api_facet_papers),
        ("/api/v1/facets/{facet:str}/{facet_id:str}/stats", _api_facet_stats),
        ("/api/v1/facets/{facet:str}/by-value/{value:str}/papers", _api_facet_by_value_papers),
        ("/api/v1/facets/{facet:str}/by-value/{value:str}/stats", _api_facet_by_value_stats),
    )
    routes = [Route(path, handler, methods=["GET"]) for path, handler in endpoints]

    app = Starlette(routes=routes)
    if cfg.cors_allowed_origins:
        app.add_middleware(
            CORSMiddleware,
            allow_origins=cfg.cors_allowed_origins,
            allow_methods=["*"],
            allow_headers=["*"],
        )
    app.state.cfg = cfg
    return app