deepresearch-flow 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (417)
  1. deepresearch_flow/__init__.py +5 -0
  2. deepresearch_flow/cli.py +23 -0
  3. deepresearch_flow/paper/__init__.py +1 -0
  4. deepresearch_flow/paper/cli.py +286 -0
  5. deepresearch_flow/paper/config.py +249 -0
  6. deepresearch_flow/paper/db.py +768 -0
  7. deepresearch_flow/paper/extract.py +870 -0
  8. deepresearch_flow/paper/llm.py +115 -0
  9. deepresearch_flow/paper/prompt_templates/__init__.py +1 -0
  10. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +6 -0
  11. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +82 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +6 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +28 -0
  14. deepresearch_flow/paper/prompt_templates/simple_system.j2 +6 -0
  15. deepresearch_flow/paper/prompt_templates/simple_user.j2 +24 -0
  16. deepresearch_flow/paper/prompt_templates/three_pass_system.j2 +6 -0
  17. deepresearch_flow/paper/prompt_templates/three_pass_user.j2 +44 -0
  18. deepresearch_flow/paper/prompts.py +11 -0
  19. deepresearch_flow/paper/providers/__init__.py +1 -0
  20. deepresearch_flow/paper/providers/azure_openai.py +66 -0
  21. deepresearch_flow/paper/providers/base.py +19 -0
  22. deepresearch_flow/paper/providers/claude.py +71 -0
  23. deepresearch_flow/paper/providers/dashscope.py +58 -0
  24. deepresearch_flow/paper/providers/gemini.py +116 -0
  25. deepresearch_flow/paper/providers/ollama.py +46 -0
  26. deepresearch_flow/paper/providers/openai_compatible.py +60 -0
  27. deepresearch_flow/paper/render.py +64 -0
  28. deepresearch_flow/paper/schema.py +58 -0
  29. deepresearch_flow/paper/schemas/__init__.py +1 -0
  30. deepresearch_flow/paper/schemas/deep_read_schema.json +46 -0
  31. deepresearch_flow/paper/schemas/default_paper_schema.json +47 -0
  32. deepresearch_flow/paper/schemas/eight_questions_schema.json +34 -0
  33. deepresearch_flow/paper/schemas/three_pass_schema.json +24 -0
  34. deepresearch_flow/paper/template_registry.py +189 -0
  35. deepresearch_flow/paper/templates/__init__.py +1 -0
  36. deepresearch_flow/paper/templates/deep_read.md.j2 +79 -0
  37. deepresearch_flow/paper/templates/default_paper.md.j2 +32 -0
  38. deepresearch_flow/paper/templates/eight_questions.md.j2 +49 -0
  39. deepresearch_flow/paper/templates/three_pass.md.j2 +28 -0
  40. deepresearch_flow/paper/utils.py +136 -0
  41. deepresearch_flow/paper/web/__init__.py +2 -0
  42. deepresearch_flow/paper/web/app.py +2307 -0
  43. deepresearch_flow/paper/web/pdfjs/LICENSE +177 -0
  44. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-EUC-H.bcmap +0 -0
  45. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-EUC-V.bcmap +0 -0
  46. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-H.bcmap +0 -0
  47. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-RKSJ-H.bcmap +0 -0
  48. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-RKSJ-V.bcmap +0 -0
  49. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-V.bcmap +0 -0
  50. deepresearch_flow/paper/web/pdfjs/web/cmaps/78ms-RKSJ-H.bcmap +0 -0
  51. deepresearch_flow/paper/web/pdfjs/web/cmaps/78ms-RKSJ-V.bcmap +0 -0
  52. deepresearch_flow/paper/web/pdfjs/web/cmaps/83pv-RKSJ-H.bcmap +0 -0
  53. deepresearch_flow/paper/web/pdfjs/web/cmaps/90ms-RKSJ-H.bcmap +0 -0
  54. deepresearch_flow/paper/web/pdfjs/web/cmaps/90ms-RKSJ-V.bcmap +0 -0
  55. deepresearch_flow/paper/web/pdfjs/web/cmaps/90msp-RKSJ-H.bcmap +0 -0
  56. deepresearch_flow/paper/web/pdfjs/web/cmaps/90msp-RKSJ-V.bcmap +0 -0
  57. deepresearch_flow/paper/web/pdfjs/web/cmaps/90pv-RKSJ-H.bcmap +0 -0
  58. deepresearch_flow/paper/web/pdfjs/web/cmaps/90pv-RKSJ-V.bcmap +0 -0
  59. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-H.bcmap +0 -0
  60. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-RKSJ-H.bcmap +0 -0
  61. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-RKSJ-V.bcmap +0 -0
  62. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-V.bcmap +0 -0
  63. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-0.bcmap +0 -0
  64. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-1.bcmap +0 -0
  65. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-2.bcmap +0 -0
  66. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-3.bcmap +0 -0
  67. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-4.bcmap +0 -0
  68. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-5.bcmap +0 -0
  69. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-6.bcmap +0 -0
  70. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-UCS2.bcmap +0 -0
  71. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-0.bcmap +0 -0
  72. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-1.bcmap +0 -0
  73. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-2.bcmap +0 -0
  74. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-3.bcmap +0 -0
  75. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-4.bcmap +0 -0
  76. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-5.bcmap +0 -0
  77. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-UCS2.bcmap +0 -0
  78. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-0.bcmap +0 -0
  79. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-1.bcmap +0 -0
  80. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-2.bcmap +0 -0
  81. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-3.bcmap +0 -0
  82. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-4.bcmap +0 -0
  83. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-5.bcmap +0 -0
  84. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-6.bcmap +0 -0
  85. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-UCS2.bcmap +0 -0
  86. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-0.bcmap +0 -0
  87. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-1.bcmap +0 -0
  88. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-2.bcmap +0 -0
  89. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-UCS2.bcmap +0 -0
  90. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5-H.bcmap +0 -0
  91. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5-V.bcmap +0 -0
  92. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5pc-H.bcmap +0 -0
  93. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5pc-V.bcmap +0 -0
  94. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS-EUC-H.bcmap +0 -0
  95. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS-EUC-V.bcmap +0 -0
  96. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS1-H.bcmap +0 -0
  97. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS1-V.bcmap +0 -0
  98. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS2-H.bcmap +0 -0
  99. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS2-V.bcmap +3 -0
  100. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETHK-B5-H.bcmap +0 -0
  101. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETHK-B5-V.bcmap +0 -0
  102. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETen-B5-H.bcmap +0 -0
  103. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETen-B5-V.bcmap +0 -0
  104. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETenms-B5-H.bcmap +3 -0
  105. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETenms-B5-V.bcmap +0 -0
  106. deepresearch_flow/paper/web/pdfjs/web/cmaps/EUC-H.bcmap +0 -0
  107. deepresearch_flow/paper/web/pdfjs/web/cmaps/EUC-V.bcmap +0 -0
  108. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-H.bcmap +0 -0
  109. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-RKSJ-H.bcmap +0 -0
  110. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-RKSJ-V.bcmap +0 -0
  111. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-V.bcmap +0 -0
  112. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-EUC-H.bcmap +0 -0
  113. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-EUC-V.bcmap +0 -0
  114. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-H.bcmap +4 -0
  115. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-V.bcmap +0 -0
  116. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK-EUC-H.bcmap +0 -0
  117. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK-EUC-V.bcmap +0 -0
  118. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK2K-H.bcmap +0 -0
  119. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK2K-V.bcmap +0 -0
  120. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBKp-EUC-H.bcmap +0 -0
  121. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBKp-EUC-V.bcmap +0 -0
  122. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-EUC-H.bcmap +0 -0
  123. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-EUC-V.bcmap +0 -0
  124. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-H.bcmap +0 -0
  125. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-V.bcmap +0 -0
  126. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBTpc-EUC-H.bcmap +0 -0
  127. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBTpc-EUC-V.bcmap +0 -0
  128. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBpc-EUC-H.bcmap +0 -0
  129. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBpc-EUC-V.bcmap +0 -0
  130. deepresearch_flow/paper/web/pdfjs/web/cmaps/H.bcmap +0 -0
  131. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdla-B5-H.bcmap +0 -0
  132. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdla-B5-V.bcmap +0 -0
  133. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdlb-B5-H.bcmap +0 -0
  134. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdlb-B5-V.bcmap +0 -0
  135. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKgccs-B5-H.bcmap +0 -0
  136. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKgccs-B5-V.bcmap +0 -0
  137. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm314-B5-H.bcmap +0 -0
  138. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm314-B5-V.bcmap +0 -0
  139. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm471-B5-H.bcmap +0 -0
  140. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm471-B5-V.bcmap +0 -0
  141. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKscs-B5-H.bcmap +0 -0
  142. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKscs-B5-V.bcmap +0 -0
  143. deepresearch_flow/paper/web/pdfjs/web/cmaps/Hankaku.bcmap +0 -0
  144. deepresearch_flow/paper/web/pdfjs/web/cmaps/Hiragana.bcmap +0 -0
  145. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-EUC-H.bcmap +0 -0
  146. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-EUC-V.bcmap +0 -0
  147. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-H.bcmap +0 -0
  148. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-Johab-H.bcmap +0 -0
  149. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-Johab-V.bcmap +0 -0
  150. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-V.bcmap +0 -0
  151. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-H.bcmap +0 -0
  152. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-HW-H.bcmap +0 -0
  153. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-HW-V.bcmap +0 -0
  154. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-V.bcmap +0 -0
  155. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCpc-EUC-H.bcmap +0 -0
  156. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCpc-EUC-V.bcmap +0 -0
  157. deepresearch_flow/paper/web/pdfjs/web/cmaps/Katakana.bcmap +0 -0
  158. deepresearch_flow/paper/web/pdfjs/web/cmaps/LICENSE +36 -0
  159. deepresearch_flow/paper/web/pdfjs/web/cmaps/NWP-H.bcmap +0 -0
  160. deepresearch_flow/paper/web/pdfjs/web/cmaps/NWP-V.bcmap +0 -0
  161. deepresearch_flow/paper/web/pdfjs/web/cmaps/RKSJ-H.bcmap +0 -0
  162. deepresearch_flow/paper/web/pdfjs/web/cmaps/RKSJ-V.bcmap +0 -0
  163. deepresearch_flow/paper/web/pdfjs/web/cmaps/Roman.bcmap +0 -0
  164. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UCS2-H.bcmap +0 -0
  165. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UCS2-V.bcmap +0 -0
  166. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF16-H.bcmap +0 -0
  167. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF16-V.bcmap +0 -0
  168. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF32-H.bcmap +0 -0
  169. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF32-V.bcmap +0 -0
  170. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF8-H.bcmap +0 -0
  171. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF8-V.bcmap +0 -0
  172. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UCS2-H.bcmap +0 -0
  173. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UCS2-V.bcmap +0 -0
  174. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF16-H.bcmap +0 -0
  175. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF16-V.bcmap +0 -0
  176. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF32-H.bcmap +0 -0
  177. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF32-V.bcmap +0 -0
  178. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF8-H.bcmap +0 -0
  179. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF8-V.bcmap +0 -0
  180. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-H.bcmap +0 -0
  181. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-HW-H.bcmap +0 -0
  182. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-HW-V.bcmap +0 -0
  183. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-V.bcmap +0 -0
  184. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF16-H.bcmap +0 -0
  185. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF16-V.bcmap +0 -0
  186. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF32-H.bcmap +0 -0
  187. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF32-V.bcmap +0 -0
  188. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF8-H.bcmap +0 -0
  189. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF8-V.bcmap +0 -0
  190. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF16-H.bcmap +0 -0
  191. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF16-V.bcmap +0 -0
  192. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF32-H.bcmap +0 -0
  193. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF32-V.bcmap +0 -0
  194. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF8-H.bcmap +0 -0
  195. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF8-V.bcmap +0 -0
  196. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UCS2-HW-V.bcmap +0 -0
  197. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UCS2-V.bcmap +0 -0
  198. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UTF8-V.bcmap +0 -0
  199. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX0213-UTF32-H.bcmap +0 -0
  200. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX0213-UTF32-V.bcmap +0 -0
  201. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX02132004-UTF32-H.bcmap +0 -0
  202. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX02132004-UTF32-V.bcmap +0 -0
  203. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UCS2-H.bcmap +0 -0
  204. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UCS2-V.bcmap +0 -0
  205. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF16-H.bcmap +0 -0
  206. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF16-V.bcmap +0 -0
  207. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF32-H.bcmap +0 -0
  208. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF32-V.bcmap +0 -0
  209. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF8-H.bcmap +0 -0
  210. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF8-V.bcmap +0 -0
  211. deepresearch_flow/paper/web/pdfjs/web/cmaps/V.bcmap +0 -0
  212. deepresearch_flow/paper/web/pdfjs/web/cmaps/WP-Symbol.bcmap +0 -0
  213. deepresearch_flow/paper/web/pdfjs/web/compressed.tracemonkey-pldi-09.pdf +0 -0
  214. deepresearch_flow/paper/web/pdfjs/web/debugger.css +111 -0
  215. deepresearch_flow/paper/web/pdfjs/web/debugger.js +611 -0
  216. deepresearch_flow/paper/web/pdfjs/web/images/altText_add.svg +3 -0
  217. deepresearch_flow/paper/web/pdfjs/web/images/altText_done.svg +3 -0
  218. deepresearch_flow/paper/web/pdfjs/web/images/annotation-check.svg +11 -0
  219. deepresearch_flow/paper/web/pdfjs/web/images/annotation-comment.svg +16 -0
  220. deepresearch_flow/paper/web/pdfjs/web/images/annotation-help.svg +26 -0
  221. deepresearch_flow/paper/web/pdfjs/web/images/annotation-insert.svg +10 -0
  222. deepresearch_flow/paper/web/pdfjs/web/images/annotation-key.svg +11 -0
  223. deepresearch_flow/paper/web/pdfjs/web/images/annotation-newparagraph.svg +11 -0
  224. deepresearch_flow/paper/web/pdfjs/web/images/annotation-noicon.svg +7 -0
  225. deepresearch_flow/paper/web/pdfjs/web/images/annotation-note.svg +42 -0
  226. deepresearch_flow/paper/web/pdfjs/web/images/annotation-paperclip.svg +6 -0
  227. deepresearch_flow/paper/web/pdfjs/web/images/annotation-paragraph.svg +16 -0
  228. deepresearch_flow/paper/web/pdfjs/web/images/annotation-pushpin.svg +7 -0
  229. deepresearch_flow/paper/web/pdfjs/web/images/cursor-editorFreeText.svg +3 -0
  230. deepresearch_flow/paper/web/pdfjs/web/images/cursor-editorInk.svg +4 -0
  231. deepresearch_flow/paper/web/pdfjs/web/images/findbarButton-next.svg +3 -0
  232. deepresearch_flow/paper/web/pdfjs/web/images/findbarButton-previous.svg +3 -0
  233. deepresearch_flow/paper/web/pdfjs/web/images/gv-toolbarButton-download.svg +3 -0
  234. deepresearch_flow/paper/web/pdfjs/web/images/gv-toolbarButton-openinapp.svg +11 -0
  235. deepresearch_flow/paper/web/pdfjs/web/images/loading-dark.svg +24 -0
  236. deepresearch_flow/paper/web/pdfjs/web/images/loading-icon.gif +0 -0
  237. deepresearch_flow/paper/web/pdfjs/web/images/loading.svg +1 -0
  238. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-documentProperties.svg +3 -0
  239. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-firstPage.svg +3 -0
  240. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-handTool.svg +3 -0
  241. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-lastPage.svg +3 -0
  242. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-rotateCcw.svg +3 -0
  243. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-rotateCw.svg +3 -0
  244. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollHorizontal.svg +3 -0
  245. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollPage.svg +3 -0
  246. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollVertical.svg +3 -0
  247. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollWrapped.svg +3 -0
  248. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-selectTool.svg +3 -0
  249. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadEven.svg +3 -0
  250. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadNone.svg +3 -0
  251. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadOdd.svg +3 -0
  252. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-bookmark.svg +3 -0
  253. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-currentOutlineItem.svg +3 -0
  254. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-download.svg +4 -0
  255. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorFreeText.svg +3 -0
  256. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorInk.svg +4 -0
  257. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorStamp.svg +8 -0
  258. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-menuArrow.svg +3 -0
  259. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-openFile.svg +3 -0
  260. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-pageDown.svg +3 -0
  261. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-pageUp.svg +3 -0
  262. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-presentationMode.svg +3 -0
  263. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-print.svg +3 -0
  264. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-search.svg +3 -0
  265. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-secondaryToolbarToggle.svg +3 -0
  266. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-sidebarToggle.svg +3 -0
  267. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewAttachments.svg +3 -0
  268. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewLayers.svg +3 -0
  269. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewOutline.svg +3 -0
  270. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewThumbnail.svg +3 -0
  271. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-zoomIn.svg +3 -0
  272. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-zoomOut.svg +3 -0
  273. deepresearch_flow/paper/web/pdfjs/web/images/treeitem-collapsed.svg +1 -0
  274. deepresearch_flow/paper/web/pdfjs/web/images/treeitem-expanded.svg +1 -0
  275. deepresearch_flow/paper/web/pdfjs/web/locale/ach/viewer.properties +203 -0
  276. deepresearch_flow/paper/web/pdfjs/web/locale/af/viewer.properties +156 -0
  277. deepresearch_flow/paper/web/pdfjs/web/locale/an/viewer.properties +222 -0
  278. deepresearch_flow/paper/web/pdfjs/web/locale/ar/viewer.properties +224 -0
  279. deepresearch_flow/paper/web/pdfjs/web/locale/ast/viewer.properties +185 -0
  280. deepresearch_flow/paper/web/pdfjs/web/locale/az/viewer.properties +222 -0
  281. deepresearch_flow/paper/web/pdfjs/web/locale/be/viewer.properties +270 -0
  282. deepresearch_flow/paper/web/pdfjs/web/locale/bg/viewer.properties +214 -0
  283. deepresearch_flow/paper/web/pdfjs/web/locale/bn/viewer.properties +218 -0
  284. deepresearch_flow/paper/web/pdfjs/web/locale/bo/viewer.properties +217 -0
  285. deepresearch_flow/paper/web/pdfjs/web/locale/br/viewer.properties +224 -0
  286. deepresearch_flow/paper/web/pdfjs/web/locale/brx/viewer.properties +184 -0
  287. deepresearch_flow/paper/web/pdfjs/web/locale/bs/viewer.properties +173 -0
  288. deepresearch_flow/paper/web/pdfjs/web/locale/ca/viewer.properties +256 -0
  289. deepresearch_flow/paper/web/pdfjs/web/locale/cak/viewer.properties +253 -0
  290. deepresearch_flow/paper/web/pdfjs/web/locale/ckb/viewer.properties +213 -0
  291. deepresearch_flow/paper/web/pdfjs/web/locale/cs/viewer.properties +284 -0
  292. deepresearch_flow/paper/web/pdfjs/web/locale/cy/viewer.properties +270 -0
  293. deepresearch_flow/paper/web/pdfjs/web/locale/da/viewer.properties +270 -0
  294. deepresearch_flow/paper/web/pdfjs/web/locale/de/viewer.properties +270 -0
  295. deepresearch_flow/paper/web/pdfjs/web/locale/dsb/viewer.properties +284 -0
  296. deepresearch_flow/paper/web/pdfjs/web/locale/el/viewer.properties +270 -0
  297. deepresearch_flow/paper/web/pdfjs/web/locale/en-CA/viewer.properties +270 -0
  298. deepresearch_flow/paper/web/pdfjs/web/locale/en-GB/viewer.properties +284 -0
  299. deepresearch_flow/paper/web/pdfjs/web/locale/en-US/viewer.properties +282 -0
  300. deepresearch_flow/paper/web/pdfjs/web/locale/eo/viewer.properties +270 -0
  301. deepresearch_flow/paper/web/pdfjs/web/locale/es-AR/viewer.properties +284 -0
  302. deepresearch_flow/paper/web/pdfjs/web/locale/es-CL/viewer.properties +284 -0
  303. deepresearch_flow/paper/web/pdfjs/web/locale/es-ES/viewer.properties +270 -0
  304. deepresearch_flow/paper/web/pdfjs/web/locale/es-MX/viewer.properties +257 -0
  305. deepresearch_flow/paper/web/pdfjs/web/locale/et/viewer.properties +229 -0
  306. deepresearch_flow/paper/web/pdfjs/web/locale/eu/viewer.properties +284 -0
  307. deepresearch_flow/paper/web/pdfjs/web/locale/fa/viewer.properties +221 -0
  308. deepresearch_flow/paper/web/pdfjs/web/locale/ff/viewer.properties +214 -0
  309. deepresearch_flow/paper/web/pdfjs/web/locale/fi/viewer.properties +270 -0
  310. deepresearch_flow/paper/web/pdfjs/web/locale/fr/viewer.properties +270 -0
  311. deepresearch_flow/paper/web/pdfjs/web/locale/fur/viewer.properties +270 -0
  312. deepresearch_flow/paper/web/pdfjs/web/locale/fy-NL/viewer.properties +270 -0
  313. deepresearch_flow/paper/web/pdfjs/web/locale/ga-IE/viewer.properties +181 -0
  314. deepresearch_flow/paper/web/pdfjs/web/locale/gd/viewer.properties +257 -0
  315. deepresearch_flow/paper/web/pdfjs/web/locale/gl/viewer.properties +267 -0
  316. deepresearch_flow/paper/web/pdfjs/web/locale/gn/viewer.properties +278 -0
  317. deepresearch_flow/paper/web/pdfjs/web/locale/gu-IN/viewer.properties +214 -0
  318. deepresearch_flow/paper/web/pdfjs/web/locale/he/viewer.properties +283 -0
  319. deepresearch_flow/paper/web/pdfjs/web/locale/hi-IN/viewer.properties +227 -0
  320. deepresearch_flow/paper/web/pdfjs/web/locale/hr/viewer.properties +243 -0
  321. deepresearch_flow/paper/web/pdfjs/web/locale/hsb/viewer.properties +284 -0
  322. deepresearch_flow/paper/web/pdfjs/web/locale/hu/viewer.properties +284 -0
  323. deepresearch_flow/paper/web/pdfjs/web/locale/hy-AM/viewer.properties +232 -0
  324. deepresearch_flow/paper/web/pdfjs/web/locale/hye/viewer.properties +229 -0
  325. deepresearch_flow/paper/web/pdfjs/web/locale/ia/viewer.properties +284 -0
  326. deepresearch_flow/paper/web/pdfjs/web/locale/id/viewer.properties +253 -0
  327. deepresearch_flow/paper/web/pdfjs/web/locale/is/viewer.properties +284 -0
  328. deepresearch_flow/paper/web/pdfjs/web/locale/it/viewer.properties +284 -0
  329. deepresearch_flow/paper/web/pdfjs/web/locale/ja/viewer.properties +270 -0
  330. deepresearch_flow/paper/web/pdfjs/web/locale/ka/viewer.properties +284 -0
  331. deepresearch_flow/paper/web/pdfjs/web/locale/kab/viewer.properties +264 -0
  332. deepresearch_flow/paper/web/pdfjs/web/locale/kk/viewer.properties +284 -0
  333. deepresearch_flow/paper/web/pdfjs/web/locale/km/viewer.properties +189 -0
  334. deepresearch_flow/paper/web/pdfjs/web/locale/kn/viewer.properties +166 -0
  335. deepresearch_flow/paper/web/pdfjs/web/locale/ko/viewer.properties +284 -0
  336. deepresearch_flow/paper/web/pdfjs/web/locale/lij/viewer.properties +214 -0
  337. deepresearch_flow/paper/web/pdfjs/web/locale/lo/viewer.properties +257 -0
  338. deepresearch_flow/paper/web/pdfjs/web/locale/locale.properties +333 -0
  339. deepresearch_flow/paper/web/pdfjs/web/locale/lt/viewer.properties +229 -0
  340. deepresearch_flow/paper/web/pdfjs/web/locale/ltg/viewer.properties +192 -0
  341. deepresearch_flow/paper/web/pdfjs/web/locale/lv/viewer.properties +214 -0
  342. deepresearch_flow/paper/web/pdfjs/web/locale/meh/viewer.properties +106 -0
  343. deepresearch_flow/paper/web/pdfjs/web/locale/mk/viewer.properties +211 -0
  344. deepresearch_flow/paper/web/pdfjs/web/locale/mr/viewer.properties +210 -0
  345. deepresearch_flow/paper/web/pdfjs/web/locale/ms/viewer.properties +214 -0
  346. deepresearch_flow/paper/web/pdfjs/web/locale/my/viewer.properties +170 -0
  347. deepresearch_flow/paper/web/pdfjs/web/locale/nb-NO/viewer.properties +284 -0
  348. deepresearch_flow/paper/web/pdfjs/web/locale/ne-NP/viewer.properties +197 -0
  349. deepresearch_flow/paper/web/pdfjs/web/locale/nl/viewer.properties +274 -0
  350. deepresearch_flow/paper/web/pdfjs/web/locale/nn-NO/viewer.properties +270 -0
  351. deepresearch_flow/paper/web/pdfjs/web/locale/oc/viewer.properties +278 -0
  352. deepresearch_flow/paper/web/pdfjs/web/locale/pa-IN/viewer.properties +270 -0
  353. deepresearch_flow/paper/web/pdfjs/web/locale/pl/viewer.properties +270 -0
  354. deepresearch_flow/paper/web/pdfjs/web/locale/pt-BR/viewer.properties +270 -0
  355. deepresearch_flow/paper/web/pdfjs/web/locale/pt-PT/viewer.properties +270 -0
  356. deepresearch_flow/paper/web/pdfjs/web/locale/rm/viewer.properties +270 -0
  357. deepresearch_flow/paper/web/pdfjs/web/locale/ro/viewer.properties +220 -0
  358. deepresearch_flow/paper/web/pdfjs/web/locale/ru/viewer.properties +270 -0
  359. deepresearch_flow/paper/web/pdfjs/web/locale/sat/viewer.properties +270 -0
  360. deepresearch_flow/paper/web/pdfjs/web/locale/sc/viewer.properties +258 -0
  361. deepresearch_flow/paper/web/pdfjs/web/locale/scn/viewer.properties +101 -0
  362. deepresearch_flow/paper/web/pdfjs/web/locale/sco/viewer.properties +226 -0
  363. deepresearch_flow/paper/web/pdfjs/web/locale/si/viewer.properties +228 -0
  364. deepresearch_flow/paper/web/pdfjs/web/locale/sk/viewer.properties +270 -0
  365. deepresearch_flow/paper/web/pdfjs/web/locale/skr/viewer.properties +264 -0
  366. deepresearch_flow/paper/web/pdfjs/web/locale/sl/viewer.properties +284 -0
  367. deepresearch_flow/paper/web/pdfjs/web/locale/son/viewer.properties +152 -0
  368. deepresearch_flow/paper/web/pdfjs/web/locale/sq/viewer.properties +247 -0
  369. deepresearch_flow/paper/web/pdfjs/web/locale/sr/viewer.properties +259 -0
  370. deepresearch_flow/paper/web/pdfjs/web/locale/sv-SE/viewer.properties +284 -0
  371. deepresearch_flow/paper/web/pdfjs/web/locale/szl/viewer.properties +224 -0
  372. deepresearch_flow/paper/web/pdfjs/web/locale/ta/viewer.properties +173 -0
  373. deepresearch_flow/paper/web/pdfjs/web/locale/te/viewer.properties +216 -0
  374. deepresearch_flow/paper/web/pdfjs/web/locale/tg/viewer.properties +281 -0
  375. deepresearch_flow/paper/web/pdfjs/web/locale/th/viewer.properties +270 -0
  376. deepresearch_flow/paper/web/pdfjs/web/locale/tl/viewer.properties +222 -0
  377. deepresearch_flow/paper/web/pdfjs/web/locale/tr/viewer.properties +283 -0
  378. deepresearch_flow/paper/web/pdfjs/web/locale/trs/viewer.properties +184 -0
  379. deepresearch_flow/paper/web/pdfjs/web/locale/uk/viewer.properties +284 -0
  380. deepresearch_flow/paper/web/pdfjs/web/locale/ur/viewer.properties +218 -0
  381. deepresearch_flow/paper/web/pdfjs/web/locale/uz/viewer.properties +142 -0
  382. deepresearch_flow/paper/web/pdfjs/web/locale/vi/viewer.properties +270 -0
  383. deepresearch_flow/paper/web/pdfjs/web/locale/wo/viewer.properties +104 -0
  384. deepresearch_flow/paper/web/pdfjs/web/locale/xh/viewer.properties +156 -0
  385. deepresearch_flow/paper/web/pdfjs/web/locale/zh-CN/viewer.properties +284 -0
  386. deepresearch_flow/paper/web/pdfjs/web/locale/zh-TW/viewer.properties +281 -0
  387. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitDingbats.pfb +0 -0
  388. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixed.pfb +0 -0
  389. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedBold.pfb +0 -0
  390. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedBoldItalic.pfb +0 -0
  391. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedItalic.pfb +0 -0
  392. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerif.pfb +0 -0
  393. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifBold.pfb +0 -0
  394. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifBoldItalic.pfb +0 -0
  395. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifItalic.pfb +0 -0
  396. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSymbol.pfb +0 -0
  397. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LICENSE_FOXIT +27 -0
  398. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LICENSE_LIBERATION +102 -0
  399. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Bold.ttf +0 -0
  400. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-BoldItalic.ttf +0 -0
  401. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Italic.ttf +0 -0
  402. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Regular.ttf +0 -0
  403. deepresearch_flow/paper/web/pdfjs/web/viewer.css +3528 -0
  404. deepresearch_flow/paper/web/pdfjs/web/viewer.html +486 -0
  405. deepresearch_flow/paper/web/pdfjs/web/viewer.js +14099 -0
  406. deepresearch_flow/paper/web/pdfjs/web/viewer.js.map +1 -0
  407. deepresearch_flow/paper/web/query.py +90 -0
  408. deepresearch_flow/recognize/__init__.py +1 -0
  409. deepresearch_flow/recognize/cli.py +469 -0
  410. deepresearch_flow/recognize/markdown.py +277 -0
  411. deepresearch_flow/recognize/organize.py +95 -0
  412. deepresearch_flow-0.1.1.dist-info/METADATA +416 -0
  413. deepresearch_flow-0.1.1.dist-info/RECORD +417 -0
  414. deepresearch_flow-0.1.1.dist-info/WHEEL +5 -0
  415. deepresearch_flow-0.1.1.dist-info/entry_points.txt +2 -0
  416. deepresearch_flow-0.1.1.dist-info/licenses/LICENSE +21 -0
  417. deepresearch_flow-0.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,2307 @@
1
+ from __future__ import annotations
2
+
3
+ import html
4
+ import json
5
+ import logging
6
+ from dataclasses import dataclass
7
+ from html.parser import HTMLParser
8
+ from pathlib import Path
9
+ from typing import Any
10
+ import re
11
+ from urllib.parse import urlencode, quote
12
+
13
+ from markdown_it import MarkdownIt
14
+ from starlette.applications import Starlette
15
+ from starlette.requests import Request
16
+ from starlette.responses import FileResponse, HTMLResponse, JSONResponse, RedirectResponse, Response
17
+ from starlette.routing import Mount, Route
18
+ from starlette.staticfiles import StaticFiles
19
+
20
+ from deepresearch_flow.paper.render import load_default_template
21
+ from deepresearch_flow.paper.template_registry import (
22
+ list_template_names_in_registry_order,
23
+ load_render_template,
24
+ load_schema_for_template,
25
+ )
26
+ from deepresearch_flow.paper.utils import stable_hash
27
+ from deepresearch_flow.paper.web.query import Query, QueryTerm, parse_query
28
+
29
# pybtex is treated as optional: PYBTEX_AVAILABLE records whether the import
# of ``parse_file`` succeeded so callers can check before using it.
try:
    from pybtex.database import parse_file
    PYBTEX_AVAILABLE = True
except Exception:
    PYBTEX_AVAILABLE = False


# jsDelivr URLs for third-party front-end assets (charts, diagrams, math
# rendering, PDF rendering), pinned to specific major or exact versions.
_CDN_ECHARTS = "https://cdn.jsdelivr.net/npm/echarts@5/dist/echarts.min.js"
_CDN_MERMAID = "https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"
_CDN_KATEX = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.css"
_CDN_KATEX_JS = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/katex.min.js"
_CDN_KATEX_AUTO = "https://cdn.jsdelivr.net/npm/katex@0.16.10/dist/contrib/auto-render.min.js"
# Use legacy builds to ensure `pdfjsLib` is available as a global.
_CDN_PDFJS = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.min.js"
_CDN_PDFJS_WORKER = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.worker.min.js"
# URL path of the bundled PDF.js viewer page, and the ``pdfjs`` directory that
# sits next to this module on disk.
_PDFJS_VIEWER_PATH = "/pdfjs/web/viewer.html"
_PDFJS_STATIC_DIR = Path(__file__).resolve().parent / "pdfjs"

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
48
+
49
+
50
@dataclass(frozen=True)
class PaperIndex:
    """Immutable bundle of lookup tables produced by ``build_index``.

    Integer ids stored in the tables are positions in ``papers``.
    """

    # Paper records; ``build_index`` adds derived ``_``-prefixed keys to each.
    papers: list[dict[str, Any]]
    # Source hash (as a string) -> paper id.
    id_by_hash: dict[str, int]
    # Paper ids sorted by year (newest first, non-numeric years last), then title.
    ordered_ids: list[int]
    # Normalized (stripped, lower-cased) facet value -> ids of matching papers.
    by_tag: dict[str, set[int]]
    by_author: dict[str, set[int]]
    by_year: dict[str, set[int]]
    by_month: dict[str, set[int]]
    by_venue: dict[str, set[int]]
    # Aggregates: "total" plus label/count lists for years, months, tags,
    # authors and venues.
    stats: dict[str, Any]
    # Source hash string (or the paper id as a string when a paper has no
    # hash) -> resolved Markdown / PDF file path.
    md_path_by_hash: dict[str, Path]
    pdf_path_by_hash: dict[str, Path]
63
+
64
+
65
def _split_csv(values: list[str]) -> list[str]:
    """Split each entry on commas and return the stripped, non-empty pieces.

    Pieces keep the order in which they appear across ``values``.
    """
    pieces = (chunk.strip() for raw in values for chunk in raw.split(","))
    return [piece for piece in pieces if piece]
73
+
74
+
75
def _normalize_key(value: str) -> str:
    """Return ``value`` trimmed of surrounding whitespace and lower-cased."""
    trimmed = value.strip()
    return trimmed.lower()
77
+
78
+
79
def _parse_year_month(date_str: str | None) -> tuple[str | None, str | None]:
    """Extract a four-digit year and a two-digit month from free-form date text.

    The year is the first ``19xx``/``20xx`` digit run in the text. The month is
    taken from a numeric ``YYYY-MM`` / ``YYYY/MM`` pattern when that month is in
    1..12, otherwise from an English month name or abbreviation.

    Month words are recognised only as whole alphabetic tokens, so ordinary
    words that merely contain an abbreviation (``"Summary"`` contains
    ``"mar"``, ``"Decision"`` starts with ``"dec"``) are not read as months.

    Args:
        date_str: Raw date text; ``None`` or empty yields ``(None, None)``.

    Returns:
        ``(year, month)`` where either element is ``None`` when not found.
    """
    if not date_str:
        return None, None
    text = str(date_str).strip()

    year_match = re.search(r"(19|20)\d{2}", text)
    year = year_match.group(0) if year_match else None

    numeric_match = re.search(r"(19|20)\d{2}[-/](\d{1,2})", text)
    if numeric_match:
        number = int(numeric_match.group(2))
        if 1 <= number <= 12:
            return year, f"{number:02d}"

    lookup = {
        "january": "01",
        "february": "02",
        "march": "03",
        "april": "04",
        "may": "05",
        "june": "06",
        "july": "07",
        "august": "08",
        "september": "09",
        "october": "10",
        "november": "11",
        "december": "12",
        "jan": "01",
        "feb": "02",
        "mar": "03",
        "apr": "04",
        "jun": "06",
        "jul": "07",
        "aug": "08",
        "sep": "09",
        "sept": "09",
        "oct": "10",
        "nov": "11",
        "dec": "12",
    }
    # Longest alternatives first; the look-arounds reject matches that are
    # glued to other letters, while digits and punctuation next to the word
    # ("Jan2020", "Sept.") are still accepted.
    alternatives = "|".join(sorted(lookup, key=len, reverse=True))
    month_word = re.search(rf"(?<![a-z])({alternatives})(?![a-z])", text.lower())
    month = lookup[month_word.group(1)] if month_word else None
    return year, month
131
+
132
+
133
def _normalize_month_token(value: str | int | None) -> str | None:
    """Convert a month given as a number or English name to ``"01"``..``"12"``.

    Accepts integers, digit strings, and full or abbreviated English month
    names (case-insensitive). Surrounding braces and a trailing period are
    ignored so tokens such as ``"{March}"`` or ``"Mar."`` are recognised.

    Returns:
        The zero-padded month, or ``None`` when the value is missing, out of
        range, or not recognised.
    """
    if value is None:
        return None
    if isinstance(value, int):
        return f"{value:02d}" if 1 <= value <= 12 else None

    raw = str(value).strip().lower().strip("{}").rstrip(".").strip()
    if not raw:
        return None
    if raw.isdigit():
        # ``isdigit`` is also true for characters such as superscript digits
        # that ``int`` rejects, so guard the conversion instead of crashing.
        try:
            number = int(raw)
        except ValueError:
            return None
        return f"{number:02d}" if 1 <= number <= 12 else None

    lookup = {
        "january": "01",
        "february": "02",
        "march": "03",
        "april": "04",
        "may": "05",
        "june": "06",
        "july": "07",
        "august": "08",
        "september": "09",
        "october": "10",
        "november": "11",
        "december": "12",
        "jan": "01",
        "feb": "02",
        "mar": "03",
        "apr": "04",
        "jun": "06",
        "jul": "07",
        "aug": "08",
        "sep": "09",
        "sept": "09",
        "oct": "10",
        "nov": "11",
        "dec": "12",
    }
    return lookup.get(raw)
172
+
173
+
174
def _extract_authors(paper: dict[str, Any]) -> list[str]:
    """Return the author names stored under ``paper["paper_authors"]``.

    A list is stringified item by item, a string is split on commas, and any
    other non-``None`` value becomes a single-element list. Blank names are
    dropped for the list and string forms.
    """
    raw = paper.get("paper_authors")
    if raw is None:
        return []
    if isinstance(raw, list):
        names = (str(entry).strip() for entry in raw)
    elif isinstance(raw, str):
        names = (chunk.strip() for chunk in raw.split(","))
    else:
        return [str(raw)]
    return [name for name in names if name]
183
+
184
+
185
def _extract_tags(paper: dict[str, Any]) -> list[str]:
    """Return the stripped, non-empty entries of ``paper["ai_generated_tags"]``.

    Anything other than a list (including a missing key) yields ``[]``.
    """
    raw = paper.get("ai_generated_tags") or []
    if not isinstance(raw, list):
        return []
    cleaned = (str(entry).strip() for entry in raw)
    return [tag for tag in cleaned if tag]
190
+
191
+
192
def _extract_venue(paper: dict[str, Any]) -> str:
    """Return the venue name for a paper, preferring BibTeX data.

    For an ``article`` entry the ``journal`` field is used; for
    ``inproceedings``/``conference``/``proceedings`` the ``booktitle`` field.
    Otherwise (or when that field is empty) ``publication_venue`` is returned,
    with an empty string as the final fallback.
    """
    bib = paper.get("bibtex")
    if isinstance(bib, dict):
        fields = bib.get("fields") or {}
        kind = (bib.get("type") or "").lower()
        if kind == "article":
            field_name = "journal"
        elif kind in {"inproceedings", "conference", "proceedings"}:
            field_name = "booktitle"
        else:
            field_name = None
        if field_name is not None:
            found = fields.get(field_name)
            if found:
                return str(found)
    return str(paper.get("publication_venue") or "")
202
+
203
+
204
def build_index(
    papers: list[dict[str, Any]],
    *,
    md_roots: list[Path] | None = None,
    pdf_roots: list[Path] | None = None,
) -> PaperIndex:
    """Build the in-memory lookup tables for a list of paper records.

    Each paper dict is annotated in place with derived keys (``_title_lc``,
    ``_year``, ``_month``, ``_venue``, ``_authors``, ``_tags``,
    ``_search_lc``). Facet tables map normalized values to paper ids (list
    positions), and per-facet counts are collected into ``stats``.

    Args:
        papers: Paper records; mutated in place.
        md_roots: Directories searched for ``.md`` sources.
        pdf_roots: Directories searched for ``.pdf`` files.

    Returns:
        A ``PaperIndex`` covering all of ``papers``.
    """
    hash_to_id: dict[str, int] = {}
    tag_index: dict[str, set[int]] = {}
    author_index: dict[str, set[int]] = {}
    year_index: dict[str, set[int]] = {}
    month_index: dict[str, set[int]] = {}
    venue_index: dict[str, set[int]] = {}

    md_by_hash: dict[str, Path] = {}
    pdf_by_hash: dict[str, Path] = {}

    md_files = _build_file_index(md_roots or [], suffixes={".md"})
    pdf_files = _build_file_index(pdf_roots or [], suffixes={".pdf"})

    year_tally: dict[str, int] = {}
    month_tally: dict[str, int] = {}
    tag_tally: dict[str, int] = {}
    author_tally: dict[str, int] = {}
    venue_tally: dict[str, int] = {}

    def record(
        index: dict[str, set[int]],
        tally: dict[str, int],
        key: str,
        label: str,
        paper_id: int,
    ) -> None:
        # File the paper under the normalized key and count the display label.
        index.setdefault(key, set()).add(paper_id)
        tally[label] = tally.get(label, 0) + 1

    for paper_id, paper in enumerate(papers):
        # Fall back to hashing the source path when no hash was stored.
        source_hash = paper.get("source_hash")
        if not source_hash and paper.get("source_path"):
            source_hash = stable_hash(str(paper.get("source_path")))
        if source_hash:
            hash_to_id[str(source_hash)] = paper_id

        title = str(paper.get("paper_title") or "")
        paper["_title_lc"] = title.lower()

        bibtex = paper.get("bibtex")
        bib_fields: dict[str, Any] = {}
        if isinstance(bibtex, dict):
            bib_fields = bibtex.get("fields", {}) or {}

        # BibTeX year/month win; the free-form publication date fills gaps.
        bib_year = bib_fields.get("year")
        year = str(bib_year) if bib_year and str(bib_year).isdigit() else None
        month = _normalize_month_token(bib_fields.get("month"))
        if not (year and month):
            parsed_year, parsed_month = _parse_year_month(str(paper.get("publication_date") or ""))
            year = year or parsed_year
            month = month or parsed_month

        year_label = year or "Unknown"
        month_label = month or "Unknown"
        paper["_year"] = year_label
        paper["_month"] = month_label
        record(year_index, year_tally, _normalize_key(year_label), year_label, paper_id)
        record(month_index, month_tally, _normalize_key(month_label), month_label, paper_id)

        venue = _extract_venue(paper).strip()
        paper["_venue"] = venue
        if venue:
            record(venue_index, venue_tally, _normalize_key(venue), venue, paper_id)
        else:
            record(venue_index, venue_tally, "unknown", "Unknown", paper_id)

        authors = _extract_authors(paper)
        paper["_authors"] = authors
        for author in authors:
            record(author_index, author_tally, _normalize_key(author), author, paper_id)

        tags = _extract_tags(paper)
        paper["_tags"] = tags
        for tag in tags:
            record(tag_index, tag_tally, _normalize_key(tag), tag, paper_id)

        # Lower-cased blob of title, venue, authors and tags.
        searchable = [title, venue, " ".join(authors), " ".join(tags)]
        paper["_search_lc"] = " ".join(piece for piece in searchable if piece).lower()

        # Papers without a hash are keyed by their position instead.
        file_key = str(source_hash) if source_hash else str(paper_id)
        md_path = _resolve_source_md(paper, md_files)
        if md_path is not None:
            md_by_hash[file_key] = md_path
        pdf_path = _resolve_pdf(paper, pdf_files)
        if pdf_path is not None:
            pdf_by_hash[file_key] = pdf_path

    def display_rank(paper_id: int) -> tuple[int, int, str]:
        # Numeric years first (newest first), then everything else; ties by title.
        entry = papers[paper_id]
        year_label = str(entry.get("_year") or "Unknown")
        title_lc = str(entry.get("paper_title") or "").lower()
        if year_label.isdigit():
            return (0, -int(year_label), title_lc)
        return (1, 0, title_lc)

    ordered_ids = sorted(range(len(papers)), key=display_rank)

    stats = {
        "total": len(papers),
        "years": _sorted_counts(year_tally, numeric_desc=True),
        "months": _sorted_month_counts(month_tally),
        "tags": _sorted_counts(tag_tally),
        "authors": _sorted_counts(author_tally),
        "venues": _sorted_counts(venue_tally),
    }

    return PaperIndex(
        papers=papers,
        id_by_hash=hash_to_id,
        ordered_ids=ordered_ids,
        by_tag=tag_index,
        by_author=author_index,
        by_year=year_index,
        by_month=month_index,
        by_venue=venue_index,
        stats=stats,
        md_path_by_hash=md_by_hash,
        pdf_path_by_hash=pdf_by_hash,
    )
330
+
331
+
332
def _sorted_counts(counts: dict[str, int], *, numeric_desc: bool = False) -> list[dict[str, Any]]:
    """Turn a label->count mapping into a sorted list of label/count dicts.

    With ``numeric_desc`` the all-digit labels come first in descending
    numeric order, followed by the remaining labels. Otherwise entries are
    ordered by count, largest first. Ties keep their insertion order.
    """
    if numeric_desc:
        def rank(entry: tuple[str, int]) -> tuple[int, int]:
            label = entry[0]
            return (0, -int(label)) if label.isdigit() else (1, 0)

        ordered = sorted(counts.items(), key=rank)
    else:
        ordered = sorted(counts.items(), key=lambda entry: entry[1], reverse=True)
    return [{"label": label, "count": count} for label, count in ordered]
344
+
345
+
346
def _sorted_month_counts(counts: dict[str, int]) -> list[dict[str, Any]]:
    """Return month label/count dicts in calendar order.

    Digit labels sort by numeric value, other labels come next, and the
    literal ``"Unknown"`` label is placed last.
    """
    def position(label: str) -> int:
        if label == "Unknown":
            return 99
        return int(label) if label.isdigit() else 98

    return [
        {"label": label, "count": counts[label]}
        for label in sorted(counts, key=position)
    ]
356
+
357
+
358
# Paper keys excluded when ``_infer_template_tag`` compares a sample paper's
# keys against each template schema's properties.
_TEMPLATE_INFER_IGNORE_KEYS = {
    "source_path",
    "source_hash",
    "provider",
    "model",
    "extracted_at",
    "truncation",
    "output_language",
    "prompt_template",
}
368
+
369
+
370
def _load_paper_inputs(paths: list[Path]) -> list[dict[str, Any]]:
    """Read paper JSON files into ``{"template_tag", "papers"}`` bundles.

    Every file must hold a JSON object with a ``papers`` list. When the object
    carries no ``template_tag`` the tag is inferred from the papers.

    Raises:
        ValueError: If a file's top-level value is not an object or it lacks a
            ``papers`` list.
    """
    bundles: list[dict[str, Any]] = []
    for path in paths:
        payload = json.loads(path.read_text(encoding="utf-8"))
        if isinstance(payload, list):
            raise ValueError(
                f"Input JSON must be an object with template_tag and papers (got array): {path}"
            )
        if not isinstance(payload, dict):
            raise ValueError(f"Input JSON must be an object: {path}")

        papers = payload.get("papers")
        if not isinstance(papers, list):
            raise ValueError(f"Input JSON missing papers list: {path}")

        tag = payload.get("template_tag") or _infer_template_tag(papers, path)
        bundles.append({"template_tag": str(tag), "papers": papers})
    return bundles
388
+
389
+
390
def _infer_template_tag(papers: list[dict[str, Any]], path: Path) -> str:
    """Guess which template produced ``papers``.

    If the papers agree on exactly one ``prompt_template`` value, that value
    is returned. Otherwise the keys of the first paper object (minus
    bookkeeping keys) are compared with each registered template's schema
    properties and the template with the largest overlap wins; on a tie
    ``"simple"`` is preferred, otherwise the earliest in registry order.

    Raises:
        ValueError: If there is no paper object, the sample has no usable
            keys, or no template could be selected.
    """
    declared = set()
    for paper in papers:
        if isinstance(paper, dict) and paper.get("prompt_template"):
            declared.add(str(paper.get("prompt_template")))
    if len(declared) == 1:
        return declared.pop()

    sample = None
    for paper in papers:
        if isinstance(paper, dict):
            sample = paper
            break
    if sample is None:
        raise ValueError(f"Input JSON has no paper objects to infer template_tag: {path}")

    candidate_keys = set(sample.keys()) - _TEMPLATE_INFER_IGNORE_KEYS
    if not candidate_keys:
        raise ValueError(f"Input JSON papers have no keys to infer template_tag: {path}")

    winner = None
    top_score = -1
    for name in list_template_names_in_registry_order():
        schema = load_schema_for_template(name)
        properties = schema.get("properties") or {}
        overlap = len(candidate_keys & set(properties.keys()))
        if overlap > top_score:
            top_score, winner = overlap, name
            continue
        # Equal overlap: only "simple" may displace the current winner.
        if overlap == top_score and name == "simple" and winner != "simple":
            winner = name

    if not winner:
        raise ValueError(f"Unable to infer template_tag from input JSON: {path}")
    return winner
423
+
424
+
425
def _build_cache_meta(db_paths: list[Path], bibtex_path: Path | None) -> dict[str, Any]:
    """Describe the input files so a cache can be checked for staleness.

    Each file is recorded by path, modification time and size.

    Raises:
        ValueError: If a file's metadata cannot be read.
    """
    def describe(path: Path) -> dict[str, Any]:
        try:
            info = path.stat()
        except OSError as exc:
            raise ValueError(f"Failed to read input metadata for cache: {path}") from exc
        return {"path": str(path), "mtime": info.st_mtime, "size": info.st_size}

    input_entries = [describe(path) for path in db_paths]
    bibtex_entry = describe(bibtex_path) if bibtex_path else None
    return {"version": 1, "inputs": input_entries, "bibtex": bibtex_entry}
439
+
440
+
441
def _load_cached_papers(cache_dir: Path, meta: dict[str, Any]) -> list[dict[str, Any]] | None:
    """Return the cached paper list when the stored metadata equals ``meta``.

    Returns ``None`` when either cache file is missing, the metadata differs,
    the cached payload is not a list, or anything goes wrong while reading.
    """
    meta_file = cache_dir / "db_serve_cache.meta.json"
    papers_file = cache_dir / "db_serve_cache.papers.json"
    if not (meta_file.exists() and papers_file.exists()):
        return None
    try:
        if json.loads(meta_file.read_text(encoding="utf-8")) != meta:
            return None
        payload = json.loads(papers_file.read_text(encoding="utf-8"))
    except Exception:
        # Best effort: an unreadable cache is treated as a cache miss.
        return None
    return payload if isinstance(payload, list) else None
456
+
457
+
458
def _write_cached_papers(cache_dir: Path, meta: dict[str, Any], papers: list[dict[str, Any]]) -> None:
    """Persist the merged papers and their cache metadata under ``cache_dir``.

    The papers file is written before the metadata file, and each file is
    written to a temporary sibling and then moved into place. The metadata is
    what validates the cache, so it must never describe inputs whose data file
    was not fully written.

    Raises:
        OSError: If a file cannot be written or moved into place.
    """
    meta_path = cache_dir / "db_serve_cache.meta.json"
    data_path = cache_dir / "db_serve_cache.papers.json"

    def write_atomically(target: Path, payload: Any) -> None:
        # Serialize first so a serialization error leaves no file behind.
        text = json.dumps(payload, ensure_ascii=False, indent=2)
        tmp_path = target.with_name(f"{target.name}.tmp")
        try:
            tmp_path.write_text(text, encoding="utf-8")
            tmp_path.replace(target)
        except Exception:
            # Do not leave a partial temp file around; re-raise the real error.
            try:
                tmp_path.unlink()
            except OSError:
                pass
            raise

    write_atomically(data_path, papers)
    write_atomically(meta_path, meta)
463
+
464
+
465
def _load_or_merge_papers(
    db_paths: list[Path],
    bibtex_path: Path | None,
    cache_dir: Path | None,
    use_cache: bool,
) -> list[dict[str, Any]]:
    """Return the merged paper list, served from the on-disk cache when valid.

    With a cache directory and ``use_cache`` set, a cache whose metadata
    matches the current inputs is returned directly. Otherwise the inputs are
    loaded, enriched with BibTeX when a path is given, merged, and (if caching
    is on) written back to the cache.
    """
    caching = bool(cache_dir and use_cache)
    cache_meta = None
    if caching:
        cache_dir.mkdir(parents=True, exist_ok=True)
        cache_meta = _build_cache_meta(db_paths, bibtex_path)
        cached = _load_cached_papers(cache_dir, cache_meta)
        if cached is not None:
            return cached

    bundles = _load_paper_inputs(db_paths)
    if bibtex_path is not None:
        for bundle in bundles:
            enrich_with_bibtex(bundle["papers"], bibtex_path)
    merged = _merge_paper_inputs(bundles)

    if caching and cache_meta is not None:
        _write_cached_papers(cache_dir, cache_meta, merged)
    return merged
488
+
489
+
490
def _md_renderer() -> MarkdownIt:
    """Create a ``MarkdownIt`` instance using the "commonmark" preset.

    Raw HTML is switched off and the ``linkify`` option is switched on.
    """
    options = {"html": False, "linkify": True}
    return MarkdownIt("commonmark", options)
492
+
493
+
494
def _normalize_merge_title(value: str | None) -> str | None:
    """Canonicalize a title for merge comparisons.

    Curly braces are removed, surrounding whitespace is trimmed and the result
    is lower-cased. Falsy input yields ``None``.
    """
    if not value:
        return None
    without_braces = "".join(ch for ch in str(value) if ch not in "{}")
    return without_braces.strip().lower()
498
+
499
+
500
def _extract_bibtex_title(paper: dict[str, Any]) -> str | None:
    """Return the normalized BibTeX ``title`` field of a paper, if present.

    ``None`` is returned when the paper has no dict-valued ``bibtex`` entry.
    """
    bibtex = paper.get("bibtex")
    if not isinstance(bibtex, dict):
        return None
    fields = bibtex.get("fields", {}) or {}
    raw_title = fields.get("title")
    return _normalize_merge_title(raw_title)
505
+
506
+
507
def _extract_paper_title(paper: dict[str, Any]) -> str | None:
    """Return the normalized ``paper_title`` of a paper."""
    raw_title = paper.get("paper_title")
    return _normalize_merge_title(raw_title)
509
+
510
+
511
def _available_templates(paper: dict[str, Any]) -> list[str]:
    """List the template tags stored on a merged paper, without duplicates.

    Tags named in ``template_order`` come first (when they exist in
    ``templates``), followed by any remaining tags in ``templates`` order.
    Returns ``[]`` when ``templates`` is not a dict.
    """
    templates = paper.get("templates")
    if not isinstance(templates, dict):
        return []
    preferred = paper.get("template_order") or list(templates.keys())
    # dict.fromkeys keeps the first occurrence of each tag, in order.
    ordered = dict.fromkeys(tag for tag in preferred if tag in templates)
    for tag in templates:
        ordered.setdefault(tag, None)
    return list(ordered)
527
+
528
+
529
def _select_template_tag(
    paper: dict[str, Any], requested: str | None
) -> tuple[str | None, list[str]]:
    """Pick the template tag to show for a paper.

    The requested tag is used when the paper has it. Otherwise the paper's
    ``default_template`` is used, falling back to ``"simple"`` when available
    and finally to the first available tag.

    Returns:
        ``(selected_tag, available_tags)``, or ``(None, [])`` when the paper
        has no templates.
    """
    available = _available_templates(paper)
    if not available:
        return None, []
    if requested in available:
        return requested, available

    fallback = paper.get("default_template")
    if not fallback:
        fallback = "simple" if "simple" in available else available[0]
    return fallback, available
540
+
541
+
542
def _titles_match(group: dict[str, Any], paper: dict[str, Any], *, threshold: float) -> bool:
    """Decide whether ``paper`` belongs to the merge ``group`` by title.

    When both the paper and the group have BibTeX titles, only those are
    compared. Otherwise the plain paper titles are compared. A match needs a
    similarity of at least ``threshold`` with any title already in the group.
    """
    def close_to_any(title: str, known: Any) -> bool:
        for existing in known:
            if _title_similarity(title, existing) >= threshold:
                return True
        return False

    bib_title = _extract_bibtex_title(paper)
    known_bib = group.get("_merge_bibtex_titles") or set()
    if bib_title and known_bib:
        # BibTeX titles decide on their own; no fallback to paper titles.
        return close_to_any(bib_title, known_bib)

    paper_title = _extract_paper_title(paper)
    known_titles = group.get("_merge_paper_titles") or set()
    if paper_title and known_titles:
        return close_to_any(paper_title, known_titles)
    return False
553
+
554
+
555
def _add_merge_titles(group: dict[str, Any], paper: dict[str, Any]) -> None:
    """Record the paper's normalized titles on the merge group.

    Non-empty BibTeX and paper titles are added to the group's
    ``_merge_bibtex_titles`` and ``_merge_paper_titles`` sets.
    """
    candidates = (
        ("_merge_bibtex_titles", _extract_bibtex_title(paper)),
        ("_merge_paper_titles", _extract_paper_title(paper)),
    )
    for bucket, title in candidates:
        if title:
            group.setdefault(bucket, set()).add(title)
562
+
563
+
564
def _merge_paper_inputs(inputs: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Merge per-template bundles into one group per paper.

    Papers from different bundles are grouped when their titles match at a
    similarity of 0.95 or more. Candidate groups are found through exact-title
    and 5-character-prefix lookups before the similarity check runs. Each
    group keeps every version under ``templates`` (keyed by template tag),
    the order in which tags were seen, a ``default_template`` (``"simple"``
    when present, else the first tag), and a copy of the default version's
    keys at the top level.

    Raises:
        ValueError: If a bundle contains a paper that is not a dict.
    """
    merged: list[dict[str, Any]] = []
    threshold = 0.95
    prefix_len = 5
    bibtex_exact: dict[str, set[int]] = {}
    bibtex_prefix: dict[str, set[int]] = {}
    paper_exact: dict[str, set[int]] = {}
    paper_prefix: dict[str, set[int]] = {}

    def prefix_key(value: str) -> str:
        return value[:prefix_len] if len(value) >= prefix_len else value

    def remember(
        value: str,
        exact_index: dict[str, set[int]],
        prefix_index: dict[str, set[int]],
        group_id: int,
    ) -> None:
        exact_index.setdefault(value, set()).add(group_id)
        prefix_index.setdefault(prefix_key(value), set()).add(group_id)

    def candidate_ids(bib_title: str | None, paper_title: str | None) -> list[int]:
        found: set[int] = set()
        if bib_title:
            found |= bibtex_exact.get(bib_title, set())
            found |= bibtex_prefix.get(prefix_key(bib_title), set())
        if paper_title:
            found |= paper_exact.get(paper_title, set())
            found |= paper_prefix.get(prefix_key(paper_title), set())
        return sorted(found)

    for bundle in inputs:
        tag = bundle.get("template_tag")
        for paper in bundle.get("papers") or []:
            if not isinstance(paper, dict):
                raise ValueError("Input papers must be objects")
            bib_title = _extract_bibtex_title(paper)
            paper_title = _extract_paper_title(paper)

            # The lowest-numbered candidate group whose titles match wins.
            target_id = next(
                (
                    group_id
                    for group_id in candidate_ids(bib_title, paper_title)
                    if _titles_match(merged[group_id], paper, threshold=threshold)
                ),
                None,
            )
            if target_id is None:
                group = {
                    "templates": {tag: paper},
                    "template_order": [tag],
                }
                merged.append(group)
                target_id = len(merged) - 1
            else:
                group = merged[target_id]
                group.setdefault("templates", {})[tag] = paper
                seen_tags = group.setdefault("template_order", [])
                if tag not in seen_tags:
                    seen_tags.append(tag)

            _add_merge_titles(group, paper)
            if bib_title:
                remember(bib_title, bibtex_exact, bibtex_prefix, target_id)
            if paper_title:
                remember(paper_title, paper_exact, paper_prefix, target_id)

    for group in merged:
        templates = group.get("templates") or {}
        tags = group.get("template_order") or list(templates.keys())
        if "simple" in tags:
            default_tag = "simple"
        else:
            default_tag = tags[0] if tags else None
        group["default_template"] = default_tag
        if default_tag and default_tag in templates:
            # Surface the default version's fields on the group itself.
            group.update(templates[default_tag])
        # Drop the bookkeeping sets used only while merging.
        group.pop("_merge_bibtex_titles", None)
        group.pop("_merge_paper_titles", None)
    return merged
648
+
649
+
650
def _render_markdown_with_math_placeholders(md: MarkdownIt, text: str) -> str:
    """Render Markdown to HTML while protecting tables, images and math.

    HTML tables, ``<img>`` tags and math spans are swapped for placeholder
    tokens before rendering so the Markdown renderer does not touch them, then
    restored in the rendered HTML: math as HTML-escaped source, images as
    their sanitized tag, tables as sanitized table markup.

    A placeholder that is the sole content of a paragraph replaces the whole
    ``<p>`` element. Table placeholders that end up anywhere else (for example
    sharing a paragraph with the preceding line) are replaced in place, as is
    already done for images, so a raw ``@@HTML_TABLE_n@@`` token never reaches
    the page.

    Args:
        md: Renderer used for the Markdown pass.
        text: Markdown source.

    Returns:
        The rendered HTML.
    """
    rendered, table_placeholders = _extract_html_table_placeholders(text)
    rendered, img_placeholders = _extract_html_img_placeholders(rendered)
    rendered, placeholders = _extract_math_placeholders(rendered)
    html_out = md.render(rendered)
    for key, value in placeholders.items():
        html_out = html_out.replace(key, html.escape(value))
    for key, value in img_placeholders.items():
        # A callable replacement keeps backslashes in the value literal.
        html_out = re.sub(rf"<p>\s*{re.escape(key)}\s*</p>", lambda _: value, html_out)
        html_out = html_out.replace(key, value)
    for key, value in table_placeholders.items():
        safe_html = _sanitize_table_html(value)
        html_out = re.sub(rf"<p>\s*{re.escape(key)}\s*</p>", lambda _: safe_html, html_out)
        # Fallback for placeholders that were not alone in a paragraph.
        html_out = html_out.replace(key, safe_html)
    return html_out
664
+
665
+
666
def _extract_math_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Replace math spans with ``@@MATH_n@@`` tokens, skipping code.

    Fenced code blocks (three or more backticks or tildes, indented at most
    three spaces) and inline code spans are copied through untouched.
    ``$$...$$`` may span lines. ``$...$`` must open and close on the same
    line, so unrelated dollar signs in different lines or paragraphs
    (``"$5 ... $10"``) are not captured as one formula. A delimiter preceded
    by a backslash is not treated as a delimiter.

    Returns:
        ``(text_with_placeholders, placeholders)`` where ``placeholders`` maps
        each token to the original math source including its delimiters.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    in_fence = False
    fence_char = ""
    fence_len = 0
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        key = f"@@MATH_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        # Fence detection only happens at line starts, outside inline code.
        if inline_delim_len == 0 and at_line_start:
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            # Closing fence: same character, at least as long.
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        out.append(line)
                        idx = line_end
                        continue

        if in_fence:
            out.append(text[idx])
            idx += 1
            continue

        if inline_delim_len > 0:
            # Inside an inline code span: copy until the matching backtick run.
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        ch = text[idx]
        if ch == "`":
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        # Block math: $$...$$ (can span lines)
        if text.startswith("$$", idx) and (idx == 0 or text[idx - 1] != "\\"):
            search_from = idx + 2
            end = text.find("$$", search_from)
            while end != -1 and text[end - 1] == "\\":
                search_from = end + 2
                end = text.find("$$", search_from)
            if end != -1:
                out.append(next_placeholder(text[idx : end + 2]))
                idx = end + 2
                continue

        # Inline math: $...$ (single-line)
        if ch == "$" and not text.startswith("$$", idx) and (idx == 0 or text[idx - 1] != "\\"):
            # Only look for the closing "$" before the end of this line.
            limit = text.find("\n", idx)
            if limit == -1:
                limit = len(text)
            search_from = idx + 1
            end = text.find("$", search_from, limit)
            while end != -1 and text[end - 1] == "\\":
                search_from = end + 1
                end = text.find("$", search_from, limit)
            if end != -1:
                out.append(next_placeholder(text[idx : end + 1]))
                idx = end + 1
                continue

        out.append(ch)
        idx += 1

    return "".join(out), placeholders
763
+
764
+
765
class _TableSanitizer(HTMLParser):
    """HTML parser that re-emits table markup from a whitelist.

    Only table-structure tags and ``<br>`` are kept. On ``td``/``th`` the
    numeric ``colspan``/``rowspan`` and a ``left``/``right``/``center``
    ``align`` survive; every other attribute is dropped. Text is HTML-escaped
    and elements still open at ``close()`` are closed.
    """

    _KEPT_TAGS = frozenset(
        {
            "table",
            "thead",
            "tbody",
            "tfoot",
            "tr",
            "th",
            "td",
            "caption",
            "colgroup",
            "col",
            "br",
        }
    )
    # Kept tags that are emitted without being tracked for a closing tag.
    _UNTRACKED_TAGS = frozenset({"br", "col"})
    _CELL_TAGS = frozenset({"td", "th"})
    _SPAN_ATTRS = frozenset({"colspan", "rowspan"})
    _ALIGN_VALUES = frozenset({"left", "right", "center"})

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._out: list[str] = []
        self._stack: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Emit a kept tag with its permitted attributes; ignore other tags."""
        lowered = tag.lower()
        if lowered not in self._KEPT_TAGS:
            return

        kept: dict[str, str] = {}
        if lowered in self._CELL_TAGS:
            for attr_name, attr_value in attrs:
                if attr_value is None:
                    continue
                key = attr_name.lower()
                cleaned = attr_value.strip()
                if key in self._SPAN_ATTRS and cleaned.isdigit():
                    kept[key] = cleaned
                elif key == "align" and cleaned.lower() in self._ALIGN_VALUES:
                    kept[key] = cleaned.lower()

        rendered_attrs = [
            f' {key}="{html.escape(val, quote=True)}"' for key, val in kept.items()
        ]
        self._out.append(f"<{lowered}{''.join(rendered_attrs)}>")
        if lowered not in self._UNTRACKED_TAGS:
            self._stack.append(lowered)

    def handle_endtag(self, tag: str) -> None:
        """Close the innermost matching open tag and everything nested in it."""
        lowered = tag.lower()
        if lowered not in self._stack:
            return
        # Position of the last (innermost) occurrence of the tag.
        cut = len(self._stack) - 1 - self._stack[::-1].index(lowered)
        closing = self._stack[cut:]
        del self._stack[cut:]
        self._out.extend(f"</{name}>" for name in reversed(closing))

    def handle_data(self, data: str) -> None:
        """Emit text content HTML-escaped."""
        self._out.append(html.escape(data))

    def handle_entityref(self, name: str) -> None:
        """Emit a named entity reference unchanged."""
        self._out.append(f"&{name};")

    def handle_charref(self, name: str) -> None:
        """Emit a numeric character reference unchanged."""
        self._out.append(f"&#{name};")

    def close(self) -> None:
        """Finish parsing and close any elements that are still open."""
        super().close()
        while self._stack:
            self._out.append(f"</{self._stack.pop()}>")

    def get_html(self) -> str:
        """Return the sanitized markup collected so far."""
        return "".join(self._out)
830
+
831
+
832
def _sanitize_table_html(raw: str) -> str:
    """Return ``raw`` table markup reduced to whitelisted tags and attributes.

    If parsing fails for any reason the input is returned HTML-escaped inside
    a ``<pre><code>`` block instead.
    """
    sanitizer = _TableSanitizer()
    try:
        sanitizer.feed(raw)
        sanitizer.close()
    except Exception:
        escaped = html.escape(raw)
        return f"<pre><code>{escaped}</code></pre>"
    else:
        return sanitizer.get_html()
840
+
841
+
842
def _sanitize_img_html(raw: str) -> str | None:
    """Rebuild an ``<img>`` tag keeping only a base64 image ``src`` and ``alt``.

    Returns ``None`` unless ``src`` starts with ``data:image/`` and contains
    ``;base64,`` (case-insensitive). Attribute values are HTML-escaped in the
    rebuilt tag.
    """
    attributes = {}
    for attr_name, attr_value in re.findall(r"(\w+)\s*=\s*(\"[^\"]*\"|'[^']*'|[^\s>]+)", raw):
        cleaned = attr_value.strip()
        # Drop one pair of matching surrounding quotes, if present.
        if cleaned and cleaned[0] in {"\"", "'"} and cleaned[-1] == cleaned[0]:
            cleaned = cleaned[1:-1]
        attributes[attr_name.lower()] = cleaned

    src = attributes.get("src", "")
    lowered_src = src.lower()
    is_inline_image = lowered_src.startswith("data:image/") and ";base64," in lowered_src
    if not is_inline_image:
        return None

    alt = attributes.get("alt", "")
    if alt:
        alt_attr = f' alt="{html.escape(alt, quote=True)}"'
    else:
        alt_attr = ""
    return f'<img src="{html.escape(src, quote=True)}"{alt_attr} />'
859
+
860
+
861
def _extract_html_img_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Replace safe inline ``<img>`` tags with ``@@HTML_IMG_n@@`` tokens.

    Fenced code blocks (three or more backticks or tildes, indented at most
    three spaces) and inline code spans are copied through untouched. Each
    ``<img ...>`` tag found elsewhere is passed to ``_sanitize_img_html``;
    tags it accepts are swapped for a token, tags it rejects stay in the text.

    Tag detection compares a slice of ``text`` case-insensitively rather than
    indexing into ``text.lower()``, because lower-casing can change the string
    length (``"\u0130"`` becomes two code points) and shift every later index.

    Returns:
        ``(text_with_placeholders, placeholders)`` where ``placeholders`` maps
        each token to the sanitized ``<img>`` markup.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    in_fence = False
    fence_char = ""
    fence_len = 0
    inline_delim_len = 0

    def next_placeholder(value: str) -> str:
        key = f"@@HTML_IMG_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        # Fence detection only happens at line starts, outside inline code.
        if inline_delim_len == 0 and at_line_start:
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            # Closing fence: same character, at least as long.
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        out.append(line)
                        idx = line_end
                        continue

        if in_fence:
            out.append(text[idx])
            idx += 1
            continue

        if inline_delim_len > 0:
            # Inside an inline code span: copy until the matching backtick run.
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        if text[idx] == "`":
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        if text[idx : idx + 4].lower() == "<img":
            end = text.find(">", idx)
            if end != -1:
                raw = text[idx : end + 1]
                safe_html = _sanitize_img_html(raw)
                if safe_html:
                    out.append(next_placeholder(safe_html))
                    idx = end + 1
                    continue

        out.append(text[idx])
        idx += 1

    return "".join(out), placeholders
944
+
945
+
946
def _extract_html_table_placeholders(text: str) -> tuple[str, dict[str, str]]:
    """Replace raw HTML tables with ``@@HTML_TABLE_n@@`` tokens.

    Fenced code blocks (three or more backticks or tildes, indented at most
    three spaces) and inline code spans are copied through untouched. Each
    ``<table ...>...</table>`` found elsewhere is swapped for a token that is
    separated from the surrounding text by blank lines, so that it renders as
    a paragraph of its own.

    Two details matter for correctness:

    * Tags are matched case-insensitively on ``text`` itself instead of via
      ``text.lower()``, because lower-casing can change the string length
      (``"\u0130"`` becomes two code points) and shift every later index.
    * A full blank line is inserted before the token even when the table
      directly follows a line of text (single newline); otherwise the token
      would share a paragraph with that line.

    Returns:
        ``(text_with_placeholders, placeholders)`` where ``placeholders`` maps
        each token to the raw table markup.
    """
    placeholders: dict[str, str] = {}
    out: list[str] = []
    idx = 0
    in_fence = False
    fence_char = ""
    fence_len = 0
    inline_delim_len = 0
    close_tag = re.compile(r"</table>", re.IGNORECASE | re.ASCII)

    def next_placeholder(value: str) -> str:
        key = f"@@HTML_TABLE_{len(placeholders)}@@"
        placeholders[key] = value
        return key

    while idx < len(text):
        at_line_start = idx == 0 or text[idx - 1] == "\n"

        # Fence detection only happens at line starts, outside inline code.
        if inline_delim_len == 0 and at_line_start:
            line_end = text.find("\n", idx)
            if line_end == -1:
                line_end = len(text)
            line = text[idx:line_end]
            stripped = line.lstrip(" ")
            leading_spaces = len(line) - len(stripped)
            if leading_spaces <= 3 and stripped:
                first = stripped[0]
                if first in {"`", "~"}:
                    run_len = 0
                    while run_len < len(stripped) and stripped[run_len] == first:
                        run_len += 1
                    if run_len >= 3:
                        if not in_fence:
                            in_fence = True
                            fence_char = first
                            fence_len = run_len
                        elif first == fence_char and run_len >= fence_len:
                            # Closing fence: same character, at least as long.
                            in_fence = False
                            fence_char = ""
                            fence_len = 0
                        out.append(line)
                        idx = line_end
                        continue

        if in_fence:
            out.append(text[idx])
            idx += 1
            continue

        if inline_delim_len > 0:
            # Inside an inline code span: copy until the matching backtick run.
            delim = "`" * inline_delim_len
            if text.startswith(delim, idx):
                out.append(delim)
                idx += inline_delim_len
                inline_delim_len = 0
                continue
            out.append(text[idx])
            idx += 1
            continue

        if text[idx] == "`":
            run_len = 0
            while idx + run_len < len(text) and text[idx + run_len] == "`":
                run_len += 1
            inline_delim_len = run_len
            out.append("`" * run_len)
            idx += run_len
            continue

        if text[idx : idx + 6].lower() == "<table":
            closing = close_tag.search(text, idx)
            if closing is not None:
                end = closing.end()
                key = next_placeholder(text[idx:end])
                if out:
                    # Every piece in ``out`` is non-empty, so the last two
                    # pieces are enough to tell how many newlines precede us.
                    tail = "".join(out[-2:])
                    if not tail.endswith("\n"):
                        out.append("\n\n")
                    elif not tail.endswith("\n\n"):
                        out.append("\n")
                out.append(key)
                out.append("\n\n")
                idx = end
                continue

        out.append(text[idx])
        idx += 1

    return "".join(out), placeholders
1032
+
1033
+
1034
def _render_paper_markdown(
    paper: dict[str, Any],
    fallback_language: str,
    *,
    template_tag: str | None = None,
) -> tuple[str, str, str | None]:
    """Render *paper* to markdown through a render template.

    The template tag comes from ``_select_template_tag``. When a tag is
    selected, the record stored under ``paper["templates"][tag]`` is rendered
    (falling back to *paper* itself if that entry is absent). The template is
    looked up by the selected tag, or else by the record's
    ``prompt_template`` value. If no name is available, or loading the named
    template raises, the default template is used and a warning is returned.

    Returns:
        ``(rendered_markdown, template_name, warning)`` where ``warning`` is
        ``None`` unless the default template had to be used.
    """
    chosen_tag, _ = _select_template_tag(paper, template_tag)
    if chosen_tag:
        record = (paper.get("templates") or {}).get(chosen_tag, paper)
    else:
        record = paper

    requested = chosen_tag or record.get("prompt_template")
    notice = None
    if not requested:
        template = load_default_template()
        notice = "Rendered using default template (no template specified)."
        requested = "default_paper"
    else:
        try:
            template = load_render_template(str(requested))
        except Exception:
            template = load_default_template()
            notice = "Rendered using default template (missing template)."
            requested = "default_paper"

    # Render from a copy so the stored record is never mutated.
    context = dict(record)
    context["output_language"] = context.get("output_language") or fallback_language
    return template.render(**context), str(requested), notice
1063
+
1064
+
1065
def _build_file_index(roots: list[Path], *, suffixes: set[str]) -> dict[str, list[Path]]:
    """Index files under *roots* by lowercased file name.

    Each root that exists and is a directory is walked recursively. Regular
    files whose lowercased suffix is in *suffixes* are recorded as resolved
    paths under their lowercased name. Roots or entries whose status checks
    raise ``OSError`` are skipped.
    """
    by_name: dict[str, list[Path]] = {}
    for base in roots:
        try:
            usable = base.exists() and base.is_dir()
        except OSError:
            continue
        if not usable:
            continue
        for entry in base.rglob("*"):
            try:
                regular = entry.is_file()
            except OSError:
                continue
            if regular and entry.suffix.lower() in suffixes:
                by_name.setdefault(entry.name.lower(), []).append(entry.resolve())
    return by_name
1083
+
1084
+
1085
def _resolve_source_md(paper: dict[str, Any], md_index: dict[str, list[Path]]) -> Path | None:
    """Return the first indexed markdown file matching the paper's source name.

    The lookup key is the lowercased final component of
    ``paper["source_path"]``. ``None`` is returned when the paper has no
    source path or the index has no entry for that name.
    """
    recorded = paper.get("source_path")
    if recorded:
        matches = md_index.get(Path(str(recorded)).name.lower(), [])
        if matches:
            return matches[0]
    return None
1092
+
1093
+
1094
def _guess_pdf_names(paper: dict[str, Any]) -> list[str]:
    """Guess the PDF file name that a paper's markdown source was derived from.

    Works on the final component of ``paper["source_path"]`` and recognises,
    case-insensitively:

    * ``<name>.pdf.md`` and ``<name>.pdf-<suffix>.md`` where the suffix is at
      least eight hex digits or dashes;
    * any other name containing ``.pdf-`` (cut after the last such ``.pdf``);
    * names ending in ``.pdf.md``.

    Returns:
        A list holding the single guessed file name, or an empty list when
        there is no source path or no pattern applies.
    """
    source_path = paper.get("source_path")
    if not source_path:
        return []
    name = Path(str(source_path)).name

    # The dots are escaped once: in a raw string a doubled backslash would
    # instead require a literal backslash in the file name.
    match = re.match(r"(?i)(.+\.pdf)(?:-[0-9a-f\-]{8,})?\.md$", name)
    if match:
        return [Path(match.group(1)).name]

    # Cut after the last ".pdf" that is followed by "-". Matching on ``name``
    # itself avoids using offsets from ``name.lower()``, whose length can
    # differ from ``name`` for some Unicode characters.
    marker = re.match(r"(?is)(.*\.pdf)-", name)
    if marker:
        return [Path(marker.group(1)).name]

    if name.lower().endswith(".pdf.md"):
        return [name[:-3]]
    return []
1108
+
1109
+
1110
def _resolve_pdf(paper: dict[str, Any], pdf_index: dict[str, list[Path]]) -> Path | None:
    """Return the first indexed PDF whose name matches a guess for *paper*.

    Candidate names come from ``_guess_pdf_names`` and are looked up
    lowercased in *pdf_index*. ``None`` is returned when no guess has a
    non-empty entry.
    """
    hits = (pdf_index.get(guess.lower()) for guess in _guess_pdf_names(paper))
    return next((found[0] for found in hits if found), None)
1116
+
1117
+
1118
def _ensure_under_roots(path: Path, roots: list[Path]) -> bool:
    """Report whether the resolved *path* lies inside any resolved root.

    Roots for which the containment check raises are treated as
    non-matching; ``False`` is returned when no root contains the path.
    """
    target = path.resolve()
    for base in roots:
        try:
            target.relative_to(base.resolve())
        except Exception:
            pass
        else:
            return True
    return False
1127
+
1128
+
1129
def _apply_query(index: PaperIndex, query: Query) -> set[int]:
    """Return the ids of all papers matched by *query*.

    ``query.groups`` is a disjunction of groups; the terms inside a group are
    combined conjunctively, and a negated term removes its matches from the
    group. A term without a field is matched against each paper's
    ``_search_lc`` text. ``tag``, ``author``, ``month`` and ``year`` terms try
    the matching ``index.by_*`` lookup first and fall back to scanning the
    papers. A ``year`` value of the form ``A..B`` (both numeric) selects the
    inclusive range in either order. Unknown fields match nothing.
    """
    universe = set(index.ordered_ids)
    papers = index.papers

    def scan(pool: set[int], accepts) -> set[int]:
        # Keep the ids from ``pool`` whose paper record satisfies ``accepts``.
        return {pid for pid in pool if accepts(papers[pid])}

    def text_of(paper: dict[str, Any], key: str) -> str:
        return str(paper.get(key) or "")

    def match_term(term: QueryTerm, pool: set[int]) -> set[int]:
        needle = term.value.lower()
        field = term.field

        if field is None:
            return scan(pool, lambda p: needle in text_of(p, "_search_lc"))
        if field == "title":
            return scan(pool, lambda p: needle in text_of(p, "_title_lc"))
        if field == "venue":
            return scan(pool, lambda p: needle in text_of(p, "_venue").lower())
        if field == "tag":
            hit = index.by_tag.get(needle)
            if hit is not None:
                return hit & pool
            return scan(pool, lambda p: any(needle in tag.lower() for tag in (p.get("_tags") or [])))
        if field == "author":
            hit = index.by_author.get(needle)
            if hit is not None:
                return hit & pool
            return scan(pool, lambda p: any(needle in who.lower() for who in (p.get("_authors") or [])))
        if field == "month":
            hit = index.by_month.get(needle)
            if hit is not None:
                return hit & pool
            return scan(pool, lambda p: needle == text_of(p, "_month").lower())
        if field == "year":
            if ".." in term.value:
                low_text, high_text = (part.strip() for part in term.value.split("..", 1))
                if low_text.isdigit() and high_text.isdigit():
                    low, high = sorted((int(low_text), int(high_text)))
                    in_range: set[int] = set()
                    for year in range(low, high + 1):
                        in_range |= index.by_year.get(str(year), set())
                    return in_range & pool
            hit = index.by_year.get(needle)
            if hit is not None:
                return hit & pool
            return scan(pool, lambda p: needle in text_of(p, "_year").lower())
        return set()

    matched_total: set[int] = set()
    for group in query.groups:
        surviving = set(universe)
        for term in group:
            if term.negated:
                # Negations are evaluated against every paper, then removed.
                surviving -= match_term(term, universe)
            else:
                surviving &= match_term(term, surviving)
        matched_total |= surviving

    return matched_total
1183
+
1184
+
1185
def _page_shell(title: str, body_html: str, extra_head: str = "", extra_scripts: str = "") -> str:
    """Wrap *body_html* in the full site page: styles, header navigation, container.

    *title* is HTML-escaped. *body_html*, *extra_head* and *extra_scripts*
    are inserted verbatim (inside the container, at the end of ``<head>``,
    and just before ``</body>`` respectively).
    """
    css_rules = (
        "body { font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Arial; margin: 0; }",
        "header { position: sticky; top: 0; background: #0b1220; color: #fff; padding: 12px 16px; z-index: 10; }",
        "header a { color: #cfe3ff; text-decoration: none; margin-right: 12px; }",
        ".container { max-width: 1100px; margin: 0 auto; padding: 16px; }",
        ".filters { display: grid; grid-template-columns: repeat(6, 1fr); gap: 8px; margin: 12px 0 16px; }",
        ".filters input { width: 100%; padding: 8px; border: 1px solid #d0d7de; border-radius: 6px; }",
        ".card { border: 1px solid #d0d7de; border-radius: 10px; padding: 12px; margin: 10px 0; }",
        ".muted { color: #57606a; font-size: 13px; }",
        ".pill { display: inline-block; padding: 2px 8px; border-radius: 999px; border: 1px solid #d0d7de; margin-right: 6px; font-size: 12px; }",
        ".warning { background: #fff4ce; border: 1px solid #ffd089; padding: 10px; border-radius: 10px; margin: 12px 0; }",
        ".tabs { display: flex; gap: 8px; flex-wrap: wrap; }",
        ".tab { display: inline-block; padding: 6px 12px; border-radius: 999px; border: 1px solid #d0d7de; background: #f6f8fa; color: #0969da; text-decoration: none; font-size: 13px; }",
        ".tab:hover { background: #eef1f4; }",
        ".tab.active { background: #0969da; border-color: #0969da; color: #fff; }",
        "pre { overflow: auto; padding: 10px; background: #0b1220; color: #e6edf3; border-radius: 10px; }",
        "code { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace; }",
        "a { color: #0969da; }",
    )
    document = [
        "<!doctype html>",
        '<html lang="en">',
        "<head>",
        '<meta charset="utf-8" />',
        '<meta name="viewport" content="width=device-width, initial-scale=1" />',
        f"<title>{html.escape(title)}</title>",
        "<style>",
        *css_rules,
        "</style>",
        extra_head,
        "</head>",
        "<body>",
        "<header>",
        '<a href="/">Papers</a>',
        '<a href="/stats">Stats</a>',
        "</header>",
        '<div class="container">',
        body_html,
        "</div>",
        extra_scripts,
        "</body>",
        "</html>",
    ]
    return "\n".join(document)
1224
+
1225
+
1226
def _embed_shell(title: str, body_html: str, extra_head: str = "", extra_scripts: str = "") -> str:
    """Wrap *body_html* in a minimal page without the site header or container.

    *title* is HTML-escaped. *body_html*, *extra_head* and *extra_scripts*
    are inserted verbatim (in ``<body>``, at the end of ``<head>``, and just
    before ``</body>`` respectively).
    """
    css_rules = (
        "body { font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Arial; margin: 0; padding: 16px; }",
        "h1, h2, h3, h4 { margin-top: 1.2em; }",
        ".muted { color: #57606a; font-size: 13px; }",
        ".warning { background: #fff4ce; border: 1px solid #ffd089; padding: 10px; border-radius: 10px; margin: 12px 0; }",
        "pre { overflow: auto; padding: 10px; background: #0b1220; color: #e6edf3; border-radius: 10px; }",
        "code { font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace; }",
        "a { color: #0969da; }",
    )
    document = [
        "<!doctype html>",
        '<html lang="en">',
        "<head>",
        '<meta charset="utf-8" />',
        '<meta name="viewport" content="width=device-width, initial-scale=1" />',
        f"<title>{html.escape(title)}</title>",
        "<style>",
        *css_rules,
        "</style>",
        extra_head,
        "</head>",
        "<body>",
        body_html,
        extra_scripts,
        "</body>",
        "</html>",
    ]
    return "\n".join(document)
1249
+
1250
+
1251
def _build_pdfjs_viewer_url(pdf_url: str) -> str:
    """Return the PDF.js viewer URL that opens *pdf_url*.

    *pdf_url* is percent-encoded with no characters treated as safe and
    passed as the ``file`` query parameter of ``_PDFJS_VIEWER_PATH``.
    """
    file_param = quote(pdf_url, safe="")
    return "{}?file={}".format(_PDFJS_VIEWER_PATH, file_param)
1254
+
1255
+
1256
async def _index_page(request: Request) -> HTMLResponse:
    """Serve the landing page: search box, advanced query builder, result list.

    The page is static HTML plus inline JavaScript that pages through
    ``/api/papers`` (30 items per request) as the user scrolls and links each
    result to ``/paper/<source_hash>`` with the view chosen in the "Open"
    selector.

    The loader script tags every request with a generation number. Changing
    the query or the open-view selector starts a new generation, so a response
    that arrives for an older query is discarded instead of being appended to
    the fresh list, and a failed request releases the ``loading`` flag so that
    scrolling can retry.
    """
    return HTMLResponse(
        _page_shell(
            "Paper DB",
            """
<h2>Paper Database</h2>
<div class="card">
  <div class="muted">Search (Scholar-style): <code>tag:fpga year:2023..2025 -survey</code> · Use quotes for phrases and <code>OR</code> for alternatives.</div>
  <div style="display:flex; gap:8px; margin-top:8px;">
    <input id="query" placeholder='Search... e.g. title:"nearest neighbor" tag:fpga year:2023..2025' style="flex:1; padding:10px; border:1px solid #d0d7de; border-radius:8px;" />
    <select id="openView" style="padding:10px; border:1px solid #d0d7de; border-radius:8px;">
      <option value="summary" selected>Open: Summary</option>
      <option value="source">Open: Source</option>
      <option value="pdf">Open: PDF</option>
      <option value="pdfjs">Open: PDF Viewer</option>
      <option value="split">Open: Split</option>
    </select>
  </div>
  <details style="margin-top:10px;">
    <summary>Advanced search</summary>
    <div style="margin-top:10px;" class="muted">Build a query:</div>
    <div class="filters" style="grid-template-columns: repeat(3, 1fr);">
      <input id="advTitle" placeholder="title contains..." />
      <input id="advAuthor" placeholder="author contains..." />
      <input id="advTag" placeholder="tag (comma separated)" />
      <input id="advYear" placeholder="year (e.g. 2020..2024)" />
      <input id="advMonth" placeholder="month (01-12)" />
      <input id="advVenue" placeholder="venue contains..." />
    </div>
    <div style="display:flex; gap:8px; align-items:center; margin-top:8px;">
      <button id="buildQuery" style="padding:8px 12px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">Build</button>
      <div class="muted">Generated: <code id="generated"></code></div>
    </div>
  </details>
</div>
<div id="results"></div>
<div id="loading" class="muted">Loading...</div>
<script>
let page = 1;
let loading = false;
let done = false;
// Incremented on every reset so in-flight responses for an older query can be recognised.
let generation = 0;

function currentParams(nextPage) {
  const params = new URLSearchParams();
  params.set("page", String(nextPage));
  params.set("page_size", "30");
  const q = document.getElementById("query").value.trim();
  if (q) params.set("q", q);
  return params;
}

function escapeHtml(text) {
  const div = document.createElement("div");
  div.textContent = text;
  return div.innerHTML;
}

function viewSuffixForItem(item) {
  const view = document.getElementById("openView").value;
  if (!view || view === "summary") return "";
  const params = new URLSearchParams();
  params.set("view", view);
  if (view === "split") {
    params.set("left", "summary");
    if (item.has_pdf) {
      params.set("right", "pdfjs");
    } else if (item.has_source) {
      params.set("right", "source");
    } else {
      params.set("right", "summary");
    }
  }
  return `?${params.toString()}`;
}

function renderItem(item) {
  const tags = (item.tags || []).map(t => `<span class="pill">${escapeHtml(t)}</span>`).join("");
  const authors = (item.authors || []).slice(0, 6).map(a => escapeHtml(a)).join(", ");
  const meta = `${escapeHtml(item.year || "")}-${escapeHtml(item.month || "")} · ${escapeHtml(item.venue || "")}`;
  const viewSuffix = viewSuffixForItem(item);
  const badges = [
    item.has_source ? `<span class="pill">source</span>` : "",
    item.has_pdf ? `<span class="pill">pdf</span>` : "",
  ].join("");
  return `
<div class="card">
<div><a href="/paper/${encodeURIComponent(item.source_hash)}${viewSuffix}">${escapeHtml(item.title || "")}</a></div>
<div class="muted">${authors}</div>
<div class="muted">${meta}</div>
<div style="margin-top:6px">${badges} ${tags}</div>
</div>
`;
}

async function loadMore() {
  if (loading || done) return;
  loading = true;
  const requestGeneration = generation;
  const status = document.getElementById("loading");
  status.textContent = "Loading...";
  try {
    const res = await fetch(`/api/papers?${currentParams(page).toString()}`);
    if (!res.ok) throw new Error(`HTTP ${res.status}`);
    const data = await res.json();
    // The query changed while this request was in flight: drop the stale page.
    if (requestGeneration !== generation) return;
    const results = document.getElementById("results");
    for (const item of data.items) {
      results.insertAdjacentHTML("beforeend", renderItem(item));
    }
    if (!data.has_more) {
      done = true;
      status.textContent = "End.";
    } else {
      page += 1;
      status.textContent = "Scroll to load more...";
    }
  } catch (err) {
    if (requestGeneration === generation) {
      status.textContent = "Failed to load. Scroll to retry.";
    }
  } finally {
    // Only the current generation owns the loading flag.
    if (requestGeneration === generation) loading = false;
  }
}

function resetAndLoad() {
  generation += 1;
  page = 1;
  done = false;
  loading = false;
  document.getElementById("results").innerHTML = "";
  loadMore();
}

document.getElementById("query").addEventListener("change", resetAndLoad);
document.getElementById("openView").addEventListener("change", resetAndLoad);

document.getElementById("buildQuery").addEventListener("click", () => {
  function add(field, value) {
    value = value.trim();
    if (!value) return "";
    if (value.includes(" ")) return `${field}:"${value}"`;
    return `${field}:${value}`;
  }
  const parts = [];
  const t = document.getElementById("advTitle").value.trim();
  const a = document.getElementById("advAuthor").value.trim();
  const tag = document.getElementById("advTag").value.trim();
  const y = document.getElementById("advYear").value.trim();
  const m = document.getElementById("advMonth").value.trim();
  const v = document.getElementById("advVenue").value.trim();
  if (t) parts.push(add("title", t));
  if (a) parts.push(add("author", a));
  if (tag) {
    for (const item of tag.split(",")) {
      const val = item.trim();
      if (val) parts.push(add("tag", val));
    }
  }
  if (y) parts.push(add("year", y));
  if (m) parts.push(add("month", m));
  if (v) parts.push(add("venue", v));
  const q = parts.join(" ");
  document.getElementById("generated").textContent = q;
  document.getElementById("query").value = q;
  resetAndLoad();
});

window.addEventListener("scroll", () => {
  if ((window.innerHeight + window.scrollY) >= (document.body.offsetHeight - 600)) {
    loadMore();
  }
});

loadMore();
</script>
""",
        )
    )
1422
+
1423
+
1424
def _parse_filters(request: Request) -> dict[str, list[str] | str | int]:
    """Read the paging and search parameters from the request's query string.

    Returns:
        A dict with ``page`` (at least 1, default 1), ``page_size`` (clamped
        to 1..200, default 30) and ``q`` (the stripped search string, default
        empty). A ``page`` or ``page_size`` value that is not a valid integer
        falls back to its default instead of raising.
    """
    qp = request.query_params

    def int_param(name: str, default: int) -> int:
        # Query strings are user-controlled; a malformed number must not
        # surface as an unhandled ValueError.
        try:
            return int(qp.get(name, default))
        except (TypeError, ValueError):
            return default

    page = max(1, int_param("page", 1))
    page_size = min(max(1, int_param("page_size", 30)), 200)

    q = qp.get("q", "").strip()

    return {
        "page": page,
        "page_size": page_size,
        "q": q,
    }
1438
+
1439
+
1440
async def _api_papers(request: Request) -> JSONResponse:
    """Return one page of papers matching the ``q`` query as JSON.

    The response carries ``page``, ``page_size``, ``total`` (number of
    matches), ``has_more`` and ``items``. Matches keep the order of
    ``index.ordered_ids``. Each item lists the paper's hash, title, authors,
    year, month, venue, tags and whether a source markdown file and a PDF are
    known for that hash.
    """
    index: PaperIndex = request.app.state.index
    filters = _parse_filters(request)
    page, page_size = int(filters["page"]), int(filters["page_size"])

    matching = _apply_query(index, parse_query(str(filters["q"])))
    ordered = [pid for pid in index.ordered_ids if pid in matching]
    total = len(ordered)
    start = (page - 1) * page_size
    end = min(start + page_size, total)

    def summarize(pid: int) -> dict[str, Any]:
        paper = index.papers[pid]
        # Papers without a stored hash get one derived from their source path
        # (or, failing that, their position in the index).
        source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or pid)))
        return {
            "source_hash": source_hash,
            "title": paper.get("paper_title") or "",
            "authors": paper.get("_authors") or [],
            "year": paper.get("_year") or "",
            "month": paper.get("_month") or "",
            "venue": paper.get("_venue") or "",
            "tags": paper.get("_tags") or [],
            "has_source": source_hash in index.md_path_by_hash,
            "has_pdf": source_hash in index.pdf_path_by_hash,
        }

    items: list[dict[str, Any]] = [summarize(pid) for pid in ordered[start:end]]

    return JSONResponse(
        {
            "page": page,
            "page_size": page_size,
            "total": total,
            "has_more": end < total,
            "items": items,
        }
    )
1481
+
1482
+
1483
+ async def _paper_detail(request: Request) -> HTMLResponse:
1484
+ index: PaperIndex = request.app.state.index
1485
+ md = request.app.state.md
1486
+ source_hash = request.path_params["source_hash"]
1487
+ idx = index.id_by_hash.get(source_hash)
1488
+ if idx is None:
1489
+ return RedirectResponse("/")
1490
+ paper = index.papers[idx]
1491
+ view = request.query_params.get("view", "summary")
1492
+ template_param = request.query_params.get("template")
1493
+ embed = request.query_params.get("embed") == "1"
1494
+ if view == "split":
1495
+ embed = False
1496
+
1497
+ pdf_path = index.pdf_path_by_hash.get(source_hash)
1498
+ pdf_url = f"/api/pdf/{source_hash}"
1499
+ shell = _embed_shell if embed else _page_shell
1500
+ source_available = source_hash in index.md_path_by_hash
1501
+ allowed_views = {"summary", "source", "pdf", "pdfjs"}
1502
+
1503
+ def normalize_view(value: str | None, default: str) -> str:
1504
+ if value in allowed_views:
1505
+ return value
1506
+ return default
1507
+
1508
+ default_right = "pdfjs" if pdf_path else ("source" if source_available else "summary")
1509
+ left_param = request.query_params.get("left")
1510
+ right_param = request.query_params.get("right")
1511
+ left = normalize_view(left_param, "summary") if left_param else "summary"
1512
+ right = normalize_view(right_param, default_right) if right_param else default_right
1513
+
1514
+ def nav_link(label: str, v: str) -> str:
1515
+ active = " active" if view == v else ""
1516
+ params: dict[str, str] = {"view": v}
1517
+ if v == "summary" and template_param:
1518
+ params["template"] = str(template_param)
1519
+ if v == "split":
1520
+ params["left"] = left
1521
+ params["right"] = right
1522
+ href = f"/paper/{source_hash}?{urlencode(params)}"
1523
+ return f'<a class="tab{active}" href="{html.escape(href)}">{html.escape(label)}</a>'
1524
+
1525
+ nav = f"""
1526
+ <div class="tabs" style="margin: 8px 0 14px;">
1527
+ {nav_link("Summary", "summary")}
1528
+ {nav_link("Source", "source")}
1529
+ {nav_link("PDF", "pdf")}
1530
+ {nav_link("PDF Viewer", "pdfjs")}
1531
+ {nav_link("Split", "split")}
1532
+ </div>
1533
+ """
1534
+ nav_html = "" if embed else nav
1535
+
1536
+ if view == "split":
1537
+ def pane_src(pane_view: str) -> str:
1538
+ if pane_view == "pdfjs" and pdf_path:
1539
+ return _build_pdfjs_viewer_url(pdf_url)
1540
+ params: dict[str, str] = {"view": pane_view, "embed": "1"}
1541
+ if pane_view == "summary" and template_param:
1542
+ params["template"] = str(template_param)
1543
+ return f"/paper/{source_hash}?{urlencode(params)}"
1544
+
1545
+ left_src = pane_src(left)
1546
+ right_src = pane_src(right)
1547
+ options = [
1548
+ ("summary", "Summary"),
1549
+ ("source", "Source"),
1550
+ ("pdf", "PDF"),
1551
+ ("pdfjs", "PDF Viewer"),
1552
+ ]
1553
+ left_options = "\n".join(
1554
+ f'<option value="{value}"{" selected" if value == left else ""}>{label}</option>'
1555
+ for value, label in options
1556
+ )
1557
+ right_options = "\n".join(
1558
+ f'<option value="{value}"{" selected" if value == right else ""}>{label}</option>'
1559
+ for value, label in options
1560
+ )
1561
+ body = f"""
1562
+ <h2>{html.escape(str(paper.get('paper_title') or 'Paper'))}</h2>
1563
+ {nav}
1564
+ <div class="split-controls">
1565
+ <div>
1566
+ <div class="muted">Left pane</div>
1567
+ <select id="splitLeft">
1568
+ {left_options}
1569
+ </select>
1570
+ </div>
1571
+ <div class="split-actions">
1572
+ <button id="splitTighten" type="button" title="Tighten width">-</button>
1573
+ <button id="splitSwap" type="button" title="Swap panes">⇄</button>
1574
+ <button id="splitWiden" type="button" title="Widen width">+</button>
1575
+ </div>
1576
+ <div>
1577
+ <div class="muted">Right pane</div>
1578
+ <select id="splitRight">
1579
+ {right_options}
1580
+ </select>
1581
+ </div>
1582
+ </div>
1583
+ <div class="split-layout">
1584
+ <div class="split-pane">
1585
+ <iframe id="leftPane" src="{html.escape(left_src)}" title="Left pane"></iframe>
1586
+ </div>
1587
+ <div class="split-pane">
1588
+ <iframe id="rightPane" src="{html.escape(right_src)}" title="Right pane"></iframe>
1589
+ </div>
1590
+ </div>
1591
+ """
1592
+ extra_head = """
1593
+ <style>
1594
+ .container {
1595
+ max-width: 100%;
1596
+ width: 100%;
1597
+ margin: 0 auto;
1598
+ }
1599
+ .split-controls {
1600
+ display: grid;
1601
+ grid-template-columns: 1fr auto 1fr;
1602
+ gap: 12px;
1603
+ align-items: end;
1604
+ margin: 10px 0 14px;
1605
+ }
1606
+ .split-controls select {
1607
+ padding: 6px 8px;
1608
+ border-radius: 8px;
1609
+ border: 1px solid #d0d7de;
1610
+ background: #fff;
1611
+ min-width: 160px;
1612
+ }
1613
+ .split-actions {
1614
+ display: flex;
1615
+ align-items: center;
1616
+ justify-content: center;
1617
+ gap: 8px;
1618
+ height: 100%;
1619
+ }
1620
+ .split-actions button {
1621
+ padding: 6px 10px;
1622
+ border-radius: 999px;
1623
+ border: 1px solid #d0d7de;
1624
+ background: #f6f8fa;
1625
+ cursor: pointer;
1626
+ min-width: 36px;
1627
+ }
1628
+ .split-layout {
1629
+ display: flex;
1630
+ gap: 12px;
1631
+ width: 100%;
1632
+ max-width: min(100%, var(--split-max-width, 100%));
1633
+ margin: 0 auto;
1634
+ height: calc(100vh - 260px);
1635
+ min-height: 420px;
1636
+ }
1637
+ .split-pane {
1638
+ flex: 1;
1639
+ border: 1px solid #d0d7de;
1640
+ border-radius: 10px;
1641
+ overflow: hidden;
1642
+ background: #fff;
1643
+ }
1644
+ .split-pane iframe {
1645
+ width: 100%;
1646
+ height: 100%;
1647
+ border: 0;
1648
+ }
1649
+ @media (max-width: 900px) {
1650
+ .split-layout {
1651
+ flex-direction: column;
1652
+ height: auto;
1653
+ }
1654
+ .split-pane {
1655
+ height: 70vh;
1656
+ }
1657
+ .split-controls {
1658
+ grid-template-columns: 1fr;
1659
+ }
1660
+ }
1661
+ </style>
1662
+ """
1663
+ extra_scripts = """
1664
+ <script>
1665
+ const leftSelect = document.getElementById('splitLeft');
1666
+ const rightSelect = document.getElementById('splitRight');
1667
+ const swapButton = document.getElementById('splitSwap');
1668
+ const tightenButton = document.getElementById('splitTighten');
1669
+ const widenButton = document.getElementById('splitWiden');
1670
+ function updateSplit() {
1671
+ const params = new URLSearchParams(window.location.search);
1672
+ params.set('view', 'split');
1673
+ params.set('left', leftSelect.value);
1674
+ params.set('right', rightSelect.value);
1675
+ window.location.search = params.toString();
1676
+ }
1677
+ leftSelect.addEventListener('change', updateSplit);
1678
+ rightSelect.addEventListener('change', updateSplit);
1679
+ swapButton.addEventListener('click', () => {
1680
+ const leftValue = leftSelect.value;
1681
+ leftSelect.value = rightSelect.value;
1682
+ rightSelect.value = leftValue;
1683
+ updateSplit();
1684
+ });
1685
+ const widthSteps = ["1200px", "1400px", "1600px", "1800px", "2000px", "100%"];
1686
+ let widthIndex = widthSteps.length - 1;
1687
+ try {
1688
+ const stored = localStorage.getItem('splitWidthIndex');
1689
+ if (stored !== null) {
1690
+ const parsed = Number.parseInt(stored, 10);
1691
+ if (!Number.isNaN(parsed)) {
1692
+ widthIndex = Math.max(0, Math.min(widthSteps.length - 1, parsed));
1693
+ }
1694
+ }
1695
+ } catch (err) {
1696
+ // Ignore storage errors (e.g. private mode)
1697
+ }
1698
+
1699
+ function applySplitWidth() {
1700
+ const value = widthSteps[widthIndex];
1701
+ document.documentElement.style.setProperty('--split-max-width', value);
1702
+ try {
1703
+ localStorage.setItem('splitWidthIndex', String(widthIndex));
1704
+ } catch (err) {
1705
+ // Ignore storage errors
1706
+ }
1707
+ }
1708
+
1709
+ tightenButton.addEventListener('click', () => {
1710
+ widthIndex = Math.max(0, widthIndex - 1);
1711
+ applySplitWidth();
1712
+ });
1713
+ widenButton.addEventListener('click', () => {
1714
+ widthIndex = Math.min(widthSteps.length - 1, widthIndex + 1);
1715
+ applySplitWidth();
1716
+ });
1717
+ applySplitWidth();
1718
+ </script>
1719
+ """
1720
+ return HTMLResponse(_page_shell("Split View", body, extra_head=extra_head, extra_scripts=extra_scripts))
1721
+
1722
+ if view == "source":
1723
+ source_path = index.md_path_by_hash.get(source_hash)
1724
+ if not source_path:
1725
+ body = nav_html + '<div class="warning">Source markdown not found. Provide --md-root to enable source viewing.</div>'
1726
+ return HTMLResponse(shell("Source", body))
1727
+ try:
1728
+ raw = source_path.read_text(encoding="utf-8")
1729
+ except UnicodeDecodeError:
1730
+ raw = source_path.read_text(encoding="latin-1")
1731
+ rendered = _render_markdown_with_math_placeholders(md, raw)
1732
+ body = (
1733
+ nav_html
1734
+ + f"<h2>{html.escape(str(paper.get('paper_title') or 'Paper'))}</h2>"
1735
+ + f'<div class="muted">{html.escape(str(source_path))}</div>'
1736
+ + '<div class="muted" style="margin-top:10px;">Rendered from source markdown:</div>'
1737
+ + f'<div id="content">{rendered}</div>'
1738
+ + "<details style='margin-top:12px;'><summary>Raw markdown</summary>"
1739
+ + f"<pre><code>{html.escape(raw)}</code></pre></details>"
1740
+ )
1741
+ extra_head = f'<link rel="stylesheet" href="{_CDN_KATEX}" />'
1742
+ extra_scripts = f"""
1743
+ <script src="{_CDN_MERMAID}"></script>
1744
+ <script src="{_CDN_KATEX_JS}"></script>
1745
+ <script src="{_CDN_KATEX_AUTO}"></script>
1746
+ <script>
1747
+ document.querySelectorAll('code.language-mermaid').forEach((code) => {{
1748
+ const pre = code.parentElement;
1749
+ const div = document.createElement('div');
1750
+ div.className = 'mermaid';
1751
+ div.textContent = code.textContent;
1752
+ pre.replaceWith(div);
1753
+ }});
1754
+ if (window.mermaid) {{
1755
+ mermaid.initialize({{ startOnLoad: false }});
1756
+ mermaid.run();
1757
+ }}
1758
+ if (window.renderMathInElement) {{
1759
+ renderMathInElement(document.getElementById('content'), {{
1760
+ delimiters: [
1761
+ {{left: '$$', right: '$$', display: true}},
1762
+ {{left: '$', right: '$', display: false}},
1763
+ {{left: '\\\\(', right: '\\\\)', display: false}},
1764
+ {{left: '\\\\[', right: '\\\\]', display: true}}
1765
+ ],
1766
+ throwOnError: false
1767
+ }});
1768
+ }}
1769
+ </script>
1770
+ """
1771
+ return HTMLResponse(shell("Source", body, extra_head=extra_head, extra_scripts=extra_scripts))
1772
+
1773
+ if view == "pdf":
1774
+ if not pdf_path:
1775
+ body = nav_html + '<div class="warning">PDF not found. Provide --pdf-root to enable PDF viewing.</div>'
1776
+ return HTMLResponse(shell("PDF", body))
1777
+ body = nav_html + f"""
1778
+ <h2>{html.escape(str(paper.get('paper_title') or 'Paper'))}</h2>
1779
+ <div class="muted">{html.escape(str(pdf_path.name))}</div>
1780
+ <div style="display:flex; gap:8px; align-items:center; margin: 10px 0;">
1781
+ <button id="prev" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">Prev</button>
1782
+ <button id="next" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">Next</button>
1783
+ <span class="muted">Page <span id="page_num">1</span> / <span id="page_count">?</span></span>
1784
+ <span style="flex:1"></span>
1785
+ <button id="zoomOut" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">-</button>
1786
+ <button id="zoomIn" style="padding:6px 10px; border-radius:8px; border:1px solid #d0d7de; background:#f6f8fa; cursor:pointer;">+</button>
1787
+ </div>
1788
+ <canvas id="the-canvas" style="width: 100%; border: 1px solid #d0d7de; border-radius: 10px;"></canvas>
1789
+ """
1790
+ extra_scripts = f"""
1791
+ <script src="{_CDN_PDFJS}"></script>
1792
+ <script>
1793
+ const url = {json.dumps(pdf_url)};
1794
+ pdfjsLib.GlobalWorkerOptions.workerSrc = {json.dumps(_CDN_PDFJS_WORKER)};
1795
+ let pdfDoc = null;
1796
+ let pageNum = 1;
1797
+ let pageRendering = false;
1798
+ let pageNumPending = null;
1799
+ let zoomLevel = 1.0;
1800
+ const canvas = document.getElementById('the-canvas');
1801
+ const ctx = canvas.getContext('2d');
1802
+
1803
+ function renderPage(num) {{
1804
+ pageRendering = true;
1805
+ pdfDoc.getPage(num).then((page) => {{
1806
+ const baseViewport = page.getViewport({{scale: 1}});
1807
+ const containerWidth = canvas.clientWidth || baseViewport.width;
1808
+ const fitScale = containerWidth / baseViewport.width;
1809
+ const scale = fitScale * zoomLevel;
1810
+
1811
+ const viewport = page.getViewport({{scale}});
1812
+ const outputScale = window.devicePixelRatio || 1;
1813
+
1814
+ canvas.width = Math.floor(viewport.width * outputScale);
1815
+ canvas.height = Math.floor(viewport.height * outputScale);
1816
+ canvas.style.width = Math.floor(viewport.width) + 'px';
1817
+ canvas.style.height = Math.floor(viewport.height) + 'px';
1818
+
1819
+ const transform = outputScale !== 1 ? [outputScale, 0, 0, outputScale, 0, 0] : null;
1820
+ const renderContext = {{ canvasContext: ctx, viewport, transform }};
1821
+ const renderTask = page.render(renderContext);
1822
+ renderTask.promise.then(() => {{
1823
+ pageRendering = false;
1824
+ document.getElementById('page_num').textContent = String(pageNum);
1825
+ if (pageNumPending !== null) {{
1826
+ const next = pageNumPending;
1827
+ pageNumPending = null;
1828
+ renderPage(next);
1829
+ }}
1830
+ }});
1831
+ }});
1832
+ }}
1833
+
1834
+ function queueRenderPage(num) {{
1835
+ if (pageRendering) {{
1836
+ pageNumPending = num;
1837
+ }} else {{
1838
+ renderPage(num);
1839
+ }}
1840
+ }}
1841
+
1842
+ function onPrevPage() {{
1843
+ if (pageNum <= 1) return;
1844
+ pageNum--;
1845
+ queueRenderPage(pageNum);
1846
+ }}
1847
+
1848
+ function onNextPage() {{
1849
+ if (pageNum >= pdfDoc.numPages) return;
1850
+ pageNum++;
1851
+ queueRenderPage(pageNum);
1852
+ }}
1853
+
1854
+ function adjustZoom(delta) {{
1855
+ zoomLevel = Math.max(0.5, Math.min(3.0, zoomLevel + delta));
1856
+ queueRenderPage(pageNum);
1857
+ }}
1858
+
1859
+ document.getElementById('prev').addEventListener('click', onPrevPage);
1860
+ document.getElementById('next').addEventListener('click', onNextPage);
1861
+ document.getElementById('zoomOut').addEventListener('click', () => adjustZoom(-0.1));
1862
+ document.getElementById('zoomIn').addEventListener('click', () => adjustZoom(0.1));
1863
+
1864
+ pdfjsLib.getDocument(url).promise.then((pdfDoc_) => {{
1865
+ pdfDoc = pdfDoc_;
1866
+ document.getElementById('page_count').textContent = String(pdfDoc.numPages);
1867
+ renderPage(pageNum);
1868
+ }});
1869
+
1870
+ let resizeTimer = null;
1871
+ window.addEventListener('resize', () => {{
1872
+ if (!pdfDoc) return;
1873
+ if (resizeTimer) clearTimeout(resizeTimer);
1874
+ resizeTimer = setTimeout(() => queueRenderPage(pageNum), 150);
1875
+ }});
1876
+ </script>
1877
+ """
1878
+ return HTMLResponse(shell("PDF", body, extra_scripts=extra_scripts))
1879
+
1880
+ if view == "pdfjs":
1881
+ if not pdf_path:
1882
+ body = nav_html + '<div class="warning">PDF not found. Provide --pdf-root to enable PDF viewing.</div>'
1883
+ return HTMLResponse(shell("PDF Viewer", body))
1884
+ viewer_url = _build_pdfjs_viewer_url(pdf_url)
1885
+ header_html = ""
1886
+ if not embed:
1887
+ header_html = (
1888
+ f"<h2>{html.escape(str(paper.get('paper_title') or 'Paper'))}</h2>"
1889
+ + f'<div class="muted">{html.escape(str(pdf_path.name))}</div>'
1890
+ )
1891
+ frame_height = "calc(100vh - 220px)" if not embed else "calc(100vh - 32px)"
1892
+ body = f"""
1893
+ {nav_html}
1894
+ {header_html}
1895
+ <iframe class="pdfjs-frame" src="{html.escape(viewer_url)}" title="PDF.js Viewer"></iframe>
1896
+ """
1897
+ extra_head = f"""
1898
+ <style>
1899
+ .pdfjs-frame {{
1900
+ width: 100%;
1901
+ height: {frame_height};
1902
+ border: 1px solid #d0d7de;
1903
+ border-radius: 10px;
1904
+ }}
1905
+ </style>
1906
+ """
1907
+ return HTMLResponse(shell("PDF Viewer", body, extra_head=extra_head))
1908
+
1909
+ selected_tag, available_templates = _select_template_tag(paper, template_param)
1910
+ markdown, template_name, warning = _render_paper_markdown(
1911
+ paper,
1912
+ request.app.state.fallback_language,
1913
+ template_tag=selected_tag,
1914
+ )
1915
+ rendered_html = _render_markdown_with_math_placeholders(md, markdown)
1916
+
1917
+ warning_html = f'<div class="warning">{html.escape(warning)}</div>' if warning else ""
1918
+ title = str(paper.get("paper_title") or "Paper")
1919
+ outline_top = "72px" if not embed else "16px"
1920
+ template_controls = f'<div class="muted">Template: {html.escape(template_name)}</div>'
1921
+ if available_templates:
1922
+ options = "\n".join(
1923
+ f'<option value="{html.escape(tag)}"{" selected" if tag == selected_tag else ""}>{html.escape(tag)}</option>'
1924
+ for tag in available_templates
1925
+ )
1926
+ template_controls = f"""
1927
+ <div class="muted" style="margin: 6px 0;">
1928
+ Template:
1929
+ <select id="templateSelect" style="padding:6px 8px; border:1px solid #d0d7de; border-radius:6px;">
1930
+ {options}
1931
+ </select>
1932
+ </div>
1933
+ <script>
1934
+ const templateSelect = document.getElementById('templateSelect');
1935
+ if (templateSelect) {{
1936
+ templateSelect.addEventListener('change', () => {{
1937
+ const params = new URLSearchParams(window.location.search);
1938
+ params.set('view', 'summary');
1939
+ params.set('template', templateSelect.value);
1940
+ window.location.search = params.toString();
1941
+ }});
1942
+ }}
1943
+ </script>
1944
+ """
1945
+ outline_html = """
1946
+ <button id="outlineToggle" class="outline-toggle" title="Toggle outline">☰</button>
1947
+ <div id="outlinePanel" class="outline-panel collapsed">
1948
+ <div class="outline-title">Outline</div>
1949
+ <div id="outlineList" class="outline-list"></div>
1950
+ </div>
1951
+ <button id="backToTop" class="back-to-top" title="Back to top">↑</button>
1952
+ """
1953
+ body = f"""
1954
+ <h2>{html.escape(title)}</h2>
1955
+ {template_controls}
1956
+ {warning_html}
1957
+ {nav_html}
1958
+ {outline_html}
1959
+ <div id="content">{rendered_html}</div>
1960
+ """
1961
+
1962
+ extra_head = f"""
1963
+ <link rel="stylesheet" href="{_CDN_KATEX}" />
1964
+ <style>
1965
+ :root {{
1966
+ --outline-top: {outline_top};
1967
+ }}
1968
+ .outline-toggle {{
1969
+ position: fixed;
1970
+ top: var(--outline-top);
1971
+ left: 16px;
1972
+ z-index: 20;
1973
+ padding: 6px 10px;
1974
+ border-radius: 8px;
1975
+ border: 1px solid #d0d7de;
1976
+ background: #f6f8fa;
1977
+ cursor: pointer;
1978
+ }}
1979
+ .outline-panel {{
1980
+ position: fixed;
1981
+ top: calc(var(--outline-top) + 42px);
1982
+ left: 16px;
1983
+ width: 240px;
1984
+ max-height: 60vh;
1985
+ overflow: auto;
1986
+ border: 1px solid #d0d7de;
1987
+ border-radius: 10px;
1988
+ background: #ffffff;
1989
+ padding: 10px;
1990
+ z-index: 20;
1991
+ box-shadow: 0 6px 18px rgba(0, 0, 0, 0.08);
1992
+ }}
1993
+ .outline-panel.collapsed {{
1994
+ display: none;
1995
+ }}
1996
+ .outline-title {{
1997
+ font-size: 12px;
1998
+ text-transform: uppercase;
1999
+ letter-spacing: 0.08em;
2000
+ color: #57606a;
2001
+ margin-bottom: 8px;
2002
+ }}
2003
+ .outline-list a {{
2004
+ display: block;
2005
+ color: #0969da;
2006
+ text-decoration: none;
2007
+ padding: 4px 0;
2008
+ }}
2009
+ .outline-list a:hover {{
2010
+ text-decoration: underline;
2011
+ }}
2012
+ .back-to-top {{
2013
+ position: fixed;
2014
+ left: 16px;
2015
+ bottom: 16px;
2016
+ padding: 6px 10px;
2017
+ border-radius: 999px;
2018
+ border: 1px solid #d0d7de;
2019
+ background: #ffffff;
2020
+ cursor: pointer;
2021
+ opacity: 0;
2022
+ pointer-events: none;
2023
+ transition: opacity 0.2s ease;
2024
+ z-index: 20;
2025
+ }}
2026
+ .back-to-top.visible {{
2027
+ opacity: 1;
2028
+ pointer-events: auto;
2029
+ }}
2030
+ @media (max-width: 900px) {{
2031
+ .outline-panel {{
2032
+ width: 200px;
2033
+ }}
2034
+ }}
2035
+ </style>
2036
+ """
2037
+ extra_scripts = f"""
2038
+ <script src="{_CDN_MERMAID}"></script>
2039
+ <script src="{_CDN_KATEX_JS}"></script>
2040
+ <script src="{_CDN_KATEX_AUTO}"></script>
2041
+ <script>
2042
+ // Mermaid: convert fenced code blocks to mermaid divs
2043
+ document.querySelectorAll('code.language-mermaid').forEach((code) => {{
2044
+ const pre = code.parentElement;
2045
+ const div = document.createElement('div');
2046
+ div.className = 'mermaid';
2047
+ div.textContent = code.textContent;
2048
+ pre.replaceWith(div);
2049
+ }});
2050
+ if (window.mermaid) {{
2051
+ mermaid.initialize({{ startOnLoad: false }});
2052
+ mermaid.run();
2053
+ }}
2054
+ if (window.renderMathInElement) {{
2055
+ renderMathInElement(document.getElementById('content'), {{
2056
+ delimiters: [
2057
+ {{left: '$$', right: '$$', display: true}},
2058
+ {{left: '$', right: '$', display: false}},
2059
+ {{left: '\\\\(', right: '\\\\)', display: false}},
2060
+ {{left: '\\\\[', right: '\\\\]', display: true}}
2061
+ ],
2062
+ throwOnError: false
2063
+ }});
2064
+ }}
2065
+ const outlineToggle = document.getElementById('outlineToggle');
2066
+ const outlinePanel = document.getElementById('outlinePanel');
2067
+ const outlineList = document.getElementById('outlineList');
2068
+ const backToTop = document.getElementById('backToTop');
2069
+
2070
+ function slugify(text) {{
2071
+ return text.toLowerCase().trim()
2072
+ .replace(/[^a-z0-9\\s-]/g, '')
2073
+ .replace(/\\s+/g, '-')
2074
+ .replace(/-+/g, '-');
2075
+ }}
2076
+
2077
+ function buildOutline() {{
2078
+ if (!outlineList) return;
2079
+ const content = document.getElementById('content');
2080
+ if (!content) return;
2081
+ const headings = content.querySelectorAll('h1, h2, h3, h4');
2082
+ if (!headings.length) {{
2083
+ outlineList.innerHTML = '<div class="muted">No headings</div>';
2084
+ return;
2085
+ }}
2086
+ const used = new Set();
2087
+ outlineList.innerHTML = '';
2088
+ headings.forEach((heading) => {{
2089
+ let id = heading.id;
2090
+ if (!id) {{
2091
+ const base = slugify(heading.textContent || 'section') || 'section';
2092
+ id = base;
2093
+ let i = 1;
2094
+ while (used.has(id) || document.getElementById(id)) {{
2095
+ id = `${{base}}-${{i++}}`;
2096
+ }}
2097
+ heading.id = id;
2098
+ }}
2099
+ used.add(id);
2100
+ const level = parseInt(heading.tagName.slice(1), 10) || 1;
2101
+ const link = document.createElement('a');
2102
+ link.href = `#${{id}}`;
2103
+ link.textContent = heading.textContent || '';
2104
+ link.style.paddingLeft = `${{(level - 1) * 12}}px`;
2105
+ outlineList.appendChild(link);
2106
+ }});
2107
+ }}
2108
+
2109
+ function toggleBackToTop() {{
2110
+ if (!backToTop) return;
2111
+ if (window.scrollY > 300) {{
2112
+ backToTop.classList.add('visible');
2113
+ }} else {{
2114
+ backToTop.classList.remove('visible');
2115
+ }}
2116
+ }}
2117
+
2118
+ if (outlineToggle && outlinePanel) {{
2119
+ outlineToggle.addEventListener('click', () => {{
2120
+ outlinePanel.classList.toggle('collapsed');
2121
+ }});
2122
+ }}
2123
+
2124
+ if (backToTop) {{
2125
+ backToTop.addEventListener('click', () => {{
2126
+ window.scrollTo({{ top: 0, behavior: 'smooth' }});
2127
+ }});
2128
+ }}
2129
+
2130
+ buildOutline();
2131
+ window.addEventListener('scroll', toggleBackToTop);
2132
+ toggleBackToTop();
2133
+ </script>
2134
+ """
2135
+ return HTMLResponse(shell(title, body, extra_head=extra_head, extra_scripts=extra_scripts))
2136
+
2137
+
2138
async def _api_stats(request: Request) -> JSONResponse:
    """Return the ``stats`` attribute of the application's paper index as JSON."""
    return JSONResponse(request.app.state.index.stats)
2141
+
2142
+
2143
async def _api_pdf(request: Request) -> Response:
    """Serve the PDF file registered for the ``source_hash`` path parameter.

    Responds with 404 when the index has no PDF path for the hash, and with
    403 when allowed roots are configured and ``_ensure_under_roots`` rejects
    the path. Otherwise the file is returned as a ``FileResponse``.
    """
    state = request.app.state
    paper_index: PaperIndex = state.index
    wanted_hash = request.path_params["source_hash"]
    resolved = paper_index.pdf_path_by_hash.get(wanted_hash)
    if not resolved:
        return Response("PDF not found", status_code=404)

    # The root check only applies when at least one allowed root is configured.
    roots: list[Path] = state.pdf_roots
    if roots and not _ensure_under_roots(resolved, roots):
        return Response("Forbidden", status_code=403)
    return FileResponse(resolved)
2153
+
2154
+
2155
async def _stats_page(request: Request) -> HTMLResponse:
    """Render the "Stats" HTML page.

    The page body holds five empty chart containers (year, month, tags,
    authors, venues). The inline script fetches ``/api/stats`` and draws one
    ECharts bar chart per container; the tags, authors and venues charts are
    limited to the first 20 items of their lists.

    The ``request`` argument is not used by this handler.
    """
    # Static markup: a heading, a note, and one sized <div> per chart.
    body = """
<h2>Stats</h2>
<div class="muted">Charts are rendered with ECharts (CDN).</div>
<div id="year" style="width:100%;height:360px"></div>
<div id="month" style="width:100%;height:360px"></div>
<div id="tags" style="width:100%;height:420px"></div>
<div id="authors" style="width:100%;height:420px"></div>
<div id="venues" style="width:100%;height:420px"></div>
"""
    # f-string: only ``_CDN_ECHARTS`` is interpolated; every ``{{`` / ``}}``
    # pair is an escaped literal brace belonging to the JavaScript source.
    scripts = f"""
<script src="{_CDN_ECHARTS}"></script>
<script>
async function main() {{
const res = await fetch('/api/stats');
const data = await res.json();

function bar(el, title, items) {{
const chart = echarts.init(document.getElementById(el));
const labels = items.map(x => x.label);
const counts = items.map(x => x.count);
chart.setOption({{
title: {{ text: title }},
tooltip: {{ trigger: 'axis' }},
xAxis: {{ type: 'category', data: labels }},
yAxis: {{ type: 'value' }},
series: [{{ type: 'bar', data: counts }}]
}});
}}

bar('year', 'Publication Year', data.years || []);
bar('month', 'Publication Month', data.months || []);
bar('tags', 'Top Tags', (data.tags || []).slice(0, 20));
bar('authors', 'Top Authors', (data.authors || []).slice(0, 20));
bar('venues', 'Top Venues', (data.venues || []).slice(0, 20));
}}
main();
</script>
"""
    # The markup and scripts are handed to ``_page_shell`` under the title "Stats".
    return HTMLResponse(_page_shell("Stats", body, extra_scripts=scripts))
2195
+
2196
+
2197
+ def _normalize_bibtex_title(title: str) -> str:
2198
+ value = title.replace("{", "").replace("}", "")
2199
+ value = re.sub(r"[^a-z0-9]+", " ", value.lower())
2200
+ return re.sub(r"\\s+", " ", value).strip()
2201
+
2202
+
2203
+ def _title_similarity(a: str, b: str) -> float:
2204
+ import difflib
2205
+
2206
+ if not a or not b:
2207
+ return 0.0
2208
+ return difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()
2209
+
2210
+
2211
def enrich_with_bibtex(papers: list[dict[str, Any]], bibtex_path: Path) -> None:
    """Attach the best-matching BibTeX entry to each paper, in place.

    Entries parsed from ``bibtex_path`` are indexed by the first 16
    characters of their normalized title. For each paper that does not
    already carry a dict under ``"bibtex"``, candidates come from the bucket
    with the same prefix, or from all entries when that bucket is empty. The
    highest-scoring candidate is stored under ``paper["bibtex"]`` when its
    similarity is at least 0.9.

    Args:
        papers: Paper records; matched ones gain a ``"bibtex"`` key.
        bibtex_path: Path of the BibTeX file to parse.

    Raises:
        RuntimeError: If pybtex is not available.
    """
    if not PYBTEX_AVAILABLE:
        raise RuntimeError("pybtex is required for --bibtex support")

    library = parse_file(str(bibtex_path))
    records: list[dict[str, Any]] = []
    buckets: dict[str, list[int]] = {}
    for cite_key, bib_entry in library.entries.items():
        bib_fields = dict(bib_entry.fields)
        raw_title = str(bib_fields.get("title") or "").strip()
        normalized = _normalize_bibtex_title(raw_title)
        if not normalized:
            continue
        record = {
            "key": cite_key,
            "type": bib_entry.type,
            "fields": bib_fields,
            "persons": {
                role: [str(person) for person in people]
                for role, people in bib_entry.persons.items()
            },
            # Underscore-prefixed keys are internal and stripped before storing.
            "_title_norm": normalized,
        }
        position = len(records)
        records.append(record)
        buckets.setdefault(normalized[:16], []).append(position)

    for paper in papers:
        if isinstance(paper.get("bibtex"), dict):
            continue
        paper_title = str(paper.get("paper_title") or "").strip()
        paper_norm = _normalize_bibtex_title(paper_title) if paper_title else ""
        if not paper_norm:
            continue

        # Prefer the same-prefix bucket; fall back to every entry when it is empty.
        pool = [records[i] for i in buckets.get(paper_norm[:16], [])] or records

        top_entry = None
        top_score = 0.0
        for candidate in pool:
            similarity = _title_similarity(paper_norm, candidate["_title_norm"])
            if similarity > top_score:
                top_entry, top_score = candidate, similarity

        if top_entry is None or top_score < 0.9:
            continue
        paper["bibtex"] = {
            name: value for name, value in top_entry.items() if not name.startswith("_")
        }
2263
+
2264
+
2265
def create_app(
    *,
    db_paths: list[Path],
    fallback_language: str = "en",
    bibtex_path: Path | None = None,
    md_roots: list[Path] | None = None,
    pdf_roots: list[Path] | None = None,
    cache_dir: Path | None = None,
    use_cache: bool = True,
) -> Starlette:
    """Assemble the Starlette application.

    Loads the papers, builds the index, registers the page and API routes,
    and stores shared objects on ``app.state``.

    Args:
        db_paths: Forwarded to ``_load_or_merge_papers``.
        fallback_language: Stored on ``app.state.fallback_language``.
        bibtex_path: Forwarded to ``_load_or_merge_papers``.
        md_roots: Forwarded to ``build_index``; a falsy value becomes ``[]``.
        pdf_roots: Forwarded to ``build_index`` and stored on
            ``app.state.pdf_roots``; a falsy value becomes ``[]``.
        cache_dir: Forwarded to ``_load_or_merge_papers``.
        use_cache: Forwarded to ``_load_or_merge_papers``.

    Returns:
        The configured ``Starlette`` instance.
    """
    papers = _load_or_merge_papers(db_paths, bibtex_path, cache_dir, use_cache)

    if not md_roots:
        md_roots = []
    if not pdf_roots:
        pdf_roots = []
    index = build_index(papers, md_roots=md_roots, pdf_roots=pdf_roots)
    md = _md_renderer()

    get_endpoints = (
        ("/", _index_page),
        ("/stats", _stats_page),
        ("/paper/{source_hash:str}", _paper_detail),
        ("/api/papers", _api_papers),
        ("/api/stats", _api_stats),
        ("/api/pdf/{source_hash:str}", _api_pdf),
    )
    routes = [Route(path, handler, methods=["GET"]) for path, handler in get_endpoints]

    if _PDFJS_STATIC_DIR.exists():
        pdfjs_assets = StaticFiles(directory=str(_PDFJS_STATIC_DIR), html=True)
        routes.append(Mount("/pdfjs", app=pdfjs_assets, name="pdfjs"))
    elif pdf_roots:
        # Warn only when PDF roots are configured but the viewer assets are missing.
        logger.warning(
            "PDF.js viewer assets not found at %s; PDF Viewer mode will be unavailable.",
            _PDFJS_STATIC_DIR,
        )

    app = Starlette(routes=routes)
    state = app.state
    state.index = index
    state.md = md
    state.fallback_language = fallback_language
    state.pdf_roots = pdf_roots
    return app