deepresearch-flow 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (417) hide show
  1. deepresearch_flow/__init__.py +5 -0
  2. deepresearch_flow/cli.py +23 -0
  3. deepresearch_flow/paper/__init__.py +1 -0
  4. deepresearch_flow/paper/cli.py +286 -0
  5. deepresearch_flow/paper/config.py +249 -0
  6. deepresearch_flow/paper/db.py +768 -0
  7. deepresearch_flow/paper/extract.py +870 -0
  8. deepresearch_flow/paper/llm.py +115 -0
  9. deepresearch_flow/paper/prompt_templates/__init__.py +1 -0
  10. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +6 -0
  11. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +82 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +6 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +28 -0
  14. deepresearch_flow/paper/prompt_templates/simple_system.j2 +6 -0
  15. deepresearch_flow/paper/prompt_templates/simple_user.j2 +24 -0
  16. deepresearch_flow/paper/prompt_templates/three_pass_system.j2 +6 -0
  17. deepresearch_flow/paper/prompt_templates/three_pass_user.j2 +44 -0
  18. deepresearch_flow/paper/prompts.py +11 -0
  19. deepresearch_flow/paper/providers/__init__.py +1 -0
  20. deepresearch_flow/paper/providers/azure_openai.py +66 -0
  21. deepresearch_flow/paper/providers/base.py +19 -0
  22. deepresearch_flow/paper/providers/claude.py +71 -0
  23. deepresearch_flow/paper/providers/dashscope.py +58 -0
  24. deepresearch_flow/paper/providers/gemini.py +116 -0
  25. deepresearch_flow/paper/providers/ollama.py +46 -0
  26. deepresearch_flow/paper/providers/openai_compatible.py +60 -0
  27. deepresearch_flow/paper/render.py +64 -0
  28. deepresearch_flow/paper/schema.py +58 -0
  29. deepresearch_flow/paper/schemas/__init__.py +1 -0
  30. deepresearch_flow/paper/schemas/deep_read_schema.json +46 -0
  31. deepresearch_flow/paper/schemas/default_paper_schema.json +47 -0
  32. deepresearch_flow/paper/schemas/eight_questions_schema.json +34 -0
  33. deepresearch_flow/paper/schemas/three_pass_schema.json +24 -0
  34. deepresearch_flow/paper/template_registry.py +189 -0
  35. deepresearch_flow/paper/templates/__init__.py +1 -0
  36. deepresearch_flow/paper/templates/deep_read.md.j2 +79 -0
  37. deepresearch_flow/paper/templates/default_paper.md.j2 +32 -0
  38. deepresearch_flow/paper/templates/eight_questions.md.j2 +49 -0
  39. deepresearch_flow/paper/templates/three_pass.md.j2 +28 -0
  40. deepresearch_flow/paper/utils.py +136 -0
  41. deepresearch_flow/paper/web/__init__.py +2 -0
  42. deepresearch_flow/paper/web/app.py +2307 -0
  43. deepresearch_flow/paper/web/pdfjs/LICENSE +177 -0
  44. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-EUC-H.bcmap +0 -0
  45. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-EUC-V.bcmap +0 -0
  46. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-H.bcmap +0 -0
  47. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-RKSJ-H.bcmap +0 -0
  48. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-RKSJ-V.bcmap +0 -0
  49. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-V.bcmap +0 -0
  50. deepresearch_flow/paper/web/pdfjs/web/cmaps/78ms-RKSJ-H.bcmap +0 -0
  51. deepresearch_flow/paper/web/pdfjs/web/cmaps/78ms-RKSJ-V.bcmap +0 -0
  52. deepresearch_flow/paper/web/pdfjs/web/cmaps/83pv-RKSJ-H.bcmap +0 -0
  53. deepresearch_flow/paper/web/pdfjs/web/cmaps/90ms-RKSJ-H.bcmap +0 -0
  54. deepresearch_flow/paper/web/pdfjs/web/cmaps/90ms-RKSJ-V.bcmap +0 -0
  55. deepresearch_flow/paper/web/pdfjs/web/cmaps/90msp-RKSJ-H.bcmap +0 -0
  56. deepresearch_flow/paper/web/pdfjs/web/cmaps/90msp-RKSJ-V.bcmap +0 -0
  57. deepresearch_flow/paper/web/pdfjs/web/cmaps/90pv-RKSJ-H.bcmap +0 -0
  58. deepresearch_flow/paper/web/pdfjs/web/cmaps/90pv-RKSJ-V.bcmap +0 -0
  59. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-H.bcmap +0 -0
  60. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-RKSJ-H.bcmap +0 -0
  61. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-RKSJ-V.bcmap +0 -0
  62. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-V.bcmap +0 -0
  63. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-0.bcmap +0 -0
  64. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-1.bcmap +0 -0
  65. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-2.bcmap +0 -0
  66. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-3.bcmap +0 -0
  67. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-4.bcmap +0 -0
  68. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-5.bcmap +0 -0
  69. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-6.bcmap +0 -0
  70. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-UCS2.bcmap +0 -0
  71. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-0.bcmap +0 -0
  72. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-1.bcmap +0 -0
  73. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-2.bcmap +0 -0
  74. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-3.bcmap +0 -0
  75. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-4.bcmap +0 -0
  76. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-5.bcmap +0 -0
  77. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-UCS2.bcmap +0 -0
  78. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-0.bcmap +0 -0
  79. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-1.bcmap +0 -0
  80. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-2.bcmap +0 -0
  81. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-3.bcmap +0 -0
  82. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-4.bcmap +0 -0
  83. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-5.bcmap +0 -0
  84. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-6.bcmap +0 -0
  85. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-UCS2.bcmap +0 -0
  86. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-0.bcmap +0 -0
  87. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-1.bcmap +0 -0
  88. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-2.bcmap +0 -0
  89. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-UCS2.bcmap +0 -0
  90. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5-H.bcmap +0 -0
  91. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5-V.bcmap +0 -0
  92. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5pc-H.bcmap +0 -0
  93. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5pc-V.bcmap +0 -0
  94. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS-EUC-H.bcmap +0 -0
  95. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS-EUC-V.bcmap +0 -0
  96. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS1-H.bcmap +0 -0
  97. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS1-V.bcmap +0 -0
  98. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS2-H.bcmap +0 -0
  99. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS2-V.bcmap +3 -0
  100. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETHK-B5-H.bcmap +0 -0
  101. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETHK-B5-V.bcmap +0 -0
  102. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETen-B5-H.bcmap +0 -0
  103. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETen-B5-V.bcmap +0 -0
  104. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETenms-B5-H.bcmap +3 -0
  105. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETenms-B5-V.bcmap +0 -0
  106. deepresearch_flow/paper/web/pdfjs/web/cmaps/EUC-H.bcmap +0 -0
  107. deepresearch_flow/paper/web/pdfjs/web/cmaps/EUC-V.bcmap +0 -0
  108. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-H.bcmap +0 -0
  109. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-RKSJ-H.bcmap +0 -0
  110. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-RKSJ-V.bcmap +0 -0
  111. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-V.bcmap +0 -0
  112. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-EUC-H.bcmap +0 -0
  113. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-EUC-V.bcmap +0 -0
  114. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-H.bcmap +4 -0
  115. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-V.bcmap +0 -0
  116. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK-EUC-H.bcmap +0 -0
  117. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK-EUC-V.bcmap +0 -0
  118. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK2K-H.bcmap +0 -0
  119. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK2K-V.bcmap +0 -0
  120. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBKp-EUC-H.bcmap +0 -0
  121. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBKp-EUC-V.bcmap +0 -0
  122. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-EUC-H.bcmap +0 -0
  123. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-EUC-V.bcmap +0 -0
  124. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-H.bcmap +0 -0
  125. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-V.bcmap +0 -0
  126. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBTpc-EUC-H.bcmap +0 -0
  127. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBTpc-EUC-V.bcmap +0 -0
  128. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBpc-EUC-H.bcmap +0 -0
  129. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBpc-EUC-V.bcmap +0 -0
  130. deepresearch_flow/paper/web/pdfjs/web/cmaps/H.bcmap +0 -0
  131. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdla-B5-H.bcmap +0 -0
  132. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdla-B5-V.bcmap +0 -0
  133. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdlb-B5-H.bcmap +0 -0
  134. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdlb-B5-V.bcmap +0 -0
  135. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKgccs-B5-H.bcmap +0 -0
  136. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKgccs-B5-V.bcmap +0 -0
  137. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm314-B5-H.bcmap +0 -0
  138. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm314-B5-V.bcmap +0 -0
  139. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm471-B5-H.bcmap +0 -0
  140. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm471-B5-V.bcmap +0 -0
  141. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKscs-B5-H.bcmap +0 -0
  142. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKscs-B5-V.bcmap +0 -0
  143. deepresearch_flow/paper/web/pdfjs/web/cmaps/Hankaku.bcmap +0 -0
  144. deepresearch_flow/paper/web/pdfjs/web/cmaps/Hiragana.bcmap +0 -0
  145. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-EUC-H.bcmap +0 -0
  146. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-EUC-V.bcmap +0 -0
  147. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-H.bcmap +0 -0
  148. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-Johab-H.bcmap +0 -0
  149. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-Johab-V.bcmap +0 -0
  150. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-V.bcmap +0 -0
  151. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-H.bcmap +0 -0
  152. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-HW-H.bcmap +0 -0
  153. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-HW-V.bcmap +0 -0
  154. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-V.bcmap +0 -0
  155. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCpc-EUC-H.bcmap +0 -0
  156. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCpc-EUC-V.bcmap +0 -0
  157. deepresearch_flow/paper/web/pdfjs/web/cmaps/Katakana.bcmap +0 -0
  158. deepresearch_flow/paper/web/pdfjs/web/cmaps/LICENSE +36 -0
  159. deepresearch_flow/paper/web/pdfjs/web/cmaps/NWP-H.bcmap +0 -0
  160. deepresearch_flow/paper/web/pdfjs/web/cmaps/NWP-V.bcmap +0 -0
  161. deepresearch_flow/paper/web/pdfjs/web/cmaps/RKSJ-H.bcmap +0 -0
  162. deepresearch_flow/paper/web/pdfjs/web/cmaps/RKSJ-V.bcmap +0 -0
  163. deepresearch_flow/paper/web/pdfjs/web/cmaps/Roman.bcmap +0 -0
  164. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UCS2-H.bcmap +0 -0
  165. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UCS2-V.bcmap +0 -0
  166. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF16-H.bcmap +0 -0
  167. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF16-V.bcmap +0 -0
  168. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF32-H.bcmap +0 -0
  169. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF32-V.bcmap +0 -0
  170. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF8-H.bcmap +0 -0
  171. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF8-V.bcmap +0 -0
  172. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UCS2-H.bcmap +0 -0
  173. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UCS2-V.bcmap +0 -0
  174. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF16-H.bcmap +0 -0
  175. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF16-V.bcmap +0 -0
  176. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF32-H.bcmap +0 -0
  177. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF32-V.bcmap +0 -0
  178. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF8-H.bcmap +0 -0
  179. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF8-V.bcmap +0 -0
  180. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-H.bcmap +0 -0
  181. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-HW-H.bcmap +0 -0
  182. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-HW-V.bcmap +0 -0
  183. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-V.bcmap +0 -0
  184. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF16-H.bcmap +0 -0
  185. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF16-V.bcmap +0 -0
  186. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF32-H.bcmap +0 -0
  187. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF32-V.bcmap +0 -0
  188. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF8-H.bcmap +0 -0
  189. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF8-V.bcmap +0 -0
  190. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF16-H.bcmap +0 -0
  191. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF16-V.bcmap +0 -0
  192. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF32-H.bcmap +0 -0
  193. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF32-V.bcmap +0 -0
  194. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF8-H.bcmap +0 -0
  195. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF8-V.bcmap +0 -0
  196. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UCS2-HW-V.bcmap +0 -0
  197. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UCS2-V.bcmap +0 -0
  198. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UTF8-V.bcmap +0 -0
  199. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX0213-UTF32-H.bcmap +0 -0
  200. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX0213-UTF32-V.bcmap +0 -0
  201. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX02132004-UTF32-H.bcmap +0 -0
  202. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX02132004-UTF32-V.bcmap +0 -0
  203. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UCS2-H.bcmap +0 -0
  204. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UCS2-V.bcmap +0 -0
  205. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF16-H.bcmap +0 -0
  206. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF16-V.bcmap +0 -0
  207. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF32-H.bcmap +0 -0
  208. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF32-V.bcmap +0 -0
  209. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF8-H.bcmap +0 -0
  210. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF8-V.bcmap +0 -0
  211. deepresearch_flow/paper/web/pdfjs/web/cmaps/V.bcmap +0 -0
  212. deepresearch_flow/paper/web/pdfjs/web/cmaps/WP-Symbol.bcmap +0 -0
  213. deepresearch_flow/paper/web/pdfjs/web/compressed.tracemonkey-pldi-09.pdf +0 -0
  214. deepresearch_flow/paper/web/pdfjs/web/debugger.css +111 -0
  215. deepresearch_flow/paper/web/pdfjs/web/debugger.js +611 -0
  216. deepresearch_flow/paper/web/pdfjs/web/images/altText_add.svg +3 -0
  217. deepresearch_flow/paper/web/pdfjs/web/images/altText_done.svg +3 -0
  218. deepresearch_flow/paper/web/pdfjs/web/images/annotation-check.svg +11 -0
  219. deepresearch_flow/paper/web/pdfjs/web/images/annotation-comment.svg +16 -0
  220. deepresearch_flow/paper/web/pdfjs/web/images/annotation-help.svg +26 -0
  221. deepresearch_flow/paper/web/pdfjs/web/images/annotation-insert.svg +10 -0
  222. deepresearch_flow/paper/web/pdfjs/web/images/annotation-key.svg +11 -0
  223. deepresearch_flow/paper/web/pdfjs/web/images/annotation-newparagraph.svg +11 -0
  224. deepresearch_flow/paper/web/pdfjs/web/images/annotation-noicon.svg +7 -0
  225. deepresearch_flow/paper/web/pdfjs/web/images/annotation-note.svg +42 -0
  226. deepresearch_flow/paper/web/pdfjs/web/images/annotation-paperclip.svg +6 -0
  227. deepresearch_flow/paper/web/pdfjs/web/images/annotation-paragraph.svg +16 -0
  228. deepresearch_flow/paper/web/pdfjs/web/images/annotation-pushpin.svg +7 -0
  229. deepresearch_flow/paper/web/pdfjs/web/images/cursor-editorFreeText.svg +3 -0
  230. deepresearch_flow/paper/web/pdfjs/web/images/cursor-editorInk.svg +4 -0
  231. deepresearch_flow/paper/web/pdfjs/web/images/findbarButton-next.svg +3 -0
  232. deepresearch_flow/paper/web/pdfjs/web/images/findbarButton-previous.svg +3 -0
  233. deepresearch_flow/paper/web/pdfjs/web/images/gv-toolbarButton-download.svg +3 -0
  234. deepresearch_flow/paper/web/pdfjs/web/images/gv-toolbarButton-openinapp.svg +11 -0
  235. deepresearch_flow/paper/web/pdfjs/web/images/loading-dark.svg +24 -0
  236. deepresearch_flow/paper/web/pdfjs/web/images/loading-icon.gif +0 -0
  237. deepresearch_flow/paper/web/pdfjs/web/images/loading.svg +1 -0
  238. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-documentProperties.svg +3 -0
  239. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-firstPage.svg +3 -0
  240. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-handTool.svg +3 -0
  241. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-lastPage.svg +3 -0
  242. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-rotateCcw.svg +3 -0
  243. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-rotateCw.svg +3 -0
  244. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollHorizontal.svg +3 -0
  245. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollPage.svg +3 -0
  246. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollVertical.svg +3 -0
  247. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollWrapped.svg +3 -0
  248. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-selectTool.svg +3 -0
  249. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadEven.svg +3 -0
  250. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadNone.svg +3 -0
  251. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadOdd.svg +3 -0
  252. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-bookmark.svg +3 -0
  253. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-currentOutlineItem.svg +3 -0
  254. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-download.svg +4 -0
  255. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorFreeText.svg +3 -0
  256. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorInk.svg +4 -0
  257. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorStamp.svg +8 -0
  258. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-menuArrow.svg +3 -0
  259. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-openFile.svg +3 -0
  260. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-pageDown.svg +3 -0
  261. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-pageUp.svg +3 -0
  262. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-presentationMode.svg +3 -0
  263. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-print.svg +3 -0
  264. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-search.svg +3 -0
  265. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-secondaryToolbarToggle.svg +3 -0
  266. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-sidebarToggle.svg +3 -0
  267. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewAttachments.svg +3 -0
  268. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewLayers.svg +3 -0
  269. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewOutline.svg +3 -0
  270. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewThumbnail.svg +3 -0
  271. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-zoomIn.svg +3 -0
  272. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-zoomOut.svg +3 -0
  273. deepresearch_flow/paper/web/pdfjs/web/images/treeitem-collapsed.svg +1 -0
  274. deepresearch_flow/paper/web/pdfjs/web/images/treeitem-expanded.svg +1 -0
  275. deepresearch_flow/paper/web/pdfjs/web/locale/ach/viewer.properties +203 -0
  276. deepresearch_flow/paper/web/pdfjs/web/locale/af/viewer.properties +156 -0
  277. deepresearch_flow/paper/web/pdfjs/web/locale/an/viewer.properties +222 -0
  278. deepresearch_flow/paper/web/pdfjs/web/locale/ar/viewer.properties +224 -0
  279. deepresearch_flow/paper/web/pdfjs/web/locale/ast/viewer.properties +185 -0
  280. deepresearch_flow/paper/web/pdfjs/web/locale/az/viewer.properties +222 -0
  281. deepresearch_flow/paper/web/pdfjs/web/locale/be/viewer.properties +270 -0
  282. deepresearch_flow/paper/web/pdfjs/web/locale/bg/viewer.properties +214 -0
  283. deepresearch_flow/paper/web/pdfjs/web/locale/bn/viewer.properties +218 -0
  284. deepresearch_flow/paper/web/pdfjs/web/locale/bo/viewer.properties +217 -0
  285. deepresearch_flow/paper/web/pdfjs/web/locale/br/viewer.properties +224 -0
  286. deepresearch_flow/paper/web/pdfjs/web/locale/brx/viewer.properties +184 -0
  287. deepresearch_flow/paper/web/pdfjs/web/locale/bs/viewer.properties +173 -0
  288. deepresearch_flow/paper/web/pdfjs/web/locale/ca/viewer.properties +256 -0
  289. deepresearch_flow/paper/web/pdfjs/web/locale/cak/viewer.properties +253 -0
  290. deepresearch_flow/paper/web/pdfjs/web/locale/ckb/viewer.properties +213 -0
  291. deepresearch_flow/paper/web/pdfjs/web/locale/cs/viewer.properties +284 -0
  292. deepresearch_flow/paper/web/pdfjs/web/locale/cy/viewer.properties +270 -0
  293. deepresearch_flow/paper/web/pdfjs/web/locale/da/viewer.properties +270 -0
  294. deepresearch_flow/paper/web/pdfjs/web/locale/de/viewer.properties +270 -0
  295. deepresearch_flow/paper/web/pdfjs/web/locale/dsb/viewer.properties +284 -0
  296. deepresearch_flow/paper/web/pdfjs/web/locale/el/viewer.properties +270 -0
  297. deepresearch_flow/paper/web/pdfjs/web/locale/en-CA/viewer.properties +270 -0
  298. deepresearch_flow/paper/web/pdfjs/web/locale/en-GB/viewer.properties +284 -0
  299. deepresearch_flow/paper/web/pdfjs/web/locale/en-US/viewer.properties +282 -0
  300. deepresearch_flow/paper/web/pdfjs/web/locale/eo/viewer.properties +270 -0
  301. deepresearch_flow/paper/web/pdfjs/web/locale/es-AR/viewer.properties +284 -0
  302. deepresearch_flow/paper/web/pdfjs/web/locale/es-CL/viewer.properties +284 -0
  303. deepresearch_flow/paper/web/pdfjs/web/locale/es-ES/viewer.properties +270 -0
  304. deepresearch_flow/paper/web/pdfjs/web/locale/es-MX/viewer.properties +257 -0
  305. deepresearch_flow/paper/web/pdfjs/web/locale/et/viewer.properties +229 -0
  306. deepresearch_flow/paper/web/pdfjs/web/locale/eu/viewer.properties +284 -0
  307. deepresearch_flow/paper/web/pdfjs/web/locale/fa/viewer.properties +221 -0
  308. deepresearch_flow/paper/web/pdfjs/web/locale/ff/viewer.properties +214 -0
  309. deepresearch_flow/paper/web/pdfjs/web/locale/fi/viewer.properties +270 -0
  310. deepresearch_flow/paper/web/pdfjs/web/locale/fr/viewer.properties +270 -0
  311. deepresearch_flow/paper/web/pdfjs/web/locale/fur/viewer.properties +270 -0
  312. deepresearch_flow/paper/web/pdfjs/web/locale/fy-NL/viewer.properties +270 -0
  313. deepresearch_flow/paper/web/pdfjs/web/locale/ga-IE/viewer.properties +181 -0
  314. deepresearch_flow/paper/web/pdfjs/web/locale/gd/viewer.properties +257 -0
  315. deepresearch_flow/paper/web/pdfjs/web/locale/gl/viewer.properties +267 -0
  316. deepresearch_flow/paper/web/pdfjs/web/locale/gn/viewer.properties +278 -0
  317. deepresearch_flow/paper/web/pdfjs/web/locale/gu-IN/viewer.properties +214 -0
  318. deepresearch_flow/paper/web/pdfjs/web/locale/he/viewer.properties +283 -0
  319. deepresearch_flow/paper/web/pdfjs/web/locale/hi-IN/viewer.properties +227 -0
  320. deepresearch_flow/paper/web/pdfjs/web/locale/hr/viewer.properties +243 -0
  321. deepresearch_flow/paper/web/pdfjs/web/locale/hsb/viewer.properties +284 -0
  322. deepresearch_flow/paper/web/pdfjs/web/locale/hu/viewer.properties +284 -0
  323. deepresearch_flow/paper/web/pdfjs/web/locale/hy-AM/viewer.properties +232 -0
  324. deepresearch_flow/paper/web/pdfjs/web/locale/hye/viewer.properties +229 -0
  325. deepresearch_flow/paper/web/pdfjs/web/locale/ia/viewer.properties +284 -0
  326. deepresearch_flow/paper/web/pdfjs/web/locale/id/viewer.properties +253 -0
  327. deepresearch_flow/paper/web/pdfjs/web/locale/is/viewer.properties +284 -0
  328. deepresearch_flow/paper/web/pdfjs/web/locale/it/viewer.properties +284 -0
  329. deepresearch_flow/paper/web/pdfjs/web/locale/ja/viewer.properties +270 -0
  330. deepresearch_flow/paper/web/pdfjs/web/locale/ka/viewer.properties +284 -0
  331. deepresearch_flow/paper/web/pdfjs/web/locale/kab/viewer.properties +264 -0
  332. deepresearch_flow/paper/web/pdfjs/web/locale/kk/viewer.properties +284 -0
  333. deepresearch_flow/paper/web/pdfjs/web/locale/km/viewer.properties +189 -0
  334. deepresearch_flow/paper/web/pdfjs/web/locale/kn/viewer.properties +166 -0
  335. deepresearch_flow/paper/web/pdfjs/web/locale/ko/viewer.properties +284 -0
  336. deepresearch_flow/paper/web/pdfjs/web/locale/lij/viewer.properties +214 -0
  337. deepresearch_flow/paper/web/pdfjs/web/locale/lo/viewer.properties +257 -0
  338. deepresearch_flow/paper/web/pdfjs/web/locale/locale.properties +333 -0
  339. deepresearch_flow/paper/web/pdfjs/web/locale/lt/viewer.properties +229 -0
  340. deepresearch_flow/paper/web/pdfjs/web/locale/ltg/viewer.properties +192 -0
  341. deepresearch_flow/paper/web/pdfjs/web/locale/lv/viewer.properties +214 -0
  342. deepresearch_flow/paper/web/pdfjs/web/locale/meh/viewer.properties +106 -0
  343. deepresearch_flow/paper/web/pdfjs/web/locale/mk/viewer.properties +211 -0
  344. deepresearch_flow/paper/web/pdfjs/web/locale/mr/viewer.properties +210 -0
  345. deepresearch_flow/paper/web/pdfjs/web/locale/ms/viewer.properties +214 -0
  346. deepresearch_flow/paper/web/pdfjs/web/locale/my/viewer.properties +170 -0
  347. deepresearch_flow/paper/web/pdfjs/web/locale/nb-NO/viewer.properties +284 -0
  348. deepresearch_flow/paper/web/pdfjs/web/locale/ne-NP/viewer.properties +197 -0
  349. deepresearch_flow/paper/web/pdfjs/web/locale/nl/viewer.properties +274 -0
  350. deepresearch_flow/paper/web/pdfjs/web/locale/nn-NO/viewer.properties +270 -0
  351. deepresearch_flow/paper/web/pdfjs/web/locale/oc/viewer.properties +278 -0
  352. deepresearch_flow/paper/web/pdfjs/web/locale/pa-IN/viewer.properties +270 -0
  353. deepresearch_flow/paper/web/pdfjs/web/locale/pl/viewer.properties +270 -0
  354. deepresearch_flow/paper/web/pdfjs/web/locale/pt-BR/viewer.properties +270 -0
  355. deepresearch_flow/paper/web/pdfjs/web/locale/pt-PT/viewer.properties +270 -0
  356. deepresearch_flow/paper/web/pdfjs/web/locale/rm/viewer.properties +270 -0
  357. deepresearch_flow/paper/web/pdfjs/web/locale/ro/viewer.properties +220 -0
  358. deepresearch_flow/paper/web/pdfjs/web/locale/ru/viewer.properties +270 -0
  359. deepresearch_flow/paper/web/pdfjs/web/locale/sat/viewer.properties +270 -0
  360. deepresearch_flow/paper/web/pdfjs/web/locale/sc/viewer.properties +258 -0
  361. deepresearch_flow/paper/web/pdfjs/web/locale/scn/viewer.properties +101 -0
  362. deepresearch_flow/paper/web/pdfjs/web/locale/sco/viewer.properties +226 -0
  363. deepresearch_flow/paper/web/pdfjs/web/locale/si/viewer.properties +228 -0
  364. deepresearch_flow/paper/web/pdfjs/web/locale/sk/viewer.properties +270 -0
  365. deepresearch_flow/paper/web/pdfjs/web/locale/skr/viewer.properties +264 -0
  366. deepresearch_flow/paper/web/pdfjs/web/locale/sl/viewer.properties +284 -0
  367. deepresearch_flow/paper/web/pdfjs/web/locale/son/viewer.properties +152 -0
  368. deepresearch_flow/paper/web/pdfjs/web/locale/sq/viewer.properties +247 -0
  369. deepresearch_flow/paper/web/pdfjs/web/locale/sr/viewer.properties +259 -0
  370. deepresearch_flow/paper/web/pdfjs/web/locale/sv-SE/viewer.properties +284 -0
  371. deepresearch_flow/paper/web/pdfjs/web/locale/szl/viewer.properties +224 -0
  372. deepresearch_flow/paper/web/pdfjs/web/locale/ta/viewer.properties +173 -0
  373. deepresearch_flow/paper/web/pdfjs/web/locale/te/viewer.properties +216 -0
  374. deepresearch_flow/paper/web/pdfjs/web/locale/tg/viewer.properties +281 -0
  375. deepresearch_flow/paper/web/pdfjs/web/locale/th/viewer.properties +270 -0
  376. deepresearch_flow/paper/web/pdfjs/web/locale/tl/viewer.properties +222 -0
  377. deepresearch_flow/paper/web/pdfjs/web/locale/tr/viewer.properties +283 -0
  378. deepresearch_flow/paper/web/pdfjs/web/locale/trs/viewer.properties +184 -0
  379. deepresearch_flow/paper/web/pdfjs/web/locale/uk/viewer.properties +284 -0
  380. deepresearch_flow/paper/web/pdfjs/web/locale/ur/viewer.properties +218 -0
  381. deepresearch_flow/paper/web/pdfjs/web/locale/uz/viewer.properties +142 -0
  382. deepresearch_flow/paper/web/pdfjs/web/locale/vi/viewer.properties +270 -0
  383. deepresearch_flow/paper/web/pdfjs/web/locale/wo/viewer.properties +104 -0
  384. deepresearch_flow/paper/web/pdfjs/web/locale/xh/viewer.properties +156 -0
  385. deepresearch_flow/paper/web/pdfjs/web/locale/zh-CN/viewer.properties +284 -0
  386. deepresearch_flow/paper/web/pdfjs/web/locale/zh-TW/viewer.properties +281 -0
  387. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitDingbats.pfb +0 -0
  388. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixed.pfb +0 -0
  389. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedBold.pfb +0 -0
  390. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedBoldItalic.pfb +0 -0
  391. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedItalic.pfb +0 -0
  392. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerif.pfb +0 -0
  393. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifBold.pfb +0 -0
  394. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifBoldItalic.pfb +0 -0
  395. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifItalic.pfb +0 -0
  396. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSymbol.pfb +0 -0
  397. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LICENSE_FOXIT +27 -0
  398. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LICENSE_LIBERATION +102 -0
  399. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Bold.ttf +0 -0
  400. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-BoldItalic.ttf +0 -0
  401. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Italic.ttf +0 -0
  402. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Regular.ttf +0 -0
  403. deepresearch_flow/paper/web/pdfjs/web/viewer.css +3528 -0
  404. deepresearch_flow/paper/web/pdfjs/web/viewer.html +486 -0
  405. deepresearch_flow/paper/web/pdfjs/web/viewer.js +14099 -0
  406. deepresearch_flow/paper/web/pdfjs/web/viewer.js.map +1 -0
  407. deepresearch_flow/paper/web/query.py +90 -0
  408. deepresearch_flow/recognize/__init__.py +1 -0
  409. deepresearch_flow/recognize/cli.py +469 -0
  410. deepresearch_flow/recognize/markdown.py +277 -0
  411. deepresearch_flow/recognize/organize.py +95 -0
  412. deepresearch_flow-0.1.1.dist-info/METADATA +416 -0
  413. deepresearch_flow-0.1.1.dist-info/RECORD +417 -0
  414. deepresearch_flow-0.1.1.dist-info/WHEEL +5 -0
  415. deepresearch_flow-0.1.1.dist-info/entry_points.txt +2 -0
  416. deepresearch_flow-0.1.1.dist-info/licenses/LICENSE +21 -0
  417. deepresearch_flow-0.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,870 @@
1
+ """Paper extraction pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ from dataclasses import dataclass, field
8
+ from datetime import datetime
9
+ from pathlib import Path
10
+ from typing import Any, Iterable
11
+ import logging
12
+ import re
13
+ import time
14
+
15
+ import coloredlogs
16
+ import click
17
+ import httpx
18
+ from jsonschema import Draft7Validator
19
+ from rich.console import Console
20
+ from rich.table import Table
21
+ from tqdm import tqdm
22
+ from deepresearch_flow.paper.config import PaperConfig, ProviderConfig, resolve_api_keys
23
+ from deepresearch_flow.paper.llm import backoff_delay, call_provider
24
+ from deepresearch_flow.paper.prompts import DEFAULT_SYSTEM_PROMPT, DEFAULT_USER_PROMPT
25
+ from deepresearch_flow.paper.render import render_papers, resolve_render_template
26
+ from deepresearch_flow.paper.schema import schema_to_prompt, validate_schema
27
+ from deepresearch_flow.paper.template_registry import (
28
+ get_stage_definitions,
29
+ load_custom_prompt_templates,
30
+ load_prompt_templates,
31
+ )
32
+ from deepresearch_flow.paper.utils import (
33
+ compute_source_hash,
34
+ discover_markdown,
35
+ estimate_tokens,
36
+ parse_json,
37
+ read_text,
38
+ stable_hash,
39
+ truncate_content,
40
+ split_output_name,
41
+ unique_split_name,
42
+ )
43
+ from deepresearch_flow.paper.providers.base import ProviderError
44
+
45
+
46
@dataclass
class ExtractionError:
    """One failed extraction, recorded so it can be reported and retried."""

    # Markdown file that was being processed when the failure occurred.
    path: Path
    # Name of the provider and the model that handled the request.
    provider: str
    model: str
    # Short category for the failure plus its human-readable text.
    error_type: str
    error_message: str
    # Stage that was running, when the failure happened in a staged run.
    stage_name: str | None = None
54
+
55
+
56
class KeyRotator:
    """Hand out API keys in round-robin order under an asyncio lock."""

    def __init__(self, keys: list[str]) -> None:
        self._keys = keys
        self._idx = 0
        self._lock = asyncio.Lock()

    async def next_key(self) -> str | None:
        """Return the next key in the rotation, or ``None`` if there are no keys."""
        if self._keys:
            async with self._lock:
                # Take the slot for this call, then advance the shared cursor.
                position = self._idx % len(self._keys)
                self._idx += 1
                return self._keys[position]
        return None
69
+
70
+
71
+ logger = logging.getLogger(__name__)
72
+
73
+
74
def configure_logging(verbose: bool) -> None:
    """Install coloredlogs at DEBUG when *verbose* is true, otherwise at INFO."""
    coloredlogs.install(
        level="DEBUG" if verbose else "INFO",
        fmt="%(asctime)s %(levelname)s %(message)s",
    )
77
+
78
+
79
+ def _count_prompt_chars(messages: list[dict[str, str]]) -> int:
80
+ return sum(len(message.get("content") or "") for message in messages)
81
+
82
+
83
+ def _estimate_tokens_for_chars(char_count: int) -> int:
84
+ if char_count <= 0:
85
+ return 0
86
+ return estimate_tokens(char_count)
87
+
88
+
89
+ def _format_duration(seconds: float) -> str:
90
+ if seconds < 60:
91
+ return f"{seconds:.1f}s"
92
+ minutes, remainder = divmod(seconds, 60)
93
+ if minutes < 60:
94
+ return f"{int(minutes)}m {remainder:.1f}s"
95
+ hours, minutes = divmod(minutes, 60)
96
+ return f"{int(hours)}h {int(minutes)}m {remainder:.1f}s"
97
+
98
+
99
+ def _format_rate(value: float, unit: str) -> str:
100
+ if value <= 0:
101
+ return f"0 {unit}"
102
+ return f"{value:.2f} {unit}"
103
+
104
+
105
@dataclass
class ExtractionStats:
    """Running character totals for an extraction run, shown on a progress bar."""

    # Progress bar whose postfix displays the token estimates; may be None.
    doc_bar: tqdm | None
    input_chars: int = 0
    prompt_chars: int = 0
    output_chars: int = 0
    # Guards the counters when several coroutines report at once.
    lock: asyncio.Lock = field(default_factory=asyncio.Lock)

    async def _accumulate(self, counter: str, count: int) -> None:
        """Add a positive *count* to the named counter and refresh the bar."""
        if count <= 0:
            return
        async with self.lock:
            setattr(self, counter, getattr(self, counter) + count)
            self._update_bar()

    async def add_input_chars(self, count: int) -> None:
        """Record *count* characters of raw document input."""
        await self._accumulate("input_chars", count)

    async def add_prompt_chars(self, count: int) -> None:
        """Record *count* characters of prompt text sent to the provider."""
        await self._accumulate("prompt_chars", count)

    async def add_output_chars(self, count: int) -> None:
        """Record *count* characters of provider response text."""
        await self._accumulate("output_chars", count)

    def _update_bar(self) -> None:
        """Show prompt/completion/total token estimates on the progress bar."""
        bar = self.doc_bar
        if not bar:
            return
        prompt_tokens = _estimate_tokens_for_chars(self.prompt_chars)
        completion_tokens = _estimate_tokens_for_chars(self.output_chars)
        total_tokens = prompt_tokens + completion_tokens
        bar.set_postfix_str(f"tok p/c/t {prompt_tokens}/{completion_tokens}/{total_tokens}")
141
+
142
def parse_model_ref(model_ref: str, providers: list[ProviderConfig]) -> tuple[ProviderConfig, str]:
    """Split a ``provider/model`` reference and look up the provider.

    Only the first ``/`` separates the two parts, so the model name may itself
    contain slashes.

    Raises:
        click.ClickException: if the reference has no ``/``, the provider is
            unknown, or the provider has a non-empty ``model_list`` that does
            not include the model.
    """
    provider_name, separator, model_name = model_ref.partition("/")
    if not separator:
        raise click.ClickException("--model must be in provider/model format")

    for candidate in providers:
        if candidate.name != provider_name:
            continue
        allowed_models = candidate.model_list
        if allowed_models and model_name not in allowed_models:
            raise click.ClickException(
                f"Model '{model_name}' is not in provider '{provider_name}' model_list"
            )
        return candidate, model_name

    raise click.ClickException(f"Unknown provider: {provider_name}")
154
+
155
+
156
def build_messages(
    content: str,
    schema: dict[str, Any],
    provider: ProviderConfig,
    prompt_template: str,
    output_language: str,
    custom_prompt: bool,
    prompt_system_path: Path | None,
    prompt_user_path: Path | None,
    stage_name: str | None = None,
    stage_fields: list[str] | None = None,
    previous_outputs: str | None = None,
) -> list[dict[str, str]]:
    """Build the system and user chat messages for one extraction request.

    The prompt source is picked in this order:

    1. custom template files, when *custom_prompt* is set and both paths are given;
    2. the named *prompt_template*, when it is non-empty;
    3. the provider's own prompts, falling back to the module defaults.

    Returns:
        A two-element list: the system message followed by the user message.
    """
    prompt_schema = schema_to_prompt(schema)

    if custom_prompt and prompt_system_path and prompt_user_path:
        template_context = {
            "content": content,
            "schema": prompt_schema,
            "output_language": output_language,
            "stage_name": stage_name,
            "stage_fields": stage_fields or [],
            "previous_outputs": previous_outputs or "",
        }
        system_prompt, user_prompt = load_custom_prompt_templates(
            prompt_system_path,
            prompt_user_path,
            template_context,
        )
    elif prompt_template:
        system_prompt, user_prompt = load_prompt_templates(
            prompt_template,
            content=content,
            schema=prompt_schema,
            output_language=output_language,
            stage_name=stage_name,
            stage_fields=stage_fields,
            previous_outputs=previous_outputs,
        )
    else:
        system_prompt = provider.system_prompt or DEFAULT_SYSTEM_PROMPT
        if output_language:
            system_prompt = f"{system_prompt} Output language: {output_language}."
        fallback_template = provider.user_prompt or DEFAULT_USER_PROMPT
        user_prompt = fallback_template.format(content=content, schema=prompt_schema)

    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt}
    return [system_message, user_message]
204
+
205
+
206
def should_retry_error(exc: ProviderError) -> bool:
    """Return the ``retryable`` flag carried by a provider error."""
    is_retryable = exc.retryable
    return is_retryable
208
+
209
+
210
+
211
def append_metadata(
    payload: dict[str, Any],
    source_path: str,
    source_hash: str,
    provider: str,
    model: str,
    truncation: dict[str, Any] | None,
    prompt_template: str,
    output_language: str,
) -> dict[str, Any]:
    """Stamp provenance fields onto an extracted payload, in place.

    Adds ``source_path``, ``source_hash``, ``provider``, ``model``,
    ``prompt_template``, ``output_language``, ``extracted_at`` (UTC, ISO 8601
    with a trailing ``Z``) and ``source_truncated``. When *truncation* is
    truthy it is also stored under ``truncation``.

    Returns:
        The same *payload* object that was passed in, after mutation.
    """
    # Imported locally so the block does not depend on the file-level imports.
    from datetime import timezone

    # datetime.utcnow() is deprecated since Python 3.12. Read an aware UTC
    # clock instead, then drop tzinfo so the serialized value keeps the
    # "<naive ISO timestamp>Z" shape rather than "+00:00".
    now_utc = datetime.now(timezone.utc).replace(tzinfo=None)

    payload["source_path"] = source_path
    payload["source_hash"] = source_hash
    payload["provider"] = provider
    payload["model"] = model
    payload["prompt_template"] = prompt_template
    payload["output_language"] = output_language
    payload["extracted_at"] = now_utc.isoformat() + "Z"
    if truncation:
        payload["source_truncated"] = True
        payload["truncation"] = truncation
    else:
        payload["source_truncated"] = False
    return payload
234
+
235
+
236
+ def _normalized_key(key: str) -> str:
237
+ return re.sub(r"[^a-z0-9]", "", key.lower())
238
+
239
+
240
def normalize_response_keys(data: dict[str, Any], schema: dict[str, Any]) -> dict[str, Any]:
    """Map loosely spelled response keys onto the schema's property names.

    Keys that already match a schema property are kept. Other keys are
    compared after normalization (lower-cased, non-alphanumerics removed) and
    renamed to the matching property unless that property is already filled.
    Keys with no match are kept only when the schema's
    ``additionalProperties`` is truthy (the default).

    *data* is returned unchanged when it is not a dict or when the schema has
    no non-empty ``properties`` mapping.
    """
    if not isinstance(data, dict):
        return data

    properties = schema.get("properties")
    if not (isinstance(properties, dict) and properties):
        return data

    allow_extra = schema.get("additionalProperties", True)

    # First property wins if two property names normalize to the same string.
    canonical_by_normalized: dict[str, str] = {}
    for prop_name in properties:
        canonical_by_normalized.setdefault(_normalized_key(prop_name), prop_name)

    result: dict[str, Any] = {}
    renamed: list[tuple[str, str]] = []
    dropped: list[str] = []

    for raw_key, value in data.items():
        if raw_key in properties:
            result[raw_key] = value
            continue

        canonical = canonical_by_normalized.get(_normalized_key(raw_key))
        if canonical:
            if canonical in result:
                dropped.append(raw_key)
            else:
                result[canonical] = value
                renamed.append((raw_key, canonical))
        elif allow_extra:
            result[raw_key] = value
        else:
            dropped.append(raw_key)

    if renamed:
        logger.debug("Normalized response keys: %s", renamed)
    if dropped and not allow_extra:
        logger.debug("Dropped response keys not in schema: %s", dropped)

    return result
282
+
283
+
284
def load_existing(path: Path) -> list[dict[str, Any]]:
    """Load previously written extraction results from *path*.

    Returns:
        The parsed JSON list, or ``[]`` when the file is missing, is not valid
        JSON, or holds a JSON value that is not a list.
    """
    if not path.exists():
        return []
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError:
        return []
    # A valid JSON document can still be an object, string or number. Honour
    # the declared list return type rather than passing that on to callers.
    if not isinstance(payload, list):
        return []
    return payload
291
+
292
+
293
def load_errors(path: Path) -> list[dict[str, Any]]:
    """Load the recorded extraction errors from *path*.

    Returns:
        The dict entries of the parsed JSON list. The result is ``[]`` when
        the file is missing, is not valid JSON, or is not a JSON list.
        Non-dict entries are skipped so callers can safely call ``.get`` on
        every item.
    """
    if not path.exists():
        return []
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError:
        return []
    if not isinstance(payload, list):
        return []
    return [entry for entry in payload if isinstance(entry, dict)]
300
+
301
+
302
def write_json(path: Path, data: Any) -> None:
    """Write *data* to *path* as indented UTF-8 JSON, creating parent directories."""
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
305
+
306
+
307
def write_json_atomic(path: Path, data: Any) -> None:
    """Write *data* as JSON to a sibling ``.tmp`` file, then move it over *path*.

    Readers of *path* therefore see either the old file or the complete new one.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    staging_path = path.with_suffix(path.suffix + ".tmp")
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    staging_path.write_text(serialized, encoding="utf-8")
    staging_path.replace(path)
312
+
313
+
314
def build_stage_schema(
    base_schema: dict[str, Any], required_fields: list[str]
) -> dict[str, Any]:
    """Derive an object schema that requires only *required_fields*.

    Duplicate field names are collapsed, keeping first-seen order. Each field
    reuses its definition from ``base_schema["properties"]`` when present and
    otherwise defaults to a plain string. Extra properties stay allowed.
    """
    base_properties = base_schema.get("properties", {})
    # dict.fromkeys de-duplicates while preserving insertion order.
    ordered_fields = list(dict.fromkeys(required_fields))
    stage_properties: dict[str, Any] = {
        name: base_properties.get(name, {"type": "string"}) for name in ordered_fields
    }
    schema_uri = base_schema.get("$schema", "http://json-schema.org/draft-07/schema#")
    return {
        "$schema": schema_uri,
        "type": "object",
        "additionalProperties": True,
        "required": ordered_fields,
        "properties": stage_properties,
    }
334
+
335
+
336
def load_stage_state(path: Path) -> dict[str, Any] | None:
    """Load the saved per-document stage state from *path*.

    Returns:
        The parsed JSON object, or ``None`` when the file is missing, is not
        valid JSON, or holds a JSON value that is not an object.
    """
    if not path.exists():
        return None
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError:
        return None
    # Callers use the state as a mapping. Treat any other JSON shape like a
    # missing state file so the document is processed from scratch.
    if not isinstance(payload, dict):
        return None
    return payload
343
+
344
+
345
class RequestThrottle:
    """Pause for ``sleep_time`` seconds on every ``sleep_every``-th request."""

    def __init__(self, sleep_every: int, sleep_time: float) -> None:
        if sleep_every <= 0 or sleep_time <= 0:
            raise ValueError("sleep_every and sleep_time must be positive")
        self.sleep_every = sleep_every
        self.sleep_time = sleep_time
        self._count = 0
        self._lock = asyncio.Lock()

    async def tick(self) -> None:
        """Count one request, sleeping when the count reaches a multiple of ``sleep_every``.

        The sleep happens while the lock is held, so other callers of
        ``tick`` wait for the pause to finish as well.
        """
        async with self._lock:
            self._count += 1
            at_boundary = self._count % self.sleep_every == 0
            if at_boundary:
                await asyncio.sleep(self.sleep_time)
359
+
360
+
361
async def call_with_retries(
    provider: ProviderConfig,
    model: str,
    messages: list[dict[str, str]],
    schema: dict[str, Any],
    api_key: str | None,
    timeout: float,
    structured_mode: str,
    max_retries: int,
    backoff_base_seconds: float,
    backoff_max_seconds: float,
    client: httpx.AsyncClient,
    validator: Draft7Validator,
    throttle: RequestThrottle | None = None,
    stats: ExtractionStats | None = None,
) -> dict[str, Any]:
    """Call the provider until a schema-valid JSON payload comes back.

    Each attempt ticks the optional throttle, records prompt and output
    characters on the optional stats object, calls the provider, parses the
    response as JSON, normalizes its keys and validates it with *validator*.
    A retryable provider error, a JSON parse failure or a validation failure
    is retried after a backoff delay while attempts remain.

    Returns:
        The parsed, key-normalized and validated response data.

    Raises:
        ProviderError: the provider's own error when it is not retried; a
            ``parse_error`` or ``validation_error`` on the final attempt; or
            "Max retries exceeded" when the loop ends without a result.
    """

    async def _back_off(attempt_number: int) -> None:
        # Sleep for the delay the shared backoff policy gives this attempt.
        delay = backoff_delay(backoff_base_seconds, attempt_number, backoff_max_seconds)
        await asyncio.sleep(delay)

    use_structured = structured_mode
    prompt_chars = _count_prompt_chars(messages)

    for attempt in range(1, max_retries + 1):
        attempts_remain = attempt < max_retries

        if throttle:
            await throttle.tick()
        if stats:
            await stats.add_prompt_chars(prompt_chars)

        try:
            response_text = await call_provider(
                provider,
                model,
                messages,
                schema,
                api_key,
                timeout,
                use_structured,
                client,
            )
            if stats:
                await stats.add_output_chars(len(response_text))
        except ProviderError as exc:
            if exc.structured_error and use_structured != "none":
                # Structured output was rejected: retry at once without it.
                # NOTE(review): this fallback uses up an attempt, so on the
                # last attempt the loop exits with "Max retries exceeded" -
                # confirm that is intended.
                use_structured = "none"
                continue
            if should_retry_error(exc) and attempts_remain:
                await _back_off(attempt)
                continue
            raise

        try:
            data = parse_json(response_text)
        except Exception as exc:
            if attempts_remain:
                await _back_off(attempt)
                continue
            raise ProviderError(f"JSON parse failed: {exc}", error_type="parse_error") from exc

        data = normalize_response_keys(data, schema)

        errors_in_doc = sorted(validator.iter_errors(data), key=lambda e: e.path)
        if not errors_in_doc:
            return data
        if attempts_remain:
            await _back_off(attempt)
            continue
        raise ProviderError(
            f"Schema validation failed: {errors_in_doc[0].message}",
            error_type="validation_error",
        )

    raise ProviderError("Max retries exceeded", retryable=False)
431
+
432
+
433
async def extract_documents(
    inputs: Iterable[str],
    glob_pattern: str | None,
    provider: ProviderConfig,
    model: str,
    schema: dict[str, Any],
    validator: Draft7Validator,
    config: PaperConfig,
    output_path: Path,
    errors_path: Path,
    split: bool,
    split_dir: Path | None,
    force: bool,
    retry_failed: bool,
    dry_run: bool,
    max_concurrency_override: int | None,
    prompt_template: str,
    output_language: str,
    custom_prompt: bool,
    prompt_system_path: Path | None,
    prompt_user_path: Path | None,
    render_md: bool,
    render_output_dir: Path | None,
    render_template_path: str | None,
    render_template_name: str | None,
    render_template_dir: str | None,
    sleep_every: int | None,
    sleep_time: float | None,
    verbose: bool,
) -> None:
    """Run the extraction pipeline over the discovered markdown files.

    Steps:

    1. Discover markdown files from *inputs* / *glob_pattern*. With
       *retry_failed*, keep only files listed in *errors_path*.
    2. With *dry_run*, build the prompts, print size and token estimates, and
       return without calling the provider or writing any file.
    3. Otherwise process files concurrently, limited by
       *max_concurrency_override* or ``config.extract.max_concurrency``.
       Unless *force* or *retry_failed* is set, an existing result whose
       ``source_hash`` matches is reused. Templates that define stages run one
       request per stage and save stage outputs under ``paper_stage_outputs/``.
    4. Write results to *output_path* and errors to *errors_path*, optionally
       one JSON file per document (*split*) and rendered markdown
       (*render_md*), then print a summary table.

    Raises:
        ValueError: if only one of *sleep_every* / *sleep_time* is set.
        click.ClickException: if the render template cannot be resolved.
    """
    start_time = time.monotonic()
    markdown_files = discover_markdown(inputs, glob_pattern, recursive=True)

    if retry_failed:
        error_entries = load_errors(errors_path)
        # The errors file stores resolved absolute paths. Resolve both sides so
        # relative inputs still match, and skip malformed entries instead of
        # failing on `.get` or matching the current directory.
        retry_paths = {
            Path(entry["source_path"]).resolve()
            for entry in error_entries
            if isinstance(entry, dict) and entry.get("source_path")
        }
        markdown_files = [path for path in markdown_files if path.resolve() in retry_paths]
        logger.debug("Retrying %d markdown files", len(markdown_files))
    else:
        logger.debug("Discovered %d markdown files", len(markdown_files))

    if dry_run:
        # Dry run: build every prompt to measure its size, send nothing.
        input_chars = 0
        prompt_chars = 0
        stage_definitions = get_stage_definitions(prompt_template) if not custom_prompt else []
        multi_stage = bool(stage_definitions)
        metadata_fields = [
            "paper_title",
            "paper_authors",
            "publication_date",
            "publication_venue",
        ]
        for path in markdown_files:
            content = read_text(path)
            input_chars += len(content)
            truncated_content, _ = truncate_content(
                content, config.extract.truncate_max_chars, config.extract.truncate_strategy
            )
            if multi_stage:
                for stage_def in stage_definitions:
                    required_fields = metadata_fields + stage_def.fields
                    stage_schema = build_stage_schema(schema, required_fields)
                    messages = build_messages(
                        truncated_content,
                        stage_schema,
                        provider,
                        prompt_template,
                        output_language,
                        custom_prompt=False,
                        prompt_system_path=None,
                        prompt_user_path=None,
                        stage_name=stage_def.name,
                        stage_fields=required_fields,
                        previous_outputs="{}",
                    )
                    prompt_chars += _count_prompt_chars(messages)
            else:
                messages = build_messages(
                    truncated_content,
                    schema,
                    provider,
                    prompt_template if not custom_prompt else "custom",
                    output_language,
                    custom_prompt=custom_prompt,
                    prompt_system_path=prompt_system_path,
                    prompt_user_path=prompt_user_path,
                )
                prompt_chars += _count_prompt_chars(messages)

        duration = time.monotonic() - start_time
        prompt_tokens = _estimate_tokens_for_chars(prompt_chars)
        completion_tokens = 0
        total_tokens = prompt_tokens
        doc_count = len(markdown_files)
        avg_time = duration / doc_count if doc_count else 0.0
        docs_per_min = (doc_count / duration) * 60 if duration > 0 else 0.0
        tokens_per_sec = (total_tokens / duration) if duration > 0 else 0.0

        table = Table(
            title="paper extract summary (dry-run)",
            header_style="bold cyan",
            title_style="bold magenta",
        )
        table.add_column("Metric", style="cyan", no_wrap=True)
        table.add_column("Value", style="white", overflow="fold")
        table.add_row("Documents", str(doc_count))
        table.add_row("Duration", _format_duration(duration))
        table.add_row("Avg time/doc", _format_duration(avg_time))
        table.add_row("Throughput", _format_rate(docs_per_min, "docs/min"))
        table.add_row("Input chars", str(input_chars))
        table.add_row("Prompt chars", str(prompt_chars))
        table.add_row("Output chars", "0")
        table.add_row("Est prompt tokens", str(prompt_tokens))
        table.add_row("Est completion tokens", str(completion_tokens))
        table.add_row("Est total tokens", str(total_tokens))
        table.add_row("Est tokens/sec", _format_rate(tokens_per_sec, "tok/s"))
        Console().print(table)
        return

    # Index earlier results by source path so unchanged documents can be reused.
    existing = load_existing(output_path)
    existing_by_path = {
        entry.get("source_path"): entry
        for entry in existing
        if isinstance(entry, dict) and entry.get("source_path")
    }

    rotator = KeyRotator(resolve_api_keys(provider.api_keys))
    max_concurrency = max_concurrency_override or config.extract.max_concurrency
    semaphore = asyncio.Semaphore(max_concurrency)

    errors: list[ExtractionError] = []
    results: dict[str, dict[str, Any]] = {}
    stage_definitions = get_stage_definitions(prompt_template) if not custom_prompt else []
    multi_stage = bool(stage_definitions)
    stage_output_dir = Path("paper_stage_outputs")
    if multi_stage:
        stage_output_dir.mkdir(parents=True, exist_ok=True)

    throttle = None
    if sleep_every is not None or sleep_time is not None:
        if not sleep_every or not sleep_time:
            raise ValueError("--sleep-every and --sleep-time must be set together")
        throttle = RequestThrottle(sleep_every, float(sleep_time))

    doc_bar: tqdm | None = None
    stage_bar: tqdm | None = None
    doc_bar = tqdm(total=len(markdown_files), desc="documents", unit="doc", position=0)
    if multi_stage and markdown_files:
        stage_total = len(markdown_files) * len(stage_definitions)
        stage_bar = tqdm(
            total=stage_total,
            desc="stages",
            unit="stage",
            position=1,
            leave=False,
        )
    stats = ExtractionStats(doc_bar=doc_bar)

    async def process_one(path: Path, client: httpx.AsyncClient) -> None:
        """Extract one document, storing its result or its error."""
        source_path = str(path.resolve())
        current_stage: str | None = None
        try:
            if verbose:
                logger.debug("Processing %s", source_path)
            content = read_text(path)
            await stats.add_input_chars(len(content))
            source_hash = compute_source_hash(content)
            stage_state: dict[str, Any] | None = None
            stage_path: Path | None = None

            if not force and not retry_failed:
                # Reuse the earlier result when the source content is unchanged.
                existing_entry = existing_by_path.get(source_path)
                if existing_entry and existing_entry.get("source_hash") == source_hash:
                    results[source_path] = existing_entry
                    if stage_bar:
                        stage_bar.update(len(stage_definitions))
                    return

            truncated_content, truncation = truncate_content(
                content, config.extract.truncate_max_chars, config.extract.truncate_strategy
            )

            # One key per document; every stage of the document reuses it.
            api_key = await rotator.next_key()

            if multi_stage:
                stage_path = stage_output_dir / f"{stable_hash(source_path)}.json"
                stage_state = load_stage_state(stage_path) if not force else None
                # Saved stage outputs are only valid for the same source content.
                if stage_state and stage_state.get("source_hash") != source_hash:
                    stage_state = None
                if stage_state is None:
                    stage_state = {
                        "source_path": source_path,
                        "source_hash": source_hash,
                        "prompt_template": prompt_template,
                        "output_language": output_language,
                        "stages": {},
                    }

            if multi_stage and stage_state is not None and stage_path is not None:
                stages: dict[str, dict[str, Any]] = stage_state.get("stages", {})
                metadata_fields = [
                    "paper_title",
                    "paper_authors",
                    "publication_date",
                    "publication_venue",
                ]
                for stage_def in stage_definitions:
                    stage_name = stage_def.name
                    stage_fields = stage_def.fields
                    current_stage = stage_name
                    if stage_name in stages and not force:
                        # Finished in an earlier run: count it and move on.
                        if stage_bar:
                            stage_bar.update(1)
                        continue
                    try:
                        required_fields = metadata_fields + stage_fields
                        stage_schema = build_stage_schema(schema, required_fields)
                        stage_validator = validate_schema(stage_schema)
                        previous_outputs = json.dumps(stages, ensure_ascii=False)
                        messages = build_messages(
                            truncated_content,
                            stage_schema,
                            provider,
                            prompt_template,
                            output_language,
                            custom_prompt=False,
                            prompt_system_path=None,
                            prompt_user_path=None,
                            stage_name=stage_name,
                            stage_fields=required_fields,
                            previous_outputs=previous_outputs,
                        )
                        async with semaphore:
                            data = await call_with_retries(
                                provider,
                                model,
                                messages,
                                stage_schema,
                                api_key,
                                timeout=60.0,
                                structured_mode=provider.structured_mode,
                                max_retries=config.extract.max_retries,
                                backoff_base_seconds=config.extract.backoff_base_seconds,
                                backoff_max_seconds=config.extract.backoff_max_seconds,
                                client=client,
                                validator=stage_validator,
                                throttle=throttle,
                                stats=stats,
                            )
                        stages[stage_name] = data
                        stage_state["stages"] = stages
                        # Save after every stage so an interrupted run can resume.
                        write_json_atomic(stage_path, stage_state)
                    finally:
                        if stage_bar:
                            stage_bar.update(1)

                # Merge stage outputs in stage order; later stages override
                # earlier ones on shared keys.
                merged: dict[str, Any] = {}
                for stage_def in stage_definitions:
                    merged.update(stages.get(stage_def.name, {}))
                errors_in_doc = sorted(validator.iter_errors(merged), key=lambda e: e.path)
                if errors_in_doc:
                    raise ProviderError(
                        f"Schema validation failed: {errors_in_doc[0].message}",
                        error_type="validation_error",
                    )
                data = append_metadata(
                    merged,
                    source_path=source_path,
                    source_hash=source_hash,
                    provider=provider.name,
                    model=model,
                    truncation=truncation,
                    prompt_template=prompt_template,
                    output_language=output_language,
                )
                results[source_path] = data
            else:
                messages = build_messages(
                    truncated_content,
                    schema,
                    provider,
                    prompt_template if not custom_prompt else "custom",
                    output_language,
                    custom_prompt=custom_prompt,
                    prompt_system_path=prompt_system_path,
                    prompt_user_path=prompt_user_path,
                )
                async with semaphore:
                    data = await call_with_retries(
                        provider,
                        model,
                        messages,
                        schema,
                        api_key,
                        timeout=60.0,
                        structured_mode=provider.structured_mode,
                        max_retries=config.extract.max_retries,
                        backoff_base_seconds=config.extract.backoff_base_seconds,
                        backoff_max_seconds=config.extract.backoff_max_seconds,
                        client=client,
                        validator=validator,
                        throttle=throttle,
                        stats=stats,
                    )

                data = append_metadata(
                    data,
                    source_path=source_path,
                    source_hash=source_hash,
                    provider=provider.name,
                    model=model,
                    truncation=truncation,
                    prompt_template=prompt_template if not custom_prompt else "custom",
                    output_language=output_language,
                )
                results[source_path] = data
        except ProviderError as exc:
            logger.warning("Extraction failed for %s: %s", source_path, exc)
            errors.append(
                ExtractionError(
                    path=path,
                    provider=provider.name,
                    model=model,
                    error_type=exc.error_type,
                    error_message=str(exc),
                    stage_name=current_stage if multi_stage else None,
                )
            )
        except Exception as exc:  # pragma: no cover - safety net
            logger.exception("Unexpected error while processing %s", source_path)
            errors.append(
                ExtractionError(
                    path=path,
                    provider=provider.name,
                    model=model,
                    error_type="unexpected_error",
                    error_message=str(exc),
                    stage_name=current_stage if multi_stage else None,
                )
            )
        finally:
            if doc_bar:
                doc_bar.update(1)

    try:
        async with httpx.AsyncClient() as client:
            await asyncio.gather(*(process_one(path, client) for path in markdown_files))
    finally:
        if doc_bar:
            doc_bar.close()
        if stage_bar:
            stage_bar.close()

    # Keep the order of the existing output file, then append new documents.
    # NOTE(review): with retry_failed, existing entries that were not
    # re-extracted are left out of the rewritten output - confirm intended.
    final_results: list[dict[str, Any]] = []
    seen = set()
    for entry in existing:
        path = entry.get("source_path") if isinstance(entry, dict) else None
        if path and path in results:
            final_results.append(results[path])
            seen.add(path)
        elif path and not retry_failed:
            final_results.append(entry)
            seen.add(path)

    for path, entry in results.items():
        if path not in seen:
            final_results.append(entry)

    write_json(output_path, final_results)

    error_payload = [
        {
            "source_path": str(err.path.resolve()),
            "provider": err.provider,
            "model": err.model,
            "error_type": err.error_type,
            "error_message": err.error_message,
            "stage_name": err.stage_name,
        }
        for err in errors
    ]
    write_json(errors_path, error_payload)

    if split:
        target_dir = split_dir or output_path.parent
        target_dir.mkdir(parents=True, exist_ok=True)
        used_names: set[str] = set()
        for entry in final_results:
            source_path = entry.get("source_path")
            if not source_path:
                continue
            base_name = split_output_name(Path(source_path))
            file_name = unique_split_name(base_name, used_names, source_path)
            write_json(target_dir / f"{file_name}.json", entry)

    if render_md:
        try:
            template = resolve_render_template(
                render_template_path, render_template_name, render_template_dir
            )
        except ValueError as exc:
            raise click.ClickException(str(exc)) from exc
        render_dir = render_output_dir or Path("rendered_md")
        rendered = render_papers(final_results, render_dir, template, output_language)
        click.echo(f"Rendered {rendered} markdown files")

    duration = time.monotonic() - start_time
    prompt_tokens = _estimate_tokens_for_chars(stats.prompt_chars)
    completion_tokens = _estimate_tokens_for_chars(stats.output_chars)
    total_tokens = prompt_tokens + completion_tokens
    doc_count = len(markdown_files)
    avg_time = duration / doc_count if doc_count else 0.0
    docs_per_min = (doc_count / duration) * 60 if duration > 0 else 0.0
    tokens_per_sec = (total_tokens / duration) if duration > 0 else 0.0

    table = Table(
        title="paper extract summary",
        header_style="bold cyan",
        title_style="bold magenta",
    )
    table.add_column("Metric", style="cyan", no_wrap=True)
    table.add_column("Value", style="white", overflow="fold")
    table.add_row("Documents", f"{doc_count} total")
    table.add_row("Successful", str(doc_count - len(errors)))
    table.add_row("Errors", str(len(errors)))
    table.add_row("Output JSON", str(output_path))
    table.add_row("Errors JSON", str(errors_path))
    table.add_row("Duration", _format_duration(duration))
    table.add_row("Avg time/doc", _format_duration(avg_time))
    table.add_row("Throughput", _format_rate(docs_per_min, "docs/min"))
    table.add_row("Input chars", str(stats.input_chars))
    table.add_row("Prompt chars", str(stats.prompt_chars))
    table.add_row("Output chars", str(stats.output_chars))
    table.add_row("Est prompt tokens", str(prompt_tokens))
    table.add_row("Est completion tokens", str(completion_tokens))
    table.add_row("Est total tokens", str(total_tokens))
    table.add_row("Est tokens/sec", _format_rate(tokens_per_sec, "tok/s"))
    Console().print(table)