deepresearch-flow 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (417) hide show
  1. deepresearch_flow/__init__.py +5 -0
  2. deepresearch_flow/cli.py +23 -0
  3. deepresearch_flow/paper/__init__.py +1 -0
  4. deepresearch_flow/paper/cli.py +286 -0
  5. deepresearch_flow/paper/config.py +249 -0
  6. deepresearch_flow/paper/db.py +768 -0
  7. deepresearch_flow/paper/extract.py +870 -0
  8. deepresearch_flow/paper/llm.py +115 -0
  9. deepresearch_flow/paper/prompt_templates/__init__.py +1 -0
  10. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +6 -0
  11. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +82 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +6 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +28 -0
  14. deepresearch_flow/paper/prompt_templates/simple_system.j2 +6 -0
  15. deepresearch_flow/paper/prompt_templates/simple_user.j2 +24 -0
  16. deepresearch_flow/paper/prompt_templates/three_pass_system.j2 +6 -0
  17. deepresearch_flow/paper/prompt_templates/three_pass_user.j2 +44 -0
  18. deepresearch_flow/paper/prompts.py +11 -0
  19. deepresearch_flow/paper/providers/__init__.py +1 -0
  20. deepresearch_flow/paper/providers/azure_openai.py +66 -0
  21. deepresearch_flow/paper/providers/base.py +19 -0
  22. deepresearch_flow/paper/providers/claude.py +71 -0
  23. deepresearch_flow/paper/providers/dashscope.py +58 -0
  24. deepresearch_flow/paper/providers/gemini.py +116 -0
  25. deepresearch_flow/paper/providers/ollama.py +46 -0
  26. deepresearch_flow/paper/providers/openai_compatible.py +60 -0
  27. deepresearch_flow/paper/render.py +64 -0
  28. deepresearch_flow/paper/schema.py +58 -0
  29. deepresearch_flow/paper/schemas/__init__.py +1 -0
  30. deepresearch_flow/paper/schemas/deep_read_schema.json +46 -0
  31. deepresearch_flow/paper/schemas/default_paper_schema.json +47 -0
  32. deepresearch_flow/paper/schemas/eight_questions_schema.json +34 -0
  33. deepresearch_flow/paper/schemas/three_pass_schema.json +24 -0
  34. deepresearch_flow/paper/template_registry.py +189 -0
  35. deepresearch_flow/paper/templates/__init__.py +1 -0
  36. deepresearch_flow/paper/templates/deep_read.md.j2 +79 -0
  37. deepresearch_flow/paper/templates/default_paper.md.j2 +32 -0
  38. deepresearch_flow/paper/templates/eight_questions.md.j2 +49 -0
  39. deepresearch_flow/paper/templates/three_pass.md.j2 +28 -0
  40. deepresearch_flow/paper/utils.py +136 -0
  41. deepresearch_flow/paper/web/__init__.py +2 -0
  42. deepresearch_flow/paper/web/app.py +2307 -0
  43. deepresearch_flow/paper/web/pdfjs/LICENSE +177 -0
  44. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-EUC-H.bcmap +0 -0
  45. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-EUC-V.bcmap +0 -0
  46. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-H.bcmap +0 -0
  47. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-RKSJ-H.bcmap +0 -0
  48. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-RKSJ-V.bcmap +0 -0
  49. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-V.bcmap +0 -0
  50. deepresearch_flow/paper/web/pdfjs/web/cmaps/78ms-RKSJ-H.bcmap +0 -0
  51. deepresearch_flow/paper/web/pdfjs/web/cmaps/78ms-RKSJ-V.bcmap +0 -0
  52. deepresearch_flow/paper/web/pdfjs/web/cmaps/83pv-RKSJ-H.bcmap +0 -0
  53. deepresearch_flow/paper/web/pdfjs/web/cmaps/90ms-RKSJ-H.bcmap +0 -0
  54. deepresearch_flow/paper/web/pdfjs/web/cmaps/90ms-RKSJ-V.bcmap +0 -0
  55. deepresearch_flow/paper/web/pdfjs/web/cmaps/90msp-RKSJ-H.bcmap +0 -0
  56. deepresearch_flow/paper/web/pdfjs/web/cmaps/90msp-RKSJ-V.bcmap +0 -0
  57. deepresearch_flow/paper/web/pdfjs/web/cmaps/90pv-RKSJ-H.bcmap +0 -0
  58. deepresearch_flow/paper/web/pdfjs/web/cmaps/90pv-RKSJ-V.bcmap +0 -0
  59. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-H.bcmap +0 -0
  60. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-RKSJ-H.bcmap +0 -0
  61. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-RKSJ-V.bcmap +0 -0
  62. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-V.bcmap +0 -0
  63. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-0.bcmap +0 -0
  64. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-1.bcmap +0 -0
  65. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-2.bcmap +0 -0
  66. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-3.bcmap +0 -0
  67. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-4.bcmap +0 -0
  68. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-5.bcmap +0 -0
  69. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-6.bcmap +0 -0
  70. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-UCS2.bcmap +0 -0
  71. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-0.bcmap +0 -0
  72. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-1.bcmap +0 -0
  73. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-2.bcmap +0 -0
  74. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-3.bcmap +0 -0
  75. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-4.bcmap +0 -0
  76. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-5.bcmap +0 -0
  77. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-UCS2.bcmap +0 -0
  78. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-0.bcmap +0 -0
  79. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-1.bcmap +0 -0
  80. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-2.bcmap +0 -0
  81. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-3.bcmap +0 -0
  82. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-4.bcmap +0 -0
  83. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-5.bcmap +0 -0
  84. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-6.bcmap +0 -0
  85. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-UCS2.bcmap +0 -0
  86. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-0.bcmap +0 -0
  87. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-1.bcmap +0 -0
  88. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-2.bcmap +0 -0
  89. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-UCS2.bcmap +0 -0
  90. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5-H.bcmap +0 -0
  91. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5-V.bcmap +0 -0
  92. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5pc-H.bcmap +0 -0
  93. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5pc-V.bcmap +0 -0
  94. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS-EUC-H.bcmap +0 -0
  95. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS-EUC-V.bcmap +0 -0
  96. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS1-H.bcmap +0 -0
  97. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS1-V.bcmap +0 -0
  98. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS2-H.bcmap +0 -0
  99. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS2-V.bcmap +3 -0
  100. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETHK-B5-H.bcmap +0 -0
  101. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETHK-B5-V.bcmap +0 -0
  102. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETen-B5-H.bcmap +0 -0
  103. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETen-B5-V.bcmap +0 -0
  104. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETenms-B5-H.bcmap +3 -0
  105. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETenms-B5-V.bcmap +0 -0
  106. deepresearch_flow/paper/web/pdfjs/web/cmaps/EUC-H.bcmap +0 -0
  107. deepresearch_flow/paper/web/pdfjs/web/cmaps/EUC-V.bcmap +0 -0
  108. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-H.bcmap +0 -0
  109. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-RKSJ-H.bcmap +0 -0
  110. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-RKSJ-V.bcmap +0 -0
  111. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-V.bcmap +0 -0
  112. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-EUC-H.bcmap +0 -0
  113. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-EUC-V.bcmap +0 -0
  114. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-H.bcmap +4 -0
  115. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-V.bcmap +0 -0
  116. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK-EUC-H.bcmap +0 -0
  117. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK-EUC-V.bcmap +0 -0
  118. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK2K-H.bcmap +0 -0
  119. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK2K-V.bcmap +0 -0
  120. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBKp-EUC-H.bcmap +0 -0
  121. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBKp-EUC-V.bcmap +0 -0
  122. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-EUC-H.bcmap +0 -0
  123. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-EUC-V.bcmap +0 -0
  124. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-H.bcmap +0 -0
  125. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-V.bcmap +0 -0
  126. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBTpc-EUC-H.bcmap +0 -0
  127. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBTpc-EUC-V.bcmap +0 -0
  128. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBpc-EUC-H.bcmap +0 -0
  129. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBpc-EUC-V.bcmap +0 -0
  130. deepresearch_flow/paper/web/pdfjs/web/cmaps/H.bcmap +0 -0
  131. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdla-B5-H.bcmap +0 -0
  132. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdla-B5-V.bcmap +0 -0
  133. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdlb-B5-H.bcmap +0 -0
  134. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdlb-B5-V.bcmap +0 -0
  135. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKgccs-B5-H.bcmap +0 -0
  136. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKgccs-B5-V.bcmap +0 -0
  137. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm314-B5-H.bcmap +0 -0
  138. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm314-B5-V.bcmap +0 -0
  139. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm471-B5-H.bcmap +0 -0
  140. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm471-B5-V.bcmap +0 -0
  141. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKscs-B5-H.bcmap +0 -0
  142. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKscs-B5-V.bcmap +0 -0
  143. deepresearch_flow/paper/web/pdfjs/web/cmaps/Hankaku.bcmap +0 -0
  144. deepresearch_flow/paper/web/pdfjs/web/cmaps/Hiragana.bcmap +0 -0
  145. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-EUC-H.bcmap +0 -0
  146. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-EUC-V.bcmap +0 -0
  147. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-H.bcmap +0 -0
  148. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-Johab-H.bcmap +0 -0
  149. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-Johab-V.bcmap +0 -0
  150. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-V.bcmap +0 -0
  151. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-H.bcmap +0 -0
  152. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-HW-H.bcmap +0 -0
  153. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-HW-V.bcmap +0 -0
  154. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-V.bcmap +0 -0
  155. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCpc-EUC-H.bcmap +0 -0
  156. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCpc-EUC-V.bcmap +0 -0
  157. deepresearch_flow/paper/web/pdfjs/web/cmaps/Katakana.bcmap +0 -0
  158. deepresearch_flow/paper/web/pdfjs/web/cmaps/LICENSE +36 -0
  159. deepresearch_flow/paper/web/pdfjs/web/cmaps/NWP-H.bcmap +0 -0
  160. deepresearch_flow/paper/web/pdfjs/web/cmaps/NWP-V.bcmap +0 -0
  161. deepresearch_flow/paper/web/pdfjs/web/cmaps/RKSJ-H.bcmap +0 -0
  162. deepresearch_flow/paper/web/pdfjs/web/cmaps/RKSJ-V.bcmap +0 -0
  163. deepresearch_flow/paper/web/pdfjs/web/cmaps/Roman.bcmap +0 -0
  164. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UCS2-H.bcmap +0 -0
  165. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UCS2-V.bcmap +0 -0
  166. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF16-H.bcmap +0 -0
  167. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF16-V.bcmap +0 -0
  168. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF32-H.bcmap +0 -0
  169. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF32-V.bcmap +0 -0
  170. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF8-H.bcmap +0 -0
  171. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF8-V.bcmap +0 -0
  172. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UCS2-H.bcmap +0 -0
  173. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UCS2-V.bcmap +0 -0
  174. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF16-H.bcmap +0 -0
  175. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF16-V.bcmap +0 -0
  176. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF32-H.bcmap +0 -0
  177. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF32-V.bcmap +0 -0
  178. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF8-H.bcmap +0 -0
  179. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF8-V.bcmap +0 -0
  180. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-H.bcmap +0 -0
  181. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-HW-H.bcmap +0 -0
  182. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-HW-V.bcmap +0 -0
  183. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-V.bcmap +0 -0
  184. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF16-H.bcmap +0 -0
  185. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF16-V.bcmap +0 -0
  186. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF32-H.bcmap +0 -0
  187. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF32-V.bcmap +0 -0
  188. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF8-H.bcmap +0 -0
  189. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF8-V.bcmap +0 -0
  190. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF16-H.bcmap +0 -0
  191. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF16-V.bcmap +0 -0
  192. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF32-H.bcmap +0 -0
  193. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF32-V.bcmap +0 -0
  194. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF8-H.bcmap +0 -0
  195. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF8-V.bcmap +0 -0
  196. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UCS2-HW-V.bcmap +0 -0
  197. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UCS2-V.bcmap +0 -0
  198. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UTF8-V.bcmap +0 -0
  199. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX0213-UTF32-H.bcmap +0 -0
  200. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX0213-UTF32-V.bcmap +0 -0
  201. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX02132004-UTF32-H.bcmap +0 -0
  202. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX02132004-UTF32-V.bcmap +0 -0
  203. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UCS2-H.bcmap +0 -0
  204. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UCS2-V.bcmap +0 -0
  205. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF16-H.bcmap +0 -0
  206. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF16-V.bcmap +0 -0
  207. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF32-H.bcmap +0 -0
  208. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF32-V.bcmap +0 -0
  209. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF8-H.bcmap +0 -0
  210. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF8-V.bcmap +0 -0
  211. deepresearch_flow/paper/web/pdfjs/web/cmaps/V.bcmap +0 -0
  212. deepresearch_flow/paper/web/pdfjs/web/cmaps/WP-Symbol.bcmap +0 -0
  213. deepresearch_flow/paper/web/pdfjs/web/compressed.tracemonkey-pldi-09.pdf +0 -0
  214. deepresearch_flow/paper/web/pdfjs/web/debugger.css +111 -0
  215. deepresearch_flow/paper/web/pdfjs/web/debugger.js +611 -0
  216. deepresearch_flow/paper/web/pdfjs/web/images/altText_add.svg +3 -0
  217. deepresearch_flow/paper/web/pdfjs/web/images/altText_done.svg +3 -0
  218. deepresearch_flow/paper/web/pdfjs/web/images/annotation-check.svg +11 -0
  219. deepresearch_flow/paper/web/pdfjs/web/images/annotation-comment.svg +16 -0
  220. deepresearch_flow/paper/web/pdfjs/web/images/annotation-help.svg +26 -0
  221. deepresearch_flow/paper/web/pdfjs/web/images/annotation-insert.svg +10 -0
  222. deepresearch_flow/paper/web/pdfjs/web/images/annotation-key.svg +11 -0
  223. deepresearch_flow/paper/web/pdfjs/web/images/annotation-newparagraph.svg +11 -0
  224. deepresearch_flow/paper/web/pdfjs/web/images/annotation-noicon.svg +7 -0
  225. deepresearch_flow/paper/web/pdfjs/web/images/annotation-note.svg +42 -0
  226. deepresearch_flow/paper/web/pdfjs/web/images/annotation-paperclip.svg +6 -0
  227. deepresearch_flow/paper/web/pdfjs/web/images/annotation-paragraph.svg +16 -0
  228. deepresearch_flow/paper/web/pdfjs/web/images/annotation-pushpin.svg +7 -0
  229. deepresearch_flow/paper/web/pdfjs/web/images/cursor-editorFreeText.svg +3 -0
  230. deepresearch_flow/paper/web/pdfjs/web/images/cursor-editorInk.svg +4 -0
  231. deepresearch_flow/paper/web/pdfjs/web/images/findbarButton-next.svg +3 -0
  232. deepresearch_flow/paper/web/pdfjs/web/images/findbarButton-previous.svg +3 -0
  233. deepresearch_flow/paper/web/pdfjs/web/images/gv-toolbarButton-download.svg +3 -0
  234. deepresearch_flow/paper/web/pdfjs/web/images/gv-toolbarButton-openinapp.svg +11 -0
  235. deepresearch_flow/paper/web/pdfjs/web/images/loading-dark.svg +24 -0
  236. deepresearch_flow/paper/web/pdfjs/web/images/loading-icon.gif +0 -0
  237. deepresearch_flow/paper/web/pdfjs/web/images/loading.svg +1 -0
  238. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-documentProperties.svg +3 -0
  239. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-firstPage.svg +3 -0
  240. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-handTool.svg +3 -0
  241. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-lastPage.svg +3 -0
  242. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-rotateCcw.svg +3 -0
  243. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-rotateCw.svg +3 -0
  244. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollHorizontal.svg +3 -0
  245. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollPage.svg +3 -0
  246. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollVertical.svg +3 -0
  247. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollWrapped.svg +3 -0
  248. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-selectTool.svg +3 -0
  249. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadEven.svg +3 -0
  250. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadNone.svg +3 -0
  251. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadOdd.svg +3 -0
  252. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-bookmark.svg +3 -0
  253. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-currentOutlineItem.svg +3 -0
  254. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-download.svg +4 -0
  255. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorFreeText.svg +3 -0
  256. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorInk.svg +4 -0
  257. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorStamp.svg +8 -0
  258. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-menuArrow.svg +3 -0
  259. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-openFile.svg +3 -0
  260. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-pageDown.svg +3 -0
  261. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-pageUp.svg +3 -0
  262. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-presentationMode.svg +3 -0
  263. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-print.svg +3 -0
  264. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-search.svg +3 -0
  265. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-secondaryToolbarToggle.svg +3 -0
  266. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-sidebarToggle.svg +3 -0
  267. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewAttachments.svg +3 -0
  268. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewLayers.svg +3 -0
  269. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewOutline.svg +3 -0
  270. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewThumbnail.svg +3 -0
  271. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-zoomIn.svg +3 -0
  272. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-zoomOut.svg +3 -0
  273. deepresearch_flow/paper/web/pdfjs/web/images/treeitem-collapsed.svg +1 -0
  274. deepresearch_flow/paper/web/pdfjs/web/images/treeitem-expanded.svg +1 -0
  275. deepresearch_flow/paper/web/pdfjs/web/locale/ach/viewer.properties +203 -0
  276. deepresearch_flow/paper/web/pdfjs/web/locale/af/viewer.properties +156 -0
  277. deepresearch_flow/paper/web/pdfjs/web/locale/an/viewer.properties +222 -0
  278. deepresearch_flow/paper/web/pdfjs/web/locale/ar/viewer.properties +224 -0
  279. deepresearch_flow/paper/web/pdfjs/web/locale/ast/viewer.properties +185 -0
  280. deepresearch_flow/paper/web/pdfjs/web/locale/az/viewer.properties +222 -0
  281. deepresearch_flow/paper/web/pdfjs/web/locale/be/viewer.properties +270 -0
  282. deepresearch_flow/paper/web/pdfjs/web/locale/bg/viewer.properties +214 -0
  283. deepresearch_flow/paper/web/pdfjs/web/locale/bn/viewer.properties +218 -0
  284. deepresearch_flow/paper/web/pdfjs/web/locale/bo/viewer.properties +217 -0
  285. deepresearch_flow/paper/web/pdfjs/web/locale/br/viewer.properties +224 -0
  286. deepresearch_flow/paper/web/pdfjs/web/locale/brx/viewer.properties +184 -0
  287. deepresearch_flow/paper/web/pdfjs/web/locale/bs/viewer.properties +173 -0
  288. deepresearch_flow/paper/web/pdfjs/web/locale/ca/viewer.properties +256 -0
  289. deepresearch_flow/paper/web/pdfjs/web/locale/cak/viewer.properties +253 -0
  290. deepresearch_flow/paper/web/pdfjs/web/locale/ckb/viewer.properties +213 -0
  291. deepresearch_flow/paper/web/pdfjs/web/locale/cs/viewer.properties +284 -0
  292. deepresearch_flow/paper/web/pdfjs/web/locale/cy/viewer.properties +270 -0
  293. deepresearch_flow/paper/web/pdfjs/web/locale/da/viewer.properties +270 -0
  294. deepresearch_flow/paper/web/pdfjs/web/locale/de/viewer.properties +270 -0
  295. deepresearch_flow/paper/web/pdfjs/web/locale/dsb/viewer.properties +284 -0
  296. deepresearch_flow/paper/web/pdfjs/web/locale/el/viewer.properties +270 -0
  297. deepresearch_flow/paper/web/pdfjs/web/locale/en-CA/viewer.properties +270 -0
  298. deepresearch_flow/paper/web/pdfjs/web/locale/en-GB/viewer.properties +284 -0
  299. deepresearch_flow/paper/web/pdfjs/web/locale/en-US/viewer.properties +282 -0
  300. deepresearch_flow/paper/web/pdfjs/web/locale/eo/viewer.properties +270 -0
  301. deepresearch_flow/paper/web/pdfjs/web/locale/es-AR/viewer.properties +284 -0
  302. deepresearch_flow/paper/web/pdfjs/web/locale/es-CL/viewer.properties +284 -0
  303. deepresearch_flow/paper/web/pdfjs/web/locale/es-ES/viewer.properties +270 -0
  304. deepresearch_flow/paper/web/pdfjs/web/locale/es-MX/viewer.properties +257 -0
  305. deepresearch_flow/paper/web/pdfjs/web/locale/et/viewer.properties +229 -0
  306. deepresearch_flow/paper/web/pdfjs/web/locale/eu/viewer.properties +284 -0
  307. deepresearch_flow/paper/web/pdfjs/web/locale/fa/viewer.properties +221 -0
  308. deepresearch_flow/paper/web/pdfjs/web/locale/ff/viewer.properties +214 -0
  309. deepresearch_flow/paper/web/pdfjs/web/locale/fi/viewer.properties +270 -0
  310. deepresearch_flow/paper/web/pdfjs/web/locale/fr/viewer.properties +270 -0
  311. deepresearch_flow/paper/web/pdfjs/web/locale/fur/viewer.properties +270 -0
  312. deepresearch_flow/paper/web/pdfjs/web/locale/fy-NL/viewer.properties +270 -0
  313. deepresearch_flow/paper/web/pdfjs/web/locale/ga-IE/viewer.properties +181 -0
  314. deepresearch_flow/paper/web/pdfjs/web/locale/gd/viewer.properties +257 -0
  315. deepresearch_flow/paper/web/pdfjs/web/locale/gl/viewer.properties +267 -0
  316. deepresearch_flow/paper/web/pdfjs/web/locale/gn/viewer.properties +278 -0
  317. deepresearch_flow/paper/web/pdfjs/web/locale/gu-IN/viewer.properties +214 -0
  318. deepresearch_flow/paper/web/pdfjs/web/locale/he/viewer.properties +283 -0
  319. deepresearch_flow/paper/web/pdfjs/web/locale/hi-IN/viewer.properties +227 -0
  320. deepresearch_flow/paper/web/pdfjs/web/locale/hr/viewer.properties +243 -0
  321. deepresearch_flow/paper/web/pdfjs/web/locale/hsb/viewer.properties +284 -0
  322. deepresearch_flow/paper/web/pdfjs/web/locale/hu/viewer.properties +284 -0
  323. deepresearch_flow/paper/web/pdfjs/web/locale/hy-AM/viewer.properties +232 -0
  324. deepresearch_flow/paper/web/pdfjs/web/locale/hye/viewer.properties +229 -0
  325. deepresearch_flow/paper/web/pdfjs/web/locale/ia/viewer.properties +284 -0
  326. deepresearch_flow/paper/web/pdfjs/web/locale/id/viewer.properties +253 -0
  327. deepresearch_flow/paper/web/pdfjs/web/locale/is/viewer.properties +284 -0
  328. deepresearch_flow/paper/web/pdfjs/web/locale/it/viewer.properties +284 -0
  329. deepresearch_flow/paper/web/pdfjs/web/locale/ja/viewer.properties +270 -0
  330. deepresearch_flow/paper/web/pdfjs/web/locale/ka/viewer.properties +284 -0
  331. deepresearch_flow/paper/web/pdfjs/web/locale/kab/viewer.properties +264 -0
  332. deepresearch_flow/paper/web/pdfjs/web/locale/kk/viewer.properties +284 -0
  333. deepresearch_flow/paper/web/pdfjs/web/locale/km/viewer.properties +189 -0
  334. deepresearch_flow/paper/web/pdfjs/web/locale/kn/viewer.properties +166 -0
  335. deepresearch_flow/paper/web/pdfjs/web/locale/ko/viewer.properties +284 -0
  336. deepresearch_flow/paper/web/pdfjs/web/locale/lij/viewer.properties +214 -0
  337. deepresearch_flow/paper/web/pdfjs/web/locale/lo/viewer.properties +257 -0
  338. deepresearch_flow/paper/web/pdfjs/web/locale/locale.properties +333 -0
  339. deepresearch_flow/paper/web/pdfjs/web/locale/lt/viewer.properties +229 -0
  340. deepresearch_flow/paper/web/pdfjs/web/locale/ltg/viewer.properties +192 -0
  341. deepresearch_flow/paper/web/pdfjs/web/locale/lv/viewer.properties +214 -0
  342. deepresearch_flow/paper/web/pdfjs/web/locale/meh/viewer.properties +106 -0
  343. deepresearch_flow/paper/web/pdfjs/web/locale/mk/viewer.properties +211 -0
  344. deepresearch_flow/paper/web/pdfjs/web/locale/mr/viewer.properties +210 -0
  345. deepresearch_flow/paper/web/pdfjs/web/locale/ms/viewer.properties +214 -0
  346. deepresearch_flow/paper/web/pdfjs/web/locale/my/viewer.properties +170 -0
  347. deepresearch_flow/paper/web/pdfjs/web/locale/nb-NO/viewer.properties +284 -0
  348. deepresearch_flow/paper/web/pdfjs/web/locale/ne-NP/viewer.properties +197 -0
  349. deepresearch_flow/paper/web/pdfjs/web/locale/nl/viewer.properties +274 -0
  350. deepresearch_flow/paper/web/pdfjs/web/locale/nn-NO/viewer.properties +270 -0
  351. deepresearch_flow/paper/web/pdfjs/web/locale/oc/viewer.properties +278 -0
  352. deepresearch_flow/paper/web/pdfjs/web/locale/pa-IN/viewer.properties +270 -0
  353. deepresearch_flow/paper/web/pdfjs/web/locale/pl/viewer.properties +270 -0
  354. deepresearch_flow/paper/web/pdfjs/web/locale/pt-BR/viewer.properties +270 -0
  355. deepresearch_flow/paper/web/pdfjs/web/locale/pt-PT/viewer.properties +270 -0
  356. deepresearch_flow/paper/web/pdfjs/web/locale/rm/viewer.properties +270 -0
  357. deepresearch_flow/paper/web/pdfjs/web/locale/ro/viewer.properties +220 -0
  358. deepresearch_flow/paper/web/pdfjs/web/locale/ru/viewer.properties +270 -0
  359. deepresearch_flow/paper/web/pdfjs/web/locale/sat/viewer.properties +270 -0
  360. deepresearch_flow/paper/web/pdfjs/web/locale/sc/viewer.properties +258 -0
  361. deepresearch_flow/paper/web/pdfjs/web/locale/scn/viewer.properties +101 -0
  362. deepresearch_flow/paper/web/pdfjs/web/locale/sco/viewer.properties +226 -0
  363. deepresearch_flow/paper/web/pdfjs/web/locale/si/viewer.properties +228 -0
  364. deepresearch_flow/paper/web/pdfjs/web/locale/sk/viewer.properties +270 -0
  365. deepresearch_flow/paper/web/pdfjs/web/locale/skr/viewer.properties +264 -0
  366. deepresearch_flow/paper/web/pdfjs/web/locale/sl/viewer.properties +284 -0
  367. deepresearch_flow/paper/web/pdfjs/web/locale/son/viewer.properties +152 -0
  368. deepresearch_flow/paper/web/pdfjs/web/locale/sq/viewer.properties +247 -0
  369. deepresearch_flow/paper/web/pdfjs/web/locale/sr/viewer.properties +259 -0
  370. deepresearch_flow/paper/web/pdfjs/web/locale/sv-SE/viewer.properties +284 -0
  371. deepresearch_flow/paper/web/pdfjs/web/locale/szl/viewer.properties +224 -0
  372. deepresearch_flow/paper/web/pdfjs/web/locale/ta/viewer.properties +173 -0
  373. deepresearch_flow/paper/web/pdfjs/web/locale/te/viewer.properties +216 -0
  374. deepresearch_flow/paper/web/pdfjs/web/locale/tg/viewer.properties +281 -0
  375. deepresearch_flow/paper/web/pdfjs/web/locale/th/viewer.properties +270 -0
  376. deepresearch_flow/paper/web/pdfjs/web/locale/tl/viewer.properties +222 -0
  377. deepresearch_flow/paper/web/pdfjs/web/locale/tr/viewer.properties +283 -0
  378. deepresearch_flow/paper/web/pdfjs/web/locale/trs/viewer.properties +184 -0
  379. deepresearch_flow/paper/web/pdfjs/web/locale/uk/viewer.properties +284 -0
  380. deepresearch_flow/paper/web/pdfjs/web/locale/ur/viewer.properties +218 -0
  381. deepresearch_flow/paper/web/pdfjs/web/locale/uz/viewer.properties +142 -0
  382. deepresearch_flow/paper/web/pdfjs/web/locale/vi/viewer.properties +270 -0
  383. deepresearch_flow/paper/web/pdfjs/web/locale/wo/viewer.properties +104 -0
  384. deepresearch_flow/paper/web/pdfjs/web/locale/xh/viewer.properties +156 -0
  385. deepresearch_flow/paper/web/pdfjs/web/locale/zh-CN/viewer.properties +284 -0
  386. deepresearch_flow/paper/web/pdfjs/web/locale/zh-TW/viewer.properties +281 -0
  387. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitDingbats.pfb +0 -0
  388. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixed.pfb +0 -0
  389. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedBold.pfb +0 -0
  390. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedBoldItalic.pfb +0 -0
  391. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedItalic.pfb +0 -0
  392. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerif.pfb +0 -0
  393. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifBold.pfb +0 -0
  394. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifBoldItalic.pfb +0 -0
  395. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifItalic.pfb +0 -0
  396. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSymbol.pfb +0 -0
  397. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LICENSE_FOXIT +27 -0
  398. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LICENSE_LIBERATION +102 -0
  399. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Bold.ttf +0 -0
  400. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-BoldItalic.ttf +0 -0
  401. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Italic.ttf +0 -0
  402. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Regular.ttf +0 -0
  403. deepresearch_flow/paper/web/pdfjs/web/viewer.css +3528 -0
  404. deepresearch_flow/paper/web/pdfjs/web/viewer.html +486 -0
  405. deepresearch_flow/paper/web/pdfjs/web/viewer.js +14099 -0
  406. deepresearch_flow/paper/web/pdfjs/web/viewer.js.map +1 -0
  407. deepresearch_flow/paper/web/query.py +90 -0
  408. deepresearch_flow/recognize/__init__.py +1 -0
  409. deepresearch_flow/recognize/cli.py +469 -0
  410. deepresearch_flow/recognize/markdown.py +277 -0
  411. deepresearch_flow/recognize/organize.py +95 -0
  412. deepresearch_flow-0.1.1.dist-info/METADATA +416 -0
  413. deepresearch_flow-0.1.1.dist-info/RECORD +417 -0
  414. deepresearch_flow-0.1.1.dist-info/WHEEL +5 -0
  415. deepresearch_flow-0.1.1.dist-info/entry_points.txt +2 -0
  416. deepresearch_flow-0.1.1.dist-info/licenses/LICENSE +21 -0
  417. deepresearch_flow-0.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,90 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ import re
5
+
6
+
7
@dataclass(frozen=True)
class QueryTerm:
    """A single parsed search term.

    Instances are immutable (the dataclass is frozen).
    """

    # Lower-cased field prefix (e.g. "title") when the token was written as
    # ``field:value``; ``None`` for a free-text term.
    field: str | None
    # The text to match, with surrounding whitespace already stripped.
    value: str
    # True when the token was prefixed with "-".
    negated: bool
12
+
13
+
14
@dataclass(frozen=True)
class Query:
    """A parsed search query in disjunctive form."""

    # Outer list: alternatives combined with OR.
    # Inner list: terms of one alternative, combined with AND.
    groups: list[list[QueryTerm]]
18
+
19
+
20
# Matches "field:value" tokens for the supported field names (case-insensitive).
_FIELD_RE = re.compile(r"^(title|author|tag|venue|year|month):(.+)$", re.IGNORECASE)


def parse_query(text: str) -> Query:
    """Parse a search string into a :class:`Query`.

    Tokens come from ``_tokenize``. A token equal to ``OR`` (any case) starts
    a new group, provided the current group already has terms. A leading
    ``-`` marks a term as negated. A token of the form ``field:value`` with a
    recognised field name becomes a fielded term; anything else is a
    free-text term.

    An empty or whitespace-only input (or ``None``) yields a query with a
    single empty group.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return Query(groups=[[]])

    groups: list[list[QueryTerm]] = [[]]
    for raw in _tokenize(cleaned):
        if raw.upper() == "OR":
            # Only open a new alternative when the current one has content,
            # so leading or repeated ORs do not create empty groups.
            if groups[-1]:
                groups.append([])
            continue

        negated = raw.startswith("-")
        body = raw[1:].strip() if negated else raw
        if not body:
            # A bare "-" carries no term.
            continue

        matched = _FIELD_RE.match(body)
        if matched:
            field = matched.group(1).lower()
            value = matched.group(2).strip()
        else:
            field = None
            value = body

        if value:
            groups[-1].append(QueryTerm(field=field, value=value, negated=negated))

    populated = [group for group in groups if group]
    return Query(groups=populated or [[]])
59
+
60
+
61
+ def _tokenize(text: str) -> list[str]:
62
+ out: list[str] = []
63
+ buf: list[str] = []
64
+ in_quote = False
65
+
66
+ idx = 0
67
+ while idx < len(text):
68
+ ch = text[idx]
69
+ if ch == '"':
70
+ in_quote = not in_quote
71
+ idx += 1
72
+ continue
73
+
74
+ if not in_quote and ch.isspace():
75
+ token = "".join(buf).strip()
76
+ if token:
77
+ out.append(token)
78
+ buf = []
79
+ idx += 1
80
+ continue
81
+
82
+ buf.append(ch)
83
+ idx += 1
84
+
85
+ token = "".join(buf).strip()
86
+ if token:
87
+ out.append(token)
88
+
89
+ return out
90
+
@@ -0,0 +1 @@
1
+ """Recognize command helpers."""
@@ -0,0 +1,469 @@
1
+ """CLI commands for recognize workflows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ import time
8
+ from pathlib import Path
9
+ from typing import Awaitable, Callable, Iterable
10
+
11
+ import click
12
+ import coloredlogs
13
+ import httpx
14
+ from rich.console import Console
15
+ from rich.table import Table
16
+ from tqdm import tqdm
17
+
18
+ from deepresearch_flow.paper.utils import discover_markdown
19
+ from deepresearch_flow.recognize.markdown import (
20
+ DEFAULT_USER_AGENT,
21
+ HTTP_TIMEOUT_SECONDS,
22
+ NameRegistry,
23
+ count_markdown_images,
24
+ embed_markdown_images,
25
+ read_text,
26
+ sanitize_filename,
27
+ unpack_markdown_images,
28
+ )
29
+ from deepresearch_flow.recognize.organize import discover_mineru_dirs, organize_mineru_dir
30
+
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
def configure_logging(verbose: bool) -> None:
    """Install coloredlogs at DEBUG level when ``verbose`` is set, else INFO."""
    if verbose:
        level = "DEBUG"
    else:
        level = "INFO"
    coloredlogs.install(level=level, fmt="%(asctime)s %(levelname)s %(message)s")
38
+
39
+
40
+ def _ensure_output_dir(path_str: str) -> Path:
41
+ output_dir = Path(path_str)
42
+ output_dir.mkdir(parents=True, exist_ok=True)
43
+ return output_dir
44
+
45
+
46
+ def _relative_path(path: Path) -> str:
47
+ try:
48
+ return str(path.resolve().relative_to(Path.cwd().resolve()))
49
+ except ValueError:
50
+ return str(path.resolve())
51
+
52
+
53
def _warn_if_not_empty(output_dir: Path) -> None:
    """Log a warning when ``output_dir`` exists and contains at least one entry."""
    if not output_dir.exists():
        return
    # Path objects are always truthy, so any() is true as soon as one entry exists.
    has_entries = any(output_dir.iterdir())
    if has_entries:
        logger.warning("Output directory not empty: %s", output_dir)
56
+
57
+
58
def _print_summary(title: str, rows: list[tuple[str, str]]) -> None:
    """Render ``rows`` as a two-column (Item / Value) rich table and print it."""
    summary = Table(title=title, header_style="bold cyan", title_style="bold magenta")
    summary.add_column("Item", style="cyan", no_wrap=True)
    summary.add_column("Value", style="white", overflow="fold")
    for label, text in rows:
        summary.add_row(label, text)
    console = Console()
    console.print(summary)
65
+
66
+
67
def _unique_output_filename(
    base: str,
    output_dirs: Iterable[Path],
    used: set[str],
) -> str:
    """Pick a ``.md`` filename that is neither reserved nor already on disk.

    Args:
        base: Preferred name stem. It is passed through ``sanitize_filename``;
            if that yields an empty result, ``"document"`` is used.
        output_dirs: Directories in which the name must not already exist.
        used: Names already handed out. The chosen name is added to this set.

    Returns:
        ``<stem>.md``, or ``<stem>_<n>.md`` with the smallest ``n >= 1`` that
        avoids a collision.
    """
    stem = sanitize_filename(base) or "document"
    # Materialize once: the directories are re-scanned for every candidate,
    # and a one-shot iterator would be exhausted after the first scan, so
    # later candidates would never be checked against the disk.
    directories = tuple(output_dirs)

    def is_taken(name: str) -> bool:
        if name in used:
            return True
        return any((directory / name).exists() for directory in directories)

    candidate = f"{stem}.md"
    counter = 0
    while is_taken(candidate):
        counter += 1
        candidate = f"{stem}_{counter}.md"
    used.add(candidate)
    return candidate
80
+
81
+
82
def _map_output_files(paths: Iterable[Path], output_dirs: list[Path]) -> dict[Path, str]:
    """Assign each input path an output filename derived from its stem.

    Names are produced by ``_unique_output_filename`` with a shared set of
    already-assigned names, in the iteration order of ``paths``.
    """
    assigned: set[str] = set()
    return {
        source: _unique_output_filename(source.stem, output_dirs, assigned)
        for source in paths
    }
89
+
90
+
91
def _aggregate_image_counts(paths: Iterable[Path]) -> dict[str, int]:
    """Sum the per-file image counts over all ``paths``.

    Each file is read with ``read_text`` and counted with
    ``count_markdown_images``. Only the keys "total", "data", "http" and
    "local" are accumulated; a key missing from a file's counts adds 0.
    """
    totals = dict.fromkeys(("total", "data", "http", "local"), 0)
    for source in paths:
        per_file = count_markdown_images(read_text(source))
        for category in totals:
            totals[category] += per_file.get(category, 0)
    return totals
99
+
100
+
101
+ def _format_duration(seconds: float) -> str:
102
+ if seconds < 60:
103
+ return f"{seconds:.2f}s"
104
+ minutes, remainder = divmod(seconds, 60)
105
+ if minutes < 60:
106
+ return f"{int(minutes)}m {remainder:.1f}s"
107
+ hours, minutes = divmod(minutes, 60)
108
+ return f"{int(hours)}h {int(minutes)}m {remainder:.1f}s"
109
+
110
+
111
+ async def _run_with_workers(
112
+ items: Iterable[Path],
113
+ workers: int,
114
+ handler: Callable[[Path], Awaitable[None]],
115
+ progress: tqdm | None = None,
116
+ ) -> None:
117
+ semaphore = asyncio.Semaphore(workers)
118
+ progress_lock = asyncio.Lock() if progress else None
119
+
120
+ async def runner(item: Path) -> None:
121
+ async with semaphore:
122
+ await handler(item)
123
+ if progress and progress_lock:
124
+ async with progress_lock:
125
+ progress.update(1)
126
+
127
+ await asyncio.gather(*(runner(item) for item in items))
128
+
129
+
130
async def _run_md_embed(
    paths: list[Path],
    output_dir: Path,
    output_map: dict[Path, str],
    enable_http: bool,
    workers: int,
    progress: tqdm | None,
) -> None:
    """Embed images for every markdown file in ``paths`` and write the results.

    Each file is read, passed to ``embed_markdown_images`` and written as
    UTF-8 to ``output_dir / output_map[path]``. An HTTP client is created
    only when ``enable_http`` is set, and is always closed afterwards.
    """
    timeout = httpx.Timeout(HTTP_TIMEOUT_SECONDS)
    headers = {"User-Agent": DEFAULT_USER_AGENT}
    client: httpx.AsyncClient | None = (
        httpx.AsyncClient(timeout=timeout, headers=headers, follow_redirects=True)
        if enable_http
        else None
    )

    async def handler(source: Path) -> None:
        # File I/O is pushed to a thread so it does not block the event loop.
        original = await asyncio.to_thread(read_text, source)
        embedded = await embed_markdown_images(original, source, enable_http, client)
        destination = output_dir / output_map[source]
        await asyncio.to_thread(destination.write_text, embedded, encoding="utf-8")

    try:
        await _run_with_workers(paths, workers, handler, progress=progress)
    finally:
        if client is not None:
            await client.aclose()
155
+
156
+
157
async def _run_md_unpack(
    paths: list[Path],
    output_dir: Path,
    output_map: dict[Path, str],
    workers: int,
    progress: tqdm | None,
) -> None:
    """Unpack embedded images for every markdown file in ``paths``.

    An ``images`` directory is created under ``output_dir``. Each file is
    read, passed to ``unpack_markdown_images`` and written as UTF-8 to
    ``output_dir / output_map[path]``.
    """
    images_dir = output_dir / "images"
    images_dir.mkdir(parents=True, exist_ok=True)
    # One registry is shared by all files.
    # NOTE(review): presumably this keeps image names unique; confirm in NameRegistry.
    registry = NameRegistry(images_dir)

    async def handler(source: Path) -> None:
        original = await asyncio.to_thread(read_text, source)
        rewritten = await unpack_markdown_images(original, images_dir, registry)
        destination = output_dir / output_map[source]
        await asyncio.to_thread(destination.write_text, rewritten, encoding="utf-8")

    await _run_with_workers(paths, workers, handler, progress=progress)
175
+
176
+
177
async def _run_organize(
    layout_dirs: list[Path],
    output_simple: Path | None,
    output_base64: Path | None,
    output_map: dict[Path, str],
    workers: int,
    progress: tqdm | None,
) -> None:
    """Run ``organize_mineru_dir`` for every directory in ``layout_dirs``.

    When ``output_simple`` is given, an ``images`` directory is created
    beneath it and a ``NameRegistry`` for that directory is shared by all
    handlers; otherwise the registry passed on is ``None``.
    """
    if output_simple is None:
        image_registry = None
    else:
        images_dir = output_simple / "images"
        images_dir.mkdir(parents=True, exist_ok=True)
        image_registry = NameRegistry(images_dir)

    async def handler(layout_dir: Path) -> None:
        await organize_mineru_dir(
            layout_dir,
            output_simple,
            output_base64,
            output_map[layout_dir],
            image_registry,
        )

    await _run_with_workers(layout_dirs, workers, handler, progress=progress)
202
+
203
+
204
# Top-level command group; subcommands and subgroups attach to it via decorators.
@click.group()
def recognize() -> None:
    """OCR recognition and Markdown post-processing commands."""
207
+
208
+
209
# Subgroup of ``recognize`` that hosts the markdown image commands.
@recognize.group()
def md() -> None:
    """Markdown image utilities."""
212
+
213
+
214
@md.command()
@click.option(
    "-i",
    "--input",
    "inputs",
    multiple=True,
    required=True,
    help="Input markdown file or directory (repeatable)",
)
@click.option("-o", "--output", "output_dir", required=True, help="Output directory")
@click.option("-r", "--recursive", is_flag=True, help="Recursively discover markdown files")
@click.option("--enable-http", is_flag=True, help="Allow embedding HTTP(S) images")
@click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
@click.option("--dry-run", is_flag=True, help="Report actions without writing files")
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
def embed(
    inputs: tuple[str, ...],
    output_dir: str,
    recursive: bool,
    enable_http: bool,
    workers: int,
    dry_run: bool,
    verbose: bool,
) -> None:
    """Embed images into markdown as data URLs."""
    # The docstring above is the CLI help text; keep implementation notes in comments.
    configure_logging(verbose)
    started = time.monotonic()
    if workers <= 0:
        raise click.ClickException("--workers must be positive")

    # A dry run must not create the output directory.
    output_path = Path(output_dir) if dry_run else _ensure_output_dir(output_dir)
    _warn_if_not_empty(output_path)

    paths = discover_markdown(inputs, None, recursive=recursive)
    if not paths:
        click.echo("No markdown files discovered")
        return

    output_map = _map_output_files(paths, [output_path])
    image_counts = _aggregate_image_counts(paths)
    # HTTP images only count as embeddable when remote fetching is enabled.
    http_to_embed = image_counts["http"] if enable_http else 0
    embed_count = image_counts["local"] + http_to_embed

    def summary_rows() -> list[tuple[str, str]]:
        # Built on demand so that "Duration" reflects the time of reporting.
        return [
            ("Inputs", str(len(paths))),
            ("Outputs", str(len(output_map))),
            ("Images total", str(image_counts["total"])),
            ("Images to embed", str(embed_count)),
            ("Images data", str(image_counts["data"])),
            ("Images http", str(image_counts["http"])),
            ("Images local", str(image_counts["local"])),
            ("Output dir", _relative_path(output_path)),
            ("HTTP enabled", "yes" if enable_http else "no"),
            ("Duration", _format_duration(time.monotonic() - started)),
        ]

    if dry_run:
        _print_summary("recognize md embed (dry-run)", summary_rows())
        return

    progress = tqdm(total=len(paths), desc="embed", unit="file")
    try:
        asyncio.run(_run_md_embed(paths, output_path, output_map, enable_http, workers, progress))
    finally:
        progress.close()
    _print_summary("recognize md embed", summary_rows())
292
+
293
+
294
@md.command()
@click.option(
    "-i",
    "--input",
    "inputs",
    multiple=True,
    required=True,
    help="Input markdown file or directory (repeatable)",
)
@click.option("-o", "--output", "output_dir", required=True, help="Output directory")
@click.option("-r", "--recursive", is_flag=True, help="Recursively discover markdown files")
@click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
@click.option("--dry-run", is_flag=True, help="Report actions without writing files")
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
def unpack(
    inputs: tuple[str, ...],
    output_dir: str,
    recursive: bool,
    workers: int,
    dry_run: bool,
    verbose: bool,
) -> None:
    """Extract embedded data URLs into image files."""
    # The docstring above is the CLI help text; keep implementation notes in comments.
    configure_logging(verbose)
    started = time.monotonic()
    if workers <= 0:
        raise click.ClickException("--workers must be positive")

    # A dry run must not create the output directory.
    output_path = Path(output_dir) if dry_run else _ensure_output_dir(output_dir)
    _warn_if_not_empty(output_path)

    paths = discover_markdown(inputs, None, recursive=recursive)
    if not paths:
        click.echo("No markdown files discovered")
        return

    output_map = _map_output_files(paths, [output_path])
    image_counts = _aggregate_image_counts(paths)

    def summary_rows() -> list[tuple[str, str]]:
        # Built on demand so that "Duration" reflects the time of reporting.
        return [
            ("Inputs", str(len(paths))),
            ("Outputs", str(len(output_map))),
            ("Images total", str(image_counts["total"])),
            ("Images embedded", str(image_counts["data"])),
            ("Images http", str(image_counts["http"])),
            ("Images local", str(image_counts["local"])),
            ("Output dir", _relative_path(output_path)),
            ("Duration", _format_duration(time.monotonic() - started)),
        ]

    if dry_run:
        _print_summary("recognize md unpack (dry-run)", summary_rows())
        return

    progress = tqdm(total=len(paths), desc="unpack", unit="file")
    try:
        asyncio.run(_run_md_unpack(paths, output_path, output_map, workers, progress))
    finally:
        progress.close()
    _print_summary("recognize md unpack", summary_rows())
365
+
366
+
367
@recognize.command()
@click.option(
    "--layout",
    "layout",
    type=click.Choice(["mineru"]),
    default="mineru",
    show_default=True,
    help="OCR output layout type",
)
@click.option(
    "-i",
    "--input",
    "inputs",
    multiple=True,
    required=True,
    help="Input directory (repeatable)",
)
@click.option("-r", "--recursive", is_flag=True, help="Recursively search for layout folders")
@click.option("--output-simple", "output_simple", default=None, help="Output directory for copied markdown")
@click.option("--output-base64", "output_base64", default=None, help="Output directory for embedded markdown")
@click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
@click.option("--dry-run", is_flag=True, help="Report actions without writing files")
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
def organize(
    layout: str,
    inputs: tuple[str, ...],
    recursive: bool,
    output_simple: str | None,
    output_base64: str | None,
    workers: int,
    dry_run: bool,
    verbose: bool,
) -> None:
    """Organize OCR outputs into markdown files."""
    # The docstring above is the CLI help text; keep implementation notes in comments.
    configure_logging(verbose)
    started = time.monotonic()
    if workers <= 0:
        raise click.ClickException("--workers must be positive")
    # Validate by truthiness, matching how the paths are built below: an
    # empty string (e.g. --output-simple "") is treated as "not provided"
    # there, so it must not satisfy this requirement either.
    if not output_simple and not output_base64:
        raise click.ClickException("At least one of --output-simple or --output-base64 is required")

    # click.Choice already restricts the value; kept as a defensive check.
    if layout != "mineru":
        raise click.ClickException(f"Unsupported layout: {layout}")

    def prepare(raw: str | None) -> Path | None:
        # A dry run must not create directories.
        if not raw:
            return None
        return Path(raw) if dry_run else _ensure_output_dir(raw)

    output_simple_path = prepare(output_simple)
    output_base64_path = prepare(output_base64)
    output_dirs = [path for path in (output_simple_path, output_base64_path) if path]
    for output_dir in output_dirs:
        _warn_if_not_empty(output_dir)

    layout_dirs = discover_mineru_dirs(inputs, recursive)
    if not layout_dirs:
        click.echo("No layout directories discovered")
        return

    output_map = _map_output_files(layout_dirs, output_dirs)
    image_counts = _aggregate_image_counts([path / "full.md" for path in layout_dirs])

    def summary_rows() -> list[tuple[str, str]]:
        # Built on demand so that "Duration" reflects the time of reporting.
        return [
            ("Layout", layout),
            ("Inputs", str(len(layout_dirs))),
            ("Outputs", str(len(output_map))),
            ("Images total", str(image_counts["total"])),
            ("Images data", str(image_counts["data"])),
            ("Images http", str(image_counts["http"])),
            ("Images local", str(image_counts["local"])),
            ("Output simple", _relative_path(output_simple_path) if output_simple_path else "-"),
            ("Output base64", _relative_path(output_base64_path) if output_base64_path else "-"),
            ("Duration", _format_duration(time.monotonic() - started)),
        ]

    if dry_run:
        _print_summary("recognize organize (dry-run)", summary_rows())
        return

    progress = tqdm(total=len(layout_dirs), desc="organize", unit="doc")
    try:
        asyncio.run(
            _run_organize(
                layout_dirs,
                output_simple_path,
                output_base64_path,
                output_map,
                workers,
                progress,
            )
        )
    finally:
        progress.close()
    _print_summary("recognize organize", summary_rows())