deepresearch-flow 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (417)
  1. deepresearch_flow/__init__.py +5 -0
  2. deepresearch_flow/cli.py +23 -0
  3. deepresearch_flow/paper/__init__.py +1 -0
  4. deepresearch_flow/paper/cli.py +286 -0
  5. deepresearch_flow/paper/config.py +249 -0
  6. deepresearch_flow/paper/db.py +768 -0
  7. deepresearch_flow/paper/extract.py +870 -0
  8. deepresearch_flow/paper/llm.py +115 -0
  9. deepresearch_flow/paper/prompt_templates/__init__.py +1 -0
  10. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +6 -0
  11. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +82 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +6 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +28 -0
  14. deepresearch_flow/paper/prompt_templates/simple_system.j2 +6 -0
  15. deepresearch_flow/paper/prompt_templates/simple_user.j2 +24 -0
  16. deepresearch_flow/paper/prompt_templates/three_pass_system.j2 +6 -0
  17. deepresearch_flow/paper/prompt_templates/three_pass_user.j2 +44 -0
  18. deepresearch_flow/paper/prompts.py +11 -0
  19. deepresearch_flow/paper/providers/__init__.py +1 -0
  20. deepresearch_flow/paper/providers/azure_openai.py +66 -0
  21. deepresearch_flow/paper/providers/base.py +19 -0
  22. deepresearch_flow/paper/providers/claude.py +71 -0
  23. deepresearch_flow/paper/providers/dashscope.py +58 -0
  24. deepresearch_flow/paper/providers/gemini.py +116 -0
  25. deepresearch_flow/paper/providers/ollama.py +46 -0
  26. deepresearch_flow/paper/providers/openai_compatible.py +60 -0
  27. deepresearch_flow/paper/render.py +64 -0
  28. deepresearch_flow/paper/schema.py +58 -0
  29. deepresearch_flow/paper/schemas/__init__.py +1 -0
  30. deepresearch_flow/paper/schemas/deep_read_schema.json +46 -0
  31. deepresearch_flow/paper/schemas/default_paper_schema.json +47 -0
  32. deepresearch_flow/paper/schemas/eight_questions_schema.json +34 -0
  33. deepresearch_flow/paper/schemas/three_pass_schema.json +24 -0
  34. deepresearch_flow/paper/template_registry.py +189 -0
  35. deepresearch_flow/paper/templates/__init__.py +1 -0
  36. deepresearch_flow/paper/templates/deep_read.md.j2 +79 -0
  37. deepresearch_flow/paper/templates/default_paper.md.j2 +32 -0
  38. deepresearch_flow/paper/templates/eight_questions.md.j2 +49 -0
  39. deepresearch_flow/paper/templates/three_pass.md.j2 +28 -0
  40. deepresearch_flow/paper/utils.py +136 -0
  41. deepresearch_flow/paper/web/__init__.py +2 -0
  42. deepresearch_flow/paper/web/app.py +2307 -0
  43. deepresearch_flow/paper/web/pdfjs/LICENSE +177 -0
  44. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-EUC-H.bcmap +0 -0
  45. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-EUC-V.bcmap +0 -0
  46. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-H.bcmap +0 -0
  47. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-RKSJ-H.bcmap +0 -0
  48. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-RKSJ-V.bcmap +0 -0
  49. deepresearch_flow/paper/web/pdfjs/web/cmaps/78-V.bcmap +0 -0
  50. deepresearch_flow/paper/web/pdfjs/web/cmaps/78ms-RKSJ-H.bcmap +0 -0
  51. deepresearch_flow/paper/web/pdfjs/web/cmaps/78ms-RKSJ-V.bcmap +0 -0
  52. deepresearch_flow/paper/web/pdfjs/web/cmaps/83pv-RKSJ-H.bcmap +0 -0
  53. deepresearch_flow/paper/web/pdfjs/web/cmaps/90ms-RKSJ-H.bcmap +0 -0
  54. deepresearch_flow/paper/web/pdfjs/web/cmaps/90ms-RKSJ-V.bcmap +0 -0
  55. deepresearch_flow/paper/web/pdfjs/web/cmaps/90msp-RKSJ-H.bcmap +0 -0
  56. deepresearch_flow/paper/web/pdfjs/web/cmaps/90msp-RKSJ-V.bcmap +0 -0
  57. deepresearch_flow/paper/web/pdfjs/web/cmaps/90pv-RKSJ-H.bcmap +0 -0
  58. deepresearch_flow/paper/web/pdfjs/web/cmaps/90pv-RKSJ-V.bcmap +0 -0
  59. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-H.bcmap +0 -0
  60. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-RKSJ-H.bcmap +0 -0
  61. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-RKSJ-V.bcmap +0 -0
  62. deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-V.bcmap +0 -0
  63. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-0.bcmap +0 -0
  64. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-1.bcmap +0 -0
  65. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-2.bcmap +0 -0
  66. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-3.bcmap +0 -0
  67. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-4.bcmap +0 -0
  68. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-5.bcmap +0 -0
  69. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-6.bcmap +0 -0
  70. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-UCS2.bcmap +0 -0
  71. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-0.bcmap +0 -0
  72. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-1.bcmap +0 -0
  73. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-2.bcmap +0 -0
  74. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-3.bcmap +0 -0
  75. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-4.bcmap +0 -0
  76. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-5.bcmap +0 -0
  77. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-UCS2.bcmap +0 -0
  78. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-0.bcmap +0 -0
  79. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-1.bcmap +0 -0
  80. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-2.bcmap +0 -0
  81. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-3.bcmap +0 -0
  82. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-4.bcmap +0 -0
  83. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-5.bcmap +0 -0
  84. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-6.bcmap +0 -0
  85. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-UCS2.bcmap +0 -0
  86. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-0.bcmap +0 -0
  87. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-1.bcmap +0 -0
  88. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-2.bcmap +0 -0
  89. deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-UCS2.bcmap +0 -0
  90. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5-H.bcmap +0 -0
  91. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5-V.bcmap +0 -0
  92. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5pc-H.bcmap +0 -0
  93. deepresearch_flow/paper/web/pdfjs/web/cmaps/B5pc-V.bcmap +0 -0
  94. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS-EUC-H.bcmap +0 -0
  95. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS-EUC-V.bcmap +0 -0
  96. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS1-H.bcmap +0 -0
  97. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS1-V.bcmap +0 -0
  98. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS2-H.bcmap +0 -0
  99. deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS2-V.bcmap +3 -0
  100. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETHK-B5-H.bcmap +0 -0
  101. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETHK-B5-V.bcmap +0 -0
  102. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETen-B5-H.bcmap +0 -0
  103. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETen-B5-V.bcmap +0 -0
  104. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETenms-B5-H.bcmap +3 -0
  105. deepresearch_flow/paper/web/pdfjs/web/cmaps/ETenms-B5-V.bcmap +0 -0
  106. deepresearch_flow/paper/web/pdfjs/web/cmaps/EUC-H.bcmap +0 -0
  107. deepresearch_flow/paper/web/pdfjs/web/cmaps/EUC-V.bcmap +0 -0
  108. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-H.bcmap +0 -0
  109. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-RKSJ-H.bcmap +0 -0
  110. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-RKSJ-V.bcmap +0 -0
  111. deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-V.bcmap +0 -0
  112. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-EUC-H.bcmap +0 -0
  113. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-EUC-V.bcmap +0 -0
  114. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-H.bcmap +4 -0
  115. deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-V.bcmap +0 -0
  116. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK-EUC-H.bcmap +0 -0
  117. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK-EUC-V.bcmap +0 -0
  118. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK2K-H.bcmap +0 -0
  119. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK2K-V.bcmap +0 -0
  120. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBKp-EUC-H.bcmap +0 -0
  121. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBKp-EUC-V.bcmap +0 -0
  122. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-EUC-H.bcmap +0 -0
  123. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-EUC-V.bcmap +0 -0
  124. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-H.bcmap +0 -0
  125. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-V.bcmap +0 -0
  126. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBTpc-EUC-H.bcmap +0 -0
  127. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBTpc-EUC-V.bcmap +0 -0
  128. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBpc-EUC-H.bcmap +0 -0
  129. deepresearch_flow/paper/web/pdfjs/web/cmaps/GBpc-EUC-V.bcmap +0 -0
  130. deepresearch_flow/paper/web/pdfjs/web/cmaps/H.bcmap +0 -0
  131. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdla-B5-H.bcmap +0 -0
  132. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdla-B5-V.bcmap +0 -0
  133. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdlb-B5-H.bcmap +0 -0
  134. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdlb-B5-V.bcmap +0 -0
  135. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKgccs-B5-H.bcmap +0 -0
  136. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKgccs-B5-V.bcmap +0 -0
  137. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm314-B5-H.bcmap +0 -0
  138. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm314-B5-V.bcmap +0 -0
  139. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm471-B5-H.bcmap +0 -0
  140. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm471-B5-V.bcmap +0 -0
  141. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKscs-B5-H.bcmap +0 -0
  142. deepresearch_flow/paper/web/pdfjs/web/cmaps/HKscs-B5-V.bcmap +0 -0
  143. deepresearch_flow/paper/web/pdfjs/web/cmaps/Hankaku.bcmap +0 -0
  144. deepresearch_flow/paper/web/pdfjs/web/cmaps/Hiragana.bcmap +0 -0
  145. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-EUC-H.bcmap +0 -0
  146. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-EUC-V.bcmap +0 -0
  147. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-H.bcmap +0 -0
  148. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-Johab-H.bcmap +0 -0
  149. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-Johab-V.bcmap +0 -0
  150. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-V.bcmap +0 -0
  151. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-H.bcmap +0 -0
  152. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-HW-H.bcmap +0 -0
  153. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-HW-V.bcmap +0 -0
  154. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-V.bcmap +0 -0
  155. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCpc-EUC-H.bcmap +0 -0
  156. deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCpc-EUC-V.bcmap +0 -0
  157. deepresearch_flow/paper/web/pdfjs/web/cmaps/Katakana.bcmap +0 -0
  158. deepresearch_flow/paper/web/pdfjs/web/cmaps/LICENSE +36 -0
  159. deepresearch_flow/paper/web/pdfjs/web/cmaps/NWP-H.bcmap +0 -0
  160. deepresearch_flow/paper/web/pdfjs/web/cmaps/NWP-V.bcmap +0 -0
  161. deepresearch_flow/paper/web/pdfjs/web/cmaps/RKSJ-H.bcmap +0 -0
  162. deepresearch_flow/paper/web/pdfjs/web/cmaps/RKSJ-V.bcmap +0 -0
  163. deepresearch_flow/paper/web/pdfjs/web/cmaps/Roman.bcmap +0 -0
  164. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UCS2-H.bcmap +0 -0
  165. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UCS2-V.bcmap +0 -0
  166. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF16-H.bcmap +0 -0
  167. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF16-V.bcmap +0 -0
  168. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF32-H.bcmap +0 -0
  169. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF32-V.bcmap +0 -0
  170. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF8-H.bcmap +0 -0
  171. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF8-V.bcmap +0 -0
  172. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UCS2-H.bcmap +0 -0
  173. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UCS2-V.bcmap +0 -0
  174. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF16-H.bcmap +0 -0
  175. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF16-V.bcmap +0 -0
  176. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF32-H.bcmap +0 -0
  177. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF32-V.bcmap +0 -0
  178. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF8-H.bcmap +0 -0
  179. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF8-V.bcmap +0 -0
  180. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-H.bcmap +0 -0
  181. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-HW-H.bcmap +0 -0
  182. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-HW-V.bcmap +0 -0
  183. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-V.bcmap +0 -0
  184. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF16-H.bcmap +0 -0
  185. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF16-V.bcmap +0 -0
  186. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF32-H.bcmap +0 -0
  187. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF32-V.bcmap +0 -0
  188. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF8-H.bcmap +0 -0
  189. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF8-V.bcmap +0 -0
  190. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF16-H.bcmap +0 -0
  191. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF16-V.bcmap +0 -0
  192. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF32-H.bcmap +0 -0
  193. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF32-V.bcmap +0 -0
  194. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF8-H.bcmap +0 -0
  195. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF8-V.bcmap +0 -0
  196. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UCS2-HW-V.bcmap +0 -0
  197. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UCS2-V.bcmap +0 -0
  198. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UTF8-V.bcmap +0 -0
  199. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX0213-UTF32-H.bcmap +0 -0
  200. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX0213-UTF32-V.bcmap +0 -0
  201. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX02132004-UTF32-H.bcmap +0 -0
  202. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX02132004-UTF32-V.bcmap +0 -0
  203. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UCS2-H.bcmap +0 -0
  204. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UCS2-V.bcmap +0 -0
  205. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF16-H.bcmap +0 -0
  206. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF16-V.bcmap +0 -0
  207. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF32-H.bcmap +0 -0
  208. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF32-V.bcmap +0 -0
  209. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF8-H.bcmap +0 -0
  210. deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF8-V.bcmap +0 -0
  211. deepresearch_flow/paper/web/pdfjs/web/cmaps/V.bcmap +0 -0
  212. deepresearch_flow/paper/web/pdfjs/web/cmaps/WP-Symbol.bcmap +0 -0
  213. deepresearch_flow/paper/web/pdfjs/web/compressed.tracemonkey-pldi-09.pdf +0 -0
  214. deepresearch_flow/paper/web/pdfjs/web/debugger.css +111 -0
  215. deepresearch_flow/paper/web/pdfjs/web/debugger.js +611 -0
  216. deepresearch_flow/paper/web/pdfjs/web/images/altText_add.svg +3 -0
  217. deepresearch_flow/paper/web/pdfjs/web/images/altText_done.svg +3 -0
  218. deepresearch_flow/paper/web/pdfjs/web/images/annotation-check.svg +11 -0
  219. deepresearch_flow/paper/web/pdfjs/web/images/annotation-comment.svg +16 -0
  220. deepresearch_flow/paper/web/pdfjs/web/images/annotation-help.svg +26 -0
  221. deepresearch_flow/paper/web/pdfjs/web/images/annotation-insert.svg +10 -0
  222. deepresearch_flow/paper/web/pdfjs/web/images/annotation-key.svg +11 -0
  223. deepresearch_flow/paper/web/pdfjs/web/images/annotation-newparagraph.svg +11 -0
  224. deepresearch_flow/paper/web/pdfjs/web/images/annotation-noicon.svg +7 -0
  225. deepresearch_flow/paper/web/pdfjs/web/images/annotation-note.svg +42 -0
  226. deepresearch_flow/paper/web/pdfjs/web/images/annotation-paperclip.svg +6 -0
  227. deepresearch_flow/paper/web/pdfjs/web/images/annotation-paragraph.svg +16 -0
  228. deepresearch_flow/paper/web/pdfjs/web/images/annotation-pushpin.svg +7 -0
  229. deepresearch_flow/paper/web/pdfjs/web/images/cursor-editorFreeText.svg +3 -0
  230. deepresearch_flow/paper/web/pdfjs/web/images/cursor-editorInk.svg +4 -0
  231. deepresearch_flow/paper/web/pdfjs/web/images/findbarButton-next.svg +3 -0
  232. deepresearch_flow/paper/web/pdfjs/web/images/findbarButton-previous.svg +3 -0
  233. deepresearch_flow/paper/web/pdfjs/web/images/gv-toolbarButton-download.svg +3 -0
  234. deepresearch_flow/paper/web/pdfjs/web/images/gv-toolbarButton-openinapp.svg +11 -0
  235. deepresearch_flow/paper/web/pdfjs/web/images/loading-dark.svg +24 -0
  236. deepresearch_flow/paper/web/pdfjs/web/images/loading-icon.gif +0 -0
  237. deepresearch_flow/paper/web/pdfjs/web/images/loading.svg +1 -0
  238. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-documentProperties.svg +3 -0
  239. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-firstPage.svg +3 -0
  240. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-handTool.svg +3 -0
  241. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-lastPage.svg +3 -0
  242. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-rotateCcw.svg +3 -0
  243. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-rotateCw.svg +3 -0
  244. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollHorizontal.svg +3 -0
  245. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollPage.svg +3 -0
  246. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollVertical.svg +3 -0
  247. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollWrapped.svg +3 -0
  248. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-selectTool.svg +3 -0
  249. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadEven.svg +3 -0
  250. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadNone.svg +3 -0
  251. deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadOdd.svg +3 -0
  252. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-bookmark.svg +3 -0
  253. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-currentOutlineItem.svg +3 -0
  254. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-download.svg +4 -0
  255. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorFreeText.svg +3 -0
  256. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorInk.svg +4 -0
  257. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorStamp.svg +8 -0
  258. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-menuArrow.svg +3 -0
  259. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-openFile.svg +3 -0
  260. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-pageDown.svg +3 -0
  261. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-pageUp.svg +3 -0
  262. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-presentationMode.svg +3 -0
  263. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-print.svg +3 -0
  264. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-search.svg +3 -0
  265. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-secondaryToolbarToggle.svg +3 -0
  266. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-sidebarToggle.svg +3 -0
  267. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewAttachments.svg +3 -0
  268. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewLayers.svg +3 -0
  269. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewOutline.svg +3 -0
  270. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewThumbnail.svg +3 -0
  271. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-zoomIn.svg +3 -0
  272. deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-zoomOut.svg +3 -0
  273. deepresearch_flow/paper/web/pdfjs/web/images/treeitem-collapsed.svg +1 -0
  274. deepresearch_flow/paper/web/pdfjs/web/images/treeitem-expanded.svg +1 -0
  275. deepresearch_flow/paper/web/pdfjs/web/locale/ach/viewer.properties +203 -0
  276. deepresearch_flow/paper/web/pdfjs/web/locale/af/viewer.properties +156 -0
  277. deepresearch_flow/paper/web/pdfjs/web/locale/an/viewer.properties +222 -0
  278. deepresearch_flow/paper/web/pdfjs/web/locale/ar/viewer.properties +224 -0
  279. deepresearch_flow/paper/web/pdfjs/web/locale/ast/viewer.properties +185 -0
  280. deepresearch_flow/paper/web/pdfjs/web/locale/az/viewer.properties +222 -0
  281. deepresearch_flow/paper/web/pdfjs/web/locale/be/viewer.properties +270 -0
  282. deepresearch_flow/paper/web/pdfjs/web/locale/bg/viewer.properties +214 -0
  283. deepresearch_flow/paper/web/pdfjs/web/locale/bn/viewer.properties +218 -0
  284. deepresearch_flow/paper/web/pdfjs/web/locale/bo/viewer.properties +217 -0
  285. deepresearch_flow/paper/web/pdfjs/web/locale/br/viewer.properties +224 -0
  286. deepresearch_flow/paper/web/pdfjs/web/locale/brx/viewer.properties +184 -0
  287. deepresearch_flow/paper/web/pdfjs/web/locale/bs/viewer.properties +173 -0
  288. deepresearch_flow/paper/web/pdfjs/web/locale/ca/viewer.properties +256 -0
  289. deepresearch_flow/paper/web/pdfjs/web/locale/cak/viewer.properties +253 -0
  290. deepresearch_flow/paper/web/pdfjs/web/locale/ckb/viewer.properties +213 -0
  291. deepresearch_flow/paper/web/pdfjs/web/locale/cs/viewer.properties +284 -0
  292. deepresearch_flow/paper/web/pdfjs/web/locale/cy/viewer.properties +270 -0
  293. deepresearch_flow/paper/web/pdfjs/web/locale/da/viewer.properties +270 -0
  294. deepresearch_flow/paper/web/pdfjs/web/locale/de/viewer.properties +270 -0
  295. deepresearch_flow/paper/web/pdfjs/web/locale/dsb/viewer.properties +284 -0
  296. deepresearch_flow/paper/web/pdfjs/web/locale/el/viewer.properties +270 -0
  297. deepresearch_flow/paper/web/pdfjs/web/locale/en-CA/viewer.properties +270 -0
  298. deepresearch_flow/paper/web/pdfjs/web/locale/en-GB/viewer.properties +284 -0
  299. deepresearch_flow/paper/web/pdfjs/web/locale/en-US/viewer.properties +282 -0
  300. deepresearch_flow/paper/web/pdfjs/web/locale/eo/viewer.properties +270 -0
  301. deepresearch_flow/paper/web/pdfjs/web/locale/es-AR/viewer.properties +284 -0
  302. deepresearch_flow/paper/web/pdfjs/web/locale/es-CL/viewer.properties +284 -0
  303. deepresearch_flow/paper/web/pdfjs/web/locale/es-ES/viewer.properties +270 -0
  304. deepresearch_flow/paper/web/pdfjs/web/locale/es-MX/viewer.properties +257 -0
  305. deepresearch_flow/paper/web/pdfjs/web/locale/et/viewer.properties +229 -0
  306. deepresearch_flow/paper/web/pdfjs/web/locale/eu/viewer.properties +284 -0
  307. deepresearch_flow/paper/web/pdfjs/web/locale/fa/viewer.properties +221 -0
  308. deepresearch_flow/paper/web/pdfjs/web/locale/ff/viewer.properties +214 -0
  309. deepresearch_flow/paper/web/pdfjs/web/locale/fi/viewer.properties +270 -0
  310. deepresearch_flow/paper/web/pdfjs/web/locale/fr/viewer.properties +270 -0
  311. deepresearch_flow/paper/web/pdfjs/web/locale/fur/viewer.properties +270 -0
  312. deepresearch_flow/paper/web/pdfjs/web/locale/fy-NL/viewer.properties +270 -0
  313. deepresearch_flow/paper/web/pdfjs/web/locale/ga-IE/viewer.properties +181 -0
  314. deepresearch_flow/paper/web/pdfjs/web/locale/gd/viewer.properties +257 -0
  315. deepresearch_flow/paper/web/pdfjs/web/locale/gl/viewer.properties +267 -0
  316. deepresearch_flow/paper/web/pdfjs/web/locale/gn/viewer.properties +278 -0
  317. deepresearch_flow/paper/web/pdfjs/web/locale/gu-IN/viewer.properties +214 -0
  318. deepresearch_flow/paper/web/pdfjs/web/locale/he/viewer.properties +283 -0
  319. deepresearch_flow/paper/web/pdfjs/web/locale/hi-IN/viewer.properties +227 -0
  320. deepresearch_flow/paper/web/pdfjs/web/locale/hr/viewer.properties +243 -0
  321. deepresearch_flow/paper/web/pdfjs/web/locale/hsb/viewer.properties +284 -0
  322. deepresearch_flow/paper/web/pdfjs/web/locale/hu/viewer.properties +284 -0
  323. deepresearch_flow/paper/web/pdfjs/web/locale/hy-AM/viewer.properties +232 -0
  324. deepresearch_flow/paper/web/pdfjs/web/locale/hye/viewer.properties +229 -0
  325. deepresearch_flow/paper/web/pdfjs/web/locale/ia/viewer.properties +284 -0
  326. deepresearch_flow/paper/web/pdfjs/web/locale/id/viewer.properties +253 -0
  327. deepresearch_flow/paper/web/pdfjs/web/locale/is/viewer.properties +284 -0
  328. deepresearch_flow/paper/web/pdfjs/web/locale/it/viewer.properties +284 -0
  329. deepresearch_flow/paper/web/pdfjs/web/locale/ja/viewer.properties +270 -0
  330. deepresearch_flow/paper/web/pdfjs/web/locale/ka/viewer.properties +284 -0
  331. deepresearch_flow/paper/web/pdfjs/web/locale/kab/viewer.properties +264 -0
  332. deepresearch_flow/paper/web/pdfjs/web/locale/kk/viewer.properties +284 -0
  333. deepresearch_flow/paper/web/pdfjs/web/locale/km/viewer.properties +189 -0
  334. deepresearch_flow/paper/web/pdfjs/web/locale/kn/viewer.properties +166 -0
  335. deepresearch_flow/paper/web/pdfjs/web/locale/ko/viewer.properties +284 -0
  336. deepresearch_flow/paper/web/pdfjs/web/locale/lij/viewer.properties +214 -0
  337. deepresearch_flow/paper/web/pdfjs/web/locale/lo/viewer.properties +257 -0
  338. deepresearch_flow/paper/web/pdfjs/web/locale/locale.properties +333 -0
  339. deepresearch_flow/paper/web/pdfjs/web/locale/lt/viewer.properties +229 -0
  340. deepresearch_flow/paper/web/pdfjs/web/locale/ltg/viewer.properties +192 -0
  341. deepresearch_flow/paper/web/pdfjs/web/locale/lv/viewer.properties +214 -0
  342. deepresearch_flow/paper/web/pdfjs/web/locale/meh/viewer.properties +106 -0
  343. deepresearch_flow/paper/web/pdfjs/web/locale/mk/viewer.properties +211 -0
  344. deepresearch_flow/paper/web/pdfjs/web/locale/mr/viewer.properties +210 -0
  345. deepresearch_flow/paper/web/pdfjs/web/locale/ms/viewer.properties +214 -0
  346. deepresearch_flow/paper/web/pdfjs/web/locale/my/viewer.properties +170 -0
  347. deepresearch_flow/paper/web/pdfjs/web/locale/nb-NO/viewer.properties +284 -0
  348. deepresearch_flow/paper/web/pdfjs/web/locale/ne-NP/viewer.properties +197 -0
  349. deepresearch_flow/paper/web/pdfjs/web/locale/nl/viewer.properties +274 -0
  350. deepresearch_flow/paper/web/pdfjs/web/locale/nn-NO/viewer.properties +270 -0
  351. deepresearch_flow/paper/web/pdfjs/web/locale/oc/viewer.properties +278 -0
  352. deepresearch_flow/paper/web/pdfjs/web/locale/pa-IN/viewer.properties +270 -0
  353. deepresearch_flow/paper/web/pdfjs/web/locale/pl/viewer.properties +270 -0
  354. deepresearch_flow/paper/web/pdfjs/web/locale/pt-BR/viewer.properties +270 -0
  355. deepresearch_flow/paper/web/pdfjs/web/locale/pt-PT/viewer.properties +270 -0
  356. deepresearch_flow/paper/web/pdfjs/web/locale/rm/viewer.properties +270 -0
  357. deepresearch_flow/paper/web/pdfjs/web/locale/ro/viewer.properties +220 -0
  358. deepresearch_flow/paper/web/pdfjs/web/locale/ru/viewer.properties +270 -0
  359. deepresearch_flow/paper/web/pdfjs/web/locale/sat/viewer.properties +270 -0
  360. deepresearch_flow/paper/web/pdfjs/web/locale/sc/viewer.properties +258 -0
  361. deepresearch_flow/paper/web/pdfjs/web/locale/scn/viewer.properties +101 -0
  362. deepresearch_flow/paper/web/pdfjs/web/locale/sco/viewer.properties +226 -0
  363. deepresearch_flow/paper/web/pdfjs/web/locale/si/viewer.properties +228 -0
  364. deepresearch_flow/paper/web/pdfjs/web/locale/sk/viewer.properties +270 -0
  365. deepresearch_flow/paper/web/pdfjs/web/locale/skr/viewer.properties +264 -0
  366. deepresearch_flow/paper/web/pdfjs/web/locale/sl/viewer.properties +284 -0
  367. deepresearch_flow/paper/web/pdfjs/web/locale/son/viewer.properties +152 -0
  368. deepresearch_flow/paper/web/pdfjs/web/locale/sq/viewer.properties +247 -0
  369. deepresearch_flow/paper/web/pdfjs/web/locale/sr/viewer.properties +259 -0
  370. deepresearch_flow/paper/web/pdfjs/web/locale/sv-SE/viewer.properties +284 -0
  371. deepresearch_flow/paper/web/pdfjs/web/locale/szl/viewer.properties +224 -0
  372. deepresearch_flow/paper/web/pdfjs/web/locale/ta/viewer.properties +173 -0
  373. deepresearch_flow/paper/web/pdfjs/web/locale/te/viewer.properties +216 -0
  374. deepresearch_flow/paper/web/pdfjs/web/locale/tg/viewer.properties +281 -0
  375. deepresearch_flow/paper/web/pdfjs/web/locale/th/viewer.properties +270 -0
  376. deepresearch_flow/paper/web/pdfjs/web/locale/tl/viewer.properties +222 -0
  377. deepresearch_flow/paper/web/pdfjs/web/locale/tr/viewer.properties +283 -0
  378. deepresearch_flow/paper/web/pdfjs/web/locale/trs/viewer.properties +184 -0
  379. deepresearch_flow/paper/web/pdfjs/web/locale/uk/viewer.properties +284 -0
  380. deepresearch_flow/paper/web/pdfjs/web/locale/ur/viewer.properties +218 -0
  381. deepresearch_flow/paper/web/pdfjs/web/locale/uz/viewer.properties +142 -0
  382. deepresearch_flow/paper/web/pdfjs/web/locale/vi/viewer.properties +270 -0
  383. deepresearch_flow/paper/web/pdfjs/web/locale/wo/viewer.properties +104 -0
  384. deepresearch_flow/paper/web/pdfjs/web/locale/xh/viewer.properties +156 -0
  385. deepresearch_flow/paper/web/pdfjs/web/locale/zh-CN/viewer.properties +284 -0
  386. deepresearch_flow/paper/web/pdfjs/web/locale/zh-TW/viewer.properties +281 -0
  387. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitDingbats.pfb +0 -0
  388. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixed.pfb +0 -0
  389. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedBold.pfb +0 -0
  390. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedBoldItalic.pfb +0 -0
  391. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedItalic.pfb +0 -0
  392. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerif.pfb +0 -0
  393. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifBold.pfb +0 -0
  394. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifBoldItalic.pfb +0 -0
  395. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifItalic.pfb +0 -0
  396. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSymbol.pfb +0 -0
  397. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LICENSE_FOXIT +27 -0
  398. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LICENSE_LIBERATION +102 -0
  399. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Bold.ttf +0 -0
  400. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-BoldItalic.ttf +0 -0
  401. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Italic.ttf +0 -0
  402. deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Regular.ttf +0 -0
  403. deepresearch_flow/paper/web/pdfjs/web/viewer.css +3528 -0
  404. deepresearch_flow/paper/web/pdfjs/web/viewer.html +486 -0
  405. deepresearch_flow/paper/web/pdfjs/web/viewer.js +14099 -0
  406. deepresearch_flow/paper/web/pdfjs/web/viewer.js.map +1 -0
  407. deepresearch_flow/paper/web/query.py +90 -0
  408. deepresearch_flow/recognize/__init__.py +1 -0
  409. deepresearch_flow/recognize/cli.py +469 -0
  410. deepresearch_flow/recognize/markdown.py +277 -0
  411. deepresearch_flow/recognize/organize.py +95 -0
  412. deepresearch_flow-0.1.1.dist-info/METADATA +416 -0
  413. deepresearch_flow-0.1.1.dist-info/RECORD +417 -0
  414. deepresearch_flow-0.1.1.dist-info/WHEEL +5 -0
  415. deepresearch_flow-0.1.1.dist-info/entry_points.txt +2 -0
  416. deepresearch_flow-0.1.1.dist-info/licenses/LICENSE +21 -0
  417. deepresearch_flow-0.1.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,277 @@
1
+ """Markdown image helpers for recognize commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import base64
7
+ import hashlib
8
+ import logging
9
+ import mimetypes
10
+ import re
11
+ from pathlib import Path
12
+ from typing import Awaitable, Callable, Optional
13
+ from urllib.parse import urlparse, unquote
14
+
15
+ import httpx
16
+
17
+
18
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Browser-like User-Agent string. It is not referenced by the functions in this
# module; presumably callers use it when building the HTTP client — TODO confirm.
DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36"
)
# Timeout in seconds. Also unused inside this module; presumably applied by the
# caller that creates the httpx client — TODO confirm.
HTTP_TIMEOUT_SECONDS = 60.0

# Image suffixes that base_name_from_alt() strips from alt text before the alt
# text is reused as a file name stem.
ALLOWED_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".webp", ".gif", ".svg"}
# Replacements applied by extension_from_mime() to the extension guessed by
# mimetypes.guess_extension().
EXTENSION_OVERRIDES = {
    ".jpe": ".jpg",
}

# Inline Markdown image: group 1 is the alt text, group 2 is everything between
# the parentheses (link target plus any optional title).
IMAGE_PATTERN = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
# Data URL: group 1 is the MIME type, group 2 the optional ";base64" marker,
# group 3 the payload (DOTALL so the payload may span lines).
DATA_URL_PATTERN = re.compile(r"^data:([^;,]+)(;base64)?,(.*)$", re.DOTALL)
34
+
35
+
36
class NameRegistry:
    """Allocate file names that do not collide inside one output directory.

    The registry starts from the names of the regular files already present in
    ``output_dir`` (if it exists) and remembers every name it hands out.
    """

    def __init__(self, output_dir: Path) -> None:
        self.output_dir = output_dir
        # Seed with existing file names; a missing directory means nothing is taken.
        self.used = (
            {entry.name for entry in output_dir.iterdir() if entry.is_file()}
            if output_dir.exists()
            else set()
        )
        # Serialises reservations made through reserve_async().
        self.lock = asyncio.Lock()

    def reserve(self, base: str, ext: str) -> str:
        """Return an unused ``<base><ext>`` name and mark it as taken.

        ``base`` is sanitised (falling back to ``"file"`` when nothing is
        left) and ``ext`` gets a leading dot if it lacks one. When the name is
        already reserved or exists on disk, ``_1``, ``_2``, ... is appended to
        the stem until a free name is found.
        """
        stem = sanitize_filename(base) or "file"
        suffix = ext if ext.startswith(".") else f".{ext}"

        name = f"{stem}{suffix}"
        attempt = 0
        while name in self.used or (self.output_dir / name).exists():
            attempt += 1
            name = f"{stem}_{attempt}{suffix}"

        self.used.add(name)
        return name

    async def reserve_async(self, base: str, ext: str) -> str:
        """Run :meth:`reserve` while holding the registry lock."""
        async with self.lock:
            reserved = self.reserve(base, ext)
        return reserved
59
+
60
+
61
def sanitize_filename(value: str) -> str:
    """Turn *value* into a conservative file-name fragment.

    Surrounding whitespace is removed, every run of characters other than word
    characters, ``.`` and ``-`` becomes a single ``_``, and leading/trailing
    ``.``, ``_`` and ``-`` are trimmed. The result may be empty.
    """
    trimmed = value.strip()
    collapsed = re.sub(r"[^\w.\-]+", "_", trimmed)
    return collapsed.strip("._-")
64
+
65
+
66
def read_text(path: Path) -> str:
    """Read *path* as UTF-8 text, retrying as Latin-1 if decoding fails."""
    try:
        text = path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this second read cannot
        # fail on decoding.
        text = path.read_text(encoding="latin-1")
    return text
71
+
72
+
73
def split_link_target(raw_link: str) -> tuple[str, str, str, str]:
    """Split the text inside ``(...)`` of a Markdown image into its parts.

    Returns ``(target, suffix, prefix, postfix)``:

    * ``<target> rest`` -> ``("target", " rest", "<", ">")``
    * ``target rest``   -> ``("target", " rest", "", "")``
    * blank input       -> ``("", "", "", "")``

    ``suffix`` is whatever follows the target (for example a quoted title),
    kept verbatim so the link can be reassembled.
    """
    stripped = raw_link.strip()

    # Angle-bracket form; an unterminated "<" falls through to the plain form.
    if stripped.startswith("<"):
        close = stripped.find(">")
        if close != -1:
            return stripped[1:close], stripped[close + 1 :], "<", ">"

    tokens = stripped.split()
    if not tokens:
        return "", "", "", ""

    first = tokens[0]
    return first, stripped[len(first) :], "", ""
85
+
86
+
87
def is_data_url(target: str) -> bool:
    """Return True when *target* begins with the ``data:`` scheme prefix."""
    prefix = "data:"
    return target[: len(prefix)] == prefix
89
+
90
+
91
def is_http_url(target: str) -> bool:
    """Return True when *target* parses as an ``http`` or ``https`` URL.

    Link targets come straight from Markdown text, so they may be malformed.
    ``urlparse`` raises ``ValueError`` for some of those (for example an
    unbalanced ``[`` in the authority, as in ``http://[bad``); such targets are
    reported as "not an HTTP URL" instead of aborting the whole rewrite.
    """
    try:
        scheme = urlparse(target).scheme
    except ValueError:
        return False
    return scheme in {"http", "https"}
94
+
95
+
96
def resolve_local_path(md_path: Path, target: str) -> Path:
    """Map a Markdown link *target* to a filesystem path.

    The target is percent-decoded first. An absolute path is returned as-is;
    a relative one is joined to the directory containing *md_path* and
    resolved.
    """
    candidate = Path(unquote(target))
    if not candidate.is_absolute():
        candidate = (md_path.parent / candidate).resolve()
    return candidate
101
+
102
+
103
def extension_from_mime(mime: str) -> Optional[str]:
    """Guess a file extension (with leading dot) for *mime*.

    Uses ``mimetypes.guess_extension`` in non-strict mode, then substitutes the
    entry from ``EXTENSION_OVERRIDES`` when the guess has one. Returns None
    when no extension is known.
    """
    guessed = mimetypes.guess_extension(mime, strict=False)
    # A None guess is not a key of the overrides table, so it passes through.
    return EXTENSION_OVERRIDES.get(guessed, guessed)
108
+
109
+
110
def mime_from_path(path: Path) -> Optional[str]:
    """Guess the MIME type of *path* from its file name.

    ``mimetypes.guess_type`` is consulted first; if it has no answer, a small
    built-in table of common image suffixes (matched case-insensitively) is
    used. Returns None when neither source knows the type.
    """
    guessed, _encoding = mimetypes.guess_type(path.name)
    if guessed:
        return guessed

    fallback_by_suffix = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".svg": "image/svg+xml",
    }
    return fallback_by_suffix.get(path.suffix.lower())
125
+
126
+
127
def parse_data_url(target: str) -> Optional[tuple[str, bytes]]:
    """Decode a base64 ``data:image/...`` URL into ``(mime, bytes)``.

    Returns None when *target* does not match ``DATA_URL_PATTERN``, when the
    MIME type does not start with ``image/``, when the ``;base64`` marker is
    absent, or when the payload cannot be decoded (a warning is logged in
    that last case).
    """
    parsed = DATA_URL_PATTERN.match(target)
    if parsed is None:
        return None

    mime_type = parsed.group(1) or ""
    is_image = mime_type.startswith("image/")
    is_base64 = parsed.group(2) == ";base64"
    if not (is_image and is_base64):
        return None

    encoded = parsed.group(3) or ""
    try:
        decoded = base64.b64decode(encoded)
    except Exception as exc:  # pragma: no cover - defensive
        logger.warning("Failed to decode base64 image: %s", exc)
        return None
    return mime_type, decoded
142
+
143
+
144
def data_url_from_bytes(mime: str, data: bytes) -> str:
    """Build a ``data:<mime>;base64,<payload>`` URL from raw bytes."""
    payload = str(base64.b64encode(data), "ascii")
    return f"data:{mime};base64,{payload}"
147
+
148
+
149
def base_name_from_alt(alt_text: str) -> str:
    """Derive a file-name stem from an image's alt text.

    The alt text is sanitised; if the result ends in one of
    ``ALLOWED_IMAGE_EXTS`` that suffix is dropped. Returns an empty string
    when the alt text is empty or sanitises to nothing.
    """
    cleaned = sanitize_filename(alt_text) if alt_text else ""
    if not cleaned:
        return ""

    as_path = Path(cleaned)
    has_image_suffix = as_path.suffix.lower() in ALLOWED_IMAGE_EXTS
    return as_path.stem if has_image_suffix else cleaned
159
+
160
+
161
def hash_name_from_bytes(data: bytes) -> str:
    """Return the first 12 hex characters of the SHA-256 digest of *data*."""
    digest = hashlib.sha256(data)
    hex_digest = digest.hexdigest()
    return hex_digest[:12]
163
+
164
+
165
async def rewrite_markdown_images(
    content: str,
    replacer: Callable[[str, str], Awaitable[Optional[str]]],
) -> str:
    """Rewrite the link target of every inline image in *content*.

    For each ``IMAGE_PATTERN`` match, ``replacer(alt_text, target)`` is
    awaited in document order. A result of None keeps the original image
    markup unchanged; any other result replaces the target while the angle
    brackets and trailing text (such as a title) found by
    ``split_link_target`` are preserved. Text between images is copied through
    verbatim.
    """
    pieces: list[str] = []
    cursor = 0
    for found in IMAGE_PATTERN.finditer(content):
        alt_text, raw_link = found.group(1, 2)
        target, suffix, prefix, postfix = split_link_target(raw_link)
        replacement = await replacer(alt_text, target)
        if replacement is None:
            rendered = found.group(0)
        else:
            rendered = f"![{alt_text}]({prefix}{replacement}{postfix}{suffix})"
        # Emit the untouched text before this image, then the image itself.
        pieces.append(content[cursor : found.start()])
        pieces.append(rendered)
        cursor = found.end()
    pieces.append(content[cursor:])
    return "".join(pieces)
185
+
186
+
187
def count_markdown_images(content: str) -> dict[str, int]:
    """Count the inline images in *content* by kind of link target.

    Returns a dict with keys ``total``, ``data``, ``http`` and ``local``.
    Images whose target is empty are included in ``total`` only.
    """
    tally = dict.fromkeys(("total", "data", "http", "local"), 0)
    for found in IMAGE_PATTERN.finditer(content):
        tally["total"] += 1
        target = split_link_target(found.group(2))[0]
        if not target:
            continue
        if is_data_url(target):
            kind = "data"
        elif is_http_url(target):
            kind = "http"
        else:
            kind = "local"
        tally[kind] += 1
    return tally
202
+
203
+
204
async def embed_markdown_images(
    content: str,
    md_path: Path,
    enable_http: bool,
    http_client: Optional[httpx.AsyncClient],
) -> str:
    """Inline the images referenced by *content* as base64 data URLs.

    Args:
        content: Markdown text to rewrite.
        md_path: Path of the Markdown file; relative image targets are
            resolved against its parent directory.
        enable_http: Whether ``http``/``https`` images may be downloaded.
        http_client: Client used for downloads; remote images are left
            untouched when this is None or *enable_http* is false.

    Returns:
        The Markdown text with every embeddable image target replaced by a
        data URL. Images that are already data URLs, cannot be fetched or
        read, or are not recognised as images keep their original markup.
    """

    async def replacer(alt_text: str, target: str) -> Optional[str]:
        # Nothing to do for empty targets or images that are already inline.
        if not target or is_data_url(target):
            return None

        if is_http_url(target):
            if not enable_http or http_client is None:
                return None
            try:
                response = await http_client.get(target)
            except Exception as exc:
                logger.warning("Failed to fetch %s: %s", target, exc)
                return None
            # Only a 2xx response carries the image itself. In particular an
            # unfollowed 3xx redirect has a body that is not the image, yet
            # the path-based MIME fallback below would accept it for URLs
            # ending in an image suffix.
            if not 200 <= response.status_code < 300:
                logger.warning("Failed to fetch %s: HTTP %d", target, response.status_code)
                return None
            content_type = response.headers.get("Content-Type", "").split(";", 1)[0].strip()
            if not content_type.startswith("image/"):
                # Fall back to the URL path when the header is missing or generic.
                guessed = mime_from_path(Path(urlparse(target).path))
                if not guessed or not guessed.startswith("image/"):
                    logger.warning(
                        "Skipping non-image URL %s (Content-Type %s)", target, content_type
                    )
                    return None
                content_type = guessed
            return data_url_from_bytes(content_type, response.content)

        local_path = resolve_local_path(md_path, target)
        if not local_path.exists() or not local_path.is_file():
            logger.warning("Image not found: %s", local_path)
            return None
        mime = mime_from_path(local_path)
        if not mime or not mime.startswith("image/"):
            logger.warning("Unsupported image type: %s", local_path)
            return None
        # Read off the event loop thread so large files do not block it.
        data = await asyncio.to_thread(local_path.read_bytes)
        return data_url_from_bytes(mime, data)

    return await rewrite_markdown_images(content, replacer)
249
+
250
+
251
async def unpack_markdown_images(
    content: str,
    images_dir: Path,
    name_registry: NameRegistry,
) -> str:
    """Extract base64 data-URL images from *content* into files.

    ``images_dir`` is created if needed. Each decodable ``data:image/...``
    target is written to a file there, named from the image's alt text or,
    failing that, from a hash of its bytes, with uniqueness delegated to
    *name_registry*. The link is rewritten to ``images/<filename>``. All other
    images keep their original markup.
    """
    images_dir.mkdir(parents=True, exist_ok=True)

    async def replacer(alt_text: str, target: str) -> Optional[str]:
        decoded = parse_data_url(target) if is_data_url(target) else None
        if decoded is None:
            return None

        mime, payload = decoded
        extension = extension_from_mime(mime)
        if not extension:
            logger.warning("Unsupported MIME type: %s", mime)
            return None

        # Prefer a readable name from the alt text; otherwise use a content hash.
        stem = base_name_from_alt(alt_text) or hash_name_from_bytes(payload)
        filename = await name_registry.reserve_async(stem, extension)
        destination = images_dir / filename
        await asyncio.to_thread(destination.write_bytes, payload)
        return f"images/{filename}"

    return await rewrite_markdown_images(content, replacer)
@@ -0,0 +1,95 @@
1
+ """OCR output organizers for recognize commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import logging
7
+ import shutil
8
+ from pathlib import Path
9
+ from typing import Iterable
10
+
11
+ from deepresearch_flow.recognize.markdown import (
12
+ NameRegistry,
13
+ embed_markdown_images,
14
+ read_text,
15
+ rewrite_markdown_images,
16
+ resolve_local_path,
17
+ is_data_url,
18
+ is_http_url,
19
+ )
20
+
21
+
22
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
23
+
24
+
25
def discover_mineru_dirs(inputs: Iterable[str], recursive: bool) -> list[Path]:
    """Find layout directories that contain ``full.md`` and an ``images/`` folder.

    Args:
        inputs: Paths to inspect. Each must be either a ``full.md`` file or a
            directory.
        recursive: For directory inputs, search all subdirectories for
            ``full.md`` instead of only the directory itself.

    Returns:
        The sorted, de-duplicated, resolved directories that qualify.
        Directories with ``full.md`` but no ``images/`` folder are skipped
        with a single warning each per input.

    Raises:
        FileNotFoundError: If an input file is not named ``full.md``, or an
            input path is neither an existing file nor a directory.
    """
    results: set[Path] = set()

    def collect(layout_dir: Path) -> None:
        # A layout directory is only usable when its images/ folder exists.
        if (layout_dir / "images").is_dir():
            results.add(layout_dir)
        else:
            logger.warning("Skipping %s (missing images/)", layout_dir)

    for raw in inputs:
        path = Path(raw)
        if path.is_file():
            if path.name != "full.md":
                raise FileNotFoundError(f"Expected full.md file but got: {path}")
            collect(path.parent.resolve())
        elif path.is_dir():
            # The glob already yields <path>/full.md when it exists, so no
            # separate check of the directory itself is needed; doing both
            # would log the "missing images/" warning twice.
            matches = path.rglob("full.md") if recursive else path.glob("full.md")
            for full_path in matches:
                collect(full_path.parent.resolve())
        else:
            raise FileNotFoundError(f"Input path not found: {path}")
    return sorted(results)
56
+
57
+
58
async def organize_mineru_dir(
    layout_dir: Path,
    output_simple: Path | None,
    output_base64: Path | None,
    output_filename: str,
    image_registry: NameRegistry | None,
) -> None:
    """Write organised copies of ``layout_dir/full.md``.

    Args:
        layout_dir: Directory containing ``full.md`` and its images.
        output_simple: When given together with *image_registry*, local images
            are copied into ``output_simple/images`` under names reserved from
            the registry and the Markdown is written with links of the form
            ``images/<name>``.
        output_base64: When given, the Markdown is written with local images
            embedded as data URLs (remote images are not fetched).
        output_filename: File name used for the Markdown in each output
            directory.
        image_registry: Name allocator for copied images; the simple output is
            skipped when this is None.
    """
    md_path = layout_dir / "full.md"
    content = await asyncio.to_thread(read_text, md_path)

    if output_simple is not None and image_registry is not None:
        images_dir = output_simple / "images"
        images_dir.mkdir(parents=True, exist_ok=True)
        # Source path -> reserved file name, so an image referenced several
        # times in this document is copied only once.
        image_map: dict[Path, str] = {}

        async def replace_simple(_: str, target: str) -> str | None:
            # Only local file references are copied; everything else is kept.
            if not target or is_data_url(target) or is_http_url(target):
                return None
            source_path = resolve_local_path(md_path, target)
            if not source_path.exists() or not source_path.is_file():
                logger.warning("Image not found: %s", source_path)
                return None
            if source_path in image_map:
                return f"images/{image_map[source_path]}"
            filename = await image_registry.reserve_async(source_path.stem, source_path.suffix)
            dest_path = images_dir / filename
            await asyncio.to_thread(shutil.copy2, source_path, dest_path)
            image_map[source_path] = filename
            return f"images/{filename}"

        updated = await rewrite_markdown_images(content, replace_simple)
        output_path = output_simple / output_filename
        await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")

    if output_base64 is not None:
        # Mirror the simple branch, which creates its own output directory:
        # without this the write below fails when the directory is missing.
        output_base64.mkdir(parents=True, exist_ok=True)
        updated = await embed_markdown_images(content, md_path, False, None)
        output_path = output_base64 / output_filename
        await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")