deepresearch-flow 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/__init__.py +5 -0
- deepresearch_flow/cli.py +23 -0
- deepresearch_flow/paper/__init__.py +1 -0
- deepresearch_flow/paper/cli.py +286 -0
- deepresearch_flow/paper/config.py +249 -0
- deepresearch_flow/paper/db.py +768 -0
- deepresearch_flow/paper/extract.py +870 -0
- deepresearch_flow/paper/llm.py +115 -0
- deepresearch_flow/paper/prompt_templates/__init__.py +1 -0
- deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +6 -0
- deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +82 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +6 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +28 -0
- deepresearch_flow/paper/prompt_templates/simple_system.j2 +6 -0
- deepresearch_flow/paper/prompt_templates/simple_user.j2 +24 -0
- deepresearch_flow/paper/prompt_templates/three_pass_system.j2 +6 -0
- deepresearch_flow/paper/prompt_templates/three_pass_user.j2 +44 -0
- deepresearch_flow/paper/prompts.py +11 -0
- deepresearch_flow/paper/providers/__init__.py +1 -0
- deepresearch_flow/paper/providers/azure_openai.py +66 -0
- deepresearch_flow/paper/providers/base.py +19 -0
- deepresearch_flow/paper/providers/claude.py +71 -0
- deepresearch_flow/paper/providers/dashscope.py +58 -0
- deepresearch_flow/paper/providers/gemini.py +116 -0
- deepresearch_flow/paper/providers/ollama.py +46 -0
- deepresearch_flow/paper/providers/openai_compatible.py +60 -0
- deepresearch_flow/paper/render.py +64 -0
- deepresearch_flow/paper/schema.py +58 -0
- deepresearch_flow/paper/schemas/__init__.py +1 -0
- deepresearch_flow/paper/schemas/deep_read_schema.json +46 -0
- deepresearch_flow/paper/schemas/default_paper_schema.json +47 -0
- deepresearch_flow/paper/schemas/eight_questions_schema.json +34 -0
- deepresearch_flow/paper/schemas/three_pass_schema.json +24 -0
- deepresearch_flow/paper/template_registry.py +189 -0
- deepresearch_flow/paper/templates/__init__.py +1 -0
- deepresearch_flow/paper/templates/deep_read.md.j2 +79 -0
- deepresearch_flow/paper/templates/default_paper.md.j2 +32 -0
- deepresearch_flow/paper/templates/eight_questions.md.j2 +49 -0
- deepresearch_flow/paper/templates/three_pass.md.j2 +28 -0
- deepresearch_flow/paper/utils.py +136 -0
- deepresearch_flow/paper/web/__init__.py +2 -0
- deepresearch_flow/paper/web/app.py +2307 -0
- deepresearch_flow/paper/web/pdfjs/LICENSE +177 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78ms-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78ms-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/83pv-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/90ms-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/90ms-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/90msp-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/90msp-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/90pv-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/90pv-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-0.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-1.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-3.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-4.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-5.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-6.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-UCS2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-0.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-1.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-3.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-4.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-5.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-UCS2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-0.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-1.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-3.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-4.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-5.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-6.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-UCS2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-0.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-1.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-UCS2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/B5pc-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/B5pc-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS1-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS1-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS2-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS2-V.bcmap +3 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/ETHK-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/ETHK-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/ETen-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/ETen-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/ETenms-B5-H.bcmap +3 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/ETenms-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-H.bcmap +4 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK2K-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK2K-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBKp-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBKp-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBTpc-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBTpc-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBpc-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBpc-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdla-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdla-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdlb-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdlb-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKgccs-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKgccs-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm314-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm314-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm471-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm471-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKscs-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKscs-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Hankaku.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Hiragana.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-Johab-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-Johab-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-HW-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-HW-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCpc-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCpc-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Katakana.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/LICENSE +36 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/NWP-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/NWP-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Roman.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UCS2-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UCS2-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF16-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF16-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF8-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF8-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UCS2-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UCS2-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF16-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF16-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF8-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF8-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-HW-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-HW-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF16-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF16-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF8-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF8-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF16-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF16-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF8-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF8-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UCS2-HW-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UCS2-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UTF8-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX0213-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX0213-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX02132004-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX02132004-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UCS2-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UCS2-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF16-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF16-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF8-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF8-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/WP-Symbol.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/compressed.tracemonkey-pldi-09.pdf +0 -0
- deepresearch_flow/paper/web/pdfjs/web/debugger.css +111 -0
- deepresearch_flow/paper/web/pdfjs/web/debugger.js +611 -0
- deepresearch_flow/paper/web/pdfjs/web/images/altText_add.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/altText_done.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-check.svg +11 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-comment.svg +16 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-help.svg +26 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-insert.svg +10 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-key.svg +11 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-newparagraph.svg +11 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-noicon.svg +7 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-note.svg +42 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-paperclip.svg +6 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-paragraph.svg +16 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-pushpin.svg +7 -0
- deepresearch_flow/paper/web/pdfjs/web/images/cursor-editorFreeText.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/cursor-editorInk.svg +4 -0
- deepresearch_flow/paper/web/pdfjs/web/images/findbarButton-next.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/findbarButton-previous.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/gv-toolbarButton-download.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/gv-toolbarButton-openinapp.svg +11 -0
- deepresearch_flow/paper/web/pdfjs/web/images/loading-dark.svg +24 -0
- deepresearch_flow/paper/web/pdfjs/web/images/loading-icon.gif +0 -0
- deepresearch_flow/paper/web/pdfjs/web/images/loading.svg +1 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-documentProperties.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-firstPage.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-handTool.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-lastPage.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-rotateCcw.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-rotateCw.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollHorizontal.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollPage.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollVertical.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollWrapped.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-selectTool.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadEven.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadNone.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadOdd.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-bookmark.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-currentOutlineItem.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-download.svg +4 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorFreeText.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorInk.svg +4 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorStamp.svg +8 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-menuArrow.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-openFile.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-pageDown.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-pageUp.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-presentationMode.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-print.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-search.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-secondaryToolbarToggle.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-sidebarToggle.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewAttachments.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewLayers.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewOutline.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewThumbnail.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-zoomIn.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-zoomOut.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/treeitem-collapsed.svg +1 -0
- deepresearch_flow/paper/web/pdfjs/web/images/treeitem-expanded.svg +1 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ach/viewer.properties +203 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/af/viewer.properties +156 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/an/viewer.properties +222 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ar/viewer.properties +224 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ast/viewer.properties +185 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/az/viewer.properties +222 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/be/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/bg/viewer.properties +214 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/bn/viewer.properties +218 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/bo/viewer.properties +217 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/br/viewer.properties +224 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/brx/viewer.properties +184 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/bs/viewer.properties +173 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ca/viewer.properties +256 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/cak/viewer.properties +253 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ckb/viewer.properties +213 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/cs/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/cy/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/da/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/de/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/dsb/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/el/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/en-CA/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/en-GB/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/en-US/viewer.properties +282 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/eo/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/es-AR/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/es-CL/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/es-ES/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/es-MX/viewer.properties +257 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/et/viewer.properties +229 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/eu/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/fa/viewer.properties +221 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ff/viewer.properties +214 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/fi/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/fr/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/fur/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/fy-NL/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ga-IE/viewer.properties +181 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/gd/viewer.properties +257 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/gl/viewer.properties +267 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/gn/viewer.properties +278 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/gu-IN/viewer.properties +214 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/he/viewer.properties +283 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/hi-IN/viewer.properties +227 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/hr/viewer.properties +243 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/hsb/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/hu/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/hy-AM/viewer.properties +232 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/hye/viewer.properties +229 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ia/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/id/viewer.properties +253 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/is/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/it/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ja/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ka/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/kab/viewer.properties +264 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/kk/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/km/viewer.properties +189 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/kn/viewer.properties +166 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ko/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/lij/viewer.properties +214 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/lo/viewer.properties +257 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/locale.properties +333 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/lt/viewer.properties +229 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ltg/viewer.properties +192 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/lv/viewer.properties +214 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/meh/viewer.properties +106 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/mk/viewer.properties +211 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/mr/viewer.properties +210 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ms/viewer.properties +214 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/my/viewer.properties +170 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/nb-NO/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ne-NP/viewer.properties +197 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/nl/viewer.properties +274 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/nn-NO/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/oc/viewer.properties +278 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/pa-IN/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/pl/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/pt-BR/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/pt-PT/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/rm/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ro/viewer.properties +220 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ru/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sat/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sc/viewer.properties +258 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/scn/viewer.properties +101 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sco/viewer.properties +226 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/si/viewer.properties +228 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sk/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/skr/viewer.properties +264 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sl/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/son/viewer.properties +152 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sq/viewer.properties +247 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sr/viewer.properties +259 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sv-SE/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/szl/viewer.properties +224 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ta/viewer.properties +173 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/te/viewer.properties +216 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/tg/viewer.properties +281 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/th/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/tl/viewer.properties +222 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/tr/viewer.properties +283 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/trs/viewer.properties +184 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/uk/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ur/viewer.properties +218 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/uz/viewer.properties +142 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/vi/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/wo/viewer.properties +104 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/xh/viewer.properties +156 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/zh-CN/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/zh-TW/viewer.properties +281 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitDingbats.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixed.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedBold.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedBoldItalic.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedItalic.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerif.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifBold.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifBoldItalic.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifItalic.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSymbol.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LICENSE_FOXIT +27 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LICENSE_LIBERATION +102 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Bold.ttf +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-BoldItalic.ttf +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Italic.ttf +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Regular.ttf +0 -0
- deepresearch_flow/paper/web/pdfjs/web/viewer.css +3528 -0
- deepresearch_flow/paper/web/pdfjs/web/viewer.html +486 -0
- deepresearch_flow/paper/web/pdfjs/web/viewer.js +14099 -0
- deepresearch_flow/paper/web/pdfjs/web/viewer.js.map +1 -0
- deepresearch_flow/paper/web/query.py +90 -0
- deepresearch_flow/recognize/__init__.py +1 -0
- deepresearch_flow/recognize/cli.py +469 -0
- deepresearch_flow/recognize/markdown.py +277 -0
- deepresearch_flow/recognize/organize.py +95 -0
- deepresearch_flow-0.1.1.dist-info/METADATA +416 -0
- deepresearch_flow-0.1.1.dist-info/RECORD +417 -0
- deepresearch_flow-0.1.1.dist-info/WHEEL +5 -0
- deepresearch_flow-0.1.1.dist-info/entry_points.txt +2 -0
- deepresearch_flow-0.1.1.dist-info/licenses/LICENSE +21 -0
- deepresearch_flow-0.1.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""Markdown image helpers for recognize commands."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import base64
|
|
7
|
+
import hashlib
|
|
8
|
+
import logging
|
|
9
|
+
import mimetypes
|
|
10
|
+
import re
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Awaitable, Callable, Optional
|
|
13
|
+
from urllib.parse import urlparse, unquote
|
|
14
|
+
|
|
15
|
+
import httpx
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Browser-like User-Agent string assembled from three adjacent literals.
# NOTE(review): presumably sent with outgoing HTTP requests -- the call
# site is not visible in this part of the file.
DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36"
)

# Timeout value in seconds (float).
# NOTE(review): presumably applied to HTTP downloads -- confirm at the caller.
HTTP_TIMEOUT_SECONDS = 60.0

# Lower-case, dot-prefixed file extensions accepted as images.
ALLOWED_IMAGE_EXTS = {
    ".png",
    ".jpg",
    ".jpeg",
    ".webp",
    ".gif",
    ".svg",
}

# Maps an extension to the spelling that should be used instead of it.
EXTENSION_OVERRIDES = {".jpe": ".jpg"}

# Inline markdown image: group 1 is the alt text between "![" and "]",
# group 2 is everything between "(" and the first ")".
IMAGE_PATTERN = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")

# data: URL: group 1 is the media type, group 2 is ";base64" when present
# (otherwise None), group 3 is the payload.  DOTALL lets the payload span
# multiple lines.
DATA_URL_PATTERN = re.compile(
    r"^data:([^;,]+)(;base64)?,(.*)$",
    flags=re.DOTALL,
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class NameRegistry:
    """Hand out file names that are unique within one output directory.

    ``used`` holds the names of the regular files found in the directory
    when the registry was created plus every name returned by ``reserve``
    since then, so the same name is never handed out twice.
    """

    def __init__(self, output_dir: Path) -> None:
        """Remember *output_dir* and record the files it already contains.

        Only regular files are recorded; a directory that does not exist
        yet starts with an empty set of used names.
        """
        self.output_dir = output_dir
        self.used = (
            {entry.name for entry in output_dir.iterdir() if entry.is_file()}
            if output_dir.exists()
            else set()
        )
        # Held by reserve_async() around each reservation.
        self.lock = asyncio.Lock()

    def reserve(self, base: str, ext: str) -> str:
        """Return an unused ``<base><ext>`` name and mark it as taken.

        *base* is passed through ``sanitize_filename`` and replaced by
        ``"file"`` when nothing is left; *ext* gets a leading dot if it
        lacks one.  On a clash with a recorded name or with a path that
        exists under ``output_dir``, ``_1``, ``_2``, ... is appended to the
        base until a free name is found.
        """
        stem = sanitize_filename(base) or "file"
        suffix = ext if ext.startswith(".") else "." + ext

        attempt = 0
        name = stem + suffix
        while name in self.used or (self.output_dir / name).exists():
            attempt += 1
            name = f"{stem}_{attempt}{suffix}"

        self.used.add(name)
        return name

    async def reserve_async(self, base: str, ext: str) -> str:
        """Run ``reserve`` while holding ``self.lock``."""
        async with self.lock:
            reserved = self.reserve(base, ext)
        return reserved
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def sanitize_filename(value: str) -> str:
    """Return ``value`` reduced to word characters, dots and hyphens.

    Every run of other characters becomes a single underscore, and leading
    or trailing dots, underscores and hyphens are removed afterwards.
    """
    trimmed = value.strip()
    collapsed = re.sub(r"[^\w.\-]+", "_", trimmed)
    return collapsed.strip("._-")
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def read_text(path: Path) -> str:
    """Read ``path`` as UTF-8 text, retrying as Latin-1 if decoding fails."""
    try:
        text = path.read_text(encoding="utf-8")
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this second read cannot
        # fail on decoding.
        text = path.read_text(encoding="latin-1")
    return text
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def split_link_target(raw_link: str) -> tuple[str, str, str, str]:
    """Split the text inside ``(...)`` of a Markdown image into its parts.

    Returns:
        ``(target, suffix, prefix, postfix)``. ``target`` is the link
        destination, ``suffix`` is whatever follows it (such as a title), and
        ``prefix``/``postfix`` are ``"<"``/``">"`` when the destination was
        written in angle brackets and empty strings otherwise. A blank link
        gives four empty strings.
    """
    stripped = raw_link.strip()
    if stripped.startswith("<"):
        closing = stripped.find(">")
        if closing >= 0:
            return stripped[1:closing], stripped[closing + 1 :], "<", ">"
    # No angle brackets, or an unterminated "<": the destination is the first
    # whitespace-delimited token.
    tokens = stripped.split(None, 1)
    if not tokens:
        return "", "", "", ""
    head = tokens[0]
    return head, stripped[len(head) :], "", ""
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def is_data_url(target: str) -> bool:
    """Return True when ``target`` begins with the ``data:`` scheme prefix."""
    return target[:5] == "data:"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def is_http_url(target: str) -> bool:
    """Return True when ``target`` is an ``http`` or ``https`` URL.

    Targets that ``urlparse`` rejects (for example an unbalanced ``[`` in the
    host part) are reported as not-HTTP instead of raising, so a single
    malformed link cannot abort processing of a whole document.
    """
    try:
        scheme = urlparse(target).scheme
    except ValueError:
        return False
    return scheme in {"http", "https"}
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def resolve_local_path(md_path: Path, target: str) -> Path:
    """Turn a link target from ``md_path`` into a filesystem path.

    Percent-escapes in ``target`` are decoded first. An absolute target is
    returned as is; a relative one is resolved against the directory that
    contains ``md_path``.
    """
    decoded = Path(unquote(target))
    return decoded if decoded.is_absolute() else (md_path.parent / decoded).resolve()
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def extension_from_mime(mime: str) -> Optional[str]:
    """Return the file extension for ``mime``, or None when it is unknown.

    The guess comes from ``mimetypes`` (non-strict mode) and is then passed
    through ``EXTENSION_OVERRIDES``.
    """
    guessed = mimetypes.guess_extension(mime, strict=False)
    # Unlisted guesses, including None, pass through unchanged.
    return EXTENSION_OVERRIDES.get(guessed, guessed)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def mime_from_path(path: Path) -> Optional[str]:
    """Return the MIME type implied by the file name of ``path``, if any.

    ``mimetypes`` is consulted first; when it has no answer, a small built-in
    table of image suffixes is used. None is returned when neither knows the
    suffix.
    """
    guessed = mimetypes.guess_type(path.name)[0]
    if guessed:
        return guessed
    fallback_by_suffix = {
        ".jpg": "image/jpeg",
        ".jpeg": "image/jpeg",
        ".png": "image/png",
        ".gif": "image/gif",
        ".webp": "image/webp",
        ".svg": "image/svg+xml",
    }
    return fallback_by_suffix.get(path.suffix.lower())
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def parse_data_url(target: str) -> Optional[tuple[str, bytes]]:
    """Decode a base64 ``data:`` image URL into ``(mime, raw_bytes)``.

    Returns None when ``target`` does not match ``DATA_URL_PATTERN``, when
    its MIME type is not ``image/*``, when it is not marked ``;base64``, or
    when the payload cannot be decoded (a warning is logged in that case).
    """
    parsed = DATA_URL_PATTERN.match(target)
    if parsed is None:
        return None
    mime_type, encoding_flag, body = parsed.groups()
    mime_type = mime_type or ""
    if not mime_type.startswith("image/") or encoding_flag != ";base64":
        return None
    try:
        decoded = base64.b64decode(body or "")
    except Exception as exc:  # pragma: no cover - defensive
        logger.warning("Failed to decode base64 image: %s", exc)
        return None
    return mime_type, decoded
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def data_url_from_bytes(mime: str, data: bytes) -> str:
    """Build a base64 ``data:`` URL carrying ``data`` with media type ``mime``."""
    encoded = str(base64.b64encode(data), "ascii")
    return f"data:{mime};base64,{encoded}"
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def base_name_from_alt(alt_text: str) -> str:
    """Derive a file-name stem from an image's alt text.

    The alt text is sanitized; if the result ends in one of
    ``ALLOWED_IMAGE_EXTS`` that extension is dropped. An empty string is
    returned when the alt text is empty or nothing survives sanitizing.
    """
    candidate = sanitize_filename(alt_text) if alt_text else ""
    if not candidate:
        return ""
    as_path = Path(candidate)
    return as_path.stem if as_path.suffix.lower() in ALLOWED_IMAGE_EXTS else candidate
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def hash_name_from_bytes(data: bytes) -> str:
    """Return the first 12 hex digits of the SHA-256 digest of ``data``."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()[:12]
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
async def rewrite_markdown_images(
    content: str,
    replacer: Callable[[str, str], Awaitable[Optional[str]]],
) -> str:
    """Rewrite the target of every inline Markdown image in ``content``.

    Args:
        content: Markdown text to scan with ``IMAGE_PATTERN``.
        replacer: Awaited once per image, in document order, with
            ``(alt_text, target)``. It returns the new target, or None to
            leave that image exactly as written.

    Returns:
        ``content`` with each replaced image re-emitted as
        ``![alt](new-target ...)``. The alt text, any angle brackets around
        the original target and anything that followed the target (such as a
        title) are kept.
    """
    output: list[str] = []
    last_idx = 0
    for match in IMAGE_PATTERN.finditer(content):
        # Copy the text between the previous image and this one verbatim.
        output.append(content[last_idx : match.start()])
        alt_text = match.group(1)
        target, suffix, prefix, postfix = split_link_target(match.group(2))
        new_target = await replacer(alt_text, target)
        if new_target is None:
            output.append(match.group(0))
        else:
            new_link = f"{prefix}{new_target}{postfix}{suffix}"
            # Re-emit the complete image markup; appending an empty string
            # here would silently delete the image from the document.
            output.append(f"![{alt_text}]({new_link})")
        last_idx = match.end()
    output.append(content[last_idx:])
    return "".join(output)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def count_markdown_images(content: str) -> dict[str, int]:
    """Count the inline images in ``content`` by kind of target.

    Returns:
        A dict with keys ``"total"``, ``"data"``, ``"http"`` and ``"local"``.
        Every match adds to ``"total"``; a match whose target is empty is
        not added to any of the other three.
    """
    tally = dict.fromkeys(("total", "data", "http", "local"), 0)
    for found in IMAGE_PATTERN.finditer(content):
        tally["total"] += 1
        link_target = split_link_target(found.group(2))[0]
        if not link_target:
            continue
        if is_data_url(link_target):
            bucket = "data"
        elif is_http_url(link_target):
            bucket = "http"
        else:
            bucket = "local"
        tally[bucket] += 1
    return tally
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
async def embed_markdown_images(
    content: str,
    md_path: Path,
    enable_http: bool,
    http_client: Optional[httpx.AsyncClient],
) -> str:
    """Replace image links in ``content`` with inline base64 ``data:`` URLs.

    Args:
        content: Markdown text to rewrite.
        md_path: Path of the Markdown file; relative image targets are
            resolved against its directory.
        enable_http: Whether http(s) images may be downloaded.
        http_client: Client used for downloads. Remote images are left
            untouched when this is None or ``enable_http`` is false.

    Returns:
        The rewritten Markdown. Images that are already ``data:`` URLs,
        cannot be read or fetched, or are not recognised as images keep
        their original link.
    """

    async def fetch_remote(url: str) -> Optional[str]:
        """Download ``url`` and return it as a data URL, or None on any failure."""
        if http_client is None or not enable_http:
            return None
        try:
            response = await http_client.get(url)
        except Exception as exc:
            logger.warning("Failed to fetch %s: %s", url, exc)
            return None
        status = response.status_code
        if status >= 400:
            logger.warning("Failed to fetch %s: HTTP %d", url, status)
            return None
        header = response.headers.get("Content-Type", "")
        content_type = header.split(";", 1)[0].strip()
        if not content_type.startswith("image/"):
            # The server did not declare an image type; fall back to the
            # suffix of the URL path.
            fallback = mime_from_path(Path(urlparse(url).path))
            if not (fallback and fallback.startswith("image/")):
                logger.warning(
                    "Skipping non-image URL %s (Content-Type %s)", url, content_type
                )
                return None
            content_type = fallback
        return data_url_from_bytes(content_type, response.content)

    async def read_local(reference: str) -> Optional[str]:
        """Read a local image file and return it as a data URL, or None."""
        local_path = resolve_local_path(md_path, reference)
        if not (local_path.exists() and local_path.is_file()):
            logger.warning("Image not found: %s", local_path)
            return None
        mime = mime_from_path(local_path)
        if not (mime and mime.startswith("image/")):
            logger.warning("Unsupported image type: %s", local_path)
            return None
        payload = await asyncio.to_thread(local_path.read_bytes)
        return data_url_from_bytes(mime, payload)

    async def replacer(alt_text: str, target: str) -> Optional[str]:
        # Empty targets and images that are already embedded stay as they are.
        if not target or is_data_url(target):
            return None
        if is_http_url(target):
            return await fetch_remote(target)
        return await read_local(target)

    return await rewrite_markdown_images(content, replacer)
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
async def unpack_markdown_images(
    content: str,
    images_dir: Path,
    name_registry: NameRegistry,
) -> str:
    """Write embedded ``data:`` images to files and link to them instead.

    Args:
        content: Markdown text to rewrite.
        images_dir: Directory that receives the image files; it is created
            if missing.
        name_registry: Supplies a unique file name for each extracted image.

    Returns:
        The Markdown with every decodable base64 image replaced by an
        ``images/<filename>`` link. All other images are left as written.
    """
    images_dir.mkdir(parents=True, exist_ok=True)

    async def replacer(alt_text: str, target: str) -> Optional[str]:
        decoded = parse_data_url(target) if is_data_url(target) else None
        if decoded is None:
            return None
        mime, data = decoded
        ext = extension_from_mime(mime)
        if not ext:
            logger.warning("Unsupported MIME type: %s", mime)
            return None
        # Prefer a name derived from the alt text; otherwise use a content hash.
        base_name = base_name_from_alt(alt_text) or hash_name_from_bytes(data)
        filename = await name_registry.reserve_async(base_name, ext)
        await asyncio.to_thread((images_dir / filename).write_bytes, data)
        return f"images/{filename}"

    return await rewrite_markdown_images(content, replacer)
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""OCR output organizers for recognize commands."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
import shutil
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Iterable
|
|
10
|
+
|
|
11
|
+
from deepresearch_flow.recognize.markdown import (
|
|
12
|
+
NameRegistry,
|
|
13
|
+
embed_markdown_images,
|
|
14
|
+
read_text,
|
|
15
|
+
rewrite_markdown_images,
|
|
16
|
+
resolve_local_path,
|
|
17
|
+
is_data_url,
|
|
18
|
+
is_http_url,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def discover_mineru_dirs(inputs: Iterable[str], recursive: bool) -> list[Path]:
    """Collect directories that contain both ``full.md`` and an ``images/`` folder.

    Args:
        inputs: Paths to ``full.md`` files or to directories to search.
        recursive: For directory inputs, search all sub-directories for
            ``full.md`` instead of only the directory itself.

    Returns:
        The resolved matching directories, de-duplicated and sorted. A
        directory that has ``full.md`` but no ``images/`` folder is skipped
        with a single warning.

    Raises:
        FileNotFoundError: If an input file is not named ``full.md``, or an
            input path does not exist or is neither a file nor a directory.
    """
    results: set[Path] = set()

    def consider(layout_dir: Path) -> None:
        """Keep ``layout_dir`` if it has an images/ folder, otherwise warn."""
        if (layout_dir / "images").is_dir():
            results.add(layout_dir)
        else:
            logger.warning("Skipping %s (missing images/)", layout_dir)

    for raw in inputs:
        path = Path(raw)
        if path.is_file():
            if path.name != "full.md":
                raise FileNotFoundError(f"Expected full.md file but got: {path}")
            consider(path.parent.resolve())
            continue
        if not path.exists():
            raise FileNotFoundError(f"Input path not found: {path}")
        if path.is_dir():
            # Both glob flavours also yield <path>/full.md itself, so the
            # directory needs no separate check (which would warn twice when
            # images/ is missing).
            pattern = path.rglob("full.md") if recursive else path.glob("full.md")
            for full_path in pattern:
                consider(full_path.parent.resolve())
            continue
        raise FileNotFoundError(f"Input path not found: {path}")
    return sorted(results)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
async def organize_mineru_dir(
    layout_dir: Path,
    output_simple: Path | None,
    output_base64: Path | None,
    output_filename: str,
    image_registry: NameRegistry | None,
) -> None:
    """Export ``layout_dir/full.md`` in up to two flavours.

    Args:
        layout_dir: Directory containing ``full.md``.
        output_simple: When given together with ``image_registry``, local
            images are copied into ``output_simple/images`` and the Markdown
            is written to ``output_simple/output_filename`` with links
            pointing at the copies.
        output_base64: When given, the Markdown is written to
            ``output_base64/output_filename`` with local images embedded as
            data URLs (remote images are not fetched).
        output_filename: File name used for the written Markdown.
        image_registry: Provides unique names for the copied images.
    """
    md_path = layout_dir / "full.md"
    content = await asyncio.to_thread(read_text, md_path)

    if output_simple is not None and image_registry is not None:
        images_dir = output_simple / "images"
        images_dir.mkdir(parents=True, exist_ok=True)
        # Source path -> copied file name, so an image referenced several
        # times is copied only once.
        image_map: dict[Path, str] = {}

        async def replace_simple(_: str, target: str) -> str | None:
            if not target or is_data_url(target) or is_http_url(target):
                return None
            source_path = resolve_local_path(md_path, target)
            if not source_path.exists() or not source_path.is_file():
                logger.warning("Image not found: %s", source_path)
                return None
            if source_path in image_map:
                return f"images/{image_map[source_path]}"
            filename = await image_registry.reserve_async(source_path.stem, source_path.suffix)
            dest_path = images_dir / filename
            await asyncio.to_thread(shutil.copy2, source_path, dest_path)
            image_map[source_path] = filename
            return f"images/{filename}"

        updated = await rewrite_markdown_images(content, replace_simple)
        output_path = output_simple / output_filename
        await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")

    if output_base64 is not None:
        # Create the target directory as the simple branch does; without it
        # the write below fails when the directory does not exist yet.
        output_base64.mkdir(parents=True, exist_ok=True)
        updated = await embed_markdown_images(content, md_path, False, None)
        output_path = output_base64 / output_filename
        await asyncio.to_thread(output_path.write_text, updated, encoding="utf-8")
|