deepresearch-flow 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/__init__.py +5 -0
- deepresearch_flow/cli.py +23 -0
- deepresearch_flow/paper/__init__.py +1 -0
- deepresearch_flow/paper/cli.py +286 -0
- deepresearch_flow/paper/config.py +249 -0
- deepresearch_flow/paper/db.py +768 -0
- deepresearch_flow/paper/extract.py +870 -0
- deepresearch_flow/paper/llm.py +115 -0
- deepresearch_flow/paper/prompt_templates/__init__.py +1 -0
- deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +6 -0
- deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +82 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +6 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +28 -0
- deepresearch_flow/paper/prompt_templates/simple_system.j2 +6 -0
- deepresearch_flow/paper/prompt_templates/simple_user.j2 +24 -0
- deepresearch_flow/paper/prompt_templates/three_pass_system.j2 +6 -0
- deepresearch_flow/paper/prompt_templates/three_pass_user.j2 +44 -0
- deepresearch_flow/paper/prompts.py +11 -0
- deepresearch_flow/paper/providers/__init__.py +1 -0
- deepresearch_flow/paper/providers/azure_openai.py +66 -0
- deepresearch_flow/paper/providers/base.py +19 -0
- deepresearch_flow/paper/providers/claude.py +71 -0
- deepresearch_flow/paper/providers/dashscope.py +58 -0
- deepresearch_flow/paper/providers/gemini.py +116 -0
- deepresearch_flow/paper/providers/ollama.py +46 -0
- deepresearch_flow/paper/providers/openai_compatible.py +60 -0
- deepresearch_flow/paper/render.py +64 -0
- deepresearch_flow/paper/schema.py +58 -0
- deepresearch_flow/paper/schemas/__init__.py +1 -0
- deepresearch_flow/paper/schemas/deep_read_schema.json +46 -0
- deepresearch_flow/paper/schemas/default_paper_schema.json +47 -0
- deepresearch_flow/paper/schemas/eight_questions_schema.json +34 -0
- deepresearch_flow/paper/schemas/three_pass_schema.json +24 -0
- deepresearch_flow/paper/template_registry.py +189 -0
- deepresearch_flow/paper/templates/__init__.py +1 -0
- deepresearch_flow/paper/templates/deep_read.md.j2 +79 -0
- deepresearch_flow/paper/templates/default_paper.md.j2 +32 -0
- deepresearch_flow/paper/templates/eight_questions.md.j2 +49 -0
- deepresearch_flow/paper/templates/three_pass.md.j2 +28 -0
- deepresearch_flow/paper/utils.py +136 -0
- deepresearch_flow/paper/web/__init__.py +2 -0
- deepresearch_flow/paper/web/app.py +2307 -0
- deepresearch_flow/paper/web/pdfjs/LICENSE +177 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78ms-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/78ms-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/83pv-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/90ms-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/90ms-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/90msp-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/90msp-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/90pv-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/90pv-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Add-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-0.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-1.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-3.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-4.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-5.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-6.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-CNS1-UCS2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-0.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-1.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-3.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-4.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-5.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-GB1-UCS2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-0.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-1.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-3.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-4.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-5.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-6.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Japan1-UCS2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-0.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-1.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Adobe-Korea1-UCS2.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/B5pc-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/B5pc-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS1-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS1-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS2-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/CNS2-V.bcmap +3 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/ETHK-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/ETHK-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/ETen-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/ETen-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/ETenms-B5-H.bcmap +3 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/ETenms-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Ext-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-H.bcmap +4 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GB-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK2K-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBK2K-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBKp-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBKp-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBT-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBTpc-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBTpc-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBpc-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/GBpc-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdla-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdla-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdlb-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKdlb-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKgccs-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKgccs-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm314-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm314-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm471-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKm471-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKscs-B5-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/HKscs-B5-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Hankaku.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Hiragana.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-Johab-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-Johab-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-HW-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-HW-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCms-UHC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCpc-EUC-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/KSCpc-EUC-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Katakana.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/LICENSE +36 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/NWP-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/NWP-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/RKSJ-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/RKSJ-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/Roman.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UCS2-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UCS2-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF16-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF16-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF8-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniCNS-UTF8-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UCS2-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UCS2-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF16-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF16-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF8-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniGB-UTF8-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-HW-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-HW-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UCS2-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF16-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF16-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF8-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS-UTF8-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF16-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF16-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF8-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJIS2004-UTF8-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UCS2-HW-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UCS2-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISPro-UTF8-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX0213-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX0213-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX02132004-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniJISX02132004-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UCS2-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UCS2-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF16-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF16-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF32-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF32-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF8-H.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/UniKS-UTF8-V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/V.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/cmaps/WP-Symbol.bcmap +0 -0
- deepresearch_flow/paper/web/pdfjs/web/compressed.tracemonkey-pldi-09.pdf +0 -0
- deepresearch_flow/paper/web/pdfjs/web/debugger.css +111 -0
- deepresearch_flow/paper/web/pdfjs/web/debugger.js +611 -0
- deepresearch_flow/paper/web/pdfjs/web/images/altText_add.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/altText_done.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-check.svg +11 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-comment.svg +16 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-help.svg +26 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-insert.svg +10 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-key.svg +11 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-newparagraph.svg +11 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-noicon.svg +7 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-note.svg +42 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-paperclip.svg +6 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-paragraph.svg +16 -0
- deepresearch_flow/paper/web/pdfjs/web/images/annotation-pushpin.svg +7 -0
- deepresearch_flow/paper/web/pdfjs/web/images/cursor-editorFreeText.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/cursor-editorInk.svg +4 -0
- deepresearch_flow/paper/web/pdfjs/web/images/findbarButton-next.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/findbarButton-previous.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/gv-toolbarButton-download.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/gv-toolbarButton-openinapp.svg +11 -0
- deepresearch_flow/paper/web/pdfjs/web/images/loading-dark.svg +24 -0
- deepresearch_flow/paper/web/pdfjs/web/images/loading-icon.gif +0 -0
- deepresearch_flow/paper/web/pdfjs/web/images/loading.svg +1 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-documentProperties.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-firstPage.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-handTool.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-lastPage.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-rotateCcw.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-rotateCw.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollHorizontal.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollPage.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollVertical.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-scrollWrapped.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-selectTool.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadEven.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadNone.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/secondaryToolbarButton-spreadOdd.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-bookmark.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-currentOutlineItem.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-download.svg +4 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorFreeText.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorInk.svg +4 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-editorStamp.svg +8 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-menuArrow.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-openFile.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-pageDown.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-pageUp.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-presentationMode.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-print.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-search.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-secondaryToolbarToggle.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-sidebarToggle.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewAttachments.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewLayers.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewOutline.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-viewThumbnail.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-zoomIn.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/toolbarButton-zoomOut.svg +3 -0
- deepresearch_flow/paper/web/pdfjs/web/images/treeitem-collapsed.svg +1 -0
- deepresearch_flow/paper/web/pdfjs/web/images/treeitem-expanded.svg +1 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ach/viewer.properties +203 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/af/viewer.properties +156 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/an/viewer.properties +222 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ar/viewer.properties +224 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ast/viewer.properties +185 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/az/viewer.properties +222 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/be/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/bg/viewer.properties +214 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/bn/viewer.properties +218 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/bo/viewer.properties +217 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/br/viewer.properties +224 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/brx/viewer.properties +184 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/bs/viewer.properties +173 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ca/viewer.properties +256 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/cak/viewer.properties +253 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ckb/viewer.properties +213 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/cs/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/cy/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/da/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/de/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/dsb/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/el/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/en-CA/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/en-GB/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/en-US/viewer.properties +282 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/eo/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/es-AR/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/es-CL/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/es-ES/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/es-MX/viewer.properties +257 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/et/viewer.properties +229 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/eu/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/fa/viewer.properties +221 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ff/viewer.properties +214 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/fi/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/fr/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/fur/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/fy-NL/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ga-IE/viewer.properties +181 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/gd/viewer.properties +257 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/gl/viewer.properties +267 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/gn/viewer.properties +278 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/gu-IN/viewer.properties +214 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/he/viewer.properties +283 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/hi-IN/viewer.properties +227 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/hr/viewer.properties +243 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/hsb/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/hu/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/hy-AM/viewer.properties +232 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/hye/viewer.properties +229 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ia/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/id/viewer.properties +253 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/is/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/it/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ja/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ka/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/kab/viewer.properties +264 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/kk/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/km/viewer.properties +189 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/kn/viewer.properties +166 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ko/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/lij/viewer.properties +214 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/lo/viewer.properties +257 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/locale.properties +333 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/lt/viewer.properties +229 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ltg/viewer.properties +192 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/lv/viewer.properties +214 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/meh/viewer.properties +106 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/mk/viewer.properties +211 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/mr/viewer.properties +210 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ms/viewer.properties +214 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/my/viewer.properties +170 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/nb-NO/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ne-NP/viewer.properties +197 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/nl/viewer.properties +274 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/nn-NO/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/oc/viewer.properties +278 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/pa-IN/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/pl/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/pt-BR/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/pt-PT/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/rm/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ro/viewer.properties +220 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ru/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sat/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sc/viewer.properties +258 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/scn/viewer.properties +101 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sco/viewer.properties +226 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/si/viewer.properties +228 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sk/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/skr/viewer.properties +264 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sl/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/son/viewer.properties +152 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sq/viewer.properties +247 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sr/viewer.properties +259 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/sv-SE/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/szl/viewer.properties +224 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ta/viewer.properties +173 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/te/viewer.properties +216 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/tg/viewer.properties +281 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/th/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/tl/viewer.properties +222 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/tr/viewer.properties +283 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/trs/viewer.properties +184 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/uk/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/ur/viewer.properties +218 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/uz/viewer.properties +142 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/vi/viewer.properties +270 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/wo/viewer.properties +104 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/xh/viewer.properties +156 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/zh-CN/viewer.properties +284 -0
- deepresearch_flow/paper/web/pdfjs/web/locale/zh-TW/viewer.properties +281 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitDingbats.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixed.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedBold.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedBoldItalic.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitFixedItalic.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerif.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifBold.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifBoldItalic.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSerifItalic.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/FoxitSymbol.pfb +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LICENSE_FOXIT +27 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LICENSE_LIBERATION +102 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Bold.ttf +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-BoldItalic.ttf +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Italic.ttf +0 -0
- deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Regular.ttf +0 -0
- deepresearch_flow/paper/web/pdfjs/web/viewer.css +3528 -0
- deepresearch_flow/paper/web/pdfjs/web/viewer.html +486 -0
- deepresearch_flow/paper/web/pdfjs/web/viewer.js +14099 -0
- deepresearch_flow/paper/web/pdfjs/web/viewer.js.map +1 -0
- deepresearch_flow/paper/web/query.py +90 -0
- deepresearch_flow/recognize/__init__.py +1 -0
- deepresearch_flow/recognize/cli.py +469 -0
- deepresearch_flow/recognize/markdown.py +277 -0
- deepresearch_flow/recognize/organize.py +95 -0
- deepresearch_flow-0.1.1.dist-info/METADATA +416 -0
- deepresearch_flow-0.1.1.dist-info/RECORD +417 -0
- deepresearch_flow-0.1.1.dist-info/WHEEL +5 -0
- deepresearch_flow-0.1.1.dist-info/entry_points.txt +2 -0
- deepresearch_flow-0.1.1.dist-info/licenses/LICENSE +21 -0
- deepresearch_flow-0.1.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
import re
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True)
class QueryTerm:
    """One parsed search term.

    The dataclass is frozen, so instances are immutable and hashable.
    """

    # Name of the field the term is restricted to, or None for a free-text term.
    field: str | None
    # Text the term searches for.
    value: str
    # True when the term is an exclusion (written with a leading "-").
    negated: bool
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class Query:
    """A parsed search query in OR-of-ANDs form.

    The dataclass is frozen, so ``groups`` cannot be rebound; the lists it
    holds are ordinary mutable lists.
    """

    # Outer list: alternatives combined with OR.
    # Inner list: terms that must all hold (AND) within one alternative.
    groups: list[list[QueryTerm]]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
_FIELD_RE = re.compile(r"^(title|author|tag|venue|year|month):(.+)$", re.IGNORECASE)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def parse_query(text: str) -> Query:
    """Parse a search string into OR-separated groups of AND-ed terms.

    Tokens come from ``_tokenize`` and are interpreted as follows:

    * ``OR`` (any letter case) closes the current group and starts a new one.
      A leading or repeated ``OR`` is ignored because an empty group is never
      closed.
    * A leading ``-`` marks the term as negated.
    * ``field:value``, where ``field`` is accepted by ``_FIELD_RE``, yields a
      field-restricted term with the field name lower-cased. Any other token
      becomes a free-text term (``field=None``).

    NOTE: ``_tokenize`` discards double quotes, so a quoted ``"OR"`` is still
    treated as the operator and a quoted token starting with ``-`` is still
    treated as negated.

    Args:
        text: Raw query string. ``None`` or a blank string is treated as an
            empty query.

    Returns:
        A ``Query`` whose ``groups`` always holds at least one list; an empty
        query is represented as a single empty group.
    """
    text = (text or "").strip()
    if not text:
        return Query(groups=[[]])

    groups: list[list[QueryTerm]] = [[]]
    for raw in _tokenize(text):
        if raw.upper() == "OR":
            # Only open a new alternative once the current one has content.
            if groups[-1]:
                groups.append([])
            continue

        negated = raw.startswith("-")
        token = raw[1:].strip() if negated else raw
        if not token:
            # A lone "-" carries no search text.
            continue

        match = _FIELD_RE.match(token)
        if match:
            field: str | None = match.group(1).lower()
            value = match.group(2).strip()
        else:
            field, value = None, token

        if value:
            groups[-1].append(QueryTerm(field=field, value=value, negated=negated))

    # Drop a trailing empty group (e.g. "foo OR") but never return zero groups.
    return Query(groups=[g for g in groups if g] or [[]])
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _tokenize(text: str) -> list[str]:
    """Split ``text`` on whitespace, keeping double-quoted runs together.

    Quote characters toggle quoting and are dropped from the output, so
    ``title:"deep learning"`` becomes the single token ``title:deep learning``.
    An unterminated quote extends to the end of the input. Tokens are stripped
    and empty ones are discarded.
    """
    tokens: list[str] = []
    pending: list[str] = []
    quoted = False

    def flush() -> None:
        # Emit whatever has been buffered (if non-blank) and start afresh.
        word = "".join(pending).strip()
        if word:
            tokens.append(word)
        pending.clear()

    for char in text:
        if char == '"':
            quoted = not quoted
        elif char.isspace() and not quoted:
            flush()
        else:
            pending.append(char)

    flush()
    return tokens
|
|
90
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Recognize command helpers."""
|
|
@@ -0,0 +1,469 @@
|
|
|
1
|
+
"""CLI commands for recognize workflows."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
import time
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Awaitable, Callable, Iterable
|
|
10
|
+
|
|
11
|
+
import click
|
|
12
|
+
import coloredlogs
|
|
13
|
+
import httpx
|
|
14
|
+
from rich.console import Console
|
|
15
|
+
from rich.table import Table
|
|
16
|
+
from tqdm import tqdm
|
|
17
|
+
|
|
18
|
+
from deepresearch_flow.paper.utils import discover_markdown
|
|
19
|
+
from deepresearch_flow.recognize.markdown import (
|
|
20
|
+
DEFAULT_USER_AGENT,
|
|
21
|
+
HTTP_TIMEOUT_SECONDS,
|
|
22
|
+
NameRegistry,
|
|
23
|
+
count_markdown_images,
|
|
24
|
+
embed_markdown_images,
|
|
25
|
+
read_text,
|
|
26
|
+
sanitize_filename,
|
|
27
|
+
unpack_markdown_images,
|
|
28
|
+
)
|
|
29
|
+
from deepresearch_flow.recognize.organize import discover_mineru_dirs, organize_mineru_dir
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def configure_logging(verbose: bool) -> None:
    """Install coloredlogs at DEBUG when ``verbose`` is set, INFO otherwise."""
    if verbose:
        level = "DEBUG"
    else:
        level = "INFO"
    coloredlogs.install(fmt="%(asctime)s %(levelname)s %(message)s", level=level)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _ensure_output_dir(path_str: str) -> Path:
    """Create ``path_str`` (and any missing parents) and return it as a Path."""
    target = Path(path_str)
    target.mkdir(exist_ok=True, parents=True)
    return target
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _relative_path(path: Path) -> str:
    """Return ``path`` relative to the working directory when possible.

    Both sides are resolved first. A path outside the current working
    directory is returned as its resolved absolute form.
    """
    absolute = path.resolve()
    cwd = Path.cwd().resolve()
    try:
        shown = absolute.relative_to(cwd)
    except ValueError:
        # Not located under the working directory.
        shown = absolute
    return str(shown)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _warn_if_not_empty(output_dir: Path) -> None:
    """Log a warning when ``output_dir`` exists and already has entries."""
    if not output_dir.exists():
        return
    # Pulling a single entry is enough to know the directory is not empty.
    has_entries = next(output_dir.iterdir(), None) is not None
    if has_entries:
        logger.warning("Output directory not empty: %s", output_dir)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _print_summary(title: str, rows: list[tuple[str, str]]) -> None:
    """Render ``rows`` as a two-column (Item / Value) rich table on the console."""
    summary = Table(title=title, title_style="bold magenta", header_style="bold cyan")
    summary.add_column("Item", no_wrap=True, style="cyan")
    summary.add_column("Value", overflow="fold", style="white")
    for row in rows:
        label, text = row
        summary.add_row(label, text)
    console = Console()
    console.print(summary)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _unique_output_filename(
    base: str,
    output_dirs: Iterable[Path],
    used: set[str],
) -> str:
    """Pick a ``.md`` filename that collides with nothing already taken.

    Args:
        base: Preferred stem; it is passed through ``sanitize_filename`` and
            falls back to ``"document"`` when that yields an empty value.
        output_dirs: Directories in which the name must not already exist.
        used: Names already handed out in this run; the chosen name is added.

    Returns:
        ``<base>.md``, or ``<base>_<n>.md`` with the smallest ``n >= 1`` that
        is neither in ``used`` nor present in any of ``output_dirs``.
    """
    # The directories are scanned once per candidate, so materialize them: a
    # one-shot iterator would otherwise be exhausted after the first check and
    # later candidates would silently skip the on-disk collision test.
    directories = tuple(output_dirs)
    base = sanitize_filename(base) or "document"
    candidate = f"{base}.md"
    counter = 0
    while candidate in used or any((directory / candidate).exists() for directory in directories):
        counter += 1
        candidate = f"{base}_{counter}.md"
    used.add(candidate)
    return candidate
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _map_output_files(paths: Iterable[Path], output_dirs: list[Path]) -> dict[Path, str]:
    """Assign each input path a unique output filename derived from its stem.

    Names are unique within this call and do not clash with files that
    already exist in ``output_dirs``.
    """
    taken: set[str] = set()
    return {
        source: _unique_output_filename(source.stem, output_dirs, taken)
        for source in paths
    }
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _aggregate_image_counts(paths: Iterable[Path]) -> dict[str, int]:
    """Sum the per-file image counts over ``paths``.

    Returns a dict with the keys ``total``, ``data``, ``http`` and ``local``.
    A key missing from a file's counts contributes zero.
    """
    categories = ("total", "data", "http", "local")
    totals = dict.fromkeys(categories, 0)
    for markdown_path in paths:
        per_file = count_markdown_images(read_text(markdown_path))
        for name in categories:
            totals[name] += per_file.get(name, 0)
    return totals
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _format_duration(seconds: float) -> str:
    """Format a duration in seconds as ``S.SSs``, ``Mm S.Ss`` or ``Hh Mm S.Ss``.

    The value is rounded to the displayed precision *before* it is split into
    units, so a seconds part that rounds up to 60 carries into the minutes
    (and hours) instead of rendering as e.g. ``"1m 60.0s"`` or ``"60.00s"``.
    """
    if round(seconds, 2) < 60:
        return f"{seconds:.2f}s"
    # One decimal is what the longer formats display; round first so the
    # divmod below sees the carried value.
    seconds = round(seconds, 1)
    minutes, remainder = divmod(seconds, 60)
    if minutes < 60:
        return f"{int(minutes)}m {remainder:.1f}s"
    hours, minutes = divmod(minutes, 60)
    return f"{int(hours)}h {int(minutes)}m {remainder:.1f}s"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
async def _run_with_workers(
    items: Iterable[Path],
    workers: int,
    handler: Callable[[Path], Awaitable[None]],
    progress: tqdm | None = None,
) -> None:
    """Run ``handler`` over ``items`` with at most ``workers`` running at once.

    Args:
        items: Work items; one coroutine is scheduled per item.
        workers: Maximum number of handlers allowed to run concurrently.
        handler: Coroutine function invoked once per item.
        progress: Optional progress bar; advanced by one per finished item.

    Raises:
        Whatever ``handler`` raises; the first failure propagates out of the
        underlying ``asyncio.gather``.
    """
    semaphore = asyncio.Semaphore(workers)

    async def runner(item: Path) -> None:
        async with semaphore:
            await handler(item)
        # Compare against None rather than testing truthiness: a progress bar
        # may define its own truth value (tqdm derives it from its total), so
        # a valid bar can be falsy. No lock is needed around the update: it is
        # a synchronous call and all runners share one event-loop thread.
        if progress is not None:
            progress.update(1)

    await asyncio.gather(*(runner(item) for item in items))
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
async def _run_md_embed(
    paths: list[Path],
    output_dir: Path,
    output_map: dict[Path, str],
    enable_http: bool,
    workers: int,
    progress: tqdm | None,
) -> None:
    """Run ``embed_markdown_images`` over ``paths`` and write the results.

    Each file is read, processed, and written as UTF-8 to
    ``output_dir / output_map[path]``. An HTTP client is only created when
    ``enable_http`` is set, and it is always closed afterwards.
    """
    request_timeout = httpx.Timeout(HTTP_TIMEOUT_SECONDS)
    request_headers = {"User-Agent": DEFAULT_USER_AGENT}
    client: httpx.AsyncClient | None = (
        httpx.AsyncClient(timeout=request_timeout, headers=request_headers, follow_redirects=True)
        if enable_http
        else None
    )

    async def handler(path: Path) -> None:
        # File I/O is pushed to a thread so it does not block the event loop.
        markdown = await asyncio.to_thread(read_text, path)
        embedded = await embed_markdown_images(markdown, path, enable_http, client)
        destination = output_dir / output_map[path]
        await asyncio.to_thread(destination.write_text, embedded, encoding="utf-8")

    try:
        await _run_with_workers(paths, workers, handler, progress=progress)
    finally:
        if client is not None:
            await client.aclose()
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
async def _run_md_unpack(
    paths: list[Path],
    output_dir: Path,
    output_map: dict[Path, str],
    workers: int,
    progress: tqdm | None,
) -> None:
    """Run ``unpack_markdown_images`` over ``paths`` and write the results.

    An ``images`` directory is created under ``output_dir`` and shared by all
    files through one ``NameRegistry``. Each rewritten markdown file is written
    as UTF-8 to ``output_dir / output_map[path]``.
    """
    image_root = output_dir / "images"
    image_root.mkdir(exist_ok=True, parents=True)
    registry = NameRegistry(image_root)

    async def handler(path: Path) -> None:
        # File I/O is pushed to a thread so it does not block the event loop.
        markdown = await asyncio.to_thread(read_text, path)
        rewritten = await unpack_markdown_images(markdown, image_root, registry)
        destination = output_dir / output_map[path]
        await asyncio.to_thread(destination.write_text, rewritten, encoding="utf-8")

    await _run_with_workers(paths, workers, handler, progress=progress)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
async def _run_organize(
    layout_dirs: list[Path],
    output_simple: Path | None,
    output_base64: Path | None,
    output_map: dict[Path, str],
    workers: int,
    progress: tqdm | None,
) -> None:
    """Run ``organize_mineru_dir`` over every layout directory.

    When ``output_simple`` is given, an ``images`` directory is created under
    it and a shared ``NameRegistry`` for that directory is handed to every
    call; otherwise the registry argument is ``None``.
    """
    if output_simple is None:
        registry = None
    else:
        image_root = output_simple / "images"
        image_root.mkdir(exist_ok=True, parents=True)
        registry = NameRegistry(image_root)

    async def handler(layout_dir: Path) -> None:
        await organize_mineru_dir(
            layout_dir,
            output_simple,
            output_base64,
            output_map[layout_dir],
            registry,
        )

    await _run_with_workers(layout_dirs, workers, handler, progress=progress)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
# Root click group for the recognize commands; the `md` sub-group and the
# `organize` command in this module attach to it. The docstring is the text
# click shows as the group's help.
@click.group()
def recognize() -> None:
    """OCR recognition and Markdown post-processing commands."""
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# Sub-group of `recognize` that holds the markdown image commands (`embed`,
# `unpack`). The docstring is the text click shows as the group's help.
@recognize.group()
def md() -> None:
    """Markdown image utilities."""
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
@md.command()
@click.option(
    "-i",
    "--input",
    "inputs",
    multiple=True,
    required=True,
    help="Input markdown file or directory (repeatable)",
)
@click.option("-o", "--output", "output_dir", required=True, help="Output directory")
@click.option("-r", "--recursive", is_flag=True, help="Recursively discover markdown files")
@click.option("--enable-http", is_flag=True, help="Allow embedding HTTP(S) images")
@click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
@click.option("--dry-run", is_flag=True, help="Report actions without writing files")
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
def embed(
    inputs: tuple[str, ...],
    output_dir: str,
    recursive: bool,
    enable_http: bool,
    workers: int,
    dry_run: bool,
    verbose: bool,
) -> None:
    """Embed images into markdown as data URLs."""
    # The docstring above is the command's --help text; keep it stable.
    configure_logging(verbose)
    start_time = time.monotonic()
    if workers <= 0:
        raise click.ClickException("--workers must be positive")

    # A dry run must not create the output directory.
    output_path = Path(output_dir) if dry_run else _ensure_output_dir(output_dir)
    _warn_if_not_empty(output_path)

    paths = discover_markdown(inputs, None, recursive=recursive)
    if not paths:
        click.echo("No markdown files discovered")
        return

    output_map = _map_output_files(paths, [output_path])
    image_counts = _aggregate_image_counts(paths)
    # HTTP images only count as embeddable when fetching is enabled.
    embed_count = image_counts["local"]
    if enable_http:
        embed_count += image_counts["http"]

    def summary_rows() -> list[tuple[str, str]]:
        # Built on demand so "Duration" reflects the moment of reporting.
        return [
            ("Inputs", str(len(paths))),
            ("Outputs", str(len(output_map))),
            ("Images total", str(image_counts["total"])),
            ("Images to embed", str(embed_count)),
            ("Images data", str(image_counts["data"])),
            ("Images http", str(image_counts["http"])),
            ("Images local", str(image_counts["local"])),
            ("Output dir", _relative_path(output_path)),
            ("HTTP enabled", "yes" if enable_http else "no"),
            ("Duration", _format_duration(time.monotonic() - start_time)),
        ]

    if dry_run:
        _print_summary("recognize md embed (dry-run)", summary_rows())
        return

    progress = tqdm(total=len(paths), desc="embed", unit="file")
    try:
        asyncio.run(_run_md_embed(paths, output_path, output_map, enable_http, workers, progress))
    finally:
        progress.close()
    _print_summary("recognize md embed", summary_rows())
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
@md.command()
@click.option(
    "-i",
    "--input",
    "inputs",
    multiple=True,
    required=True,
    help="Input markdown file or directory (repeatable)",
)
@click.option("-o", "--output", "output_dir", required=True, help="Output directory")
@click.option("-r", "--recursive", is_flag=True, help="Recursively discover markdown files")
@click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
@click.option("--dry-run", is_flag=True, help="Report actions without writing files")
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
def unpack(
    inputs: tuple[str, ...],
    output_dir: str,
    recursive: bool,
    workers: int,
    dry_run: bool,
    verbose: bool,
) -> None:
    """Extract embedded data URLs into image files."""
    # The docstring above is the command's --help text; keep it stable.
    configure_logging(verbose)
    start_time = time.monotonic()
    if workers <= 0:
        raise click.ClickException("--workers must be positive")

    # A dry run must not create the output directory.
    output_path = Path(output_dir) if dry_run else _ensure_output_dir(output_dir)
    _warn_if_not_empty(output_path)

    paths = discover_markdown(inputs, None, recursive=recursive)
    if not paths:
        click.echo("No markdown files discovered")
        return

    output_map = _map_output_files(paths, [output_path])
    image_counts = _aggregate_image_counts(paths)

    def summary_rows() -> list[tuple[str, str]]:
        # Built on demand so "Duration" reflects the moment of reporting.
        return [
            ("Inputs", str(len(paths))),
            ("Outputs", str(len(output_map))),
            ("Images total", str(image_counts["total"])),
            ("Images embedded", str(image_counts["data"])),
            ("Images http", str(image_counts["http"])),
            ("Images local", str(image_counts["local"])),
            ("Output dir", _relative_path(output_path)),
            ("Duration", _format_duration(time.monotonic() - start_time)),
        ]

    if dry_run:
        _print_summary("recognize md unpack (dry-run)", summary_rows())
        return

    progress = tqdm(total=len(paths), desc="unpack", unit="file")
    try:
        asyncio.run(_run_md_unpack(paths, output_path, output_map, workers, progress))
    finally:
        progress.close()
    _print_summary("recognize md unpack", summary_rows())
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
@recognize.command()
@click.option(
    "--layout",
    "layout",
    type=click.Choice(["mineru"]),
    default="mineru",
    show_default=True,
    help="OCR output layout type",
)
@click.option(
    "-i",
    "--input",
    "inputs",
    multiple=True,
    required=True,
    help="Input directory (repeatable)",
)
@click.option("-r", "--recursive", is_flag=True, help="Recursively search for layout folders")
@click.option("--output-simple", "output_simple", default=None, help="Output directory for copied markdown")
@click.option("--output-base64", "output_base64", default=None, help="Output directory for embedded markdown")
@click.option("--workers", type=int, default=4, show_default=True, help="Concurrent workers")
@click.option("--dry-run", is_flag=True, help="Report actions without writing files")
@click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
def organize(
    layout: str,
    inputs: tuple[str, ...],
    recursive: bool,
    output_simple: str | None,
    output_base64: str | None,
    workers: int,
    dry_run: bool,
    verbose: bool,
) -> None:
    """Organize OCR outputs into markdown files."""
    # The docstring above is the command's --help text; keep it stable.
    configure_logging(verbose)
    start_time = time.monotonic()
    if workers <= 0:
        raise click.ClickException("--workers must be positive")
    # Use truthiness here because the path construction below does too: an
    # empty string (e.g. `--output-simple ""`) would otherwise pass this check
    # and then be treated as "no output", leaving the command with no target.
    if not output_simple and not output_base64:
        raise click.ClickException("At least one of --output-simple or --output-base64 is required")

    # Defensive: click.Choice already restricts the accepted values.
    if layout != "mineru":
        raise click.ClickException(f"Unsupported layout: {layout}")

    output_simple_path = Path(output_simple) if output_simple else None
    output_base64_path = Path(output_base64) if output_base64 else None
    if not dry_run:
        # Only a real run may create directories on disk.
        output_simple_path = _ensure_output_dir(output_simple) if output_simple else None
        output_base64_path = _ensure_output_dir(output_base64) if output_base64 else None
    output_dirs = [
        path for path in (output_simple_path, output_base64_path) if path is not None
    ]
    for output_dir in output_dirs:
        _warn_if_not_empty(output_dir)

    layout_dirs = discover_mineru_dirs(inputs, recursive)
    if not layout_dirs:
        click.echo("No layout directories discovered")
        return

    output_map = _map_output_files(layout_dirs, output_dirs)
    # Image counts are taken from each layout directory's "full.md".
    image_counts = _aggregate_image_counts([path / "full.md" for path in layout_dirs])

    def summary_rows() -> list[tuple[str, str]]:
        # Built on demand so "Duration" reflects the moment of reporting.
        return [
            ("Layout", layout),
            ("Inputs", str(len(layout_dirs))),
            ("Outputs", str(len(output_map))),
            ("Images total", str(image_counts["total"])),
            ("Images data", str(image_counts["data"])),
            ("Images http", str(image_counts["http"])),
            ("Images local", str(image_counts["local"])),
            ("Output simple", _relative_path(output_simple_path) if output_simple_path else "-"),
            ("Output base64", _relative_path(output_base64_path) if output_base64_path else "-"),
            ("Duration", _format_duration(time.monotonic() - start_time)),
        ]

    if dry_run:
        _print_summary("recognize organize (dry-run)", summary_rows())
        return

    progress = tqdm(total=len(layout_dirs), desc="organize", unit="doc")
    try:
        asyncio.run(
            _run_organize(
                layout_dirs,
                output_simple_path,
                output_base64_path,
                output_map,
                workers,
                progress,
            )
        )
    finally:
        progress.close()
    _print_summary("recognize organize", summary_rows())
|