deepresearch-flow 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. deepresearch_flow/paper/db.py +184 -0
  2. deepresearch_flow/paper/db_ops.py +1939 -0
  3. deepresearch_flow/paper/web/app.py +38 -3705
  4. deepresearch_flow/paper/web/constants.py +23 -0
  5. deepresearch_flow/paper/web/filters.py +255 -0
  6. deepresearch_flow/paper/web/handlers/__init__.py +14 -0
  7. deepresearch_flow/paper/web/handlers/api.py +217 -0
  8. deepresearch_flow/paper/web/handlers/pages.py +334 -0
  9. deepresearch_flow/paper/web/markdown.py +549 -0
  10. deepresearch_flow/paper/web/static/css/main.css +857 -0
  11. deepresearch_flow/paper/web/static/js/detail.js +406 -0
  12. deepresearch_flow/paper/web/static/js/index.js +266 -0
  13. deepresearch_flow/paper/web/static/js/outline.js +58 -0
  14. deepresearch_flow/paper/web/static/js/stats.js +39 -0
  15. deepresearch_flow/paper/web/templates/base.html +43 -0
  16. deepresearch_flow/paper/web/templates/detail.html +332 -0
  17. deepresearch_flow/paper/web/templates/index.html +114 -0
  18. deepresearch_flow/paper/web/templates/stats.html +29 -0
  19. deepresearch_flow/paper/web/templates.py +85 -0
  20. deepresearch_flow/paper/web/text.py +68 -0
  21. deepresearch_flow/recognize/cli.py +805 -26
  22. deepresearch_flow/recognize/katex_check.js +29 -0
  23. deepresearch_flow/recognize/math.py +719 -0
  24. deepresearch_flow/recognize/mermaid.py +690 -0
  25. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/METADATA +78 -4
  26. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/RECORD +30 -9
  27. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/WHEEL +0 -0
  28. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/entry_points.txt +0 -0
  29. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/licenses/LICENSE +0 -0
  30. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deepresearch-flow
3
- Version: 0.3.0
3
+ Version: 0.4.1
4
4
  Summary: Workflow tools for paper extraction, review, and research automation.
5
5
  Author-email: DengQi <dengqi935@gmail.com>
6
6
  License: MIT License
@@ -51,9 +51,10 @@ Requires-Dist: jsonschema>=4.21.1
51
51
  Requires-Dist: markdown-it-py>=3.0.0
52
52
  Requires-Dist: mdit-py-plugins>=0.4.0
53
53
  Requires-Dist: pypdf>=3.0.0
54
+ Requires-Dist: pylatexenc>=2.10
54
55
  Requires-Dist: pybtex>=0.24.0
55
56
  Requires-Dist: rich>=13.7.1
56
- Requires-Dist: rumdl>=0.0.214
57
+ Requires-Dist: rumdl>=0.0.218
57
58
  Requires-Dist: starlette>=0.37.2
58
59
  Requires-Dist: tqdm>=4.66.4
59
60
  Requires-Dist: uvicorn>=0.27.1
@@ -121,6 +122,7 @@ DeepResearch Flow provides a unified pipeline to **Repair**, **Translate**, **Ex
121
122
  - **Smart Extraction**: Turn unstructured Markdown into schema-enforced JSON (summaries, metadata, Q&A) using LLMs (OpenAI, Claude, Gemini, etc.).
122
123
  - **Precision Translation**: Translate OCR Markdown to Chinese/Japanese (`.zh.md`, `.ja.md`) while **freezing** formulas, code, tables, and references. No more broken layout.
123
124
  - **Local Knowledge DB**: A high-performance local Web UI to browse papers with **Split View** (Source vs. Translated vs. Summary), full-text search, and multi-dimensional filtering.
125
+ - **Coverage Compare**: Compare JSON/PDF/Markdown/Translated datasets to find missing artifacts and export CSV reports.
124
126
  - **OCR Post-Processing**: Automatically fix broken references (`[1]` -> `[^1]`), merge split paragraphs, and standardize layouts.
125
127
 
126
128
  ---
@@ -171,7 +173,36 @@ uv run deepresearch-flow translator translate \
171
173
  --fix-level moderate
172
174
  ```
173
175
 
174
- #### Step 3: Serve Your Database
176
+ #### Step 3: Repair OCR Outputs (Recommended)
177
+
178
+ Recommended sequence to stabilize markdown before serving:
179
+
180
+ ```bash
181
+ # 1) Fix OCR markdown (auto-detects JSON if inputs are .json)
182
+ uv run deepresearch-flow recognize fix \
183
+ --input ./docs \
184
+ --in-place
185
+
186
+ # 2) Fix LaTeX formulas
187
+ uv run deepresearch-flow recognize fix-math \
188
+ --input ./docs \
189
+ --model openai/gpt-4o-mini \
190
+ --in-place
191
+
192
+ # 3) Fix Mermaid diagrams
193
+ uv run deepresearch-flow recognize fix-mermaid \
194
+ --input ./paper_outputs \
195
+ --json \
196
+ --model openai/gpt-4o-mini \
197
+ --in-place
198
+
199
+ # 4) Fix again to normalize formatting
200
+ uv run deepresearch-flow recognize fix \
201
+ --input ./docs \
202
+ --in-place
203
+ ```
204
+
205
+ #### Step 4: Serve Your Database
175
206
 
176
207
  Launch a local UI to read and manage your papers.
177
208
 
@@ -246,7 +277,27 @@ uv run deepresearch-flow paper db serve \
246
277
  </details>
247
278
 
248
279
  <details>
249
- <summary><strong>4. Recognize: OCR Post-Processing</strong></summary>
280
+ <summary><strong>4. Paper DB Compare: Coverage Audit</strong></summary>
281
+
282
+ Compare two datasets (A/B) to find missing PDFs, markdowns, translations, or JSON items, with match metadata.
283
+
284
+ ```bash
285
+ uv run deepresearch-flow paper db compare \
286
+ --input-a ./a.json \
287
+ --md-root-b ./md_root \
288
+ --output-csv ./compare.csv
289
+
290
+ # Compare translated markdowns by language
291
+ uv run deepresearch-flow paper db compare \
292
+ --md-translated-root-a ./translated_a \
293
+ --md-translated-root-b ./translated_b \
294
+ --lang zh
295
+ ```
296
+
297
+ </details>
298
+
299
+ <details>
300
+ <summary><strong>5. Recognize: OCR Post-Processing</strong></summary>
250
301
 
251
302
  Tools to clean up raw outputs from OCR engines like MinerU.
252
303
 
@@ -254,6 +305,10 @@ Tools to clean up raw outputs from OCR engines like MinerU.
254
305
  - Unpack Images: extract Base64 images back to files.
255
306
  - Organize: flatten nested OCR output directories.
256
307
  - Fix: apply OCR fixes and rumdl formatting during organize, or as a standalone step.
308
+ - Fix JSON: apply the same fixes to markdown fields inside paper JSON outputs.
309
+ - Fix Math: validate and repair LaTeX formulas with optional LLM assistance.
310
+ - Fix Mermaid: validate and repair Mermaid diagrams (requires `mmdc` from mermaid-cli).
311
+ - Recommended order: `fix` -> `fix-math` -> `fix-mermaid` -> `fix`.
257
312
 
258
313
  ```bash
259
314
  uv run deepresearch-flow recognize md embed --input ./raw_ocr --output ./clean_md
@@ -275,6 +330,25 @@ uv run deepresearch-flow recognize fix \
275
330
  uv run deepresearch-flow recognize fix \
276
331
  --input ./ocr_md \
277
332
  --in-place
333
+
334
+ # Fix JSON outputs in place
335
+ uv run deepresearch-flow recognize fix \
336
+ --json \
337
+ --input ./paper_outputs \
338
+ --in-place
339
+
340
+ # Fix LaTeX formulas in markdown
341
+ uv run deepresearch-flow recognize fix-math \
342
+ --input ./docs \
343
+ --model openai/gpt-4o-mini \
344
+ --in-place
345
+
346
+ # Fix Mermaid diagrams in JSON outputs
347
+ uv run deepresearch-flow recognize fix-mermaid \
348
+ --json \
349
+ --input ./paper_outputs \
350
+ --model openai/gpt-4o-mini \
351
+ --in-place
278
352
  ```
279
353
 
280
354
  </details>
@@ -4,7 +4,8 @@ deepresearch_flow/cli.py,sha256=t4oowCNWldL0DrVJ4d0UlRkuGU2qHej_G0mAc_quteQ,455
4
4
  deepresearch_flow/paper/__init__.py,sha256=sunaOkcgAJBrfmcaJTumcWbPGVUSGWvOv2a2Yidzy0A,43
5
5
  deepresearch_flow/paper/cli.py,sha256=4UY3KHi6BUGztL1vB4w0cCMiIAo9KNxrfQn1GBHt6fA,11153
6
6
  deepresearch_flow/paper/config.py,sha256=totVBGzouh0KS6mhRNPneXZYPuuw0SHiOGdO3r6HSfc,9289
7
- deepresearch_flow/paper/db.py,sha256=ymVLzSEXDksdhLNSdvNA2IWLzT5lQOG1CpJlPU9CSQ8,33586
7
+ deepresearch_flow/paper/db.py,sha256=i3v3n-YrG-kPpc62C9-InhEfInoZMBQd-r_pYz_fO_A,41847
8
+ deepresearch_flow/paper/db_ops.py,sha256=l0lNPP1v00ZtdQb7ZAWE_tUf2JUzqKWxU1wwzlEjDrw,69766
8
9
  deepresearch_flow/paper/extract.py,sha256=ID1dd2r6LTB0kRF4qBSH6bGtBGv0znw--g_mXYBcoeU,32314
9
10
  deepresearch_flow/paper/llm.py,sha256=mHfs5IkT3Q6BOh46MDlfUmgVTX24WRf0IKKoOnN8nV8,4007
10
11
  deepresearch_flow/paper/prompts.py,sha256=mV7cEXw8pwukBUE4Trah0SjEPSSDgg5-RGaNaUdo4EU,519
@@ -40,8 +41,16 @@ deepresearch_flow/paper/templates/default_paper.md.j2,sha256=3azu48534QtLtHrCwI1
40
41
  deepresearch_flow/paper/templates/eight_questions.md.j2,sha256=Ecz4CD3nd7jZ4Dg8himZkTwF4WDkk0ILWk8V728uOPI,3038
41
42
  deepresearch_flow/paper/templates/three_pass.md.j2,sha256=ZRj-NkpZePnqp0gSE8OT1dN5Lr5RW4vdOYdeVejYJW0,1576
42
43
  deepresearch_flow/paper/web/__init__.py,sha256=eQBtBjvOYsNEdivHTI0aO286SCG2c86xI02tf-0jz5I,39
43
- deepresearch_flow/paper/web/app.py,sha256=nb4uzsDJ2R5dz_WA69NKwTgVgMqAyZv5OZ88GxFTWLQ,133311
44
+ deepresearch_flow/paper/web/app.py,sha256=rXnQjffyzH5b64oCwv6ucihU_y5zaFbpzdEB5PRUvHc,3063
45
+ deepresearch_flow/paper/web/constants.py,sha256=HuuE_oZKckmisD3F_1RAqWzO7bnhNmMLyM8FqyM5Yfk,1085
46
+ deepresearch_flow/paper/web/filters.py,sha256=OVMB4GfigP9GPD5dXytHyeLYtnVXEK-QjYfA_k7QbaA,8315
47
+ deepresearch_flow/paper/web/markdown.py,sha256=QHrxUYKB-uAZjG5jVGmkQ6EIT2dSxQNzlibgjGIIKuA,18888
44
48
  deepresearch_flow/paper/web/query.py,sha256=vTegfm5zGVkYCd6_K3yNrXJEmKMccUUFKG9DePPcKMw,1938
49
+ deepresearch_flow/paper/web/templates.py,sha256=suJ67-nwWdExNVx8vvcInwqiHu6bhslaEFS1ouifLto,2515
50
+ deepresearch_flow/paper/web/text.py,sha256=OiqOEzNepPXxcCIal38bxkUarIkcOXG6a30luxObFOI,2199
51
+ deepresearch_flow/paper/web/handlers/__init__.py,sha256=HGQud4xuEtdB9eVYPzzilXV9ool-1Db5UU29WJ6cjNk,295
52
+ deepresearch_flow/paper/web/handlers/api.py,sha256=Z7H0nr1cSIj1-nR6ZxhxtU6-4sjiuqzy1U1OpK56B0g,9014
53
+ deepresearch_flow/paper/web/handlers/pages.py,sha256=euORL0_Avmqy-kOPKOfVQxyeQjLU4a6EBIufmwoLeCM,12247
45
54
  deepresearch_flow/paper/web/pdfjs/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
46
55
  deepresearch_flow/paper/web/pdfjs/build/pdf.js,sha256=2Ddm8gpMMfvOWinZh4nN--94GxR0QdpFvh0Qeejg-Bw,568294
47
56
  deepresearch_flow/paper/web/pdfjs/build/pdf.js.map,sha256=W0nwVFY4inhYxz1raDU6NZ6-rNA21FxLj13txVAqbm4,1434098
@@ -412,9 +421,21 @@ deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Bold.ttf,sha
412
421
  deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-BoldItalic.ttf,sha256=oiQHWsF0la0KOvO8CkGawHBKiz_RCVRWIB-5sJX8KB0,135124
413
422
  deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Italic.ttf,sha256=gytEBtvvI2KIANOqrSEEhTSshNfjrZVb6DuBcu2O9RI,162036
414
423
  deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Regular.ttf,sha256=-Kzh-JKyvZ3BeSun8Jf6dYj4T-1IMhSA4E3lOQgoIh8,139512
424
+ deepresearch_flow/paper/web/static/css/main.css,sha256=oUuEFEi4YP6bIlEQlIz-zQEQje7hRq3j63imvtJ6IQ4,15386
425
+ deepresearch_flow/paper/web/static/js/detail.js,sha256=9bZmTID74otrZxJfHDJRMWuI_x1pgk71E3Zu2Q6sBIA,13368
426
+ deepresearch_flow/paper/web/static/js/index.js,sha256=bbQz8QAewmu3TT8ImAzUqNtTWQCMKwVOQfU0Lkw6Lv0,10460
427
+ deepresearch_flow/paper/web/static/js/outline.js,sha256=e9ydLcBqaTXOYULXt-1OKgKIzrZcZaH1RebPXWBbLvE,1882
428
+ deepresearch_flow/paper/web/static/js/stats.js,sha256=USGIAx9cPQTMeyFwYu_bTYPJM7OoiqimhCYuAjoP0-s,1420
429
+ deepresearch_flow/paper/web/templates/base.html,sha256=4gWJLvjOuDSnBYRpJqxhGKmKC6UuOl19q_Q_cOjhL-g,1806
430
+ deepresearch_flow/paper/web/templates/detail.html,sha256=VC5VbsaAONajZG8_WFSuURCViRXLdi4gH_wDAMt3EVI,16332
431
+ deepresearch_flow/paper/web/templates/index.html,sha256=qNWwyQWa3QzmHdJbohSe5PJOZS3-KxWjk0RxoQSZiys,6117
432
+ deepresearch_flow/paper/web/templates/stats.html,sha256=bcQBawoZ9KoRkM0NNo9WJBVeN_8O1WU2xNiye-Fugyo,671
415
433
  deepresearch_flow/recognize/__init__.py,sha256=yMAqbdCzpdRSiwFhq9j7yx9ZWxqz_Zq3vfYlTLFCWek,33
416
- deepresearch_flow/recognize/cli.py,sha256=zWUsqvou2h6c5zR_myGaySvK6cG9ItJp9cJFtqqJk7Y,21597
434
+ deepresearch_flow/recognize/cli.py,sha256=QV0d9XhOdcWcr05427GPSSMheal06WvvmejV7wLVfz8,53460
435
+ deepresearch_flow/recognize/katex_check.js,sha256=jKFLk0Y7y_XR0fBJe2xdfQhAMMuYRXo-pSpWqcEyAH0,735
417
436
  deepresearch_flow/recognize/markdown.py,sha256=y-PMJbGqrfWCNBVGanXK1M4OuMP9e1eqh7HDYye5a7Q,8757
437
+ deepresearch_flow/recognize/math.py,sha256=qgI4WRsoWgLaue9OxIq1pcO18wUOlpCNBLKQgicN2hs,22623
438
+ deepresearch_flow/recognize/mermaid.py,sha256=O8uQoEC9mG4mSdTpr-OnmP_vrThaFdUeqt6U00m6O-0,22545
418
439
  deepresearch_flow/recognize/organize.py,sha256=-KVzuwNjiT2bLwqwLwcguEMQYxnGiZXjLNlov_oXSTo,5237
419
440
  deepresearch_flow/translator/__init__.py,sha256=iaAkufvEELVKNbcs08Nh7bkTO4JlkT3rT_JIBP9jGfc,26
420
441
  deepresearch_flow/translator/cli.py,sha256=BceOZhQuN9s5kqhpvLJuwpbB5J0MY1ucWUKw0jXWUPc,16872
@@ -425,9 +446,9 @@ deepresearch_flow/translator/placeholder.py,sha256=mEgqA-dPdOsIhno0h_hzfpXpY2asb
425
446
  deepresearch_flow/translator/prompts.py,sha256=kl_9O2YvmtXC1w6WLnsLuVZKz4mcOtUF887SiTaOvc0,4754
426
447
  deepresearch_flow/translator/protector.py,sha256=sXwNJ1Y8tyPm7dgm8-7S8HkcPe23TGsBdwRxH6mKL70,11291
427
448
  deepresearch_flow/translator/segment.py,sha256=rBFMCLTrvm2GrPc_hNFymi-8Ih2DAtUQlZHCRE9nLaM,5146
428
- deepresearch_flow-0.3.0.dist-info/licenses/LICENSE,sha256=hT8F2Py1pe6flxq3Ufdm2UKFk0B8CBm0aAQfsLXfvjw,1063
429
- deepresearch_flow-0.3.0.dist-info/METADATA,sha256=AJ4RfKW-V9BPhrrlFSP8stAoXG4SwpF-AvZH5HEtWyw,10831
430
- deepresearch_flow-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
431
- deepresearch_flow-0.3.0.dist-info/entry_points.txt,sha256=1uIKscs0YRMg_mFsg9NjsaTt4CvQqQ_-zGERUKhhL_Y,65
432
- deepresearch_flow-0.3.0.dist-info/top_level.txt,sha256=qBl4RvPJNJUbL8CFfMNWxY0HpQLx5RlF_ko-z_aKpm0,18
433
- deepresearch_flow-0.3.0.dist-info/RECORD,,
449
+ deepresearch_flow-0.4.1.dist-info/licenses/LICENSE,sha256=hT8F2Py1pe6flxq3Ufdm2UKFk0B8CBm0aAQfsLXfvjw,1063
450
+ deepresearch_flow-0.4.1.dist-info/METADATA,sha256=bfOksObo91hopsY_NbQNce_FjC8MEW8kkYUjkQQi9Xo,12918
451
+ deepresearch_flow-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
452
+ deepresearch_flow-0.4.1.dist-info/entry_points.txt,sha256=1uIKscs0YRMg_mFsg9NjsaTt4CvQqQ_-zGERUKhhL_Y,65
453
+ deepresearch_flow-0.4.1.dist-info/top_level.txt,sha256=qBl4RvPJNJUbL8CFfMNWxY0HpQLx5RlF_ko-z_aKpm0,18
454
+ deepresearch_flow-0.4.1.dist-info/RECORD,,