deepresearch-flow 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/db.py +184 -0
- deepresearch_flow/paper/db_ops.py +1939 -0
- deepresearch_flow/paper/web/app.py +38 -3705
- deepresearch_flow/paper/web/constants.py +23 -0
- deepresearch_flow/paper/web/filters.py +255 -0
- deepresearch_flow/paper/web/handlers/__init__.py +14 -0
- deepresearch_flow/paper/web/handlers/api.py +217 -0
- deepresearch_flow/paper/web/handlers/pages.py +334 -0
- deepresearch_flow/paper/web/markdown.py +549 -0
- deepresearch_flow/paper/web/static/css/main.css +857 -0
- deepresearch_flow/paper/web/static/js/detail.js +406 -0
- deepresearch_flow/paper/web/static/js/index.js +266 -0
- deepresearch_flow/paper/web/static/js/outline.js +58 -0
- deepresearch_flow/paper/web/static/js/stats.js +39 -0
- deepresearch_flow/paper/web/templates/base.html +43 -0
- deepresearch_flow/paper/web/templates/detail.html +332 -0
- deepresearch_flow/paper/web/templates/index.html +114 -0
- deepresearch_flow/paper/web/templates/stats.html +29 -0
- deepresearch_flow/paper/web/templates.py +85 -0
- deepresearch_flow/paper/web/text.py +68 -0
- deepresearch_flow/recognize/cli.py +805 -26
- deepresearch_flow/recognize/katex_check.js +29 -0
- deepresearch_flow/recognize/math.py +719 -0
- deepresearch_flow/recognize/mermaid.py +690 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/METADATA +78 -4
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/RECORD +30 -9
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.1.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: deepresearch-flow
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.1
|
|
4
4
|
Summary: Workflow tools for paper extraction, review, and research automation.
|
|
5
5
|
Author-email: DengQi <dengqi935@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -51,9 +51,10 @@ Requires-Dist: jsonschema>=4.21.1
|
|
|
51
51
|
Requires-Dist: markdown-it-py>=3.0.0
|
|
52
52
|
Requires-Dist: mdit-py-plugins>=0.4.0
|
|
53
53
|
Requires-Dist: pypdf>=3.0.0
|
|
54
|
+
Requires-Dist: pylatexenc>=2.10
|
|
54
55
|
Requires-Dist: pybtex>=0.24.0
|
|
55
56
|
Requires-Dist: rich>=13.7.1
|
|
56
|
-
Requires-Dist: rumdl>=0.0.
|
|
57
|
+
Requires-Dist: rumdl>=0.0.218
|
|
57
58
|
Requires-Dist: starlette>=0.37.2
|
|
58
59
|
Requires-Dist: tqdm>=4.66.4
|
|
59
60
|
Requires-Dist: uvicorn>=0.27.1
|
|
@@ -121,6 +122,7 @@ DeepResearch Flow provides a unified pipeline to **Repair**, **Translate**, **Ex
|
|
|
121
122
|
- **Smart Extraction**: Turn unstructured Markdown into schema-enforced JSON (summaries, metadata, Q&A) using LLMs (OpenAI, Claude, Gemini, etc.).
|
|
122
123
|
- **Precision Translation**: Translate OCR Markdown to Chinese/Japanese (`.zh.md`, `.ja.md`) while **freezing** formulas, code, tables, and references. No more broken layout.
|
|
123
124
|
- **Local Knowledge DB**: A high-performance local Web UI to browse papers with **Split View** (Source vs. Translated vs. Summary), full-text search, and multi-dimensional filtering.
|
|
125
|
+
- **Coverage Compare**: Compare JSON/PDF/Markdown/Translated datasets to find missing artifacts and export CSV reports.
|
|
124
126
|
- **OCR Post-Processing**: Automatically fix broken references (`[1]` -> `[^1]`), merge split paragraphs, and standardize layouts.
|
|
125
127
|
|
|
126
128
|
---
|
|
@@ -171,7 +173,36 @@ uv run deepresearch-flow translator translate \
|
|
|
171
173
|
--fix-level moderate
|
|
172
174
|
```
|
|
173
175
|
|
|
174
|
-
#### Step 3: Serve Your Database
|
|
176
|
+
#### Step 3: Repair OCR Outputs (Recommended)
|
|
177
|
+
|
|
178
|
+
Recommended sequence to stabilize markdown before serving:
|
|
179
|
+
|
|
180
|
+
```bash
|
|
181
|
+
# 1) Fix OCR markdown (auto-detects JSON if inputs are .json)
|
|
182
|
+
uv run deepresearch-flow recognize fix \
|
|
183
|
+
--input ./docs \
|
|
184
|
+
--in-place
|
|
185
|
+
|
|
186
|
+
# 2) Fix LaTeX formulas
|
|
187
|
+
uv run deepresearch-flow recognize fix-math \
|
|
188
|
+
--input ./docs \
|
|
189
|
+
--model openai/gpt-4o-mini \
|
|
190
|
+
--in-place
|
|
191
|
+
|
|
192
|
+
# 3) Fix Mermaid diagrams
|
|
193
|
+
uv run deepresearch-flow recognize fix-mermaid \
|
|
194
|
+
--input ./paper_outputs \
|
|
195
|
+
--json \
|
|
196
|
+
--model openai/gpt-4o-mini \
|
|
197
|
+
--in-place
|
|
198
|
+
|
|
199
|
+
# 4) Fix again to normalize formatting
|
|
200
|
+
uv run deepresearch-flow recognize fix \
|
|
201
|
+
--input ./docs \
|
|
202
|
+
--in-place
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
#### Step 4: Serve Your Database
|
|
175
206
|
|
|
176
207
|
Launch a local UI to read and manage your papers.
|
|
177
208
|
|
|
@@ -246,7 +277,27 @@ uv run deepresearch-flow paper db serve \
|
|
|
246
277
|
</details>
|
|
247
278
|
|
|
248
279
|
<details>
|
|
249
|
-
<summary><strong>4. Recognize: OCR Post-Processing</strong></summary>
|
|
280
|
+
<summary><strong>4. Paper DB Compare: Coverage Audit</strong></summary>
|
|
281
|
+
|
|
282
|
+
Compare two datasets (A/B) to find missing PDFs, markdowns, translations, or JSON items, with match metadata.
|
|
283
|
+
|
|
284
|
+
```bash
|
|
285
|
+
uv run deepresearch-flow paper db compare \
|
|
286
|
+
--input-a ./a.json \
|
|
287
|
+
--md-root-b ./md_root \
|
|
288
|
+
--output-csv ./compare.csv
|
|
289
|
+
|
|
290
|
+
# Compare translated markdowns by language
|
|
291
|
+
uv run deepresearch-flow paper db compare \
|
|
292
|
+
--md-translated-root-a ./translated_a \
|
|
293
|
+
--md-translated-root-b ./translated_b \
|
|
294
|
+
--lang zh
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
</details>
|
|
298
|
+
|
|
299
|
+
<details>
|
|
300
|
+
<summary><strong>5. Recognize: OCR Post-Processing</strong></summary>
|
|
250
301
|
|
|
251
302
|
Tools to clean up raw outputs from OCR engines like MinerU.
|
|
252
303
|
|
|
@@ -254,6 +305,10 @@ Tools to clean up raw outputs from OCR engines like MinerU.
|
|
|
254
305
|
- Unpack Images: extract Base64 images back to files.
|
|
255
306
|
- Organize: flatten nested OCR output directories.
|
|
256
307
|
- Fix: apply OCR fixes and rumdl formatting during organize, or as a standalone step.
|
|
308
|
+
- Fix JSON: apply the same fixes to markdown fields inside paper JSON outputs.
|
|
309
|
+
- Fix Math: validate and repair LaTeX formulas with optional LLM assistance.
|
|
310
|
+
- Fix Mermaid: validate and repair Mermaid diagrams (requires `mmdc` from mermaid-cli).
|
|
311
|
+
- Recommended order: `fix` -> `fix-math` -> `fix-mermaid` -> `fix`.
|
|
257
312
|
|
|
258
313
|
```bash
|
|
259
314
|
uv run deepresearch-flow recognize md embed --input ./raw_ocr --output ./clean_md
|
|
@@ -275,6 +330,25 @@ uv run deepresearch-flow recognize fix \
|
|
|
275
330
|
uv run deepresearch-flow recognize fix \
|
|
276
331
|
--input ./ocr_md \
|
|
277
332
|
--in-place
|
|
333
|
+
|
|
334
|
+
# Fix JSON outputs in place
|
|
335
|
+
uv run deepresearch-flow recognize fix \
|
|
336
|
+
--json \
|
|
337
|
+
--input ./paper_outputs \
|
|
338
|
+
--in-place
|
|
339
|
+
|
|
340
|
+
# Fix LaTeX formulas in markdown
|
|
341
|
+
uv run deepresearch-flow recognize fix-math \
|
|
342
|
+
--input ./docs \
|
|
343
|
+
--model openai/gpt-4o-mini \
|
|
344
|
+
--in-place
|
|
345
|
+
|
|
346
|
+
# Fix Mermaid diagrams in JSON outputs
|
|
347
|
+
uv run deepresearch-flow recognize fix-mermaid \
|
|
348
|
+
--json \
|
|
349
|
+
--input ./paper_outputs \
|
|
350
|
+
--model openai/gpt-4o-mini \
|
|
351
|
+
--in-place
|
|
278
352
|
```
|
|
279
353
|
|
|
280
354
|
</details>
|
|
@@ -4,7 +4,8 @@ deepresearch_flow/cli.py,sha256=t4oowCNWldL0DrVJ4d0UlRkuGU2qHej_G0mAc_quteQ,455
|
|
|
4
4
|
deepresearch_flow/paper/__init__.py,sha256=sunaOkcgAJBrfmcaJTumcWbPGVUSGWvOv2a2Yidzy0A,43
|
|
5
5
|
deepresearch_flow/paper/cli.py,sha256=4UY3KHi6BUGztL1vB4w0cCMiIAo9KNxrfQn1GBHt6fA,11153
|
|
6
6
|
deepresearch_flow/paper/config.py,sha256=totVBGzouh0KS6mhRNPneXZYPuuw0SHiOGdO3r6HSfc,9289
|
|
7
|
-
deepresearch_flow/paper/db.py,sha256=
|
|
7
|
+
deepresearch_flow/paper/db.py,sha256=i3v3n-YrG-kPpc62C9-InhEfInoZMBQd-r_pYz_fO_A,41847
|
|
8
|
+
deepresearch_flow/paper/db_ops.py,sha256=l0lNPP1v00ZtdQb7ZAWE_tUf2JUzqKWxU1wwzlEjDrw,69766
|
|
8
9
|
deepresearch_flow/paper/extract.py,sha256=ID1dd2r6LTB0kRF4qBSH6bGtBGv0znw--g_mXYBcoeU,32314
|
|
9
10
|
deepresearch_flow/paper/llm.py,sha256=mHfs5IkT3Q6BOh46MDlfUmgVTX24WRf0IKKoOnN8nV8,4007
|
|
10
11
|
deepresearch_flow/paper/prompts.py,sha256=mV7cEXw8pwukBUE4Trah0SjEPSSDgg5-RGaNaUdo4EU,519
|
|
@@ -40,8 +41,16 @@ deepresearch_flow/paper/templates/default_paper.md.j2,sha256=3azu48534QtLtHrCwI1
|
|
|
40
41
|
deepresearch_flow/paper/templates/eight_questions.md.j2,sha256=Ecz4CD3nd7jZ4Dg8himZkTwF4WDkk0ILWk8V728uOPI,3038
|
|
41
42
|
deepresearch_flow/paper/templates/three_pass.md.j2,sha256=ZRj-NkpZePnqp0gSE8OT1dN5Lr5RW4vdOYdeVejYJW0,1576
|
|
42
43
|
deepresearch_flow/paper/web/__init__.py,sha256=eQBtBjvOYsNEdivHTI0aO286SCG2c86xI02tf-0jz5I,39
|
|
43
|
-
deepresearch_flow/paper/web/app.py,sha256=
|
|
44
|
+
deepresearch_flow/paper/web/app.py,sha256=rXnQjffyzH5b64oCwv6ucihU_y5zaFbpzdEB5PRUvHc,3063
|
|
45
|
+
deepresearch_flow/paper/web/constants.py,sha256=HuuE_oZKckmisD3F_1RAqWzO7bnhNmMLyM8FqyM5Yfk,1085
|
|
46
|
+
deepresearch_flow/paper/web/filters.py,sha256=OVMB4GfigP9GPD5dXytHyeLYtnVXEK-QjYfA_k7QbaA,8315
|
|
47
|
+
deepresearch_flow/paper/web/markdown.py,sha256=QHrxUYKB-uAZjG5jVGmkQ6EIT2dSxQNzlibgjGIIKuA,18888
|
|
44
48
|
deepresearch_flow/paper/web/query.py,sha256=vTegfm5zGVkYCd6_K3yNrXJEmKMccUUFKG9DePPcKMw,1938
|
|
49
|
+
deepresearch_flow/paper/web/templates.py,sha256=suJ67-nwWdExNVx8vvcInwqiHu6bhslaEFS1ouifLto,2515
|
|
50
|
+
deepresearch_flow/paper/web/text.py,sha256=OiqOEzNepPXxcCIal38bxkUarIkcOXG6a30luxObFOI,2199
|
|
51
|
+
deepresearch_flow/paper/web/handlers/__init__.py,sha256=HGQud4xuEtdB9eVYPzzilXV9ool-1Db5UU29WJ6cjNk,295
|
|
52
|
+
deepresearch_flow/paper/web/handlers/api.py,sha256=Z7H0nr1cSIj1-nR6ZxhxtU6-4sjiuqzy1U1OpK56B0g,9014
|
|
53
|
+
deepresearch_flow/paper/web/handlers/pages.py,sha256=euORL0_Avmqy-kOPKOfVQxyeQjLU4a6EBIufmwoLeCM,12247
|
|
45
54
|
deepresearch_flow/paper/web/pdfjs/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
|
|
46
55
|
deepresearch_flow/paper/web/pdfjs/build/pdf.js,sha256=2Ddm8gpMMfvOWinZh4nN--94GxR0QdpFvh0Qeejg-Bw,568294
|
|
47
56
|
deepresearch_flow/paper/web/pdfjs/build/pdf.js.map,sha256=W0nwVFY4inhYxz1raDU6NZ6-rNA21FxLj13txVAqbm4,1434098
|
|
@@ -412,9 +421,21 @@ deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Bold.ttf,sha
|
|
|
412
421
|
deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-BoldItalic.ttf,sha256=oiQHWsF0la0KOvO8CkGawHBKiz_RCVRWIB-5sJX8KB0,135124
|
|
413
422
|
deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Italic.ttf,sha256=gytEBtvvI2KIANOqrSEEhTSshNfjrZVb6DuBcu2O9RI,162036
|
|
414
423
|
deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Regular.ttf,sha256=-Kzh-JKyvZ3BeSun8Jf6dYj4T-1IMhSA4E3lOQgoIh8,139512
|
|
424
|
+
deepresearch_flow/paper/web/static/css/main.css,sha256=oUuEFEi4YP6bIlEQlIz-zQEQje7hRq3j63imvtJ6IQ4,15386
|
|
425
|
+
deepresearch_flow/paper/web/static/js/detail.js,sha256=9bZmTID74otrZxJfHDJRMWuI_x1pgk71E3Zu2Q6sBIA,13368
|
|
426
|
+
deepresearch_flow/paper/web/static/js/index.js,sha256=bbQz8QAewmu3TT8ImAzUqNtTWQCMKwVOQfU0Lkw6Lv0,10460
|
|
427
|
+
deepresearch_flow/paper/web/static/js/outline.js,sha256=e9ydLcBqaTXOYULXt-1OKgKIzrZcZaH1RebPXWBbLvE,1882
|
|
428
|
+
deepresearch_flow/paper/web/static/js/stats.js,sha256=USGIAx9cPQTMeyFwYu_bTYPJM7OoiqimhCYuAjoP0-s,1420
|
|
429
|
+
deepresearch_flow/paper/web/templates/base.html,sha256=4gWJLvjOuDSnBYRpJqxhGKmKC6UuOl19q_Q_cOjhL-g,1806
|
|
430
|
+
deepresearch_flow/paper/web/templates/detail.html,sha256=VC5VbsaAONajZG8_WFSuURCViRXLdi4gH_wDAMt3EVI,16332
|
|
431
|
+
deepresearch_flow/paper/web/templates/index.html,sha256=qNWwyQWa3QzmHdJbohSe5PJOZS3-KxWjk0RxoQSZiys,6117
|
|
432
|
+
deepresearch_flow/paper/web/templates/stats.html,sha256=bcQBawoZ9KoRkM0NNo9WJBVeN_8O1WU2xNiye-Fugyo,671
|
|
415
433
|
deepresearch_flow/recognize/__init__.py,sha256=yMAqbdCzpdRSiwFhq9j7yx9ZWxqz_Zq3vfYlTLFCWek,33
|
|
416
|
-
deepresearch_flow/recognize/cli.py,sha256=
|
|
434
|
+
deepresearch_flow/recognize/cli.py,sha256=QV0d9XhOdcWcr05427GPSSMheal06WvvmejV7wLVfz8,53460
|
|
435
|
+
deepresearch_flow/recognize/katex_check.js,sha256=jKFLk0Y7y_XR0fBJe2xdfQhAMMuYRXo-pSpWqcEyAH0,735
|
|
417
436
|
deepresearch_flow/recognize/markdown.py,sha256=y-PMJbGqrfWCNBVGanXK1M4OuMP9e1eqh7HDYye5a7Q,8757
|
|
437
|
+
deepresearch_flow/recognize/math.py,sha256=qgI4WRsoWgLaue9OxIq1pcO18wUOlpCNBLKQgicN2hs,22623
|
|
438
|
+
deepresearch_flow/recognize/mermaid.py,sha256=O8uQoEC9mG4mSdTpr-OnmP_vrThaFdUeqt6U00m6O-0,22545
|
|
418
439
|
deepresearch_flow/recognize/organize.py,sha256=-KVzuwNjiT2bLwqwLwcguEMQYxnGiZXjLNlov_oXSTo,5237
|
|
419
440
|
deepresearch_flow/translator/__init__.py,sha256=iaAkufvEELVKNbcs08Nh7bkTO4JlkT3rT_JIBP9jGfc,26
|
|
420
441
|
deepresearch_flow/translator/cli.py,sha256=BceOZhQuN9s5kqhpvLJuwpbB5J0MY1ucWUKw0jXWUPc,16872
|
|
@@ -425,9 +446,9 @@ deepresearch_flow/translator/placeholder.py,sha256=mEgqA-dPdOsIhno0h_hzfpXpY2asb
|
|
|
425
446
|
deepresearch_flow/translator/prompts.py,sha256=kl_9O2YvmtXC1w6WLnsLuVZKz4mcOtUF887SiTaOvc0,4754
|
|
426
447
|
deepresearch_flow/translator/protector.py,sha256=sXwNJ1Y8tyPm7dgm8-7S8HkcPe23TGsBdwRxH6mKL70,11291
|
|
427
448
|
deepresearch_flow/translator/segment.py,sha256=rBFMCLTrvm2GrPc_hNFymi-8Ih2DAtUQlZHCRE9nLaM,5146
|
|
428
|
-
deepresearch_flow-0.
|
|
429
|
-
deepresearch_flow-0.
|
|
430
|
-
deepresearch_flow-0.
|
|
431
|
-
deepresearch_flow-0.
|
|
432
|
-
deepresearch_flow-0.
|
|
433
|
-
deepresearch_flow-0.
|
|
449
|
+
deepresearch_flow-0.4.1.dist-info/licenses/LICENSE,sha256=hT8F2Py1pe6flxq3Ufdm2UKFk0B8CBm0aAQfsLXfvjw,1063
|
|
450
|
+
deepresearch_flow-0.4.1.dist-info/METADATA,sha256=bfOksObo91hopsY_NbQNce_FjC8MEW8kkYUjkQQi9Xo,12918
|
|
451
|
+
deepresearch_flow-0.4.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
452
|
+
deepresearch_flow-0.4.1.dist-info/entry_points.txt,sha256=1uIKscs0YRMg_mFsg9NjsaTt4CvQqQ_-zGERUKhhL_Y,65
|
|
453
|
+
deepresearch_flow-0.4.1.dist-info/top_level.txt,sha256=qBl4RvPJNJUbL8CFfMNWxY0HpQLx5RlF_ko-z_aKpm0,18
|
|
454
|
+
deepresearch_flow-0.4.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|