deepresearch-flow 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. deepresearch_flow/paper/db.py +184 -0
  2. deepresearch_flow/paper/db_ops.py +1939 -0
  3. deepresearch_flow/paper/web/app.py +38 -3705
  4. deepresearch_flow/paper/web/constants.py +23 -0
  5. deepresearch_flow/paper/web/filters.py +255 -0
  6. deepresearch_flow/paper/web/handlers/__init__.py +14 -0
  7. deepresearch_flow/paper/web/handlers/api.py +217 -0
  8. deepresearch_flow/paper/web/handlers/pages.py +334 -0
  9. deepresearch_flow/paper/web/markdown.py +549 -0
  10. deepresearch_flow/paper/web/static/css/main.css +857 -0
  11. deepresearch_flow/paper/web/static/js/detail.js +406 -0
  12. deepresearch_flow/paper/web/static/js/index.js +266 -0
  13. deepresearch_flow/paper/web/static/js/outline.js +58 -0
  14. deepresearch_flow/paper/web/static/js/stats.js +39 -0
  15. deepresearch_flow/paper/web/templates/base.html +43 -0
  16. deepresearch_flow/paper/web/templates/detail.html +332 -0
  17. deepresearch_flow/paper/web/templates/index.html +114 -0
  18. deepresearch_flow/paper/web/templates/stats.html +29 -0
  19. deepresearch_flow/paper/web/templates.py +85 -0
  20. deepresearch_flow/paper/web/text.py +68 -0
  21. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/METADATA +23 -2
  22. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/RECORD +26 -8
  23. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/WHEEL +0 -0
  24. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/entry_points.txt +0 -0
  25. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
  26. {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,85 @@
1
+ """Jinja2 template utilities for paper web UI.
2
+
3
+ This module provides Jinja2 environment setup and template rendering functions.
4
+ Templates are stored in the 'templates' directory and use the PackageLoader
5
+ for installed package compatibility.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from urllib.parse import quote
11
+
12
+ from jinja2 import Environment, FileSystemLoader, PackageLoader
13
+
14
+ from importlib import metadata
15
+
16
+ from deepresearch_flow import __version__
17
+ from deepresearch_flow.paper.web.constants import PDFJS_VIEWER_PATH, REPO_URL, TEMPLATES_DIR
18
+
19
+
20
def get_jinja_env() -> Environment:
    """Build a Jinja2 environment for the web UI templates.

    A ``PackageLoader`` pointing at the ``templates`` directory of
    ``deepresearch_flow.paper.web`` is tried first. If building that
    environment raises any exception, a ``FileSystemLoader`` rooted at
    ``TEMPLATES_DIR`` is used instead. Autoescaping is enabled in both cases.

    Returns:
        A newly created ``Environment``; nothing is cached here.
    """
    try:
        package_loader = PackageLoader("deepresearch_flow.paper.web", "templates")
        return Environment(loader=package_loader, autoescape=True)
    except Exception:
        # Best-effort fallback: load the templates straight from disk.
        directory_loader = FileSystemLoader(str(TEMPLATES_DIR))
        return Environment(loader=directory_loader, autoescape=True)
40
+
41
+
42
# Lazily created Jinja2 environment shared by all callers of get_template_env().
_jinja_env = None


def get_template_env() -> Environment:
    """Return the shared Jinja2 environment, creating it on first use."""
    global _jinja_env
    if _jinja_env is not None:
        return _jinja_env
    _jinja_env = get_jinja_env()
    return _jinja_env
52
+
53
+
54
# Version string injected into templates; resolved once on first use.
_app_version = None


def _get_app_version() -> str:
    """Return the application version, looking it up only on the first call.

    The installed distribution metadata is preferred; when the distribution
    is not installed, the package's own ``__version__`` is used instead.
    """
    global _app_version
    if _app_version is None:
        try:
            _app_version = metadata.version("deepresearch-flow")
        except metadata.PackageNotFoundError:
            _app_version = __version__
    return _app_version


def render_template(template_name: str, **context) -> str:
    """Render a template with the given context.

    ``app_version`` and ``repo_url`` are added to the context unless the
    caller already supplied them.

    Args:
        template_name: Name of the template file (e.g., "detail.html")
        **context: Key-value pairs to pass to the template

    Returns:
        Rendered HTML string
    """
    # Only touch the distribution metadata when the caller did not provide a
    # version; the lookup result is cached so later renders skip it entirely.
    if "app_version" not in context:
        context["app_version"] = _get_app_version()
    context.setdefault("repo_url", REPO_URL)
    template = get_template_env().get_template(template_name)
    return template.render(**context)
73
+
74
+
75
def build_pdfjs_viewer_url(pdf_url: str) -> str:
    """Return the PDF.js viewer URL that opens ``pdf_url``.

    Args:
        pdf_url: URL of the PDF file. It is percent-encoded with no characters
            treated as safe, so it travels as a single query value.

    Returns:
        ``PDFJS_VIEWER_PATH`` followed by ``?file=<encoded pdf_url>``.
    """
    return f"{PDFJS_VIEWER_PATH}?file={quote(pdf_url, safe='')}"
@@ -0,0 +1,68 @@
1
+ """Text normalization helpers for web rendering."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import html
6
+ import re
7
+
8
# Matches a complete <inline-formula> element, including anything nested in it.
_INLINE_FORMULA_RE = re.compile(r"<inline-formula[^>]*>.*?</inline-formula>", re.IGNORECASE | re.DOTALL)
# Captures the text carried inside a <tex-math> element.
_TEX_MATH_RE = re.compile(r"<tex-math[^>]*>(.*?)</tex-math>", re.IGNORECASE | re.DOTALL)
# Matches markup only: opening/closing tags whose name starts with a letter,
# plus "<!...>" and "<?...>" constructs. A "<" followed by anything else
# (as in "p<0.05 and n>10") is ordinary text and must not be stripped.
_TAG_RE = re.compile(r"<(?:/?[A-Za-z]|[!?])[^>]*>")
# Any run of whitespace, collapsed to a single space by the helpers below.
_WS_RE = re.compile(r"\s+")
# Doubled BibTeX braces ("{{" or "}}").
_VENUE_BRACE_RE = re.compile(r"\{\{|\}\}")


def normalize_title(raw: str) -> str:
    """Normalize paper titles for display by stripping XML/HTML noise.

    Processing order:
      1. Each ``<inline-formula>`` element is replaced by the contents of its
         ``<tex-math>`` child, or removed when it has none.
      2. Remaining markup tags are removed.
      3. HTML entities are unescaped (after tag removal, so an escaped
         ``&lt;`` is never mistaken for the start of a tag).
      4. Whitespace runs are collapsed and the ends trimmed.

    Args:
        raw: Title text as stored; may be empty or ``None``.

    Returns:
        The cleaned title, or ``""`` for falsy input.
    """
    if not raw:
        return ""

    def replace_inline(match: re.Match[str]) -> str:
        # Keep only the TeX source of the formula, if there is any.
        tex = _TEX_MATH_RE.search(match.group(0))
        return tex.group(1) if tex else ""

    text = _INLINE_FORMULA_RE.sub(replace_inline, raw)
    text = _TAG_RE.sub("", text)
    text = html.unescape(text)
    return _WS_RE.sub(" ", text).strip()
32
+
33
+
34
def normalize_venue(raw: str) -> str:
    """Clean a venue string: drop doubled BibTeX braces and tidy whitespace.

    Returns ``""`` for falsy input.
    """
    if raw:
        without_braces = _VENUE_BRACE_RE.sub("", raw)
        return _WS_RE.sub(" ", without_braces).strip()
    return ""
41
+
42
+
43
def extract_summary_snippet(paper: dict[str, object], max_len: int = 280) -> str:
    """Extract a short summary snippet, preferring the 'simple' template.

    Candidates are tried in this order: ``templates["simple"]["summary"]``,
    ``templates["simple"]["abstract"]``, ``paper["summary"]``,
    ``paper["abstract"]``. Each string candidate has markup tags removed, HTML
    entities unescaped and whitespace collapsed; the first one that still has
    text after cleaning is used. A candidate made only of markup therefore no
    longer blocks the later ones.

    Args:
        paper: Paper record to read the summary from.
        max_len: Maximum length of the returned snippet, ellipsis included.

    Returns:
        The cleaned snippet, truncated with a trailing "…" when it exceeds
        ``max_len``; ``""`` when no usable text exists or ``max_len`` is not
        positive.
    """
    # A non-positive limit would turn the truncation slice into a negative
    # index and return almost the whole text; nothing fits, so return nothing.
    if max_len <= 0:
        return ""

    sources: list[object] = []
    templates = paper.get("templates")
    if isinstance(templates, dict):
        sources.append(templates.get("simple"))
    sources.append(paper)

    for source in sources:
        if not isinstance(source, dict):
            continue
        for key in ("summary", "abstract"):
            value = source.get(key)
            if not isinstance(value, str):
                continue
            cleaned = _TAG_RE.sub("", value)
            cleaned = html.unescape(cleaned)
            cleaned = _WS_RE.sub(" ", cleaned).strip()
            if not cleaned:
                continue
            if len(cleaned) > max_len:
                # Reserve one character for the ellipsis.
                return cleaned[: max_len - 1].rstrip() + "…"
            return cleaned
    return ""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deepresearch-flow
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: Workflow tools for paper extraction, review, and research automation.
5
5
  Author-email: DengQi <dengqi935@gmail.com>
6
6
  License: MIT License
@@ -121,6 +121,7 @@ DeepResearch Flow provides a unified pipeline to **Repair**, **Translate**, **Ex
121
121
  - **Smart Extraction**: Turn unstructured Markdown into schema-enforced JSON (summaries, metadata, Q&A) using LLMs (OpenAI, Claude, Gemini, etc.).
122
122
  - **Precision Translation**: Translate OCR Markdown to Chinese/Japanese (`.zh.md`, `.ja.md`) while **freezing** formulas, code, tables, and references. No more broken layout.
123
123
  - **Local Knowledge DB**: A high-performance local Web UI to browse papers with **Split View** (Source vs. Translated vs. Summary), full-text search, and multi-dimensional filtering.
124
+ - **Coverage Compare**: Compare JSON/PDF/Markdown/Translated datasets to find missing artifacts and export CSV reports.
124
125
  - **OCR Post-Processing**: Automatically fix broken references (`[1]` -> `[^1]`), merge split paragraphs, and standardize layouts.
125
126
 
126
127
  ---
@@ -246,7 +247,27 @@ uv run deepresearch-flow paper db serve \
246
247
  </details>
247
248
 
248
249
  <details>
249
- <summary><strong>4. Recognize: OCR Post-Processing</strong></summary>
250
+ <summary><strong>4. Paper DB Compare: Coverage Audit</strong></summary>
251
+
252
+ Compare two datasets (A/B) to find missing PDFs, markdowns, translations, or JSON items, with match metadata.
253
+
254
+ ```bash
255
+ uv run deepresearch-flow paper db compare \
256
+ --input-a ./a.json \
257
+ --md-root-b ./md_root \
258
+ --output-csv ./compare.csv
259
+
260
+ # Compare translated markdowns by language
261
+ uv run deepresearch-flow paper db compare \
262
+ --md-translated-root-a ./translated_a \
263
+ --md-translated-root-b ./translated_b \
264
+ --lang zh
265
+ ```
266
+
267
+ </details>
268
+
269
+ <details>
270
+ <summary><strong>5. Recognize: OCR Post-Processing</strong></summary>
250
271
 
251
272
  Tools to clean up raw outputs from OCR engines like MinerU.
252
273
 
@@ -4,7 +4,8 @@ deepresearch_flow/cli.py,sha256=t4oowCNWldL0DrVJ4d0UlRkuGU2qHej_G0mAc_quteQ,455
4
4
  deepresearch_flow/paper/__init__.py,sha256=sunaOkcgAJBrfmcaJTumcWbPGVUSGWvOv2a2Yidzy0A,43
5
5
  deepresearch_flow/paper/cli.py,sha256=4UY3KHi6BUGztL1vB4w0cCMiIAo9KNxrfQn1GBHt6fA,11153
6
6
  deepresearch_flow/paper/config.py,sha256=totVBGzouh0KS6mhRNPneXZYPuuw0SHiOGdO3r6HSfc,9289
7
- deepresearch_flow/paper/db.py,sha256=ymVLzSEXDksdhLNSdvNA2IWLzT5lQOG1CpJlPU9CSQ8,33586
7
+ deepresearch_flow/paper/db.py,sha256=i3v3n-YrG-kPpc62C9-InhEfInoZMBQd-r_pYz_fO_A,41847
8
+ deepresearch_flow/paper/db_ops.py,sha256=l0lNPP1v00ZtdQb7ZAWE_tUf2JUzqKWxU1wwzlEjDrw,69766
8
9
  deepresearch_flow/paper/extract.py,sha256=ID1dd2r6LTB0kRF4qBSH6bGtBGv0znw--g_mXYBcoeU,32314
9
10
  deepresearch_flow/paper/llm.py,sha256=mHfs5IkT3Q6BOh46MDlfUmgVTX24WRf0IKKoOnN8nV8,4007
10
11
  deepresearch_flow/paper/prompts.py,sha256=mV7cEXw8pwukBUE4Trah0SjEPSSDgg5-RGaNaUdo4EU,519
@@ -40,8 +41,16 @@ deepresearch_flow/paper/templates/default_paper.md.j2,sha256=3azu48534QtLtHrCwI1
40
41
  deepresearch_flow/paper/templates/eight_questions.md.j2,sha256=Ecz4CD3nd7jZ4Dg8himZkTwF4WDkk0ILWk8V728uOPI,3038
41
42
  deepresearch_flow/paper/templates/three_pass.md.j2,sha256=ZRj-NkpZePnqp0gSE8OT1dN5Lr5RW4vdOYdeVejYJW0,1576
42
43
  deepresearch_flow/paper/web/__init__.py,sha256=eQBtBjvOYsNEdivHTI0aO286SCG2c86xI02tf-0jz5I,39
43
- deepresearch_flow/paper/web/app.py,sha256=nb4uzsDJ2R5dz_WA69NKwTgVgMqAyZv5OZ88GxFTWLQ,133311
44
+ deepresearch_flow/paper/web/app.py,sha256=rXnQjffyzH5b64oCwv6ucihU_y5zaFbpzdEB5PRUvHc,3063
45
+ deepresearch_flow/paper/web/constants.py,sha256=DzE1TO5Sd-1pfy8ww71J6LnS6cta_Je84jmXyxJ6DNI,1085
46
+ deepresearch_flow/paper/web/filters.py,sha256=OVMB4GfigP9GPD5dXytHyeLYtnVXEK-QjYfA_k7QbaA,8315
47
+ deepresearch_flow/paper/web/markdown.py,sha256=QHrxUYKB-uAZjG5jVGmkQ6EIT2dSxQNzlibgjGIIKuA,18888
44
48
  deepresearch_flow/paper/web/query.py,sha256=vTegfm5zGVkYCd6_K3yNrXJEmKMccUUFKG9DePPcKMw,1938
49
+ deepresearch_flow/paper/web/templates.py,sha256=suJ67-nwWdExNVx8vvcInwqiHu6bhslaEFS1ouifLto,2515
50
+ deepresearch_flow/paper/web/text.py,sha256=OiqOEzNepPXxcCIal38bxkUarIkcOXG6a30luxObFOI,2199
51
+ deepresearch_flow/paper/web/handlers/__init__.py,sha256=HGQud4xuEtdB9eVYPzzilXV9ool-1Db5UU29WJ6cjNk,295
52
+ deepresearch_flow/paper/web/handlers/api.py,sha256=Z7H0nr1cSIj1-nR6ZxhxtU6-4sjiuqzy1U1OpK56B0g,9014
53
+ deepresearch_flow/paper/web/handlers/pages.py,sha256=euORL0_Avmqy-kOPKOfVQxyeQjLU4a6EBIufmwoLeCM,12247
45
54
  deepresearch_flow/paper/web/pdfjs/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
46
55
  deepresearch_flow/paper/web/pdfjs/build/pdf.js,sha256=2Ddm8gpMMfvOWinZh4nN--94GxR0QdpFvh0Qeejg-Bw,568294
47
56
  deepresearch_flow/paper/web/pdfjs/build/pdf.js.map,sha256=W0nwVFY4inhYxz1raDU6NZ6-rNA21FxLj13txVAqbm4,1434098
@@ -412,6 +421,15 @@ deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Bold.ttf,sha
412
421
  deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-BoldItalic.ttf,sha256=oiQHWsF0la0KOvO8CkGawHBKiz_RCVRWIB-5sJX8KB0,135124
413
422
  deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Italic.ttf,sha256=gytEBtvvI2KIANOqrSEEhTSshNfjrZVb6DuBcu2O9RI,162036
414
423
  deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Regular.ttf,sha256=-Kzh-JKyvZ3BeSun8Jf6dYj4T-1IMhSA4E3lOQgoIh8,139512
424
+ deepresearch_flow/paper/web/static/css/main.css,sha256=oUuEFEi4YP6bIlEQlIz-zQEQje7hRq3j63imvtJ6IQ4,15386
425
+ deepresearch_flow/paper/web/static/js/detail.js,sha256=9bZmTID74otrZxJfHDJRMWuI_x1pgk71E3Zu2Q6sBIA,13368
426
+ deepresearch_flow/paper/web/static/js/index.js,sha256=bbQz8QAewmu3TT8ImAzUqNtTWQCMKwVOQfU0Lkw6Lv0,10460
427
+ deepresearch_flow/paper/web/static/js/outline.js,sha256=e9ydLcBqaTXOYULXt-1OKgKIzrZcZaH1RebPXWBbLvE,1882
428
+ deepresearch_flow/paper/web/static/js/stats.js,sha256=USGIAx9cPQTMeyFwYu_bTYPJM7OoiqimhCYuAjoP0-s,1420
429
+ deepresearch_flow/paper/web/templates/base.html,sha256=4gWJLvjOuDSnBYRpJqxhGKmKC6UuOl19q_Q_cOjhL-g,1806
430
+ deepresearch_flow/paper/web/templates/detail.html,sha256=jM7rkMu1rQs-kx7LHfLwUvv4yJlhua4eE-wZoOJPufA,16332
431
+ deepresearch_flow/paper/web/templates/index.html,sha256=eQJgjr-RuKPcDxdCJG_hM6KvlMYIp-OSG0oQEJHO7Is,6117
432
+ deepresearch_flow/paper/web/templates/stats.html,sha256=bcQBawoZ9KoRkM0NNo9WJBVeN_8O1WU2xNiye-Fugyo,671
415
433
  deepresearch_flow/recognize/__init__.py,sha256=yMAqbdCzpdRSiwFhq9j7yx9ZWxqz_Zq3vfYlTLFCWek,33
416
434
  deepresearch_flow/recognize/cli.py,sha256=zWUsqvou2h6c5zR_myGaySvK6cG9ItJp9cJFtqqJk7Y,21597
417
435
  deepresearch_flow/recognize/markdown.py,sha256=y-PMJbGqrfWCNBVGanXK1M4OuMP9e1eqh7HDYye5a7Q,8757
@@ -425,9 +443,9 @@ deepresearch_flow/translator/placeholder.py,sha256=mEgqA-dPdOsIhno0h_hzfpXpY2asb
425
443
  deepresearch_flow/translator/prompts.py,sha256=kl_9O2YvmtXC1w6WLnsLuVZKz4mcOtUF887SiTaOvc0,4754
426
444
  deepresearch_flow/translator/protector.py,sha256=sXwNJ1Y8tyPm7dgm8-7S8HkcPe23TGsBdwRxH6mKL70,11291
427
445
  deepresearch_flow/translator/segment.py,sha256=rBFMCLTrvm2GrPc_hNFymi-8Ih2DAtUQlZHCRE9nLaM,5146
428
- deepresearch_flow-0.3.0.dist-info/licenses/LICENSE,sha256=hT8F2Py1pe6flxq3Ufdm2UKFk0B8CBm0aAQfsLXfvjw,1063
429
- deepresearch_flow-0.3.0.dist-info/METADATA,sha256=AJ4RfKW-V9BPhrrlFSP8stAoXG4SwpF-AvZH5HEtWyw,10831
430
- deepresearch_flow-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
431
- deepresearch_flow-0.3.0.dist-info/entry_points.txt,sha256=1uIKscs0YRMg_mFsg9NjsaTt4CvQqQ_-zGERUKhhL_Y,65
432
- deepresearch_flow-0.3.0.dist-info/top_level.txt,sha256=qBl4RvPJNJUbL8CFfMNWxY0HpQLx5RlF_ko-z_aKpm0,18
433
- deepresearch_flow-0.3.0.dist-info/RECORD,,
446
+ deepresearch_flow-0.4.0.dist-info/licenses/LICENSE,sha256=hT8F2Py1pe6flxq3Ufdm2UKFk0B8CBm0aAQfsLXfvjw,1063
447
+ deepresearch_flow-0.4.0.dist-info/METADATA,sha256=FKueIvCHzloXlQk71dliJK29rxEC9tyMOMS7ISXdfAY,11476
448
+ deepresearch_flow-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
449
+ deepresearch_flow-0.4.0.dist-info/entry_points.txt,sha256=1uIKscs0YRMg_mFsg9NjsaTt4CvQqQ_-zGERUKhhL_Y,65
450
+ deepresearch_flow-0.4.0.dist-info/top_level.txt,sha256=qBl4RvPJNJUbL8CFfMNWxY0HpQLx5RlF_ko-z_aKpm0,18
451
+ deepresearch_flow-0.4.0.dist-info/RECORD,,