deepresearch-flow 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/db.py +184 -0
- deepresearch_flow/paper/db_ops.py +1939 -0
- deepresearch_flow/paper/web/app.py +38 -3705
- deepresearch_flow/paper/web/constants.py +23 -0
- deepresearch_flow/paper/web/filters.py +255 -0
- deepresearch_flow/paper/web/handlers/__init__.py +14 -0
- deepresearch_flow/paper/web/handlers/api.py +217 -0
- deepresearch_flow/paper/web/handlers/pages.py +334 -0
- deepresearch_flow/paper/web/markdown.py +549 -0
- deepresearch_flow/paper/web/static/css/main.css +857 -0
- deepresearch_flow/paper/web/static/js/detail.js +406 -0
- deepresearch_flow/paper/web/static/js/index.js +266 -0
- deepresearch_flow/paper/web/static/js/outline.js +58 -0
- deepresearch_flow/paper/web/static/js/stats.js +39 -0
- deepresearch_flow/paper/web/templates/base.html +43 -0
- deepresearch_flow/paper/web/templates/detail.html +332 -0
- deepresearch_flow/paper/web/templates/index.html +114 -0
- deepresearch_flow/paper/web/templates/stats.html +29 -0
- deepresearch_flow/paper/web/templates.py +85 -0
- deepresearch_flow/paper/web/text.py +68 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/METADATA +23 -2
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/RECORD +26 -8
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.3.0.dist-info → deepresearch_flow-0.4.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Jinja2 template utilities for paper web UI.
|
|
2
|
+
|
|
3
|
+
This module provides Jinja2 environment setup and template rendering functions.
|
|
4
|
+
Templates are stored in the 'templates' directory and use the PackageLoader
|
|
5
|
+
for installed package compatibility.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from urllib.parse import quote
|
|
11
|
+
|
|
12
|
+
from jinja2 import Environment, FileSystemLoader, PackageLoader
|
|
13
|
+
|
|
14
|
+
from importlib import metadata
|
|
15
|
+
|
|
16
|
+
from deepresearch_flow import __version__
|
|
17
|
+
from deepresearch_flow.paper.web.constants import PDFJS_VIEWER_PATH, REPO_URL, TEMPLATES_DIR
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def get_jinja_env() -> Environment:
    """Build a fresh Jinja2 environment for the paper web UI templates.

    Templates are first resolved through a ``PackageLoader`` pointing at the
    ``templates`` directory of the ``deepresearch_flow.paper.web`` package.
    If building that environment raises any exception, a
    ``FileSystemLoader`` rooted at ``TEMPLATES_DIR`` is used instead.
    Autoescaping is enabled in both cases.

    Returns:
        A newly constructed ``Environment``; this function does no caching.
    """
    try:
        # Preferred path: resolve templates through the package itself.
        package_loader = PackageLoader("deepresearch_flow.paper.web", "templates")
        return Environment(loader=package_loader, autoescape=True)
    except Exception:
        # Deliberately broad: any failure above falls back to the on-disk
        # templates directory.
        disk_loader = FileSystemLoader(str(TEMPLATES_DIR))
        return Environment(loader=disk_loader, autoescape=True)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Lazily created Jinja2 environment shared by every caller in this module.
_jinja_env = None


def get_template_env() -> Environment:
    """Return the shared Jinja2 environment, creating it on first use."""
    global _jinja_env
    if _jinja_env is not None:
        return _jinja_env
    # First call: build the environment once and remember it.
    _jinja_env = get_jinja_env()
    return _jinja_env
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def render_template(template_name: str, **context) -> str:
    """Render a named template to a string.

    ``app_version`` and ``repo_url`` are added to the context unless the
    caller already supplied them. The version comes from the installed
    ``deepresearch-flow`` distribution metadata, falling back to the
    package's ``__version__`` when that distribution is not found.

    Args:
        template_name: Name of the template file (e.g., "detail.html").
        **context: Key-value pairs made available to the template.

    Returns:
        The rendered template as a string.
    """
    environment = get_template_env()
    try:
        version = metadata.version("deepresearch-flow")
    except metadata.PackageNotFoundError:
        version = __version__
    # Caller-provided values win; these are only defaults.
    for key, default in (("app_version", version), ("repo_url", REPO_URL)):
        context.setdefault(key, default)
    return environment.get_template(template_name).render(**context)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def build_pdfjs_viewer_url(pdf_url: str) -> str:
    """Return the PDF.js viewer URL that opens ``pdf_url``.

    Args:
        pdf_url: The URL of the PDF file.

    Returns:
        ``PDFJS_VIEWER_PATH`` followed by a ``file`` query parameter holding
        the percent-encoded PDF URL.
    """
    # safe="" makes quote() percent-encode "/" as well, so the whole PDF URL
    # is carried as a single query value.
    return "{}?file={}".format(PDFJS_VIEWER_PATH, quote(pdf_url, safe=""))
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Text normalization helpers for web rendering."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import html
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
# A whole <inline-formula>...</inline-formula> element, matched
# case-insensitively and across newlines.
_INLINE_FORMULA_RE = re.compile(
    r"<inline-formula[^>]*>.*?</inline-formula>",
    flags=re.IGNORECASE | re.DOTALL,
)
# A <tex-math> element; group 1 captures its content.
_TEX_MATH_RE = re.compile(
    r"<tex-math[^>]*>(.*?)</tex-math>",
    flags=re.IGNORECASE | re.DOTALL,
)
# Any single angle-bracket tag.
_TAG_RE = re.compile(r"<[^>]+>")
# A run of one or more whitespace characters.
_WS_RE = re.compile(r"\s+")
# A doubled opening or closing brace.
_VENUE_BRACE_RE = re.compile(r"\{\{|\}\}")


def normalize_title(raw: str) -> str:
    """Clean a paper title for display.

    Each ``<inline-formula>`` element is replaced by the content of its
    ``<tex-math>`` child (or dropped when it has none), remaining tags are
    removed, HTML entities are unescaped, and whitespace runs are collapsed
    to single spaces. A falsy ``raw`` yields an empty string.
    """
    if not raw:
        return ""

    def _formula_to_tex(found: "re.Match[str]") -> str:
        # Keep only the TeX source of the formula, if there is one.
        inner = _TEX_MATH_RE.search(found.group(0))
        return inner.group(1) if inner else ""

    without_formulas = _INLINE_FORMULA_RE.sub(_formula_to_tex, raw)
    # Tags are stripped before unescaping, so escaped markup such as
    # "&lt;b&gt;" survives as literal text.
    plain = html.unescape(_TAG_RE.sub("", without_formulas))
    return _WS_RE.sub(" ", plain).strip()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def normalize_venue(raw: str) -> str:
    """Clean a venue string for display.

    Doubled braces (``{{`` and ``}}``) are removed and whitespace runs are
    collapsed to single spaces. A falsy ``raw`` yields an empty string.
    """
    if raw:
        unbraced = _VENUE_BRACE_RE.sub("", raw)
        return _WS_RE.sub(" ", unbraced).strip()
    return ""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _first_summary_text(mapping: object) -> str:
    """Return the first non-blank "summary"/"abstract" string in ``mapping``, stripped."""
    if not isinstance(mapping, dict):
        return ""
    for key in ("summary", "abstract"):
        value = mapping.get(key)
        if isinstance(value, str) and value.strip():
            return value.strip()
    return ""


def extract_summary_snippet(paper: dict[str, object], max_len: int = 280) -> str:
    """Extract a short plain-text summary snippet, preferring the 'simple' template.

    The text is taken from ``paper["templates"]["simple"]`` when that holds a
    non-blank "summary" or "abstract" string (checked in that order), and
    otherwise from the same keys on ``paper`` itself. Tags are stripped, HTML
    entities are unescaped, and whitespace runs are collapsed.

    Args:
        paper: Paper record to read the summary from.
        max_len: Maximum length of the returned snippet, including the
            trailing ellipsis added when the text is cut.

    Returns:
        The cleaned snippet, or "" when no summary text is available or
        ``max_len`` is not positive.
    """
    templates = paper.get("templates")
    simple = templates.get("simple") if isinstance(templates, dict) else None
    summary = _first_summary_text(simple) or _first_summary_text(paper)
    # A non-positive limit cannot hold any text. Without this guard the slice
    # below would get a negative bound and return nearly the whole summary.
    if not summary or max_len <= 0:
        return ""
    summary = _TAG_RE.sub("", summary)
    summary = html.unescape(summary)
    summary = _WS_RE.sub(" ", summary).strip()
    if len(summary) > max_len:
        # Reserve one character for the ellipsis.
        return summary[: max_len - 1].rstrip() + "…"
    return summary
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: deepresearch-flow
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: Workflow tools for paper extraction, review, and research automation.
|
|
5
5
|
Author-email: DengQi <dengqi935@gmail.com>
|
|
6
6
|
License: MIT License
|
|
@@ -121,6 +121,7 @@ DeepResearch Flow provides a unified pipeline to **Repair**, **Translate**, **Ex
|
|
|
121
121
|
- **Smart Extraction**: Turn unstructured Markdown into schema-enforced JSON (summaries, metadata, Q&A) using LLMs (OpenAI, Claude, Gemini, etc.).
|
|
122
122
|
- **Precision Translation**: Translate OCR Markdown to Chinese/Japanese (`.zh.md`, `.ja.md`) while **freezing** formulas, code, tables, and references. No more broken layout.
|
|
123
123
|
- **Local Knowledge DB**: A high-performance local Web UI to browse papers with **Split View** (Source vs. Translated vs. Summary), full-text search, and multi-dimensional filtering.
|
|
124
|
+
- **Coverage Compare**: Compare JSON/PDF/Markdown/Translated datasets to find missing artifacts and export CSV reports.
|
|
124
125
|
- **OCR Post-Processing**: Automatically fix broken references (`[1]` -> `[^1]`), merge split paragraphs, and standardize layouts.
|
|
125
126
|
|
|
126
127
|
---
|
|
@@ -246,7 +247,27 @@ uv run deepresearch-flow paper db serve \
|
|
|
246
247
|
</details>
|
|
247
248
|
|
|
248
249
|
<details>
|
|
249
|
-
<summary><strong>4. Recognize: OCR Post-Processing</strong></summary>
|
|
250
|
+
<summary><strong>4. Paper DB Compare: Coverage Audit</strong></summary>
|
|
251
|
+
|
|
252
|
+
Compare two datasets (A/B) to find missing PDFs, markdowns, translations, or JSON items, with match metadata.
|
|
253
|
+
|
|
254
|
+
```bash
|
|
255
|
+
uv run deepresearch-flow paper db compare \
|
|
256
|
+
--input-a ./a.json \
|
|
257
|
+
--md-root-b ./md_root \
|
|
258
|
+
--output-csv ./compare.csv
|
|
259
|
+
|
|
260
|
+
# Compare translated markdowns by language
|
|
261
|
+
uv run deepresearch-flow paper db compare \
|
|
262
|
+
--md-translated-root-a ./translated_a \
|
|
263
|
+
--md-translated-root-b ./translated_b \
|
|
264
|
+
--lang zh
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
</details>
|
|
268
|
+
|
|
269
|
+
<details>
|
|
270
|
+
<summary><strong>5. Recognize: OCR Post-Processing</strong></summary>
|
|
250
271
|
|
|
251
272
|
Tools to clean up raw outputs from OCR engines like MinerU.
|
|
252
273
|
|
|
@@ -4,7 +4,8 @@ deepresearch_flow/cli.py,sha256=t4oowCNWldL0DrVJ4d0UlRkuGU2qHej_G0mAc_quteQ,455
|
|
|
4
4
|
deepresearch_flow/paper/__init__.py,sha256=sunaOkcgAJBrfmcaJTumcWbPGVUSGWvOv2a2Yidzy0A,43
|
|
5
5
|
deepresearch_flow/paper/cli.py,sha256=4UY3KHi6BUGztL1vB4w0cCMiIAo9KNxrfQn1GBHt6fA,11153
|
|
6
6
|
deepresearch_flow/paper/config.py,sha256=totVBGzouh0KS6mhRNPneXZYPuuw0SHiOGdO3r6HSfc,9289
|
|
7
|
-
deepresearch_flow/paper/db.py,sha256=
|
|
7
|
+
deepresearch_flow/paper/db.py,sha256=i3v3n-YrG-kPpc62C9-InhEfInoZMBQd-r_pYz_fO_A,41847
|
|
8
|
+
deepresearch_flow/paper/db_ops.py,sha256=l0lNPP1v00ZtdQb7ZAWE_tUf2JUzqKWxU1wwzlEjDrw,69766
|
|
8
9
|
deepresearch_flow/paper/extract.py,sha256=ID1dd2r6LTB0kRF4qBSH6bGtBGv0znw--g_mXYBcoeU,32314
|
|
9
10
|
deepresearch_flow/paper/llm.py,sha256=mHfs5IkT3Q6BOh46MDlfUmgVTX24WRf0IKKoOnN8nV8,4007
|
|
10
11
|
deepresearch_flow/paper/prompts.py,sha256=mV7cEXw8pwukBUE4Trah0SjEPSSDgg5-RGaNaUdo4EU,519
|
|
@@ -40,8 +41,16 @@ deepresearch_flow/paper/templates/default_paper.md.j2,sha256=3azu48534QtLtHrCwI1
|
|
|
40
41
|
deepresearch_flow/paper/templates/eight_questions.md.j2,sha256=Ecz4CD3nd7jZ4Dg8himZkTwF4WDkk0ILWk8V728uOPI,3038
|
|
41
42
|
deepresearch_flow/paper/templates/three_pass.md.j2,sha256=ZRj-NkpZePnqp0gSE8OT1dN5Lr5RW4vdOYdeVejYJW0,1576
|
|
42
43
|
deepresearch_flow/paper/web/__init__.py,sha256=eQBtBjvOYsNEdivHTI0aO286SCG2c86xI02tf-0jz5I,39
|
|
43
|
-
deepresearch_flow/paper/web/app.py,sha256=
|
|
44
|
+
deepresearch_flow/paper/web/app.py,sha256=rXnQjffyzH5b64oCwv6ucihU_y5zaFbpzdEB5PRUvHc,3063
|
|
45
|
+
deepresearch_flow/paper/web/constants.py,sha256=DzE1TO5Sd-1pfy8ww71J6LnS6cta_Je84jmXyxJ6DNI,1085
|
|
46
|
+
deepresearch_flow/paper/web/filters.py,sha256=OVMB4GfigP9GPD5dXytHyeLYtnVXEK-QjYfA_k7QbaA,8315
|
|
47
|
+
deepresearch_flow/paper/web/markdown.py,sha256=QHrxUYKB-uAZjG5jVGmkQ6EIT2dSxQNzlibgjGIIKuA,18888
|
|
44
48
|
deepresearch_flow/paper/web/query.py,sha256=vTegfm5zGVkYCd6_K3yNrXJEmKMccUUFKG9DePPcKMw,1938
|
|
49
|
+
deepresearch_flow/paper/web/templates.py,sha256=suJ67-nwWdExNVx8vvcInwqiHu6bhslaEFS1ouifLto,2515
|
|
50
|
+
deepresearch_flow/paper/web/text.py,sha256=OiqOEzNepPXxcCIal38bxkUarIkcOXG6a30luxObFOI,2199
|
|
51
|
+
deepresearch_flow/paper/web/handlers/__init__.py,sha256=HGQud4xuEtdB9eVYPzzilXV9ool-1Db5UU29WJ6cjNk,295
|
|
52
|
+
deepresearch_flow/paper/web/handlers/api.py,sha256=Z7H0nr1cSIj1-nR6ZxhxtU6-4sjiuqzy1U1OpK56B0g,9014
|
|
53
|
+
deepresearch_flow/paper/web/handlers/pages.py,sha256=euORL0_Avmqy-kOPKOfVQxyeQjLU4a6EBIufmwoLeCM,12247
|
|
45
54
|
deepresearch_flow/paper/web/pdfjs/LICENSE,sha256=DVQuDIgE45qn836wDaWnYhSdxoLXgpRRKH4RuTjpRZQ,10174
|
|
46
55
|
deepresearch_flow/paper/web/pdfjs/build/pdf.js,sha256=2Ddm8gpMMfvOWinZh4nN--94GxR0QdpFvh0Qeejg-Bw,568294
|
|
47
56
|
deepresearch_flow/paper/web/pdfjs/build/pdf.js.map,sha256=W0nwVFY4inhYxz1raDU6NZ6-rNA21FxLj13txVAqbm4,1434098
|
|
@@ -412,6 +421,15 @@ deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Bold.ttf,sha
|
|
|
412
421
|
deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-BoldItalic.ttf,sha256=oiQHWsF0la0KOvO8CkGawHBKiz_RCVRWIB-5sJX8KB0,135124
|
|
413
422
|
deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Italic.ttf,sha256=gytEBtvvI2KIANOqrSEEhTSshNfjrZVb6DuBcu2O9RI,162036
|
|
414
423
|
deepresearch_flow/paper/web/pdfjs/web/standard_fonts/LiberationSans-Regular.ttf,sha256=-Kzh-JKyvZ3BeSun8Jf6dYj4T-1IMhSA4E3lOQgoIh8,139512
|
|
424
|
+
deepresearch_flow/paper/web/static/css/main.css,sha256=oUuEFEi4YP6bIlEQlIz-zQEQje7hRq3j63imvtJ6IQ4,15386
|
|
425
|
+
deepresearch_flow/paper/web/static/js/detail.js,sha256=9bZmTID74otrZxJfHDJRMWuI_x1pgk71E3Zu2Q6sBIA,13368
|
|
426
|
+
deepresearch_flow/paper/web/static/js/index.js,sha256=bbQz8QAewmu3TT8ImAzUqNtTWQCMKwVOQfU0Lkw6Lv0,10460
|
|
427
|
+
deepresearch_flow/paper/web/static/js/outline.js,sha256=e9ydLcBqaTXOYULXt-1OKgKIzrZcZaH1RebPXWBbLvE,1882
|
|
428
|
+
deepresearch_flow/paper/web/static/js/stats.js,sha256=USGIAx9cPQTMeyFwYu_bTYPJM7OoiqimhCYuAjoP0-s,1420
|
|
429
|
+
deepresearch_flow/paper/web/templates/base.html,sha256=4gWJLvjOuDSnBYRpJqxhGKmKC6UuOl19q_Q_cOjhL-g,1806
|
|
430
|
+
deepresearch_flow/paper/web/templates/detail.html,sha256=jM7rkMu1rQs-kx7LHfLwUvv4yJlhua4eE-wZoOJPufA,16332
|
|
431
|
+
deepresearch_flow/paper/web/templates/index.html,sha256=eQJgjr-RuKPcDxdCJG_hM6KvlMYIp-OSG0oQEJHO7Is,6117
|
|
432
|
+
deepresearch_flow/paper/web/templates/stats.html,sha256=bcQBawoZ9KoRkM0NNo9WJBVeN_8O1WU2xNiye-Fugyo,671
|
|
415
433
|
deepresearch_flow/recognize/__init__.py,sha256=yMAqbdCzpdRSiwFhq9j7yx9ZWxqz_Zq3vfYlTLFCWek,33
|
|
416
434
|
deepresearch_flow/recognize/cli.py,sha256=zWUsqvou2h6c5zR_myGaySvK6cG9ItJp9cJFtqqJk7Y,21597
|
|
417
435
|
deepresearch_flow/recognize/markdown.py,sha256=y-PMJbGqrfWCNBVGanXK1M4OuMP9e1eqh7HDYye5a7Q,8757
|
|
@@ -425,9 +443,9 @@ deepresearch_flow/translator/placeholder.py,sha256=mEgqA-dPdOsIhno0h_hzfpXpY2asb
|
|
|
425
443
|
deepresearch_flow/translator/prompts.py,sha256=kl_9O2YvmtXC1w6WLnsLuVZKz4mcOtUF887SiTaOvc0,4754
|
|
426
444
|
deepresearch_flow/translator/protector.py,sha256=sXwNJ1Y8tyPm7dgm8-7S8HkcPe23TGsBdwRxH6mKL70,11291
|
|
427
445
|
deepresearch_flow/translator/segment.py,sha256=rBFMCLTrvm2GrPc_hNFymi-8Ih2DAtUQlZHCRE9nLaM,5146
|
|
428
|
-
deepresearch_flow-0.
|
|
429
|
-
deepresearch_flow-0.
|
|
430
|
-
deepresearch_flow-0.
|
|
431
|
-
deepresearch_flow-0.
|
|
432
|
-
deepresearch_flow-0.
|
|
433
|
-
deepresearch_flow-0.
|
|
446
|
+
deepresearch_flow-0.4.0.dist-info/licenses/LICENSE,sha256=hT8F2Py1pe6flxq3Ufdm2UKFk0B8CBm0aAQfsLXfvjw,1063
|
|
447
|
+
deepresearch_flow-0.4.0.dist-info/METADATA,sha256=FKueIvCHzloXlQk71dliJK29rxEC9tyMOMS7ISXdfAY,11476
|
|
448
|
+
deepresearch_flow-0.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
449
|
+
deepresearch_flow-0.4.0.dist-info/entry_points.txt,sha256=1uIKscs0YRMg_mFsg9NjsaTt4CvQqQ_-zGERUKhhL_Y,65
|
|
450
|
+
deepresearch_flow-0.4.0.dist-info/top_level.txt,sha256=qBl4RvPJNJUbL8CFfMNWxY0HpQLx5RlF_ko-z_aKpm0,18
|
|
451
|
+
deepresearch_flow-0.4.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|