deepresearch-flow 0.4.1__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/db.py +34 -0
- deepresearch_flow/paper/db_ops.py +21 -7
- deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +6 -0
- deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +391 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +6 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +133 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +6 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_user.j2 +31 -0
- deepresearch_flow/paper/schemas/deep_read_phi_schema.json +30 -0
- deepresearch_flow/paper/template_registry.py +39 -0
- deepresearch_flow/paper/templates/deep_read_phi.md.j2 +40 -0
- deepresearch_flow/paper/web/app.py +106 -1
- deepresearch_flow/paper/web/constants.py +1 -0
- deepresearch_flow/paper/web/handlers/__init__.py +2 -1
- deepresearch_flow/paper/web/handlers/api.py +55 -0
- deepresearch_flow/paper/web/handlers/pages.py +105 -25
- deepresearch_flow/paper/web/markdown.py +230 -4
- deepresearch_flow/paper/web/pdfjs/web/viewer.html +57 -5
- deepresearch_flow/paper/web/pdfjs/web/viewer.js +5 -1
- deepresearch_flow/paper/web/static/css/main.css +8 -1
- deepresearch_flow/paper/web/static/js/detail.js +527 -124
- deepresearch_flow/paper/web/static/js/outline.js +48 -34
- deepresearch_flow/paper/web/static_assets.py +289 -0
- deepresearch_flow/paper/web/templates/detail.html +52 -66
- deepresearch_flow/paper/web/templates.py +7 -4
- deepresearch_flow/paper/web/text.py +8 -4
- deepresearch_flow/recognize/organize.py +9 -12
- deepresearch_flow/translator/fixers.py +15 -0
- {deepresearch_flow-0.4.1.dist-info → deepresearch_flow-0.5.1.dist-info}/METADATA +62 -2
- {deepresearch_flow-0.4.1.dist-info → deepresearch_flow-0.5.1.dist-info}/RECORD +34 -25
- {deepresearch_flow-0.4.1.dist-info → deepresearch_flow-0.5.1.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.4.1.dist-info → deepresearch_flow-0.5.1.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.4.1.dist-info → deepresearch_flow-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.4.1.dist-info → deepresearch_flow-0.5.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
Template: simple_phi
|
|
2
|
+
|
|
3
|
+
{% if stage_name %}
|
|
4
|
+
Current stage: {{ stage_name }}.
|
|
5
|
+
Return JSON with keys: {{ stage_fields | join(", ") }}.
|
|
6
|
+
Output language: {{ output_language }}. Use that language in all answers (if zh, use Chinese).
|
|
7
|
+
Previously completed outputs (JSON): {{ previous_outputs }}
|
|
8
|
+
{% else %}
|
|
9
|
+
你是“哲学/伦理学论文高密度摘要写手”。基于我提供的论文全文/摘录/笔记,写一段单段摘要(900–1100字,禁止第一人称),风格参考:先讲问题与争点→指出现有立场/方法的不足→给出本文主论题→用3–6句复述论证主链(按“首先/其次/最后”串起来)→点出2–4个关键概念/区分(每个用极短句说明其在文中的用法/定义)→交代作者处理的1–2条关键反对意见及回应→用2–3句总结贡献/意义→用1–2句写局限与后续方向。
|
|
10
|
+
|
|
11
|
+
硬约束:
|
|
12
|
+
1) 只写论文中明确支持的内容;超出原文必须标【推测】;论文没交代的证据/覆盖范围写“论文未给出”。
|
|
13
|
+
2) 若可能,在段内插入≥5个最小定位标签(例如{Sect.X.Y}或{锚点:段首8-12字})。若无法定位,标【未定位】并说明原因(材料缺页码/无小节/摘录不完整等)。
|
|
14
|
+
3) 不输出列表/分段/小标题,只输出一整段连续文本(允许少量分号)。
|
|
15
|
+
4) 输出语言:{{ output_language }}。请使用该语言输出(zh 时使用中文,保留必要英文术语)。
|
|
16
|
+
|
|
17
|
+
段末必须追加一行“核心压缩表达(四选一)”,从论文中抽取最关键的一条,以纯ASCII表示、避免引号和特殊符号:
|
|
18
|
+
- 核心判准:X iff (A and B and C)
|
|
19
|
+
- 或 核心定义:X := (R1,R2,R3)
|
|
20
|
+
- 或 核心原则:If P then Q
|
|
21
|
+
- 或 核心论证骨架:P1 + P2 + ... + Pn => C
|
|
22
|
+
选择规则:若论文给出明确必要/充分条件或工作定义,优先用“判准/定义”;若是规范伦理,优先用“原则”;若不存在单一判准/原则,则用“论证骨架”。
|
|
23
|
+
|
|
24
|
+
现在开始输出这一段摘要。
|
|
25
|
+
{% endif %}
|
|
26
|
+
|
|
27
|
+
Document content:
|
|
28
|
+
{{ content }}
|
|
29
|
+
|
|
30
|
+
JSON Schema:
|
|
31
|
+
{{ schema }}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"type": "object",
|
|
4
|
+
"additionalProperties": false,
|
|
5
|
+
"required": [
|
|
6
|
+
"paper_title",
|
|
7
|
+
"paper_authors",
|
|
8
|
+
"publication_date",
|
|
9
|
+
"publication_venue",
|
|
10
|
+
"module_m1",
|
|
11
|
+
"module_m2",
|
|
12
|
+
"module_m3",
|
|
13
|
+
"module_m4",
|
|
14
|
+
"module_m5",
|
|
15
|
+
"module_m6"
|
|
16
|
+
],
|
|
17
|
+
"properties": {
|
|
18
|
+
"paper_title": {"type": "string", "minLength": 1},
|
|
19
|
+
"paper_authors": {"type": "array", "items": {"type": "string"}},
|
|
20
|
+
"publication_date": {"type": "string"},
|
|
21
|
+
"publication_venue": {"type": "string"},
|
|
22
|
+
"module_m1": {"type": "string"},
|
|
23
|
+
"module_m2": {"type": "string"},
|
|
24
|
+
"module_m3": {"type": "string"},
|
|
25
|
+
"module_m4": {"type": "string"},
|
|
26
|
+
"module_m5": {"type": "string"},
|
|
27
|
+
"module_m6": {"type": "string"},
|
|
28
|
+
"output_language": {"type": "string"}
|
|
29
|
+
}
|
|
30
|
+
}
|
|
@@ -34,6 +34,13 @@ _TEMPLATES: dict[str, TemplateBundle] = {
|
|
|
34
34
|
schema_file="default_paper_schema.json",
|
|
35
35
|
render_template="default_paper.md.j2",
|
|
36
36
|
),
|
|
37
|
+
"simple_phi": TemplateBundle(
|
|
38
|
+
name="simple_phi",
|
|
39
|
+
prompt_system="simple_phi_system.j2",
|
|
40
|
+
prompt_user="simple_phi_user.j2",
|
|
41
|
+
schema_file="default_paper_schema.json",
|
|
42
|
+
render_template="default_paper.md.j2",
|
|
43
|
+
),
|
|
37
44
|
"deep_read": TemplateBundle(
|
|
38
45
|
name="deep_read",
|
|
39
46
|
prompt_system="deep_read_system.j2",
|
|
@@ -41,6 +48,13 @@ _TEMPLATES: dict[str, TemplateBundle] = {
|
|
|
41
48
|
schema_file="deep_read_schema.json",
|
|
42
49
|
render_template="deep_read.md.j2",
|
|
43
50
|
),
|
|
51
|
+
"deep_read_phi": TemplateBundle(
|
|
52
|
+
name="deep_read_phi",
|
|
53
|
+
prompt_system="deep_read_phi_system.j2",
|
|
54
|
+
prompt_user="deep_read_phi_user.j2",
|
|
55
|
+
schema_file="deep_read_phi_schema.json",
|
|
56
|
+
render_template="deep_read_phi.md.j2",
|
|
57
|
+
),
|
|
44
58
|
"eight_questions": TemplateBundle(
|
|
45
59
|
name="eight_questions",
|
|
46
60
|
prompt_system="eight_questions_system.j2",
|
|
@@ -48,6 +62,13 @@ _TEMPLATES: dict[str, TemplateBundle] = {
|
|
|
48
62
|
schema_file="eight_questions_schema.json",
|
|
49
63
|
render_template="eight_questions.md.j2",
|
|
50
64
|
),
|
|
65
|
+
"eight_questions_phi": TemplateBundle(
|
|
66
|
+
name="eight_questions_phi",
|
|
67
|
+
prompt_system="eight_questions_phi_system.j2",
|
|
68
|
+
prompt_user="eight_questions_phi_user.j2",
|
|
69
|
+
schema_file="eight_questions_schema.json",
|
|
70
|
+
render_template="eight_questions.md.j2",
|
|
71
|
+
),
|
|
51
72
|
"three_pass": TemplateBundle(
|
|
52
73
|
name="three_pass",
|
|
53
74
|
prompt_system="three_pass_system.j2",
|
|
@@ -75,6 +96,14 @@ _STAGES: dict[str, list[StageDefinition]] = {
|
|
|
75
96
|
StageDefinition("module_g", ["module_g"]),
|
|
76
97
|
StageDefinition("module_h", ["module_h"]),
|
|
77
98
|
],
|
|
99
|
+
"deep_read_phi": [
|
|
100
|
+
StageDefinition("module_m1", ["module_m1"]),
|
|
101
|
+
StageDefinition("module_m2", ["module_m2"]),
|
|
102
|
+
StageDefinition("module_m3", ["module_m3"]),
|
|
103
|
+
StageDefinition("module_m4", ["module_m4"]),
|
|
104
|
+
StageDefinition("module_m5", ["module_m5"]),
|
|
105
|
+
StageDefinition("module_m6", ["module_m6"]),
|
|
106
|
+
],
|
|
78
107
|
"eight_questions": [
|
|
79
108
|
StageDefinition(
|
|
80
109
|
"questions_1to4",
|
|
@@ -85,6 +114,16 @@ _STAGES: dict[str, list[StageDefinition]] = {
|
|
|
85
114
|
["question5", "question6", "question7", "question8"],
|
|
86
115
|
),
|
|
87
116
|
],
|
|
117
|
+
"eight_questions_phi": [
|
|
118
|
+
StageDefinition(
|
|
119
|
+
"questions_1to4",
|
|
120
|
+
["question1", "question2", "question3", "question4"],
|
|
121
|
+
),
|
|
122
|
+
StageDefinition(
|
|
123
|
+
"questions_5to8",
|
|
124
|
+
["question5", "question6", "question7", "question8"],
|
|
125
|
+
),
|
|
126
|
+
],
|
|
88
127
|
"three_pass": [
|
|
89
128
|
StageDefinition("step1_summary", ["step1_summary"]),
|
|
90
129
|
StageDefinition("step2_analysis", ["step2_analysis"]),
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# {{ paper_title }}
|
|
2
|
+
|
|
3
|
+
{% set is_zh = output_language == "zh" %}
|
|
4
|
+
**{{ "作者 / Authors" if is_zh else "Authors" }}:** {{ paper_authors | join(", ") }}
|
|
5
|
+
|
|
6
|
+
{% if output_language %}
|
|
7
|
+
**{{ "输出语言 / Output Language" if is_zh else "Output Language" }}:** {{ output_language }}
|
|
8
|
+
{% endif %}
|
|
9
|
+
|
|
10
|
+
{% if publication_date %}
|
|
11
|
+
**{{ "发表日期 / Publication Date" if is_zh else "Publication Date" }}:** {{ publication_date }}
|
|
12
|
+
{% endif %}
|
|
13
|
+
|
|
14
|
+
{% if publication_venue %}
|
|
15
|
+
**{{ "期刊/会议 / Publication Venue" if is_zh else "Publication Venue" }}:** {{ publication_venue }}
|
|
16
|
+
{% endif %}
|
|
17
|
+
|
|
18
|
+
## {{ "模块 M1:对齐目标 + 输入校验 + 论文地图 + Exhibit全量索引 + markmap大纲脑图" if is_zh else "Module M1: Alignment + Input Check + Paper Map + Exhibit Index + Markmap" }}
|
|
19
|
+
|
|
20
|
+
{{ module_m1 }}
|
|
21
|
+
|
|
22
|
+
## {{ "模块 M2:第一遍鸟瞰 + Exhibit客观打分排序" if is_zh else "Module M2: First Pass Overview + Exhibit Scoring" }}
|
|
23
|
+
|
|
24
|
+
{{ module_m2 }}
|
|
25
|
+
|
|
26
|
+
## {{ "模块 M3:概念与术语工程 + 辩论谱系" if is_zh else "Module M3: Concepts + Debate Lineage" }}
|
|
27
|
+
|
|
28
|
+
{{ module_m3 }}
|
|
29
|
+
|
|
30
|
+
## {{ "模块 M4:论证重建 + Top Exhibits 深读(上半)" if is_zh else "Module M4: Argument Map + Top Exhibits (Part 1)" }}
|
|
31
|
+
|
|
32
|
+
{{ module_m4 }}
|
|
33
|
+
|
|
34
|
+
## {{ "模块 M5:深度审视 + objection mining + Top Exhibits 深读(下半)" if is_zh else "Module M5: Deep Review + Objection Mining + Top Exhibits (Part 2)" }}
|
|
35
|
+
|
|
36
|
+
{{ module_m5 }}
|
|
37
|
+
|
|
38
|
+
## {{ "模块 M6:写作级产出包" if is_zh else "Module M6: Writing-Ready Output Pack" }}
|
|
39
|
+
|
|
40
|
+
{{ module_m6 }}
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
+
import os
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
from starlette.applications import Starlette
|
|
@@ -10,8 +11,9 @@ from starlette.routing import Mount, Route
|
|
|
10
11
|
from starlette.staticfiles import StaticFiles
|
|
11
12
|
|
|
12
13
|
from deepresearch_flow.paper.db_ops import build_index, load_and_merge_papers
|
|
13
|
-
from deepresearch_flow.paper.web.constants import PDFJS_STATIC_DIR, STATIC_DIR
|
|
14
|
+
from deepresearch_flow.paper.web.constants import DEFAULT_PDFJS_CDN_BASE_URL, PDFJS_STATIC_DIR, STATIC_DIR
|
|
14
15
|
from deepresearch_flow.paper.web.handlers import (
|
|
16
|
+
api_markdown,
|
|
15
17
|
api_papers,
|
|
16
18
|
api_pdf,
|
|
17
19
|
api_stats,
|
|
@@ -21,6 +23,7 @@ from deepresearch_flow.paper.web.handlers import (
|
|
|
21
23
|
stats_page,
|
|
22
24
|
)
|
|
23
25
|
from deepresearch_flow.paper.web.markdown import create_md_renderer
|
|
26
|
+
from deepresearch_flow.paper.web.static_assets import build_static_assets
|
|
24
27
|
|
|
25
28
|
logger = logging.getLogger(__name__)
|
|
26
29
|
|
|
@@ -32,6 +35,35 @@ class _NoIndexMiddleware(BaseHTTPMiddleware):
|
|
|
32
35
|
return response
|
|
33
36
|
|
|
34
37
|
|
|
38
|
+
class _StaticAssetFiles(StaticFiles):
    """``StaticFiles`` subclass that can stamp a ``Cache-Control`` header on hits.

    The header is only applied to responses with status 200, and only when the
    response does not already carry a ``Cache-Control`` value.
    """

    def __init__(self, *args, cache_control: str | None = None, **kwargs) -> None:
        """Forward all arguments to ``StaticFiles`` and remember *cache_control*.

        Args:
            cache_control: Header value to apply to successful responses;
                ``None`` (or an empty string) leaves responses untouched.
        """
        super().__init__(*args, **kwargs)
        self._cache_control = cache_control

    async def get_response(self, path: str, scope):  # type: ignore[override]
        """Return the parent's response, adding the cache header when applicable."""
        served = await super().get_response(path, scope)
        # Nothing configured: hand the parent's response back unchanged.
        if not self._cache_control:
            return served
        # Only plain 200 hits are stamped; any other status is passed through.
        if served.status_code != 200:
            return served
        # setdefault keeps a Cache-Control value the parent may already have set.
        served.headers.setdefault("Cache-Control", self._cache_control)
        return served
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _normalize_static_mode(value: str | None) -> str:
|
|
51
|
+
if not value:
|
|
52
|
+
return "auto"
|
|
53
|
+
normalized = value.strip().lower()
|
|
54
|
+
if normalized in {"dev", "development"}:
|
|
55
|
+
return "dev"
|
|
56
|
+
if normalized in {"prod", "production"}:
|
|
57
|
+
return "prod"
|
|
58
|
+
return "auto"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _resolve_static_mode(value: str, static_base_url: str | None) -> str:
|
|
62
|
+
if value == "auto":
|
|
63
|
+
return "prod" if static_base_url else "dev"
|
|
64
|
+
return value
|
|
65
|
+
|
|
66
|
+
|
|
35
67
|
def create_app(
|
|
36
68
|
*,
|
|
37
69
|
db_paths: list[Path],
|
|
@@ -42,6 +74,10 @@ def create_app(
|
|
|
42
74
|
pdf_roots: list[Path] | None = None,
|
|
43
75
|
cache_dir: Path | None = None,
|
|
44
76
|
use_cache: bool = True,
|
|
77
|
+
static_base_url: str | None = None,
|
|
78
|
+
static_mode: str | None = None,
|
|
79
|
+
static_export_dir: Path | None = None,
|
|
80
|
+
pdfjs_cdn_base_url: str | None = None,
|
|
45
81
|
) -> Starlette:
|
|
46
82
|
papers = load_and_merge_papers(db_paths, bibtex_path, cache_dir, use_cache, pdf_roots=pdf_roots)
|
|
47
83
|
|
|
@@ -55,6 +91,44 @@ def create_app(
|
|
|
55
91
|
pdf_roots=pdf_roots,
|
|
56
92
|
)
|
|
57
93
|
md = create_md_renderer()
|
|
94
|
+
static_base_url = static_base_url or os.getenv("PAPER_DB_STATIC_BASE_URL")
|
|
95
|
+
static_mode = _normalize_static_mode(static_mode or os.getenv("PAPER_DB_STATIC_MODE"))
|
|
96
|
+
resolved_mode = _resolve_static_mode(static_mode, static_base_url)
|
|
97
|
+
export_dir_value = static_export_dir or os.getenv("PAPER_DB_STATIC_EXPORT_DIR")
|
|
98
|
+
export_dir = Path(export_dir_value) if export_dir_value else None
|
|
99
|
+
pdfjs_cdn_base_url = (
|
|
100
|
+
pdfjs_cdn_base_url
|
|
101
|
+
or os.getenv("PAPER_DB_PDFJS_CDN_BASE_URL")
|
|
102
|
+
or DEFAULT_PDFJS_CDN_BASE_URL
|
|
103
|
+
)
|
|
104
|
+
if pdfjs_cdn_base_url:
|
|
105
|
+
lowered = pdfjs_cdn_base_url.strip().lower()
|
|
106
|
+
if lowered in {"none", "off", "local"}:
|
|
107
|
+
pdfjs_cdn_base_url = None
|
|
108
|
+
else:
|
|
109
|
+
pdfjs_cdn_base_url = pdfjs_cdn_base_url.rstrip("/")
|
|
110
|
+
|
|
111
|
+
asset_config = None
|
|
112
|
+
if resolved_mode == "prod":
|
|
113
|
+
if not static_base_url:
|
|
114
|
+
logger.warning("Static mode set to prod without base URL; falling back to dev asset routes.")
|
|
115
|
+
resolved_mode = "dev"
|
|
116
|
+
else:
|
|
117
|
+
asset_config = build_static_assets(
|
|
118
|
+
index,
|
|
119
|
+
static_base_url=static_base_url,
|
|
120
|
+
static_export_dir=export_dir,
|
|
121
|
+
)
|
|
122
|
+
if resolved_mode == "dev" and export_dir:
|
|
123
|
+
asset_config = build_static_assets(
|
|
124
|
+
index,
|
|
125
|
+
static_base_url="",
|
|
126
|
+
static_export_dir=export_dir,
|
|
127
|
+
allow_empty_base=True,
|
|
128
|
+
)
|
|
129
|
+
if asset_config is None:
|
|
130
|
+
asset_config = build_static_assets(index, static_base_url=None)
|
|
131
|
+
|
|
58
132
|
routes = [
|
|
59
133
|
Route("/", index_page, methods=["GET"]),
|
|
60
134
|
Route("/robots.txt", robots_txt, methods=["GET"]),
|
|
@@ -63,6 +137,7 @@ def create_app(
|
|
|
63
137
|
Route("/api/papers", api_papers, methods=["GET"]),
|
|
64
138
|
Route("/api/stats", api_stats, methods=["GET"]),
|
|
65
139
|
Route("/api/pdf/{source_hash:str}", api_pdf, methods=["GET"]),
|
|
140
|
+
Route("/api/dev/markdown/{source_hash:str}", api_markdown, methods=["GET"]),
|
|
66
141
|
]
|
|
67
142
|
if PDFJS_STATIC_DIR.exists():
|
|
68
143
|
routes.append(
|
|
@@ -85,10 +160,40 @@ def create_app(
|
|
|
85
160
|
name="static",
|
|
86
161
|
)
|
|
87
162
|
)
|
|
163
|
+
if export_dir and export_dir.exists() and asset_config.enabled and not asset_config.base_url:
|
|
164
|
+
cache_header = "public, max-age=31536000, immutable"
|
|
165
|
+
routes.extend(
|
|
166
|
+
[
|
|
167
|
+
Mount(
|
|
168
|
+
"/pdf",
|
|
169
|
+
app=_StaticAssetFiles(directory=str(export_dir / "pdf"), cache_control=cache_header),
|
|
170
|
+
name="static_pdf",
|
|
171
|
+
),
|
|
172
|
+
Mount(
|
|
173
|
+
"/images",
|
|
174
|
+
app=_StaticAssetFiles(directory=str(export_dir / "images"), cache_control=cache_header),
|
|
175
|
+
name="static_images",
|
|
176
|
+
),
|
|
177
|
+
Mount(
|
|
178
|
+
"/md",
|
|
179
|
+
app=_StaticAssetFiles(directory=str(export_dir / "md"), cache_control=cache_header),
|
|
180
|
+
name="static_md",
|
|
181
|
+
),
|
|
182
|
+
Mount(
|
|
183
|
+
"/md_translate",
|
|
184
|
+
app=_StaticAssetFiles(directory=str(export_dir / "md_translate"), cache_control=cache_header),
|
|
185
|
+
name="static_md_translate",
|
|
186
|
+
),
|
|
187
|
+
]
|
|
188
|
+
)
|
|
88
189
|
app = Starlette(routes=routes)
|
|
89
190
|
app.add_middleware(_NoIndexMiddleware)
|
|
90
191
|
app.state.index = index
|
|
91
192
|
app.state.md = md
|
|
92
193
|
app.state.fallback_language = fallback_language
|
|
93
194
|
app.state.pdf_roots = pdf_roots
|
|
195
|
+
app.state.static_mode = resolved_mode
|
|
196
|
+
app.state.asset_config = asset_config
|
|
197
|
+
app.state.static_export_dir = export_dir
|
|
198
|
+
app.state.pdfjs_cdn_base_url = pdfjs_cdn_base_url
|
|
94
199
|
return app
|
|
@@ -12,6 +12,7 @@ CDN_KATEX_AUTO = "https://cdn.jsdelivr.net/npm/katex@0.16.27/dist/contrib/auto-r
|
|
|
12
12
|
# Use legacy builds to ensure `pdfjsLib` is available as a global.
|
|
13
13
|
CDN_PDFJS = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.min.js"
|
|
14
14
|
CDN_PDFJS_WORKER = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.worker.min.js"
|
|
15
|
+
DEFAULT_PDFJS_CDN_BASE_URL = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174"
|
|
15
16
|
|
|
16
17
|
# PDF.js viewer configuration
|
|
17
18
|
PDFJS_VIEWER_PATH = "/pdfjs/web/viewer.html"
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
"""Route handlers for paper web UI."""
|
|
2
2
|
|
|
3
|
-
from .api import api_papers, api_pdf, api_stats
|
|
3
|
+
from .api import api_markdown, api_papers, api_pdf, api_stats
|
|
4
4
|
from .pages import index_page, paper_detail, robots_txt, stats_page
|
|
5
5
|
|
|
6
6
|
__all__ = [
|
|
7
7
|
"api_papers",
|
|
8
8
|
"api_pdf",
|
|
9
9
|
"api_stats",
|
|
10
|
+
"api_markdown",
|
|
10
11
|
"index_page",
|
|
11
12
|
"paper_detail",
|
|
12
13
|
"robots_txt",
|
|
@@ -19,6 +19,8 @@ from deepresearch_flow.paper.web.filters import (
|
|
|
19
19
|
presence_filter,
|
|
20
20
|
sorted_ids,
|
|
21
21
|
)
|
|
22
|
+
from deepresearch_flow.paper.web.markdown import normalize_markdown_images
|
|
23
|
+
from deepresearch_flow.paper.web.static_assets import resolve_asset_urls
|
|
22
24
|
from deepresearch_flow.paper.web.text import extract_summary_snippet, normalize_title, normalize_venue
|
|
23
25
|
from deepresearch_flow.paper.web.query import Query, QueryTerm, parse_query
|
|
24
26
|
|
|
@@ -92,9 +94,18 @@ def _apply_query(index: PaperIndex, query: Query) -> set[int]:
|
|
|
92
94
|
return result
|
|
93
95
|
|
|
94
96
|
|
|
97
|
+
def _safe_read_text(path: Path) -> str:
|
|
98
|
+
try:
|
|
99
|
+
return path.read_text(encoding="utf-8")
|
|
100
|
+
except UnicodeDecodeError:
|
|
101
|
+
return path.read_text(encoding="latin-1")
|
|
102
|
+
|
|
103
|
+
|
|
95
104
|
async def api_papers(request: Request) -> JSONResponse:
|
|
96
105
|
"""API endpoint for paper list with filtering, sorting, and pagination."""
|
|
97
106
|
index: PaperIndex = request.app.state.index
|
|
107
|
+
asset_config = request.app.state.asset_config
|
|
108
|
+
prefer_local = request.app.state.static_mode == "dev"
|
|
98
109
|
filters = parse_filters(request)
|
|
99
110
|
page = int(filters["page"])
|
|
100
111
|
page_size = int(filters["page_size"])
|
|
@@ -165,6 +176,7 @@ async def api_papers(request: Request) -> JSONResponse:
|
|
|
165
176
|
source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))
|
|
166
177
|
translations = index.translated_md_by_hash.get(source_hash, {})
|
|
167
178
|
translation_languages = sorted(translations.keys(), key=str.lower)
|
|
179
|
+
asset_urls = resolve_asset_urls(index, source_hash, asset_config, prefer_local=prefer_local)
|
|
168
180
|
items.append(
|
|
169
181
|
{
|
|
170
182
|
"source_hash": source_hash,
|
|
@@ -183,6 +195,10 @@ async def api_papers(request: Request) -> JSONResponse:
|
|
|
183
195
|
"has_summary": bool(paper.get("_has_summary")),
|
|
184
196
|
"is_pdf_only": bool(paper.get("_is_pdf_only")),
|
|
185
197
|
"translation_languages": translation_languages,
|
|
198
|
+
"pdf_url": asset_urls["pdf_url"],
|
|
199
|
+
"md_url": asset_urls["md_url"],
|
|
200
|
+
"md_translated_url": asset_urls["md_translated_url"],
|
|
201
|
+
"images_base_url": asset_urls["images_base_url"],
|
|
186
202
|
}
|
|
187
203
|
)
|
|
188
204
|
|
|
@@ -215,3 +231,42 @@ async def api_pdf(request: Request) -> Response:
|
|
|
215
231
|
if allowed_roots and not _ensure_under_roots(pdf_path, allowed_roots):
|
|
216
232
|
return Response("Forbidden", status_code=403)
|
|
217
233
|
return FileResponse(pdf_path)
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
async def api_markdown(request: Request) -> Response:
    """Dev-only API endpoint to serve raw markdown content.

    Responds with 404 unless ``app.state.static_mode`` is ``"dev"``. The paper
    is selected by the ``source_hash`` path parameter; an optional ``lang``
    query parameter (matched case-insensitively) selects a translated document.

    Lookup order:
      1. The copy under ``app.state.static_export_dir``, when an export
         directory is configured and the asset config is enabled with an empty
         base URL. This copy is returned as-is.
      2. The source path recorded in the paper index. Translated documents
         read from here are passed through ``normalize_markdown_images``.

    Returns:
        A ``text/markdown`` response, or a 404 response when no markdown is
        registered for the request or the registered file cannot be read.
    """
    state = request.app.state
    if state.static_mode != "dev":
        return Response("Not Found", status_code=404)

    index: PaperIndex = state.index
    asset_config = state.asset_config
    export_dir = state.static_export_dir
    source_hash = request.path_params["source_hash"]
    lang = request.query_params.get("lang")
    # Lookup tables are keyed by lower-cased language codes.
    lang_key = lang.lower() if lang else None

    # 1) Prefer the exported copy when assets are served locally (empty base URL).
    if export_dir and asset_config and asset_config.enabled and (asset_config.base_url or "") == "":
        if lang_key:
            exported_url = asset_config.translated_md_urls.get(source_hash, {}).get(lang_key)
        else:
            exported_url = asset_config.md_urls.get(source_hash)
        if exported_url:
            # Stored URLs are root-relative; strip the slash so the join stays
            # inside export_dir instead of replacing it.
            export_path = export_dir / exported_url.lstrip("/")
            # is_file() (rather than exists()) skips directories, which cannot be read as text.
            if export_path.is_file():
                try:
                    return Response(_safe_read_text(export_path), media_type="text/markdown")
                except OSError:
                    # Exported copy vanished or is unreadable: deliberately fall
                    # back to the indexed source file below.
                    pass

    # 2) Fall back to the source file recorded in the index.
    if lang_key:
        md_path = index.translated_md_by_hash.get(source_hash, {}).get(lang_key)
    else:
        md_path = index.md_path_by_hash.get(source_hash)
    if not md_path:
        return Response("Markdown not found", status_code=404)
    try:
        raw = _safe_read_text(md_path)
    except OSError:
        # The index can reference a file that has since been moved or deleted;
        # report that as "not found" rather than an unhandled server error.
        return Response("Markdown not found", status_code=404)
    if lang_key:
        raw = normalize_markdown_images(raw)
    return Response(raw, media_type="text/markdown")
|