deepresearch-flow 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- deepresearch_flow/paper/cli.py +63 -0
- deepresearch_flow/paper/config.py +87 -12
- deepresearch_flow/paper/db.py +1041 -34
- deepresearch_flow/paper/db_ops.py +145 -26
- deepresearch_flow/paper/extract.py +1546 -152
- deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +8 -0
- deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +396 -0
- deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +7 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +135 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +8 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_user.j2 +31 -0
- deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
- deepresearch_flow/paper/providers/azure_openai.py +45 -3
- deepresearch_flow/paper/providers/openai_compatible.py +45 -3
- deepresearch_flow/paper/schemas/deep_read_phi_schema.json +31 -0
- deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
- deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
- deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
- deepresearch_flow/paper/snapshot/__init__.py +4 -0
- deepresearch_flow/paper/snapshot/api.py +941 -0
- deepresearch_flow/paper/snapshot/builder.py +965 -0
- deepresearch_flow/paper/snapshot/identity.py +239 -0
- deepresearch_flow/paper/snapshot/schema.py +245 -0
- deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
- deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
- deepresearch_flow/paper/snapshot/text.py +154 -0
- deepresearch_flow/paper/template_registry.py +40 -0
- deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
- deepresearch_flow/paper/templates/deep_read_phi.md.j2 +44 -0
- deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
- deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
- deepresearch_flow/paper/web/app.py +10 -3
- deepresearch_flow/paper/web/markdown.py +174 -8
- deepresearch_flow/paper/web/static/css/main.css +8 -1
- deepresearch_flow/paper/web/static/js/detail.js +46 -12
- deepresearch_flow/paper/web/templates/detail.html +9 -0
- deepresearch_flow/paper/web/text.py +8 -4
- deepresearch_flow/recognize/cli.py +380 -103
- deepresearch_flow/recognize/markdown.py +31 -7
- deepresearch_flow/recognize/math.py +47 -12
- deepresearch_flow/recognize/mermaid.py +320 -10
- deepresearch_flow/recognize/organize.py +35 -16
- deepresearch_flow/translator/cli.py +71 -20
- deepresearch_flow/translator/engine.py +220 -81
- deepresearch_flow/translator/fixers.py +15 -0
- deepresearch_flow/translator/prompts.py +19 -2
- deepresearch_flow/translator/protector.py +15 -3
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +58 -42
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
deepresearch_flow/paper/snapshot/text.py (new file)

```diff
@@ -0,0 +1,154 @@
+from __future__ import annotations
+
+import re
+from typing import Iterable
+
+from markdown_it import MarkdownIt
+
+
+_HTML_TABLE_RE = re.compile(r"<table\b.*?</table>", re.IGNORECASE | re.DOTALL)
+_TAG_RE = re.compile(r"<[^>]+>")
+_WS_RE = re.compile(r"\s+")
+
+
+def _is_cjk_char(ch: str) -> bool:
+    code = ord(ch)
+    return (
+        0x3400 <= code <= 0x4DBF  # CJK Unified Ideographs Extension A
+        or 0x4E00 <= code <= 0x9FFF  # CJK Unified Ideographs
+        or 0xF900 <= code <= 0xFAFF  # CJK Compatibility Ideographs
+        or 0x3040 <= code <= 0x309F  # Hiragana
+        or 0x30A0 <= code <= 0x30FF  # Katakana
+        or 0xAC00 <= code <= 0xD7AF  # Hangul syllables
+    )
+
+
+def insert_cjk_spaces(text: str) -> str:
+    out: list[str] = []
+    prev_cjk = False
+    for ch in text:
+        cur_cjk = _is_cjk_char(ch)
+        if prev_cjk and cur_cjk:
+            out.append(" ")
+        out.append(ch)
+        prev_cjk = cur_cjk
+    return "".join(out)
+
+
+def remove_cjk_spaces(text: str) -> str:
+    if " " not in text:
+        return text
+    chars = list(text)
+    out: list[str] = []
+    for idx, ch in enumerate(chars):
+        if ch == " " and 0 < idx < len(chars) - 1:
+            if _is_cjk_char(chars[idx - 1]) and _is_cjk_char(chars[idx + 1]):
+                continue
+        out.append(ch)
+    return "".join(out)
+
+
+def merge_adjacent_markers(text: str, *, start_marker: str = "[[[", end_marker: str = "]]]") -> str:
+    needle = f"{end_marker}{start_marker}"
+    while needle in text:
+        text = text.replace(needle, "")
+    return text
+
+
+def _md_renderer() -> MarkdownIt:
+    md = MarkdownIt("commonmark", {"html": False, "linkify": False})
+    md.enable("table")
+    return md
+
+
+def markdown_to_plain_text(markdown: str) -> str:
+    if not markdown:
+        return ""
+    text = _HTML_TABLE_RE.sub(" ", markdown)
+    md = _md_renderer()
+    tokens = md.parse(text)
+
+    out: list[str] = []
+    in_table = 0
+    for token in tokens:
+        if token.type == "table_open":
+            in_table += 1
+            continue
+        if token.type == "table_close":
+            in_table = max(0, in_table - 1)
+            continue
+        if in_table:
+            continue
+        if token.type != "inline":
+            continue
+        for child in token.children or []:
+            if child.type in {"text", "code_inline"}:
+                out.append(child.content)
+            elif child.type == "softbreak":
+                out.append("\n")
+            elif child.type == "hardbreak":
+                out.append("\n")
+            elif child.type == "image":
+                if child.content:
+                    out.append(child.content)
+
+    collapsed = _WS_RE.sub(" ", " ".join(out)).strip()
+    collapsed = _TAG_RE.sub(" ", collapsed)
+    return _WS_RE.sub(" ", collapsed).strip()
+
+
+def normalize_query_punctuation(text: str) -> str:
+    if not text:
+        return ""
+    return re.sub(r"[,。、《》、;:!?()【】「」『』·…—]+", " ", text)
+
+
+def split_mixed_cjk_latin(token: str) -> list[str]:
+    if not token:
+        return []
+    parts: list[str] = []
+    buf: list[str] = []
+    buf_is_cjk: bool | None = None
+    for ch in token:
+        cur_is_cjk = _is_cjk_char(ch)
+        if buf_is_cjk is None or cur_is_cjk == buf_is_cjk:
+            buf.append(ch)
+            buf_is_cjk = cur_is_cjk
+            continue
+        parts.append("".join(buf))
+        buf = [ch]
+        buf_is_cjk = cur_is_cjk
+    if buf:
+        parts.append("".join(buf))
+    return parts
+
+
+def rewrite_search_query(user_query: str) -> str:
+    cleaned = normalize_query_punctuation(user_query)
+    cleaned = _WS_RE.sub(" ", cleaned).strip()
+    if not cleaned:
+        return ""
+
+    out: list[str] = []
+    for raw in cleaned.split(" "):
+        if not raw:
+            continue
+        upper = raw.upper()
+        if upper in {"AND", "OR"}:
+            out.append(upper)
+            continue
+
+        segments = split_mixed_cjk_latin(raw)
+        for seg in segments:
+            if not seg:
+                continue
+            if all(_is_cjk_char(ch) for ch in seg):
+                phrase = insert_cjk_spaces(seg)
+                out.append(f"\"{phrase}\"")
+            else:
+                safe = re.sub(r"[^0-9A-Za-z._+-]+", "", seg)
+                if safe:
+                    out.append(safe.lower())
+
+    return " ".join(out)
+
```
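The new snapshot text module backs full-text search over paper snapshots: CJK runs become space-separated, quoted phrases (so character-level FTS tokenizers can match them), Latin fragments are stripped to safe characters and lower-cased, and `AND`/`OR` pass through as boolean operators. A quick sketch of the observable behavior, assuming the module is importable as packaged (the query string is an invented example):

```python
from deepresearch_flow.paper.snapshot.text import rewrite_search_query

# The mixed token "BERT模型" splits into a Latin term and a CJK phrase;
# the CJK phrase is space-separated and quoted, the operator passes through.
print(rewrite_search_query("BERT模型 OR fine-tuning"))
# -> bert "模 型" OR fine-tuning
```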
deepresearch_flow/paper/template_registry.py

```diff
@@ -24,6 +24,7 @@ class TemplateBundle:
 class StageDefinition:
     name: str
     fields: list[str]
+    depends_on: list[str] | None = None
 
 
 _TEMPLATES: dict[str, TemplateBundle] = {
@@ -34,6 +35,13 @@ _TEMPLATES: dict[str, TemplateBundle] = {
         schema_file="default_paper_schema.json",
         render_template="default_paper.md.j2",
     ),
+    "simple_phi": TemplateBundle(
+        name="simple_phi",
+        prompt_system="simple_phi_system.j2",
+        prompt_user="simple_phi_user.j2",
+        schema_file="default_paper_schema.json",
+        render_template="default_paper.md.j2",
+    ),
     "deep_read": TemplateBundle(
         name="deep_read",
         prompt_system="deep_read_system.j2",
@@ -41,6 +49,13 @@ _TEMPLATES: dict[str, TemplateBundle] = {
         schema_file="deep_read_schema.json",
         render_template="deep_read.md.j2",
     ),
+    "deep_read_phi": TemplateBundle(
+        name="deep_read_phi",
+        prompt_system="deep_read_phi_system.j2",
+        prompt_user="deep_read_phi_user.j2",
+        schema_file="deep_read_phi_schema.json",
+        render_template="deep_read_phi.md.j2",
+    ),
     "eight_questions": TemplateBundle(
         name="eight_questions",
         prompt_system="eight_questions_system.j2",
@@ -48,6 +63,13 @@ _TEMPLATES: dict[str, TemplateBundle] = {
         schema_file="eight_questions_schema.json",
         render_template="eight_questions.md.j2",
     ),
+    "eight_questions_phi": TemplateBundle(
+        name="eight_questions_phi",
+        prompt_system="eight_questions_phi_system.j2",
+        prompt_user="eight_questions_phi_user.j2",
+        schema_file="eight_questions_schema.json",
+        render_template="eight_questions.md.j2",
+    ),
     "three_pass": TemplateBundle(
         name="three_pass",
         prompt_system="three_pass_system.j2",
@@ -75,6 +97,14 @@ _STAGES: dict[str, list[StageDefinition]] = {
         StageDefinition("module_g", ["module_g"]),
         StageDefinition("module_h", ["module_h"]),
     ],
+    "deep_read_phi": [
+        StageDefinition("module_m1", ["module_m1"]),
+        StageDefinition("module_m2", ["module_m2"]),
+        StageDefinition("module_m3", ["module_m3"]),
+        StageDefinition("module_m4", ["module_m4"]),
+        StageDefinition("module_m5", ["module_m5"]),
+        StageDefinition("module_m6", ["module_m6"]),
+    ],
     "eight_questions": [
         StageDefinition(
             "questions_1to4",
@@ -85,6 +115,16 @@ _STAGES: dict[str, list[StageDefinition]] = {
             ["question5", "question6", "question7", "question8"],
         ),
     ],
+    "eight_questions_phi": [
+        StageDefinition(
+            "questions_1to4",
+            ["question1", "question2", "question3", "question4"],
+        ),
+        StageDefinition(
+            "questions_5to8",
+            ["question5", "question6", "question7", "question8"],
+        ),
+    ],
     "three_pass": [
         StageDefinition("step1_summary", ["step1_summary"]),
         StageDefinition("step2_analysis", ["step2_analysis"]),
```
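The registry hunks wire three new `_phi` bundles to their prompt, schema, and render files, and give `StageDefinition` an optional `depends_on` field. Nothing in this diff shows how `depends_on` is consumed, so the following is only a hypothetical sketch of the kind of stage ordering such a field enables (`run_order` is invented; the stage names are reused from the diff):

```python
from dataclasses import dataclass


@dataclass
class StageDefinition:
    name: str
    fields: list[str]
    depends_on: list[str] | None = None


def run_order(stages: list[StageDefinition]) -> list[str]:
    """Hypothetical scheduler: emit stages once all their dependencies ran."""
    done: set[str] = set()
    order: list[str] = []
    pending = list(stages)
    while pending:
        ready = [s for s in pending if all(d in done for d in s.depends_on or [])]
        if not ready:
            raise ValueError("dependency cycle among stages")
        for stage in ready:
            order.append(stage.name)
            done.add(stage.name)
            pending.remove(stage)
    return order


stages = [
    StageDefinition("module_m2", ["module_m2"], depends_on=["module_m1"]),
    StageDefinition("module_m1", ["module_m1"]),
]
print(run_order(stages))  # ['module_m1', 'module_m2']
```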
deepresearch_flow/paper/templates/deep_read.md.j2

```diff
@@ -3,6 +3,10 @@
 {% set is_zh = output_language == "zh" %}
 **{{ "作者 / Authors" if is_zh else "Authors" }}:** {{ paper_authors | join(", ") }}
 
+{% if paper_institutions %}
+**{{ "单位 / Institutions" if is_zh else "Institutions" }}:** {{ paper_institutions | join(", ") }}
+{% endif %}
+
 {% if output_language %}
 **{{ "输出语言 / Output Language" if is_zh else "Output Language" }}:** {{ output_language }}
 {% endif %}
```
deepresearch_flow/paper/templates/deep_read_phi.md.j2 (new file)

```diff
@@ -0,0 +1,44 @@
+# {{ paper_title }}
+
+{% set is_zh = output_language == "zh" %}
+**{{ "作者 / Authors" if is_zh else "Authors" }}:** {{ paper_authors | join(", ") }}
+
+{% if paper_institutions %}
+**{{ "单位 / Institutions" if is_zh else "Institutions" }}:** {{ paper_institutions | join(", ") }}
+{% endif %}
+
+{% if output_language %}
+**{{ "输出语言 / Output Language" if is_zh else "Output Language" }}:** {{ output_language }}
+{% endif %}
+
+{% if publication_date %}
+**{{ "发表日期 / Publication Date" if is_zh else "Publication Date" }}:** {{ publication_date }}
+{% endif %}
+
+{% if publication_venue %}
+**{{ "期刊/会议 / Publication Venue" if is_zh else "Publication Venue" }}:** {{ publication_venue }}
+{% endif %}
+
+## {{ "模块 M1:对齐目标 + 输入校验 + 论文地图 + Exhibit全量索引 + markmap大纲脑图" if is_zh else "Module M1: Alignment + Input Check + Paper Map + Exhibit Index + Markmap" }}
+
+{{ module_m1 }}
+
+## {{ "模块 M2:第一遍鸟瞰 + Exhibit客观打分排序" if is_zh else "Module M2: First Pass Overview + Exhibit Scoring" }}
+
+{{ module_m2 }}
+
+## {{ "模块 M3:概念与术语工程 + 辩论谱系" if is_zh else "Module M3: Concepts + Debate Lineage" }}
+
+{{ module_m3 }}
+
+## {{ "模块 M4:论证重建 + Top Exhibits 深读(上半)" if is_zh else "Module M4: Argument Map + Top Exhibits (Part 1)" }}
+
+{{ module_m4 }}
+
+## {{ "模块 M5:深度审视 + objection mining + Top Exhibits 深读(下半)" if is_zh else "Module M5: Deep Review + Objection Mining + Top Exhibits (Part 2)" }}
+
+{{ module_m5 }}
+
+## {{ "模块 M6:写作级产出包" if is_zh else "Module M6: Writing-Ready Output Pack" }}
+
+{{ module_m6 }}
```
deepresearch_flow/paper/templates/default_paper.md.j2

```diff
@@ -3,6 +3,10 @@
 {% set is_zh = output_language == "zh" %}
 **{{ "作者 / Authors" if is_zh else "Authors" }}:** {{ paper_authors | join(", ") }}
 
+{% if paper_institutions %}
+**{{ "单位 / Institutions" if is_zh else "Institutions" }}:** {{ paper_institutions | join(", ") }}
+{% endif %}
+
 {% if output_language %}
 **{{ "输出语言 / Output Language" if is_zh else "Output Language" }}:** {{ output_language }}
 {% endif %}
```
deepresearch_flow/paper/templates/eight_questions.md.j2

```diff
@@ -4,6 +4,10 @@
 
 **{{ "作者 / Authors" if is_zh else "Authors" }}:** {{ paper_authors | join(", ") }}
 
+{% if paper_institutions %}
+**{{ "单位 / Institutions" if is_zh else "Institutions" }}:** {{ paper_institutions | join(", ") }}
+{% endif %}
+
 {% if output_language %}
 **{{ "输出语言 / Output Language" if is_zh else "Output Language" }}:** {{ output_language }}
 {% endif %}
```
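All four render templates gain the same guarded Institutions block, so papers without institution metadata render exactly as before. A minimal jinja2 sketch of the guard (the institution names are placeholders):

```python
from jinja2 import Template

# The block added to each template, reduced to its essentials.
block = Template(
    "{% if paper_institutions %}"
    "**Institutions:** {{ paper_institutions | join(', ') }}"
    "{% endif %}"
)

print(block.render(paper_institutions=["Example University", "Example Lab"]))
# -> **Institutions:** Example University, Example Lab
print(repr(block.render(paper_institutions=None)))
# -> '' (the block disappears when no institutions are present)
```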
deepresearch_flow/paper/web/app.py

```diff
@@ -91,7 +91,7 @@ def create_app(
         pdf_roots=pdf_roots,
     )
     md = create_md_renderer()
-    static_base_url = static_base_url or os.getenv("PAPER_DB_STATIC_BASE_URL")
+    static_base_url = static_base_url or os.getenv("PAPER_DB_STATIC_BASE") or os.getenv("PAPER_DB_STATIC_BASE_URL")
     static_mode = _normalize_static_mode(static_mode or os.getenv("PAPER_DB_STATIC_MODE"))
     resolved_mode = _resolve_static_mode(static_mode, static_base_url)
     export_dir_value = static_export_dir or os.getenv("PAPER_DB_STATIC_EXPORT_DIR")
@@ -111,7 +111,12 @@ def create_app(
     asset_config = None
     if resolved_mode == "prod":
         if not static_base_url:
-            logger.warning(
+            logger.warning(
+                "Static mode set to prod without base URL; falling back to dev asset routes "
+                "(static_mode=%s, static_base_url=%s)",
+                static_mode,
+                static_base_url or "<empty>",
+            )
             resolved_mode = "dev"
         else:
             asset_config = build_static_assets(
@@ -149,8 +154,10 @@ def create_app(
         )
     elif pdf_roots:
         logger.warning(
-            "PDF.js viewer assets not found at %s; PDF Viewer mode will be unavailable",
+            "PDF.js viewer assets not found at %s; PDF Viewer mode will be unavailable "
+            "(pdf_roots=%d).",
             PDFJS_STATIC_DIR,
+            len(pdf_roots),
         )
     if STATIC_DIR.exists():
         routes.append(
```
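The `create_app` change reads the base URL from a new `PAPER_DB_STATIC_BASE` variable while keeping `PAPER_DB_STATIC_BASE_URL` as a fallback, and both warnings now say why prod mode fell back or why the PDF viewer is unavailable. The lookup chain behaves like this (placeholder URL, not a real endpoint):

```python
import os

# Explicit argument wins, then PAPER_DB_STATIC_BASE, then the older name.
os.environ["PAPER_DB_STATIC_BASE_URL"] = "https://cdn.example.com/assets"
static_base_url = None
static_base_url = (
    static_base_url
    or os.getenv("PAPER_DB_STATIC_BASE")
    or os.getenv("PAPER_DB_STATIC_BASE_URL")
)
print(static_base_url)  # https://cdn.example.com/assets
```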
deepresearch_flow/paper/web/markdown.py

```diff
@@ -41,13 +41,88 @@ def strip_paragraph_wrapped_tables(text: str) -> str:
 
 
 def normalize_footnote_definitions(text: str) -> str:
-    """Normalize
+    """Normalize footnotes and numbered notes to markdown-it footnote format."""
     lines = text.splitlines()
-
+    out: list[str] = []
+    in_fence = False
+    fence_char = ""
+    fence_len = 0
+    in_notes = False
+    notes_level: int | None = None
+    notes_heading_re = re.compile(
+        r"^#{1,6}\s*(参考文献|参考资料|参考书目|文献|引用|注释|脚注|notes?|references?|bibliography|works\s+cited|citations?)\b",
+        re.IGNORECASE,
+    )
+    notes_heading_plain_re = re.compile(
+        r"^(参考文献|参考资料|参考书目|文献|引用|注释|脚注|notes?|references?|bibliography|works\s+cited|citations?)\s*:?$",
+        re.IGNORECASE,
+    )
+    last_note_index: int | None = None
+
+    for line in lines:
+        stripped = line.lstrip()
+        if stripped.startswith(("```", "~~~")):
+            run_len = 0
+            while run_len < len(stripped) and stripped[run_len] == stripped[0]:
+                run_len += 1
+            if not in_fence:
+                in_fence = True
+                fence_char = stripped[0]
+                fence_len = run_len
+            elif stripped[0] == fence_char and run_len >= fence_len:
+                in_fence = False
+                fence_char = ""
+                fence_len = 0
+            out.append(line)
+            continue
+
+        if in_fence:
+            out.append(line)
+            continue
+
+        heading_match = notes_heading_re.match(stripped)
+        if heading_match:
+            in_notes = True
+            notes_level = len(stripped.split(" ")[0].lstrip("#"))
+            last_note_index = None
+        elif notes_heading_plain_re.match(stripped):
+            in_notes = True
+            notes_level = None
+            last_note_index = None
+        elif re.match(r"^#{1,6}\s+", stripped):
+            if notes_level is not None:
+                level = len(stripped.split(" ")[0].lstrip("#"))
+                if level <= notes_level:
+                    in_notes = False
+                    notes_level = None
+                    last_note_index = None
+
         match = re.match(r"^\[\^([0-9]+)\]\s+", line)
         if match:
-
-
+            out.append(re.sub(r"^\[\^([0-9]+)\]\s+", r"[^\1]: ", line))
+            continue
+
+        if in_notes:
+            list_match = re.match(r"^\s*(\d{1,4})[.)]\s+", line)
+            if list_match:
+                number = list_match.group(1)
+                rest = line[list_match.end() :].strip()
+                out.append(f"[^{number}]: {rest}")
+                last_note_index = len(out) - 1
+                continue
+            if last_note_index is not None:
+                if line.strip() == "":
+                    out.append(line)
+                    last_note_index = None
+                    continue
+                if line.startswith((" ", "\t")):
+                    out[last_note_index] = f"{out[last_note_index]} {line.strip()}"
+                    continue
+
+        line = re.sub(r"(?<!\^)\[(\d{1,4})\]", r"[^\1]", line)
+        out.append(line)
+
+    return "\n".join(out)
 
 
 def normalize_markdown_images(text: str) -> str:
```
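As reconstructed above, `normalize_footnote_definitions` now walks the document line by line, skips fenced code, detects a notes/references heading (English or Chinese), turns numbered list items in that section into `[^n]:` definitions (merging indented continuation lines), and rewrites bare `[n]` citations into `[^n]` references, with the `(?<!\^)` lookbehind protecting already-converted ones. A toy input and the output this reconstruction would produce:

```python
from deepresearch_flow.paper.web.markdown import normalize_footnote_definitions

src = """Transformers dominate sequence modeling [1].

## References

1. Vaswani et al., Attention Is All You Need.
2. A second source, with
   a wrapped continuation line.
"""
print(normalize_footnote_definitions(src))
# Transformers dominate sequence modeling [^1].
#
# ## References
#
# [^1]: Vaswani et al., Attention Is All You Need.
# [^2]: A second source, with a wrapped continuation line.
```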
deepresearch_flow/paper/web/markdown.py (continued)

```diff
@@ -115,6 +190,68 @@ def normalize_fenced_code_blocks(text: str) -> str:
     return "\n".join(out)
 
 
+def normalize_mermaid_blocks(text: str) -> str:
+    """Keep mermaid fences clean by moving legend text outside the block."""
+    lines = text.splitlines()
+    out: list[str] = []
+    in_mermaid = False
+    fence_char = ""
+    fence_len = 0
+    mermaid_lines: list[str] = []
+    legend_lines: list[str] = []
+
+    def is_legend(line: str) -> bool:
+        stripped = line.strip()
+        if not stripped:
+            return False
+        if stripped.startswith("图例") or stripped.lower().startswith("legend"):
+            return True
+        return "节点定位" in stripped
+
+    for line in lines:
+        stripped = line.lstrip()
+        if stripped.startswith(("```", "~~~")):
+            run_len = 0
+            while run_len < len(stripped) and stripped[run_len] == stripped[0]:
+                run_len += 1
+            rest = stripped[run_len:].strip()
+            if not in_mermaid and rest.lower().startswith("mermaid"):
+                in_mermaid = True
+                fence_char = stripped[0]
+                fence_len = run_len
+                mermaid_lines = []
+                legend_lines = []
+                out.append(line)
+                continue
+            if in_mermaid and stripped[0] == fence_char and run_len >= fence_len and rest == "":
+                out.extend(mermaid_lines)
+                out.append(line)
+                out.extend(legend_lines)
+                in_mermaid = False
+                fence_char = ""
+                fence_len = 0
+                mermaid_lines = []
+                legend_lines = []
+                continue
+            out.append(line)
+            continue
+
+        if in_mermaid:
+            if is_legend(line):
+                legend_lines.append(line)
+            else:
+                mermaid_lines.append(line)
+            continue
+
+        out.append(line)
+
+    if in_mermaid:
+        out.extend(mermaid_lines)
+        out.extend(legend_lines)
+
+    return "\n".join(out)
+
+
 def normalize_unbalanced_fences(text: str) -> str:
     """Drop unmatched opening fences so later content still renders."""
     lines = text.splitlines()
```
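`normalize_mermaid_blocks` keeps prose "legend" lines (lines starting with `图例` or `legend`, or containing `节点定位`) from breaking mermaid syntax: they are collected while inside the fence and re-emitted right after it closes. For example, using `~~~` fences, which the function accepts alongside backticks:

```python
from deepresearch_flow.paper.web.markdown import normalize_mermaid_blocks

src = "~~~mermaid\ngraph TD\n    A --> B\nLegend: A = parser, B = renderer\n~~~"
print(normalize_mermaid_blocks(src))
# ~~~mermaid
# graph TD
#     A --> B
# ~~~
# Legend: A = parser, B = renderer
```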
deepresearch_flow/paper/web/markdown.py (continued)

```diff
@@ -122,6 +259,7 @@ def normalize_unbalanced_fences(text: str) -> str:
     in_fence = False
     fence_char = ""
     fence_len = 0
+    fence_has_content = False
     fence_open_indices: list[int] = []
     fence_re = re.compile(r"([`~]{3,})(.*)$")
 
@@ -135,19 +273,46 @@ def normalize_unbalanced_fences(text: str) -> str:
         run = match.group(1)
         fence = run[0]
         run_len = len(run)
+        rest = match.group(2) or ""
+        has_info = bool(rest.strip())
         if not in_fence:
             in_fence = True
             fence_char = fence
             fence_len = run_len
+            fence_has_content = False
             fence_open_indices.append(len(out))
             is_fence = True
-        elif fence == fence_char and run_len >= fence_len:
-
-
-
+        elif fence == fence_char and run_len >= fence_len and not has_info:
+            if not fence_has_content:
+                if fence_open_indices:
+                    out.pop(fence_open_indices[-1])
+                    fence_open_indices.pop()
+                in_fence = True
+                fence_char = fence
+                fence_len = run_len
+                fence_has_content = False
+                fence_open_indices.append(len(out))
+                is_fence = True
+            else:
+                in_fence = False
+                fence_char = ""
+                fence_len = 0
+                fence_has_content = False
+                is_fence = True
+        elif fence == fence_char and run_len >= fence_len and has_info:
+            if fence_open_indices:
+                out.pop(fence_open_indices[-1])
+                fence_open_indices.pop()
+            in_fence = True
+            fence_char = fence
+            fence_len = run_len
+            fence_has_content = False
+            fence_open_indices.append(len(out))
             is_fence = True
 
         out.append(line)
+        if in_fence and not is_fence and line.strip():
+            fence_has_content = True
 
     if in_fence and fence_open_indices:
         out.pop(fence_open_indices[-1])
```
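The `fence_has_content` flag changes how a closing fence is interpreted: a close that follows an empty open is treated as a fresh opener (the stray opener gets dropped), and a "close" carrying an info string likewise starts a new block. Unmatched openers are still popped at end of input, so trailing text renders. Under the reconstruction above:

```python
from deepresearch_flow.paper.web.markdown import normalize_unbalanced_fences

# A stray pair of fence lines with nothing between them no longer swallows
# the rest of the document; both fence lines are dropped.
src = "~~~\n~~~\nAll of this text still renders.\n"
print(normalize_unbalanced_fences(src))
# -> All of this text still renders.
```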
deepresearch_flow/paper/web/markdown.py (continued)

```diff
@@ -534,6 +699,7 @@ def extract_html_table_placeholders(text: str) -> tuple[str, dict[str, str]]:
 
 def render_markdown_with_math_placeholders(md: MarkdownIt, text: str) -> str:
     """Render markdown with math, images, and tables properly escaped."""
+    text = normalize_mermaid_blocks(text)
     text = normalize_fenced_code_blocks(text)
     text = normalize_unbalanced_fences(text)
     text = strip_paragraph_wrapped_tables(text)
```
deepresearch_flow/paper/web/static/css/main.css

```diff
@@ -223,11 +223,18 @@ header a {
 
 .markmap {
   width: 100%;
-  height: 420px;
   border: 1px solid #e2e8f0;
   border-radius: 12px;
   background: #ffffff;
   margin: 12px 0;
+  padding: 8px;
+  overflow-x: auto;
+}
+
+.markmap > svg {
+  width: 100%;
+  min-height: 240px;
+  display: block;
 }
 
 /* Utilities */
```
deepresearch_flow/paper/web/static/js/detail.js

```diff
@@ -421,24 +421,58 @@
   var content = document.getElementById('content');
   if (!content) return;
 
-  // Markmap: convert fenced markmap blocks to
-
-
-
-
-
-
-
-
+  // Markmap: convert fenced markmap blocks to autoloader containers
+  var markmapBlocks = 0;
+  document.querySelectorAll('code.language-markmap').forEach(function(code) {
+    var pre = code.parentElement;
+    if (!pre) return;
+    var wrapper = document.createElement('div');
+    wrapper.className = 'markmap';
+    var template = document.createElement('script');
+    template.type = 'text/template';
+    template.textContent = code.textContent || '';
+    wrapper.appendChild(template);
+    pre.replaceWith(wrapper);
+    markmapBlocks += 1;
+  });
+  function resizeMarkmaps() {
+    document.querySelectorAll('.markmap svg').forEach(function(svg) {
       try {
-        var
-
+        var bbox = svg.getBBox();
+        if (!bbox || !bbox.height) {
+          svg.style.height = '800px';
+          svg.style.width = '100%';
+          return;
+        }
+        var height = Math.ceil(bbox.height * 2);
+        svg.style.height = height + 'px';
+        if (bbox.width && bbox.width > svg.clientWidth) {
+          svg.style.width = Math.ceil(bbox.width * 2) + 'px';
+          if (svg.parentElement) {
+            svg.parentElement.style.overflowX = 'auto';
+          }
+        } else {
+          svg.style.width = '100%';
+        }
       } catch (err) {
-        // Ignore
+        // Ignore sizing errors
       }
     });
   }
 
+  if (markmapBlocks && window.markmap && window.markmap.autoLoader && window.markmap.autoLoader.renderAll) {
+    window.markmap.autoLoader.renderAll();
+    setTimeout(resizeMarkmaps, 120);
+    setTimeout(resizeMarkmaps, 600);
+    setTimeout(resizeMarkmaps, 1600);
+    if (!window.__markmapResizeBound) {
+      window.__markmapResizeBound = true;
+      window.addEventListener('resize', function() {
+        setTimeout(resizeMarkmaps, 120);
+      });
+    }
+  }
+
   // Mermaid: convert fenced code blocks to mermaid divs
   document.querySelectorAll('code.language-mermaid').forEach(function(code) {
     var pre = code.parentElement;
```