deepresearch-flow 0.5.1__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/cli.py +63 -0
- deepresearch_flow/paper/config.py +87 -12
- deepresearch_flow/paper/db.py +1041 -34
- deepresearch_flow/paper/db_ops.py +124 -19
- deepresearch_flow/paper/extract.py +1546 -152
- deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +5 -0
- deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +1 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
- deepresearch_flow/paper/providers/azure_openai.py +45 -3
- deepresearch_flow/paper/providers/openai_compatible.py +45 -3
- deepresearch_flow/paper/schemas/deep_read_phi_schema.json +1 -0
- deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
- deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
- deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
- deepresearch_flow/paper/snapshot/__init__.py +4 -0
- deepresearch_flow/paper/snapshot/api.py +941 -0
- deepresearch_flow/paper/snapshot/builder.py +965 -0
- deepresearch_flow/paper/snapshot/identity.py +239 -0
- deepresearch_flow/paper/snapshot/schema.py +245 -0
- deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
- deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
- deepresearch_flow/paper/snapshot/text.py +154 -0
- deepresearch_flow/paper/template_registry.py +1 -0
- deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
- deepresearch_flow/paper/templates/deep_read_phi.md.j2 +4 -0
- deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
- deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
- deepresearch_flow/paper/web/app.py +10 -3
- deepresearch_flow/recognize/cli.py +380 -103
- deepresearch_flow/recognize/markdown.py +31 -7
- deepresearch_flow/recognize/math.py +47 -12
- deepresearch_flow/recognize/mermaid.py +320 -10
- deepresearch_flow/recognize/organize.py +29 -7
- deepresearch_flow/translator/cli.py +71 -20
- deepresearch_flow/translator/engine.py +220 -81
- deepresearch_flow/translator/prompts.py +19 -2
- deepresearch_flow/translator/protector.py +15 -3
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +51 -43
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.5.1.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Iterable
|
|
5
|
+
|
|
6
|
+
from markdown_it import MarkdownIt
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# Matches an entire raw HTML <table> ... </table> element, case-insensitively
# and across newlines, so it can be dropped before markdown parsing.
_HTML_TABLE_RE = re.compile(r"<table\b.*?</table>", re.IGNORECASE | re.DOTALL)
# Matches a single HTML/XML tag such as "<b>" or "</div>".
_TAG_RE = re.compile(r"<[^>]+>")
# Matches any run of whitespace; used to collapse text to single spaces.
_WS_RE = re.compile(r"\s+")
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _is_cjk_char(ch: str) -> bool:
|
|
15
|
+
code = ord(ch)
|
|
16
|
+
return (
|
|
17
|
+
0x3400 <= code <= 0x4DBF # CJK Unified Ideographs Extension A
|
|
18
|
+
or 0x4E00 <= code <= 0x9FFF # CJK Unified Ideographs
|
|
19
|
+
or 0xF900 <= code <= 0xFAFF # CJK Compatibility Ideographs
|
|
20
|
+
or 0x3040 <= code <= 0x309F # Hiragana
|
|
21
|
+
or 0x30A0 <= code <= 0x30FF # Katakana
|
|
22
|
+
or 0xAC00 <= code <= 0xD7AF # Hangul syllables
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def insert_cjk_spaces(text: str) -> str:
    """Insert one ASCII space between every pair of adjacent CJK characters.

    Non-CJK neighbours are left untouched; an empty string comes back as-is.
    """
    if not text:
        return text
    pieces: list[str] = [text[0]]
    for prev, cur in zip(text, text[1:]):
        if _is_cjk_char(prev) and _is_cjk_char(cur):
            pieces.append(" ")
        pieces.append(cur)
    return "".join(pieces)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def remove_cjk_spaces(text: str) -> str:
    """Drop each single space that sits directly between two CJK characters.

    Inverse of insert_cjk_spaces for round-tripping; spaces at the very start
    or end of the string, and spaces beside non-CJK characters, are kept.
    """
    if " " not in text:
        # Fast path: nothing to remove.
        return text
    kept: list[str] = []
    last = len(text) - 1
    for idx, ch in enumerate(text):
        surrounded = (
            ch == " "
            and 0 < idx < last
            and _is_cjk_char(text[idx - 1])
            and _is_cjk_char(text[idx + 1])
        )
        if not surrounded:
            kept.append(ch)
    return "".join(kept)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def merge_adjacent_markers(text: str, *, start_marker: str = "[[[", end_marker: str = "]]]") -> str:
    """Collapse back-to-back protected spans by deleting "end+start" seams.

    Repeats until a fixpoint so that seams newly exposed by a deletion are
    also removed (e.g. "]]]]]][[[[[[").
    """
    seam = f"{end_marker}{start_marker}"
    result = text
    while True:
        shrunk = result.replace(seam, "")
        if shrunk == result:
            return result
        result = shrunk
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _md_renderer() -> MarkdownIt:
    """Build a CommonMark parser with GFM tables enabled and raw HTML/linkify off."""
    renderer = MarkdownIt("commonmark", {"html": False, "linkify": False})
    renderer.enable("table")
    return renderer
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def markdown_to_plain_text(markdown: str) -> str:
    """Reduce a markdown document to whitespace-collapsed plain text.

    Raw HTML tables are removed outright, markdown tables are skipped while
    walking the token stream, and leftover inline tags are stripped at the
    end. Only text, inline code, line breaks, and image alt text contribute.
    """
    if not markdown:
        return ""
    without_html_tables = _HTML_TABLE_RE.sub(" ", markdown)
    parser = _md_renderer()

    pieces: list[str] = []
    table_depth = 0
    for tok in parser.parse(without_html_tables):
        if tok.type == "table_open":
            table_depth += 1
        elif tok.type == "table_close":
            table_depth = max(0, table_depth - 1)
        elif table_depth == 0 and tok.type == "inline":
            for child in tok.children or []:
                kind = child.type
                if kind in ("text", "code_inline"):
                    pieces.append(child.content)
                elif kind in ("softbreak", "hardbreak"):
                    # Both break kinds become newlines; collapsed below anyway.
                    pieces.append("\n")
                elif kind == "image" and child.content:
                    # Keep image alt text.
                    pieces.append(child.content)

    # Collapse first so tags split across lines rejoin, then strip tags,
    # then collapse the holes the stripping left behind.
    flattened = _WS_RE.sub(" ", " ".join(pieces)).strip()
    flattened = _TAG_RE.sub(" ", flattened)
    return _WS_RE.sub(" ", flattened).strip()
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def normalize_query_punctuation(text: str) -> str:
|
|
101
|
+
if not text:
|
|
102
|
+
return ""
|
|
103
|
+
return re.sub(r"[,。、《》、;:!?()【】「」『』·…—]+", " ", text)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def split_mixed_cjk_latin(token: str) -> list[str]:
    """Split *token* into maximal runs of CJK vs. non-CJK characters.

    Example: "GPT模型v2" -> ["GPT", "模型", "v2"]. Empty input gives [].
    """
    if not token:
        return []
    runs: list[str] = []
    current: list[str] = []
    current_kind: bool | None = None
    for ch in token:
        kind = _is_cjk_char(ch)
        if current_kind is not None and kind != current_kind:
            # Character class changed: close the current run.
            runs.append("".join(current))
            current = []
        current.append(ch)
        current_kind = kind
    runs.append("".join(current))
    return runs
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def rewrite_search_query(user_query: str) -> str:
    """Rewrite a free-form user query into a search-friendly token string.

    CJK punctuation becomes whitespace; AND/OR (any case) survive as
    uppercase operators; pure-CJK runs become quoted, per-character-spaced
    phrases; other runs are lowercased and stripped to [0-9A-Za-z._+-].
    Returns "" when nothing usable remains.
    """
    normalized = _WS_RE.sub(" ", normalize_query_punctuation(user_query)).strip()
    if not normalized:
        return ""

    terms: list[str] = []
    for word in normalized.split(" "):
        if not word:
            continue
        operator = word.upper()
        if operator in ("AND", "OR"):
            terms.append(operator)
            continue

        for segment in split_mixed_cjk_latin(word):
            if not segment:
                continue
            if all(_is_cjk_char(c) for c in segment):
                # Quoted phrase with per-character spacing for CJK FTS.
                terms.append('"' + insert_cjk_spaces(segment) + '"')
            else:
                latin = re.sub(r"[^0-9A-Za-z._+-]+", "", segment)
                if latin:
                    terms.append(latin.lower())

    return " ".join(terms)
|
|
154
|
+
|
|
@@ -3,6 +3,10 @@
|
|
|
3
3
|
{% set is_zh = output_language == "zh" %}
|
|
4
4
|
**{{ "作者 / Authors" if is_zh else "Authors" }}:** {{ paper_authors | join(", ") }}
|
|
5
5
|
|
|
6
|
+
{% if paper_institutions %}
|
|
7
|
+
**{{ "单位 / Institutions" if is_zh else "Institutions" }}:** {{ paper_institutions | join(", ") }}
|
|
8
|
+
{% endif %}
|
|
9
|
+
|
|
6
10
|
{% if output_language %}
|
|
7
11
|
**{{ "输出语言 / Output Language" if is_zh else "Output Language" }}:** {{ output_language }}
|
|
8
12
|
{% endif %}
|
|
@@ -3,6 +3,10 @@
|
|
|
3
3
|
{% set is_zh = output_language == "zh" %}
|
|
4
4
|
**{{ "作者 / Authors" if is_zh else "Authors" }}:** {{ paper_authors | join(", ") }}
|
|
5
5
|
|
|
6
|
+
{% if paper_institutions %}
|
|
7
|
+
**{{ "单位 / Institutions" if is_zh else "Institutions" }}:** {{ paper_institutions | join(", ") }}
|
|
8
|
+
{% endif %}
|
|
9
|
+
|
|
6
10
|
{% if output_language %}
|
|
7
11
|
**{{ "输出语言 / Output Language" if is_zh else "Output Language" }}:** {{ output_language }}
|
|
8
12
|
{% endif %}
|
|
@@ -3,6 +3,10 @@
|
|
|
3
3
|
{% set is_zh = output_language == "zh" %}
|
|
4
4
|
**{{ "作者 / Authors" if is_zh else "Authors" }}:** {{ paper_authors | join(", ") }}
|
|
5
5
|
|
|
6
|
+
{% if paper_institutions %}
|
|
7
|
+
**{{ "单位 / Institutions" if is_zh else "Institutions" }}:** {{ paper_institutions | join(", ") }}
|
|
8
|
+
{% endif %}
|
|
9
|
+
|
|
6
10
|
{% if output_language %}
|
|
7
11
|
**{{ "输出语言 / Output Language" if is_zh else "Output Language" }}:** {{ output_language }}
|
|
8
12
|
{% endif %}
|
|
@@ -4,6 +4,10 @@
|
|
|
4
4
|
|
|
5
5
|
**{{ "作者 / Authors" if is_zh else "Authors" }}:** {{ paper_authors | join(", ") }}
|
|
6
6
|
|
|
7
|
+
{% if paper_institutions %}
|
|
8
|
+
**{{ "单位 / Institutions" if is_zh else "Institutions" }}:** {{ paper_institutions | join(", ") }}
|
|
9
|
+
{% endif %}
|
|
10
|
+
|
|
7
11
|
{% if output_language %}
|
|
8
12
|
**{{ "输出语言 / Output Language" if is_zh else "Output Language" }}:** {{ output_language }}
|
|
9
13
|
{% endif %}
|
|
@@ -91,7 +91,7 @@ def create_app(
|
|
|
91
91
|
pdf_roots=pdf_roots,
|
|
92
92
|
)
|
|
93
93
|
md = create_md_renderer()
|
|
94
|
-
static_base_url = static_base_url or os.getenv("PAPER_DB_STATIC_BASE_URL")
|
|
94
|
+
static_base_url = static_base_url or os.getenv("PAPER_DB_STATIC_BASE") or os.getenv("PAPER_DB_STATIC_BASE_URL")
|
|
95
95
|
static_mode = _normalize_static_mode(static_mode or os.getenv("PAPER_DB_STATIC_MODE"))
|
|
96
96
|
resolved_mode = _resolve_static_mode(static_mode, static_base_url)
|
|
97
97
|
export_dir_value = static_export_dir or os.getenv("PAPER_DB_STATIC_EXPORT_DIR")
|
|
@@ -111,7 +111,12 @@ def create_app(
|
|
|
111
111
|
asset_config = None
|
|
112
112
|
if resolved_mode == "prod":
|
|
113
113
|
if not static_base_url:
|
|
114
|
-
logger.warning(
|
|
114
|
+
logger.warning(
|
|
115
|
+
"Static mode set to prod without base URL; falling back to dev asset routes "
|
|
116
|
+
"(static_mode=%s, static_base_url=%s)",
|
|
117
|
+
static_mode,
|
|
118
|
+
static_base_url or "<empty>",
|
|
119
|
+
)
|
|
115
120
|
resolved_mode = "dev"
|
|
116
121
|
else:
|
|
117
122
|
asset_config = build_static_assets(
|
|
@@ -149,8 +154,10 @@ def create_app(
|
|
|
149
154
|
)
|
|
150
155
|
elif pdf_roots:
|
|
151
156
|
logger.warning(
|
|
152
|
-
"PDF.js viewer assets not found at %s; PDF Viewer mode will be unavailable
|
|
157
|
+
"PDF.js viewer assets not found at %s; PDF Viewer mode will be unavailable "
|
|
158
|
+
"(pdf_roots=%d).",
|
|
153
159
|
PDFJS_STATIC_DIR,
|
|
160
|
+
len(pdf_roots),
|
|
154
161
|
)
|
|
155
162
|
if STATIC_DIR.exists():
|
|
156
163
|
routes.append(
|