deepresearch-flow 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29):
  1. deepresearch_flow/cli.py +2 -0
  2. deepresearch_flow/paper/config.py +15 -0
  3. deepresearch_flow/paper/db.py +9 -0
  4. deepresearch_flow/paper/llm.py +2 -0
  5. deepresearch_flow/paper/web/app.py +413 -20
  6. deepresearch_flow/paper/web/pdfjs/build/pdf.js +18146 -0
  7. deepresearch_flow/paper/web/pdfjs/build/pdf.js.map +1 -0
  8. deepresearch_flow/paper/web/pdfjs/build/pdf.sandbox.js +280 -0
  9. deepresearch_flow/paper/web/pdfjs/build/pdf.sandbox.js.map +1 -0
  10. deepresearch_flow/paper/web/pdfjs/build/pdf.worker.js +58353 -0
  11. deepresearch_flow/paper/web/pdfjs/build/pdf.worker.js.map +1 -0
  12. deepresearch_flow/recognize/cli.py +157 -3
  13. deepresearch_flow/recognize/organize.py +58 -0
  14. deepresearch_flow/translator/__init__.py +1 -0
  15. deepresearch_flow/translator/cli.py +451 -0
  16. deepresearch_flow/translator/config.py +19 -0
  17. deepresearch_flow/translator/engine.py +959 -0
  18. deepresearch_flow/translator/fixers.py +451 -0
  19. deepresearch_flow/translator/placeholder.py +62 -0
  20. deepresearch_flow/translator/prompts.py +116 -0
  21. deepresearch_flow/translator/protector.py +291 -0
  22. deepresearch_flow/translator/segment.py +180 -0
  23. deepresearch_flow-0.3.0.dist-info/METADATA +306 -0
  24. {deepresearch_flow-0.2.0.dist-info → deepresearch_flow-0.3.0.dist-info}/RECORD +28 -13
  25. deepresearch_flow-0.2.0.dist-info/METADATA +0 -424
  26. {deepresearch_flow-0.2.0.dist-info → deepresearch_flow-0.3.0.dist-info}/WHEEL +0 -0
  27. {deepresearch_flow-0.2.0.dist-info → deepresearch_flow-0.3.0.dist-info}/entry_points.txt +0 -0
  28. {deepresearch_flow-0.2.0.dist-info → deepresearch_flow-0.3.0.dist-info}/licenses/LICENSE +0 -0
  29. {deepresearch_flow-0.2.0.dist-info → deepresearch_flow-0.3.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,291 @@
1
+ """Markdown protection and restoration using placeholders."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import List, Optional
7
+
8
+ from deepresearch_flow.translator.config import TranslateConfig
9
+ from deepresearch_flow.translator.placeholder import PlaceHolderStore
10
+
11
+
12
class MarkdownProtector:
    """Shield non-translatable markdown spans behind placeholders.

    ``protect`` runs two passes over the text using a ``PlaceHolderStore``:

    1. ``_partition_by_blocks`` — freezes whole block constructs (fenced
       code, raw HTML blocks, ``$$`` math blocks, pipe tables, footnote
       definitions, indented code) into single-token placeholders.
    2. ``_freeze_inline`` — freezes inline constructs (link definitions,
       images, links, reference links, autolinks, bare URLs, inline
       code/math, footnote references, inline HTML).

    ``unprotect`` restores every recorded placeholder via the same store.
    """

    # Regex alternation of tag names that open a block-level HTML region
    # (consumed until the matching close tag by _scan_until_html_block_end).
    BLOCK_HTML_TAGS = (
        "address|article|aside|blockquote|body|caption|center|col|colgroup|dd|details|dialog|div|dl|dt|fieldset|"
        "figcaption|figure|footer|form|frame|frameset|h[1-6]|head|header|hr|html|legend|li|link|main|menu|nav|"
        "noframes|ol|optgroup|option|p|param|section|summary|table|tbody|td|tfoot|th|thead|title|tr|ul|video|audio|canvas"
    )
    # Tags that never have a closing counterpart; their block ends on the
    # line where they open.
    VOID_HTML_TAGS = (
        "area|base|br|col|embed|hr|img|input|keygen|link|meta|param|source|track|wbr"
    )

    def protect(self, text: str, cfg: TranslateConfig, store: PlaceHolderStore) -> str:
        """Return *text* with protected spans replaced by placeholders.

        Block-level spans are frozen first so that inline patterns never
        fire inside code fences, HTML blocks, tables, etc.
        """
        stage1 = self._partition_by_blocks(text, cfg, store)
        stage2 = self._freeze_inline(stage1, cfg, store)
        return stage2

    def unprotect(self, text: str, store: PlaceHolderStore) -> str:
        """Restore all placeholders previously recorded in *store*."""
        return store.restore_all(text)

    @staticmethod
    def _is_blank(line: str) -> bool:
        """True when *line* is empty or whitespace only."""
        return len(line.strip()) == 0

    @staticmethod
    def _line_starts_with_fence(line: str) -> Optional[str]:
        """Return the fence marker (``` or ~~~, possibly longer) opening
        this line, or None if the line does not start a fence."""
        match = re.match(r"^\s*(`{3,}|~{3,})", line)
        return match.group(1) if match else None

    @staticmethod
    def _line_is_block_math_open(line: str) -> bool:
        """True for a line that is exactly ``$$`` (ignoring whitespace)."""
        return line.strip() == "$$"

    @staticmethod
    def _line_is_block_math_close(line: str) -> bool:
        # Same test as the opener: bare ``$$`` delimits both ends.
        return line.strip() == "$$"

    @staticmethod
    def _line_starts_html_codey(line: str) -> Optional[str]:
        """Return the first pre/code/script/style tag name found in *line*.

        NOTE(review): this uses ``re.search``, so a tag appearing anywhere
        in the line (e.g. an inline ``<code>`` inside a paragraph) makes
        the caller treat the whole line as an HTML block — confirm this is
        intended.
        """
        match = re.search(r"<(pre|code|script|style)(\s|>)", line, flags=re.IGNORECASE)
        return match.group(1).lower() if match else None

    @staticmethod
    def _line_ends_html(tag: str, line: str) -> bool:
        """True when *line* contains the closing tag ``</tag>``."""
        return re.search(rf"</{tag}\s*>", line, flags=re.IGNORECASE) is not None

    @staticmethod
    def _looks_like_table_header(line: str) -> bool:
        # Cheap heuristic: any pipe character may start a table header row.
        return "|" in line

    @staticmethod
    def _looks_like_table_delim(line: str) -> bool:
        """True for a table delimiter row such as ``| :--- | ---: |``."""
        return (
            re.match(r"^\s*\|?\s*:?-{3,}:?\s*(\|\s*:?-{3,}:?\s*)+\|?\s*$", line)
            is not None
        )

    @staticmethod
    def _line_starts_block_html_open(line: str) -> str | None:
        """Classify a line opening raw HTML.

        Returns a sentinel (``__comment__``/``__cdata__``/``__pi__``) for
        comments, CDATA and processing instructions, the lower-cased tag
        name for a known block-level tag, or None otherwise.
        """
        s = line.lstrip()
        if s.startswith("<!--"):
            return "__comment__"
        if s.startswith("<![CDATA["):
            return "__cdata__"
        if s.startswith("<?"):
            return "__pi__"
        match = re.match(
            rf"^<(?P<tag>{MarkdownProtector.BLOCK_HTML_TAGS})\b", s, flags=re.IGNORECASE
        )
        if match:
            return match.group("tag").lower()
        return None

    @staticmethod
    def _scan_until_html_block_end(lines: List[str], start: int, tag: str) -> int:
        """Return the index of the line that closes the HTML block opened
        at ``lines[start]``.

        Sentinel tags scan for their literal terminator (``-->``, ``]]>``,
        ``?>``); void tags end on their own line; real tags are tracked by
        counting opens minus closes per line until depth drops to zero.
        Falls back to the last line when no terminator is found.
        """
        n = len(lines)
        if tag == "__comment__":
            idx = start
            while idx < n and "-->" not in lines[idx]:
                idx += 1
            return min(idx, n - 1)
        if tag == "__cdata__":
            idx = start
            while idx < n and "]]>" not in lines[idx]:
                idx += 1
            return min(idx, n - 1)
        if tag == "__pi__":
            idx = start
            while idx < n and "?>" not in lines[idx]:
                idx += 1
            return min(idx, n - 1)

        # Void elements (<hr>, <br>, ...) have no closing tag.
        if re.match(rf"^(?:{MarkdownProtector.VOID_HTML_TAGS})$", tag, flags=re.IGNORECASE):
            return start

        # Depth counting: self-closing forms (<div/>) are excluded from the
        # open count by the negative lookahead.
        open_pat = re.compile(rf"<{tag}\b(?![^>]*?/>)", re.IGNORECASE)
        close_pat = re.compile(rf"</{tag}\s*>", re.IGNORECASE)
        depth = 0
        idx = start
        while idx < n:
            depth += len(open_pat.findall(lines[idx]))
            depth -= len(close_pat.findall(lines[idx]))
            if depth <= 0 and idx >= start:
                return idx
            idx += 1
        return n - 1

    @staticmethod
    def _partition_by_blocks(
        text: str, cfg: TranslateConfig, store: PlaceHolderStore
    ) -> str:
        """Stage 1: replace block-level constructs with placeholders.

        Scans line by line; each recognised block is joined back into one
        string, registered in *store*, and emitted as a placeholder line.
        Unterminated fences/HTML/math freeze the rest of the document.
        """
        lines = text.splitlines(keepends=True)
        out: List[str] = []
        i = 0
        n = len(lines)

        while i < n:
            line = lines[i]

            # --- fenced code (``` / ~~~) -------------------------------
            fence = MarkdownProtector._line_starts_with_fence(line)
            if fence:
                j = i + 1
                # Closing fence is matched by prefix, so a longer closing
                # run of the same character also terminates the block.
                while j < n and not re.match(rf"^\s*{re.escape(fence)}", lines[j]):
                    j += 1
                if j < n:
                    block = "".join(lines[i : j + 1])
                    placeholder = store.add("CODEFENCE", block)
                    out.append(placeholder + ("\n" if block.endswith("\n") else ""))
                    i = j + 1
                    continue
                # No closing fence: protect everything to EOF.
                block = "".join(lines[i:])
                placeholder = store.add("CODEFENCE", block)
                out.append(placeholder)
                break

            # --- <pre>/<code>/<script>/<style> regions -----------------
            tag = MarkdownProtector._line_starts_html_codey(line)
            if tag:
                j = i
                while j < n and not MarkdownProtector._line_ends_html(tag, lines[j]):
                    j += 1
                if j < n:
                    block = "".join(lines[i : j + 1])
                    placeholder = store.add("HTMLBLOCK", block)
                    out.append(placeholder + ("\n" if block.endswith("\n") else ""))
                    i = j + 1
                    continue
                block = "".join(lines[i:])
                placeholder = store.add("HTMLBLOCK", block)
                out.append(placeholder)
                break

            # --- other block-level HTML / comments / CDATA / PIs -------
            tag_block = MarkdownProtector._line_starts_block_html_open(line)
            if tag_block:
                j = MarkdownProtector._scan_until_html_block_end(lines, i, tag_block)
                block = "".join(lines[i : j + 1])
                placeholder = store.add("HTMLBLOCK", block)
                out.append(placeholder + ("\n" if block.endswith("\n") else ""))
                i = j + 1
                continue

            # --- $$ display math ---------------------------------------
            if MarkdownProtector._line_is_block_math_open(line):
                j = i + 1
                while j < n and not MarkdownProtector._line_is_block_math_close(lines[j]):
                    j += 1
                if j < n:
                    block = "".join(lines[i : j + 1])
                    placeholder = store.add("MATHBLOCK", block)
                    out.append(placeholder + ("\n" if block.endswith("\n") else ""))
                    i = j + 1
                    continue
                block = "".join(lines[i:])
                placeholder = store.add("MATHBLOCK", block)
                out.append(placeholder)
                break

            # --- pipe tables (only when table translation is disabled) --
            if not cfg.translate_tables:
                if (
                    i + 1 < n
                    and MarkdownProtector._looks_like_table_header(line)
                    and MarkdownProtector._looks_like_table_delim(lines[i + 1])
                ):
                    j = i + 2
                    # Body rows: any non-blank line containing a pipe.
                    while (
                        j < n
                        and ("|" in lines[j] or MarkdownProtector._looks_like_table_delim(lines[j]))
                        and not MarkdownProtector._is_blank(lines[j])
                    ):
                        j += 1
                    block = "".join(lines[i:j])
                    placeholder = store.add("TABLE", block)
                    out.append(placeholder + ("\n" if block.endswith("\n") else ""))
                    i = j
                    continue

            # --- footnote definitions [^id]: plus indented continuations
            if re.match(r"^\[\^[^\]]+\]:", line):
                j = i + 1
                while j < n and (
                    re.match(r"^\s{4,}", lines[j]) or MarkdownProtector._is_blank(lines[j])
                ):
                    j += 1
                block = "".join(lines[i:j])
                placeholder = store.add("FOOTDEF", block)
                out.append(placeholder + ("\n" if block.endswith("\n") else ""))
                i = j
                continue

            # --- indented code blocks (4 spaces or a tab) ---------------
            if re.match(r"^( {4}|\t)", line):
                j = i + 1
                while j < n and re.match(r"^( {4}|\t)", lines[j]):
                    j += 1
                block = "".join(lines[i:j])
                placeholder = store.add("INDENTCODE", block)
                out.append(placeholder + ("\n" if block.endswith("\n") else ""))
                i = j
                continue

            # Ordinary line: pass through for stage 2 / translation.
            out.append(line)
            i += 1

        return "".join(out)

    @staticmethod
    def _freeze_inline(text: str, cfg: TranslateConfig, store: PlaceHolderStore) -> str:
        """Stage 2: replace inline constructs with placeholders.

        Order matters: link definitions first, then images before links
        (an image is a link pattern prefixed with ``!``), then reference
        links, autolinks, bare URLs, inline code, inline math, footnote
        references and finally inline HTML.
        """
        s = text

        def repl_link_def(match: re.Match) -> str:
            return store.add("LINKDEF", match.group(0))

        # Reference-style link definitions: [label]: url
        s = re.sub(r"^\s*\[[^\]]+\]:\s*\S+.*$", repl_link_def, s, flags=re.MULTILINE)

        img_pattern = re.compile(r"!\[(?:[^\]\\]|\\.)*?\]\((?:[^()\\]|\\.)*?\)")
        if not cfg.translate_image_alt:
            # Freeze whole image, alt text included.
            s = img_pattern.sub(lambda m: store.add("IMAGE", m.group(0)), s)
        else:
            def repl_img_alt(match: re.Match) -> str:
                # Keep alt text translatable; freeze only the "](url)" part.
                full = match.group(0)
                match2 = re.match(r"(!\[)(.*?)(\]\()(.+)(\))", full)
                if not match2:
                    return store.add("IMAGE", full)
                head, alt, mid, tail, endp = match2.groups()
                placeholder = store.add("IMGURL", mid + tail + endp)
                return f"{head}{alt}{placeholder}"

            s = img_pattern.sub(repl_img_alt, s)

        link_pattern = re.compile(r"\[(?:[^\]\\]|\\.)*?\]\((?:[^()\\]|\\.)*?\)")
        if not cfg.translate_links_text:
            s = link_pattern.sub(lambda m: store.add("LINK", m.group(0)), s)
        else:
            def repl_link_text(match: re.Match) -> str:
                # Keep link text translatable; freeze only the "](url)" part.
                full = match.group(0)
                match2 = re.match(r"(\[)(.*?)(\]\()(.+)(\))", full)
                if not match2:
                    return store.add("LINK", full)
                lbr, txt, mid, tail, rbr = match2.groups()
                placeholder = store.add("LINKURL", mid + tail + rbr)
                return f"{lbr}{txt}{placeholder}"

            s = link_pattern.sub(repl_link_text, s)

        # Reference links: [text][label]
        ref_link_pattern = re.compile(r"\[(?:[^\]\\]|\\.)*?\]\[[^\]]+\]")
        s = ref_link_pattern.sub(lambda m: store.add("REFLINK", m.group(0)), s)

        # Autolinks: <https://...> / <mailto:...>
        autolink_pattern = re.compile(r"<(?:https?://|mailto:)[^>]+>")
        s = autolink_pattern.sub(lambda m: store.add("AUTOLINK", m.group(0)), s)

        # Bare URLs (up to whitespace or a closing paren).
        url_pattern = re.compile(r"(https?://[^ )\n]+)")
        s = url_pattern.sub(lambda m: store.add("URL", m.group(0)), s)

        # Inline code spans with backrefs so `` `` runs match their opener.
        inline_code_pattern = re.compile(r"(?<!`)(`+)([^`\n]+?)\1(?!`)")
        s = inline_code_pattern.sub(lambda m: store.add("CODE", m.group(0)), s)

        # Inline math: $...$ but not $$ and not "$ " (currency-style).
        inline_math_pattern = re.compile(r"\$(?!\s)([^$\n]+?)\$(?!\$)")
        s = inline_math_pattern.sub(lambda m: store.add("MATH", m.group(0)), s)

        # Footnote references: [^id]
        footref_pattern = re.compile(r"\[\^[^\]]+\]")
        s = footref_pattern.sub(lambda m: store.add("FOOTREF", m.group(0)), s)

        # Inline HTML pairs. NOTE(review): the close tag is not required to
        # match the open tag name (e.g. <b>x</i> is frozen as one span) —
        # confirm this looseness is acceptable.
        inline_html_pattern = re.compile(r"<[A-Za-z][^>]*?>.*?</[A-Za-z][^>]*?>", re.DOTALL)
        s = inline_html_pattern.sub(lambda m: store.add("HTMLINLINE", m.group(0)), s)

        return s
@@ -0,0 +1,180 @@
1
+ """Segment markdown into translatable nodes while preserving separators."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ import re
7
+ from typing import Iterable
8
+
9
+
10
@dataclass
class Node:
    """One translatable unit of markdown text."""

    nid: int  # unique id, assigned sequentially by split_to_segments
    origin_text: str  # source-language text for this unit
    translated_text: str = ""  # filled in after translation; empty until then
15
+
16
+
17
@dataclass
class Segment:
    """A piece of the original document layout.

    Either a verbatim separator (blank line) or a run of node ids whose
    translated texts are substituted on reassembly.
    """

    kind: str  # "sep" or "nodes"
    content: str | list[int]  # separator text for "sep"; node ids for "nodes"
21
+
22
+
23
+ def _is_blank(line: str) -> bool:
24
+ return len(line.strip()) == 0
25
+
26
+
27
+ def _looks_like_heading(line: str) -> bool:
28
+ return re.match(r"^\s{0,3}#{1,6}\s+", line) is not None
29
+
30
+
31
+ def _looks_like_list_item(line: str) -> bool:
32
+ return re.match(r"^\s{0,3}(-|\*|\+|\d{1,9}\.)\s+", line) is not None
33
+
34
+
35
def _split_long_text(text: str, max_chars: int) -> list[str]:
    """Split *text* at sentence boundaries into chunks of at most *max_chars*.

    Chunks that still exceed the limit are further divided at clause
    punctuation via _soft_split_long_sentence. A non-positive *max_chars*
    disables splitting entirely.
    """
    if max_chars <= 0 or len(text) <= max_chars:
        return [text]
    # The whitespace after sentence punctuation is captured as its own
    # token so that re-joining the chunks reproduces the original spacing.
    pieces = re.split(r"(?<=[。!?!?\.])(\s+)", text)
    chunks: list[str] = []
    current = ""
    for piece in pieces:
        if not piece:
            continue
        if current and len(current) + len(piece) > max_chars:
            chunks.append(current)
            current = ""
        current += piece
    if current:
        chunks.append(current)
    result: list[str] = []
    for chunk in chunks:
        if len(chunk) <= max_chars:
            result.append(chunk)
        else:
            softened = _soft_split_long_sentence(chunk, max_chars)
            # A single-element result means no clause split was possible;
            # keep the oversized chunk as-is.
            result.extend(softened if len(softened) > 1 else [chunk])
    return result
61
+
62
+
63
+ def _soft_split_long_sentence(text: str, max_chars: int) -> list[str]:
64
+ if max_chars <= 0 or len(text) <= max_chars:
65
+ return [text]
66
+ tokens = re.split(r"(?<=[,,、;;::])(\s+)", text)
67
+ parts: list[str] = []
68
+ buf = ""
69
+ for token in tokens:
70
+ if token == "":
71
+ continue
72
+ if buf and len(buf) + len(token) > max_chars:
73
+ parts.append(buf)
74
+ buf = ""
75
+ buf += token
76
+ if buf:
77
+ parts.append(buf)
78
+ if len(parts) <= 1:
79
+ return [text]
80
+ return parts
81
+
82
+
83
def _collect_list_block(lines: list[str], start: int) -> tuple[list[str], int]:
    """Gather the list item at *start* plus its more-indented continuations.

    Collection stops at a blank line, at a sibling/outer list item, or at
    any line not indented deeper than the item itself.
    Returns ``(block_lines, next_index)``.
    """
    first = lines[start]
    base_indent = len(first) - len(first.lstrip())
    collected = [first]
    idx = start + 1
    total = len(lines)
    while idx < total:
        current = lines[idx]
        if _is_blank(current):
            break
        depth = len(current) - len(current.lstrip())
        if _looks_like_list_item(current) and depth <= base_indent:
            break  # sibling or outer list item starts a new block
        if depth <= base_indent:
            break  # dedented non-list line ends the item
        collected.append(current)
        idx += 1
    return collected, idx
101
+
102
+
103
def split_to_segments(
    text: str, max_chunk_chars: int
) -> tuple[list[Segment], dict[int, Node]]:
    """Partition markdown *text* into separator and node segments.

    Blank lines become ``"sep"`` segments carried verbatim; headings, list
    blocks and paragraph runs become numbered ``Node`` objects (split into
    pieces of at most *max_chunk_chars* characters) referenced by
    ``"nodes"`` segments. Returns ``(segments, nodes_by_id)``.
    """
    lines = text.splitlines(keepends=True)
    segments: list[Segment] = []
    nodes: dict[int, Node] = {}
    next_id = 0
    pending: list[str] = []

    def register_parts(raw: str) -> None:
        # Turn one block of text into nodes, honouring the chunk limit.
        nonlocal next_id
        ids: list[int] = []
        for piece in _split_long_text(raw, max_chunk_chars):
            if piece == "":
                continue
            nodes[next_id] = Node(nid=next_id, origin_text=piece)
            ids.append(next_id)
            next_id += 1
        if ids:
            segments.append(Segment(kind="nodes", content=ids))

    def flush_pending() -> None:
        # Emit the accumulated paragraph run (if any) as node segments.
        if pending:
            joined = "".join(pending)
            pending.clear()
            register_parts(joined)

    idx = 0
    total = len(lines)
    while idx < total:
        current = lines[idx]
        if _is_blank(current):
            flush_pending()
            segments.append(Segment(kind="sep", content=current))
            idx += 1
        elif _looks_like_heading(current):
            # A heading is always a single node, never chunk-split.
            flush_pending()
            nodes[next_id] = Node(nid=next_id, origin_text=current)
            segments.append(Segment(kind="nodes", content=[next_id]))
            next_id += 1
            idx += 1
        elif _looks_like_list_item(current):
            flush_pending()
            block_lines, idx = _collect_list_block(lines, idx)
            register_parts("".join(block_lines))
        else:
            pending.append(current)
            idx += 1

    flush_pending()
    return segments, nodes
165
+
166
+
167
def reassemble_segments(segments: Iterable[Segment], nodes: dict[int, Node]) -> str:
    """Stitch translated node texts and literal separators back together.

    Separator segments contribute their content verbatim; node segments
    contribute each referenced node's ``translated_text``. Unknown node
    ids and non-list "nodes" contents are skipped silently.
    """
    pieces: list[str] = []
    for seg in segments:
        if seg.kind == "sep":
            pieces.append(str(seg.content))
        elif isinstance(seg.content, list):
            for nid in seg.content:
                node = nodes.get(nid)
                if node is not None:
                    pieces.append(node.translated_text)
    return "".join(pieces)