deepresearch-flow 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/cli.py +2 -0
- deepresearch_flow/paper/config.py +15 -0
- deepresearch_flow/paper/db.py +193 -0
- deepresearch_flow/paper/db_ops.py +1939 -0
- deepresearch_flow/paper/llm.py +2 -0
- deepresearch_flow/paper/web/app.py +46 -3320
- deepresearch_flow/paper/web/constants.py +23 -0
- deepresearch_flow/paper/web/filters.py +255 -0
- deepresearch_flow/paper/web/handlers/__init__.py +14 -0
- deepresearch_flow/paper/web/handlers/api.py +217 -0
- deepresearch_flow/paper/web/handlers/pages.py +334 -0
- deepresearch_flow/paper/web/markdown.py +549 -0
- deepresearch_flow/paper/web/static/css/main.css +857 -0
- deepresearch_flow/paper/web/static/js/detail.js +406 -0
- deepresearch_flow/paper/web/static/js/index.js +266 -0
- deepresearch_flow/paper/web/static/js/outline.js +58 -0
- deepresearch_flow/paper/web/static/js/stats.js +39 -0
- deepresearch_flow/paper/web/templates/base.html +43 -0
- deepresearch_flow/paper/web/templates/detail.html +332 -0
- deepresearch_flow/paper/web/templates/index.html +114 -0
- deepresearch_flow/paper/web/templates/stats.html +29 -0
- deepresearch_flow/paper/web/templates.py +85 -0
- deepresearch_flow/paper/web/text.py +68 -0
- deepresearch_flow/recognize/cli.py +157 -3
- deepresearch_flow/recognize/organize.py +58 -0
- deepresearch_flow/translator/__init__.py +1 -0
- deepresearch_flow/translator/cli.py +451 -0
- deepresearch_flow/translator/config.py +19 -0
- deepresearch_flow/translator/engine.py +959 -0
- deepresearch_flow/translator/fixers.py +451 -0
- deepresearch_flow/translator/placeholder.py +62 -0
- deepresearch_flow/translator/prompts.py +116 -0
- deepresearch_flow/translator/protector.py +291 -0
- deepresearch_flow/translator/segment.py +180 -0
- deepresearch_flow-0.4.0.dist-info/METADATA +327 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/RECORD +40 -13
- deepresearch_flow-0.2.1.dist-info/METADATA +0 -424
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
"""Markdown protection and restoration using placeholders."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
|
|
8
|
+
from deepresearch_flow.translator.config import TranslateConfig
|
|
9
|
+
from deepresearch_flow.translator.placeholder import PlaceHolderStore
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class MarkdownProtector:
    """Shield non-translatable markdown constructs behind placeholders.

    ``protect`` runs two passes: a block-level pass that removes whole
    multi-line regions (code fences, HTML blocks, display math, tables,
    footnote definitions, indented code) and an inline pass that freezes
    spans inside the remaining text (links, URLs, code spans, inline math,
    footnote refs, inline HTML).  ``unprotect`` swaps every placeholder
    back via the store.
    """

    # Alternation of HTML tag names treated as block-level openers.
    BLOCK_HTML_TAGS = (
        "address|article|aside|blockquote|body|caption|center|col|colgroup|dd|details|dialog|div|dl|dt|fieldset|"
        "figcaption|figure|footer|form|frame|frameset|h[1-6]|head|header|hr|html|legend|li|link|main|menu|nav|"
        "noframes|ol|optgroup|option|p|param|section|summary|table|tbody|td|tfoot|th|thead|title|tr|ul|video|audio|canvas"
    )
    # Void elements never have a closing tag.
    VOID_HTML_TAGS = (
        "area|base|br|col|embed|hr|img|input|keygen|link|meta|param|source|track|wbr"
    )

    def protect(self, text: str, cfg: TranslateConfig, store: PlaceHolderStore) -> str:
        """Replace non-translatable markdown with placeholders from *store*.

        The block pass must run first so the inline pass never looks inside
        a code fence or HTML block.
        """
        blocked = self._partition_by_blocks(text, cfg, store)
        return self._freeze_inline(blocked, cfg, store)

    def unprotect(self, text: str, store: PlaceHolderStore) -> str:
        """Restore every placeholder in *text* to its original content."""
        return store.restore_all(text)

    @staticmethod
    def _is_blank(line: str) -> bool:
        """True when *line* holds only whitespace."""
        return not line.strip()

    @staticmethod
    def _line_starts_with_fence(line: str) -> Optional[str]:
        """Return the opening fence marker (``` or ~~~, possibly longer), if any."""
        found = re.match(r"^\s*(`{3,}|~{3,})", line)
        if found is None:
            return None
        return found.group(1)

    @staticmethod
    def _line_is_block_math_open(line: str) -> bool:
        """A display-math delimiter is a line containing exactly ``$$``."""
        return line.strip() == "$$"

    @staticmethod
    def _line_is_block_math_close(line: str) -> bool:
        """Closing delimiter is identical to the opening one."""
        return line.strip() == "$$"

    @staticmethod
    def _line_starts_html_codey(line: str) -> Optional[str]:
        """Return the lowercased tag if the line opens <pre|code|script|style>."""
        found = re.search(r"<(pre|code|script|style)(\s|>)", line, flags=re.IGNORECASE)
        if found is None:
            return None
        return found.group(1).lower()

    @staticmethod
    def _line_ends_html(tag: str, line: str) -> bool:
        """True when *line* contains the closing tag for *tag*."""
        return re.search(rf"</{tag}\s*>", line, flags=re.IGNORECASE) is not None

    @staticmethod
    def _looks_like_table_header(line: str) -> bool:
        """Cheap heuristic: any pipe character may start a table header row."""
        return "|" in line

    @staticmethod
    def _looks_like_table_delim(line: str) -> bool:
        """Match a table delimiter row such as ``|---|:---:|``."""
        pattern = r"^\s*\|?\s*:?-{3,}:?\s*(\|\s*:?-{3,}:?\s*)+\|?\s*$"
        return re.match(pattern, line) is not None

    @staticmethod
    def _line_starts_block_html_open(line: str) -> str | None:
        """Classify a line that opens an HTML block construct.

        Returns a sentinel for comments / CDATA / processing instructions,
        the lowercased tag name for known block-level tags, else None.
        """
        stripped = line.lstrip()
        for prefix, sentinel in (
            ("<!--", "__comment__"),
            ("<![CDATA[", "__cdata__"),
            ("<?", "__pi__"),
        ):
            if stripped.startswith(prefix):
                return sentinel
        found = re.match(
            rf"^<(?P<tag>{MarkdownProtector.BLOCK_HTML_TAGS})\b", stripped, flags=re.IGNORECASE
        )
        return found.group("tag").lower() if found else None

    @staticmethod
    def _scan_until_html_block_end(lines: List[str], start: int, tag: str) -> int:
        """Return the index of the line that closes the HTML block begun at *start*."""
        count = len(lines)

        # Comments / CDATA / processing instructions end at a literal terminator.
        terminator = {"__comment__": "-->", "__cdata__": "]]>", "__pi__": "?>"}.get(tag)
        if terminator is not None:
            pos = start
            while pos < count and terminator not in lines[pos]:
                pos += 1
            return min(pos, count - 1)

        # Void elements have no closing tag: the block is the opening line itself.
        if re.match(rf"^(?:{MarkdownProtector.VOID_HTML_TAGS})$", tag, flags=re.IGNORECASE):
            return start

        # Otherwise track nesting depth of <tag> ... </tag> pairs line by line.
        opener = re.compile(rf"<{tag}\b(?![^>]*?/>)", re.IGNORECASE)
        closer = re.compile(rf"</{tag}\s*>", re.IGNORECASE)
        depth = 0
        pos = start
        while pos < count:
            depth += len(opener.findall(lines[pos]))
            depth -= len(closer.findall(lines[pos]))
            if depth <= 0 and pos >= start:
                return pos
            pos += 1
        return count - 1

    @staticmethod
    def _partition_by_blocks(
        text: str, cfg: TranslateConfig, store: PlaceHolderStore
    ) -> str:
        """Block-level pass: swap multi-line regions for placeholders.

        Branch order matters: fences shadow everything, then code-ish HTML,
        generic block HTML, display math, (optionally) tables, footnote
        definitions, and finally indented code.
        """
        lines = text.splitlines(keepends=True)
        total = len(lines)
        pieces: List[str] = []

        def shield(kind: str, chunk: str) -> None:
            # Emit the placeholder, keeping a trailing newline outside it so
            # the surrounding line structure survives restoration.
            token = store.add(kind, chunk)
            pieces.append(token + ("\n" if chunk.endswith("\n") else ""))

        pos = 0
        while pos < total:
            line = lines[pos]

            fence = MarkdownProtector._line_starts_with_fence(line)
            if fence:
                end = pos + 1
                # A closing fence must be at least as long as the opener.
                while end < total and not re.match(rf"^\s*{re.escape(fence)}", lines[end]):
                    end += 1
                if end < total:
                    shield("CODEFENCE", "".join(lines[pos : end + 1]))
                    pos = end + 1
                    continue
                # Unterminated fence: protect everything to EOF.
                pieces.append(store.add("CODEFENCE", "".join(lines[pos:])))
                break

            codey = MarkdownProtector._line_starts_html_codey(line)
            if codey:
                end = pos
                while end < total and not MarkdownProtector._line_ends_html(codey, lines[end]):
                    end += 1
                if end < total:
                    shield("HTMLBLOCK", "".join(lines[pos : end + 1]))
                    pos = end + 1
                    continue
                pieces.append(store.add("HTMLBLOCK", "".join(lines[pos:])))
                break

            opened = MarkdownProtector._line_starts_block_html_open(line)
            if opened:
                end = MarkdownProtector._scan_until_html_block_end(lines, pos, opened)
                shield("HTMLBLOCK", "".join(lines[pos : end + 1]))
                pos = end + 1
                continue

            if MarkdownProtector._line_is_block_math_open(line):
                end = pos + 1
                while end < total and not MarkdownProtector._line_is_block_math_close(lines[end]):
                    end += 1
                if end < total:
                    shield("MATHBLOCK", "".join(lines[pos : end + 1]))
                    pos = end + 1
                    continue
                pieces.append(store.add("MATHBLOCK", "".join(lines[pos:])))
                break

            # Tables are only frozen when the config says not to translate them.
            if not cfg.translate_tables:
                if (
                    pos + 1 < total
                    and MarkdownProtector._looks_like_table_header(line)
                    and MarkdownProtector._looks_like_table_delim(lines[pos + 1])
                ):
                    end = pos + 2
                    while (
                        end < total
                        and ("|" in lines[end] or MarkdownProtector._looks_like_table_delim(lines[end]))
                        and not MarkdownProtector._is_blank(lines[end])
                    ):
                        end += 1
                    shield("TABLE", "".join(lines[pos:end]))
                    pos = end
                    continue

            # Footnote definition: the marker line plus indented/blank continuations.
            if re.match(r"^\[\^[^\]]+\]:", line):
                end = pos + 1
                while end < total and (
                    re.match(r"^\s{4,}", lines[end]) or MarkdownProtector._is_blank(lines[end])
                ):
                    end += 1
                shield("FOOTDEF", "".join(lines[pos:end]))
                pos = end
                continue

            # Classic 4-space / tab indented code block.
            if re.match(r"^( {4}|\t)", line):
                end = pos + 1
                while end < total and re.match(r"^( {4}|\t)", lines[end]):
                    end += 1
                shield("INDENTCODE", "".join(lines[pos:end]))
                pos = end
                continue

            pieces.append(line)
            pos += 1

        return "".join(pieces)

    @staticmethod
    def _freeze_inline(text: str, cfg: TranslateConfig, store: PlaceHolderStore) -> str:
        """Inline pass: freeze spans the translator must not touch.

        Application order is significant: link definitions, images, links,
        reference links, autolinks, bare URLs, code spans, inline math,
        footnote refs, and inline HTML last.
        """
        frozen = text

        # Link reference definitions occupy a whole line.
        frozen = re.sub(
            r"^\s*\[[^\]]+\]:\s*\S+.*$",
            lambda m: store.add("LINKDEF", m.group(0)),
            frozen,
            flags=re.MULTILINE,
        )

        img_re = re.compile(r"!\[(?:[^\]\\]|\\.)*?\]\((?:[^()\\]|\\.)*?\)")
        if cfg.translate_image_alt:

            def keep_alt(match: re.Match) -> str:
                # Freeze only the URL part so the alt text stays translatable.
                whole = match.group(0)
                parsed = re.match(r"(!\[)(.*?)(\]\()(.+)(\))", whole)
                if parsed is None:
                    return store.add("IMAGE", whole)
                bang, alt, mid, url, closer = parsed.groups()
                token = store.add("IMGURL", mid + url + closer)
                return f"{bang}{alt}{token}"

            frozen = img_re.sub(keep_alt, frozen)
        else:
            frozen = img_re.sub(lambda m: store.add("IMAGE", m.group(0)), frozen)

        link_re = re.compile(r"\[(?:[^\]\\]|\\.)*?\]\((?:[^()\\]|\\.)*?\)")
        if cfg.translate_links_text:

            def keep_text(match: re.Match) -> str:
                # Freeze only the URL part so the link label stays translatable.
                whole = match.group(0)
                parsed = re.match(r"(\[)(.*?)(\]\()(.+)(\))", whole)
                if parsed is None:
                    return store.add("LINK", whole)
                lbr, label, mid, url, rbr = parsed.groups()
                token = store.add("LINKURL", mid + url + rbr)
                return f"{lbr}{label}{token}"

            frozen = link_re.sub(keep_text, frozen)
        else:
            frozen = link_re.sub(lambda m: store.add("LINK", m.group(0)), frozen)

        # Remaining constructs are whole-match freezes; order still matters.
        simple_patterns = (
            ("REFLINK", r"\[(?:[^\]\\]|\\.)*?\]\[[^\]]+\]", 0),
            ("AUTOLINK", r"<(?:https?://|mailto:)[^>]+>", 0),
            ("URL", r"(https?://[^ )\n]+)", 0),
            ("CODE", r"(?<!`)(`+)([^`\n]+?)\1(?!`)", 0),
            ("MATH", r"\$(?!\s)([^$\n]+?)\$(?!\$)", 0),
            ("FOOTREF", r"\[\^[^\]]+\]", 0),
            ("HTMLINLINE", r"<[A-Za-z][^>]*?>.*?</[A-Za-z][^>]*?>", re.DOTALL),
        )
        for kind, pattern, flags in simple_patterns:
            frozen = re.sub(
                pattern,
                lambda m, k=kind: store.add(k, m.group(0)),
                frozen,
                flags=flags,
            )

        return frozen
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""Segment markdown into translatable nodes while preserving separators."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
import re
|
|
7
|
+
from typing import Iterable
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class Node:
    """One translatable chunk of markdown text."""

    nid: int  # identifier used as the key in the nodes mapping
    origin_text: str  # source-language text, whitespace preserved
    translated_text: str = ""  # translation result; empty until filled in
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class Segment:
    """Either a literal separator or a run of translatable nodes."""

    kind: str  # "sep" or "nodes"
    content: str | list[int]  # raw separator text for "sep"; node ids for "nodes"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _is_blank(line: str) -> bool:
|
|
24
|
+
return len(line.strip()) == 0
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _looks_like_heading(line: str) -> bool:
|
|
28
|
+
return re.match(r"^\s{0,3}#{1,6}\s+", line) is not None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _looks_like_list_item(line: str) -> bool:
|
|
32
|
+
return re.match(r"^\s{0,3}(-|\*|\+|\d{1,9}\.)\s+", line) is not None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _split_long_text(text: str, max_chars: int) -> list[str]:
|
|
36
|
+
if max_chars <= 0 or len(text) <= max_chars:
|
|
37
|
+
return [text]
|
|
38
|
+
tokens = re.split(r"(?<=[。!?!?\.])(\s+)", text)
|
|
39
|
+
parts: list[str] = []
|
|
40
|
+
buf = ""
|
|
41
|
+
for token in tokens:
|
|
42
|
+
if token == "":
|
|
43
|
+
continue
|
|
44
|
+
if buf and len(buf) + len(token) > max_chars:
|
|
45
|
+
parts.append(buf)
|
|
46
|
+
buf = ""
|
|
47
|
+
buf += token
|
|
48
|
+
if buf:
|
|
49
|
+
parts.append(buf)
|
|
50
|
+
final_parts: list[str] = []
|
|
51
|
+
for part in parts:
|
|
52
|
+
if len(part) <= max_chars:
|
|
53
|
+
final_parts.append(part)
|
|
54
|
+
continue
|
|
55
|
+
soft_parts = _soft_split_long_sentence(part, max_chars)
|
|
56
|
+
if len(soft_parts) == 1:
|
|
57
|
+
final_parts.append(part)
|
|
58
|
+
else:
|
|
59
|
+
final_parts.extend(soft_parts)
|
|
60
|
+
return final_parts
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _soft_split_long_sentence(text: str, max_chars: int) -> list[str]:
|
|
64
|
+
if max_chars <= 0 or len(text) <= max_chars:
|
|
65
|
+
return [text]
|
|
66
|
+
tokens = re.split(r"(?<=[,,、;;::])(\s+)", text)
|
|
67
|
+
parts: list[str] = []
|
|
68
|
+
buf = ""
|
|
69
|
+
for token in tokens:
|
|
70
|
+
if token == "":
|
|
71
|
+
continue
|
|
72
|
+
if buf and len(buf) + len(token) > max_chars:
|
|
73
|
+
parts.append(buf)
|
|
74
|
+
buf = ""
|
|
75
|
+
buf += token
|
|
76
|
+
if buf:
|
|
77
|
+
parts.append(buf)
|
|
78
|
+
if len(parts) <= 1:
|
|
79
|
+
return [text]
|
|
80
|
+
return parts
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _collect_list_block(lines: list[str], start: int) -> tuple[list[str], int]:
    """Gather a list item plus its more-indented continuation lines.

    Stops at a blank line, at a sibling/outer list item, or at any line that
    is not indented deeper than the item itself. Returns the collected lines
    and the index of the first line not consumed.
    """
    base_indent = len(lines[start]) - len(lines[start].lstrip())
    collected = [lines[start]]
    cursor = start + 1
    while cursor < len(lines):
        candidate = lines[cursor]
        if _is_blank(candidate):
            break
        depth = len(candidate) - len(candidate.lstrip())
        if _looks_like_list_item(candidate) and depth <= base_indent:
            break
        if depth > base_indent:
            collected.append(candidate)
            cursor += 1
            continue
        break
    return collected, cursor
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def split_to_segments(
    text: str, max_chunk_chars: int
) -> tuple[list[Segment], dict[int, Node]]:
    """Break markdown into alternating separator and node segments.

    Blank lines become "sep" segments verbatim; headings become single-node
    segments; list blocks and paragraph runs are length-limited via
    ``_split_long_text``. Returns the ordered segments plus the id -> Node
    mapping that ``reassemble_segments`` consumes.
    """
    lines = text.splitlines(keepends=True)
    segments: list[Segment] = []
    nodes: dict[int, Node] = {}
    next_id = 0
    pending: list[str] = []

    def emit_nodes(block_text: str) -> None:
        # Register each non-empty part as a Node and append one "nodes" segment.
        nonlocal next_id
        ids: list[int] = []
        for part in _split_long_text(block_text, max_chunk_chars):
            if not part:
                continue
            nodes[next_id] = Node(nid=next_id, origin_text=part)
            ids.append(next_id)
            next_id += 1
        if ids:
            segments.append(Segment(kind="nodes", content=ids))

    def flush_pending() -> None:
        # Convert any accumulated paragraph lines into node segments.
        if not pending:
            return
        block_text = "".join(pending)
        pending.clear()
        emit_nodes(block_text)

    idx = 0
    while idx < len(lines):
        line = lines[idx]
        if _is_blank(line):
            flush_pending()
            segments.append(Segment(kind="sep", content=line))
            idx += 1
            continue
        if _looks_like_heading(line):
            # Headings are never length-split: one line, one node.
            flush_pending()
            nodes[next_id] = Node(nid=next_id, origin_text=line)
            segments.append(Segment(kind="nodes", content=[next_id]))
            next_id += 1
            idx += 1
            continue
        if _looks_like_list_item(line):
            flush_pending()
            block, idx = _collect_list_block(lines, idx)
            emit_nodes("".join(block))
            continue
        pending.append(line)
        idx += 1

    flush_pending()
    return segments, nodes
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def reassemble_segments(segments: Iterable[Segment], nodes: dict[int, Node]) -> str:
    """Stitch translated node text and literal separators back together.

    "sep" segments are emitted verbatim; "nodes" segments contribute the
    ``translated_text`` of each referenced node. Node ids missing from
    *nodes*, and segments whose content is not a list, are skipped silently.
    """
    out: list[str] = []
    for seg in segments:
        if seg.kind == "sep":
            out.append(str(seg.content))
        elif isinstance(seg.content, list):
            out.extend(
                nodes[nid].translated_text for nid in seg.content if nid in nodes
            )
    return "".join(out)
|