deepresearch-flow 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/cli.py +2 -0
- deepresearch_flow/paper/config.py +15 -0
- deepresearch_flow/paper/db.py +9 -0
- deepresearch_flow/paper/llm.py +2 -0
- deepresearch_flow/paper/web/app.py +413 -20
- deepresearch_flow/recognize/cli.py +157 -3
- deepresearch_flow/recognize/organize.py +58 -0
- deepresearch_flow/translator/__init__.py +1 -0
- deepresearch_flow/translator/cli.py +451 -0
- deepresearch_flow/translator/config.py +19 -0
- deepresearch_flow/translator/engine.py +959 -0
- deepresearch_flow/translator/fixers.py +451 -0
- deepresearch_flow/translator/placeholder.py +62 -0
- deepresearch_flow/translator/prompts.py +116 -0
- deepresearch_flow/translator/protector.py +291 -0
- deepresearch_flow/translator/segment.py +180 -0
- deepresearch_flow-0.3.0.dist-info/METADATA +306 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/RECORD +22 -13
- deepresearch_flow-0.2.1.dist-info/METADATA +0 -424
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,451 @@
|
|
|
1
|
+
"""OCR markdown repair utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
import re
|
|
7
|
+
from typing import Iterable, Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ReferenceProcessor:
    """Rewrite OCR-style numeric citations into Markdown footnote syntax.

    Handles four shapes, applied in this order:
      * definition lines ``[3] Some reference`` -> ``[^3]: Some reference``
      * ranges ``[2-4]``                        -> ``[^2] [^3] [^4]``
      * lists ``[1, 2]`` (and plain ``[5]``)    -> ``[^1] [^2]`` / ``[^5]``
      * a final single-citation sweep kept as a safety net.
    """

    def __init__(self) -> None:
        self._patterns = {
            # A reference-list line: "[n] text" terminated by a newline and
            # followed by another "[m]" line or the end of input.
            "reference_def": re.compile(
                r"^\[(\d+)\]((?:(?!\[\d+\])[^\n])*)\n(?=^\[\d+\]|$)",
                re.MULTILINE,
            ),
            "reference_range": re.compile(r"\[(\d+)\-(\d+)\]"),
            "reference_multi": re.compile(r"\[(\d+(?:,\s*\d+)*)\]"),
            "reference_single": re.compile(r"\[(\d+)\]"),
        }

    def fix_references(self, text: str) -> str:
        """Return *text* with numeric citations converted to footnote syntax."""

        # Definition lines.  Substitute on the compiled pattern directly: the
        # previous approach re-built the matched text with exactly one space
        # after "[n]" and no trailing whitespace, so any other spacing made
        # str.replace() a silent no-op and the line was never converted.
        def _definition(match: re.Match) -> str:
            return f"[^{match.group(1)}]: {match.group(2).strip()}\n\n"

        text = self._patterns["reference_def"].sub(_definition, text)

        # Ranges expand to one footnote marker per number in the range.
        def _range(match: re.Match) -> str:
            lo, hi = int(match.group(1)), int(match.group(2))
            return " ".join(f"[^{i}]" for i in range(lo, hi + 1))

        text = self._patterns["reference_range"].sub(_range, text)

        # Comma-separated lists; this pattern also matches single citations.
        def _multi(match: re.Match) -> str:
            return " ".join(f"[^{num.strip()}]" for num in match.group(1).split(","))

        text = self._patterns["reference_multi"].sub(_multi, text)

        # Safety net: convert any single citation still left untouched.  The
        # multi pass above already covers these, so this rarely fires; the
        # guard avoids introducing a duplicate "[^n]" marker.
        for match in re.findall(self._patterns["reference_single"], text):
            original = f"[{match}]"
            replacement = f"[^{match}]"
            if original in text and f"[^{match}]" not in text.replace(replacement, ""):
                text = text.replace(original, replacement)

        return text
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class LinkProcessor:
    """Wrap bare URLs, e-mail addresses and phone numbers in autolinks.

    Each pattern refuses to re-wrap text that is already an autolink
    (``<...>``) or the target of a markdown link (``](...``).
    """

    def __init__(self) -> None:
        self._patterns = {
            "url": re.compile(
                r"(?<!<)(?<!]\()(?:(?<=^)|(?<=\s)|(?<=[\(\[{\"“]))"
                r"(https?://[^\s\)\]\}>]+)"
                r"(?=[\s\)\]\}>.,!?;:,。!?;:]|$)"
            ),
            "email": re.compile(
                r"(?<!<)(?<!]\()(?<![\w.%+-])"
                r"([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})"
                r"(?=[\s\)\]\}>.,!?;:,。!?;:]|$)"
            ),
            "phone": re.compile(
                r"(?<!<)(?<!]\()(?:(?<=^)|(?<=\s)|(?<=[\(\[{\"“]))"
                r"(\+?\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4})"
                r"(?=[\s\)\]\}>.,!?;:,。!?;:]|$)"
            ),
        }

    @staticmethod
    def _angle_url(match: re.Match) -> str:
        """Wrap a matched URL in <...>, peeling off trailing sentence punctuation."""
        url = match.group(1)
        # The URL charset is greedy, so a sentence-final punctuation mark can
        # stick to the end; move it back outside the brackets when it follows
        # an alphanumeric character.
        if len(url) > 1 and url[-1] in ".,!?;:,。!?;:" and url[-2].isalnum():
            return f"<{url[:-1]}>{url[-1]}"
        return f"<{url}>"

    def fix_links(self, text: str) -> str:
        """Return *text* with bare links converted to <...> autolink form."""
        text = self._patterns["url"].sub(self._angle_url, text)
        text = self._patterns["email"].sub(r"<mailto:\1>", text)
        text = self._patterns["phone"].sub(lambda m: "<tel:" + m.group(1) + ">", text)
        return text
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class PseudocodeProcessor:
    """Detect OCR'd algorithm listings and wrap them in fenced code blocks."""

    def __init__(self) -> None:
        # Header line such as "**Algorithm 2** Title" (or the Chinese "算法");
        # group 2 captures the algorithm number, group 3 the remaining title.
        self._header_pattern = re.compile(
            r"^\s*\*?\*?\s*(Algorithm|算法)\s+([A-Za-z0-9.-]+)?\*?\*?\s*(.*)$",
            re.IGNORECASE,
        )

    def wrap_pseudocode_blocks(self, text: str, lang: str = "pseudo") -> str:
        """Fence each detected algorithm listing in *text* as a ``lang`` code block.

        Lines inside existing code fences are passed through untouched.
        """
        lines = text.splitlines()
        out: list[str] = []
        i = 0
        in_fence = False  # toggled on every ``` line so fenced regions are skipped

        while i < len(lines):
            line = lines[i]
            if line.strip().startswith("```"):
                in_fence = not in_fence
                out.append(line)
                i += 1
                continue

            if not in_fence and self._header_pattern.match(line):
                # Collect the header plus every line that plausibly continues
                # the listing.
                header_line = line
                block = [header_line]
                i += 1
                while i < len(lines):
                    peek = lines[i]
                    # A new fence or a markdown heading terminates the listing.
                    if peek.strip().startswith("```") or re.match(r"^\s*#{1,6}\s", peek):
                        break
                    if not self._is_algo_continuation(peek):
                        break
                    block.append(peek)
                    i += 1

                out.append(f"```{lang}")
                title = self._format_title(header_line)
                if title:
                    out.append(f"// {title}")
                for raw in block[1:]:
                    s = raw.strip()
                    if s == "***":
                        # Horizontal rules inside the listing become comment rules.
                        out.append("// " + "-" * 40)
                        continue
                    out.append(self._clean_inline(raw))
                out.append("```")
                continue

            out.append(line)
            i += 1

        return "\n".join(out)

    def _format_title(self, header_line: str) -> str | None:
        """Build a normalised "Algorithm N: title" comment from the header line."""
        match = self._header_pattern.match(header_line)
        if not match:
            return None
        alg_no = (match.group(2) or "").strip()
        rest = (match.group(3) or "").strip()
        rest = self._clean_inline(rest)
        if alg_no and rest:
            return f"Algorithm {alg_no}: {rest}"
        if alg_no:
            return f"Algorithm {alg_no}"
        if rest:
            return f"Algorithm: {rest}"
        return "Algorithm"

    def _is_algo_continuation(self, line: str) -> bool:
        """Heuristic: does *line* look like part of a pseudocode listing?"""
        s = line.strip()
        # Blank lines and horizontal rules may appear inside a listing.
        if s == "" or s == "***":
            return True
        # Numbered steps: "1: ...", "2. ...", "3) ...".
        if re.match(r"^\s*\d+\s*[:.)]\ ", s):
            return True
        # Declaration lines common in algorithm environments.
        if re.match(r"^\s*(Input|Output|Require|Ensure):\s*", s, re.I):
            return True
        # Control-flow keywords.
        if re.match(
            r"^\s*(function|procedure|for|while|if|else|repeat|return|end)\b",
            s,
            re.I,
        ):
            return True
        return False

    def _clean_inline(self, text: str) -> str:
        """Strip markdown emphasis and <sub> markup from one pseudocode line."""
        # <sub>x</sub> -> _x, removing any emphasis stars inside the subscript.
        text = re.sub(r"<\s*sub\s*>\s*(.*?)\s*<\s*/\s*sub\s*>", lambda m: "_" + re.sub(r"\*", "", m.group(1)), text, flags=re.I)
        # Unwrap bold, then italics.
        text = re.sub(r"\*\*([^*]+)\*\*", r"\1", text)
        text = re.sub(r"\*([^\*]+)\*", r"\1", text)
        # Drop any leftover emphasis runs at either end of the line.
        text = re.sub(r"\*+$", "", text)
        text = re.sub(r"^\*+", "", text)
        return text.strip()
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
class TitleProcessor:
    """Normalise markdown heading levels for numbered/lettered section titles.

    Roman-numeral sections (optionally prefixed "Sec."/"Section.") become
    level ``depth + 1`` headings; decimal sections likewise, demoted one
    extra level when the document also uses Roman top-level sections;
    single-letter sections map to fixed levels 4 (upper) and 5 (lower).
    Only lines that are already markdown headings are rewritten.
    """

    def __init__(self) -> None:
        self._patterns = {
            "roman_with_sec": re.compile(
                r"^(#{1,6})?\s*(Sec(?:tion)?\.\s*)?([IVX]+(?:\.[IVX]+)*)(\.?)\s+(.+)$"
            ),
            "number": re.compile(r"^\s*(#{1,6})?\s*(\d+(?:\.\d+)*)(\.?)\s+(.+)$"),
            "letter_upper": re.compile(r"^(#{1,6})?\s*([A-Z])\.\s+(.+)$"),
            "letter_lower": re.compile(r"^(#{1,6})?\s*([a-z])\.\s+(.+)$"),
        }

    def fix_titles(self, text: str) -> str:
        """Return *text* with heading levels recomputed from section numbers."""
        lines = text.split("\n")
        new_lines: list[str] = []

        def is_title(line: str) -> bool:
            return re.match(r"^#{1,6}\s+", line) is not None

        # Whether the document uses Roman-numeral top-level sections; if so,
        # decimal subsections are pushed one level deeper below them.
        has_roman = bool(
            re.search(
                r"^#{1,6}?\s*(?:Sec(?:tion)?\.\s*)?[IVX]+(?:\.[IVX]+)*\.?\s+",
                text,
                re.MULTILINE,
            )
        )

        for line in lines:
            if not is_title(line):
                new_lines.append(line)
                continue
            modified = False

            match = self._patterns["roman_with_sec"].match(line)
            if match:
                section_prefix = match.group(2) or ""
                roman_num = match.group(3)
                dot = match.group(4)
                title = match.group(5)
                level = len(roman_num.split(".")) + 1
                # Clamp to markdown's maximum depth, mirroring the decimal
                # branch below (previously unclamped here).
                new_hashes = "#" * min(level, 6)
                new_line = f"{new_hashes} {section_prefix}{roman_num}{dot or '.'} {title}"
                new_lines.append(new_line)
                modified = True

            if not modified:
                match = self._patterns["number"].match(line)
                if match:
                    number = match.group(2)
                    dot = match.group(3)
                    title = match.group(4)
                    level = len(number.split(".")) + 1
                    if has_roman:
                        level += 1
                    new_hashes = "#" * min(level, 6)
                    trail_dot = dot if has_roman else (dot or ".")
                    new_line = f"{new_hashes} {number}{trail_dot} {title}"
                    new_lines.append(new_line)
                    modified = True

            if not modified:
                for pattern_name in ["letter_upper", "letter_lower"]:
                    match = self._patterns[pattern_name].match(line)
                    # Skip titles that start like a capitalised word, to avoid
                    # rewriting lines such as author initials.
                    if match and not re.match(r"^[A-Z][a-z]", match.group(3)):
                        letter = match.group(2)
                        title = match.group(3)
                        level = 4 if pattern_name == "letter_upper" else 5
                        new_hashes = "#" * level
                        new_line = f"{new_hashes} {letter}. {title}"
                        new_lines.append(new_line)
                        modified = True
                        break

            if not modified:
                new_lines.append(line)

        return "\n".join(new_lines)
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
@dataclass
class Block:
    """One structural unit of a markdown document, as produced by _parse_blocks."""

    # One of: "sep" (blank line), "page" ("---" page break), "code" (fenced
    # block), "image", "table", "math" ("$$" display block) or "text".
    kind: str
    # Raw source text of the unit, end-of-line characters included, so blocks
    # can be re-joined losslessly.
    content: str
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def _is_blank(line: str) -> bool:
|
|
269
|
+
return len(line.strip()) == 0
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _line_starts_with_fence(line: str) -> Optional[str]:
|
|
273
|
+
match = re.match(r"^\s*(`{3,}|~{3,})", line)
|
|
274
|
+
return match.group(1) if match else None
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _looks_like_table_header(line: str) -> bool:
|
|
278
|
+
return "|" in line
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _looks_like_table_delim(line: str) -> bool:
|
|
282
|
+
return (
|
|
283
|
+
re.match(r"^\s*\|?\s*:?-{3,}:?\s*(\|\s*:?-{3,}:?\s*)+\|?\s*$", line)
|
|
284
|
+
is not None
|
|
285
|
+
)
|
|
286
|
+
|
|
287
|
+
|
|
288
|
+
def _is_image_line(line: str) -> bool:
|
|
289
|
+
return re.match(r"^\s*!\[.*?\]\(.*?\)\s*$", line) is not None
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def _parse_blocks(text: str) -> list[Block]:
    """Split markdown *text* into a flat list of typed Blocks.

    Recognised kinds, in probe order: blank separators, ``---`` page breaks,
    fenced code blocks, standalone images, pipe tables, ``$$`` display math,
    and runs of plain text.  Content keeps its end-of-line characters
    (keepends splitting) so the blocks can be re-joined losslessly.
    """
    lines = text.splitlines(keepends=True)
    blocks: list[Block] = []
    i = 0
    n = len(lines)

    while i < n:
        line = lines[i]

        if _is_blank(line):
            blocks.append(Block(kind="sep", content=line))
            i += 1
            continue

        # A bare horizontal rule is treated as a page-break marker.
        if line.strip() == "---":
            blocks.append(Block(kind="page", content=line))
            i += 1
            continue

        fence = _line_starts_with_fence(line)
        if fence:
            # Scan for the matching closing fence; an unterminated fence
            # swallows the rest of the document as one code block.
            j = i + 1
            while j < n and not re.match(rf"^\s*{re.escape(fence)}", lines[j]):
                j += 1
            if j < n:
                block = "".join(lines[i : j + 1])
                blocks.append(Block(kind="code", content=block))
                i = j + 1
                continue
            block = "".join(lines[i:])
            blocks.append(Block(kind="code", content=block))
            break

        if _is_image_line(line):
            blocks.append(Block(kind="image", content=line))
            i += 1
            continue

        # A table needs a header row immediately followed by a delimiter row;
        # subsequent non-blank pipe-bearing lines join the same block.
        if i + 1 < n and _looks_like_table_header(line) and _looks_like_table_delim(lines[i + 1]):
            j = i + 2
            while j < n and ("|" in lines[j]) and not _is_blank(lines[j]):
                j += 1
            block = "".join(lines[i:j])
            blocks.append(Block(kind="table", content=block))
            i = j
            continue

        # Display math delimited by "$$" lines; unterminated math swallows
        # the rest of the document, like an unterminated fence above.
        if line.strip() == "$$":
            j = i + 1
            while j < n and lines[j].strip() != "$$":
                j += 1
            if j < n:
                block = "".join(lines[i : j + 1])
                blocks.append(Block(kind="math", content=block))
                i = j + 1
                continue
            block = "".join(lines[i:])
            blocks.append(Block(kind="math", content=block))
            break

        # Anything else: accumulate consecutive prose lines until the next
        # structural element begins.
        text_lines = [line]
        j = i + 1
        while j < n:
            peek = lines[j]
            if _is_blank(peek) or _is_image_line(peek) or peek.strip() == "---":
                break
            if _line_starts_with_fence(peek):
                break
            if j + 1 < n and _looks_like_table_header(peek) and _looks_like_table_delim(lines[j + 1]):
                break
            if peek.strip() == "$$":
                break
            text_lines.append(peek)
            j += 1
        blocks.append(Block(kind="text", content="".join(text_lines)))
        i = j

    return blocks
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def _word_set(text: str) -> set[str]:
|
|
373
|
+
return {w for w in re.split(r"\W+", text.lower()) if w}
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
def _split_confidence(before_text: str, after_text: str) -> float:
|
|
377
|
+
confidence = 0.0
|
|
378
|
+
if not re.search(r"[.!?。!?]\s*$", before_text):
|
|
379
|
+
confidence += 0.3
|
|
380
|
+
if after_text and after_text[0].islower():
|
|
381
|
+
confidence += 0.4
|
|
382
|
+
common_words = len(_word_set(before_text) & _word_set(after_text))
|
|
383
|
+
if common_words > 1:
|
|
384
|
+
confidence += min(0.3, common_words * 0.1)
|
|
385
|
+
return min(confidence, 1.0)
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def _merge_blocks(blocks: list[Block]) -> list[Block]:
    """Merge text blocks that were split apart by an interleaved element.

    Scans for text / <page|image|table|code> / text triples.  Across a page
    break a hyphenated line break is rejoined directly; otherwise a heuristic
    confidence score decides whether the two text halves belong to the same
    paragraph.  A page-break middle is consumed by the merge; other middles
    are kept and re-ordered after the merged paragraph.
    """
    idx = 0
    while idx + 2 < len(blocks):
        before = blocks[idx]
        middle = blocks[idx + 1]
        after = blocks[idx + 2]
        if before.kind != "text" or after.kind != "text":
            idx += 1
            continue
        if middle.kind not in {"page", "image", "table", "code"}:
            idx += 1
            continue

        before_text = before.content.strip()
        after_text = after.content.strip()
        if before_text == "" or after_text == "":
            idx += 1
            continue

        # Hyphenation across a page break: join without a space and drop the
        # trailing hyphen.  The content keeps its trailing newline, so strip
        # whitespace before the hyphens — a bare rstrip("-") never removed
        # the hyphen because the newline came after it.
        if middle.kind == "page" and before_text.endswith("-") and after_text[0].islower():
            merged_text = before.content.rstrip().rstrip("-") + after.content.lstrip()
            # idx is deliberately not advanced: the merged block may itself
            # merge with what follows.
            blocks = blocks[:idx] + [Block(kind="text", content=merged_text)] + blocks[idx + 3 :]
            continue

        confidence = _split_confidence(before_text, after_text)
        if confidence < 0.7:
            idx += 1
            continue

        merged_text = before.content.rstrip() + " " + after.content.lstrip()
        if middle.kind == "page":
            # The page-break marker is consumed by the merge.
            blocks = blocks[:idx] + [Block(kind="text", content=merged_text)] + blocks[idx + 3 :]
            continue

        # Keep the interleaved element, moved after the merged paragraph.
        blocks = blocks[:idx] + [Block(kind="text", content=merged_text), middle] + blocks[idx + 3 :]
        idx += 1

    return blocks
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def merge_paragraphs(text: str) -> str:
    """Re-join paragraphs that OCR split across page breaks or floats."""
    return "".join(piece.content for piece in _merge_blocks(_parse_blocks(text)))
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def fix_markdown(text: str, level: str) -> str:
    """Run the OCR markdown repair pipeline over *text*.

    ``level`` selects how invasive the repair is: ``"off"`` returns *text*
    untouched; any other value runs paragraph merging, reference and link
    fixing, and pseudocode fencing; ``"aggressive"`` additionally rewrites
    heading levels.
    """
    if level == "off":
        return text

    text = merge_paragraphs(text)
    text = ReferenceProcessor().fix_references(text)
    text = LinkProcessor().fix_links(text)
    text = PseudocodeProcessor().wrap_pseudocode_blocks(text)

    if level == "aggressive":
        text = TitleProcessor().fix_titles(text)

    return text
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Placeholder store for protected markdown segments."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Dict, List
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PlaceHolderStore:
    """Bidirectional registry mapping protected text spans to placeholder tokens.

    Tokens have the shape ``__PH_<KIND>_<NNNNNN>__``.  The same text is only
    registered once: repeated ``add`` calls return the token issued the first
    time, regardless of the kind passed later.
    """

    def __init__(self) -> None:
        self._map: dict[str, str] = {}           # original text -> placeholder
        self._rev: dict[str, str] = {}           # placeholder -> original text
        self._kind_count: dict[str, int] = {}    # kind -> number of tokens issued
        self.length = 0                          # total tokens issued so far

    def add(self, kind: str, text: str) -> str:
        """Register *text* under *kind* and return its placeholder token."""
        existing = self._map.get(text)
        if existing is not None:
            return existing

        self.length += 1
        token = "__PH_{}_{}__".format(kind, str(self.length).zfill(6))
        self._map[text] = token
        self._rev[token] = text
        self._kind_count[kind] = self._kind_count.get(kind, 0) + 1
        return token

    def save(self, file_path: str) -> None:
        """Serialise the store to *file_path* as pretty-printed JSON."""
        state = {
            "map": self._map,
            "rev": self._rev,
            "kind_count": self._kind_count,
        }
        with open(file_path, "w", encoding="utf-8") as fp:
            json.dump(state, fp, indent=2, ensure_ascii=False)

    def load(self, file_path: str) -> None:
        """Replace the store contents with the JSON written by :meth:`save`."""
        with open(file_path, "r", encoding="utf-8") as fp:
            state = json.load(fp)
        self._map = state.get("map", {})
        self._rev = state.get("rev", {})
        self._kind_count = state.get("kind_count", {})
        self.length = len(self._map)

    def restore_all(self, text: str) -> str:
        """Substitute every placeholder in *text* back to its original span.

        Longer tokens are restored first so a token is never clobbered by a
        shorter one that happens to be its prefix.
        """
        ordered = sorted(self._rev.items(), key=lambda kv: len(kv[0]), reverse=True)
        for token, original in ordered:
            if original.endswith("\n"):
                # The original span carries its own newline; consume the one
                # that followed the token to avoid doubling it.
                text = text.replace(token + "\n", original)
            text = text.replace(token, original)
        return text

    def contains_all(self, text: str) -> bool:
        """True when every issued placeholder occurs in *text*."""
        return not self.diff_missing(text)

    def diff_missing(self, text: str) -> List[str]:
        """Placeholders absent from *text*, in issue order."""
        return [token for token in self._map.values() if token not in text]

    def snapshot(self) -> Dict[str, str]:
        """Shallow copy of the text -> placeholder mapping."""
        return dict(self._map)

    def kind_counts(self) -> Dict[str, int]:
        """Shallow copy of the per-kind issue counters."""
        return dict(self._kind_count)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Prompt templates for markdown translation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from string import Template
|
|
6
|
+
from textwrap import dedent
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _cdata_wrap(text: str) -> str:
|
|
10
|
+
return text.replace("]]>", "]]]]><![CDATA[>")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Invariant system message shared by every translation request (used as the
# "system" role in build_translation_messages); the per-request task payload
# goes in the user-message XML template below.
SYSTEM_RULES = dedent(
    """\
    You are a professional translation engine. Follow these invariant rules:
    - Preserve all original formatting exactly (Markdown, whitespace, paragraph breaks).
    - Do NOT translate LaTeX ($...$, $$...$$, \\( ... \\), \\[ ... \\]) or LaTeX commands/environments.
    - Keep all HTML tags intact.
    - Do NOT alter abbreviations, technical terms, or code identifiers.
    - Handle NODE styles: @@NODE_START_{n}@@/@@NODE_END_{n}@@ and <NODE_START_{n}></NODE_END_{n}>.
    - Respect PRESERVE spans: @@PRESERVE_{n}@@ ... @@/PRESERVE_{n}@@ (leave markers and enclosed text unchanged).
    - Placeholders like __PH_[A-Z0-9_]+__ must remain unchanged.
    - Output ONLY the NODE blocks in original order; no extra commentary.
    - If markers malformed: reproduce original block verbatim and append <!-- VIOLATION: reason --> once.
    """
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Per-request user message: an XML task description with the text to
# translate embedded in a CDATA section.  string.Template placeholders
# ($SOURCE_LANG, $TARGET_LANG, $TEXT_TO_TRANSLATE) are filled by
# build_translation_messages(); literal "$" characters in the body are
# escaped as "$$" per Template substitution rules.
TRANSLATE_XML_TEMPLATE = Template(
    dedent(
        """\
        <TranslationTask version="1.0">
        <meta>
        <source_lang>$SOURCE_LANG</source_lang>
        <target_lang>$TARGET_LANG</target_lang>
        <visibility_note>Sections with visibility="internal" are instructions and MUST NOT appear in the final output.</visibility_note>
        </meta>

        <constraints visibility="internal">
        <rule id="fmt-1">Preserve ALL original formatting exactly: Markdown, whitespace, line breaks, paragraph spacing.</rule>
        <rule id="fmt-2">Do NOT translate any content inside LaTeX ($$...$$, $$$$...$$$$, \\( ... \\), \\[ ... \\]) or LaTeX commands/environments.</rule>
        <rule id="fmt-3">Keep ALL HTML tags intact.</rule>
        <rule id="fmt-4">Do NOT alter abbreviations, technical terms, or code identifiers; translate surrounding prose only.</rule>
        <rule id="fmt-5">Document structure must be preserved, including blank lines (double newlines) between blocks.</rule>
        </constraints>

        <markers visibility="internal">
        <preserve>
        <open>@@PRESERVE_{n}@@</open>
        <close>@@/PRESERVE_{n}@@</close>
        <instruction>Leave both markers and enclosed text EXACTLY unchanged.</instruction>
        </preserve>
        <node accepted_styles="double">
        <style type="at">
        <open>@@NODE_START_{n}@@</open>
        <close>@@NODE_END_{n}@@</close>
        </style>
        <style type="angle">
        <open><NODE_START_{n}></open>
        <close></NODE_END_{n}></close>
        </style>
        <scope>Translate ONLY the text inside each NODE block.</scope>
        <layout>
        <rule>Preserve the exact presence/absence of newlines around the content.</rule>
        <rule>Preserve all spaces and blank lines BETWEEN NODE blocks exactly.</rule>
        </layout>
        </node>
        <placeholders>
        <pattern>__PH_[A-Z0-9_]+__</pattern>
        <instruction>All placeholders matching this regex MUST be left unchanged.</instruction>
        </placeholders>
        </markers>

        <output_spec visibility="internal">
        <rule id="out-1">Output ONLY the NODE blocks in the original order. Non-NODE text must NOT be echoed.</rule>
        <rule id="out-2">For each NODE: emit the exact START marker, then the translated content, then the exact END marker.</rule>
        <rule id="out-3">Do NOT reveal or restate any instructions with visibility="internal".</rule>
        </output_spec>

        <quality_checks visibility="internal">
        <check>Count of START and END NODE markers is identical to input; indices {n} match 1:1.</check>
        <check>No PRESERVE spans were altered; byte-for-byte identical.</check>
        <check>No LaTeX/HTML/code tokens changed; only prose translated.</check>
        <check>Paragraph breaks and intra-block whitespace unchanged.</check>
        </quality_checks>

        <fallback visibility="internal">
        <strategy>If a block violates constraints or markers are malformed, do NOT guess. Reproduce the original block unchanged and append a single-line comment <!-- VIOLATION: reason --> after the block.</strategy>
        </fallback>

        <io>
        <input>
        <![CDATA[
        $TEXT_TO_TRANSLATE
        ]]>
        </input>
        <expected_output visibility="internal">
        <note>Emit only transformed NODE blocks per output_spec. Nothing else.</note>
        </expected_output>
        </io>
        </TranslationTask>
        """
    )
)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def build_translation_messages(source_lang: str | None, target_lang: str, text: str) -> list[dict[str, str]]:
    """Assemble the chat messages for one translation request.

    Returns the standard two-message list: the invariant system rules plus a
    user message containing the task XML with *text* embedded in CDATA.  A
    missing/empty *source_lang* is rendered as "auto".
    """
    task_xml = TRANSLATE_XML_TEMPLATE.substitute(
        SOURCE_LANG=source_lang if source_lang else "auto",
        TARGET_LANG=target_lang,
        TEXT_TO_TRANSLATE=_cdata_wrap(text),
    )
    messages = [
        {"role": "system", "content": SYSTEM_RULES},
        {"role": "user", "content": task_xml},
    ]
    return messages
|