deepresearch-flow 0.2.1__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/cli.py +2 -0
- deepresearch_flow/paper/config.py +15 -0
- deepresearch_flow/paper/db.py +193 -0
- deepresearch_flow/paper/db_ops.py +1939 -0
- deepresearch_flow/paper/llm.py +2 -0
- deepresearch_flow/paper/web/app.py +46 -3320
- deepresearch_flow/paper/web/constants.py +23 -0
- deepresearch_flow/paper/web/filters.py +255 -0
- deepresearch_flow/paper/web/handlers/__init__.py +14 -0
- deepresearch_flow/paper/web/handlers/api.py +217 -0
- deepresearch_flow/paper/web/handlers/pages.py +334 -0
- deepresearch_flow/paper/web/markdown.py +549 -0
- deepresearch_flow/paper/web/static/css/main.css +857 -0
- deepresearch_flow/paper/web/static/js/detail.js +406 -0
- deepresearch_flow/paper/web/static/js/index.js +266 -0
- deepresearch_flow/paper/web/static/js/outline.js +58 -0
- deepresearch_flow/paper/web/static/js/stats.js +39 -0
- deepresearch_flow/paper/web/templates/base.html +43 -0
- deepresearch_flow/paper/web/templates/detail.html +332 -0
- deepresearch_flow/paper/web/templates/index.html +114 -0
- deepresearch_flow/paper/web/templates/stats.html +29 -0
- deepresearch_flow/paper/web/templates.py +85 -0
- deepresearch_flow/paper/web/text.py +68 -0
- deepresearch_flow/recognize/cli.py +157 -3
- deepresearch_flow/recognize/organize.py +58 -0
- deepresearch_flow/translator/__init__.py +1 -0
- deepresearch_flow/translator/cli.py +451 -0
- deepresearch_flow/translator/config.py +19 -0
- deepresearch_flow/translator/engine.py +959 -0
- deepresearch_flow/translator/fixers.py +451 -0
- deepresearch_flow/translator/placeholder.py +62 -0
- deepresearch_flow/translator/prompts.py +116 -0
- deepresearch_flow/translator/protector.py +291 -0
- deepresearch_flow/translator/segment.py +180 -0
- deepresearch_flow-0.4.0.dist-info/METADATA +327 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/RECORD +40 -13
- deepresearch_flow-0.2.1.dist-info/METADATA +0 -424
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/top_level.txt +0 -0
deepresearch_flow/translator/engine.py

@@ -0,0 +1,959 @@

````python
"""Translation engine for OCR markdown."""

from __future__ import annotations

import asyncio
from dataclasses import dataclass
import difflib
import logging
import re
import shutil
import subprocess
from typing import Optional, Protocol

import httpx

from deepresearch_flow.paper.config import ProviderConfig, resolve_api_keys
from deepresearch_flow.paper.llm import call_provider, backoff_delay
from deepresearch_flow.paper.providers.base import ProviderError
from deepresearch_flow.translator.config import TranslateConfig
from deepresearch_flow.translator.fixers import fix_markdown
from deepresearch_flow.translator.placeholder import PlaceHolderStore
from deepresearch_flow.translator.prompts import build_translation_messages
from deepresearch_flow.translator.protector import MarkdownProtector
from deepresearch_flow.translator.segment import Node, reassemble_segments, split_to_segments


logger = logging.getLogger(__name__)


class TranslationProgress(Protocol):
    async def add_groups(self, count: int) -> None:
        ...

    async def advance_groups(self, count: int) -> None:
        ...

    async def set_group_status(self, text: str) -> None:
        ...
````
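The protocol is structural, so any object exposing these three async methods can act as a progress sink. A minimal console implementation might look like this (illustrative only, not part of the package):

````python
# Hypothetical progress sink; satisfies TranslationProgress structurally.
class ConsoleProgress:
    def __init__(self) -> None:
        self.total = 0
        self.done = 0

    async def add_groups(self, count: int) -> None:
        self.total += count

    async def advance_groups(self, count: int) -> None:
        self.done += count
        print(f"groups {self.done}/{self.total}")

    async def set_group_status(self, text: str) -> None:
        print(text)
````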
````python
@dataclass
class TranslationResult:
    translated_text: str
    protected_text: str
    placeholder_store: PlaceHolderStore
    nodes: dict[int, Node]
    stats: "TranslationStats"


@dataclass
class TranslationStats:
    total_nodes: int
    success_nodes: int
    failed_nodes: int
    skipped_nodes: int
    initial_groups: int
    retry_groups: int
    retry_rounds: int


class KeyRotator:
    def __init__(self, keys: list[str]) -> None:
        self._keys = keys
        self._idx = 0
        self._lock = asyncio.Lock()

    async def next_key(self) -> str | None:
        if not self._keys:
            return None
        async with self._lock:
            key = self._keys[self._idx % len(self._keys)]
            self._idx += 1
            return key
````
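KeyRotator hands out API keys round-robin under an asyncio.Lock, so concurrent groups spread load across keys. A quick check of the rotation order (dummy keys; the import path is the module shown in this diff):

````python
import asyncio

from deepresearch_flow.translator.engine import KeyRotator

async def demo() -> None:
    rotator = KeyRotator(["key-a", "key-b"])  # dummy keys
    print([await rotator.next_key() for _ in range(4)])
    # ['key-a', 'key-b', 'key-a', 'key-b']

asyncio.run(demo())
````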
````python
class RequestThrottle:
    def __init__(self, sleep_every: int, sleep_time: float) -> None:
        if sleep_every <= 0 or sleep_time <= 0:
            raise ValueError("sleep_every and sleep_time must be positive")
        self.sleep_every = sleep_every
        self.sleep_time = sleep_time
        self._count = 0
        self._lock = asyncio.Lock()

    async def tick(self) -> None:
        async with self._lock:
            self._count += 1
            if self._count % self.sleep_every == 0:
                await asyncio.sleep(self.sleep_time)
````
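RequestThrottle pauses every Nth request rather than rate-limiting each one: the shared counter is incremented under a lock, and every sleep_every-th tick() sleeps for sleep_time seconds. In miniature:

````python
import asyncio

from deepresearch_flow.translator.engine import RequestThrottle

async def demo() -> None:
    throttle = RequestThrottle(sleep_every=2, sleep_time=0.1)
    for _ in range(4):
        await throttle.tick()  # the 2nd and 4th tick sleep for 0.1 s

asyncio.run(demo())
````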
````python
class MarkdownTranslator:
    def __init__(self, cfg: TranslateConfig) -> None:
        self.cfg = cfg
        self.protector = MarkdownProtector()
        self._rumdl_path = shutil.which("rumdl")
        self._rumdl_warned = False

        self._rx_preserve = re.compile(
            r"@@PRESERVE_(\d+)@@[\s\S]*?@@/PRESERVE_\1@@", re.DOTALL
        )
        self._rx_placeholder = re.compile(r"__PH_[A-Z0-9_]+__")
        self._rx_placeholder_fuzzy = re.compile(
            r"__PH[^A-Za-z0-9]*([A-Za-z0-9]+)[^0-9]*([0-9]{6})__"
        )
        self._rx_latex_dbl = re.compile(r"\$\$[\s\S]*?\$\$", re.DOTALL)
        self._rx_latex_sgl = re.compile(r"\$[^$]*?\$")
        self._rx_latex_pi = re.compile(r"\\\((?:.|\n)*?\\\)", re.DOTALL)
        self._rx_latex_br = re.compile(r"\\\[(?:.|\n)*?\\\]", re.DOTALL)
        self._rx_html_tag = re.compile(r"</?[^>]+>")
        self._rx_code_fence = re.compile(r"```[\s\S]*?```", re.DOTALL)
        self._rx_code_inline = re.compile(r"`[^`]*`")
        self._rx_url = re.compile(r"https?://\S+|www\.\S+")
        self._rx_letters = re.compile(
            r"[A-Za-z\u00C0-\u024F\u4E00-\u9FFF\u3040-\u30FF\uAC00-\uD7AF]"
        )

        self._rx_node_unpack = re.compile(
            r"<NODE_START_(\d{4})>(.*?)</NODE_END_\1>", re.DOTALL
        )

    def _strip_untranslatables(self, s: str) -> str:
        s = self._rx_preserve.sub("", s)
        s = self._rx_placeholder.sub("", s)
        s = self._rx_latex_dbl.sub("", s)
        s = self._rx_latex_sgl.sub("", s)
        s = self._rx_latex_pi.sub("", s)
        s = self._rx_latex_br.sub("", s)
        s = self._rx_code_fence.sub("", s)
        s = self._rx_code_inline.sub("", s)
        s = self._rx_html_tag.sub("", s)
        s = self._rx_url.sub("", s)
        s = re.sub(r"[\s\W_]+", "", s, flags=re.UNICODE)
        return s

    def _is_placeholder_only(self, s: str) -> bool:
        core = self._strip_untranslatables(s)
        return not bool(self._rx_letters.search(core))
````
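_strip_untranslatables removes everything the model is not expected to translate (preserve blocks, placeholders, LaTeX, code, HTML tags, URLs, punctuation); if no letters survive, the node is placeholder-only and is later copied through untouched. A sketch, given a MarkdownTranslator instance `t` (construction takes a TranslateConfig, whose fields are defined in deepresearch_flow/translator/config.py and not shown here); the placeholder token is hypothetical:

````python
t._is_placeholder_only("$$ E = mc^2 $$")          # True: pure math, nothing to translate
t._is_placeholder_only("__PH_IMG_000001__")        # True: placeholder only
t._is_placeholder_only("Energy mass equivalence")  # False: real prose
````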
````python
    def _placeholders_multiset(self, s: str) -> list[str]:
        return sorted(self._rx_placeholder.findall(s))

    def _normalize_for_compare(self, s: str) -> str:
        s = self._rx_placeholder.sub("", s)
        s = self._rx_latex_dbl.sub("", s)
        s = self._rx_latex_sgl.sub("", s)
        s = self._rx_latex_pi.sub("", s)
        s = self._rx_latex_br.sub("", s)
        s = self._rx_code_fence.sub("", s)
        s = self._rx_code_inline.sub("", s)
        s = self._rx_html_tag.sub("", s)
        s = self._rx_url.sub("", s)
        s = re.sub(r"\s+", " ", s).strip()
        return s

    def _contains_target_script(self, s: str, target_lang: str) -> bool:
        tl = (target_lang or "").lower()
        if tl.startswith("zh"):
            return bool(re.search(r"[\u4E00-\u9FFF]", s))
        if tl.startswith(("ja", "jp")):
            return bool(re.search(r"[\u3040-\u30FF\u4E00-\u9FFF]", s))
        if tl.startswith("en"):
            return bool(re.search(r"[A-Za-z]", s))
        return True

    def _looks_like_identifier(self, s: str) -> bool:
        text = s.strip()
        if not text:
            return False
        if re.search(r"\b(?:isbn|issn|doi|arxiv)\b", text, flags=re.IGNORECASE):
            return True
        if re.search(r"\b(?:acm|ieee)\b", text, flags=re.IGNORECASE):
            return True
        if re.search(
            r"\b\S+\.(?:pdf|png|jpg|jpeg|gif|svg|tex|bib|csv|json|md)\b",
            text,
            flags=re.IGNORECASE,
        ):
            return True
        letters = re.findall(r"[A-Za-z]", text)
        if not letters:
            return True
        if text.upper() == text and len(re.findall(r"[A-Z]+", text)) <= 6:
            return True
        if len(re.findall(r"[A-Za-z]+", text)) <= 2 and len(text) <= 24:
            return True
        return False

    def _looks_like_person_name(self, s: str) -> bool:
        text = s.strip()
        if not text or len(text) > 80:
            return False
        particles = {
            "van",
            "von",
            "de",
            "del",
            "der",
            "da",
            "di",
            "la",
            "le",
            "du",
            "al",
            "bin",
            "ibn",
            "dos",
            "das",
            "mac",
            "mc",
        }
        suffixes = {"jr", "sr", "ii", "iii", "iv"}
        cleaned_parts: list[str] = []
        for raw in re.split(r"\s+", text):
            part = raw.strip().strip(",.;:*†‡")
            part = part.strip("()[]{}")
            part = re.sub(r"\d+$", "", part)
            part = part.strip(",.;:*†‡")
            if part:
                cleaned_parts.append(part)
        if len(cleaned_parts) < 2 or len(cleaned_parts) > 6:
            return False
        valid = 0
        for part in cleaned_parts:
            lower = part.lower()
            if lower in particles or lower in suffixes:
                continue
            if re.match(r"^[A-Z]\.?$", part):
                valid += 1
                continue
            if re.match(r"^[A-Z][A-Za-z]+(?:[-'][A-Za-z]+)*\.?$", part):
                valid += 1
                continue
            return False
        return valid >= 2

    def _is_translation_success(self, orig: str, trans: str) -> bool:
        if self._placeholders_multiset(orig) != self._placeholders_multiset(trans):
            return False
        if self._is_placeholder_only(orig):
            return bool(trans and trans.strip())
        if not trans or not trans.strip():
            return False
        core = self._strip_untranslatables(orig)
        if not bool(self._rx_letters.search(core)):
            return True
        ratio = difflib.SequenceMatcher(
            None, self._normalize_for_compare(orig), self._normalize_for_compare(trans)
        ).ratio()
        if ratio < 0.92:
            return True
        if self._contains_target_script(trans, self.cfg.target_lang):
            return True
        return self._looks_like_identifier(orig) or self._looks_like_person_name(orig)

    def _translation_failure_reason(self, orig: str, trans: str) -> str | None:
        if self._placeholders_multiset(orig) != self._placeholders_multiset(trans):
            return "placeholders_mismatch"
        if self._is_placeholder_only(orig):
            if not trans or not trans.strip():
                return "placeholder_only_empty"
            return None
        if not trans or not trans.strip():
            return "empty_output"
        core = self._strip_untranslatables(orig)
        if not bool(self._rx_letters.search(core)):
            return None
        ratio = difflib.SequenceMatcher(
            None, self._normalize_for_compare(orig), self._normalize_for_compare(trans)
        ).ratio()
        if ratio >= 0.92 and not self._contains_target_script(trans, self.cfg.target_lang):
            if self._looks_like_identifier(orig):
                return None
            if self._looks_like_person_name(orig):
                return None
            return f"missing_target_script ratio={ratio:.2f}"
        return None
````
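Acceptance hinges on two signals: the placeholder multiset must survive translation exactly, and the output must either differ enough from the input (SequenceMatcher ratio below 0.92 after stripping markup) or contain target-script characters. Identifiers and person names, which legitimately pass through unchanged, are exempted. The ratio test in isolation:

````python
import difflib

# English-to-Chinese output shares no characters with its source,
# so the similarity ratio lands far below the 0.92 threshold.
ratio = difflib.SequenceMatcher(None, "deep learning", "深度学习").ratio()
print(f"{ratio:.2f}")  # 0.00
````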
````python
    def _fix_placeholder_typos(self, text: str, valid_placeholders: set[str]) -> str:
        def replace(match: re.Match[str]) -> str:
            kind = match.group(1).upper()
            num = match.group(2)
            candidate = f"__PH_{kind}_{num}__"
            if candidate in valid_placeholders:
                return candidate
            return match.group(0)

        return self._rx_placeholder_fuzzy.sub(replace, text)
````
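Models frequently mangle placeholder tokens (wrong case, dropped underscores, inserted spaces). The fuzzy pattern recovers the kind and the six-digit counter and rewrites the token back to canonical form, but only when the result is a placeholder the store actually issued. A sketch (`t` as above; the placeholder value is hypothetical):

````python
valid = {"__PH_IMG_000042__"}
mangled = "as shown in __PH-img 000042__ above"
print(t._fix_placeholder_typos(mangled, valid))
# as shown in __PH_IMG_000042__ above
````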
````python
    def _align_placeholders(self, orig: str, trans: str) -> str:
        orig_phs = self._rx_placeholder.findall(orig)
        trans_phs = self._rx_placeholder.findall(trans)
        if not orig_phs and not trans_phs:
            return trans
        if not orig_phs:
            return self._rx_placeholder.sub("", trans)
        if not trans_phs:
            joiner = " " if trans and not trans.endswith((" ", "\n")) else ""
            return f"{trans}{joiner}{' '.join(orig_phs)}"
        parts = self._rx_placeholder.split(trans)
        out = parts[0]
        used = 0
        for idx in range(len(trans_phs)):
            if used < len(orig_phs):
                out += orig_phs[used]
                used += 1
            out += parts[idx + 1]
        if used < len(orig_phs):
            joiner = " " if out and not out.endswith((" ", "\n")) else ""
            out += f"{joiner}{' '.join(orig_phs[used:])}"
        return out
````
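_align_placeholders reconciles placeholder count and order: the translation's placeholder slots are rewritten with the original tokens in source order, surplus slots are dropped, and any tokens the model lost are appended at the end so unprotect() can still restore every span. For example (`t` as above, hypothetical tokens):

````python
orig = "__PH_REF_000001__ and __PH_REF_000002__ apply."
trans = "两者均适用。"  # the model dropped both placeholders
print(t._align_placeholders(orig, trans))
# 两者均适用。 __PH_REF_000001__ __PH_REF_000002__
````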
````python
    def _summarize_text(self, text: str, limit: int = 160) -> str:
        compact = re.sub(r"\s+", " ", text).strip()
        if len(compact) > limit:
            return f"{compact[:limit]}…"
        return compact

    def _log_failed_sample(self, failed_nodes: dict[int, Node], label: str) -> None:
        if not logger.isEnabledFor(logging.DEBUG) or not failed_nodes:
            return
        sample_ids = list(sorted(failed_nodes.keys()))[:5]
        for nid in sample_ids:
            node = failed_nodes[nid]
            reason = self._translation_failure_reason(node.origin_text, node.translated_text)
            logger.debug(
                "Failed node %d (%s) reason=%s origin=%s translated=%s",
                nid,
                label,
                reason or "unknown",
                self._summarize_text(node.origin_text),
                self._summarize_text(node.translated_text),
            )

    def _normalize_markdown_blocks(self, text: str) -> str:
        text = self._normalize_markdown_images(text)
        return self._normalize_markdown_block_math(text)

    async def _format_markdown(self, text: str, stage: str) -> str:
        if not text.strip():
            return text
        if not self._rumdl_path:
            if not self._rumdl_warned:
                logger.warning("rumdl not available; skip markdown formatting")
                self._rumdl_warned = True
            return text

        def run() -> subprocess.CompletedProcess[str]:
            return subprocess.run(
                [self._rumdl_path, "fmt", "--stdin", "--quiet"],
                input=text,
                text=True,
                capture_output=True,
            )

        result = await asyncio.to_thread(run)
        if result.returncode != 0:
            logger.warning(
                "rumdl fmt failed (%s): %s",
                stage,
                (result.stderr or "").strip() or "unknown error",
            )
            return text
        return result.stdout or text
````
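Formatting shells out to the rumdl CLI when it is on PATH; the blocking subprocess.run is pushed onto a worker thread with asyncio.to_thread so the event loop keeps serving other groups. The same pattern in miniature (using `cat`, a POSIX stand-in that just echoes stdin, instead of rumdl):

````python
import asyncio
import subprocess

def blocking_fmt(text: str) -> str:
    # Stand-in for the rumdl invocation; `cat` echoes stdin unchanged (POSIX).
    return subprocess.run(["cat"], input=text, text=True, capture_output=True).stdout

async def main() -> None:
    out = await asyncio.to_thread(blocking_fmt, "# title\n")
    print(out, end="")

asyncio.run(main())
````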
````python
    def _normalize_markdown_images(self, text: str) -> str:
        lines = text.splitlines()
        out: list[str] = []
        in_fence = False
        fence_char = ""
        fence_len = 0
        img_re = re.compile(r"!\[[^\]]*\]\((?:[^)\\]|\\.)*\)")
        list_re = re.compile(r"^\s{0,3}(-|\*|\+|\d{1,9}\.)\s+")

        for line in lines:
            stripped = line.lstrip()
            if stripped.startswith(("```", "~~~")):
                run_len = 0
                while run_len < len(stripped) and stripped[run_len] == stripped[0]:
                    run_len += 1
                if not in_fence:
                    in_fence = True
                    fence_char = stripped[0]
                    fence_len = run_len
                elif stripped[0] == fence_char and run_len >= fence_len:
                    in_fence = False
                out.append(line)
                continue
            if in_fence:
                out.append(line)
                continue
            match = img_re.search(line)
            if not match:
                out.append(line)
                continue
            if list_re.match(line) or (line.lstrip().startswith("|") and line.count("|") >= 2):
                out.append(line)
                continue
            prefix = line[:match.start()]
            suffix = line[match.end():]
            prefix_text = prefix.strip()
            suffix_text = suffix.strip()
            indent = prefix if not prefix_text else ""
            if prefix_text:
                out.append(prefix.rstrip())
                out.append("")
            elif out and out[-1].strip():
                out.append("")
            out.append(f"{indent}{line[match.start():match.end()]}")
            if suffix_text:
                out.append("")
                out.append(suffix.strip())
            elif out and out[-1].strip():
                out.append("")
        return "\n".join(out)
````
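The image pass splits any inline image out of its surrounding prose onto its own blank-line-delimited line, while leaving images inside code fences, list items, and table rows alone. For example (`t` as above):

````python
print(t._normalize_markdown_images("Intro text ![fig](a.png) trailing text"))
# Intro text
#
# ![fig](a.png)
#
# trailing text
````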
````python
    def _normalize_markdown_block_math(self, text: str) -> str:
        lines = text.splitlines()
        out: list[str] = []
        in_fence = False
        fence_char = ""
        fence_len = 0
        in_math = False

        for idx, line in enumerate(lines):
            stripped = line.strip()
            if stripped.startswith(("```", "~~~")):
                run_len = 0
                while run_len < len(stripped) and stripped[run_len] == stripped[0]:
                    run_len += 1
                if not in_fence:
                    in_fence = True
                    fence_char = stripped[0]
                    fence_len = run_len
                elif stripped[0] == fence_char and run_len >= fence_len:
                    in_fence = False
                out.append(line)
                continue
            if in_fence:
                out.append(line)
                continue
            if not in_math and stripped in {"$$", "\\["}:
                if out and out[-1].strip():
                    out.append("")
                out.append(line)
                in_math = True
                continue
            if in_math:
                out.append(line)
                if stripped in {"$$", "\\]"}:
                    in_math = False
                    next_line = lines[idx + 1] if idx + 1 < len(lines) else ""
                    if next_line.strip():
                        out.append("")
                continue
            out.append(line)
        return "\n".join(out)

    def _group_nodes(
        self,
        nodes: dict[int, Node],
        only_ids: Optional[list[int]] = None,
        max_chunk_chars: Optional[int] = None,
        include_translated: bool = False,
    ) -> list[str]:
        groups: list[str] = []
        cur_group = ""
        limit = max_chunk_chars or self.cfg.max_chunk_chars

        ids = sorted(only_ids if only_ids is not None else nodes.keys())
        for nid in ids:
            node = nodes[nid]
            if (not include_translated) and node.translated_text:
                continue
            id_str = f"{nid:04d}"
            node_str = f"<NODE_START_{id_str}>\n{node.origin_text}\n</NODE_END_{id_str}>\n"
            if len(cur_group) + len(node_str) > limit and cur_group:
                groups.append(cur_group)
                cur_group = ""
            cur_group += node_str
        if cur_group:
            groups.append(cur_group)
        return groups
````
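Nodes travel to the model wrapped in numbered sentinel tags and are packed greedily up to the chunk limit; a single node larger than the limit still ships as its own group. The wire format (`t` as above; Node is constructed with the fields this module itself uses, which may not be its full signature):

````python
from deepresearch_flow.translator.segment import Node

nodes = {
    1: Node(nid=1, origin_text="Hello.", translated_text=""),
    2: Node(nid=2, origin_text="World.", translated_text=""),
}
print(t._group_nodes(nodes)[0])
# <NODE_START_0001>
# Hello.
# </NODE_END_0001>
# <NODE_START_0002>
# World.
# </NODE_END_0002>
````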
````python
    def _ungroup_nodes(self, group_text: str, origin_nodes: dict[int, Node]) -> dict[int, Node]:
        nodes: dict[int, Node] = {}
        for match in self._rx_node_unpack.finditer(group_text):
            node_id = int(match.group(1))
            if node_id not in origin_nodes:
                continue
            nodes[node_id] = Node(
                nid=node_id,
                origin_text=origin_nodes[node_id].origin_text,
                translated_text=match.group(2),
            )
        return nodes

    def _ungroup_groups(
        self,
        groups: list[str],
        origin_nodes: dict[int, Node],
        fill_missing: bool = True,
    ) -> dict[int, Node]:
        nodes: dict[int, Node] = {}
        for group_text in groups:
            nodes.update(self._ungroup_nodes(group_text, origin_nodes))
        if fill_missing:
            for nid, node in origin_nodes.items():
                if nid not in nodes:
                    nodes[nid] = node
        return nodes
````
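On the way back, sentinel pairs are parsed with a backreference, so mismatched or hallucinated tags are simply ignored, and fill_missing restores any node the model dropped. A reply round trip (`t` and `nodes` as above):

````python
reply = "<NODE_START_0001>\n你好。\n</NODE_END_0001>"
back = t._ungroup_nodes(reply, nodes)
print(repr(back[1].translated_text))
# '\n你好。\n', the whitespace padding around the node body is preserved
````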
````python
    def _collect_failed_nodes(self, nodes: dict[int, Node]) -> dict[int, Node]:
        failed: dict[int, Node] = {}
        for nid, node in nodes.items():
            ok = self._is_translation_success(node.origin_text, node.translated_text) if node.translated_text else False
            if not ok:
                failed[nid] = node
        return failed

    async def _translate_group(
        self,
        group_text: str,
        provider: ProviderConfig,
        model: str,
        client: httpx.AsyncClient,
        api_key: str | None,
        timeout: float,
        semaphore: asyncio.Semaphore,
        throttle: RequestThrottle | None,
        max_tokens: int | None,
        max_retries: int,
    ) -> str:
        attempts = 0
        while True:
            attempts += 1
            if throttle:
                await throttle.tick()
            messages = build_translation_messages(
                self.cfg.source_lang, self.cfg.target_lang, group_text
            )
            try:
                async with semaphore:
                    return await call_provider(
                        provider,
                        model,
                        messages,
                        {},
                        api_key,
                        timeout,
                        "none",
                        client,
                        max_tokens=max_tokens,
                    )
            except ProviderError as exc:
                if exc.retryable and attempts < max_retries:
                    await asyncio.sleep(backoff_delay(1.0, attempts, 20.0))
                    continue
                raise

    async def translate(
        self,
        text: str,
        provider: ProviderConfig,
        model: str,
        client: httpx.AsyncClient,
        api_keys: list[str],
        timeout: float,
        semaphore: asyncio.Semaphore,
        throttle: RequestThrottle | None,
        max_tokens: int | None,
        fix_level: str,
        progress: TranslationProgress | None = None,
        fallback_provider: ProviderConfig | None = None,
        fallback_model: str | None = None,
        fallback_max_tokens: int | None = None,
        fallback_provider_2: ProviderConfig | None = None,
        fallback_model_2: str | None = None,
        fallback_max_tokens_2: int | None = None,
        fallback_retry_times: int | None = None,
        fallback_retry_times_2: int | None = None,
        format_enabled: bool = True,
    ) -> TranslationResult:
        if fix_level != "off":
            text = fix_markdown(text, fix_level)
        if format_enabled:
            text = await self._format_markdown(text, "pre")

        store = PlaceHolderStore()
        protected = self.protector.protect(text, self.cfg, store)
        segments, nodes = split_to_segments(protected, self.cfg.max_chunk_chars)
        total_nodes = len(nodes)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("Segments: %d", len(segments))
            logger.debug("Nodes: %d", len(nodes))

        skip_count = 0
        for node in nodes.values():
            if self._is_placeholder_only(node.origin_text):
                node.translated_text = node.origin_text
                skip_count += 1
        if skip_count:
            logger.debug("Skipped %d placeholder-only nodes", skip_count)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("Placeholder counts: %s", store.kind_counts())

        rotator = KeyRotator(resolve_api_keys(api_keys))
        max_retries = max(self.cfg.retry_times, 1)

        groups = self._group_nodes(nodes)
        initial_groups = len(groups)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("Groups: %d", len(groups))
        if progress:
            await progress.add_groups(len(groups))
        outputs: list[str] = []
        for group in groups:
            api_key = await rotator.next_key()
            outputs.append(
                await self._translate_group(
                    group,
                    provider,
                    model,
                    client,
                    api_key,
                    timeout,
                    semaphore,
                    throttle,
                    max_tokens,
                    max_retries,
                )
            )
            if progress:
                await progress.advance_groups(1)

        translated_nodes = self._ungroup_groups(outputs, nodes)
        valid_placeholders = set(store.snapshot().values())
        if valid_placeholders:
            for node in translated_nodes.values():
                if node.translated_text:
                    node.translated_text = self._fix_placeholder_typos(
                        node.translated_text, valid_placeholders
                    )
        for node in translated_nodes.values():
            if node.translated_text:
                node.translated_text = self._align_placeholders(
                    node.origin_text, node.translated_text
                )
        failed_nodes = self._collect_failed_nodes(translated_nodes)
        success_count = max(total_nodes - len(failed_nodes), 0)
        logger.info(
            "Initial translation: nodes=%d ok=%d fail=%d skip=%d groups=%d",
            total_nodes,
            success_count,
            len(failed_nodes),
            skip_count,
            initial_groups,
        )
        self._log_failed_sample(failed_nodes, "initial")
        if progress:
            await progress.set_group_status(
                f"nodes {total_nodes} ok {success_count} "
                f"fail {len(failed_nodes)} skip {skip_count}"
            )

        retry_groups_total = 0
        retry_rounds = 0
        retry_limit = max_retries
        retry_group_limit = self.cfg.retry_group_max_chars or max(
            1024, self.cfg.max_chunk_chars // 2
        )
        if self.cfg.retry_failed_nodes and failed_nodes:
            attempt = 1
            while failed_nodes and attempt <= retry_limit:
                retry_ids = sorted(failed_nodes.keys())
                retry_groups = self._group_nodes(
                    failed_nodes,
                    only_ids=retry_ids,
                    max_chunk_chars=retry_group_limit,
                    include_translated=True,
                )
                if not retry_groups:
                    break
                retry_rounds += 1
                retry_groups_total += len(retry_groups)
                logger.info(
                    "Retrying %d failed nodes in %d groups (round %d/%d)",
                    len(failed_nodes),
                    len(retry_groups),
                    attempt,
                    retry_limit,
                )
                if progress:
                    await progress.add_groups(len(retry_groups))
                retry_outputs: list[str] = []
                for group in retry_groups:
                    api_key = await rotator.next_key()
                    retry_outputs.append(
                        await self._translate_group(
                            group,
                            provider,
                            model,
                            client,
                            api_key,
                            timeout,
                            semaphore,
                            throttle,
                            max_tokens,
                            retry_limit,
                        )
                    )
                    if progress:
                        await progress.advance_groups(1)
                retry_nodes = self._ungroup_groups(
                    retry_outputs, failed_nodes, fill_missing=False
                )
                if valid_placeholders:
                    for node in retry_nodes.values():
                        if node.translated_text:
                            node.translated_text = self._fix_placeholder_typos(
                                node.translated_text, valid_placeholders
                            )
                for node in retry_nodes.values():
                    if node.translated_text:
                        node.translated_text = self._align_placeholders(
                            node.origin_text, node.translated_text
                        )
                for nid, node in retry_nodes.items():
                    translated_nodes[nid] = node
                failed_nodes = self._collect_failed_nodes(translated_nodes)
                success_count = max(total_nodes - len(failed_nodes), 0)
                logger.info(
                    "Retry round %d done: nodes=%d ok=%d fail=%d skip=%d",
                    attempt,
                    total_nodes,
                    success_count,
                    len(failed_nodes),
                    skip_count,
                )
                self._log_failed_sample(failed_nodes, f"retry-{attempt}")
                if progress:
                    await progress.set_group_status(
                        f"nodes {total_nodes} ok {success_count} "
                        f"fail {len(failed_nodes)} skip {skip_count}"
                    )
                attempt += 1

        if (
            self.cfg.retry_failed_nodes
            and failed_nodes
            and fallback_provider
            and fallback_model
        ):
            fallback_rotator = KeyRotator(resolve_api_keys(fallback_provider.api_keys))
            attempt = 1
            fallback_retry_limit = fallback_retry_times or retry_limit
            while failed_nodes and attempt <= fallback_retry_limit:
                retry_ids = sorted(failed_nodes.keys())
                retry_groups = self._group_nodes(
                    failed_nodes,
                    only_ids=retry_ids,
                    max_chunk_chars=retry_group_limit,
                    include_translated=True,
                )
                if not retry_groups:
                    break
                retry_rounds += 1
                retry_groups_total += len(retry_groups)
                logger.info(
                    "Fallback %s/%s: retrying %d failed nodes in %d groups (round %d/%d)",
                    fallback_provider.name,
                    fallback_model,
                    len(failed_nodes),
                    len(retry_groups),
                    attempt,
                    fallback_retry_limit,
                )
                if progress:
                    await progress.add_groups(len(retry_groups))
                retry_outputs: list[str] = []
                for group in retry_groups:
                    api_key = await fallback_rotator.next_key()
                    retry_outputs.append(
                        await self._translate_group(
                            group,
                            fallback_provider,
                            fallback_model,
                            client,
                            api_key,
                            timeout,
                            semaphore,
                            throttle,
                            fallback_max_tokens,
                            fallback_retry_limit,
                        )
                    )
                    if progress:
                        await progress.advance_groups(1)
                retry_nodes = self._ungroup_groups(
                    retry_outputs, failed_nodes, fill_missing=False
                )
                if valid_placeholders:
                    for node in retry_nodes.values():
                        if node.translated_text:
                            node.translated_text = self._fix_placeholder_typos(
                                node.translated_text, valid_placeholders
                            )
                for node in retry_nodes.values():
                    if node.translated_text:
                        node.translated_text = self._align_placeholders(
                            node.origin_text, node.translated_text
                        )
                for nid, node in retry_nodes.items():
                    translated_nodes[nid] = node
                failed_nodes = self._collect_failed_nodes(translated_nodes)
                success_count = max(total_nodes - len(failed_nodes), 0)
                logger.info(
                    "Fallback round %d done: nodes=%d ok=%d fail=%d skip=%d",
                    attempt,
                    total_nodes,
                    success_count,
                    len(failed_nodes),
                    skip_count,
                )
                self._log_failed_sample(failed_nodes, f"fallback-{attempt}")
                if progress:
                    await progress.set_group_status(
                        f"nodes {total_nodes} ok {success_count} "
                        f"fail {len(failed_nodes)} skip {skip_count}"
                    )
                attempt += 1

        if (
            self.cfg.retry_failed_nodes
            and failed_nodes
            and fallback_provider_2
            and fallback_model_2
        ):
            fallback_rotator = KeyRotator(resolve_api_keys(fallback_provider_2.api_keys))
            attempt = 1
            fallback_retry_limit = fallback_retry_times_2 or retry_limit
            while failed_nodes and attempt <= fallback_retry_limit:
                retry_ids = sorted(failed_nodes.keys())
                retry_groups = self._group_nodes(
                    failed_nodes,
                    only_ids=retry_ids,
                    max_chunk_chars=retry_group_limit,
                    include_translated=True,
                )
                if not retry_groups:
                    break
                retry_rounds += 1
                retry_groups_total += len(retry_groups)
                logger.info(
                    "Fallback2 %s/%s: retrying %d failed nodes in %d groups (round %d/%d)",
                    fallback_provider_2.name,
                    fallback_model_2,
                    len(failed_nodes),
                    len(retry_groups),
                    attempt,
                    fallback_retry_limit,
                )
                if progress:
                    await progress.add_groups(len(retry_groups))
                retry_outputs: list[str] = []
                for group in retry_groups:
                    api_key = await fallback_rotator.next_key()
                    retry_outputs.append(
                        await self._translate_group(
                            group,
                            fallback_provider_2,
                            fallback_model_2,
                            client,
                            api_key,
                            timeout,
                            semaphore,
                            throttle,
                            fallback_max_tokens_2,
                            fallback_retry_limit,
                        )
                    )
                    if progress:
                        await progress.advance_groups(1)
                retry_nodes = self._ungroup_groups(
                    retry_outputs, failed_nodes, fill_missing=False
                )
                if valid_placeholders:
                    for node in retry_nodes.values():
                        if node.translated_text:
                            node.translated_text = self._fix_placeholder_typos(
                                node.translated_text, valid_placeholders
                            )
                for node in retry_nodes.values():
                    if node.translated_text:
                        node.translated_text = self._align_placeholders(
                            node.origin_text, node.translated_text
                        )
                for nid, node in retry_nodes.items():
                    translated_nodes[nid] = node
                failed_nodes = self._collect_failed_nodes(translated_nodes)
                success_count = max(total_nodes - len(failed_nodes), 0)
                logger.info(
                    "Fallback2 round %d done: nodes=%d ok=%d fail=%d skip=%d",
                    attempt,
                    total_nodes,
                    success_count,
                    len(failed_nodes),
                    skip_count,
                )
                self._log_failed_sample(failed_nodes, f"fallback2-{attempt}")
                if progress:
                    await progress.set_group_status(
                        f"nodes {total_nodes} ok {success_count} "
                        f"fail {len(failed_nodes)} skip {skip_count}"
                    )
                attempt += 1

        failed_count = len(failed_nodes)
        success_count = max(total_nodes - failed_count, 0)
        stats = TranslationStats(
            total_nodes=total_nodes,
            success_nodes=success_count,
            failed_nodes=failed_count,
            skipped_nodes=skip_count,
            initial_groups=initial_groups,
            retry_groups=retry_groups_total,
            retry_rounds=retry_rounds,
        )

        if logger.isEnabledFor(logging.DEBUG) and failed_nodes:
            sample_ids = list(sorted(failed_nodes.keys()))[:5]
            for nid in sample_ids:
                node = failed_nodes[nid]
                reason = self._translation_failure_reason(node.origin_text, node.translated_text)
                logger.debug(
                    "Failed node %d reason=%s origin=%s translated=%s",
                    nid,
                    reason or "unknown",
                    self._summarize_text(node.origin_text),
                    self._summarize_text(node.translated_text),
                )

        if failed_nodes:
            for nid in failed_nodes:
                translated_nodes[nid].translated_text = translated_nodes[nid].origin_text

        merged_text = reassemble_segments(segments, translated_nodes)
        restored = self.protector.unprotect(merged_text, store)
        if format_enabled:
            restored = await self._format_markdown(restored, "post")
        restored = self._normalize_markdown_blocks(restored)

        return TranslationResult(
            translated_text=restored,
            protected_text=protected,
            placeholder_store=store,
            nodes=translated_nodes,
            stats=stats,
        )
````
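Taken together, translate() runs: optional fixers and rumdl pre-format, protect, segment, group, translate with key rotation, placeholder repair and alignment, validity checks with up to three provider tiers of retries, then reassembly, unprotect, and post-format, with still-failing nodes falling back to their source text. A driver sketch, assuming the caller already holds valid TranslateConfig and ProviderConfig values (their fields live in the respective config modules, which this diff section does not show):

````python
import asyncio

import httpx

from deepresearch_flow.paper.config import ProviderConfig
from deepresearch_flow.translator.config import TranslateConfig
from deepresearch_flow.translator.engine import MarkdownTranslator, RequestThrottle

async def translate_file(
    path: str,
    cfg: TranslateConfig,
    provider: ProviderConfig,
    model: str,
    keys: list[str],
) -> None:
    translator = MarkdownTranslator(cfg)
    with open(path, encoding="utf-8") as fh:
        source = fh.read()
    async with httpx.AsyncClient() as client:
        result = await translator.translate(
            text=source,
            provider=provider,
            model=model,
            client=client,
            api_keys=keys,
            timeout=120.0,
            semaphore=asyncio.Semaphore(4),  # at most 4 in-flight requests
            throttle=RequestThrottle(sleep_every=10, sleep_time=1.0),
            max_tokens=None,
            fix_level="off",  # skip fix_markdown entirely
        )
    print(result.stats)
    with open(path + ".translated.md", "w", encoding="utf-8") as fh:
        fh.write(result.translated_text)
````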