deepresearch_flow-0.2.1-py3-none-any.whl → deepresearch_flow-0.4.0-py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
Files changed (41)
  1. deepresearch_flow/cli.py +2 -0
  2. deepresearch_flow/paper/config.py +15 -0
  3. deepresearch_flow/paper/db.py +193 -0
  4. deepresearch_flow/paper/db_ops.py +1939 -0
  5. deepresearch_flow/paper/llm.py +2 -0
  6. deepresearch_flow/paper/web/app.py +46 -3320
  7. deepresearch_flow/paper/web/constants.py +23 -0
  8. deepresearch_flow/paper/web/filters.py +255 -0
  9. deepresearch_flow/paper/web/handlers/__init__.py +14 -0
  10. deepresearch_flow/paper/web/handlers/api.py +217 -0
  11. deepresearch_flow/paper/web/handlers/pages.py +334 -0
  12. deepresearch_flow/paper/web/markdown.py +549 -0
  13. deepresearch_flow/paper/web/static/css/main.css +857 -0
  14. deepresearch_flow/paper/web/static/js/detail.js +406 -0
  15. deepresearch_flow/paper/web/static/js/index.js +266 -0
  16. deepresearch_flow/paper/web/static/js/outline.js +58 -0
  17. deepresearch_flow/paper/web/static/js/stats.js +39 -0
  18. deepresearch_flow/paper/web/templates/base.html +43 -0
  19. deepresearch_flow/paper/web/templates/detail.html +332 -0
  20. deepresearch_flow/paper/web/templates/index.html +114 -0
  21. deepresearch_flow/paper/web/templates/stats.html +29 -0
  22. deepresearch_flow/paper/web/templates.py +85 -0
  23. deepresearch_flow/paper/web/text.py +68 -0
  24. deepresearch_flow/recognize/cli.py +157 -3
  25. deepresearch_flow/recognize/organize.py +58 -0
  26. deepresearch_flow/translator/__init__.py +1 -0
  27. deepresearch_flow/translator/cli.py +451 -0
  28. deepresearch_flow/translator/config.py +19 -0
  29. deepresearch_flow/translator/engine.py +959 -0
  30. deepresearch_flow/translator/fixers.py +451 -0
  31. deepresearch_flow/translator/placeholder.py +62 -0
  32. deepresearch_flow/translator/prompts.py +116 -0
  33. deepresearch_flow/translator/protector.py +291 -0
  34. deepresearch_flow/translator/segment.py +180 -0
  35. deepresearch_flow-0.4.0.dist-info/METADATA +327 -0
  36. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/RECORD +40 -13
  37. deepresearch_flow-0.2.1.dist-info/METADATA +0 -424
  38. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/WHEEL +0 -0
  39. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/entry_points.txt +0 -0
  40. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/licenses/LICENSE +0 -0
  41. {deepresearch_flow-0.2.1.dist-info → deepresearch_flow-0.4.0.dist-info}/top_level.txt +0 -0
deepresearch_flow/translator/engine.py (new file)
@@ -0,0 +1,959 @@
+ """Translation engine for OCR markdown."""
+
+ from __future__ import annotations
+
+ import asyncio
+ from dataclasses import dataclass
+ import difflib
+ import logging
+ import re
+ import shutil
+ import subprocess
+ from typing import Optional, Protocol
+
+ import httpx
+
+ from deepresearch_flow.paper.config import ProviderConfig, resolve_api_keys
+ from deepresearch_flow.paper.llm import call_provider, backoff_delay
+ from deepresearch_flow.paper.providers.base import ProviderError
+ from deepresearch_flow.translator.config import TranslateConfig
+ from deepresearch_flow.translator.fixers import fix_markdown
+ from deepresearch_flow.translator.placeholder import PlaceHolderStore
+ from deepresearch_flow.translator.prompts import build_translation_messages
+ from deepresearch_flow.translator.protector import MarkdownProtector
+ from deepresearch_flow.translator.segment import Node, reassemble_segments, split_to_segments
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class TranslationProgress(Protocol):
+     async def add_groups(self, count: int) -> None:
+         ...
+
+     async def advance_groups(self, count: int) -> None:
+         ...
+
+     async def set_group_status(self, text: str) -> None:
+         ...
+
+
+ @dataclass
+ class TranslationResult:
+     translated_text: str
+     protected_text: str
+     placeholder_store: PlaceHolderStore
+     nodes: dict[int, Node]
+     stats: "TranslationStats"
+
+
+ @dataclass
+ class TranslationStats:
+     total_nodes: int
+     success_nodes: int
+     failed_nodes: int
+     skipped_nodes: int
+     initial_groups: int
+     retry_groups: int
+     retry_rounds: int
+
+
+ class KeyRotator:
+     def __init__(self, keys: list[str]) -> None:
+         self._keys = keys
+         self._idx = 0
+         self._lock = asyncio.Lock()
+
+     async def next_key(self) -> str | None:
+         if not self._keys:
+             return None
+         async with self._lock:
+             key = self._keys[self._idx % len(self._keys)]
+             self._idx += 1
+             return key
+
+
+ class RequestThrottle:
+     def __init__(self, sleep_every: int, sleep_time: float) -> None:
+         if sleep_every <= 0 or sleep_time <= 0:
+             raise ValueError("sleep_every and sleep_time must be positive")
+         self.sleep_every = sleep_every
+         self.sleep_time = sleep_time
+         self._count = 0
+         self._lock = asyncio.Lock()
+
+     async def tick(self) -> None:
+         async with self._lock:
+             self._count += 1
+             if self._count % self.sleep_every == 0:
+                 await asyncio.sleep(self.sleep_time)
+
+
+ class MarkdownTranslator:
+     def __init__(self, cfg: TranslateConfig) -> None:
+         self.cfg = cfg
+         self.protector = MarkdownProtector()
+         self._rumdl_path = shutil.which("rumdl")
+         self._rumdl_warned = False
+
+         self._rx_preserve = re.compile(
+             r"@@PRESERVE_(\d+)@@[\s\S]*?@@/PRESERVE_\1@@", re.DOTALL
+         )
+         self._rx_placeholder = re.compile(r"__PH_[A-Z0-9_]+__")
+         self._rx_placeholder_fuzzy = re.compile(
+             r"__PH[^A-Za-z0-9]*([A-Za-z0-9]+)[^0-9]*([0-9]{6})__"
+         )
+         self._rx_latex_dbl = re.compile(r"\$\$[\s\S]*?\$\$", re.DOTALL)
+         self._rx_latex_sgl = re.compile(r"\$[^$]*?\$")
+         self._rx_latex_pi = re.compile(r"\\\((?:.|\n)*?\\\)", re.DOTALL)
+         self._rx_latex_br = re.compile(r"\\\[(?:.|\n)*?\\\]", re.DOTALL)
+         self._rx_html_tag = re.compile(r"</?[^>]+>")
+         self._rx_code_fence = re.compile(r"```[\s\S]*?```", re.DOTALL)
+         self._rx_code_inline = re.compile(r"`[^`]*`")
+         self._rx_url = re.compile(r"https?://\S+|www\.\S+")
+         self._rx_letters = re.compile(
+             r"[A-Za-z\u00C0-\u024F\u4E00-\u9FFF\u3040-\u30FF\uAC00-\uD7AF]"
+         )
+
+         self._rx_node_unpack = re.compile(
+             r"<NODE_START_(\d{4})>(.*?)</NODE_END_\1>", re.DOTALL
+         )
+
+     def _strip_untranslatables(self, s: str) -> str:
+         s = self._rx_preserve.sub("", s)
+         s = self._rx_placeholder.sub("", s)
+         s = self._rx_latex_dbl.sub("", s)
+         s = self._rx_latex_sgl.sub("", s)
+         s = self._rx_latex_pi.sub("", s)
+         s = self._rx_latex_br.sub("", s)
+         s = self._rx_code_fence.sub("", s)
+         s = self._rx_code_inline.sub("", s)
+         s = self._rx_html_tag.sub("", s)
+         s = self._rx_url.sub("", s)
+         s = re.sub(r"[\s\W_]+", "", s, flags=re.UNICODE)
+         return s
+
+     def _is_placeholder_only(self, s: str) -> bool:
+         core = self._strip_untranslatables(s)
+         return not bool(self._rx_letters.search(core))
+
+     def _placeholders_multiset(self, s: str) -> list[str]:
+         return sorted(self._rx_placeholder.findall(s))
+
+     def _normalize_for_compare(self, s: str) -> str:
+         s = self._rx_placeholder.sub("", s)
+         s = self._rx_latex_dbl.sub("", s)
+         s = self._rx_latex_sgl.sub("", s)
+         s = self._rx_latex_pi.sub("", s)
+         s = self._rx_latex_br.sub("", s)
+         s = self._rx_code_fence.sub("", s)
+         s = self._rx_code_inline.sub("", s)
+         s = self._rx_html_tag.sub("", s)
+         s = self._rx_url.sub("", s)
+         s = re.sub(r"\s+", " ", s).strip()
+         return s
+
+     def _contains_target_script(self, s: str, target_lang: str) -> bool:
+         tl = (target_lang or "").lower()
+         if tl.startswith("zh"):
+             return bool(re.search(r"[\u4E00-\u9FFF]", s))
+         if tl.startswith(("ja", "jp")):
+             return bool(re.search(r"[\u3040-\u30FF\u4E00-\u9FFF]", s))
+         if tl.startswith("en"):
+             return bool(re.search(r"[A-Za-z]", s))
+         return True
+
+     def _looks_like_identifier(self, s: str) -> bool:
+         text = s.strip()
+         if not text:
+             return False
+         if re.search(r"\b(?:isbn|issn|doi|arxiv)\b", text, flags=re.IGNORECASE):
+             return True
+         if re.search(r"\b(?:acm|ieee)\b", text, flags=re.IGNORECASE):
+             return True
+         if re.search(
+             r"\b\S+\.(?:pdf|png|jpg|jpeg|gif|svg|tex|bib|csv|json|md)\b",
+             text,
+             flags=re.IGNORECASE,
+         ):
+             return True
+         letters = re.findall(r"[A-Za-z]", text)
+         if not letters:
+             return True
+         if text.upper() == text and len(re.findall(r"[A-Z]+", text)) <= 6:
+             return True
+         if len(re.findall(r"[A-Za-z]+", text)) <= 2 and len(text) <= 24:
+             return True
+         return False
+
+     def _looks_like_person_name(self, s: str) -> bool:
+         text = s.strip()
+         if not text or len(text) > 80:
+             return False
+         particles = {
+             "van",
+             "von",
+             "de",
+             "del",
+             "der",
+             "da",
+             "di",
+             "la",
+             "le",
+             "du",
+             "al",
+             "bin",
+             "ibn",
+             "dos",
+             "das",
+             "mac",
+             "mc",
+         }
+         suffixes = {"jr", "sr", "ii", "iii", "iv"}
+         cleaned_parts: list[str] = []
+         for raw in re.split(r"\s+", text):
+             part = raw.strip().strip(",.;:*†‡")
+             part = part.strip("()[]{}")
+             part = re.sub(r"\d+$", "", part)
+             part = part.strip(",.;:*†‡")
+             if part:
+                 cleaned_parts.append(part)
+         if len(cleaned_parts) < 2 or len(cleaned_parts) > 6:
+             return False
+         valid = 0
+         for part in cleaned_parts:
+             lower = part.lower()
+             if lower in particles or lower in suffixes:
+                 continue
+             if re.match(r"^[A-Z]\.?$", part):
+                 valid += 1
+                 continue
+             if re.match(r"^[A-Z][A-Za-z]+(?:[-'][A-Za-z]+)*\.?$", part):
+                 valid += 1
+                 continue
+             return False
+         return valid >= 2
+
+     def _is_translation_success(self, orig: str, trans: str) -> bool:
+         if self._placeholders_multiset(orig) != self._placeholders_multiset(trans):
+             return False
+         if self._is_placeholder_only(orig):
+             return bool(trans and trans.strip())
+         if not trans or not trans.strip():
+             return False
+         core = self._strip_untranslatables(orig)
+         if not bool(self._rx_letters.search(core)):
+             return True
+         ratio = difflib.SequenceMatcher(
+             None, self._normalize_for_compare(orig), self._normalize_for_compare(trans)
+         ).ratio()
+         if ratio < 0.92:
+             return True
+         if self._contains_target_script(trans, self.cfg.target_lang):
+             return True
+         return self._looks_like_identifier(orig) or self._looks_like_person_name(orig)
+
+     def _translation_failure_reason(self, orig: str, trans: str) -> str | None:
+         if self._placeholders_multiset(orig) != self._placeholders_multiset(trans):
+             return "placeholders_mismatch"
+         if self._is_placeholder_only(orig):
+             if not trans or not trans.strip():
+                 return "placeholder_only_empty"
+             return None
+         if not trans or not trans.strip():
+             return "empty_output"
+         core = self._strip_untranslatables(orig)
+         if not bool(self._rx_letters.search(core)):
+             return None
+         ratio = difflib.SequenceMatcher(
+             None, self._normalize_for_compare(orig), self._normalize_for_compare(trans)
+         ).ratio()
+         if ratio >= 0.92 and not self._contains_target_script(trans, self.cfg.target_lang):
+             if self._looks_like_identifier(orig):
+                 return None
+             if self._looks_like_person_name(orig):
+                 return None
+             return f"missing_target_script ratio={ratio:.2f}"
+         return None
+
+     def _fix_placeholder_typos(self, text: str, valid_placeholders: set[str]) -> str:
+         def replace(match: re.Match[str]) -> str:
+             kind = match.group(1).upper()
+             num = match.group(2)
+             candidate = f"__PH_{kind}_{num}__"
+             if candidate in valid_placeholders:
+                 return candidate
+             return match.group(0)
+
+         return self._rx_placeholder_fuzzy.sub(replace, text)
+
+     def _align_placeholders(self, orig: str, trans: str) -> str:
+         orig_phs = self._rx_placeholder.findall(orig)
+         trans_phs = self._rx_placeholder.findall(trans)
+         if not orig_phs and not trans_phs:
+             return trans
+         if not orig_phs:
+             return self._rx_placeholder.sub("", trans)
+         if not trans_phs:
+             joiner = " " if trans and not trans.endswith((" ", "\n")) else ""
+             return f"{trans}{joiner}{' '.join(orig_phs)}"
+         parts = self._rx_placeholder.split(trans)
+         out = parts[0]
+         used = 0
+         for idx in range(len(trans_phs)):
+             if used < len(orig_phs):
+                 out += orig_phs[used]
+                 used += 1
+             out += parts[idx + 1]
+         if used < len(orig_phs):
+             joiner = " " if out and not out.endswith((" ", "\n")) else ""
+             out += f"{joiner}{' '.join(orig_phs[used:])}"
+         return out
+
+     def _summarize_text(self, text: str, limit: int = 160) -> str:
+         compact = re.sub(r"\s+", " ", text).strip()
+         if len(compact) > limit:
+             return f"{compact[:limit]}…"
+         return compact
+
+     def _log_failed_sample(self, failed_nodes: dict[int, Node], label: str) -> None:
+         if not logger.isEnabledFor(logging.DEBUG) or not failed_nodes:
+             return
+         sample_ids = list(sorted(failed_nodes.keys()))[:5]
+         for nid in sample_ids:
+             node = failed_nodes[nid]
+             reason = self._translation_failure_reason(node.origin_text, node.translated_text)
+             logger.debug(
+                 "Failed node %d (%s) reason=%s origin=%s translated=%s",
+                 nid,
+                 label,
+                 reason or "unknown",
+                 self._summarize_text(node.origin_text),
+                 self._summarize_text(node.translated_text),
+             )
+
+     def _normalize_markdown_blocks(self, text: str) -> str:
+         text = self._normalize_markdown_images(text)
+         return self._normalize_markdown_block_math(text)
+
+     async def _format_markdown(self, text: str, stage: str) -> str:
+         if not text.strip():
+             return text
+         if not self._rumdl_path:
+             if not self._rumdl_warned:
+                 logger.warning("rumdl not available; skip markdown formatting")
+                 self._rumdl_warned = True
+             return text
+
+         def run() -> subprocess.CompletedProcess[str]:
+             return subprocess.run(
+                 [self._rumdl_path, "fmt", "--stdin", "--quiet"],
+                 input=text,
+                 text=True,
+                 capture_output=True,
+             )
+
+         result = await asyncio.to_thread(run)
+         if result.returncode != 0:
+             logger.warning(
+                 "rumdl fmt failed (%s): %s",
+                 stage,
+                 (result.stderr or "").strip() or "unknown error",
+             )
+             return text
+         return result.stdout or text
+
+     def _normalize_markdown_images(self, text: str) -> str:
+         lines = text.splitlines()
+         out: list[str] = []
+         in_fence = False
+         fence_char = ""
+         fence_len = 0
+         img_re = re.compile(r"!\[[^\]]*\]\((?:[^)\\]|\\.)*\)")
+         list_re = re.compile(r"^\s{0,3}(-|\*|\+|\d{1,9}\.)\s+")
+
+         for line in lines:
+             stripped = line.lstrip()
+             if stripped.startswith(("```", "~~~")):
+                 run_len = 0
+                 while run_len < len(stripped) and stripped[run_len] == stripped[0]:
+                     run_len += 1
+                 if not in_fence:
+                     in_fence = True
+                     fence_char = stripped[0]
+                     fence_len = run_len
+                 elif stripped[0] == fence_char and run_len >= fence_len:
+                     in_fence = False
+                 out.append(line)
+                 continue
+             if in_fence:
+                 out.append(line)
+                 continue
+             match = img_re.search(line)
+             if not match:
+                 out.append(line)
+                 continue
+             if list_re.match(line) or (line.lstrip().startswith("|") and line.count("|") >= 2):
+                 out.append(line)
+                 continue
+             prefix = line[:match.start()]
+             suffix = line[match.end():]
+             prefix_text = prefix.strip()
+             suffix_text = suffix.strip()
+             indent = prefix if not prefix_text else ""
+             if prefix_text:
+                 out.append(prefix.rstrip())
+                 out.append("")
+             elif out and out[-1].strip():
+                 out.append("")
+             out.append(f"{indent}{line[match.start():match.end()]}")
+             if suffix_text:
+                 out.append("")
+                 out.append(suffix.strip())
+             elif out and out[-1].strip():
+                 out.append("")
+         return "\n".join(out)
+
+     def _normalize_markdown_block_math(self, text: str) -> str:
+         lines = text.splitlines()
+         out: list[str] = []
+         in_fence = False
+         fence_char = ""
+         fence_len = 0
+         in_math = False
+
+         for idx, line in enumerate(lines):
+             stripped = line.strip()
+             if stripped.startswith(("```", "~~~")):
+                 run_len = 0
+                 while run_len < len(stripped) and stripped[run_len] == stripped[0]:
+                     run_len += 1
+                 if not in_fence:
+                     in_fence = True
+                     fence_char = stripped[0]
+                     fence_len = run_len
+                 elif stripped[0] == fence_char and run_len >= fence_len:
+                     in_fence = False
+                 out.append(line)
+                 continue
+             if in_fence:
+                 out.append(line)
+                 continue
+             if not in_math and stripped in {"$$", "\\["}:
+                 if out and out[-1].strip():
+                     out.append("")
+                 out.append(line)
+                 in_math = True
+                 continue
+             if in_math:
+                 out.append(line)
+                 if stripped in {"$$", "\\]"}:
+                     in_math = False
+                     next_line = lines[idx + 1] if idx + 1 < len(lines) else ""
+                     if next_line.strip():
+                         out.append("")
+                 continue
+             out.append(line)
+         return "\n".join(out)
+
+     def _group_nodes(
+         self,
+         nodes: dict[int, Node],
+         only_ids: Optional[list[int]] = None,
+         max_chunk_chars: Optional[int] = None,
+         include_translated: bool = False,
+     ) -> list[str]:
+         groups: list[str] = []
+         cur_group = ""
+         limit = max_chunk_chars or self.cfg.max_chunk_chars
+
+         ids = sorted(only_ids if only_ids is not None else nodes.keys())
+         for nid in ids:
+             node = nodes[nid]
+             if (not include_translated) and node.translated_text:
+                 continue
+             id_str = f"{nid:04d}"
+             node_str = f"<NODE_START_{id_str}>\n{node.origin_text}\n</NODE_END_{id_str}>\n"
+             if len(cur_group) + len(node_str) > limit and cur_group:
+                 groups.append(cur_group)
+                 cur_group = ""
+             cur_group += node_str
+         if cur_group:
+             groups.append(cur_group)
+         return groups
+
+     def _ungroup_nodes(self, group_text: str, origin_nodes: dict[int, Node]) -> dict[int, Node]:
+         nodes: dict[int, Node] = {}
+         for match in self._rx_node_unpack.finditer(group_text):
+             node_id = int(match.group(1))
+             if node_id not in origin_nodes:
+                 continue
+             nodes[node_id] = Node(
+                 nid=node_id,
+                 origin_text=origin_nodes[node_id].origin_text,
+                 translated_text=match.group(2),
+             )
+         return nodes
+
+     def _ungroup_groups(
+         self,
+         groups: list[str],
+         origin_nodes: dict[int, Node],
+         fill_missing: bool = True,
+     ) -> dict[int, Node]:
+         nodes: dict[int, Node] = {}
+         for group_text in groups:
+             nodes.update(self._ungroup_nodes(group_text, origin_nodes))
+         if fill_missing:
+             for nid, node in origin_nodes.items():
+                 if nid not in nodes:
+                     nodes[nid] = node
+         return nodes
+
+     def _collect_failed_nodes(self, nodes: dict[int, Node]) -> dict[int, Node]:
+         failed: dict[int, Node] = {}
+         for nid, node in nodes.items():
+             ok = self._is_translation_success(node.origin_text, node.translated_text) if node.translated_text else False
+             if not ok:
+                 failed[nid] = node
+         return failed
+
+     async def _translate_group(
+         self,
+         group_text: str,
+         provider: ProviderConfig,
+         model: str,
+         client: httpx.AsyncClient,
+         api_key: str | None,
+         timeout: float,
+         semaphore: asyncio.Semaphore,
+         throttle: RequestThrottle | None,
+         max_tokens: int | None,
+         max_retries: int,
+     ) -> str:
+         attempts = 0
+         while True:
+             attempts += 1
+             if throttle:
+                 await throttle.tick()
+             messages = build_translation_messages(
+                 self.cfg.source_lang, self.cfg.target_lang, group_text
+             )
+             try:
+                 async with semaphore:
+                     return await call_provider(
+                         provider,
+                         model,
+                         messages,
+                         {},
+                         api_key,
+                         timeout,
+                         "none",
+                         client,
+                         max_tokens=max_tokens,
+                     )
+             except ProviderError as exc:
+                 if exc.retryable and attempts < max_retries:
+                     await asyncio.sleep(backoff_delay(1.0, attempts, 20.0))
+                     continue
+                 raise
+
+     async def translate(
+         self,
+         text: str,
+         provider: ProviderConfig,
+         model: str,
+         client: httpx.AsyncClient,
+         api_keys: list[str],
+         timeout: float,
+         semaphore: asyncio.Semaphore,
+         throttle: RequestThrottle | None,
+         max_tokens: int | None,
+         fix_level: str,
+         progress: TranslationProgress | None = None,
+         fallback_provider: ProviderConfig | None = None,
+         fallback_model: str | None = None,
+         fallback_max_tokens: int | None = None,
+         fallback_provider_2: ProviderConfig | None = None,
+         fallback_model_2: str | None = None,
+         fallback_max_tokens_2: int | None = None,
+         fallback_retry_times: int | None = None,
+         fallback_retry_times_2: int | None = None,
+         format_enabled: bool = True,
+     ) -> TranslationResult:
+         if fix_level != "off":
+             text = fix_markdown(text, fix_level)
+         if format_enabled:
+             text = await self._format_markdown(text, "pre")
+
+         store = PlaceHolderStore()
+         protected = self.protector.protect(text, self.cfg, store)
+         segments, nodes = split_to_segments(protected, self.cfg.max_chunk_chars)
+         total_nodes = len(nodes)
+         if logger.isEnabledFor(logging.DEBUG):
+             logger.debug("Segments: %d", len(segments))
+             logger.debug("Nodes: %d", len(nodes))
+
+         skip_count = 0
+         for node in nodes.values():
+             if self._is_placeholder_only(node.origin_text):
+                 node.translated_text = node.origin_text
+                 skip_count += 1
+         if skip_count:
+             logger.debug("Skipped %d placeholder-only nodes", skip_count)
+         if logger.isEnabledFor(logging.DEBUG):
+             logger.debug("Placeholder counts: %s", store.kind_counts())
+
+         rotator = KeyRotator(resolve_api_keys(api_keys))
+         max_retries = max(self.cfg.retry_times, 1)
+
+         groups = self._group_nodes(nodes)
+         initial_groups = len(groups)
+         if logger.isEnabledFor(logging.DEBUG):
+             logger.debug("Groups: %d", len(groups))
+         if progress:
+             await progress.add_groups(len(groups))
+         outputs: list[str] = []
+         for group in groups:
+             api_key = await rotator.next_key()
+             outputs.append(
+                 await self._translate_group(
+                     group,
+                     provider,
+                     model,
+                     client,
+                     api_key,
+                     timeout,
+                     semaphore,
+                     throttle,
+                     max_tokens,
+                     max_retries,
+                 )
+             )
+             if progress:
+                 await progress.advance_groups(1)
+
+         translated_nodes = self._ungroup_groups(outputs, nodes)
+         valid_placeholders = set(store.snapshot().values())
+         if valid_placeholders:
+             for node in translated_nodes.values():
+                 if node.translated_text:
+                     node.translated_text = self._fix_placeholder_typos(
+                         node.translated_text, valid_placeholders
+                     )
+         for node in translated_nodes.values():
+             if node.translated_text:
+                 node.translated_text = self._align_placeholders(
+                     node.origin_text, node.translated_text
+                 )
+         failed_nodes = self._collect_failed_nodes(translated_nodes)
+         success_count = max(total_nodes - len(failed_nodes), 0)
+         logger.info(
+             "Initial translation: nodes=%d ok=%d fail=%d skip=%d groups=%d",
+             total_nodes,
+             success_count,
+             len(failed_nodes),
+             skip_count,
+             initial_groups,
+         )
+         self._log_failed_sample(failed_nodes, "initial")
+         if progress:
+             await progress.set_group_status(
+                 f"nodes {total_nodes} ok {success_count} "
+                 f"fail {len(failed_nodes)} skip {skip_count}"
+             )
+
+         retry_groups_total = 0
+         retry_rounds = 0
+         retry_limit = max_retries
+         retry_group_limit = self.cfg.retry_group_max_chars or max(
+             1024, self.cfg.max_chunk_chars // 2
+         )
+         if self.cfg.retry_failed_nodes and failed_nodes:
+             attempt = 1
+             while failed_nodes and attempt <= retry_limit:
+                 retry_ids = sorted(failed_nodes.keys())
+                 retry_groups = self._group_nodes(
+                     failed_nodes,
+                     only_ids=retry_ids,
+                     max_chunk_chars=retry_group_limit,
+                     include_translated=True,
+                 )
+                 if not retry_groups:
+                     break
+                 retry_rounds += 1
+                 retry_groups_total += len(retry_groups)
+                 logger.info(
+                     "Retrying %d failed nodes in %d groups (round %d/%d)",
+                     len(failed_nodes),
+                     len(retry_groups),
+                     attempt,
+                     retry_limit,
+                 )
+                 if progress:
+                     await progress.add_groups(len(retry_groups))
+                 retry_outputs: list[str] = []
+                 for group in retry_groups:
+                     api_key = await rotator.next_key()
+                     retry_outputs.append(
+                         await self._translate_group(
+                             group,
+                             provider,
+                             model,
+                             client,
+                             api_key,
+                             timeout,
+                             semaphore,
+                             throttle,
+                             max_tokens,
+                             retry_limit,
+                         )
+                     )
+                     if progress:
+                         await progress.advance_groups(1)
+                 retry_nodes = self._ungroup_groups(
+                     retry_outputs, failed_nodes, fill_missing=False
+                 )
+                 if valid_placeholders:
+                     for node in retry_nodes.values():
+                         if node.translated_text:
+                             node.translated_text = self._fix_placeholder_typos(
+                                 node.translated_text, valid_placeholders
+                             )
+                 for node in retry_nodes.values():
+                     if node.translated_text:
+                         node.translated_text = self._align_placeholders(
+                             node.origin_text, node.translated_text
+                         )
+                 for nid, node in retry_nodes.items():
+                     translated_nodes[nid] = node
+                 failed_nodes = self._collect_failed_nodes(translated_nodes)
+                 success_count = max(total_nodes - len(failed_nodes), 0)
+                 logger.info(
+                     "Retry round %d done: nodes=%d ok=%d fail=%d skip=%d",
+                     attempt,
+                     total_nodes,
+                     success_count,
+                     len(failed_nodes),
+                     skip_count,
+                 )
+                 self._log_failed_sample(failed_nodes, f"retry-{attempt}")
+                 if progress:
+                     await progress.set_group_status(
+                         f"nodes {total_nodes} ok {success_count} "
+                         f"fail {len(failed_nodes)} skip {skip_count}"
+                     )
+                 attempt += 1
+
+         if (
+             self.cfg.retry_failed_nodes
+             and failed_nodes
+             and fallback_provider
+             and fallback_model
+         ):
+             fallback_rotator = KeyRotator(resolve_api_keys(fallback_provider.api_keys))
+             attempt = 1
+             fallback_retry_limit = fallback_retry_times or retry_limit
+             while failed_nodes and attempt <= fallback_retry_limit:
+                 retry_ids = sorted(failed_nodes.keys())
+                 retry_groups = self._group_nodes(
+                     failed_nodes,
+                     only_ids=retry_ids,
+                     max_chunk_chars=retry_group_limit,
+                     include_translated=True,
+                 )
+                 if not retry_groups:
+                     break
+                 retry_rounds += 1
+                 retry_groups_total += len(retry_groups)
+                 logger.info(
+                     "Fallback %s/%s: retrying %d failed nodes in %d groups (round %d/%d)",
+                     fallback_provider.name,
+                     fallback_model,
+                     len(failed_nodes),
+                     len(retry_groups),
+                     attempt,
+                     fallback_retry_limit,
+                 )
+                 if progress:
+                     await progress.add_groups(len(retry_groups))
+                 retry_outputs: list[str] = []
+                 for group in retry_groups:
+                     api_key = await fallback_rotator.next_key()
+                     retry_outputs.append(
+                         await self._translate_group(
+                             group,
+                             fallback_provider,
+                             fallback_model,
+                             client,
+                             api_key,
+                             timeout,
+                             semaphore,
+                             throttle,
+                             fallback_max_tokens,
+                             fallback_retry_limit,
+                         )
+                     )
+                     if progress:
+                         await progress.advance_groups(1)
+                 retry_nodes = self._ungroup_groups(
+                     retry_outputs, failed_nodes, fill_missing=False
+                 )
+                 if valid_placeholders:
+                     for node in retry_nodes.values():
+                         if node.translated_text:
+                             node.translated_text = self._fix_placeholder_typos(
+                                 node.translated_text, valid_placeholders
+                             )
+                 for node in retry_nodes.values():
+                     if node.translated_text:
+                         node.translated_text = self._align_placeholders(
+                             node.origin_text, node.translated_text
+                         )
+                 for nid, node in retry_nodes.items():
+                     translated_nodes[nid] = node
+                 failed_nodes = self._collect_failed_nodes(translated_nodes)
+                 success_count = max(total_nodes - len(failed_nodes), 0)
+                 logger.info(
+                     "Fallback round %d done: nodes=%d ok=%d fail=%d skip=%d",
+                     attempt,
+                     total_nodes,
+                     success_count,
+                     len(failed_nodes),
+                     skip_count,
+                 )
+                 self._log_failed_sample(failed_nodes, f"fallback-{attempt}")
+                 if progress:
+                     await progress.set_group_status(
+                         f"nodes {total_nodes} ok {success_count} "
+                         f"fail {len(failed_nodes)} skip {skip_count}"
+                     )
+                 attempt += 1
+
+         if (
+             self.cfg.retry_failed_nodes
+             and failed_nodes
+             and fallback_provider_2
+             and fallback_model_2
+         ):
+             fallback_rotator = KeyRotator(resolve_api_keys(fallback_provider_2.api_keys))
+             attempt = 1
+             fallback_retry_limit = fallback_retry_times_2 or retry_limit
+             while failed_nodes and attempt <= fallback_retry_limit:
+                 retry_ids = sorted(failed_nodes.keys())
+                 retry_groups = self._group_nodes(
+                     failed_nodes,
+                     only_ids=retry_ids,
+                     max_chunk_chars=retry_group_limit,
+                     include_translated=True,
+                 )
+                 if not retry_groups:
+                     break
+                 retry_rounds += 1
+                 retry_groups_total += len(retry_groups)
+                 logger.info(
+                     "Fallback2 %s/%s: retrying %d failed nodes in %d groups (round %d/%d)",
+                     fallback_provider_2.name,
+                     fallback_model_2,
+                     len(failed_nodes),
+                     len(retry_groups),
+                     attempt,
+                     fallback_retry_limit,
+                 )
+                 if progress:
+                     await progress.add_groups(len(retry_groups))
+                 retry_outputs: list[str] = []
+                 for group in retry_groups:
+                     api_key = await fallback_rotator.next_key()
+                     retry_outputs.append(
+                         await self._translate_group(
+                             group,
+                             fallback_provider_2,
+                             fallback_model_2,
+                             client,
+                             api_key,
+                             timeout,
+                             semaphore,
+                             throttle,
+                             fallback_max_tokens_2,
+                             fallback_retry_limit,
+                         )
+                     )
+                     if progress:
+                         await progress.advance_groups(1)
+                 retry_nodes = self._ungroup_groups(
+                     retry_outputs, failed_nodes, fill_missing=False
+                 )
+                 if valid_placeholders:
+                     for node in retry_nodes.values():
+                         if node.translated_text:
+                             node.translated_text = self._fix_placeholder_typos(
+                                 node.translated_text, valid_placeholders
+                             )
+                 for node in retry_nodes.values():
+                     if node.translated_text:
+                         node.translated_text = self._align_placeholders(
+                             node.origin_text, node.translated_text
+                         )
+                 for nid, node in retry_nodes.items():
+                     translated_nodes[nid] = node
+                 failed_nodes = self._collect_failed_nodes(translated_nodes)
+                 success_count = max(total_nodes - len(failed_nodes), 0)
+                 logger.info(
+                     "Fallback2 round %d done: nodes=%d ok=%d fail=%d skip=%d",
+                     attempt,
+                     total_nodes,
+                     success_count,
+                     len(failed_nodes),
+                     skip_count,
+                 )
+                 self._log_failed_sample(failed_nodes, f"fallback2-{attempt}")
+                 if progress:
+                     await progress.set_group_status(
+                         f"nodes {total_nodes} ok {success_count} "
+                         f"fail {len(failed_nodes)} skip {skip_count}"
+                     )
+                 attempt += 1
+
+         failed_count = len(failed_nodes)
+         success_count = max(total_nodes - failed_count, 0)
+         stats = TranslationStats(
+             total_nodes=total_nodes,
+             success_nodes=success_count,
+             failed_nodes=failed_count,
+             skipped_nodes=skip_count,
+             initial_groups=initial_groups,
+             retry_groups=retry_groups_total,
+             retry_rounds=retry_rounds,
+         )
+
+         if logger.isEnabledFor(logging.DEBUG) and failed_nodes:
+             sample_ids = list(sorted(failed_nodes.keys()))[:5]
+             for nid in sample_ids:
+                 node = failed_nodes[nid]
+                 reason = self._translation_failure_reason(node.origin_text, node.translated_text)
+                 logger.debug(
+                     "Failed node %d reason=%s origin=%s translated=%s",
+                     nid,
+                     reason or "unknown",
+                     self._summarize_text(node.origin_text),
+                     self._summarize_text(node.translated_text),
+                 )
+
+         if failed_nodes:
+             for nid in failed_nodes:
+                 translated_nodes[nid].translated_text = translated_nodes[nid].origin_text
+
+         merged_text = reassemble_segments(segments, translated_nodes)
+         restored = self.protector.unprotect(merged_text, store)
+         if format_enabled:
+             restored = await self._format_markdown(restored, "post")
+         restored = self._normalize_markdown_blocks(restored)
+
+         return TranslationResult(
+             translated_text=restored,
+             protected_text=protected,
+             placeholder_store=store,
+             nodes=translated_nodes,
+             stats=stats,
+         )