deepresearch-flow 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. deepresearch_flow/paper/cli.py +63 -0
  2. deepresearch_flow/paper/config.py +87 -12
  3. deepresearch_flow/paper/db.py +1041 -34
  4. deepresearch_flow/paper/db_ops.py +145 -26
  5. deepresearch_flow/paper/extract.py +1546 -152
  6. deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +8 -0
  7. deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +396 -0
  8. deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
  9. deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
  10. deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +7 -0
  11. deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +135 -0
  12. deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
  13. deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
  14. deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +8 -0
  15. deepresearch_flow/paper/prompt_templates/simple_phi_user.j2 +31 -0
  16. deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
  17. deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
  18. deepresearch_flow/paper/providers/azure_openai.py +45 -3
  19. deepresearch_flow/paper/providers/openai_compatible.py +45 -3
  20. deepresearch_flow/paper/schemas/deep_read_phi_schema.json +31 -0
  21. deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
  22. deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
  23. deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
  24. deepresearch_flow/paper/snapshot/__init__.py +4 -0
  25. deepresearch_flow/paper/snapshot/api.py +941 -0
  26. deepresearch_flow/paper/snapshot/builder.py +965 -0
  27. deepresearch_flow/paper/snapshot/identity.py +239 -0
  28. deepresearch_flow/paper/snapshot/schema.py +245 -0
  29. deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
  30. deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
  31. deepresearch_flow/paper/snapshot/text.py +154 -0
  32. deepresearch_flow/paper/template_registry.py +40 -0
  33. deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
  34. deepresearch_flow/paper/templates/deep_read_phi.md.j2 +44 -0
  35. deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
  36. deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
  37. deepresearch_flow/paper/web/app.py +10 -3
  38. deepresearch_flow/paper/web/markdown.py +174 -8
  39. deepresearch_flow/paper/web/static/css/main.css +8 -1
  40. deepresearch_flow/paper/web/static/js/detail.js +46 -12
  41. deepresearch_flow/paper/web/templates/detail.html +9 -0
  42. deepresearch_flow/paper/web/text.py +8 -4
  43. deepresearch_flow/recognize/cli.py +380 -103
  44. deepresearch_flow/recognize/markdown.py +31 -7
  45. deepresearch_flow/recognize/math.py +47 -12
  46. deepresearch_flow/recognize/mermaid.py +320 -10
  47. deepresearch_flow/recognize/organize.py +35 -16
  48. deepresearch_flow/translator/cli.py +71 -20
  49. deepresearch_flow/translator/engine.py +220 -81
  50. deepresearch_flow/translator/fixers.py +15 -0
  51. deepresearch_flow/translator/prompts.py +19 -2
  52. deepresearch_flow/translator/protector.py +15 -3
  53. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
  54. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +58 -42
  55. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
  56. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
  57. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
  58. {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
@@ -7,6 +7,7 @@ import json
7
7
  import logging
8
8
  from pathlib import Path
9
9
  import time
10
+ from typing import Any
10
11
 
11
12
  import click
12
13
  import coloredlogs
@@ -24,7 +25,7 @@ from deepresearch_flow.paper.utils import (
24
25
  short_hash,
25
26
  )
26
27
  from deepresearch_flow.translator.config import TranslateConfig
27
- from deepresearch_flow.translator.engine import MarkdownTranslator, RequestThrottle
28
+ from deepresearch_flow.translator.engine import DumpSnapshot, MarkdownTranslator, RequestThrottle
28
29
 
29
30
 
30
31
  logger = logging.getLogger(__name__)
@@ -126,6 +127,14 @@ def translator() -> None:
126
127
  @click.option("--fix-level", "fix_level", default="moderate", type=click.Choice(["off", "moderate", "aggressive"]))
127
128
  @click.option("--max-chunk-chars", "max_chunk_chars", default=4000, show_default=True, type=int)
128
129
  @click.option("--max-concurrency", "max_concurrency", default=4, show_default=True, type=int)
130
+ @click.option(
131
+ "--group-concurrency",
132
+ "group_concurrency",
133
+ default=1,
134
+ show_default=True,
135
+ type=int,
136
+ help="Concurrent translation groups per document",
137
+ )
129
138
  @click.option("--timeout", "timeout", default=120.0, show_default=True, type=float)
130
139
  @click.option("--retry-times", "retry_times", default=3, show_default=True, type=int)
131
140
  @click.option("--fallback-model", "fallback_model_ref", default=None, help="Fallback provider/model")
@@ -155,6 +164,12 @@ def translator() -> None:
155
164
  @click.option("--dump-protected", "dump_protected", is_flag=True, help="Write protected markdown")
156
165
  @click.option("--dump-placeholders", "dump_placeholders", is_flag=True, help="Write placeholder mapping JSON")
157
166
  @click.option("--dump-nodes", "dump_nodes", is_flag=True, help="Write per-node translation JSON")
167
+ @click.option(
168
+ "--dump-requests-log",
169
+ "dump_requests_log",
170
+ is_flag=True,
171
+ help="Write request/response attempts to JSON log",
172
+ )
158
173
  @click.option("--no-format", "no_format", is_flag=True, help="Disable rumdl formatting")
159
174
  @click.option("--dry-run", "dry_run", is_flag=True, help="Discover inputs without calling providers")
160
175
  @click.option("--force", "force", is_flag=True, help="Overwrite existing outputs")
@@ -171,6 +186,7 @@ def translate(
171
186
  fix_level: str,
172
187
  max_chunk_chars: int,
173
188
  max_concurrency: int,
189
+ group_concurrency: int,
174
190
  timeout: float,
175
191
  retry_times: int,
176
192
  fallback_model_ref: str | None,
@@ -183,6 +199,7 @@ def translate(
183
199
  dump_protected: bool,
184
200
  dump_placeholders: bool,
185
201
  dump_nodes: bool,
202
+ dump_requests_log: bool,
186
203
  no_format: bool,
187
204
  dry_run: bool,
188
205
  force: bool,
@@ -240,6 +257,8 @@ def translate(
240
257
  raise click.ClickException("--max-chunk-chars must be positive")
241
258
  if max_concurrency <= 0:
242
259
  raise click.ClickException("--max-concurrency must be positive")
260
+ if group_concurrency <= 0:
261
+ raise click.ClickException("--group-concurrency must be positive")
243
262
  if timeout <= 0:
244
263
  raise click.ClickException("--timeout must be positive")
245
264
  if retry_times <= 0:
@@ -288,7 +307,9 @@ def translate(
288
307
  output_root.mkdir(parents=True, exist_ok=True)
289
308
 
290
309
  debug_root = Path(debug_dir) if debug_dir else None
291
- if debug_root is None and (dump_protected or dump_placeholders or dump_nodes):
310
+ if debug_root is None and (
311
+ dump_protected or dump_placeholders or dump_nodes or dump_requests_log
312
+ ):
292
313
  debug_root = output_root or Path.cwd()
293
314
  if debug_root is not None:
294
315
  debug_root.mkdir(parents=True, exist_ok=True)
@@ -360,6 +381,43 @@ def translate(
360
381
  progress: ProgressTracker,
361
382
  ) -> None:
362
383
  content = read_text(path)
384
+ request_log: list[dict[str, Any]] = []
385
+ debug_tag = None
386
+ protected_path = None
387
+ placeholders_path = None
388
+ nodes_path = None
389
+ requests_path = None
390
+ if debug_root is not None:
391
+ debug_tag = f"{path.stem}.{short_hash(str(path))}"
392
+ protected_path = debug_root / f"{debug_tag}.protected.md"
393
+ placeholders_path = debug_root / f"{debug_tag}.placeholders.json"
394
+ nodes_path = debug_root / f"{debug_tag}.nodes.json"
395
+ requests_path = debug_root / f"{debug_tag}.requests.json"
396
+
397
+ def write_dump(snapshot: DumpSnapshot) -> None:
398
+ if debug_root is None or debug_tag is None:
399
+ return
400
+ if dump_protected and snapshot.protected_text is not None and protected_path:
401
+ protected_path.write_text(snapshot.protected_text, encoding="utf-8")
402
+ if dump_placeholders and snapshot.placeholder_store is not None and placeholders_path:
403
+ snapshot.placeholder_store.save(str(placeholders_path))
404
+ if dump_nodes and snapshot.nodes is not None and nodes_path:
405
+ node_payload = {
406
+ str(node_id): {
407
+ "origin_text": node.origin_text,
408
+ "translated_text": node.translated_text,
409
+ }
410
+ for node_id, node in snapshot.nodes.items()
411
+ }
412
+ nodes_path.write_text(
413
+ json.dumps(node_payload, ensure_ascii=False, indent=2),
414
+ encoding="utf-8",
415
+ )
416
+ if dump_requests_log and snapshot.request_log is not None and requests_path:
417
+ requests_path.write_text(
418
+ json.dumps(snapshot.request_log, ensure_ascii=False, indent=2),
419
+ encoding="utf-8",
420
+ )
363
421
  result = await translator.translate(
364
422
  content,
365
423
  provider,
@@ -381,6 +439,9 @@ def translate(
381
439
  fallback_retry_times=fallback_retry_times,
382
440
  fallback_retry_times_2=fallback_retry_times_2,
383
441
  format_enabled=not no_format,
442
+ request_log=request_log if dump_requests_log else None,
443
+ dump_callback=write_dump if debug_root is not None else None,
444
+ group_concurrency=group_concurrency,
384
445
  )
385
446
  output_path = output_map[path]
386
447
  output_path.write_text(result.translated_text, encoding="utf-8")
@@ -401,25 +462,15 @@ def translate(
401
462
  )
402
463
 
403
464
  if debug_root is not None:
404
- debug_tag = f"{path.stem}.{short_hash(str(path))}"
405
- if dump_protected:
406
- (debug_root / f"{debug_tag}.protected.md").write_text(
407
- result.protected_text, encoding="utf-8"
408
- )
409
- if dump_placeholders:
410
- result.placeholder_store.save(str(debug_root / f"{debug_tag}.placeholders.json"))
411
- if dump_nodes:
412
- node_payload = {
413
- str(node_id): {
414
- "origin_text": node.origin_text,
415
- "translated_text": node.translated_text,
416
- }
417
- for node_id, node in result.nodes.items()
418
- }
419
- (debug_root / f"{debug_tag}.nodes.json").write_text(
420
- json.dumps(node_payload, ensure_ascii=False, indent=2),
421
- encoding="utf-8",
465
+ write_dump(
466
+ DumpSnapshot(
467
+ stage="final",
468
+ nodes=result.nodes,
469
+ protected_text=result.protected_text,
470
+ placeholder_store=result.placeholder_store,
471
+ request_log=request_log if dump_requests_log else None,
422
472
  )
473
+ )
423
474
  await progress.advance_docs(1)
424
475
 
425
476
  async def run() -> None:
@@ -9,7 +9,8 @@ import logging
9
9
  import re
10
10
  import shutil
11
11
  import subprocess
12
- from typing import Optional, Protocol
12
+ import time
13
+ from typing import Any, Callable, Optional, Protocol
13
14
 
14
15
  import httpx
15
16
 
@@ -47,6 +48,15 @@ class TranslationResult:
47
48
  stats: "TranslationStats"
48
49
 
49
50
 
51
+ @dataclass
52
+ class DumpSnapshot:
53
+ stage: str
54
+ nodes: dict[int, Node] | None = None
55
+ protected_text: str | None = None
56
+ placeholder_store: PlaceHolderStore | None = None
57
+ request_log: list[dict[str, Any]] | None = None
58
+
59
+
50
60
  @dataclass
51
61
  class TranslationStats:
52
62
  total_nodes: int
@@ -116,7 +126,8 @@ class MarkdownTranslator:
116
126
  )
117
127
 
118
128
  self._rx_node_unpack = re.compile(
119
- r"<NODE_START_(\d{4})>(.*?)</NODE_END_\1>", re.DOTALL
129
+ r"(?:<|@@)NODE_START_(\d{4})(?:>|@@)(.*?)(?:</NODE_END_\1>|@@NODE_END_\1@@)",
130
+ re.DOTALL,
120
131
  )
121
132
 
122
133
  def _strip_untranslatables(self, s: str) -> str:
@@ -167,6 +178,10 @@ class MarkdownTranslator:
167
178
  text = s.strip()
168
179
  if not text:
169
180
  return False
181
+ if "__PH_" in text:
182
+ core = self._strip_untranslatables(text)
183
+ if len(core) <= 2:
184
+ return True
170
185
  if re.search(r"\b(?:isbn|issn|doi|arxiv)\b", text, flags=re.IGNORECASE):
171
186
  return True
172
187
  if re.search(r"\b(?:acm|ieee)\b", text, flags=re.IGNORECASE):
@@ -341,7 +356,10 @@ class MarkdownTranslator:
341
356
  return text
342
357
  if not self._rumdl_path:
343
358
  if not self._rumdl_warned:
344
- logger.warning("rumdl not available; skip markdown formatting")
359
+ logger.warning(
360
+ "rumdl not available; skip markdown formatting (stage=%s)",
361
+ stage,
362
+ )
345
363
  self._rumdl_warned = True
346
364
  return text
347
365
 
@@ -356,8 +374,9 @@ class MarkdownTranslator:
356
374
  result = await asyncio.to_thread(run)
357
375
  if result.returncode != 0:
358
376
  logger.warning(
359
- "rumdl fmt failed (%s): %s",
377
+ "rumdl fmt failed (stage=%s, rc=%s): %s",
360
378
  stage,
379
+ result.returncode,
361
380
  (result.stderr or "").strip() or "unknown error",
362
381
  )
363
382
  return text
@@ -530,6 +549,10 @@ class MarkdownTranslator:
530
549
  throttle: RequestThrottle | None,
531
550
  max_tokens: int | None,
532
551
  max_retries: int,
552
+ request_log: list[dict[str, Any]] | None,
553
+ stage: str,
554
+ group_index: int,
555
+ dump_callback: Callable[[DumpSnapshot], None] | None,
533
556
  ) -> str:
534
557
  attempts = 0
535
558
  while True:
@@ -539,9 +562,10 @@ class MarkdownTranslator:
539
562
  messages = build_translation_messages(
540
563
  self.cfg.source_lang, self.cfg.target_lang, group_text
541
564
  )
565
+ start_time = time.time()
542
566
  try:
543
567
  async with semaphore:
544
- return await call_provider(
568
+ response = await call_provider(
545
569
  provider,
546
570
  model,
547
571
  messages,
@@ -552,7 +576,60 @@ class MarkdownTranslator:
552
576
  client,
553
577
  max_tokens=max_tokens,
554
578
  )
579
+ elapsed_ms = int((time.time() - start_time) * 1000)
580
+ if request_log is not None:
581
+ request_log.append(
582
+ {
583
+ "stage": stage,
584
+ "group_index": group_index,
585
+ "attempt": attempts,
586
+ "provider": provider.name,
587
+ "model": model,
588
+ "messages": messages,
589
+ "response": response,
590
+ "elapsed_ms": elapsed_ms,
591
+ }
592
+ )
593
+ if dump_callback is not None:
594
+ dump_callback(DumpSnapshot(stage=stage, request_log=request_log))
595
+ if logger.isEnabledFor(logging.DEBUG):
596
+ logger.debug(
597
+ "Group translated: stage=%s group=%d attempt=%d chars=%d elapsed_ms=%d",
598
+ stage,
599
+ group_index,
600
+ attempts,
601
+ len(group_text),
602
+ elapsed_ms,
603
+ )
604
+ return response
555
605
  except ProviderError as exc:
606
+ elapsed_ms = int((time.time() - start_time) * 1000)
607
+ if request_log is not None:
608
+ request_log.append(
609
+ {
610
+ "stage": stage,
611
+ "group_index": group_index,
612
+ "attempt": attempts,
613
+ "provider": provider.name,
614
+ "model": model,
615
+ "messages": messages,
616
+ "error": str(exc),
617
+ "retryable": exc.retryable,
618
+ "elapsed_ms": elapsed_ms,
619
+ }
620
+ )
621
+ if dump_callback is not None:
622
+ dump_callback(DumpSnapshot(stage=stage, request_log=request_log))
623
+ if logger.isEnabledFor(logging.DEBUG):
624
+ logger.debug(
625
+ "Group failed: stage=%s group=%d attempt=%d retryable=%s elapsed_ms=%d error=%s",
626
+ stage,
627
+ group_index,
628
+ attempts,
629
+ exc.retryable,
630
+ elapsed_ms,
631
+ exc,
632
+ )
556
633
  if exc.retryable and attempts < max_retries:
557
634
  await asyncio.sleep(backoff_delay(1.0, attempts, 20.0))
558
635
  continue
@@ -580,6 +657,9 @@ class MarkdownTranslator:
580
657
  fallback_retry_times: int | None = None,
581
658
  fallback_retry_times_2: int | None = None,
582
659
  format_enabled: bool = True,
660
+ request_log: list[dict[str, Any]] | None = None,
661
+ dump_callback: Callable[[DumpSnapshot], None] | None = None,
662
+ group_concurrency: int = 1,
583
663
  ) -> TranslationResult:
584
664
  if fix_level != "off":
585
665
  text = fix_markdown(text, fix_level)
@@ -588,6 +668,15 @@ class MarkdownTranslator:
588
668
 
589
669
  store = PlaceHolderStore()
590
670
  protected = self.protector.protect(text, self.cfg, store)
671
+ if dump_callback is not None:
672
+ dump_callback(
673
+ DumpSnapshot(
674
+ stage="protected",
675
+ protected_text=protected,
676
+ placeholder_store=store,
677
+ request_log=request_log,
678
+ )
679
+ )
591
680
  segments, nodes = split_to_segments(protected, self.cfg.max_chunk_chars)
592
681
  total_nodes = len(nodes)
593
682
  if logger.isEnabledFor(logging.DEBUG):
@@ -607,31 +696,111 @@ class MarkdownTranslator:
607
696
  rotator = KeyRotator(resolve_api_keys(api_keys))
608
697
  max_retries = max(self.cfg.retry_times, 1)
609
698
 
610
- groups = self._group_nodes(nodes)
611
- initial_groups = len(groups)
612
- if logger.isEnabledFor(logging.DEBUG):
613
- logger.debug("Groups: %d", len(groups))
614
- if progress:
615
- await progress.add_groups(len(groups))
616
- outputs: list[str] = []
617
- for group in groups:
618
- api_key = await rotator.next_key()
619
- outputs.append(
620
- await self._translate_group(
621
- group,
622
- provider,
623
- model,
699
+ nodes_progress: dict[int, Node] | None = None
700
+ if dump_callback is not None:
701
+ nodes_progress = {
702
+ nid: Node(
703
+ nid=nid,
704
+ origin_text=node.origin_text,
705
+ translated_text=node.translated_text,
706
+ )
707
+ for nid, node in nodes.items()
708
+ }
709
+
710
+ async def run_groups(
711
+ groups: list[str],
712
+ rotator: KeyRotator,
713
+ stage: str,
714
+ max_tokens_value: int | None,
715
+ retry_limit_value: int,
716
+ provider_value: ProviderConfig,
717
+ model_value: str,
718
+ ) -> list[str]:
719
+ if not groups:
720
+ return []
721
+ outputs: list[str] = [""] * len(groups)
722
+
723
+ async def run_one(idx: int, group_text: str) -> tuple[int, str]:
724
+ api_key = await rotator.next_key()
725
+ response = await self._translate_group(
726
+ group_text,
727
+ provider_value,
728
+ model_value,
624
729
  client,
625
730
  api_key,
626
731
  timeout,
627
732
  semaphore,
628
733
  throttle,
629
- max_tokens,
630
- max_retries,
734
+ max_tokens_value,
735
+ retry_limit_value,
736
+ request_log,
737
+ stage,
738
+ idx,
739
+ dump_callback,
631
740
  )
632
- )
633
- if progress:
634
- await progress.advance_groups(1)
741
+ return idx, response
742
+
743
+ if group_concurrency <= 1:
744
+ for idx, group_text in enumerate(groups):
745
+ idx_out, response = await run_one(idx, group_text)
746
+ outputs[idx_out] = response
747
+ if nodes_progress is not None:
748
+ nodes_progress.update(self._ungroup_nodes(response, nodes))
749
+ dump_callback(
750
+ DumpSnapshot(
751
+ stage=stage,
752
+ nodes=nodes_progress,
753
+ request_log=request_log,
754
+ )
755
+ )
756
+ if progress:
757
+ await progress.advance_groups(1)
758
+ return outputs
759
+
760
+ guard = asyncio.Semaphore(group_concurrency)
761
+
762
+ async def guarded(idx: int, group_text: str) -> tuple[int, str]:
763
+ async with guard:
764
+ return await run_one(idx, group_text)
765
+
766
+ tasks = [asyncio.create_task(guarded(i, g)) for i, g in enumerate(groups)]
767
+ try:
768
+ for task in asyncio.as_completed(tasks):
769
+ idx_out, response = await task
770
+ outputs[idx_out] = response
771
+ if nodes_progress is not None:
772
+ nodes_progress.update(self._ungroup_nodes(response, nodes))
773
+ dump_callback(
774
+ DumpSnapshot(
775
+ stage=stage,
776
+ nodes=nodes_progress,
777
+ request_log=request_log,
778
+ )
779
+ )
780
+ if progress:
781
+ await progress.advance_groups(1)
782
+ except Exception:
783
+ for task in tasks:
784
+ task.cancel()
785
+ raise
786
+
787
+ return outputs
788
+
789
+ groups = self._group_nodes(nodes)
790
+ initial_groups = len(groups)
791
+ if logger.isEnabledFor(logging.DEBUG):
792
+ logger.debug("Groups: %d", len(groups))
793
+ if progress:
794
+ await progress.add_groups(len(groups))
795
+ outputs = await run_groups(
796
+ groups,
797
+ rotator,
798
+ "initial",
799
+ max_tokens,
800
+ max_retries,
801
+ provider,
802
+ model,
803
+ )
635
804
 
636
805
  translated_nodes = self._ungroup_groups(outputs, nodes)
637
806
  valid_placeholders = set(store.snapshot().values())
@@ -692,25 +861,15 @@ class MarkdownTranslator:
692
861
  )
693
862
  if progress:
694
863
  await progress.add_groups(len(retry_groups))
695
- retry_outputs: list[str] = []
696
- for group in retry_groups:
697
- api_key = await rotator.next_key()
698
- retry_outputs.append(
699
- await self._translate_group(
700
- group,
701
- provider,
702
- model,
703
- client,
704
- api_key,
705
- timeout,
706
- semaphore,
707
- throttle,
708
- max_tokens,
709
- retry_limit,
710
- )
711
- )
712
- if progress:
713
- await progress.advance_groups(1)
864
+ retry_outputs = await run_groups(
865
+ retry_groups,
866
+ rotator,
867
+ f"retry-{attempt}",
868
+ max_tokens,
869
+ retry_limit,
870
+ provider,
871
+ model,
872
+ )
714
873
  retry_nodes = self._ungroup_groups(
715
874
  retry_outputs, failed_nodes, fill_missing=False
716
875
  )
@@ -777,25 +936,15 @@ class MarkdownTranslator:
777
936
  )
778
937
  if progress:
779
938
  await progress.add_groups(len(retry_groups))
780
- retry_outputs: list[str] = []
781
- for group in retry_groups:
782
- api_key = await fallback_rotator.next_key()
783
- retry_outputs.append(
784
- await self._translate_group(
785
- group,
786
- fallback_provider,
787
- fallback_model,
788
- client,
789
- api_key,
790
- timeout,
791
- semaphore,
792
- throttle,
793
- fallback_max_tokens,
794
- fallback_retry_limit,
795
- )
796
- )
797
- if progress:
798
- await progress.advance_groups(1)
939
+ retry_outputs = await run_groups(
940
+ retry_groups,
941
+ fallback_rotator,
942
+ f"fallback-{attempt}",
943
+ fallback_max_tokens,
944
+ fallback_retry_limit,
945
+ fallback_provider,
946
+ fallback_model,
947
+ )
799
948
  retry_nodes = self._ungroup_groups(
800
949
  retry_outputs, failed_nodes, fill_missing=False
801
950
  )
@@ -862,25 +1011,15 @@ class MarkdownTranslator:
862
1011
  )
863
1012
  if progress:
864
1013
  await progress.add_groups(len(retry_groups))
865
- retry_outputs: list[str] = []
866
- for group in retry_groups:
867
- api_key = await fallback_rotator.next_key()
868
- retry_outputs.append(
869
- await self._translate_group(
870
- group,
871
- fallback_provider_2,
872
- fallback_model_2,
873
- client,
874
- api_key,
875
- timeout,
876
- semaphore,
877
- throttle,
878
- fallback_max_tokens_2,
879
- fallback_retry_limit,
880
- )
881
- )
882
- if progress:
883
- await progress.advance_groups(1)
1014
+ retry_outputs = await run_groups(
1015
+ retry_groups,
1016
+ fallback_rotator,
1017
+ f"fallback2-{attempt}",
1018
+ fallback_max_tokens_2,
1019
+ fallback_retry_limit,
1020
+ fallback_provider_2,
1021
+ fallback_model_2,
1022
+ )
884
1023
  retry_nodes = self._ungroup_groups(
885
1024
  retry_outputs, failed_nodes, fill_missing=False
886
1025
  )
@@ -448,4 +448,19 @@ def fix_markdown(text: str, level: str) -> str:
448
448
  if level == "aggressive":
449
449
  text = title_processor.fix_titles(text)
450
450
 
451
+ try:
452
+ from deepresearch_flow.paper.web.markdown import (
453
+ normalize_fenced_code_blocks,
454
+ normalize_footnote_definitions,
455
+ normalize_mermaid_blocks,
456
+ normalize_unbalanced_fences,
457
+ )
458
+ except Exception:
459
+ return text
460
+
461
+ text = normalize_fenced_code_blocks(text)
462
+ text = normalize_mermaid_blocks(text)
463
+ text = normalize_unbalanced_fences(text)
464
+ text = normalize_footnote_definitions(text)
465
+
451
466
  return text
@@ -28,7 +28,7 @@ You are a professional translation engine. Follow these invariant rules:
28
28
 
29
29
  TRANSLATE_XML_TEMPLATE = Template(
30
30
  dedent(
31
- """\
31
+ r"""\
32
32
  <TranslationTask version="1.0">
33
33
  <meta>
34
34
  <source_lang>$SOURCE_LANG</source_lang>
@@ -36,14 +36,23 @@ TRANSLATE_XML_TEMPLATE = Template(
36
36
  <visibility_note>Sections with visibility="internal" are instructions and MUST NOT appear in the final output.</visibility_note>
37
37
  </meta>
38
38
 
39
+ <task>
40
+ You are a professional $SOURCE_LANG_NAME ($SOURCE_LANG_CODE) to $TARGET_LANG_NAME ($TARGET_LANG_CODE) translator.
41
+ Your goal is to accurately convey the meaning and nuances of the original $SOURCE_LANG_NAME text while adhering to $TARGET_LANG_NAME grammar, vocabulary, and cultural sensitivities.
42
+ Produce only the $TARGET_LANG_NAME translation, without any additional explanations or commentary.
43
+ Please translate the following $SOURCE_LANG_NAME text into $TARGET_LANG_NAME:
44
+ Important: There are two blank lines before the text to translate.
45
+ </task>
46
+
39
47
  <constraints visibility="internal">
40
48
  <rule id="fmt-1">Preserve ALL original formatting exactly: Markdown, whitespace, line breaks, paragraph spacing.</rule>
41
- <rule id="fmt-2">Do NOT translate any content inside LaTeX ($$...$$, $$$$...$$$$, \\( ... \\), \\[ ... \\]) or LaTeX commands/environments.</rule>
49
+ <rule id="fmt-2">Do NOT translate any content inside LaTeX ($$...$$, $$$$...$$$$, \( ... \), \[ ... \]) or LaTeX commands/environments.</rule>
42
50
  <rule id="fmt-3">Keep ALL HTML tags intact.</rule>
43
51
  <rule id="fmt-4">Do NOT alter abbreviations, technical terms, or code identifiers; translate surrounding prose only.</rule>
44
52
  <rule id="fmt-5">Document structure must be preserved, including blank lines (double newlines) between blocks.</rule>
45
53
  </constraints>
46
54
 
55
+
47
56
  <markers visibility="internal">
48
57
  <preserve>
49
58
  <open>@@PRESERVE_{n}@@</open>
@@ -91,6 +100,8 @@ TRANSLATE_XML_TEMPLATE = Template(
91
100
  <io>
92
101
  <input>
93
102
  <![CDATA[
103
+
104
+
94
105
  $TEXT_TO_TRANSLATE
95
106
  ]]>
96
107
  </input>
@@ -105,9 +116,15 @@ $TEXT_TO_TRANSLATE
105
116
 
106
117
 
107
118
  def build_translation_messages(source_lang: str | None, target_lang: str, text: str) -> list[dict[str, str]]:
119
+ source_name = source_lang or "auto"
120
+ target_name = target_lang
108
121
  user_xml = TRANSLATE_XML_TEMPLATE.substitute(
109
122
  SOURCE_LANG=source_lang or "auto",
110
123
  TARGET_LANG=target_lang,
124
+ SOURCE_LANG_NAME=source_name,
125
+ SOURCE_LANG_CODE=source_name,
126
+ TARGET_LANG_NAME=target_name,
127
+ TARGET_LANG_CODE=target_name,
111
128
  TEXT_TO_TRANSLATE=_cdata_wrap(text),
112
129
  )
113
130
  return [