deepresearch-flow 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/cli.py +63 -0
- deepresearch_flow/paper/config.py +87 -12
- deepresearch_flow/paper/db.py +1041 -34
- deepresearch_flow/paper/db_ops.py +145 -26
- deepresearch_flow/paper/extract.py +1546 -152
- deepresearch_flow/paper/prompt_templates/deep_read_phi_system.j2 +8 -0
- deepresearch_flow/paper/prompt_templates/deep_read_phi_user.j2 +396 -0
- deepresearch_flow/paper/prompt_templates/deep_read_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/deep_read_user.j2 +272 -40
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_system.j2 +7 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_phi_user.j2 +135 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/eight_questions_user.j2 +4 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_system.j2 +8 -0
- deepresearch_flow/paper/prompt_templates/simple_phi_user.j2 +31 -0
- deepresearch_flow/paper/prompt_templates/simple_system.j2 +2 -0
- deepresearch_flow/paper/prompt_templates/simple_user.j2 +2 -0
- deepresearch_flow/paper/providers/azure_openai.py +45 -3
- deepresearch_flow/paper/providers/openai_compatible.py +45 -3
- deepresearch_flow/paper/schemas/deep_read_phi_schema.json +31 -0
- deepresearch_flow/paper/schemas/deep_read_schema.json +1 -0
- deepresearch_flow/paper/schemas/default_paper_schema.json +6 -0
- deepresearch_flow/paper/schemas/eight_questions_schema.json +1 -0
- deepresearch_flow/paper/snapshot/__init__.py +4 -0
- deepresearch_flow/paper/snapshot/api.py +941 -0
- deepresearch_flow/paper/snapshot/builder.py +965 -0
- deepresearch_flow/paper/snapshot/identity.py +239 -0
- deepresearch_flow/paper/snapshot/schema.py +245 -0
- deepresearch_flow/paper/snapshot/tests/__init__.py +2 -0
- deepresearch_flow/paper/snapshot/tests/test_identity.py +123 -0
- deepresearch_flow/paper/snapshot/text.py +154 -0
- deepresearch_flow/paper/template_registry.py +40 -0
- deepresearch_flow/paper/templates/deep_read.md.j2 +4 -0
- deepresearch_flow/paper/templates/deep_read_phi.md.j2 +44 -0
- deepresearch_flow/paper/templates/default_paper.md.j2 +4 -0
- deepresearch_flow/paper/templates/eight_questions.md.j2 +4 -0
- deepresearch_flow/paper/web/app.py +10 -3
- deepresearch_flow/paper/web/markdown.py +174 -8
- deepresearch_flow/paper/web/static/css/main.css +8 -1
- deepresearch_flow/paper/web/static/js/detail.js +46 -12
- deepresearch_flow/paper/web/templates/detail.html +9 -0
- deepresearch_flow/paper/web/text.py +8 -4
- deepresearch_flow/recognize/cli.py +380 -103
- deepresearch_flow/recognize/markdown.py +31 -7
- deepresearch_flow/recognize/math.py +47 -12
- deepresearch_flow/recognize/mermaid.py +320 -10
- deepresearch_flow/recognize/organize.py +35 -16
- deepresearch_flow/translator/cli.py +71 -20
- deepresearch_flow/translator/engine.py +220 -81
- deepresearch_flow/translator/fixers.py +15 -0
- deepresearch_flow/translator/prompts.py +19 -2
- deepresearch_flow/translator/protector.py +15 -3
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/METADATA +407 -33
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/RECORD +58 -42
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/WHEEL +1 -1
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.5.0.dist-info → deepresearch_flow-0.6.0.dist-info}/top_level.txt +0 -0
|
@@ -7,6 +7,7 @@ import json
|
|
|
7
7
|
import logging
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
import time
|
|
10
|
+
from typing import Any
|
|
10
11
|
|
|
11
12
|
import click
|
|
12
13
|
import coloredlogs
|
|
@@ -24,7 +25,7 @@ from deepresearch_flow.paper.utils import (
|
|
|
24
25
|
short_hash,
|
|
25
26
|
)
|
|
26
27
|
from deepresearch_flow.translator.config import TranslateConfig
|
|
27
|
-
from deepresearch_flow.translator.engine import MarkdownTranslator, RequestThrottle
|
|
28
|
+
from deepresearch_flow.translator.engine import DumpSnapshot, MarkdownTranslator, RequestThrottle
|
|
28
29
|
|
|
29
30
|
|
|
30
31
|
logger = logging.getLogger(__name__)
|
|
@@ -126,6 +127,14 @@ def translator() -> None:
|
|
|
126
127
|
@click.option("--fix-level", "fix_level", default="moderate", type=click.Choice(["off", "moderate", "aggressive"]))
|
|
127
128
|
@click.option("--max-chunk-chars", "max_chunk_chars", default=4000, show_default=True, type=int)
|
|
128
129
|
@click.option("--max-concurrency", "max_concurrency", default=4, show_default=True, type=int)
|
|
130
|
+
@click.option(
|
|
131
|
+
"--group-concurrency",
|
|
132
|
+
"group_concurrency",
|
|
133
|
+
default=1,
|
|
134
|
+
show_default=True,
|
|
135
|
+
type=int,
|
|
136
|
+
help="Concurrent translation groups per document",
|
|
137
|
+
)
|
|
129
138
|
@click.option("--timeout", "timeout", default=120.0, show_default=True, type=float)
|
|
130
139
|
@click.option("--retry-times", "retry_times", default=3, show_default=True, type=int)
|
|
131
140
|
@click.option("--fallback-model", "fallback_model_ref", default=None, help="Fallback provider/model")
|
|
@@ -155,6 +164,12 @@ def translator() -> None:
|
|
|
155
164
|
@click.option("--dump-protected", "dump_protected", is_flag=True, help="Write protected markdown")
|
|
156
165
|
@click.option("--dump-placeholders", "dump_placeholders", is_flag=True, help="Write placeholder mapping JSON")
|
|
157
166
|
@click.option("--dump-nodes", "dump_nodes", is_flag=True, help="Write per-node translation JSON")
|
|
167
|
+
@click.option(
|
|
168
|
+
"--dump-requests-log",
|
|
169
|
+
"dump_requests_log",
|
|
170
|
+
is_flag=True,
|
|
171
|
+
help="Write request/response attempts to JSON log",
|
|
172
|
+
)
|
|
158
173
|
@click.option("--no-format", "no_format", is_flag=True, help="Disable rumdl formatting")
|
|
159
174
|
@click.option("--dry-run", "dry_run", is_flag=True, help="Discover inputs without calling providers")
|
|
160
175
|
@click.option("--force", "force", is_flag=True, help="Overwrite existing outputs")
|
|
@@ -171,6 +186,7 @@ def translate(
|
|
|
171
186
|
fix_level: str,
|
|
172
187
|
max_chunk_chars: int,
|
|
173
188
|
max_concurrency: int,
|
|
189
|
+
group_concurrency: int,
|
|
174
190
|
timeout: float,
|
|
175
191
|
retry_times: int,
|
|
176
192
|
fallback_model_ref: str | None,
|
|
@@ -183,6 +199,7 @@ def translate(
|
|
|
183
199
|
dump_protected: bool,
|
|
184
200
|
dump_placeholders: bool,
|
|
185
201
|
dump_nodes: bool,
|
|
202
|
+
dump_requests_log: bool,
|
|
186
203
|
no_format: bool,
|
|
187
204
|
dry_run: bool,
|
|
188
205
|
force: bool,
|
|
@@ -240,6 +257,8 @@ def translate(
|
|
|
240
257
|
raise click.ClickException("--max-chunk-chars must be positive")
|
|
241
258
|
if max_concurrency <= 0:
|
|
242
259
|
raise click.ClickException("--max-concurrency must be positive")
|
|
260
|
+
if group_concurrency <= 0:
|
|
261
|
+
raise click.ClickException("--group-concurrency must be positive")
|
|
243
262
|
if timeout <= 0:
|
|
244
263
|
raise click.ClickException("--timeout must be positive")
|
|
245
264
|
if retry_times <= 0:
|
|
@@ -288,7 +307,9 @@ def translate(
|
|
|
288
307
|
output_root.mkdir(parents=True, exist_ok=True)
|
|
289
308
|
|
|
290
309
|
debug_root = Path(debug_dir) if debug_dir else None
|
|
291
|
-
if debug_root is None and (
|
|
310
|
+
if debug_root is None and (
|
|
311
|
+
dump_protected or dump_placeholders or dump_nodes or dump_requests_log
|
|
312
|
+
):
|
|
292
313
|
debug_root = output_root or Path.cwd()
|
|
293
314
|
if debug_root is not None:
|
|
294
315
|
debug_root.mkdir(parents=True, exist_ok=True)
|
|
@@ -360,6 +381,43 @@ def translate(
|
|
|
360
381
|
progress: ProgressTracker,
|
|
361
382
|
) -> None:
|
|
362
383
|
content = read_text(path)
|
|
384
|
+
request_log: list[dict[str, Any]] = []
|
|
385
|
+
debug_tag = None
|
|
386
|
+
protected_path = None
|
|
387
|
+
placeholders_path = None
|
|
388
|
+
nodes_path = None
|
|
389
|
+
requests_path = None
|
|
390
|
+
if debug_root is not None:
|
|
391
|
+
debug_tag = f"{path.stem}.{short_hash(str(path))}"
|
|
392
|
+
protected_path = debug_root / f"{debug_tag}.protected.md"
|
|
393
|
+
placeholders_path = debug_root / f"{debug_tag}.placeholders.json"
|
|
394
|
+
nodes_path = debug_root / f"{debug_tag}.nodes.json"
|
|
395
|
+
requests_path = debug_root / f"{debug_tag}.requests.json"
|
|
396
|
+
|
|
397
|
+
def write_dump(snapshot: DumpSnapshot) -> None:
|
|
398
|
+
if debug_root is None or debug_tag is None:
|
|
399
|
+
return
|
|
400
|
+
if dump_protected and snapshot.protected_text is not None and protected_path:
|
|
401
|
+
protected_path.write_text(snapshot.protected_text, encoding="utf-8")
|
|
402
|
+
if dump_placeholders and snapshot.placeholder_store is not None and placeholders_path:
|
|
403
|
+
snapshot.placeholder_store.save(str(placeholders_path))
|
|
404
|
+
if dump_nodes and snapshot.nodes is not None and nodes_path:
|
|
405
|
+
node_payload = {
|
|
406
|
+
str(node_id): {
|
|
407
|
+
"origin_text": node.origin_text,
|
|
408
|
+
"translated_text": node.translated_text,
|
|
409
|
+
}
|
|
410
|
+
for node_id, node in snapshot.nodes.items()
|
|
411
|
+
}
|
|
412
|
+
nodes_path.write_text(
|
|
413
|
+
json.dumps(node_payload, ensure_ascii=False, indent=2),
|
|
414
|
+
encoding="utf-8",
|
|
415
|
+
)
|
|
416
|
+
if dump_requests_log and snapshot.request_log is not None and requests_path:
|
|
417
|
+
requests_path.write_text(
|
|
418
|
+
json.dumps(snapshot.request_log, ensure_ascii=False, indent=2),
|
|
419
|
+
encoding="utf-8",
|
|
420
|
+
)
|
|
363
421
|
result = await translator.translate(
|
|
364
422
|
content,
|
|
365
423
|
provider,
|
|
@@ -381,6 +439,9 @@ def translate(
|
|
|
381
439
|
fallback_retry_times=fallback_retry_times,
|
|
382
440
|
fallback_retry_times_2=fallback_retry_times_2,
|
|
383
441
|
format_enabled=not no_format,
|
|
442
|
+
request_log=request_log if dump_requests_log else None,
|
|
443
|
+
dump_callback=write_dump if debug_root is not None else None,
|
|
444
|
+
group_concurrency=group_concurrency,
|
|
384
445
|
)
|
|
385
446
|
output_path = output_map[path]
|
|
386
447
|
output_path.write_text(result.translated_text, encoding="utf-8")
|
|
@@ -401,25 +462,15 @@ def translate(
|
|
|
401
462
|
)
|
|
402
463
|
|
|
403
464
|
if debug_root is not None:
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
result.
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
if dump_nodes:
|
|
412
|
-
node_payload = {
|
|
413
|
-
str(node_id): {
|
|
414
|
-
"origin_text": node.origin_text,
|
|
415
|
-
"translated_text": node.translated_text,
|
|
416
|
-
}
|
|
417
|
-
for node_id, node in result.nodes.items()
|
|
418
|
-
}
|
|
419
|
-
(debug_root / f"{debug_tag}.nodes.json").write_text(
|
|
420
|
-
json.dumps(node_payload, ensure_ascii=False, indent=2),
|
|
421
|
-
encoding="utf-8",
|
|
465
|
+
write_dump(
|
|
466
|
+
DumpSnapshot(
|
|
467
|
+
stage="final",
|
|
468
|
+
nodes=result.nodes,
|
|
469
|
+
protected_text=result.protected_text,
|
|
470
|
+
placeholder_store=result.placeholder_store,
|
|
471
|
+
request_log=request_log if dump_requests_log else None,
|
|
422
472
|
)
|
|
473
|
+
)
|
|
423
474
|
await progress.advance_docs(1)
|
|
424
475
|
|
|
425
476
|
async def run() -> None:
|
|
@@ -9,7 +9,8 @@ import logging
|
|
|
9
9
|
import re
|
|
10
10
|
import shutil
|
|
11
11
|
import subprocess
|
|
12
|
-
|
|
12
|
+
import time
|
|
13
|
+
from typing import Any, Callable, Optional, Protocol
|
|
13
14
|
|
|
14
15
|
import httpx
|
|
15
16
|
|
|
@@ -47,6 +48,15 @@ class TranslationResult:
|
|
|
47
48
|
stats: "TranslationStats"
|
|
48
49
|
|
|
49
50
|
|
|
51
|
+
@dataclass
|
|
52
|
+
class DumpSnapshot:
|
|
53
|
+
stage: str
|
|
54
|
+
nodes: dict[int, Node] | None = None
|
|
55
|
+
protected_text: str | None = None
|
|
56
|
+
placeholder_store: PlaceHolderStore | None = None
|
|
57
|
+
request_log: list[dict[str, Any]] | None = None
|
|
58
|
+
|
|
59
|
+
|
|
50
60
|
@dataclass
|
|
51
61
|
class TranslationStats:
|
|
52
62
|
total_nodes: int
|
|
@@ -116,7 +126,8 @@ class MarkdownTranslator:
|
|
|
116
126
|
)
|
|
117
127
|
|
|
118
128
|
self._rx_node_unpack = re.compile(
|
|
119
|
-
r"
|
|
129
|
+
r"(?:<|@@)NODE_START_(\d{4})(?:>|@@)(.*?)(?:</NODE_END_\1>|@@NODE_END_\1@@)",
|
|
130
|
+
re.DOTALL,
|
|
120
131
|
)
|
|
121
132
|
|
|
122
133
|
def _strip_untranslatables(self, s: str) -> str:
|
|
@@ -167,6 +178,10 @@ class MarkdownTranslator:
|
|
|
167
178
|
text = s.strip()
|
|
168
179
|
if not text:
|
|
169
180
|
return False
|
|
181
|
+
if "__PH_" in text:
|
|
182
|
+
core = self._strip_untranslatables(text)
|
|
183
|
+
if len(core) <= 2:
|
|
184
|
+
return True
|
|
170
185
|
if re.search(r"\b(?:isbn|issn|doi|arxiv)\b", text, flags=re.IGNORECASE):
|
|
171
186
|
return True
|
|
172
187
|
if re.search(r"\b(?:acm|ieee)\b", text, flags=re.IGNORECASE):
|
|
@@ -341,7 +356,10 @@ class MarkdownTranslator:
|
|
|
341
356
|
return text
|
|
342
357
|
if not self._rumdl_path:
|
|
343
358
|
if not self._rumdl_warned:
|
|
344
|
-
logger.warning(
|
|
359
|
+
logger.warning(
|
|
360
|
+
"rumdl not available; skip markdown formatting (stage=%s)",
|
|
361
|
+
stage,
|
|
362
|
+
)
|
|
345
363
|
self._rumdl_warned = True
|
|
346
364
|
return text
|
|
347
365
|
|
|
@@ -356,8 +374,9 @@ class MarkdownTranslator:
|
|
|
356
374
|
result = await asyncio.to_thread(run)
|
|
357
375
|
if result.returncode != 0:
|
|
358
376
|
logger.warning(
|
|
359
|
-
"rumdl fmt failed (
|
|
377
|
+
"rumdl fmt failed (stage=%s, rc=%s): %s",
|
|
360
378
|
stage,
|
|
379
|
+
result.returncode,
|
|
361
380
|
(result.stderr or "").strip() or "unknown error",
|
|
362
381
|
)
|
|
363
382
|
return text
|
|
@@ -530,6 +549,10 @@ class MarkdownTranslator:
|
|
|
530
549
|
throttle: RequestThrottle | None,
|
|
531
550
|
max_tokens: int | None,
|
|
532
551
|
max_retries: int,
|
|
552
|
+
request_log: list[dict[str, Any]] | None,
|
|
553
|
+
stage: str,
|
|
554
|
+
group_index: int,
|
|
555
|
+
dump_callback: Callable[[DumpSnapshot], None] | None,
|
|
533
556
|
) -> str:
|
|
534
557
|
attempts = 0
|
|
535
558
|
while True:
|
|
@@ -539,9 +562,10 @@ class MarkdownTranslator:
|
|
|
539
562
|
messages = build_translation_messages(
|
|
540
563
|
self.cfg.source_lang, self.cfg.target_lang, group_text
|
|
541
564
|
)
|
|
565
|
+
start_time = time.time()
|
|
542
566
|
try:
|
|
543
567
|
async with semaphore:
|
|
544
|
-
|
|
568
|
+
response = await call_provider(
|
|
545
569
|
provider,
|
|
546
570
|
model,
|
|
547
571
|
messages,
|
|
@@ -552,7 +576,60 @@ class MarkdownTranslator:
|
|
|
552
576
|
client,
|
|
553
577
|
max_tokens=max_tokens,
|
|
554
578
|
)
|
|
579
|
+
elapsed_ms = int((time.time() - start_time) * 1000)
|
|
580
|
+
if request_log is not None:
|
|
581
|
+
request_log.append(
|
|
582
|
+
{
|
|
583
|
+
"stage": stage,
|
|
584
|
+
"group_index": group_index,
|
|
585
|
+
"attempt": attempts,
|
|
586
|
+
"provider": provider.name,
|
|
587
|
+
"model": model,
|
|
588
|
+
"messages": messages,
|
|
589
|
+
"response": response,
|
|
590
|
+
"elapsed_ms": elapsed_ms,
|
|
591
|
+
}
|
|
592
|
+
)
|
|
593
|
+
if dump_callback is not None:
|
|
594
|
+
dump_callback(DumpSnapshot(stage=stage, request_log=request_log))
|
|
595
|
+
if logger.isEnabledFor(logging.DEBUG):
|
|
596
|
+
logger.debug(
|
|
597
|
+
"Group translated: stage=%s group=%d attempt=%d chars=%d elapsed_ms=%d",
|
|
598
|
+
stage,
|
|
599
|
+
group_index,
|
|
600
|
+
attempts,
|
|
601
|
+
len(group_text),
|
|
602
|
+
elapsed_ms,
|
|
603
|
+
)
|
|
604
|
+
return response
|
|
555
605
|
except ProviderError as exc:
|
|
606
|
+
elapsed_ms = int((time.time() - start_time) * 1000)
|
|
607
|
+
if request_log is not None:
|
|
608
|
+
request_log.append(
|
|
609
|
+
{
|
|
610
|
+
"stage": stage,
|
|
611
|
+
"group_index": group_index,
|
|
612
|
+
"attempt": attempts,
|
|
613
|
+
"provider": provider.name,
|
|
614
|
+
"model": model,
|
|
615
|
+
"messages": messages,
|
|
616
|
+
"error": str(exc),
|
|
617
|
+
"retryable": exc.retryable,
|
|
618
|
+
"elapsed_ms": elapsed_ms,
|
|
619
|
+
}
|
|
620
|
+
)
|
|
621
|
+
if dump_callback is not None:
|
|
622
|
+
dump_callback(DumpSnapshot(stage=stage, request_log=request_log))
|
|
623
|
+
if logger.isEnabledFor(logging.DEBUG):
|
|
624
|
+
logger.debug(
|
|
625
|
+
"Group failed: stage=%s group=%d attempt=%d retryable=%s elapsed_ms=%d error=%s",
|
|
626
|
+
stage,
|
|
627
|
+
group_index,
|
|
628
|
+
attempts,
|
|
629
|
+
exc.retryable,
|
|
630
|
+
elapsed_ms,
|
|
631
|
+
exc,
|
|
632
|
+
)
|
|
556
633
|
if exc.retryable and attempts < max_retries:
|
|
557
634
|
await asyncio.sleep(backoff_delay(1.0, attempts, 20.0))
|
|
558
635
|
continue
|
|
@@ -580,6 +657,9 @@ class MarkdownTranslator:
|
|
|
580
657
|
fallback_retry_times: int | None = None,
|
|
581
658
|
fallback_retry_times_2: int | None = None,
|
|
582
659
|
format_enabled: bool = True,
|
|
660
|
+
request_log: list[dict[str, Any]] | None = None,
|
|
661
|
+
dump_callback: Callable[[DumpSnapshot], None] | None = None,
|
|
662
|
+
group_concurrency: int = 1,
|
|
583
663
|
) -> TranslationResult:
|
|
584
664
|
if fix_level != "off":
|
|
585
665
|
text = fix_markdown(text, fix_level)
|
|
@@ -588,6 +668,15 @@ class MarkdownTranslator:
|
|
|
588
668
|
|
|
589
669
|
store = PlaceHolderStore()
|
|
590
670
|
protected = self.protector.protect(text, self.cfg, store)
|
|
671
|
+
if dump_callback is not None:
|
|
672
|
+
dump_callback(
|
|
673
|
+
DumpSnapshot(
|
|
674
|
+
stage="protected",
|
|
675
|
+
protected_text=protected,
|
|
676
|
+
placeholder_store=store,
|
|
677
|
+
request_log=request_log,
|
|
678
|
+
)
|
|
679
|
+
)
|
|
591
680
|
segments, nodes = split_to_segments(protected, self.cfg.max_chunk_chars)
|
|
592
681
|
total_nodes = len(nodes)
|
|
593
682
|
if logger.isEnabledFor(logging.DEBUG):
|
|
@@ -607,31 +696,111 @@ class MarkdownTranslator:
|
|
|
607
696
|
rotator = KeyRotator(resolve_api_keys(api_keys))
|
|
608
697
|
max_retries = max(self.cfg.retry_times, 1)
|
|
609
698
|
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
699
|
+
nodes_progress: dict[int, Node] | None = None
|
|
700
|
+
if dump_callback is not None:
|
|
701
|
+
nodes_progress = {
|
|
702
|
+
nid: Node(
|
|
703
|
+
nid=nid,
|
|
704
|
+
origin_text=node.origin_text,
|
|
705
|
+
translated_text=node.translated_text,
|
|
706
|
+
)
|
|
707
|
+
for nid, node in nodes.items()
|
|
708
|
+
}
|
|
709
|
+
|
|
710
|
+
async def run_groups(
|
|
711
|
+
groups: list[str],
|
|
712
|
+
rotator: KeyRotator,
|
|
713
|
+
stage: str,
|
|
714
|
+
max_tokens_value: int | None,
|
|
715
|
+
retry_limit_value: int,
|
|
716
|
+
provider_value: ProviderConfig,
|
|
717
|
+
model_value: str,
|
|
718
|
+
) -> list[str]:
|
|
719
|
+
if not groups:
|
|
720
|
+
return []
|
|
721
|
+
outputs: list[str] = [""] * len(groups)
|
|
722
|
+
|
|
723
|
+
async def run_one(idx: int, group_text: str) -> tuple[int, str]:
|
|
724
|
+
api_key = await rotator.next_key()
|
|
725
|
+
response = await self._translate_group(
|
|
726
|
+
group_text,
|
|
727
|
+
provider_value,
|
|
728
|
+
model_value,
|
|
624
729
|
client,
|
|
625
730
|
api_key,
|
|
626
731
|
timeout,
|
|
627
732
|
semaphore,
|
|
628
733
|
throttle,
|
|
629
|
-
|
|
630
|
-
|
|
734
|
+
max_tokens_value,
|
|
735
|
+
retry_limit_value,
|
|
736
|
+
request_log,
|
|
737
|
+
stage,
|
|
738
|
+
idx,
|
|
739
|
+
dump_callback,
|
|
631
740
|
)
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
741
|
+
return idx, response
|
|
742
|
+
|
|
743
|
+
if group_concurrency <= 1:
|
|
744
|
+
for idx, group_text in enumerate(groups):
|
|
745
|
+
idx_out, response = await run_one(idx, group_text)
|
|
746
|
+
outputs[idx_out] = response
|
|
747
|
+
if nodes_progress is not None:
|
|
748
|
+
nodes_progress.update(self._ungroup_nodes(response, nodes))
|
|
749
|
+
dump_callback(
|
|
750
|
+
DumpSnapshot(
|
|
751
|
+
stage=stage,
|
|
752
|
+
nodes=nodes_progress,
|
|
753
|
+
request_log=request_log,
|
|
754
|
+
)
|
|
755
|
+
)
|
|
756
|
+
if progress:
|
|
757
|
+
await progress.advance_groups(1)
|
|
758
|
+
return outputs
|
|
759
|
+
|
|
760
|
+
guard = asyncio.Semaphore(group_concurrency)
|
|
761
|
+
|
|
762
|
+
async def guarded(idx: int, group_text: str) -> tuple[int, str]:
|
|
763
|
+
async with guard:
|
|
764
|
+
return await run_one(idx, group_text)
|
|
765
|
+
|
|
766
|
+
tasks = [asyncio.create_task(guarded(i, g)) for i, g in enumerate(groups)]
|
|
767
|
+
try:
|
|
768
|
+
for task in asyncio.as_completed(tasks):
|
|
769
|
+
idx_out, response = await task
|
|
770
|
+
outputs[idx_out] = response
|
|
771
|
+
if nodes_progress is not None:
|
|
772
|
+
nodes_progress.update(self._ungroup_nodes(response, nodes))
|
|
773
|
+
dump_callback(
|
|
774
|
+
DumpSnapshot(
|
|
775
|
+
stage=stage,
|
|
776
|
+
nodes=nodes_progress,
|
|
777
|
+
request_log=request_log,
|
|
778
|
+
)
|
|
779
|
+
)
|
|
780
|
+
if progress:
|
|
781
|
+
await progress.advance_groups(1)
|
|
782
|
+
except Exception:
|
|
783
|
+
for task in tasks:
|
|
784
|
+
task.cancel()
|
|
785
|
+
raise
|
|
786
|
+
|
|
787
|
+
return outputs
|
|
788
|
+
|
|
789
|
+
groups = self._group_nodes(nodes)
|
|
790
|
+
initial_groups = len(groups)
|
|
791
|
+
if logger.isEnabledFor(logging.DEBUG):
|
|
792
|
+
logger.debug("Groups: %d", len(groups))
|
|
793
|
+
if progress:
|
|
794
|
+
await progress.add_groups(len(groups))
|
|
795
|
+
outputs = await run_groups(
|
|
796
|
+
groups,
|
|
797
|
+
rotator,
|
|
798
|
+
"initial",
|
|
799
|
+
max_tokens,
|
|
800
|
+
max_retries,
|
|
801
|
+
provider,
|
|
802
|
+
model,
|
|
803
|
+
)
|
|
635
804
|
|
|
636
805
|
translated_nodes = self._ungroup_groups(outputs, nodes)
|
|
637
806
|
valid_placeholders = set(store.snapshot().values())
|
|
@@ -692,25 +861,15 @@ class MarkdownTranslator:
|
|
|
692
861
|
)
|
|
693
862
|
if progress:
|
|
694
863
|
await progress.add_groups(len(retry_groups))
|
|
695
|
-
retry_outputs
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
api_key,
|
|
705
|
-
timeout,
|
|
706
|
-
semaphore,
|
|
707
|
-
throttle,
|
|
708
|
-
max_tokens,
|
|
709
|
-
retry_limit,
|
|
710
|
-
)
|
|
711
|
-
)
|
|
712
|
-
if progress:
|
|
713
|
-
await progress.advance_groups(1)
|
|
864
|
+
retry_outputs = await run_groups(
|
|
865
|
+
retry_groups,
|
|
866
|
+
rotator,
|
|
867
|
+
f"retry-{attempt}",
|
|
868
|
+
max_tokens,
|
|
869
|
+
retry_limit,
|
|
870
|
+
provider,
|
|
871
|
+
model,
|
|
872
|
+
)
|
|
714
873
|
retry_nodes = self._ungroup_groups(
|
|
715
874
|
retry_outputs, failed_nodes, fill_missing=False
|
|
716
875
|
)
|
|
@@ -777,25 +936,15 @@ class MarkdownTranslator:
|
|
|
777
936
|
)
|
|
778
937
|
if progress:
|
|
779
938
|
await progress.add_groups(len(retry_groups))
|
|
780
|
-
retry_outputs
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
api_key,
|
|
790
|
-
timeout,
|
|
791
|
-
semaphore,
|
|
792
|
-
throttle,
|
|
793
|
-
fallback_max_tokens,
|
|
794
|
-
fallback_retry_limit,
|
|
795
|
-
)
|
|
796
|
-
)
|
|
797
|
-
if progress:
|
|
798
|
-
await progress.advance_groups(1)
|
|
939
|
+
retry_outputs = await run_groups(
|
|
940
|
+
retry_groups,
|
|
941
|
+
fallback_rotator,
|
|
942
|
+
f"fallback-{attempt}",
|
|
943
|
+
fallback_max_tokens,
|
|
944
|
+
fallback_retry_limit,
|
|
945
|
+
fallback_provider,
|
|
946
|
+
fallback_model,
|
|
947
|
+
)
|
|
799
948
|
retry_nodes = self._ungroup_groups(
|
|
800
949
|
retry_outputs, failed_nodes, fill_missing=False
|
|
801
950
|
)
|
|
@@ -862,25 +1011,15 @@ class MarkdownTranslator:
|
|
|
862
1011
|
)
|
|
863
1012
|
if progress:
|
|
864
1013
|
await progress.add_groups(len(retry_groups))
|
|
865
|
-
retry_outputs
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
api_key,
|
|
875
|
-
timeout,
|
|
876
|
-
semaphore,
|
|
877
|
-
throttle,
|
|
878
|
-
fallback_max_tokens_2,
|
|
879
|
-
fallback_retry_limit,
|
|
880
|
-
)
|
|
881
|
-
)
|
|
882
|
-
if progress:
|
|
883
|
-
await progress.advance_groups(1)
|
|
1014
|
+
retry_outputs = await run_groups(
|
|
1015
|
+
retry_groups,
|
|
1016
|
+
fallback_rotator,
|
|
1017
|
+
f"fallback2-{attempt}",
|
|
1018
|
+
fallback_max_tokens_2,
|
|
1019
|
+
fallback_retry_limit,
|
|
1020
|
+
fallback_provider_2,
|
|
1021
|
+
fallback_model_2,
|
|
1022
|
+
)
|
|
884
1023
|
retry_nodes = self._ungroup_groups(
|
|
885
1024
|
retry_outputs, failed_nodes, fill_missing=False
|
|
886
1025
|
)
|
|
@@ -448,4 +448,19 @@ def fix_markdown(text: str, level: str) -> str:
|
|
|
448
448
|
if level == "aggressive":
|
|
449
449
|
text = title_processor.fix_titles(text)
|
|
450
450
|
|
|
451
|
+
try:
|
|
452
|
+
from deepresearch_flow.paper.web.markdown import (
|
|
453
|
+
normalize_fenced_code_blocks,
|
|
454
|
+
normalize_footnote_definitions,
|
|
455
|
+
normalize_mermaid_blocks,
|
|
456
|
+
normalize_unbalanced_fences,
|
|
457
|
+
)
|
|
458
|
+
except Exception:
|
|
459
|
+
return text
|
|
460
|
+
|
|
461
|
+
text = normalize_fenced_code_blocks(text)
|
|
462
|
+
text = normalize_mermaid_blocks(text)
|
|
463
|
+
text = normalize_unbalanced_fences(text)
|
|
464
|
+
text = normalize_footnote_definitions(text)
|
|
465
|
+
|
|
451
466
|
return text
|
|
@@ -28,7 +28,7 @@ You are a professional translation engine. Follow these invariant rules:
|
|
|
28
28
|
|
|
29
29
|
TRANSLATE_XML_TEMPLATE = Template(
|
|
30
30
|
dedent(
|
|
31
|
-
"""\
|
|
31
|
+
r"""\
|
|
32
32
|
<TranslationTask version="1.0">
|
|
33
33
|
<meta>
|
|
34
34
|
<source_lang>$SOURCE_LANG</source_lang>
|
|
@@ -36,14 +36,23 @@ TRANSLATE_XML_TEMPLATE = Template(
|
|
|
36
36
|
<visibility_note>Sections with visibility="internal" are instructions and MUST NOT appear in the final output.</visibility_note>
|
|
37
37
|
</meta>
|
|
38
38
|
|
|
39
|
+
<task>
|
|
40
|
+
You are a professional $SOURCE_LANG_NAME ($SOURCE_LANG_CODE) to $TARGET_LANG_NAME ($TARGET_LANG_CODE) translator.
|
|
41
|
+
Your goal is to accurately convey the meaning and nuances of the original $SOURCE_LANG_NAME text while adhering to $TARGET_LANG_NAME grammar, vocabulary, and cultural sensitivities.
|
|
42
|
+
Produce only the $TARGET_LANG_NAME translation, without any additional explanations or commentary.
|
|
43
|
+
Please translate the following $SOURCE_LANG_NAME text into $TARGET_LANG_NAME:
|
|
44
|
+
Important: There are two blank lines before the text to translate.
|
|
45
|
+
</task>
|
|
46
|
+
|
|
39
47
|
<constraints visibility="internal">
|
|
40
48
|
<rule id="fmt-1">Preserve ALL original formatting exactly: Markdown, whitespace, line breaks, paragraph spacing.</rule>
|
|
41
|
-
<rule id="fmt-2">Do NOT translate any content inside LaTeX ($$...$$, $$$$...$$$$,
|
|
49
|
+
<rule id="fmt-2">Do NOT translate any content inside LaTeX ($$...$$, $$$$...$$$$, \( ... \), \[ ... \]) or LaTeX commands/environments.</rule>
|
|
42
50
|
<rule id="fmt-3">Keep ALL HTML tags intact.</rule>
|
|
43
51
|
<rule id="fmt-4">Do NOT alter abbreviations, technical terms, or code identifiers; translate surrounding prose only.</rule>
|
|
44
52
|
<rule id="fmt-5">Document structure must be preserved, including blank lines (double newlines) between blocks.</rule>
|
|
45
53
|
</constraints>
|
|
46
54
|
|
|
55
|
+
|
|
47
56
|
<markers visibility="internal">
|
|
48
57
|
<preserve>
|
|
49
58
|
<open>@@PRESERVE_{n}@@</open>
|
|
@@ -91,6 +100,8 @@ TRANSLATE_XML_TEMPLATE = Template(
|
|
|
91
100
|
<io>
|
|
92
101
|
<input>
|
|
93
102
|
<![CDATA[
|
|
103
|
+
|
|
104
|
+
|
|
94
105
|
$TEXT_TO_TRANSLATE
|
|
95
106
|
]]>
|
|
96
107
|
</input>
|
|
@@ -105,9 +116,15 @@ $TEXT_TO_TRANSLATE
|
|
|
105
116
|
|
|
106
117
|
|
|
107
118
|
def build_translation_messages(source_lang: str | None, target_lang: str, text: str) -> list[dict[str, str]]:
|
|
119
|
+
source_name = source_lang or "auto"
|
|
120
|
+
target_name = target_lang
|
|
108
121
|
user_xml = TRANSLATE_XML_TEMPLATE.substitute(
|
|
109
122
|
SOURCE_LANG=source_lang or "auto",
|
|
110
123
|
TARGET_LANG=target_lang,
|
|
124
|
+
SOURCE_LANG_NAME=source_name,
|
|
125
|
+
SOURCE_LANG_CODE=source_name,
|
|
126
|
+
TARGET_LANG_NAME=target_name,
|
|
127
|
+
TARGET_LANG_CODE=target_name,
|
|
111
128
|
TEXT_TO_TRANSLATE=_cdata_wrap(text),
|
|
112
129
|
)
|
|
113
130
|
return [
|