deepresearch-flow 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
@@ -0,0 +1,451 @@
+ """CLI commands for markdown translation."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import logging
+ import time
+ from pathlib import Path
+
+ import click
+ import coloredlogs
+ import httpx
+ from tqdm import tqdm
+ from rich.console import Console
+ from rich.table import Table
+
+ from deepresearch_flow.paper.config import ProviderConfig, load_config, resolve_api_keys
+ from deepresearch_flow.paper.extract import parse_model_ref
+ from deepresearch_flow.paper.utils import (
+     discover_markdown,
+     estimate_tokens,
+     read_text,
+     short_hash,
+ )
+ from deepresearch_flow.translator.config import TranslateConfig
+ from deepresearch_flow.translator.engine import MarkdownTranslator, RequestThrottle
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def configure_logging(verbose: bool) -> None:
+     level = "DEBUG" if verbose else "INFO"
+     coloredlogs.install(level=level, fmt="%(asctime)s %(levelname)s %(message)s")
+
+
+ def _format_duration(seconds: float) -> str:
+     if seconds < 60:
+         return f"{seconds:.2f}s"
+     minutes, remainder = divmod(seconds, 60)
+     if minutes < 60:
+         return f"{int(minutes)}m {remainder:.1f}s"
+     hours, minutes = divmod(minutes, 60)
+     return f"{int(hours)}h {int(minutes)}m {remainder:.1f}s"
+
+
+ def _language_suffix(target_lang: str) -> str:
+     lang = (target_lang or "").lower()
+     if lang.startswith("zh"):
+         return "zh"
+     if lang.startswith(("ja", "jp")):
+         return "ja"
+     return lang or "out"
+
+
+ def _unique_output_name(path: Path, suffix: str, used: set[str]) -> str:
+     base = path.stem
+     filename = f"{base}.{suffix}.md"
+     if filename not in used:
+         used.add(filename)
+         return filename
+     suffix_hash = short_hash(str(path))
+     filename = f"{base}.{suffix}.{suffix_hash}.md"
+     used.add(filename)
+     return filename
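+
+
+ # Progress UI: one tqdm bar tracks documents, a second tracks translation
+ # groups. All updates happen under an asyncio.Lock so concurrent tasks
+ # cannot interleave bar state.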
+ class ProgressTracker:
+     def __init__(self, doc_total: int) -> None:
+         self.doc_bar = tqdm(total=doc_total, desc="documents", unit="doc", position=0)
+         self.group_bar = tqdm(total=0, desc="groups", unit="group", position=1, leave=False)
+         self.lock = asyncio.Lock()
+
+     async def add_groups(self, count: int) -> None:
+         if count <= 0:
+             return
+         async with self.lock:
+             self.group_bar.total = (self.group_bar.total or 0) + count
+             self.group_bar.refresh()
+
+     async def advance_groups(self, count: int) -> None:
+         if count <= 0:
+             return
+         async with self.lock:
+             self.group_bar.update(count)
+
+     async def advance_docs(self, count: int = 1) -> None:
+         if count <= 0:
+             return
+         async with self.lock:
+             self.doc_bar.update(count)
+
+     async def set_group_status(self, text: str) -> None:
+         async with self.lock:
+             self.group_bar.set_postfix_str(text)
+             self.group_bar.refresh()
+
+     async def close(self) -> None:
+         async with self.lock:
+             self.group_bar.close()
+             self.doc_bar.close()
+
+
+ @click.group()
+ def translator() -> None:
+     """Translation workflows for OCR markdown."""
+
+
+ @translator.command()
+ @click.option("-c", "--config", "config_path", default="config.toml", help="Path to config.toml")
+ @click.option(
+     "-i",
+     "--input",
+     "inputs",
+     multiple=True,
+     required=True,
+     help="Input markdown file or directory (repeatable)",
+ )
+ @click.option("--count", "count_limit", default=None, type=int, help="Translate up to N files")
+ @click.option("-g", "--glob", "glob_pattern", default=None, help="Glob filter when input is a directory")
+ @click.option("-m", "--model", "model_ref", required=True, help="provider/model")
+ @click.option("--source-lang", "source_lang", default=None, help="Source language hint")
+ @click.option("--target-lang", "target_lang", default="zh", show_default=True, help="Target language")
+ @click.option("--output-dir", "output_dir", default=None, help="Directory for translated markdown outputs")
+ @click.option("--fix-level", "fix_level", default="moderate", type=click.Choice(["off", "moderate", "aggressive"]))
+ @click.option("--max-chunk-chars", "max_chunk_chars", default=4000, show_default=True, type=int)
+ @click.option("--max-concurrency", "max_concurrency", default=4, show_default=True, type=int)
+ @click.option("--timeout", "timeout", default=120.0, show_default=True, type=float)
+ @click.option("--retry-times", "retry_times", default=3, show_default=True, type=int)
+ @click.option("--fallback-model", "fallback_model_ref", default=None, help="Fallback provider/model")
+ @click.option(
+     "--fallback-model-2",
+     "fallback_model_ref_2",
+     default=None,
+     help="Second fallback provider/model",
+ )
+ @click.option(
+     "--fallback-retry-times",
+     "fallback_retry_times",
+     default=None,
+     type=int,
+     help="Retry rounds for fallback model",
+ )
+ @click.option(
+     "--fallback-retry-times-2",
+     "fallback_retry_times_2",
+     default=None,
+     type=int,
+     help="Retry rounds for second fallback model",
+ )
+ @click.option("--sleep-every", "sleep_every", default=None, type=int, help="Sleep after every N requests")
+ @click.option("--sleep-time", "sleep_time", default=None, type=float, help="Sleep duration in seconds")
+ @click.option("--debug-dir", "debug_dir", default=None, help="Directory for debug outputs")
+ @click.option("--dump-protected", "dump_protected", is_flag=True, help="Write protected markdown")
+ @click.option("--dump-placeholders", "dump_placeholders", is_flag=True, help="Write placeholder mapping JSON")
+ @click.option("--dump-nodes", "dump_nodes", is_flag=True, help="Write per-node translation JSON")
+ @click.option("--no-format", "no_format", is_flag=True, help="Disable rumdl formatting")
+ @click.option("--dry-run", "dry_run", is_flag=True, help="Discover inputs without calling providers")
+ @click.option("--force", "force", is_flag=True, help="Overwrite existing outputs")
+ @click.option("-v", "--verbose", is_flag=True, help="Enable verbose logging")
+ def translate(
+     config_path: str,
+     inputs: tuple[str, ...],
+     count_limit: int | None,
+     glob_pattern: str | None,
+     model_ref: str,
+     source_lang: str | None,
+     target_lang: str,
+     output_dir: str | None,
+     fix_level: str,
+     max_chunk_chars: int,
+     max_concurrency: int,
+     timeout: float,
+     retry_times: int,
+     fallback_model_ref: str | None,
+     fallback_model_ref_2: str | None,
+     fallback_retry_times: int | None,
+     fallback_retry_times_2: int | None,
+     sleep_every: int | None,
+     sleep_time: float | None,
+     debug_dir: str | None,
+     dump_protected: bool,
+     dump_placeholders: bool,
+     dump_nodes: bool,
+     no_format: bool,
+     dry_run: bool,
+     force: bool,
+     verbose: bool,
+ ) -> None:
+     """Translate OCR markdown while preserving structure."""
+     configure_logging(verbose)
+     config = load_config(config_path)
+     provider, model_name = parse_model_ref(model_ref, config.providers)
+     if provider.type in {
+         "openai_compatible",
+         "dashscope",
+         "gemini_ai_studio",
+         "azure_openai",
+         "claude",
+     }:
+         if not resolve_api_keys(provider.api_keys):
+             raise click.ClickException(f"{provider.type} providers require api_keys")
+     fallback_provider: ProviderConfig | None = None
+     fallback_model_name: str | None = None
+     if fallback_model_ref:
+         fallback_provider, fallback_model_name = parse_model_ref(
+             fallback_model_ref, config.providers
+         )
+         if fallback_provider.type in {
+             "openai_compatible",
+             "dashscope",
+             "gemini_ai_studio",
+             "azure_openai",
+             "claude",
+         }:
+             if not resolve_api_keys(fallback_provider.api_keys):
+                 raise click.ClickException(
+                     f"{fallback_provider.type} fallback providers require api_keys"
+                 )
+     fallback_provider_2: ProviderConfig | None = None
+     fallback_model_name_2: str | None = None
+     if fallback_model_ref_2:
+         fallback_provider_2, fallback_model_name_2 = parse_model_ref(
+             fallback_model_ref_2, config.providers
+         )
+         if fallback_provider_2.type in {
+             "openai_compatible",
+             "dashscope",
+             "gemini_ai_studio",
+             "azure_openai",
+             "claude",
+         }:
+             if not resolve_api_keys(fallback_provider_2.api_keys):
+                 raise click.ClickException(
+                     f"{fallback_provider_2.type} fallback providers require api_keys"
+                 )
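+
+     # Reject non-positive numeric options up front, before any files are read.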
+     if max_chunk_chars <= 0:
+         raise click.ClickException("--max-chunk-chars must be positive")
+     if max_concurrency <= 0:
+         raise click.ClickException("--max-concurrency must be positive")
+     if timeout <= 0:
+         raise click.ClickException("--timeout must be positive")
+     if retry_times <= 0:
+         raise click.ClickException("--retry-times must be positive")
+     if count_limit is not None and count_limit <= 0:
+         raise click.ClickException("--count must be positive")
+     if fallback_retry_times is not None and fallback_retry_times <= 0:
+         raise click.ClickException("--fallback-retry-times must be positive")
+     if fallback_retry_times_2 is not None and fallback_retry_times_2 <= 0:
+         raise click.ClickException("--fallback-retry-times-2 must be positive")
+     if (sleep_every is None) != (sleep_time is None):
+         raise click.ClickException("Both --sleep-every and --sleep-time are required")
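+
+     # Discover inputs. In dry-run mode --count is applied to the raw file list;
+     # in a real run it is applied later, after existing outputs are skipped.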
+     markdown_files = discover_markdown(inputs, glob_pattern)
+     if not markdown_files:
+         raise click.ClickException("No markdown files discovered")
+     if count_limit is not None and dry_run:
+         markdown_files = markdown_files[:count_limit]
+
+     start_time = time.monotonic()
+     input_chars = 0
+     for path in markdown_files:
+         input_chars += len(read_text(path))
266
+
267
+ if dry_run:
268
+ duration = time.monotonic() - start_time
269
+ table = Table(
270
+ title="translator translate summary (dry-run)",
271
+ header_style="bold cyan",
272
+ title_style="bold magenta",
273
+ )
274
+ table.add_column("Metric", style="cyan", no_wrap=True)
275
+ table.add_column("Value", style="white", overflow="fold")
276
+ table.add_row("Documents", str(len(markdown_files)))
277
+ if count_limit is not None:
278
+ table.add_row("Limit", str(count_limit))
279
+ table.add_row("Duration", _format_duration(duration))
280
+ table.add_row("Input chars", str(input_chars))
281
+ table.add_row("Est tokens", str(estimate_tokens(input_chars)))
282
+ Console().print(table)
283
+ return
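+
+     # Outputs land next to each source file as <stem>.<lang>.md unless
+     # --output-dir is set, in which case name collisions get a short hash.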
+     suffix = _language_suffix(target_lang)
+     output_root = Path(output_dir) if output_dir else None
+     if output_root is not None:
+         output_root.mkdir(parents=True, exist_ok=True)
+
+     debug_root = Path(debug_dir) if debug_dir else None
+     if debug_root is None and (dump_protected or dump_placeholders or dump_nodes):
+         debug_root = output_root or Path.cwd()
+     if debug_root is not None:
+         debug_root.mkdir(parents=True, exist_ok=True)
+
+     used_names: set[str] = set()
+     output_map: dict[Path, Path] = {}
+     for path in markdown_files:
+         if output_root is None:
+             output_map[path] = path.with_name(f"{path.stem}.{suffix}.md")
+         else:
+             output_name = _unique_output_name(path, suffix, used_names)
+             output_map[path] = output_root / output_name
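+
+     # Skip documents whose output already exists unless --force is given,
+     # then cap the remaining work with --count.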
304
+
305
+ to_process: list[Path] = []
306
+ skipped = 0
307
+ for path in markdown_files:
308
+ output_path = output_map[path]
309
+ if output_path.exists() and not force:
310
+ skipped += 1
311
+ logger.info("Skip existing output: %s", output_path)
312
+ continue
313
+ to_process.append(path)
314
+ if count_limit is not None:
315
+ to_process = to_process[:count_limit]
316
+
317
+ if not to_process:
318
+ table = Table(
319
+ title="translator translate summary",
320
+ header_style="bold cyan",
321
+ title_style="bold magenta",
322
+ )
323
+ table.add_column("Metric", style="cyan", no_wrap=True)
324
+ table.add_column("Value", style="white", overflow="fold")
325
+ table.add_row("Documents", str(len(markdown_files)))
326
+ table.add_row("Skipped", str(skipped))
327
+ table.add_row("Processed", "0")
328
+ if count_limit is not None:
329
+ table.add_row("Limit", str(count_limit))
330
+ Console().print(table)
331
+ return
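+
+     # Build the engine; the semaphore bounds in-flight requests and the
+     # optional throttle sleeps after every N of them.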
+     cfg = TranslateConfig(
+         source_lang=source_lang,
+         target_lang=target_lang,
+         max_chunk_chars=max_chunk_chars,
+         retry_times=retry_times,
+     )
+     translator = MarkdownTranslator(cfg)
+     semaphore = asyncio.Semaphore(max_concurrency)
+
+     throttle = None
+     if sleep_every is not None or sleep_time is not None:
+         if not sleep_every or not sleep_time:
+             raise click.ClickException("--sleep-every and --sleep-time must be set together")
+         throttle = RequestThrottle(int(sleep_every), float(sleep_time))
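+
+     # Only Claude providers carry an explicit max_tokens budget here; all
+     # other provider types pass None.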
+     max_tokens = provider.max_tokens if provider.type == "claude" else None
+     fallback_max_tokens = (
+         fallback_provider.max_tokens
+         if fallback_provider and fallback_provider.type == "claude"
+         else None
+     )
+     fallback_max_tokens_2 = (
+         fallback_provider_2.max_tokens
+         if fallback_provider_2 and fallback_provider_2.type == "claude"
+         else None
+     )
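+
+     # Per-document pipeline: translate, write the result, log node/group
+     # stats, and optionally dump debug artifacts.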
+     async def process_one(
+         path: Path,
+         client: httpx.AsyncClient,
+         progress: ProgressTracker,
+     ) -> None:
+         content = read_text(path)
+         result = await translator.translate(
+             content,
+             provider,
+             model_name,
+             client,
+             provider.api_keys,
+             timeout,
+             semaphore,
+             throttle,
+             max_tokens,
+             fix_level,
+             progress=progress,
+             fallback_provider=fallback_provider,
+             fallback_model=fallback_model_name,
+             fallback_max_tokens=fallback_max_tokens,
+             fallback_provider_2=fallback_provider_2,
+             fallback_model_2=fallback_model_name_2,
+             fallback_max_tokens_2=fallback_max_tokens_2,
+             fallback_retry_times=fallback_retry_times,
+             fallback_retry_times_2=fallback_retry_times_2,
+             format_enabled=not no_format,
+         )
+         output_path = output_map[path]
+         output_path.write_text(result.translated_text, encoding="utf-8")
+         stats = result.stats
+         logger.info(
+             "Translated %s | nodes=%d ok=%d fail=%d skip=%d groups=%d retries=%d",
+             path.name,
+             stats.total_nodes,
+             stats.success_nodes,
+             stats.failed_nodes,
+             stats.skipped_nodes,
+             stats.initial_groups,
+             stats.retry_groups,
+         )
+         await progress.set_group_status(
+             f"nodes {stats.total_nodes} ok {stats.success_nodes} "
+             f"fail {stats.failed_nodes} skip {stats.skipped_nodes}"
+         )
+
+         if debug_root is not None:
+             debug_tag = f"{path.stem}.{short_hash(str(path))}"
+             if dump_protected:
+                 (debug_root / f"{debug_tag}.protected.md").write_text(
+                     result.protected_text, encoding="utf-8"
+                 )
+             if dump_placeholders:
+                 result.placeholder_store.save(str(debug_root / f"{debug_tag}.placeholders.json"))
+             if dump_nodes:
+                 node_payload = {
+                     str(node_id): {
+                         "origin_text": node.origin_text,
+                         "translated_text": node.translated_text,
+                     }
+                     for node_id, node in result.nodes.items()
+                 }
+                 (debug_root / f"{debug_tag}.nodes.json").write_text(
+                     json.dumps(node_payload, ensure_ascii=False, indent=2),
+                     encoding="utf-8",
+                 )
+         await progress.advance_docs(1)
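+
+     # Documents are processed one at a time; concurrency lives inside the
+     # engine, bounded by the shared semaphore.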
+     async def run() -> None:
+         progress = ProgressTracker(len(to_process))
+         try:
+             async with httpx.AsyncClient() as client:
+                 for path in to_process:
+                     await process_one(path, client, progress)
+         finally:
+             await progress.close()
+
+     asyncio.run(run())
+
+     duration = time.monotonic() - start_time
+     table = Table(
+         title="translator translate summary",
+         header_style="bold cyan",
+         title_style="bold magenta",
+     )
+     table.add_column("Metric", style="cyan", no_wrap=True)
+     table.add_column("Value", style="white", overflow="fold")
+     table.add_row("Documents", str(len(markdown_files)))
+     table.add_row("Skipped", str(skipped))
+     table.add_row("Processed", str(len(to_process)))
+     if count_limit is not None:
+         table.add_row("Limit", str(count_limit))
+     table.add_row("Duration", _format_duration(duration))
+     table.add_row("Output suffix", f".{suffix}.md")
+     Console().print(table)
@@ -0,0 +1,19 @@
+ """Translator configuration helpers."""
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class TranslateConfig:
+     source_lang: str | None = None
+     target_lang: str = "zh"
+     max_chunk_chars: int = 4000
+     translate_tables: bool = False
+     translate_links_text: bool = False
+     translate_image_alt: bool = False
+     strict_placeholder_check: bool = True
+     retry_failed_nodes: bool = True
+     retry_times: int = 3
+     retry_group_max_chars: int | None = None