qwen-mt-cli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
qmt/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """Qwen-MT CLI - Command-line translation tool powered by Qwen-MT."""
2
+
3
+ __version__ = "0.2.0"
qmt/batch.py ADDED
@@ -0,0 +1,538 @@
1
+ """Batch translation for CSV and Excel files."""
2
+
3
+ import csv
4
+ import io
5
+ import json
6
+ import time
7
+ from pathlib import Path
8
+
9
+ import pandas as pd
10
+
11
+ from qmt.client import QwenMTClient
12
+ from qmt.constants import DEFAULT_TOP_K, SMART_MATCH_THRESHOLD
13
+ from qmt.exceptions import APIError
14
+ from qmt.formatters import (
15
+ create_batch_progress,
16
+ print_error,
17
+ print_info,
18
+ print_rate_limit_wait,
19
+ print_warning,
20
+ )
21
+ from qmt.models import BatchResult, TranslationRequest
22
+ from qmt.parsers import read_csv_raw, read_file_with_encoding_fallback
23
+
24
+ MAX_RETRIES = 5
25
+ INITIAL_BACKOFF = 2 # seconds
26
+
27
+
28
+ def _translate_with_retry(
29
+ client: QwenMTClient,
30
+ request: TranslationRequest,
31
+ max_retries: int = MAX_RETRIES,
32
+ ) -> str:
33
+ """Translate with exponential backoff retry for rate limiting."""
34
+ for attempt in range(max_retries + 1):
35
+ try:
36
+ return client.translate(request)
37
+ except APIError as e:
38
+ err_msg = str(e).lower()
39
+ is_rate_limit = "rate" in err_msg or "429" in err_msg or "throttl" in err_msg
40
+ if is_rate_limit and attempt < max_retries:
41
+ wait = INITIAL_BACKOFF * (2**attempt)
42
+ print_rate_limit_wait(wait, attempt + 1, max_retries)
43
+ time.sleep(wait)
44
+ else:
45
+ raise
46
+ raise APIError("重试次数已用尽")
47
+
48
+
49
def _filter_terms_memory(
    source_text: str,
    store_terms: list,
    extra_terms: list,
    store_memory: list,
    extra_memory: list,
    api_key: str,
    top_k: int,
    threshold: int,
    query_embedding: list[float] | None = None,
    verbose: bool = False,
) -> tuple[list, list]:
    """Build the term list and memory list to send along with one source text.

    Both results start with a copy of the caller's extras. The store entries
    are then narrowed through the matcher's selection helpers; if a helper
    raises, the complete store list is appended instead, so a matching
    failure never blocks the translation itself.

    Returns:
        A ``(terms, memory)`` tuple of new lists; the input lists are not
        modified.
    """
    from qmt.matcher import select_relevant_memory, select_relevant_terms

    # Keyword arguments that both selection helpers receive.
    shared_kwargs = {
        "query_text": source_text,
        "api_key": api_key,
        "top_k": top_k,
        "threshold": threshold,
        "query_embedding": query_embedding,
        "verbose": verbose,
    }

    terms_out = extra_terms[:]
    try:
        relevant_terms = select_relevant_terms(store_terms=store_terms, **shared_kwargs)
        terms_out.extend(relevant_terms)
    except Exception:
        # Best effort: fall back to sending every stored term.
        terms_out.extend(store_terms)

    memory_out = extra_memory[:]
    try:
        relevant_memory = select_relevant_memory(store_memory=store_memory, **shared_kwargs)
        memory_out.extend(relevant_memory)
    except Exception:
        # Best effort: fall back to sending every stored memory entry.
        memory_out.extend(store_memory)

    return terms_out, memory_out
95
+
96
+
97
+ def _needs_smart_match(store_terms: list, store_memory: list, threshold: int) -> bool:
98
+ """Check if semantic matching should be attempted."""
99
+ return len(store_terms) > threshold or len(store_memory) > threshold
100
+
101
+
102
+ def _pre_embed_batch(
103
+ source_texts: list[str],
104
+ store_terms: list,
105
+ store_memory: list,
106
+ api_key: str,
107
+ threshold: int,
108
+ verbose: bool,
109
+ ) -> dict[str, list[float]]:
110
+ """Pre-embed all source texts for batch efficiency. Returns text->embedding dict."""
111
+ if not _needs_smart_match(store_terms, store_memory, threshold):
112
+ return {}
113
+ try:
114
+ from qmt.matcher import batch_embed_queries
115
+
116
+ if verbose:
117
+ print_info("正在预计算源文本向量嵌入...")
118
+ return batch_embed_queries(api_key, source_texts)
119
+ except Exception:
120
+ if verbose:
121
+ print_warning("批量嵌入失败,将逐行回退到全量匹配")
122
+ return {}
123
+
124
+
125
+ # ─── CSV Batch ──────────────────────────────────────────────────────────────
126
+
127
+
128
def translate_csv(
    client: QwenMTClient,
    input_path: Path,
    output_path: Path,
    source_lang: str,
    target_lang: str,
    model: str,
    domain: str | None = None,
    store_terms: list | None = None,
    extra_terms: list | None = None,
    store_memory: list | None = None,
    extra_memory: list | None = None,
    has_header: bool = True,
    resume: bool = False,
    api_key: str = "",
    top_k: int = DEFAULT_TOP_K,
    threshold: int = SMART_MATCH_THRESHOLD,
    learn: bool = False,
    verbose: bool = False,
) -> BatchResult:
    """Translate the first column of a CSV file and append a translation column.

    Each output row is the input row plus one extra cell, written and flushed
    immediately so an interrupted run can be continued with ``resume=True``.
    Rows whose first cell is blank get an empty translation; rows whose
    translation fails get an ``[ERROR: ...]`` cell and are counted as failed.

    Args:
        client: Translation client passed to ``_translate_with_retry``.
        input_path: CSV file to read.
        output_path: CSV file to write (appended to when resuming).
        source_lang: Source language for every request.
        target_lang: Target language for every request.
        model: Model name for every request.
        domain: Optional domain hint for every request.
        store_terms: Stored terms, filtered per row.
        extra_terms: Terms always included in every request.
        store_memory: Stored memory entries, filtered per row.
        extra_memory: Memory entries always included in every request.
        has_header: Treat the first row as a header and add a ``translation``
            header cell to it.
        resume: Skip as many data rows as the existing output already holds.
        api_key: Key forwarded to the matching and learning helpers.
        top_k: Forwarded to per-row filtering.
        threshold: Forwarded to pre-embedding and per-row filtering.
        learn: Collect successful pairs and hand them to ``batch_learn_memory``.
        verbose: Print resume, failure and learning messages.

    Returns:
        A ``BatchResult`` with the number of data rows and the
        succeeded / failed / skipped counters.
    """
    store_terms = store_terms or []
    extra_terms = extra_terms or []
    store_memory = store_memory or []
    extra_memory = extra_memory or []

    rows, delimiter = read_csv_raw(input_path)
    total_rows = len(rows)

    data_start = 1 if has_header else 0
    translatable = total_rows - data_start

    if translatable <= 0:
        print_warning("文件中没有可翻译的数据行")
        return BatchResult(total=0, succeeded=0, failed=0, skipped=0, output_path=output_path)

    # Resume: count the data rows already present in the output file.
    completed = 0
    if resume and output_path.exists():
        try:
            existing = read_file_with_encoding_fallback(output_path)
            existing_rows = list(csv.reader(io.StringIO(existing), delimiter=delimiter))
            completed = len(existing_rows) - data_start
            # Cap at the input size: an output file longer than the input must
            # not report more skipped rows than there are rows in total.
            completed = min(max(completed, 0), translatable)
            if completed > 0 and verbose:
                print_info(f"恢复模式: 跳过已翻译的 {completed} 行")
        except Exception:
            # An unreadable output file simply means starting from scratch.
            completed = 0

    # Pre-embed only the rows that will actually be translated in this run;
    # rows skipped by resume never reach the matcher.
    pending_sources = [row[0].strip() for row in rows[data_start + completed :] if row]
    embeddings_map = _pre_embed_batch(
        [text for text in pending_sources if text],
        store_terms,
        store_memory,
        api_key,
        threshold,
        verbose,
    )

    succeeded = 0
    failed = 0
    skipped = completed
    learn_pairs: list[tuple[str, str]] = []

    # Append when continuing an earlier run, otherwise start a fresh file.
    mode = "a" if completed > 0 else "w"
    with open(output_path, mode, newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=delimiter)

        # The header is only written for a fresh file.
        if mode == "w" and has_header:
            writer.writerow(rows[0] + ["translation"])
            f.flush()

        progress = create_batch_progress(translatable)
        task_id = list(progress.task_ids)[0]

        with progress:
            for i in range(data_start, total_rows):
                data_index = i - data_start

                # Rows already present in the output are only counted.
                if data_index < completed:
                    progress.advance(task_id)
                    continue

                row = rows[i]
                source_text = row[0].strip() if row else ""

                # Blank source cell: keep the row, leave the translation empty.
                if not source_text:
                    writer.writerow(row + [""])
                    f.flush()
                    progress.advance(task_id)
                    continue

                try:
                    # Per-row semantic filtering of terms and memory.
                    row_terms, row_memory = _filter_terms_memory(
                        source_text,
                        store_terms,
                        extra_terms,
                        store_memory,
                        extra_memory,
                        api_key,
                        top_k,
                        threshold,
                        query_embedding=embeddings_map.get(source_text),
                        verbose=False,
                    )

                    request = TranslationRequest(
                        text=source_text,
                        source_lang=source_lang,
                        target_lang=target_lang,
                        model=model,
                        domain=domain,
                        terms=row_terms,
                        tm_list=row_memory,
                    )
                    translation = _translate_with_retry(client, request)
                    writer.writerow(row + [translation])
                    # Flush per row so an interrupted run leaves complete rows behind.
                    f.flush()
                    succeeded += 1

                    if learn and translation:
                        learn_pairs.append((source_text, translation))
                except KeyboardInterrupt:
                    print_warning(
                        f"\n中断! 已完成 {succeeded + skipped}/{translatable} 行,"
                        f"可用 --resume 恢复"
                    )
                    break
                except Exception as e:
                    # Keep the output aligned with the input: record the error in place.
                    writer.writerow(row + [f"[ERROR: {e}]"])
                    f.flush()
                    failed += 1
                    if verbose:
                        print_error(f"第 {i + 1} 行翻译失败: {e}")

                progress.advance(task_id)

    # Batch learn: hand the successful pairs to the translation memory.
    if learn_pairs:
        try:
            from qmt.matcher import batch_learn_memory

            batch_learn_memory(learn_pairs, api_key)
            if verbose:
                print_info(f"已将 {len(learn_pairs)} 条翻译结果写入翻译记忆")
        except Exception:
            # Learning is optional; a failure must not discard the translated file.
            if verbose:
                print_warning("翻译记忆批量回写失败")

    return BatchResult(
        total=translatable,
        succeeded=succeeded,
        failed=failed,
        skipped=skipped,
        output_path=output_path,
    )
289
+
290
+
291
+ # ─── Excel Batch ────────────────────────────────────────────────────────────
292
+
293
+ _PROGRESS_FILE = ".qmt/batch_progress.json"
294
+
295
+
296
+ def _load_excel_progress(input_path: Path) -> dict:
297
+ """Load progress checkpoint for an Excel batch job."""
298
+ pf = Path(_PROGRESS_FILE)
299
+ if not pf.exists():
300
+ return {}
301
+ try:
302
+ data = json.loads(pf.read_text(encoding="utf-8"))
303
+ if data.get("input") == str(input_path):
304
+ return data.get("sheets", {})
305
+ return {}
306
+ except Exception:
307
+ return {}
308
+
309
+
310
def _save_excel_progress(input_path: Path, sheets: dict) -> None:
    """Persist the per-sheet checkpoint for an Excel batch job.

    Args:
        input_path: Input workbook; stored as ``str(input_path)`` so a later
            load can tell which job the checkpoint belongs to.
        sheets: Per-sheet progress mapping to store; must be JSON-serialisable.
    """
    pf = Path(_PROGRESS_FILE)
    pf.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps({"input": str(input_path), "sheets": sheets}, ensure_ascii=False)

    # Write to a sibling file and rename it over the checkpoint, so an
    # interrupt in the middle of a write cannot leave a truncated checkpoint
    # behind and lose the resume state.
    tmp = pf.with_name(pf.name + ".tmp")
    tmp.write_text(payload, encoding="utf-8")
    tmp.replace(pf)
318
+
319
+
320
def _clear_excel_progress() -> None:
    """Delete the Excel checkpoint file if it exists."""
    checkpoint = Path(_PROGRESS_FILE)
    if not checkpoint.exists():
        return
    checkpoint.unlink()
324
+
325
+
326
def translate_excel(
    client: QwenMTClient,
    input_path: Path,
    output_path: Path,
    source_lang: str,
    target_lang: str,
    model: str,
    domain: str | None = None,
    store_terms: list | None = None,
    extra_terms: list | None = None,
    store_memory: list | None = None,
    extra_memory: list | None = None,
    has_header: bool = True,
    resume: bool = False,
    api_key: str = "",
    top_k: int = DEFAULT_TOP_K,
    threshold: int = SMART_MATCH_THRESHOLD,
    learn: bool = False,
    verbose: bool = False,
) -> BatchResult:
    """Translate the first column of every sheet and add a ``translation`` column.

    The workbook is written once, after all sheets have been processed or the
    run was interrupted with Ctrl-C. Progress is checkpointed after every
    translated or failed row; with ``resume=True`` the rows recorded as done
    are skipped and their translations are copied from the existing output.

    Args:
        client: Translation client passed to ``_translate_with_retry``.
        input_path: Workbook to read (all sheets).
        output_path: Workbook to write.
        source_lang: Source language for every request.
        target_lang: Target language for every request.
        model: Model name for every request.
        domain: Optional domain hint for every request.
        store_terms: Stored terms, filtered per row.
        extra_terms: Terms always included in every request.
        store_memory: Stored memory entries, filtered per row.
        extra_memory: Memory entries always included in every request.
        has_header: Treat the first row of each sheet as a header row. The
            output carries a header row only when the input does.
        resume: Continue from the saved checkpoint, if it matches ``input_path``.
        api_key: Key forwarded to the matching and learning helpers.
        top_k: Forwarded to per-row filtering.
        threshold: Forwarded to pre-embedding and per-row filtering.
        learn: Collect successful pairs and hand them to ``batch_learn_memory``.
        verbose: Print resume, failure and learning messages.

    Returns:
        A ``BatchResult`` with the total row count across sheets and the
        succeeded / failed / skipped counters.
    """
    store_terms = store_terms or []
    extra_terms = extra_terms or []
    store_memory = store_memory or []
    extra_memory = extra_memory or []

    # Read all sheets; dtype=str keeps cell values as text.
    all_sheets: dict[str, pd.DataFrame] = pd.read_excel(
        input_path,
        sheet_name=None,
        header=0 if has_header else None,
        dtype=str,
    )

    if not all_sheets:
        print_warning("Excel 文件中没有工作表")
        return BatchResult(total=0, succeeded=0, failed=0, skipped=0, output_path=output_path)

    # Count total translatable rows across all sheets.
    total = sum(len(df) for df in all_sheets.values())
    if total == 0:
        print_warning("Excel 文件中没有可翻译的数据行")
        return BatchResult(total=0, succeeded=0, failed=0, skipped=0, output_path=output_path)

    # Load resume progress first, so the work below can ignore finished rows.
    sheet_progress: dict = {}
    if resume:
        sheet_progress = _load_excel_progress(input_path)
        if sheet_progress and verbose:
            print_info("恢复模式: 加载已有进度")

    # Rows already done per sheet, clamped to the sheet size: a stale
    # checkpoint must never index past the rows that exist now.
    done_by_sheet = {
        name: min(max(int(sheet_progress.get(name, 0)), 0), len(df))
        for name, df in all_sheets.items()
    }

    # Pre-embed only the source texts that will be translated in this run.
    pending_sources: list[str] = []
    for name, df in all_sheets.items():
        for row_idx in range(done_by_sheet[name], len(df)):
            val = str(df.iloc[row_idx, 0]).strip()
            # Missing cells are NaN, which str() renders as "nan".
            if val and val != "nan":
                pending_sources.append(val)

    embeddings_map = _pre_embed_batch(
        pending_sources,
        store_terms,
        store_memory,
        api_key,
        threshold,
        verbose,
    )

    succeeded = 0
    failed = 0
    skipped = 0
    result_sheets: dict[str, pd.DataFrame] = {}
    learn_pairs: list[tuple[str, str]] = []

    progress = create_batch_progress(total)
    task_id = list(progress.task_ids)[0]

    interrupted = False
    with progress:
        for sheet_name, df in all_sheets.items():
            # After an interrupt the remaining sheets are copied through untouched.
            if interrupted:
                result_sheets[sheet_name] = df
                progress.advance(task_id, len(df))
                continue

            translations = []
            completed_in_sheet = done_by_sheet[sheet_name]

            for row_idx in range(len(df)):
                # Resume: skip rows already translated in an earlier run.
                if row_idx < completed_in_sheet:
                    translations.append(None)  # placeholder, filled from the output below
                    skipped += 1
                    progress.advance(task_id)
                    continue

                source_text = str(df.iloc[row_idx, 0]).strip()
                if not source_text or source_text == "nan":
                    translations.append("")
                    progress.advance(task_id)
                    continue

                try:
                    # Per-row semantic filtering of terms and memory.
                    row_terms, row_memory = _filter_terms_memory(
                        source_text,
                        store_terms,
                        extra_terms,
                        store_memory,
                        extra_memory,
                        api_key,
                        top_k,
                        threshold,
                        query_embedding=embeddings_map.get(source_text),
                        verbose=False,
                    )

                    request = TranslationRequest(
                        text=source_text,
                        source_lang=source_lang,
                        target_lang=target_lang,
                        model=model,
                        domain=domain,
                        terms=row_terms,
                        tm_list=row_memory,
                    )
                    translation = _translate_with_retry(client, request)
                    translations.append(translation)
                    succeeded += 1

                    if learn and translation:
                        learn_pairs.append((source_text, translation))

                    # Save a progress checkpoint after every translated row.
                    sheet_progress[sheet_name] = row_idx + 1
                    _save_excel_progress(input_path, sheet_progress)
                except KeyboardInterrupt:
                    translations.append("")
                    print_warning(
                        f"\n中断! 已完成 {succeeded + skipped}/{total} 行,可用 --resume 恢复"
                    )
                    interrupted = True
                    # Pad so the column still has one entry per row of the sheet.
                    translations.extend([""] * (len(df) - row_idx - 1))
                    break
                except Exception as e:
                    translations.append(f"[ERROR: {e}]")
                    failed += 1
                    if verbose:
                        print_error(f"[{sheet_name}] 第 {row_idx + 1} 行翻译失败: {e}")
                    sheet_progress[sheet_name] = row_idx + 1
                    _save_excel_progress(input_path, sheet_progress)

                progress.advance(task_id)

            # If resuming, copy the translations of the skipped rows from the
            # existing output file.
            if resume and completed_in_sheet > 0 and output_path.exists():
                try:
                    existing = pd.read_excel(
                        output_path,
                        sheet_name=sheet_name,
                        header=0 if has_header else None,
                        dtype=str,
                    )
                    if has_header and "translation" in existing.columns:
                        col = existing["translation"]
                    elif not has_header and len(existing.columns) > len(df.columns):
                        # Headerless output: the translation is the extra last column.
                        col = existing.iloc[:, -1]
                    else:
                        col = pd.Series([""] * len(existing))

                    for j in range(min(completed_in_sheet, len(col))):
                        val = col.iloc[j]
                        translations[j] = "" if pd.isna(val) else str(val)
                except Exception:
                    # Best effort: an unreadable output leaves the skipped rows blank.
                    for j in range(completed_in_sheet):
                        if translations[j] is None:
                            translations[j] = ""

            # Replace any remaining None placeholders.
            translations = ["" if t is None else t for t in translations]

            df_result = df.copy()
            df_result["translation"] = translations
            result_sheets[sheet_name] = df_result

    # Write output. Headerless input must produce headerless output: pandas
    # writes the column labels by default, and that extra first row would shift
    # every row by one when the file is read back on resume.
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        for sheet_name, df_out in result_sheets.items():
            df_out.to_excel(writer, sheet_name=sheet_name, index=False, header=has_header)

    # Batch learn: hand the successful pairs to the translation memory.
    if learn_pairs:
        try:
            from qmt.matcher import batch_learn_memory

            batch_learn_memory(learn_pairs, api_key)
            if verbose:
                print_info(f"已将 {len(learn_pairs)} 条翻译结果写入翻译记忆")
        except Exception:
            # Learning is optional; a failure must not affect the written workbook.
            if verbose:
                print_warning("翻译记忆批量回写失败")

    # The checkpoint is removed only after a run with no interrupt and no failed row.
    if not interrupted and failed == 0:
        _clear_excel_progress()

    return BatchResult(
        total=total,
        succeeded=succeeded,
        failed=failed,
        skipped=skipped,
        output_path=output_path,
    )