PyPI - qwen-mt-cli - Versions diffs - 0.2.0__py3-none-any.whl - Mend

qwen-mt-cli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

qmt/__init__.py +3 -0
qmt/batch.py +538 -0
qmt/cli.py +664 -0
qmt/client.py +62 -0
qmt/constants.py +39 -0
qmt/embedding.py +39 -0
qmt/exceptions.py +40 -0
qmt/formatters.py +107 -0
qmt/interactive.py +222 -0
qmt/matcher.py +229 -0
qmt/models.py +45 -0
qmt/parsers.py +126 -0
qmt/store.py +212 -0
qmt/vectorstore.py +133 -0
qwen_mt_cli-0.2.0.dist-info/METADATA +258 -0
qwen_mt_cli-0.2.0.dist-info/RECORD +19 -0
qwen_mt_cli-0.2.0.dist-info/WHEEL +4 -0
qwen_mt_cli-0.2.0.dist-info/entry_points.txt +2 -0
qwen_mt_cli-0.2.0.dist-info/licenses/LICENSE +190 -0

qmt/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+"""Qwen-MT CLI - Command-line translation tool powered by Qwen-MT."""
+__version__ = "0.2.0"  # package version string exposed as qmt.__version__

qmt/batch.py ADDED Viewed

@@ -0,0 +1,538 @@
+"""Batch translation for CSV and Excel files."""
+import csv
+import io
+import json
+import time
+from pathlib import Path
+import pandas as pd
+from qmt.client import QwenMTClient
+from qmt.constants import DEFAULT_TOP_K, SMART_MATCH_THRESHOLD
+from qmt.exceptions import APIError
+from qmt.formatters import (
+    create_batch_progress,
+    print_error,
+    print_info,
+    print_rate_limit_wait,
+    print_warning,
+)
+from qmt.models import BatchResult, TranslationRequest
+from qmt.parsers import read_csv_raw, read_file_with_encoding_fallback
+MAX_RETRIES = 5  # default retry budget used by _translate_with_retry for rate-limited calls
+INITIAL_BACKOFF = 2  # seconds; base wait, doubled after every rate-limited attempt
+def _translate_with_retry(
+    client: QwenMTClient,
+    request: TranslationRequest,
+    max_retries: int = MAX_RETRIES,
+) -> str:
+    """Translate with exponential backoff retry for rate limiting."""
+    for attempt in range(max_retries + 1):
+        try:
+            return client.translate(request)
+        except APIError as e:
+            err_msg = str(e).lower()
+            is_rate_limit = "rate" in err_msg or "429" in err_msg or "throttl" in err_msg
+            if is_rate_limit and attempt < max_retries:
+                wait = INITIAL_BACKOFF * (2**attempt)
+                print_rate_limit_wait(wait, attempt + 1, max_retries)
+                time.sleep(wait)
+            else:
+                raise
+    raise APIError("重试次数已用尽")
+def _filter_terms_memory(
+    source_text: str,
+    store_terms: list,
+    extra_terms: list,
+    store_memory: list,
+    extra_memory: list,
+    api_key: str,
+    top_k: int,
+    threshold: int,
+    query_embedding: list[float] | None = None,
+    verbose: bool = False,
+) -> tuple[list, list]:
+    """Filter store terms/memory via semantic matching, merge with extras."""
+    from qmt.matcher import select_relevant_memory, select_relevant_terms
+    filtered_terms = extra_terms[:]
+    try:
+        matched = select_relevant_terms(
+            query_text=source_text,
+            store_terms=store_terms,
+            api_key=api_key,
+            top_k=top_k,
+            threshold=threshold,
+            query_embedding=query_embedding,
+            verbose=verbose,
+        )
+        filtered_terms.extend(matched)
+    except Exception:
+        filtered_terms.extend(store_terms)
+    filtered_memory = extra_memory[:]
+    try:
+        matched = select_relevant_memory(
+            query_text=source_text,
+            store_memory=store_memory,
+            api_key=api_key,
+            top_k=top_k,
+            threshold=threshold,
+            query_embedding=query_embedding,
+            verbose=verbose,
+        )
+        filtered_memory.extend(matched)
+    except Exception:
+        filtered_memory.extend(store_memory)
+    return filtered_terms, filtered_memory
+def _needs_smart_match(store_terms: list, store_memory: list, threshold: int) -> bool:
+    """Check if semantic matching should be attempted."""
+    return len(store_terms) > threshold or len(store_memory) > threshold
+def _pre_embed_batch(
+    source_texts: list[str],
+    store_terms: list,
+    store_memory: list,
+    api_key: str,
+    threshold: int,
+    verbose: bool,
+) -> dict[str, list[float]]:
+    """Pre-embed all source texts for batch efficiency. Returns text->embedding dict."""
+    if not _needs_smart_match(store_terms, store_memory, threshold):
+        return {}
+    try:
+        from qmt.matcher import batch_embed_queries
+        if verbose:
+            print_info("正在预计算源文本向量嵌入...")
+        return batch_embed_queries(api_key, source_texts)
+    except Exception:
+        if verbose:
+            print_warning("批量嵌入失败，将逐行回退到全量匹配")
+        return {}
+# ─── CSV Batch ──────────────────────────────────────────────────────────────
+def translate_csv(
+    client: QwenMTClient,
+    input_path: Path,
+    output_path: Path,
+    source_lang: str,
+    target_lang: str,
+    model: str,
+    domain: str | None = None,
+    store_terms: list | None = None,
+    extra_terms: list | None = None,
+    store_memory: list | None = None,
+    extra_memory: list | None = None,
+    has_header: bool = True,
+    resume: bool = False,
+    api_key: str = "",
+    top_k: int = DEFAULT_TOP_K,
+    threshold: int = SMART_MATCH_THRESHOLD,
+    learn: bool = False,
+    verbose: bool = False,
+) -> BatchResult:
+    """Translate CSV file: first column as source, append translation column."""
+    store_terms = store_terms or []
+    extra_terms = extra_terms or []
+    store_memory = store_memory or []
+    extra_memory = extra_memory or []
+    rows, delimiter = read_csv_raw(input_path)
+    total_rows = len(rows)
+    data_start = 1 if has_header else 0
+    translatable = total_rows - data_start
+    if translatable <= 0:
+        print_warning("文件中没有可翻译的数据行")
+        return BatchResult(total=0, succeeded=0, failed=0, skipped=0, output_path=output_path)
+    # Resume: count rows already written in output
+    completed = 0
+    if resume and output_path.exists():
+        try:
+            existing = read_file_with_encoding_fallback(output_path)
+            existing_rows = list(csv.reader(io.StringIO(existing), delimiter=delimiter))
+            completed = len(existing_rows) - (1 if has_header else 0)
+            completed = max(completed, 0)
+            if completed > 0 and verbose:
+                print_info(f"恢复模式: 跳过已翻译的 {completed} 行")
+        except Exception:
+            completed = 0
+    # Pre-embed source texts for smart matching
+    all_sources = [rows[i][0].strip() if rows[i] else "" for i in range(data_start, total_rows)]
+    embeddings_map = _pre_embed_batch(
+        [t for t in all_sources if t],
+        store_terms,
+        store_memory,
+        api_key,
+        threshold,
+        verbose,
+    )
+    succeeded = 0
+    failed = 0
+    skipped = completed
+    learn_pairs: list[tuple[str, str]] = []
+    # Open file: append if resuming, else overwrite
+    mode = "a" if completed > 0 else "w"
+    with open(output_path, mode, newline="", encoding="utf-8") as f:
+        writer = csv.writer(f, delimiter=delimiter)
+        # Write header if new file
+        if mode == "w" and has_header:
+            writer.writerow(rows[0] + ["translation"])
+            f.flush()
+        progress = create_batch_progress(translatable)
+        task_id = list(progress.task_ids)[0]
+        with progress:
+            for i in range(data_start, total_rows):
+                data_index = i - data_start
+                # Skip rows already translated during resume
+                if data_index < completed:
+                    progress.advance(task_id)
+                    continue
+                row = rows[i]
+                source_text = row[0].strip() if row else ""
+                if not source_text:
+                    writer.writerow(row + [""])
+                    f.flush()
+                    progress.advance(task_id)
+                    continue
+                try:
+                    # Per-row semantic filtering
+                    row_terms, row_memory = _filter_terms_memory(
+                        source_text,
+                        store_terms,
+                        extra_terms,
+                        store_memory,
+                        extra_memory,
+                        api_key,
+                        top_k,
+                        threshold,
+                        query_embedding=embeddings_map.get(source_text),
+                        verbose=False,
+                    )
+                    request = TranslationRequest(
+                        text=source_text,
+                        source_lang=source_lang,
+                        target_lang=target_lang,
+                        model=model,
+                        domain=domain,
+                        terms=row_terms,
+                        tm_list=row_memory,
+                    )
+                    translation = _translate_with_retry(client, request)
+                    writer.writerow(row + [translation])
+                    f.flush()
+                    succeeded += 1
+                    if learn and translation:
+                        learn_pairs.append((source_text, translation))
+                except KeyboardInterrupt:
+                    print_warning(
+                        f"\n中断! 已完成 {succeeded + skipped}/{translatable} 行，"
+                        f"可用 --resume 恢复"
+                    )
+                    break
+                except Exception as e:
+                    writer.writerow(row + [f"[ERROR: {e}]"])
+                    f.flush()
+                    failed += 1
+                    if verbose:
+                        print_error(f"第 {i + 1} 行翻译失败: {e}")
+                progress.advance(task_id)
+    # Batch learn: write successful translations to memory
+    if learn_pairs:
+        try:
+            from qmt.matcher import batch_learn_memory
+            batch_learn_memory(learn_pairs, api_key)
+            if verbose:
+                print_info(f"已将 {len(learn_pairs)} 条翻译结果写入翻译记忆")
+        except Exception:
+            if verbose:
+                print_warning("翻译记忆批量回写失败")
+    return BatchResult(
+        total=translatable,
+        succeeded=succeeded,
+        failed=failed,
+        skipped=skipped,
+        output_path=output_path,
+    )
+# ─── Excel Batch ────────────────────────────────────────────────────────────
+_PROGRESS_FILE = ".qmt/batch_progress.json"  # relative checkpoint path shared by the Excel progress helpers
+def _load_excel_progress(input_path: Path) -> dict:
+    """Load progress checkpoint for an Excel batch job."""
+    pf = Path(_PROGRESS_FILE)
+    if not pf.exists():
+        return {}
+    try:
+        data = json.loads(pf.read_text(encoding="utf-8"))
+        if data.get("input") == str(input_path):
+            return data.get("sheets", {})
+        return {}
+    except Exception:
+        return {}
+def _save_excel_progress(input_path: Path, sheets: dict) -> None:
+    """Save progress checkpoint for an Excel batch job."""
+    pf = Path(_PROGRESS_FILE)
+    pf.parent.mkdir(parents=True, exist_ok=True)
+    pf.write_text(
+        json.dumps({"input": str(input_path), "sheets": sheets}, ensure_ascii=False),
+        encoding="utf-8",
+    )
+def _clear_excel_progress() -> None:
+    pf = Path(_PROGRESS_FILE)
+    if pf.exists():
+        pf.unlink()
+def translate_excel(
+    client: QwenMTClient,
+    input_path: Path,
+    output_path: Path,
+    source_lang: str,
+    target_lang: str,
+    model: str,
+    domain: str | None = None,
+    store_terms: list | None = None,
+    extra_terms: list | None = None,
+    store_memory: list | None = None,
+    extra_memory: list | None = None,
+    has_header: bool = True,
+    resume: bool = False,
+    api_key: str = "",
+    top_k: int = DEFAULT_TOP_K,
+    threshold: int = SMART_MATCH_THRESHOLD,
+    learn: bool = False,
+    verbose: bool = False,
+) -> BatchResult:
+    """Translate Excel file: first column of all sheets, append translation column."""
+    store_terms = store_terms or []
+    extra_terms = extra_terms or []
+    store_memory = store_memory or []
+    extra_memory = extra_memory or []
+    # Read all sheets
+    all_sheets: dict[str, pd.DataFrame] = pd.read_excel(
+        input_path,
+        sheet_name=None,
+        header=0 if has_header else None,
+        dtype=str,
+    )
+    if not all_sheets:
+        print_warning("Excel 文件中没有工作表")
+        return BatchResult(total=0, succeeded=0, failed=0, skipped=0, output_path=output_path)
+    # Count total translatable rows across all sheets
+    total = sum(len(df) for df in all_sheets.values())
+    if total == 0:
+        print_warning("Excel 文件中没有可翻译的数据行")
+        return BatchResult(total=0, succeeded=0, failed=0, skipped=0, output_path=output_path)
+    # Pre-embed all source texts for smart matching
+    all_sources: list[str] = []
+    for df in all_sheets.values():
+        for row_idx in range(len(df)):
+            val = str(df.iloc[row_idx, 0]).strip()
+            if val and val != "nan":
+                all_sources.append(val)
+    embeddings_map = _pre_embed_batch(
+        all_sources,
+        store_terms,
+        store_memory,
+        api_key,
+        threshold,
+        verbose,
+    )
+    # Load resume progress
+    sheet_progress: dict = {}
+    if resume:
+        sheet_progress = _load_excel_progress(input_path)
+        if sheet_progress and verbose:
+            print_info("恢复模式: 加载已有进度")
+    succeeded = 0
+    failed = 0
+    skipped = 0
+    result_sheets: dict[str, pd.DataFrame] = {}
+    learn_pairs: list[tuple[str, str]] = []
+    progress = create_batch_progress(total)
+    task_id = list(progress.task_ids)[0]
+    interrupted = False
+    with progress:
+        for sheet_name, df in all_sheets.items():
+            if interrupted:
+                result_sheets[sheet_name] = df
+                progress.advance(task_id, len(df))
+                continue
+            translations = []
+            completed_in_sheet = int(sheet_progress.get(sheet_name, 0))
+            for row_idx in range(len(df)):
+                # Resume: skip already translated rows
+                if row_idx < completed_in_sheet:
+                    translations.append(None)  # placeholder, filled from output
+                    skipped += 1
+                    progress.advance(task_id)
+                    continue
+                source_text = str(df.iloc[row_idx, 0]).strip()
+                if not source_text or source_text == "nan":
+                    translations.append("")
+                    progress.advance(task_id)
+                    continue
+                try:
+                    # Per-row semantic filtering
+                    row_terms, row_memory = _filter_terms_memory(
+                        source_text,
+                        store_terms,
+                        extra_terms,
+                        store_memory,
+                        extra_memory,
+                        api_key,
+                        top_k,
+                        threshold,
+                        query_embedding=embeddings_map.get(source_text),
+                        verbose=False,
+                    )
+                    request = TranslationRequest(
+                        text=source_text,
+                        source_lang=source_lang,
+                        target_lang=target_lang,
+                        model=model,
+                        domain=domain,
+                        terms=row_terms,
+                        tm_list=row_memory,
+                    )
+                    translation = _translate_with_retry(client, request)
+                    translations.append(translation)
+                    succeeded += 1
+                    if learn and translation:
+                        learn_pairs.append((source_text, translation))
+                    # Save progress checkpoint every row
+                    sheet_progress[sheet_name] = row_idx + 1
+                    _save_excel_progress(input_path, sheet_progress)
+                except KeyboardInterrupt:
+                    translations.append("")
+                    print_warning(
+                        f"\n中断! 已完成 {succeeded + skipped}/{total} 行，可用 --resume 恢复"
+                    )
+                    interrupted = True
+                    translations.extend([""] * (len(df) - row_idx - 1))
+                    break
+                except Exception as e:
+                    translations.append(f"[ERROR: {e}]")
+                    failed += 1
+                    if verbose:
+                        print_error(f"[{sheet_name}] 第 {row_idx + 1} 行翻译失败: {e}")
+                    sheet_progress[sheet_name] = row_idx + 1
+                    _save_excel_progress(input_path, sheet_progress)
+                progress.advance(task_id)
+            # If resuming, merge translations from existing output
+            if resume and completed_in_sheet > 0 and output_path.exists():
+                try:
+                    existing = pd.read_excel(
+                        output_path,
+                        sheet_name=sheet_name,
+                        header=0 if has_header else None,
+                        dtype=str,
+                    )
+                    if "translation" in (existing.columns if has_header else []):
+                        col = existing["translation"]
+                    elif not has_header and len(existing.columns) > len(df.columns):
+                        col = existing.iloc[:, -1]
+                    else:
+                        col = pd.Series([""] * len(existing))
+                    for j in range(min(completed_in_sheet, len(col))):
+                        val = col.iloc[j]
+                        translations[j] = "" if pd.isna(val) else str(val)
+                except Exception:
+                    for j in range(completed_in_sheet):
+                        if translations[j] is None:
+                            translations[j] = ""
+            # Replace remaining None placeholders
+            translations = ["" if t is None else t for t in translations]
+            df_result = df.copy()
+            df_result["translation"] = translations
+            result_sheets[sheet_name] = df_result
+    # Write output
+    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
+        for sheet_name, df_out in result_sheets.items():
+            df_out.to_excel(writer, sheet_name=sheet_name, index=False)
+    # Batch learn: write successful translations to memory
+    if learn_pairs:
+        try:
+            from qmt.matcher import batch_learn_memory
+            batch_learn_memory(learn_pairs, api_key)
+            if verbose:
+                print_info(f"已将 {len(learn_pairs)} 条翻译结果写入翻译记忆")
+        except Exception:
+            if verbose:
+                print_warning("翻译记忆批量回写失败")
+    # Clean up progress file on successful completion (no interruption)
+    if not interrupted and failed == 0:
+        _clear_excel_progress()
+    return BatchResult(
+        total=total,
+        succeeded=succeeded,
+        failed=failed,
+        skipped=skipped,
+        output_path=output_path,
+    )