RubyGems - llm_translate - Versions diffs - 0.4.0 → 0.6.0 - Mend

llm_translate 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

checksums.yaml +4 -4
data/CONCURRENT_CHUNKS_UPDATE.md +149 -0
data/DOCUMENT_SPLITTER_SUMMARY.md +123 -0
data/README.md +79 -0
data/large_document_config.yml +159 -0
data/lib/llm_translate/config.rb +30 -19
data/lib/llm_translate/document_splitter.rb +157 -0
data/lib/llm_translate/translator_engine.rb +96 -2
data/lib/llm_translate/version.rb +1 -1
data/llm_translate.yml +14 -2
metadata +6 -5
data/test_config.yml +0 -52
data/test_llm_translate.yml +0 -176
data/test_new_config.yml +0 -184

data/lib/llm_translate/document_splitter.rb ADDED Viewed

@@ -0,0 +1,157 @@
+# frozen_string_literal: true
+module LlmTranslate
+  class DocumentSplitter
+    attr_reader :config, :logger
+    def initialize(config, logger = nil)
+      @config = config
+      @logger = logger || Logger.new($stdout, level: :info)
+    end
+    # 拆分文档为多个片段
+    def split_document(content)
+      return [content] unless should_split?(content)
+      logger.info "Document size (#{content.length} chars) exceeds limit, splitting..."
+      sections = extract_markdown_sections(content)
+      chunks = build_chunks(sections)
+      logger.info "Document split into #{chunks.length} chunks"
+      chunks
+    end
+    # 合并翻译后的文档片段
+    def merge_translated_chunks(translated_chunks)
+      return translated_chunks.first if translated_chunks.length == 1
+      logger.info "Merging #{translated_chunks.length} translated chunks..."
+      # 简单合并，用双换行连接
+      merged_content = translated_chunks.join("\n\n")
+      # 清理多余的空行
+      clean_merged_content(merged_content)
+    end
+    private
+    def should_split?(content)
+      content.length > config.max_chars_for_splitting
+    end
+    def extract_markdown_sections(content)
+      sections = []
+      current_section = ''
+      lines = content.split("\n")
+      lines.each do |line|
+        # 检查是否是新的段落开始（标题、空行后的内容等）
+        if is_section_boundary?(line, current_section) && !current_section.strip.empty?
+          sections << current_section.strip
+          current_section = ''
+        end
+        current_section += "#{line}\n"
+      end
+      # 添加最后一个段落
+      sections << current_section.strip unless current_section.strip.empty?
+      sections
+    end
+    def is_section_boundary?(line, current_section)
+      return false if current_section.strip.empty?
+      # 标题行
+      return true if line.start_with?('#') && line.match?(/^#+\s+/)
+      # 代码块开始/结束
+      return true if line.match?(/^```/)
+      # 列表项
+      return true if line.match?(/^\s*[-*+]\s+/) || line.match?(/^\s*\d+\.\s+/)
+      # 引用块
+      return true if line.match?(/^>\s+/)
+      # 水平分割线
+      return true if line.match?(/^[-*_]{3,}$/)
+      # 表格行
+      return true if line.match?(/^\|.*\|$/)
+      # 空行后的非空行（新段落）
+      return true if current_section.end_with?("\n\n") && !line.strip.empty?
+      false
+    end
+    def build_chunks(sections)
+      chunks = []
+      current_chunk = ''
+      sections.each do |section|
+        # 如果单个段落就超过限制，需要强制拆分
+        if section.length > config.split_every_chars
+          # 保存当前块
+          chunks << current_chunk.strip unless current_chunk.strip.empty?
+          # 强制拆分长段落
+          forced_chunks = force_split_section(section)
+          chunks.concat(forced_chunks)
+          current_chunk = ''
+          next
+        end
+        # 检查添加这个段落是否会超过限制
+        potential_length = current_chunk.length + section.length + 2 # +2 for "\n\n"
+        if potential_length > config.split_every_chars && !current_chunk.strip.empty?
+          # 保存当前块并开始新块
+          chunks << current_chunk.strip
+          current_chunk = "#{section}\n\n"
+        else
+          # 添加到当前块
+          current_chunk += "#{section}\n\n"
+        end
+      end
+      # 添加最后一个块
+      chunks << current_chunk.strip unless current_chunk.strip.empty?
+      chunks
+    end
+    def force_split_section(section)
+      chunks = []
+      lines = section.split("\n")
+      current_chunk = ''
+      lines.each do |line|
+        potential_length = current_chunk.length + line.length + 1 # +1 for "\n"
+        if potential_length > config.split_every_chars && !current_chunk.strip.empty?
+          chunks << current_chunk.strip
+          current_chunk = "#{line}\n"
+        else
+          current_chunk += "#{line}\n"
+        end
+      end
+      chunks << current_chunk.strip unless current_chunk.strip.empty?
+      chunks
+    end
+    def clean_merged_content(content)
+      # 移除多余的空行（超过2个连续换行的情况）
+      cleaned = content.gsub(/\n{3,}/, "\n\n")
+      # 确保文档以单个换行结尾
+      "#{cleaned.strip}\n"
+    end
+  end
+end

data/lib/llm_translate/translator_engine.rb CHANGED Viewed

@@ -3,16 +3,18 @@
 require 'pathname'
 require 'fileutils'
 require 'async'
+require_relative 'document_splitter'
 module LlmTranslate
   class TranslatorEngine
-    attr_reader :config, :logger, :ai_client, :file_finder
+    attr_reader :config, :logger, :ai_client, :file_finder, :document_splitter
     def initialize(config, logger, ai_client)
       @config = config
       @logger = logger
       @ai_client = ai_client
       @file_finder = FileFinder.new(config, logger)
+      @document_splitter = DocumentSplitter.new(config, logger)
     end
     def translate_file(input_path)
@@ -115,7 +117,10 @@ module LlmTranslate
     end
     def translate_content(content, file_path = nil)
-      if config.preserve_formatting?
+      # 检查是否需要启用文档拆分
+      if config.enable_document_splitting? && content.length > config.max_chars_for_splitting
+        translate_with_document_splitting(content, file_path)
+      elsif config.preserve_formatting?
         translate_with_format_preservation(content)
       else
         ai_client.translate(content)
@@ -151,5 +156,94 @@ module LlmTranslate
       # Translate the content with placeholders
       ai_client.translate(content)
     end
+    def translate_with_document_splitting(content, file_path = nil)
+      logger.info "Document splitting enabled for large content#{file_path ? " from #{file_path}" : ''}"
+      # 拆分文档
+      chunks = document_splitter.split_document(content)
+      logger.info "Translating #{chunks.length} chunks with #{config.concurrent_chunks} concurrent workers..."
+      # 并发翻译chunks
+      translated_chunks = translate_chunks_concurrently(chunks)
+      # 合并翻译后的片段
+      logger.info 'Merging translated chunks...'
+      document_splitter.merge_translated_chunks(translated_chunks)
+    end
+    def translate_chunks_concurrently(chunks)
+      return translate_chunks_sequentially(chunks) if config.concurrent_chunks <= 1
+      translated_chunks = Array.new(chunks.length)
+      # 使用 Async 进行并发处理
+      Async do |task|
+        # 将chunks分批处理，每批最多concurrent_chunks个
+        chunks.each_slice(config.concurrent_chunks).each do |batch|
+          # 为当前批次创建并发任务
+          batch_tasks = batch.map.with_index do |chunk, _batch_index|
+            # 计算在原数组中的索引
+            chunk_index = chunks.index(chunk)
+            task.async do
+              logger.info "Translating chunk #{chunk_index + 1}/#{chunks.length} (#{chunk.length} chars)..."
+              begin
+                translated_chunk = if config.preserve_formatting?
+                                     translate_with_format_preservation(chunk)
+                                   else
+                                     ai_client.translate(chunk)
+                                   end
+                # 将翻译结果存储在正确的位置
+                translated_chunks[chunk_index] = translated_chunk
+                logger.info "✓ Completed chunk #{chunk_index + 1}/#{chunks.length}"
+                translated_chunk
+              rescue StandardError => e
+                logger.error "✗ Failed to translate chunk #{chunk_index + 1}: #{e.message}"
+                raise e
+              end
+            end
+          end
+          # 等待当前批次的所有任务完成
+          batch_tasks.each(&:wait)
+          # 在批次间添加延迟
+          sleep(config.request_interval) if config.request_interval.positive?
+        end
+      end
+      translated_chunks
+    end
+    # Translates the chunks one after another, pausing for
+    # `config.request_interval` seconds between chunks (not after the last).
+    #
+    # @param chunks [Array<String>] chunks in document order
+    # @return [Array<String>] translations in the same order as `chunks`
+    # @raise [StandardError] re-raises the error of a failed chunk after logging it
+    def translate_chunks_sequentially(chunks)
+      final_index = chunks.length - 1
+      chunks.each_with_index.map do |chunk, index|
+        logger.info "Translating chunk #{index + 1}/#{chunks.length} (#{chunk.length} chars)..."
+        begin
+          result = if config.preserve_formatting?
+                     translate_with_format_preservation(chunk)
+                   else
+                     ai_client.translate(chunk)
+                   end
+          # Space out requests, but do not wait once the last chunk is done.
+          sleep(config.request_interval) if config.request_interval.positive? && index < final_index
+          result
+        rescue StandardError => e
+          logger.error "Failed to translate chunk #{index + 1}: #{e.message}"
+          raise
+        end
+      end
+    end
   end
 end

data/lib/llm_translate/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module LlmTranslate
-  VERSION = '0.4.0'
+  VERSION = '0.6.0'
 end

data/llm_translate.yml CHANGED Viewed

@@ -50,6 +50,18 @@ translation:
   # 是否翻译代码注释
   translate_code_comments: false
+  # 文档拆分配置
+  # 当文档字符数超过 max_chars 时，自动启用拆分功能
+  enable_splitting: true
+  # 触发拆分的最大字符数
+  max_chars: 20000
+  # 每个片段的目标字符数
+  every_chars: 18000
+  # 并发翻译的 chunk 数量
+  concurrent_chunks: 3
 # 文件处理配置
@@ -125,13 +137,13 @@ error_handling:
 # 性能配置
 performance:
-  # 并发处理文件数
+  # 并发处理文件数（使用文档拆分时建议设为 1）
   concurrent_files: 3
   # 批处理大小（同时翻译的文件数）
   batch_size: 5
-  # 请求间隔（避免 API 限流）
+  # 请求间隔（避免 API 限流，拆分文档时特别重要）
   request_interval: 1  # 秒
   # 内存使用限制

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llm_translate
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.6.0
 platform: ruby
 authors:
 - LlmTranslate Team
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-09-01 00:00:00.000000000 Z
+date: 2025-09-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: async
@@ -103,6 +103,8 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".rspec_status"
+- CONCURRENT_CHUNKS_UPDATE.md
+- DOCUMENT_SPLITTER_SUMMARY.md
 - README.md
 - README.zh.md
 - Rakefile
@@ -112,21 +114,20 @@ files:
 - content/prompt.md
 - content/todo.md
 - exe/llm_translate
+- large_document_config.yml
 - lib/llm_translate.rb
 - lib/llm_translate/ai_client.rb
 - lib/llm_translate/cli.rb
 - lib/llm_translate/config.rb
+- lib/llm_translate/document_splitter.rb
 - lib/llm_translate/file_finder.rb
 - lib/llm_translate/logger.rb
 - lib/llm_translate/translator_engine.rb
 - lib/llm_translate/version.rb
 - llm_translate.gemspec
 - llm_translate.yml
-- test_config.yml
 - test_docs/sample.md
 - test_docs_translated/sample.zh.md
-- test_llm_translate.yml
-- test_new_config.yml
 homepage: https://github.com/tianlu1677/llm_translate
 licenses:
 - MIT

data/test_config.yml DELETED Viewed

@@ -1,52 +0,0 @@
-# Test llm_translate configuration
-ai:
-  api_key: ${LLM_TRANSLATE_API_KEY}
-  provider: "openai"
-  model: "gpt-4"
-  temperature: 0.3
-  max_tokens: 4000
-  retry_attempts: 3
-  retry_delay: 2
-  timeout: 60
-translation:
-  target_language: "zh-CN"
-  default_prompt: |
-    Please translate the following Markdown content to Chinese, keeping all formatting intact:
-    - Preserve code blocks, links, images, and other Markdown syntax
-    - Keep English technical terms and product names
-    - Ensure natural and fluent translation
-    Content:
-    {content}
-files:
-  input_directory: "./test_docs"
-  output_directory: "./test_docs_translated"
-  filename_suffix: ".zh"
-  include_patterns:
-    - "**/*.md"
-    - "**/*.markdown"
-  exclude_patterns: []
-  preserve_directory_structure: true
-  overwrite_policy: "overwrite"
-logging:
-  level: "info"
-  output: "console"
-  verbose_translation: true
-error_handling:
-  on_error: "log_and_continue"
-  max_consecutive_errors: 5
-  retry_on_failure: 2
-  generate_error_report: true
-performance:
-  concurrent_files: 1
-  request_interval: 1
-output:
-  show_progress: true
-  show_statistics: true
-  generate_report: true

data/test_llm_translate.yml DELETED Viewed

@@ -1,176 +0,0 @@
-# translator.yml - 翻译工具配置文件
-# AI 模型配置
-ai:
-  # API 密钥（建议使用环境变量 LLM_TRANSLATE_API_KEY）
-  api_key: ${LLM_TRANSLATE_API_KEY}
-  # 模型提供商（openai, anthropic, ollama 等）
-  provider: "openai"
-  # 模型名称
-  model: "gpt-4"
-  # 模型参数
-  temperature: 0.3
-  max_tokens: 4000
-  top_p: 1.0
-  # 请求重试配置
-  retry_attempts: 3
-  retry_delay: 2  # 秒
-  # 请求超时时间
-  timeout: 60  # 秒
-# 翻译配置
-translation:
-  # 默认翻译 prompt
-  default_prompt: |
-    请将以下 Markdown 内容翻译为中文，保持所有格式不变：
-    - 保留代码块、链接、图片等 Markdown 语法
-    - 保留英文的专业术语和产品名称
-    - 确保翻译自然流畅
-    内容：
-    {content}
-  # 目标语言
-  target_language: "zh-CN"
-  # 源语言（auto 为自动检测）
-  source_language: "auto"
-  # 是否保留原文格式
-  preserve_formatting: true
-  # 是否翻译代码注释
-  translate_code_comments: false
-  # 需要保留不翻译的内容模式
-# 文件处理配置
-files:
-  # 输入目录
-  input_directory: "./docs"
-  # 输出目录
-  output_directory: "./docs-translated"
-  # 文件名后缀策略
-  filename_strategy: "suffix"  # suffix, replace, directory
-  filename_suffix: ".zh"       # 仅当 strategy 为 suffix 时使用
-  # 包含的文件模式
-  include_patterns:
-    - "**/*.md"
-    - "**/*.markdown"
-  # 排除的文件模式
-  exclude_patterns:
-    - "**/node_modules/**"
-    - "**/.*"
-    - "**/*.tmp"
-    - "**/README.md"  # 示例：排除 README 文件
-  # 是否保持目录结构
-  preserve_directory_structure: true
-  # 文件覆盖策略
-  overwrite_policy: "ask"  # ask, overwrite, skip, backup
-  # 备份目录（当 overwrite_policy 为 backup 时）
-  backup_directory: "./backups"
-# 日志配置
-logging:
-  # 日志级别
-  level: "info"  # debug, info, warn, error
-  # 日志输出位置
-  output: "console"  # console, file, both
-  # 日志文件路径（当 output 包含 file 时）
-  file_path: "./logs/translator.log"
-  # 是否记录详细的翻译过程
-  verbose_translation: false
-  # 错误日志文件
-  error_log_path: "./logs/errors.log"
-# 错误处理配置
-error_handling:
-  # 遇到错误时的行为
-  on_error: "log_and_continue"  # stop, log_and_continue, skip_file
-  # 最大连续错误数（超过则停止）
-  max_consecutive_errors: 5
-  # 错误重试次数
-  retry_on_failure: 2
-  # 生成错误报告
-  generate_error_report: true
-  error_report_path: "./logs/error_report.md"
-# 性能配置
-performance:
-  # 并发处理文件数
-  concurrent_files: 3
-  # 批处理大小（同时翻译的文件数）
-  batch_size: 5
-  # 请求间隔（避免 API 限流）
-  request_interval: 1  # 秒
-  # 内存使用限制
-  max_memory_mb: 500
-# 输出配置
-output:
-  # 是否显示进度条
-  show_progress: true
-  # 是否显示翻译统计
-  show_statistics: true
-  # 是否生成翻译报告
-  generate_report: true
-  report_path: "./reports/translation_report.md"
-  # 输出格式
-  format: "markdown"  # markdown, json, yaml
-  # 是否保留元数据
-  include_metadata: true
-# 预设配置（可通过 --preset 参数使用）
-presets:
-  chinese:
-    translation:
-      target_language: "zh-CN"
-      default_prompt: "翻译为简体中文，保持技术术语的准确性"
-  japanese:
-    translation:
-      target_language: "ja"
-      default_prompt: "日本語に翻訳してください。技術用語は正確に保ってください"
-  english:
-    translation:
-      target_language: "en"
-      default_prompt: "Translate to English, maintaining technical accuracy"
-# 自定义 Hook（高级功能）
-hooks:
-  # 翻译前处理
-  pre_translation: null
-  # 翻译后处理
-  post_translation: null
-  # 文件处理完成后
-  post_file_processing: null