RubyGems - llm_translate - Versions diffs - 0.1.0 - Mend

llm_translate 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

checksums.yaml +7 -0
data/.rspec_status +14 -0
data/README.md +301 -0
data/README.zh.md +209 -0
data/Rakefile +12 -0
data/content/changelog-1.md +12 -0
data/content/changelog-2.md +12 -0
data/content/llm_translate.yml +189 -0
data/content/prompt.md +8 -0
data/content/todo.md +115 -0
data/exe/llm_translate +6 -0
data/lib/llm_translate/ai_client.rb +95 -0
data/lib/llm_translate/cli.rb +205 -0
data/lib/llm_translate/config.rb +279 -0
data/lib/llm_translate/file_finder.rb +153 -0
data/lib/llm_translate/logger.rb +170 -0
data/lib/llm_translate/translator_engine.rb +233 -0
data/lib/llm_translate/version.rb +5 -0
data/lib/llm_translate.rb +16 -0
data/llm_translate.gemspec +41 -0
data/llm_translate.yml +189 -0
data/test_config.yml +52 -0
data/test_docs/sample.md +22 -0
data/test_docs_translated/sample.zh.md +22 -0
data/test_llm_translate.yml +180 -0
data/test_new_config.yml +189 -0
metadata +143 -0

data/lib/llm_translate/translator_engine.rb ADDED Viewed

@@ -0,0 +1,233 @@
+# frozen_string_literal: true
+require 'pathname'
+require 'fileutils'
module LlmTranslate
  # Translates the content of one file at a time.
  #
  # The engine reads the input file, optionally replaces every span that matches
  # one of `config.preserve_patterns` with an opaque placeholder, sends the text
  # to `ai_client.translate`, puts the preserved spans back and writes the result
  # to the output path.
  #
  # Collaborators, as used by this class:
  # * `config`      - answers `single_file_mode?`, `output_file`,
  #                   `preserve_formatting?`, `preserve_patterns`,
  #                   `request_interval` and `source_language`.
  # * `logger`      - answers `log_translation_start`, `log_translation_complete`,
  #                   `info`, `warn` and `debug`.
  # * `ai_client`   - answers `translate(text)`; presumably returns the translated
  #                   String (confirm against AiClient).
  # * `file_finder` - answers `output_path_for`, `should_skip_file?` and
  #                   `ensure_output_directory`.
  class TranslatorEngine
    # Prefix of the tokens that stand in for preserved text while translating.
    PLACEHOLDER_PREFIX = 'PRESERVED_ELEMENT_'
    # Whole words whose presence counts towards the "English" score.
    ENGLISH_INDICATORS = %w[the and or but with from this that these those].freeze
    # Character sequences whose presence counts towards the "Chinese" score.
    CHINESE_INDICATORS = %w[的 在 是 和 或者 但是 这 那].freeze
    # Fenced Markdown code block, matched non-greedily across lines.
    CODE_BLOCK_PATTERN = /```[\s\S]*?```/m
    private_constant :PLACEHOLDER_PREFIX, :ENGLISH_INDICATORS, :CHINESE_INDICATORS, :CODE_BLOCK_PATTERN

    attr_reader :config, :logger, :ai_client, :file_finder

    # @param config [#preserve_patterns, #request_interval, ...] run configuration
    # @param logger [#info, #warn, #debug, ...] log sink
    # @param ai_client [#translate] object that performs the actual translation
    def initialize(config, logger, ai_client)
      @config = config
      @logger = logger
      @ai_client = ai_client
      @file_finder = FileFinder.new(config, logger)
    end

    # Translates one file and writes the result.
    #
    # In single-file mode the output goes to `config.output_file`; otherwise the
    # output path comes from the file finder, which may also decide that the file
    # is to be skipped (in which case nothing is translated or written).
    #
    # @param input_path [String] path of the file to translate
    # @return [void]
    # @raise [FileError] when the input cannot be read or the output cannot be written
    # @raise [TranslationError] when translating the content fails
    def translate_file(input_path)
      logger.log_translation_start(input_path)
      content = read_file_content(input_path)

      if config.single_file_mode?
        output_path = config.output_file
        # mkdir_p is a no-op for a directory that already exists.
        FileUtils.mkdir_p(File.dirname(output_path))
      else
        output_path = file_finder.output_path_for(input_path).to_s
        if file_finder.should_skip_file?(input_path, output_path)
          logger.info "Skipping file: #{input_path}"
          return
        end
        file_finder.ensure_output_directory(output_path)
      end

      translated_content = translate_content(content, input_path)
      write_translated_file(output_path, translated_content)
      logger.log_translation_complete(input_path, output_path)

      # Optional pause between files so consecutive requests are spaced out.
      sleep(config.request_interval) if config.request_interval.positive?
    end

    # Translates a string, with or without placeholder protection depending on
    # `config.preserve_formatting?`.
    #
    # @param content [String] text to translate
    # @param file_path [String, nil] only used to enrich the error message
    # @return [String] whatever the AI client returns, with preserved spans restored
    # @raise [TranslationError] wrapping any StandardError raised while translating
    def translate_content(content, file_path = nil)
      if config.preserve_formatting?
        translate_with_format_preservation(content)
      else
        ai_client.translate(content)
      end
    rescue StandardError => e
      origin = file_path ? " from #{file_path}" : ''
      raise TranslationError, "Failed to translate content#{origin}: #{e.message}"
    end

    private

    # Reads a file as UTF-8 text.
    #
    # @param file_path [String]
    # @return [String] file content (a warning is logged when it is empty)
    # @raise [FileError] when the file is missing, unreadable or not valid UTF-8
    def read_file_content(file_path)
      raise FileError, "Input file does not exist: #{file_path}" unless File.exist?(file_path)
      raise FileError, "Input file is not readable: #{file_path}" unless File.readable?(file_path)

      content = File.read(file_path, encoding: 'UTF-8')
      # File.read only tags the bytes as UTF-8; it does not validate them, so
      # malformed input has to be detected explicitly.
      unless content.valid_encoding?
        raise FileError, "Invalid encoding in file #{file_path}: content is not valid UTF-8"
      end

      logger.warn "Input file is empty: #{file_path}" if content.empty?
      content
    rescue Encoding::InvalidByteSequenceError => e
      # Reachable when Ruby transcodes on read (e.g. Encoding.default_internal is set).
      raise FileError, "Invalid encoding in file #{file_path}: #{e.message}"
    end

    # Writes the translated text as UTF-8.
    #
    # @param output_path [String]
    # @param content [String]
    # @raise [FileError] wrapping any StandardError raised while writing
    def write_translated_file(output_path, content)
      File.write(output_path, content, encoding: 'UTF-8')
      logger.debug "Written translated content to: #{output_path}"
    rescue StandardError => e
      raise FileError, "Failed to write output file #{output_path}: #{e.message}"
    end

    # Translates `content` while keeping every span matched by
    # `config.preserve_patterns` out of the text sent to the AI client.
    #
    # @param content [String]
    # @return [String]
    def translate_with_format_preservation(content)
      preserved_elements = extract_preserved_elements(content)
      content_with_placeholders = replace_with_placeholders(content, preserved_elements)
      translated_content = ai_client.translate(content_with_placeholders)
      restore_preserved_elements(translated_content, preserved_elements)
    end

    # Collects every distinct, non-empty span matched by the preserve patterns.
    #
    # The whole match is recorded even when a pattern contains capture groups.
    #
    # @param content [String]
    # @return [Hash{String => String}] placeholder => original text
    def extract_preserved_elements(content)
      matched_texts = []
      config.preserve_patterns.each do |pattern|
        regex = Regexp.new(pattern, Regexp::MULTILINE)
        content.scan(regex) do
          # With capture groups, scan yields only the groups; the full match is in $~.
          whole_match = Regexp.last_match(0)
          # An empty match would later put a placeholder between every character.
          matched_texts << whole_match unless whole_match.empty?
        end
      end

      matched_texts.uniq.each_with_index.with_object({}) do |(text, index), preserved|
        preserved["#{PLACEHOLDER_PREFIX}#{index}"] = text
      end
    end

    # Replaces every occurrence of a preserved text with its placeholder.
    #
    # All texts are substituted in one pass and longer texts win over shorter
    # ones that they contain, so an image is kept whole rather than as "!" plus
    # a link, and a fenced block is not broken up by an inline-code match.
    #
    # @param content [String]
    # @param preserved_elements [Hash{String => String}] placeholder => original text
    # @return [String] a new string; `content` is not modified
    def replace_with_placeholders(content, preserved_elements)
      placeholder_for = preserved_elements.invert.reject { |text, _placeholder| text.empty? }
      content.gsub(longest_first_union(placeholder_for.keys), placeholder_for)
    end

    # Puts the original texts back in place of their placeholders.
    #
    # Uses the hash form of gsub so that backslashes in the original text are
    # inserted literally, and tries longer placeholders first so that
    # "PRESERVED_ELEMENT_1" never matches inside "PRESERVED_ELEMENT_10".
    #
    # @param translated_content [String]
    # @param preserved_elements [Hash{String => String}] placeholder => original text
    # @return [String] a new string
    def restore_preserved_elements(translated_content, preserved_elements)
      translated_content.gsub(longest_first_union(preserved_elements.keys), preserved_elements)
    end

    # Builds a regexp matching any of the given literal strings, preferring the
    # longest candidate at each position. An empty list yields a regexp that
    # never matches.
    #
    # @param strings [Array<String>]
    # @return [Regexp]
    def longest_first_union(strings)
      Regexp.union(strings.sort_by { |string| -string.length })
    end

    # Splits text into line-aligned chunks of at most `max_size` characters.
    #
    # A single line longer than `max_size` becomes a chunk of its own. Blank
    # lines at chunk edges and trailing whitespace are dropped, but the
    # indentation of a chunk's first line is kept because it is significant in
    # Markdown.
    #
    # @param content [String]
    # @param max_size [Integer] soft limit, in characters, per chunk
    # @return [Array<String>]
    def split_large_content(content, max_size = 3000)
      return [content] if content.length <= max_size

      chunks = []
      buffer = +''
      content.split("\n").each do |line|
        if !buffer.empty? && buffer.length + line.length + 1 > max_size
          push_chunk(chunks, buffer)
          buffer = +''
        end
        buffer << line << "\n"
      end
      push_chunk(chunks, buffer)
      chunks
    end

    # Appends the trimmed buffer to `chunks` unless nothing but whitespace is left.
    def push_chunk(chunks, buffer)
      trimmed = buffer.sub(/\A(?:[ \t]*\n)+/, '').rstrip
      chunks << trimmed unless trimmed.empty?
    end

    # Translates text chunk by chunk and joins the results with blank lines.
    #
    # @param content [String]
    # @return [String]
    def translate_large_content(content)
      chunks = split_large_content(content)
      return ai_client.translate(content) if chunks.length == 1

      logger.info "Splitting large content into #{chunks.length} chunks"
      last_index = chunks.length - 1
      translated_chunks = chunks.each_with_index.map do |chunk, index|
        logger.debug "Translating chunk #{index + 1}/#{chunks.length}"
        translated = ai_client.translate(chunk)
        # Pause between chunks, but not after the final one.
        sleep(config.request_interval) if config.request_interval.positive? && index < last_index
        translated
      end
      translated_chunks.join("\n\n")
    end

    # Very rough language guess based on a handful of indicator words.
    #
    # English indicators must appear as whole words; Chinese indicators are
    # matched as substrings because Chinese text has no word separators.
    #
    # @param content [String]
    # @return [String] 'zh', 'en', or `config.source_language` when nothing matches
    def detect_language(content)
      words = content.downcase.scan(/[a-z]+/).uniq
      english_score = (ENGLISH_INDICATORS & words).size
      chinese_score = CHINESE_INDICATORS.count { |indicator| content.include?(indicator) }

      if chinese_score > english_score
        'zh'
      elsif english_score.positive?
        'en'
      else
        config.source_language
      end
    end

    # Decides whether the content is worth sending for translation.
    #
    # @param content [String]
    # @return [Boolean] false when more than 80% of the text is fenced code or
    #   when fewer than 10 non-blank-edge characters remain
    def should_translate_content?(content)
      code_length = content.scan(CODE_BLOCK_PATTERN).sum(&:length)
      if code_length > content.length * 0.8
        logger.debug 'Skipping translation: content is mostly code blocks'
        return false
      end

      if content.strip.length < 10
        logger.debug 'Skipping translation: content too short'
        return false
      end

      true
    end
  end
end

data/lib/llm_translate/version.rb ADDED Viewed

@@ -0,0 +1,5 @@
+# frozen_string_literal: true
module LlmTranslate
  # Release number of the gem. The gemspec assigns this value to `spec.version`.
  VERSION = '0.1.0'
end

data/lib/llm_translate.rb ADDED Viewed

@@ -0,0 +1,16 @@
+# frozen_string_literal: true
+require_relative 'llm_translate/version'
+require_relative 'llm_translate/cli'
+require_relative 'llm_translate/config'
+require_relative 'llm_translate/file_finder'
+require_relative 'llm_translate/translator_engine'
+require_relative 'llm_translate/ai_client'
+require_relative 'llm_translate/logger'
module LlmTranslate
  # Root of the gem's exception hierarchy; rescue this to catch any error
  # class declared below.
  class Error < StandardError
  end

  # Presumably raised for invalid or incomplete configuration - confirm in config.rb.
  class ConfigurationError < Error
  end

  # Raised by TranslatorEngine#translate_content when translating fails.
  class TranslationError < Error
  end

  # Raised by TranslatorEngine when an input file cannot be read or an output
  # file cannot be written.
  class FileError < Error
  end
end

data/llm_translate.gemspec ADDED Viewed

@@ -0,0 +1,41 @@
+# frozen_string_literal: true
+require_relative 'lib/llm_translate/version'
Gem::Specification.new do |spec|
  # Identity
  spec.name = 'llm_translate'
  spec.version = LlmTranslate::VERSION
  spec.authors = ['LlmTranslate Team']
  spec.email = ['tianlu1677@gmail.com']

  # Description
  spec.summary = 'AI-powered Markdown translator'
  spec.description = 'A Ruby gem for translating Markdown files using AI while preserving formatting'
  spec.homepage = 'https://github.com/tianlu1677/llm_translate'
  spec.license = 'MIT'
  spec.required_ruby_version = '>= 3.1.0'

  # Registry metadata; the source and changelog links are derived from the homepage.
  spec.metadata['allowed_push_host'] = 'https://rubygems.org'
  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = spec.homepage
  spec.metadata['changelog_uri'] = "#{spec.homepage}/blob/main/CHANGELOG.md"

  # Package every git-tracked file except this gemspec and paths that start
  # with one of the prefixes below.
  excluded_prefixes = %w[bin/ test/ spec/ features/ .git .circleci appveyor Gemfile]
  spec.files = Dir.chdir(__dir__) do
    tracked_paths = `git ls-files -z`.split("\x0")
    tracked_paths.reject do |path|
      File.expand_path(path) == __FILE__ || path.start_with?(*excluded_prefixes)
    end
  end

  # Executables are the basenames of the packaged files under exe/.
  spec.bindir = 'exe'
  exe_paths = spec.files.grep(%r{\Aexe/})
  spec.executables = exe_paths.map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Runtime dependencies
  spec.add_dependency 'ruby_llm', '~> 1.6'
  spec.add_dependency 'thor', '~> 1.3'

  # Development dependencies
  spec.add_development_dependency 'rspec', '~> 3.12'
  spec.add_development_dependency 'rubocop', '~> 1.50'
  spec.add_development_dependency 'rubocop-rspec', '~> 2.20'
end

data/llm_translate.yml ADDED Viewed

@@ -0,0 +1,189 @@
+# llm_translate.yml - 翻译工具配置文件
+# AI 模型配置
+ai:
+  # API 密钥
+  api_key: xxxx
+  # API 主机地址
+  host: https://aihubmix.com
+  # 模型提供商
+  provider: "claude"
+  # 模型名称
+  model: "claude-3-7-sonnet-20250219"
+  # 模型参数
+  temperature: 0.3
+  max_tokens: 4000
+  top_p: 1.0
+  # 请求重试配置
+  retry_attempts: 3
+  retry_delay: 2  # 秒
+  # 请求超时时间
+  timeout: 60  # 秒
+# 翻译配置
+translation:
+  # 默认翻译 prompt
+  default_prompt: |
+    请将以下 Markdown 内容翻译为中文，保持所有格式不变：
+    - 保留代码块、链接、图片等 Markdown 语法
+    - 保留英文的专业术语和产品名称
+    - 确保翻译自然流畅
+    内容：
+    {content}
+  # 目标语言
+  target_language: "zh-CN"
+  # 源语言（auto 为自动检测）
+  source_language: "auto"
+  # 是否保留原文格式
+  preserve_formatting: true
+  # 是否翻译代码注释
+  translate_code_comments: false
+  # 需要保留不翻译的内容模式
+  preserve_patterns:
+    - "```[\\s\\S]*?```"  # 代码块
+    - "`[^`]+`"            # 行内代码
+    - "\\[.*?\\]\\(.*?\\)" # 链接
+    - "!\\[.*?\\]\\(.*?\\)" # 图片
+# 文件处理配置
+files:
+  # 输入目录
+  input_directory: "./docs"
+  # 输出目录
+  output_directory: "./docs-translated"
+  # 输入文件
+  input_file: "./README.md"
+  # 输出文件
+  output_file: "./README.zh.md"
+  # 文件名后缀策略
+  filename_strategy: "suffix"  # suffix, replace, directory
+  filename_suffix: ".zh"       # 仅当 strategy 为 suffix 时使用
+  # 包含的文件模式
+  include_patterns:
+    - "**/*.md"
+    - "**/*.markdown"
+  # 排除的文件模式
+  exclude_patterns:
+    - "**/node_modules/**"
+    - "**/.*"
+    - "**/*.tmp"
+    - "**/README.md"  # 示例：排除 README 文件
+  # 是否保持目录结构
+  preserve_directory_structure: true
+  # 文件覆盖策略
+  overwrite_policy: "ask"  # ask, overwrite, skip, backup
+  # 备份目录（当 overwrite_policy 为 backup 时）
+  backup_directory: "./backups"
+# 日志配置
+logging:
+  # 日志级别
+  level: "info"  # debug, info, warn, error
+  # 日志输出位置
+  output: "console"  # console, file, both
+  # 日志文件路径（当 output 包含 file 时）
+  file_path: "./logs/translator.log"
+  # 是否记录详细的翻译过程
+  verbose_translation: false
+  # 错误日志文件
+  error_log_path: "./logs/errors.log"
+# 错误处理配置
+error_handling:
+  # 遇到错误时的行为
+  on_error: "log_and_continue"  # stop, log_and_continue, skip_file
+  # 最大连续错误数（超过则停止）
+  max_consecutive_errors: 5
+  # 错误重试次数
+  retry_on_failure: 2
+  # 生成错误报告
+  generate_error_report: true
+  error_report_path: "./logs/error_report.md"
+# 性能配置
+performance:
+  # 并发处理文件数
+  concurrent_files: 3
+  # 批处理大小（同时翻译的文件数）
+  batch_size: 5
+  # 请求间隔（避免 API 限流）
+  request_interval: 1  # 秒
+  # 内存使用限制
+  max_memory_mb: 500
+# 输出配置
+output:
+  # 是否显示进度条
+  show_progress: true
+  # 是否显示翻译统计
+  show_statistics: true
+  # 是否生成翻译报告
+  generate_report: true
+  report_path: "./reports/translation_report.md"
+  # 输出格式
+  format: "markdown"  # markdown, json, yaml
+  # 是否保留元数据
+  include_metadata: true
+# 预设配置（可通过 --preset 参数使用）
+presets:
+  chinese:
+    translation:
+      target_language: "zh-CN"
+      default_prompt: "翻译为简体中文，保持技术术语的准确性"
+  japanese:
+    translation:
+      target_language: "ja"
+      default_prompt: "日本語に翻訳してください。技術用語は正確に保ってください"
+  english:
+    translation:
+      target_language: "en"
+      default_prompt: "Translate to English, maintaining technical accuracy"
+# 自定义 Hook（高级功能）
+hooks:
+  # 翻译前处理
+  pre_translation: null
+  # 翻译后处理
+  post_translation: null
+  # 文件处理完成后
+  post_file_processing: null

data/test_config.yml ADDED Viewed

@@ -0,0 +1,52 @@
+# Test llm_translate configuration
+ai:
+  api_key: ${LLM_TRANSLATE_API_KEY}
+  provider: "openai"
+  model: "gpt-4"
+  temperature: 0.3
+  max_tokens: 4000
+  retry_attempts: 3
+  retry_delay: 2
+  timeout: 60
+translation:
+  target_language: "zh-CN"
+  default_prompt: |
+    Please translate the following Markdown content to Chinese, keeping all formatting intact:
+    - Preserve code blocks, links, images, and other Markdown syntax
+    - Keep English technical terms and product names
+    - Ensure natural and fluent translation
+    Content:
+    {content}
+files:
+  input_directory: "./test_docs"
+  output_directory: "./test_docs_translated"
+  filename_suffix: ".zh"
+  include_patterns:
+    - "**/*.md"
+    - "**/*.markdown"
+  exclude_patterns: []
+  preserve_directory_structure: true
+  overwrite_policy: "overwrite"
+logging:
+  level: "info"
+  output: "console"
+  verbose_translation: true
+error_handling:
+  on_error: "log_and_continue"
+  max_consecutive_errors: 5
+  retry_on_failure: 2
+  generate_error_report: true
+performance:
+  concurrent_files: 1
+  request_interval: 1
+output:
+  show_progress: true
+  show_statistics: true
+  generate_report: true

data/test_docs/sample.md ADDED Viewed

@@ -0,0 +1,22 @@
+# Sample Document
+This is a sample markdown document for testing the llm_translate.
+## Features
+- **Bold text** and *italic text*
+- Code blocks and `inline code`
+- [Links](https://example.com)
+- Images: ![Sample](https://example.com/image.png)
+## Code Example
+```ruby
+def hello_world
+  puts "Hello, World!"
+end
+```
+## Conclusion
+This document demonstrates various Markdown features that should be preserved during translation.

data/test_docs_translated/sample.zh.md ADDED Viewed

@@ -0,0 +1,22 @@
+# 示例文档
+这是一个用于测试翻译器的示例Markdown文档。
+## 特性
+- **粗体文本** 和 *斜体文本*
+- 代码块和 `inline code`
+- [Links](https://example.com)
+- 图片: ![Sample](https://example.com/image.png)
+## 代码示例
+```ruby
+def hello_world
+  puts "Hello, World!"
+end
+```
+## 结论
+本文档演示了在翻译过程中应保留的各种Markdown特性。