llm_translate 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,233 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+ require 'fileutils'
5
+
6
+ module LlmTranslate
7
+ class TranslatorEngine
8
+ attr_reader :config, :logger, :ai_client, :file_finder
9
+
10
# Builds an engine around the supplied collaborators and creates the
# FileFinder it uses to resolve output paths.
#
# @param config configuration object (also handed to the FileFinder)
# @param logger logger used by the engine (also handed to the FileFinder)
# @param ai_client object whose #translate performs the translation
def initialize(config, logger, ai_client)
  @ai_client = ai_client
  @logger = logger
  @config = config
  @file_finder = FileFinder.new(@config, @logger)
end
16
+
17
# Translates a single input file and writes the result to its output path.
#
# The input is read first; the output path is then taken from the config
# (single-file mode) or from the file finder (directory mode). In directory
# mode the file finder may ask to skip the file, in which case nothing is
# translated or written.
#
# @param input_path path of the file to translate
# @return [nil] when the file is skipped; otherwise the result of the
#   trailing rate-limit `sleep` (or nil when no interval is configured)
def translate_file(input_path)
  logger.log_translation_start(input_path)
  source_text = read_file_content(input_path)

  if config.single_file_mode?
    destination = config.output_file
    target_dir = File.dirname(destination)
    FileUtils.mkdir_p(target_dir) unless Dir.exist?(target_dir)
  else
    destination = file_finder.output_path_for(input_path).to_s

    if file_finder.should_skip_file?(input_path, destination)
      logger.info "Skipping file: #{input_path}"
      return
    end

    file_finder.ensure_output_directory(destination)
  end

  translated_text = translate_content(source_text, input_path)
  write_translated_file(destination, translated_text)
  logger.log_translation_complete(input_path, destination)

  # Throttle consecutive requests when an interval is configured
  pause = config.request_interval
  sleep(pause) if pause.positive?
end
55
+
56
# Translates a piece of text, optionally protecting formatting elements.
#
# @param content text to translate
# @param file_path optional source path, used only in the error message
# @return the translated text
# @raise [TranslationError] wrapping any StandardError raised while translating
def translate_content(content, file_path = nil)
  return translate_with_format_preservation(content) if config.preserve_formatting?

  ai_client.translate(content)
rescue StandardError => e
  origin = file_path ? " from #{file_path}" : ''
  raise TranslationError, "Failed to translate content#{origin}: #{e.message}"
end
65
+
66
+ private
67
+
68
# Reads an input file as UTF-8 text.
#
# @param file_path path of the file to read
# @return [String] the file content (an empty file only logs a warning)
# @raise [FileError] if the file does not exist, is not readable, or does
#   not contain valid UTF-8
def read_file_content(file_path)
  raise FileError, "Input file does not exist: #{file_path}" unless File.exist?(file_path)
  raise FileError, "Input file is not readable: #{file_path}" unless File.readable?(file_path)

  content = File.read(file_path, encoding: 'UTF-8')

  # File.read only tags the string as UTF-8 and does not validate the bytes,
  # so malformed input has to be detected explicitly.
  unless content.valid_encoding?
    raise FileError, "Invalid encoding in file #{file_path}: content is not valid UTF-8"
  end

  logger.warn "Input file is empty: #{file_path}" if content.empty?

  content
rescue Encoding::InvalidByteSequenceError => e
  # Kept for the case where reading involves transcoding (e.g. a default
  # internal encoding is set), which may raise while reading.
  raise FileError, "Invalid encoding in file #{file_path}: #{e.message}"
end
81
+
82
# Writes translated text to disk as UTF-8 and logs the destination.
#
# @param output_path path of the file to (over)write
# @param content text to write
# @raise [FileError] wrapping any StandardError raised while writing or logging
def write_translated_file(output_path, content)
  File.write(output_path, content, encoding: 'UTF-8')
  logger.debug("Written translated content to: #{output_path}")
rescue StandardError => e
  failure = "Failed to write output file #{output_path}: #{e.message}"
  raise FileError, failure
end
88
+
89
# Translates text while shielding configured elements from the AI client:
# matching spans are swapped for placeholders before translation and put
# back afterwards.
#
# @param content text to translate
# @return the translated text with the preserved spans restored
def translate_with_format_preservation(content)
  protected_spans = extract_preserved_elements(content)
  masked_text = replace_with_placeholders(content, protected_spans)
  translated_text = ai_client.translate(masked_text)

  restore_preserved_elements(translated_text, protected_spans)
end
102
+
103
# Collects every span of +content+ that matches one of the configured
# preserve patterns.
#
# Each pattern from `config.preserve_patterns` is compiled with
# Regexp::MULTILINE. The whole matched text is stored (even when the pattern
# contains capture groups), empty matches are ignored, and text that was
# already collected is not given a second placeholder.
#
# @param content [String] text to scan
# @return [Hash{String => String}] placeholder ("PRESERVED_ELEMENT_<n>")
#   mapped to the original text, in order of discovery
def extract_preserved_elements(content)
  preserved = {}
  seen = {}

  config.preserve_patterns.each do |pattern|
    regex = Regexp.new(pattern, Regexp::MULTILINE)

    content.scan(regex) do
      # With capture groups, scan yields only the captures; the full match
      # is what must be shielded from translation.
      match_text = Regexp.last_match(0)
      next if match_text.empty? || seen.key?(match_text)

      seen[match_text] = true
      preserved["PRESERVED_ELEMENT_#{preserved.size}"] = match_text
    end
  end

  preserved
end
121
+
122
# Replaces every occurrence of each preserved text with its placeholder.
#
# Elements are applied in the hash's iteration order. Entries whose original
# text is nil or empty are skipped: an empty pattern would match between
# every character and nil cannot be matched at all.
#
# @param content [String] original text (not modified)
# @param preserved_elements [Hash{String => String}] placeholder => original text
# @return [String] a new string with the preserved spans replaced
def replace_with_placeholders(content, preserved_elements)
  preserved_elements.reduce(content.dup) do |result, (placeholder, original_text)|
    next result if original_text.nil? || original_text.empty?

    # A String pattern is matched literally, so no regex escaping is needed;
    # the block form inserts the placeholder verbatim.
    result.gsub(original_text) { placeholder }
  end
end
133
+
134
# Puts the preserved original texts back in place of their placeholders.
#
# All placeholders are substituted in a single pass. Longer placeholders are
# tried first so that "PRESERVED_ELEMENT_10" is never consumed as
# "PRESERVED_ELEMENT_1" followed by a stray "0".
#
# @param translated_content [String] translated text containing placeholders
# @param preserved_elements [Hash{String => String}] placeholder => original text
# @return [String] a new string with every known placeholder restored
def restore_preserved_elements(translated_content, preserved_elements)
  return translated_content.dup if preserved_elements.empty?

  placeholders = preserved_elements.keys.sort_by { |key| -key.length }
  matcher = Regexp.union(placeholders)

  # The block form inserts the original text verbatim; a replacement String
  # would have its backslash sequences (\\, \0, \1, ...) interpreted by gsub.
  translated_content.gsub(matcher) { |placeholder| preserved_elements[placeholder].to_s }
end
143
+
144
+ # Additional helper methods for handling special cases
145
+
146
# Splits text into chunks of at most +max_size+ characters, breaking only at
# line boundaries.
#
# Content that already fits is returned as a single-element array. A single
# line longer than +max_size+ is not broken up and becomes an oversized
# chunk. Every chunk is stripped of leading and trailing whitespace.
#
# @param content [String] text to split
# @param max_size [Integer] character budget per chunk
# @return [Array<String>] the chunks, in order
def split_large_content(content, max_size = 3000)
  return [content] if content.length <= max_size

  pieces = []
  buffer = []
  buffer_length = 0

  content.split("\n").each do |row|
    row_length = row.length + 1 # every buffered line is counted with its newline

    # Flush before adding a line that would push a non-empty buffer over budget
    if !buffer.empty? && buffer_length + row_length > max_size
      pieces << buffer.join("\n").strip
      buffer = []
      buffer_length = 0
    end

    buffer << row
    buffer_length += row_length
  end

  tail = buffer.join("\n").strip
  pieces << tail unless tail.empty?

  pieces
end
169
+
170
# Translates text chunk by chunk when it is too large for a single request.
#
# When splitting yields exactly one chunk the original content is sent as
# is. Otherwise each chunk is translated separately, pausing for the
# configured request interval between chunks, and the translated chunks are
# joined with a blank line.
#
# @param content text to translate
# @return the translated text
def translate_large_content(content)
  pieces = split_large_content(content)
  return ai_client.translate(content) if pieces.length == 1

  total = pieces.length
  logger.info "Splitting large content into #{total} chunks"

  results = pieces.each_with_index.map do |piece, position|
    logger.debug "Translating chunk #{position + 1}/#{total}"
    output = ai_client.translate(piece)

    # Pause between requests, but not after the final chunk
    sleep(config.request_interval) if config.request_interval.positive? && position < total - 1

    output
  end

  results.join("\n\n")
end
190
+
191
# Guesses whether text is English or Chinese from a few indicator words.
#
# This is a crude heuristic. English indicators are matched as whole words,
# so "or" is not found inside "motor". Chinese indicators are matched as
# substrings because Chinese text has no word separators.
#
# @param content [String] text to inspect
# @return 'zh' when more Chinese than English indicators are present, 'en'
#   when at least one English indicator is present, otherwise
#   `config.source_language`
def detect_language(content)
  english_indicators = %w[the and or but with from this that these those]
  chinese_indicators = %w[的 在 是 和 或者 但是 这 那]

  # Tokenise once instead of downcasing the content for every indicator
  english_words = content.downcase.scan(/[a-z]+/).uniq
  english_score = (english_indicators & english_words).size
  chinese_score = chinese_indicators.count { |word| content.include?(word) }

  if chinese_score > english_score
    'zh'
  elsif english_score.positive?
    'en'
  else
    config.source_language
  end
end
210
+
211
# Decides whether a piece of text is worth sending for translation.
#
# Returns false, logging the reason at debug level, when fenced code blocks
# make up more than 80% of the text or when the stripped text is shorter
# than 10 characters. No target-language check is performed here.
#
# @param content [String] text to inspect
# @return [Boolean]
def should_translate_content?(content)
  fenced_length = content.scan(/```[\s\S]*?```/m).sum(&:length)

  if fenced_length > content.length * 0.8
    logger.debug 'Skipping translation: content is mostly code blocks'
    false
  elsif content.strip.length < 10
    logger.debug 'Skipping translation: content too short'
    false
  else
    true
  end
end
232
+ end
233
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmTranslate
  # Release version of the gem; the gemspec reads this constant.
  VERSION = '0.1.0'
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'llm_translate/version'
4
+ require_relative 'llm_translate/cli'
5
+ require_relative 'llm_translate/config'
6
+ require_relative 'llm_translate/file_finder'
7
+ require_relative 'llm_translate/translator_engine'
8
+ require_relative 'llm_translate/ai_client'
9
+ require_relative 'llm_translate/logger'
10
+
11
module LlmTranslate
  # Base class for the gem's own errors, so callers can rescue them together.
  class Error < StandardError
  end

  # NOTE(review): presumably raised for invalid configuration; it is not
  # raised anywhere in the translator engine.
  class ConfigurationError < Error
  end

  # Raised by the translator engine when translating content fails.
  class TranslationError < Error
  end

  # Raised by the translator engine when an input file cannot be read or an
  # output file cannot be written.
  class FileError < Error
  end
end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'lib/llm_translate/version'
4
+
5
# Gem specification for llm_translate.
Gem::Specification.new do |gem|
  homepage = 'https://github.com/tianlu1677/llm_translate'

  gem.name = 'llm_translate'
  gem.version = LlmTranslate::VERSION
  gem.authors = ['LlmTranslate Team']
  gem.email = ['tianlu1677@gmail.com']

  gem.summary = 'AI-powered Markdown translator'
  gem.description = 'A Ruby gem for translating Markdown files using AI while preserving formatting'
  gem.homepage = homepage
  gem.license = 'MIT'
  gem.required_ruby_version = '>= 3.1.0'

  gem.metadata['allowed_push_host'] = 'https://rubygems.org'
  gem.metadata['homepage_uri'] = homepage
  gem.metadata['source_code_uri'] = homepage
  gem.metadata['changelog_uri'] = "#{homepage}/blob/main/CHANGELOG.md"

  # Package every git-tracked file except this gemspec itself and paths that
  # start with one of the excluded prefixes.
  excluded_prefixes = %w[bin/ test/ spec/ features/ .git .circleci appveyor Gemfile]
  gem.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |path|
      File.expand_path(path) == __FILE__ || path.start_with?(*excluded_prefixes)
    end
  end
  gem.bindir = 'exe'
  gem.executables = gem.files.grep(%r{\Aexe/}) { |path| File.basename(path) }
  gem.require_paths = ['lib']

  # Runtime dependencies
  gem.add_dependency 'ruby_llm', '~> 1.6'
  gem.add_dependency 'thor', '~> 1.3'

  # Development dependencies
  gem.add_development_dependency 'rspec', '~> 3.12'
  gem.add_development_dependency 'rubocop', '~> 1.50'
  gem.add_development_dependency 'rubocop-rspec', '~> 2.20'
end
data/llm_translate.yml ADDED
@@ -0,0 +1,189 @@
1
+ # llm_translate.yml - 翻译工具配置文件
2
+
3
+ # AI 模型配置
4
+ ai:
5
+ # API 密钥
6
+ api_key: xxxx
7
+
8
+ # API 主机地址
9
+ host: https://aihubmix.com
10
+
11
+ # 模型提供商
12
+ provider: "claude"
13
+
14
+ # 模型名称
15
+ model: "claude-3-7-sonnet-20250219"
16
+
17
+ # 模型参数
18
+ temperature: 0.3
19
+ max_tokens: 4000
20
+ top_p: 1.0
21
+
22
+ # 请求重试配置
23
+ retry_attempts: 3
24
+ retry_delay: 2 # 秒
25
+
26
+ # 请求超时时间
27
+ timeout: 60 # 秒
28
+
29
+ # 翻译配置
30
+ translation:
31
+ # 默认翻译 prompt
32
+ default_prompt: |
33
+ 请将以下 Markdown 内容翻译为中文,保持所有格式不变:
34
+ - 保留代码块、链接、图片等 Markdown 语法
35
+ - 保留英文的专业术语和产品名称
36
+ - 确保翻译自然流畅
37
+
38
+ 内容:
39
+ {content}
40
+
41
+ # 目标语言
42
+ target_language: "zh-CN"
43
+
44
+ # 源语言(auto 为自动检测)
45
+ source_language: "auto"
46
+
47
+ # 是否保留原文格式
48
+ preserve_formatting: true
49
+
50
+ # 是否翻译代码注释
51
+ translate_code_comments: false
52
+
53
+ # 需要保留不翻译的内容模式
54
+ preserve_patterns:
55
+ - "```[\\s\\S]*?```" # 代码块
56
+ - "`[^`]+`" # 行内代码
57
+ - "\\[.*?\\]\\(.*?\\)" # 链接
58
+ - "!\\[.*?\\]\\(.*?\\)" # 图片
59
+
60
+ # 文件处理配置
61
+ files:
62
+ # 输入目录
63
+ input_directory: "./docs"
64
+
65
+ # 输出目录
66
+ output_directory: "./docs-translated"
67
+
68
+ # 输入文件
69
+ input_file: "./README.md"
70
+
71
+ # 输出文件
72
+ output_file: "./README.zh.md"
73
+
74
+ # 文件名后缀策略
75
+ filename_strategy: "suffix" # suffix, replace, directory
76
+ filename_suffix: ".zh" # 仅当 strategy 为 suffix 时使用
77
+
78
+ # 包含的文件模式
79
+ include_patterns:
80
+ - "**/*.md"
81
+ - "**/*.markdown"
82
+
83
+ # 排除的文件模式
84
+ exclude_patterns:
85
+ - "**/node_modules/**"
86
+ - "**/.*"
87
+ - "**/*.tmp"
88
+ - "**/README.md" # 示例:排除 README 文件
89
+
90
+ # 是否保持目录结构
91
+ preserve_directory_structure: true
92
+
93
+ # 文件覆盖策略
94
+ overwrite_policy: "ask" # ask, overwrite, skip, backup
95
+
96
+ # 备份目录(当 overwrite_policy 为 backup 时)
97
+ backup_directory: "./backups"
98
+
99
+ # 日志配置
100
+ logging:
101
+ # 日志级别
102
+ level: "info" # debug, info, warn, error
103
+
104
+ # 日志输出位置
105
+ output: "console" # console, file, both
106
+
107
+ # 日志文件路径(当 output 包含 file 时)
108
+ file_path: "./logs/translator.log"
109
+
110
+ # 是否记录详细的翻译过程
111
+ verbose_translation: false
112
+
113
+ # 错误日志文件
114
+ error_log_path: "./logs/errors.log"
115
+
116
+ # 错误处理配置
117
+ error_handling:
118
+ # 遇到错误时的行为
119
+ on_error: "log_and_continue" # stop, log_and_continue, skip_file
120
+
121
+ # 最大连续错误数(超过则停止)
122
+ max_consecutive_errors: 5
123
+
124
+ # 错误重试次数
125
+ retry_on_failure: 2
126
+
127
+ # 生成错误报告
128
+ generate_error_report: true
129
+ error_report_path: "./logs/error_report.md"
130
+
131
+ # 性能配置
132
+ performance:
133
+ # 并发处理文件数
134
+ concurrent_files: 3
135
+
136
+ # 批处理大小(同时翻译的文件数)
137
+ batch_size: 5
138
+
139
+ # 请求间隔(避免 API 限流)
140
+ request_interval: 1 # 秒
141
+
142
+ # 内存使用限制
143
+ max_memory_mb: 500
144
+
145
+ # 输出配置
146
+ output:
147
+ # 是否显示进度条
148
+ show_progress: true
149
+
150
+ # 是否显示翻译统计
151
+ show_statistics: true
152
+
153
+ # 是否生成翻译报告
154
+ generate_report: true
155
+ report_path: "./reports/translation_report.md"
156
+
157
+ # 输出格式
158
+ format: "markdown" # markdown, json, yaml
159
+
160
+ # 是否保留元数据
161
+ include_metadata: true
162
+
163
+ # 预设配置(可通过 --preset 参数使用)
164
+ presets:
165
+ chinese:
166
+ translation:
167
+ target_language: "zh-CN"
168
+ default_prompt: "翻译为简体中文,保持技术术语的准确性"
169
+
170
+ japanese:
171
+ translation:
172
+ target_language: "ja"
173
+ default_prompt: "日本語に翻訳してください。技術用語は正確に保ってください"
174
+
175
+ english:
176
+ translation:
177
+ target_language: "en"
178
+ default_prompt: "Translate to English, maintaining technical accuracy"
179
+
180
+ # 自定义 Hook(高级功能)
181
+ hooks:
182
+ # 翻译前处理
183
+ pre_translation: null
184
+
185
+ # 翻译后处理
186
+ post_translation: null
187
+
188
+ # 文件处理完成后
189
+ post_file_processing: null
data/test_config.yml ADDED
@@ -0,0 +1,52 @@
1
+ # Test llm_translate configuration
2
+ ai:
3
+ api_key: ${LLM_TRANSLATE_API_KEY}
4
+ provider: "openai"
5
+ model: "gpt-4"
6
+ temperature: 0.3
7
+ max_tokens: 4000
8
+ retry_attempts: 3
9
+ retry_delay: 2
10
+ timeout: 60
11
+
12
+ translation:
13
+ target_language: "zh-CN"
14
+ default_prompt: |
15
+ Please translate the following Markdown content to Chinese, keeping all formatting intact:
16
+ - Preserve code blocks, links, images, and other Markdown syntax
17
+ - Keep English technical terms and product names
18
+ - Ensure natural and fluent translation
19
+
20
+ Content:
21
+ {content}
22
+
23
+ files:
24
+ input_directory: "./test_docs"
25
+ output_directory: "./test_docs_translated"
26
+ filename_suffix: ".zh"
27
+ include_patterns:
28
+ - "**/*.md"
29
+ - "**/*.markdown"
30
+ exclude_patterns: []
31
+ preserve_directory_structure: true
32
+ overwrite_policy: "overwrite"
33
+
34
+ logging:
35
+ level: "info"
36
+ output: "console"
37
+ verbose_translation: true
38
+
39
+ error_handling:
40
+ on_error: "log_and_continue"
41
+ max_consecutive_errors: 5
42
+ retry_on_failure: 2
43
+ generate_error_report: true
44
+
45
+ performance:
46
+ concurrent_files: 1
47
+ request_interval: 1
48
+
49
+ output:
50
+ show_progress: true
51
+ show_statistics: true
52
+ generate_report: true
@@ -0,0 +1,22 @@
1
+ # Sample Document
2
+
3
+ This is a sample markdown document for testing the llm_translate.
4
+
5
+ ## Features
6
+
7
+ - **Bold text** and *italic text*
8
+ - Code blocks and `inline code`
9
+ - [Links](https://example.com)
10
+ - Images: ![Sample](https://example.com/image.png)
11
+
12
+ ## Code Example
13
+
14
+ ```ruby
15
+ def hello_world
16
+ puts "Hello, World!"
17
+ end
18
+ ```
19
+
20
+ ## Conclusion
21
+
22
+ This document demonstrates various Markdown features that should be preserved during translation.
@@ -0,0 +1,22 @@
1
+ # 示例文档
2
+
3
+ 这是一个用于测试翻译器的示例Markdown文档。
4
+
5
+ ## 特性
6
+
7
+ - **粗体文本** 和 *斜体文本*
8
+ - 代码块和 `inline code`
9
+ - [Links](https://example.com)
10
+ - 图片: ![Sample](https://example.com/image.png)
11
+
12
+ ## 代码示例
13
+
14
+ ```ruby
15
+ def hello_world
16
+ puts "Hello, World!"
17
+ end
18
+ ```
19
+
20
+ ## 结论
21
+
22
+ 本文档演示了在翻译过程中应保留的各种Markdown特性。