llm_translate 0.4.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,157 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmTranslate
  # Splits an oversized Markdown document into translation-sized chunks and
  # stitches the translated chunks back together.
  #
  # +config+ must respond to +max_chars_for_splitting+ (documents longer than
  # this are split) and +split_every_chars+ (upper bound aimed for per chunk).
  # +logger+ must respond to +info+.
  #
  # Split points are only placed in front of headings and in front of blocks
  # that follow a blank line, and never inside a fenced code block. Tables,
  # tight lists, block quotes and code blocks therefore stay contiguous unless
  # a single block is itself longer than +split_every_chars+.
  class DocumentSplitter
    # ATX heading such as "## Title".
    HEADING_PATTERN = /\A#+\s+/
    # Opening code fence (``` or ~~~, indented at most three spaces). A
    # backtick fence may not contain further backticks on the same line,
    # which rules out inline code such as ```foo```.
    FENCE_OPEN_PATTERN = /\A {0,3}(`{3,}(?!.*`)|~{3,})/
    # Blank lines at the very start of a block.
    LEADING_BLANK_LINES = /\A(?:[ \t]*\r?\n)+/
    # Separator placed between sections of a chunk and between merged chunks.
    CHUNK_SEPARATOR = "\n\n".freeze

    attr_reader :config, :logger

    # @param config [#max_chars_for_splitting, #split_every_chars]
    # @param logger [#info, nil] falls back to a Logger writing to $stdout
    def initialize(config, logger = nil)
      @config = config
      @logger = logger || Logger.new($stdout, level: :info)
    end

    # Splits a document into chunks.
    #
    # @param content [String] the Markdown source
    # @return [Array<String>] +[content]+ unchanged when the document is not
    #   longer than +config.max_chars_for_splitting+, otherwise the chunks in
    #   document order
    def split_document(content)
      return [content] unless should_split?(content)

      logger.info "Document size (#{content.length} chars) exceeds limit, splitting..."

      sections = extract_markdown_sections(content)
      chunks = build_chunks(sections)

      logger.info "Document split into #{chunks.length} chunks"
      chunks
    end

    # Joins translated chunks back into one document.
    #
    # @param translated_chunks [Array<String>] chunks in document order
    # @return [String] the single chunk untouched when there is only one,
    #   otherwise the chunks joined by a blank line, with runs of blank lines
    #   collapsed and exactly one trailing newline
    def merge_translated_chunks(translated_chunks)
      return translated_chunks.first if translated_chunks.length == 1

      logger.info "Merging #{translated_chunks.length} translated chunks..."

      clean_merged_content(translated_chunks.join(CHUNK_SEPARATOR))
    end

    private

    def should_split?(content)
      content.length > config.max_chars_for_splitting
    end

    # Cuts the document into sections: the smallest units that chunking may
    # separate. Each section is returned without leading blank lines and
    # without trailing whitespace; indentation of its first line is kept.
    def extract_markdown_sections(content)
      sections = []
      pending = []
      open_fence = nil
      after_blank = false

      content.split("\n").each do |line|
        if open_fence
          # Inside a code fence nothing is a boundary, whatever it looks like.
          pending << line
          open_fence = nil if closes_fence?(line, open_fence)
          after_blank = false
          next
        end

        if section_boundary?(line, after_blank) && text_present?(pending)
          sections << trim_block(pending.join("\n"))
          pending = []
        end

        pending << line
        open_fence = line[FENCE_OPEN_PATTERN, 1]
        after_blank = blank?(line)
      end

      sections << trim_block(pending.join("\n")) if text_present?(pending)
      sections
    end

    # A new section starts at a heading, or at any non-blank line that
    # follows a blank line.
    def section_boundary?(line, after_blank)
      return false if blank?(line)

      after_blank || line.match?(HEADING_PATTERN)
    end

    # A fence is closed by a line made only of the fence character, at least
    # as long as the opening run.
    def closes_fence?(line, fence)
      candidate = line.strip
      candidate.length >= fence.length && candidate.delete(fence[0]).empty?
    end

    # Packs sections into chunks whose joined length stays within
    # +config.split_every_chars+. A section longer than the limit is split
    # line by line into chunks of its own.
    def build_chunks(sections)
      limit = config.split_every_chars
      chunks = []
      pending = []
      pending_length = 0

      sections.each do |section|
        next if blank?(section)

        if section.length > limit
          chunks << pending.join(CHUNK_SEPARATOR) unless pending.empty?
          chunks.concat(force_split_section(section))
          pending = []
          pending_length = 0
          next
        end

        joined_length = section.length
        joined_length += pending_length + CHUNK_SEPARATOR.length unless pending.empty?

        if joined_length > limit
          # pending is never empty here: a lone section already fits the limit.
          chunks << pending.join(CHUNK_SEPARATOR)
          pending = [section]
          pending_length = section.length
        else
          pending << section
          pending_length = joined_length
        end
      end

      chunks << pending.join(CHUNK_SEPARATOR) unless pending.empty?
      chunks
    end

    # Splits one oversized section on line boundaries. A single line longer
    # than the limit is kept whole, so a chunk can still exceed the limit.
    def force_split_section(section)
      limit = config.split_every_chars
      chunks = []
      pending = []
      pending_length = 0

      section.split("\n").each do |line|
        line_length = line.length + 1 # +1 for the newline joining the lines

        if pending_length + line_length > limit && text_present?(pending)
          chunks << trim_block(pending.join("\n"))
          pending = []
          pending_length = 0
        end

        pending << line
        pending_length += line_length
      end

      chunks << trim_block(pending.join("\n")) if text_present?(pending)
      chunks
    end

    # Collapses three or more consecutive newlines into one blank line and
    # ends the document with exactly one newline.
    def clean_merged_content(content)
      cleaned = content.gsub(/\n{3,}/, "\n\n")

      "#{cleaned.strip}\n"
    end

    def blank?(text)
      text.strip.empty?
    end

    def text_present?(lines)
      lines.any? { |line| !blank?(line) }
    end

    # Drops leading blank lines and trailing whitespace while keeping the
    # indentation of the first non-blank line.
    def trim_block(text)
      text.sub(LEADING_BLANK_LINES, '').rstrip
    end
  end
end
@@ -3,16 +3,18 @@
3
3
  require 'pathname'
4
4
  require 'fileutils'
5
5
  require 'async'
6
+ require_relative 'document_splitter'
6
7
 
7
8
  module LlmTranslate
8
9
  class TranslatorEngine
9
- attr_reader :config, :logger, :ai_client, :file_finder
10
+ attr_reader :config, :logger, :ai_client, :file_finder, :document_splitter
10
11
 
11
12
  def initialize(config, logger, ai_client)
12
13
  @config = config
13
14
  @logger = logger
14
15
  @ai_client = ai_client
15
16
  @file_finder = FileFinder.new(config, logger)
17
+ @document_splitter = DocumentSplitter.new(config, logger)
16
18
  end
17
19
 
18
20
  def translate_file(input_path)
@@ -115,7 +117,10 @@ module LlmTranslate
115
117
  end
116
118
 
117
119
  def translate_content(content, file_path = nil)
118
- if config.preserve_formatting?
120
+ # 检查是否需要启用文档拆分
121
+ if config.enable_document_splitting? && content.length > config.max_chars_for_splitting
122
+ translate_with_document_splitting(content, file_path)
123
+ elsif config.preserve_formatting?
119
124
  translate_with_format_preservation(content)
120
125
  else
121
126
  ai_client.translate(content)
@@ -151,5 +156,94 @@ module LlmTranslate
151
156
  # Translate the content with placeholders
152
157
  ai_client.translate(content)
153
158
  end
159
+
160
# Translates a large document by splitting it, translating every chunk and
# merging the results.
#
# @param content [String] the document to translate
# @param file_path [String, nil] only used to make the first log line more specific
# @return whatever +document_splitter.merge_translated_chunks+ returns for
#   the translated chunks
def translate_with_document_splitting(content, file_path = nil)
  origin = file_path ? " from #{file_path}" : ''
  logger.info "Document splitting enabled for large content#{origin}"

  # Split the document.
  pieces = document_splitter.split_document(content)
  logger.info "Translating #{pieces.length} chunks with #{config.concurrent_chunks} concurrent workers..."

  # Translate the chunks.
  translated_pieces = translate_chunks_concurrently(pieces)

  # Merge the translated chunks.
  logger.info 'Merging translated chunks...'
  document_splitter.merge_translated_chunks(translated_pieces)
end
175
+
176
# Translates chunks in batches of +config.concurrent_chunks+, running the
# chunks of one batch as concurrent Async tasks.
#
# Falls back to +translate_chunks_sequentially+ when +concurrent_chunks+ is
# 1 or less.
#
# @param chunks [Array<String>] chunks in document order
# @return [Array<String>] translations, position-for-position with +chunks+
# @raise [StandardError] the error of a failed chunk, after it was logged
def translate_chunks_concurrently(chunks)
  return translate_chunks_sequentially(chunks) if config.concurrent_chunks <= 1

  total = chunks.length
  translated_chunks = Array.new(total)

  # Pair every chunk with its position up front. Looking the position up
  # with Array#index would return the first match, so duplicated chunks
  # (repeated boilerplate, for example) would all land in the same slot.
  batches = chunks.each_with_index.each_slice(config.concurrent_chunks).to_a

  root_task = Async do |task|
    batches.each_with_index do |batch, batch_number|
      batch_tasks = batch.map do |chunk, chunk_index|
        task.async do
          logger.info "Translating chunk #{chunk_index + 1}/#{total} (#{chunk.length} chars)..."

          begin
            translated_chunk = if config.preserve_formatting?
                                 translate_with_format_preservation(chunk)
                               else
                                 ai_client.translate(chunk)
                               end

            # Store the translation at the chunk's own position.
            translated_chunks[chunk_index] = translated_chunk

            logger.info "✓ Completed chunk #{chunk_index + 1}/#{total}"
            translated_chunk
          rescue StandardError => e
            logger.error "✗ Failed to translate chunk #{chunk_index + 1}: #{e.message}"
            raise
          end
        end
      end

      # Wait for the whole batch; a failed chunk re-raises here.
      batch_tasks.each(&:wait)

      # Pause between batches, but not after the last one.
      last_batch = batch_number == batches.length - 1
      sleep(config.request_interval) if config.request_interval.positive? && !last_batch
    end
  end

  # Block until every batch is done and surface any failure to the caller.
  # Without this the method can return a partly filled array when it is
  # called from inside a running reactor.
  root_task.wait

  translated_chunks
end
222
+
223
# Translates the chunks one after another, pausing
# +config.request_interval+ seconds between requests (not after the last).
#
# @param chunks [Array<String>] chunks in document order
# @return [Array<String>] translations in the same order
# @raise [StandardError] the error of a failed chunk, after it was logged
def translate_chunks_sequentially(chunks)
  total = chunks.length
  pause = config.request_interval

  chunks.each_with_index.map do |chunk, index|
    logger.info "Translating chunk #{index + 1}/#{total} (#{chunk.length} chars)..."

    begin
      translated_chunk = if config.preserve_formatting?
                           translate_with_format_preservation(chunk)
                         else
                           ai_client.translate(chunk)
                         end

      # Delay between requests.
      sleep(pause) if pause.positive? && index < total - 1

      translated_chunk
    rescue StandardError => e
      logger.error "Failed to translate chunk #{index + 1}: #{e.message}"
      raise
    end
  end
end
154
248
  end
155
249
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module LlmTranslate
4
- VERSION = '0.4.0'
4
+ VERSION = '0.6.0'
5
5
  end
data/llm_translate.yml CHANGED
@@ -50,6 +50,18 @@ translation:
50
50
  # 是否翻译代码注释
51
51
  translate_code_comments: false
52
52
 
53
+ # 文档拆分配置
54
+ # 当文档字符数超过 max_chars 时,自动启用拆分功能
55
+ enable_splitting: true
56
+
57
+ # 触发拆分的最大字符数
58
+ max_chars: 20000
59
+
60
+ # 每个片段的目标字符数
61
+ every_chars: 18000
62
+
63
+ # 并发翻译的 chunk 数量
64
+ concurrent_chunks: 3
53
65
 
54
66
 
55
67
  # 文件处理配置
@@ -125,13 +137,13 @@ error_handling:
125
137
 
126
138
  # 性能配置
127
139
  performance:
128
- # 并发处理文件数
140
+ # 并发处理文件数(使用文档拆分时建议设为 1)
129
141
  concurrent_files: 3
130
142
 
131
143
  # 批处理大小(同时翻译的文件数)
132
144
  batch_size: 5
133
145
 
134
- # 请求间隔(避免 API 限流)
146
+ # 请求间隔(避免 API 限流,拆分文档时特别重要)
135
147
  request_interval: 1 # 秒
136
148
 
137
149
  # 内存使用限制
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llm_translate
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - LlmTranslate Team
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-09-01 00:00:00.000000000 Z
11
+ date: 2025-09-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: async
@@ -103,6 +103,8 @@ extensions: []
103
103
  extra_rdoc_files: []
104
104
  files:
105
105
  - ".rspec_status"
106
+ - CONCURRENT_CHUNKS_UPDATE.md
107
+ - DOCUMENT_SPLITTER_SUMMARY.md
106
108
  - README.md
107
109
  - README.zh.md
108
110
  - Rakefile
@@ -112,21 +114,20 @@ files:
112
114
  - content/prompt.md
113
115
  - content/todo.md
114
116
  - exe/llm_translate
117
+ - large_document_config.yml
115
118
  - lib/llm_translate.rb
116
119
  - lib/llm_translate/ai_client.rb
117
120
  - lib/llm_translate/cli.rb
118
121
  - lib/llm_translate/config.rb
122
+ - lib/llm_translate/document_splitter.rb
119
123
  - lib/llm_translate/file_finder.rb
120
124
  - lib/llm_translate/logger.rb
121
125
  - lib/llm_translate/translator_engine.rb
122
126
  - lib/llm_translate/version.rb
123
127
  - llm_translate.gemspec
124
128
  - llm_translate.yml
125
- - test_config.yml
126
129
  - test_docs/sample.md
127
130
  - test_docs_translated/sample.zh.md
128
- - test_llm_translate.yml
129
- - test_new_config.yml
130
131
  homepage: https://github.com/tianlu1677/llm_translate
131
132
  licenses:
132
133
  - MIT
data/test_config.yml DELETED
@@ -1,52 +0,0 @@
1
- # Test llm_translate configuration
2
- ai:
3
- api_key: ${LLM_TRANSLATE_API_KEY}
4
- provider: "openai"
5
- model: "gpt-4"
6
- temperature: 0.3
7
- max_tokens: 4000
8
- retry_attempts: 3
9
- retry_delay: 2
10
- timeout: 60
11
-
12
- translation:
13
- target_language: "zh-CN"
14
- default_prompt: |
15
- Please translate the following Markdown content to Chinese, keeping all formatting intact:
16
- - Preserve code blocks, links, images, and other Markdown syntax
17
- - Keep English technical terms and product names
18
- - Ensure natural and fluent translation
19
-
20
- Content:
21
- {content}
22
-
23
- files:
24
- input_directory: "./test_docs"
25
- output_directory: "./test_docs_translated"
26
- filename_suffix: ".zh"
27
- include_patterns:
28
- - "**/*.md"
29
- - "**/*.markdown"
30
- exclude_patterns: []
31
- preserve_directory_structure: true
32
- overwrite_policy: "overwrite"
33
-
34
- logging:
35
- level: "info"
36
- output: "console"
37
- verbose_translation: true
38
-
39
- error_handling:
40
- on_error: "log_and_continue"
41
- max_consecutive_errors: 5
42
- retry_on_failure: 2
43
- generate_error_report: true
44
-
45
- performance:
46
- concurrent_files: 1
47
- request_interval: 1
48
-
49
- output:
50
- show_progress: true
51
- show_statistics: true
52
- generate_report: true
@@ -1,176 +0,0 @@
1
- # translator.yml - 翻译工具配置文件
2
-
3
- # AI 模型配置
4
- ai:
5
- # API 密钥(建议使用环境变量 LLM_TRANSLATE_API_KEY)
6
- api_key: ${LLM_TRANSLATE_API_KEY}
7
-
8
- # 模型提供商(openai, anthropic, ollama 等)
9
- provider: "openai"
10
-
11
- # 模型名称
12
- model: "gpt-4"
13
-
14
- # 模型参数
15
- temperature: 0.3
16
- max_tokens: 4000
17
- top_p: 1.0
18
-
19
- # 请求重试配置
20
- retry_attempts: 3
21
- retry_delay: 2 # 秒
22
-
23
- # 请求超时时间
24
- timeout: 60 # 秒
25
-
26
- # 翻译配置
27
- translation:
28
- # 默认翻译 prompt
29
- default_prompt: |
30
- 请将以下 Markdown 内容翻译为中文,保持所有格式不变:
31
- - 保留代码块、链接、图片等 Markdown 语法
32
- - 保留英文的专业术语和产品名称
33
- - 确保翻译自然流畅
34
-
35
- 内容:
36
- {content}
37
-
38
- # 目标语言
39
- target_language: "zh-CN"
40
-
41
- # 源语言(auto 为自动检测)
42
- source_language: "auto"
43
-
44
- # 是否保留原文格式
45
- preserve_formatting: true
46
-
47
- # 是否翻译代码注释
48
- translate_code_comments: false
49
-
50
- # 需要保留不翻译的内容模式
51
-
52
-
53
- # 文件处理配置
54
- files:
55
- # 输入目录
56
- input_directory: "./docs"
57
-
58
- # 输出目录
59
- output_directory: "./docs-translated"
60
-
61
- # 文件名后缀策略
62
- filename_strategy: "suffix" # suffix, replace, directory
63
- filename_suffix: ".zh" # 仅当 strategy 为 suffix 时使用
64
-
65
- # 包含的文件模式
66
- include_patterns:
67
- - "**/*.md"
68
- - "**/*.markdown"
69
-
70
- # 排除的文件模式
71
- exclude_patterns:
72
- - "**/node_modules/**"
73
- - "**/.*"
74
- - "**/*.tmp"
75
- - "**/README.md" # 示例:排除 README 文件
76
-
77
- # 是否保持目录结构
78
- preserve_directory_structure: true
79
-
80
- # 文件覆盖策略
81
- overwrite_policy: "ask" # ask, overwrite, skip, backup
82
-
83
- # 备份目录(当 overwrite_policy 为 backup 时)
84
- backup_directory: "./backups"
85
-
86
- # 日志配置
87
- logging:
88
- # 日志级别
89
- level: "info" # debug, info, warn, error
90
-
91
- # 日志输出位置
92
- output: "console" # console, file, both
93
-
94
- # 日志文件路径(当 output 包含 file 时)
95
- file_path: "./logs/translator.log"
96
-
97
- # 是否记录详细的翻译过程
98
- verbose_translation: false
99
-
100
- # 错误日志文件
101
- error_log_path: "./logs/errors.log"
102
-
103
- # 错误处理配置
104
- error_handling:
105
- # 遇到错误时的行为
106
- on_error: "log_and_continue" # stop, log_and_continue, skip_file
107
-
108
- # 最大连续错误数(超过则停止)
109
- max_consecutive_errors: 5
110
-
111
- # 错误重试次数
112
- retry_on_failure: 2
113
-
114
- # 生成错误报告
115
- generate_error_report: true
116
- error_report_path: "./logs/error_report.md"
117
-
118
- # 性能配置
119
- performance:
120
- # 并发处理文件数
121
- concurrent_files: 3
122
-
123
- # 批处理大小(同时翻译的文件数)
124
- batch_size: 5
125
-
126
- # 请求间隔(避免 API 限流)
127
- request_interval: 1 # 秒
128
-
129
- # 内存使用限制
130
- max_memory_mb: 500
131
-
132
- # 输出配置
133
- output:
134
- # 是否显示进度条
135
- show_progress: true
136
-
137
- # 是否显示翻译统计
138
- show_statistics: true
139
-
140
- # 是否生成翻译报告
141
- generate_report: true
142
- report_path: "./reports/translation_report.md"
143
-
144
- # 输出格式
145
- format: "markdown" # markdown, json, yaml
146
-
147
- # 是否保留元数据
148
- include_metadata: true
149
-
150
- # 预设配置(可通过 --preset 参数使用)
151
- presets:
152
- chinese:
153
- translation:
154
- target_language: "zh-CN"
155
- default_prompt: "翻译为简体中文,保持技术术语的准确性"
156
-
157
- japanese:
158
- translation:
159
- target_language: "ja"
160
- default_prompt: "日本語に翻訳してください。技術用語は正確に保ってください"
161
-
162
- english:
163
- translation:
164
- target_language: "en"
165
- default_prompt: "Translate to English, maintaining technical accuracy"
166
-
167
- # 自定义 Hook(高级功能)
168
- hooks:
169
- # 翻译前处理
170
- pre_translation: null
171
-
172
- # 翻译后处理
173
- post_translation: null
174
-
175
- # 文件处理完成后
176
- post_file_processing: null