i18n-context-generator 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,381 @@
1
+ # frozen_string_literal: true
2
+
3
+ module I18nContextGenerator
4
+ # Main orchestrator that parses translation files, searches source code for usages,
5
+ # sends context to the LLM, and writes results via the configured writer.
6
+ class ContextExtractor
7
+ include Writers::Helpers
8
+
9
# Immutable value object holding the outcome for one translation key.
# Only key, text and description are required; every other member defaults
# to nil, except locations which defaults to an empty list.
ExtractionResult = Data.define(
  :key, :text, :description, :source_file, :ui_element, :tone, :max_length, :locations, :error
) do
  def initialize(key:, text:, description:, source_file: nil, ui_element: nil, tone: nil,
                 max_length: nil, locations: [], error: nil)
    super
  end

  # @return [Hash{Symbol => Object}] every member, in declaration order
  def to_h
    self.class.members.to_h { |member| [member, public_send(member)] }
  end
end
31
+
32
+ attr_reader :results, :errors
33
+
34
# Stores the configuration and prepares empty result/error collections.
# The searcher, LLM client and cache start out as nil and are only built
# when first requested.
#
# @param config configuration object read by the rest of the extractor
def initialize(config)
  @config = config
  @results = Concurrent::Array.new
  @errors = Concurrent::Array.new

  # Expensive collaborators are deferred
  @searcher = @llm = @cache = nil
end
44
+
45
# Runs the full pipeline: validate the platform configuration, load and
# filter the translation entries, process them, then write whichever outputs
# the configuration asks for. Returns early (printing a notice) when no
# entries remain after filtering, or after the preview in dry-run mode.
def run
  PlatformValidator.new(@config).validate!

  entries = load_translations
  entries = filter_entries(entries) if @config.key_filter
  entries = filter_by_diff(entries) if @config.diff_base
  entries = filter_by_range(entries) if @config.start_key || @config.end_key

  if entries.empty?
    notice = if @config.diff_base
               "No changed translation keys found since #{@config.diff_base}."
             else
               'No translation entries found.'
             end
    puts notice
    return
  end

  puts "Loaded #{entries.size} translation keys"
  puts "(filtered to changes since #{@config.diff_base})" if @config.diff_base

  if @config.dry_run
    preview_size = 20
    puts "\nDry run - would process these keys:"
    entries.take(preview_size).each do |entry|
      puts " - #{entry.key}: #{truncate(entry.text, 50)}"
    end
    puts " ... and #{entries.size - preview_size} more" if entries.size > preview_size
    return
  end

  process_entries(entries)

  if @config.output_path
    write_output
    puts "\nWrote #{@results.size} results to #{@config.output_path}"
  end

  write_back_to_source if @config.write_back
  write_back_to_code if @config.write_back_to_code

  puts "Errors: #{@errors.size}" if @errors.any?
end
85
+
86
+ private
87
+
88
# Returns the source-code searcher, building it on first use from the
# configured source paths, ignore patterns and context line count.
def searcher
  return @searcher if @searcher

  options = {
    source_paths: @config.source_paths,
    ignore_patterns: @config.ignore_patterns,
    context_lines: @config.context_lines
  }
  @searcher = Searcher.new(**options)
end
95
+
96
# Returns the LLM client for the configured provider, created on first use.
def llm
  return @llm if @llm

  @llm = LLM::Client.for(@config.provider)
end
99
+
100
# Returns the result cache, created on first use; it is enabled unless the
# configuration sets no_cache.
def cache
  return @cache if @cache

  caching_enabled = !@config.no_cache
  @cache = Cache.new(enabled: caching_enabled)
end
103
+
104
# Parses every distinct configured translation file and returns all parsed
# entries as one flat list. Missing files are reported on stderr and
# contribute no entries.
def load_translations
  @config.translations.uniq.flat_map do |path|
    if File.exist?(path)
      Parsers::Base.for(path).parse(path)
    else
      warn "Translation file not found: #{path}"
      []
    end
  end
end
115
+
116
# Keeps only the entries whose key matches at least one of the
# comma-separated patterns in @config.key_filter. Inside a pattern `*`
# matches any run of characters; everything else is matched literally.
#
# Patterns are anchored with \A and \z (not ^ and $, which match per line) so
# the *whole* key has to match; the dot matches newlines so `*` still spans a
# key that happens to contain one.
#
# @param entries [Array] objects responding to #key
# @return [Array] the matching entries, in their original order
def filter_entries(entries)
  patterns = @config.key_filter.split(',').map do |pattern|
    escaped = Regexp.escape(pattern.strip).gsub('\*', '.*')
    Regexp.new("\\A#{escaped}\\z", Regexp::MULTILINE)
  end

  entries.select do |entry|
    patterns.any? { |p| entry.key.match?(p) }
  end
end
126
+
127
# Narrows the entries to those whose key (or Android base key) is reported
# as changed by GitDiff relative to @config.diff_base. Prints a short
# summary; returns an empty list when no changed keys are reported.
def filter_by_diff(entries)
  changed = GitDiff.new(base_ref: @config.diff_base).changed_keys(@config.translations)

  if changed.empty?
    puts "No changes detected in translation files since #{@config.diff_base}"
    return []
  end

  puts "Found #{changed.size} changed keys in git diff"

  entries.reject do |entry|
    !changed.include?(entry.key) && !changed.include?(android_base_key(entry.key))
  end
end
142
+
143
# Reduces a composite Android key to the name of its resource by removing a
# trailing ":quantity" suffix and then a trailing "[index]" suffix, e.g.
#   "post_likes_count:one" -> "post_likes_count"
#   "days_of_week[0]"      -> "days_of_week"
# Keys without either suffix come back unchanged.
def android_base_key(key)
  without_quantity = key.sub(/:[a-z]+$/, '')
  without_quantity.sub(/\[\d+\]$/, '')
end
149
+
150
# Returns the slice of entries running from the configured start_key to the
# configured end_key (both inclusive). A boundary key that cannot be found
# produces a warning and falls back to the first / last entry respectively.
def filter_by_range(entries)
  first_key = @config.start_key
  last_key = @config.end_key
  from = 0
  to = entries.size - 1

  if first_key
    position = entries.index { |entry| entry.key == first_key }
    if position.nil?
      puts "Warning: start_key '#{first_key}' not found, starting from beginning"
    else
      from = position
    end
  end

  if last_key
    position = entries.index { |entry| entry.key == last_key }
    if position.nil?
      puts "Warning: end_key '#{last_key}' not found, processing to end"
    else
      to = position
    end
  end

  description = [
    ("from '#{first_key}'" if first_key),
    ("to '#{last_key}'" if last_key)
  ].compact.join(' ')
  puts "Filtering #{description}: keys #{from + 1} to #{to + 1}"

  entries.slice(from..to)
end
179
+
180
# Processes every entry on a fixed-size thread pool, appending each outcome
# to @results (and to @errors when it carries an error) while advancing a
# progress bar. Blocks until all entries have been handled.
#
# The label shown next to the progress bar is kept in a block-local variable.
# A single reference shared by all workers could be overwritten by another
# thread between "set" and "advance", so the bar would display the key of a
# different entry. The pool already limits the number of concurrently running
# jobs to @config.concurrency, so no additional semaphore is needed.
#
# @param entries [Array] translation entries responding to #key, #text and #source_file
def process_entries(entries)
  # Ensure output is not buffered
  $stdout.sync = true

  progress = TTY::ProgressBar.new(
    '[:bar] :current/:total :percent :eta :key',
    total: entries.size,
    width: 30,
    output: $stdout
  )

  pool = Concurrent::FixedThreadPool.new(@config.concurrency)

  entries.each do |entry|
    pool.post do
      label = ''
      begin
        label = truncate(entry.key, 40)
        result = process_entry(entry)
        @results << result
        @errors << result if result.error
      rescue StandardError => e
        # Capture errors as results so they're visible in output
        result = ExtractionResult.new(
          key: entry.key,
          text: entry.text,
          description: 'Processing failed',
          source_file: entry.source_file,
          error: e.message
        )
        @results << result
        @errors << result
      ensure
        # Always advance, even on failure, so the bar reaches its total
        progress.advance(key: label)
      end
    end
  end

  pool.shutdown
  pool.wait_for_termination
  puts # New line after progress bar
end
226
+
227
# Produces the ExtractionResult for one translation entry:
#   1. search the source code for usages of the key;
#   2. with no usages, return a "No usage found" result without calling the LLM;
#   3. otherwise look the entry up in the cache, keyed by key, text and a
#      digest of everything that shapes the prompt;
#   4. on a cache miss, ask the LLM and cache the answer.
#
# Results that carry an error (rate limit, authentication failure, transport
# error, ...) are returned but NOT cached; caching them would replay the same
# failure on every later run until the source context changed.
#
# @param entry translation entry responding to #key, #text, #source_file and #metadata
# @return [ExtractionResult]
def process_entry(entry)
  # Search for key usage in code first — needed for both cache key and LLM prompt
  matches = searcher.search(entry.key)
  comment = @config.include_translation_comments ? entry.metadata&.dig(:comment) : nil

  if matches.empty?
    return ExtractionResult.new(
      key: entry.key,
      text: entry.text,
      description: 'No usage found in source code',
      source_file: entry.source_file,
      locations: []
    )
  end

  # Limit matches to avoid huge prompts
  matches = matches.first(@config.max_matches_per_key)

  # Build a cache context digest from all prompt-shaping inputs so the cache
  # invalidates when source code, comments, or model change
  cache_ctx = [
    matches.map { |m| "#{m.file}:#{m.line}:#{m.match_line}:#{m.enclosing_scope}:#{m.context}" }.sort.join("\0"),
    "comment:#{comment}",
    "provider:#{@config.provider}",
    "model:#{@config.model}",
    "include_file_paths:#{@config.include_file_paths}",
    "redact_prompts:#{@config.redact_prompts}"
  ].join("\n")

  # Check cache with match context included
  if (cached = cache.get(entry.key, entry.text, context: cache_ctx))
    return ExtractionResult.new(source_file: entry.source_file, **cached.transform_keys(&:to_sym))
  end

  # Get context from LLM
  llm_result = llm.generate_context(
    key: entry.key,
    text: entry.text,
    matches: matches,
    model: @config.model,
    comment: comment,
    include_file_paths: @config.include_file_paths,
    redact_prompts: @config.redact_prompts
  )

  result = ExtractionResult.new(
    key: entry.key,
    text: entry.text,
    description: llm_result.description,
    source_file: entry.source_file,
    ui_element: llm_result.ui_element,
    tone: llm_result.tone,
    max_length: llm_result.max_length,
    locations: matches.map { |m| "#{m.file}:#{m.line}" },
    error: llm_result.error
  )

  # Only successful answers are worth remembering
  cache.set(entry.key, entry.text, result.to_h.except(:source_file), context: cache_ctx) unless result.error
  result
end
287
+
288
# Writes the collected results to @config.output_path, using the JSON writer
# when the configured output format is "json" (case-insensitive) and the CSV
# writer for anything else.
def write_output
  json_requested = @config.output_format.to_s.downcase == 'json'
  writer = json_requested ? Writers::JsonWriter.new : Writers::CsvWriter.new
  writer.write(@results, @config.output_path)
end
298
+
299
# Writes generated context back into each existing translation file for
# which a source writer is available and at least one result matches.
#
# Paths are de-duplicated first, the same way load_translations does, so a
# file listed twice in the configuration is not rewritten and reported twice.
def write_back_to_source
  @config.translations.uniq.each do |path|
    next unless File.exist?(path)

    writer = source_writer_for(path)
    next unless writer

    relevant_results = @results.select { |result| result_matches_source_path?(result, path) }
    next if relevant_results.empty?

    writer.write(relevant_results, path)
    puts "Updated #{path} with context comments"
  end
end
313
+
314
# Offers the per-key results to the Swift writer for every Swift file found
# under the configured source paths, printing one line per file the writer
# reports as updated and a final count when at least one file was updated.
def write_back_to_code
  writer = Writers::SwiftWriter.new(
    functions: @config.swift_functions,
    context_prefix: @config.context_prefix,
    context_mode: @config.context_mode
  )
  lookup = build_results_by_key_for_code_write_back
  touched = 0

  @config.source_paths.each do |root|
    find_swift_files(root, ignore_patterns: @config.ignore_patterns).each do |candidate|
      next unless writer.update_file(candidate, lookup)

      touched += 1
      puts "Updated #{candidate} with context comments"
    end
  end

  puts "Updated #{touched} Swift files with context comments" if touched.positive?
end
337
+
338
# Builds the key => result lookup used for code write-back. Results are
# ordered by translation-file priority and then by key; non-writable results
# are skipped and the first remaining result for each key wins.
def build_results_by_key_for_code_write_back
  ordered = @results.sort_by { |result| [translation_source_priority(result.source_file), result.key] }

  ordered
    .select { |result| writable_result?(result) }
    .uniq(&:key)
    .to_h { |result| [result.key, result] }
end
347
+
348
# Sort rank of a translation file: its position in @config.translations.
# A nil or unlisted file ranks after every configured file.
def translation_source_priority(source_file)
  lowest_priority = @config.translations.size
  return lowest_priority unless source_file

  @config.translations.index(source_file) || lowest_priority
end
354
+
355
# Picks the writer able to write context back into the given translation
# file: the strings writer for ".strings" files, the Android XML writer for
# ".xml" files that are named strings.xml or whose path contains
# "/res/values". Returns nil for every other file.
def source_writer_for(path)
  extension = File.extname(path).downcase

  writer_class =
    if extension == '.strings'
      Writers::StringsWriter
    elsif extension == '.xml' &&
          (File.basename(path).downcase == 'strings.xml' || path.include?('/res/values'))
      Writers::AndroidXmlWriter
    end
  return unless writer_class

  writer_class.new(
    context_prefix: @config.context_prefix,
    context_mode: @config.context_mode
  )
end
374
+
375
# Shortens a string to at most +length+ characters, ending it with +omission+
# when something had to be cut.
#
# nil is treated as an empty string instead of raising. When +length+ is too
# small to fit the omission, the text is hard-cut to +length+ so the result
# never exceeds the requested length.
#
# @param str [String, nil] text to shorten
# @param length [Integer] maximum length of the result
# @param omission [String] marker appended to shortened text
# @return [String]
def truncate(str, length, omission: '...')
  text = str.to_s
  return text if text.length <= length
  return text[0, [length, 0].max] if length < omission.length

  "#{text[0, length - omission.length]}#{omission}"
end
380
+ end
381
+ end
@@ -0,0 +1,159 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'open3'
4
+
5
+ module I18nContextGenerator
6
+ # Parses git diff to extract changed translation keys
7
+ class GitDiff
8
# @param base_ref [String] git ref that changes are compared against
#   (defaults to 'main')
def initialize(base_ref: 'main')
  @base_ref = base_ref
end
11
+
12
# Collects the translation keys that the git diff against the base ref
# reports for the given files. Paths that do not exist and files with an
# empty diff contribute nothing.
# @param translation_paths [Array<String>] paths to translation files
# @return [Set<String>] set of changed keys
def changed_keys(translation_paths)
  translation_paths.each_with_object(Set.new) do |path, collected|
    next unless File.exist?(path)

    diff_text = git_diff_for_file(path)
    collected.merge(extract_keys_from_diff(diff_text, path)) unless diff_text.empty?
  end
end
29
+
30
# Reports whether the current directory is inside a git repository.
#
# git is invoked with an argument list (no shell) and its output is sent to
# File::NULL, matching base_ref_exists? and avoiding the shell-only
# `> /dev/null` redirection. Always returns true or false; a missing git
# executable counts as "not available".
#
# @return [Boolean]
def self.available?
  system('git', 'rev-parse', '--git-dir', out: File::NULL, err: File::NULL) || false
end
34
+
35
# Asks git (rev-parse --verify) whether the configured base ref resolves,
# discarding git's own output. The return value is whatever Kernel#system
# reports for that command.
def base_ref_exists?
  command = ['git', 'rev-parse', '--verify', @base_ref]
  system(*command, out: File::NULL, err: File::NULL)
end
39
+
40
+ private
41
+
42
# Returns the `git diff <base_ref>...HEAD` output for one path, or an empty
# string when the diff cannot be produced.
#
# git runs from the directory containing the file so the correct repository
# is used. The triple-dot form limits the diff to changes made on the current
# branch since it diverged from the base ref.
#
# Failures are no longer discarded silently: a non-zero git exit (unknown
# base ref, not a repository, ...) or an inability to start git at all is
# reported on stderr before the empty string is returned, so an empty result
# can be told apart from "nothing changed".
#
# @param path [String] translation file (or directory) to diff
# @return [String] raw diff text, '' on failure
def git_diff_for_file(path)
  is_directory = File.directory?(path)
  dir = is_directory ? path : File.dirname(path)
  pathspec = is_directory ? '.' : File.basename(path)

  stdout, stderr, status = Open3.capture3('git', 'diff', "#{@base_ref}...HEAD", '--', pathspec, chdir: dir)
  return stdout if status.success?

  warn "git diff against #{@base_ref} failed for #{path}: #{stderr.strip}"
  ''
rescue SystemCallError => e
  # e.g. git is not installed, or the directory cannot be entered
  warn "Could not run git diff for #{path}: #{e.message}"
  ''
end
50
+
51
# Dispatches on the file extension (case-insensitive): ".strings" diffs go to
# the iOS extractor, ".xml" diffs to the Android extractor; any other
# extension yields an empty set.
def extract_keys_from_diff(diff_output, path)
  extension = File.extname(path).downcase
  return extract_strings_keys(diff_output) if extension == '.strings'
  return extract_xml_keys(diff_output, path) if extension == '.xml'

  Set.new
end
63
+
64
# Collects the keys of entries added in an iOS .strings diff, i.e. lines of
# the form: +"key" = "value";
# Lines beginning with "++" (such as the "+++" file header) are ignored.
def extract_strings_keys(diff_output)
  added_keys = diff_output.each_line.filter_map do |line|
    next unless line.start_with?('+')
    next if line.start_with?('++')

    # Capture the quoted key in front of the "=" sign
    line[/^\+\s*"([^"]+)"\s*=/, 1]
  end

  Set.new(added_keys)
end
79
+
80
# Extract keys from Android strings.xml diff.
#
# Added <string>, <plurals> and <string-array> elements contribute their name
# directly. An added <item> is attributed to the enclosing plurals/array seen
# earlier in the same hunk; when the opener is not visible in that hunk the
# item's position in the new file is recorded and resolved afterwards by
# reading the actual file (see the call to resolve_orphaned_items).
#
# Two details keep that bookkeeping accurate:
# * The tracked parent is reset at every hunk header. Hunks are separated by
#   lines the diff does not show, so a parent left open at the end of one
#   hunk says nothing about the next one; carrying it over would attribute
#   items to the wrong resource instead of resolving them from the file.
# * "\ No newline at end of file" markers are skipped. They are not part of
#   either file, so counting them would shift the line numbers of every
#   following added line in the hunk by one.
#
# @param diff_output [String] unified diff text for one file
# @param file_path [String] path of the file the diff belongs to
# @return [Set<String>] names of the changed resources
def extract_xml_keys(diff_output, file_path)
  keys = Set.new
  current_parent = nil
  file_line = nil
  orphaned_item_file_lines = []

  diff_output.each_line do |line|
    # Parse hunk header to track position in new file
    if (hunk = line.match(/^@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? @@/))
      file_line = hunk[1].to_i
      current_parent = nil
      next
    end

    # Skip diff metadata lines, including the no-newline marker
    next if line.start_with?('diff ', 'index ', '--- ', '+++ ', '\\')

    is_removed = line.start_with?('-')
    is_added = line.start_with?('+')
    content = line.sub(/^[ +-]/, '')

    # Track parent element from any visible line (context, added, or removed)
    if content =~ /<(?:plurals|string-array)\s+name=["']([^"']+)["']/
      current_parent = Regexp.last_match(1)
    elsif content =~ %r{</(?:plurals|string-array)>}
      current_parent = nil
    end

    # Process added lines for key extraction
    if is_added
      keys << Regexp.last_match(1) if content =~ /<string\s+name=["']([^"']+)["']/
      keys << Regexp.last_match(1) if content =~ /<(?:plurals|string-array)\s+name=["']([^"']+)["']/

      if content =~ /^\s*<item[\s>]/
        if current_parent
          keys << current_parent
        elsif file_line
          orphaned_item_file_lines << file_line
        end
      end
    end

    # Context and added lines exist in new file; removed lines do not
    file_line += 1 if file_line && !is_removed
  end

  # Resolve orphaned items by reading the actual file
  resolve_orphaned_items(keys, orphaned_item_file_lines, file_path)

  keys
end
135
+
136
# Attributes orphaned <item> additions to their parent resource by scanning
# the file itself: every line number is mapped to the plurals/string-array
# element open at that line, and each orphaned line with such a parent adds
# that parent's name to +keys+. Does nothing when there are no orphaned
# lines or the file does not exist.
def resolve_orphaned_items(keys, orphaned_lines, file_path)
  return if orphaned_lines.empty?
  return unless File.exist?(file_path)

  open_parent = nil
  parent_by_line_number = {}

  File.foreach(file_path).with_index(1) do |text, line_number|
    opener = text.match(/<(?:plurals|string-array)\s+name=["']([^"']+)["']/)
    if opener
      open_parent = opener[1]
    elsif text.match?(%r{</(?:plurals|string-array)>})
      open_parent = nil
    end
    parent_by_line_number[line_number] = open_parent
  end

  orphaned_lines.each do |line_number|
    owner = parent_by_line_number[line_number]
    keys << owner if owner
  end
end
158
+ end
159
+ end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ module I18nContextGenerator
4
+ module LLM
5
+ # Claude API implementation of the LLM client with retry logic for rate limits.
6
+ class Anthropic < Client
7
+ API_URL = 'https://api.anthropic.com/v1/messages'
8
+ ANTHROPIC_VERSION = '2023-06-01'
9
+ DEFAULT_MODEL = 'claude-sonnet-4-6'
10
+
11
# Reads the API key from the ANTHROPIC_API_KEY environment variable and
# prepares the endpoint URI.
#
# A variable that is set but empty (or only whitespace) is rejected here as
# well; it can never authenticate, so failing at construction gives a clear
# message instead of a later "Invalid API key" response for every request.
#
# @raise [Error] when ANTHROPIC_API_KEY is missing or blank
def initialize
  super
  @api_key = ENV.fetch('ANTHROPIC_API_KEY', nil)
  if @api_key.nil? || @api_key.strip.empty?
    raise Error, 'ANTHROPIC_API_KEY environment variable is required'
  end

  @uri = URI(API_URL)
end
18
+
19
+ MAX_RETRIES = 2
20
+
21
# Asks the model for context about one translation key.
#
# Rate-limited responses (HTTP 429) are retried up to MAX_RETRIES times. The
# wait before retry N is N times the server's retry-after value in seconds.
# When that header is missing, non-numeric (for example an HTTP date) or not
# positive, a 2 second base is used instead; previously such values were
# converted to 0 and the retries were sent back-to-back.
#
# Any exception raised while building or sending the request is converted
# into a failed ContextResult rather than propagated.
#
# @param key [String] translation key
# @param text [String] translation text
# @param matches [Array] code usages passed on to the prompt builder
# @param model [String, nil] model name; DEFAULT_MODEL when nil
# @param comment [String, nil] optional translator comment
# @param include_file_paths [Boolean] forwarded to the prompt builder
# @param redact_prompts [Boolean] forwarded to the prompt builder
# @return the value of handle_response, or a ContextResult describing the failure
def generate_context(key:, text:, matches:, model: nil, comment: nil,
                     include_file_paths: false, redact_prompts: true)
  model ||= DEFAULT_MODEL
  prompt = build_prompt(
    key: key,
    text: text,
    matches: matches,
    comment: comment,
    include_file_paths: include_file_paths,
    redact_prompts: redact_prompts
  )
  retries = 0

  loop do
    response = post_request(model: model, prompt: prompt)

    # Retry on rate limit with backoff
    if response.code.to_i == 429 && retries < MAX_RETRIES
      retries += 1
      advertised = Integer(response['retry-after'].to_s.strip, 10, exception: false)
      base_delay = advertised&.positive? ? advertised : 2
      sleep(base_delay * retries)
      next
    end

    return handle_response(response)
  end
rescue StandardError => e
  ContextResult.new(description: 'API request failed', error: e.message)
end
50
+
51
+ private
52
+
53
# Sends one Messages API request through post_json: the version and API key
# travel as headers, the system prompt and a single user message as the
# body, with max_tokens fixed at 500.
def post_request(model:, prompt:)
  request_headers = {
    'anthropic-version' => ANTHROPIC_VERSION,
    'x-api-key' => @api_key
  }
  payload = {
    model: model,
    max_tokens: 500,
    system: SYSTEM_PROMPT,
    messages: [{ role: 'user', content: prompt }]
  }

  post_json(uri: @uri, headers: request_headers, body: payload)
end
68
+
69
# Turns an HTTP response into a result:
#   200   -> the first content block's text is handed to parse_response
#   429   -> "Rate limited" result
#   401   -> "Authentication failed" result
#   other -> "API error" result carrying the server's error message, or
#            "HTTP <code>" when no message can be extracted
#
# The error body is inspected defensively: it may not be JSON at all (an HTML
# gateway page), may be a JSON array, or may carry "error" as a plain string.
# Calling dig on those shapes raises TypeError, which used to surface as an
# unrelated message instead of the HTTP status.
#
# @param response object responding to #code and #body
def handle_response(response)
  case response.code.to_i
  when 200
    body = JSON.parse(response.body)
    content = body.dig('content', 0, 'text')
    parse_response(content)
  when 429
    ContextResult.new(description: 'Rate limited', error: 'Rate limit exceeded - try reducing concurrency')
  when 401
    ContextResult.new(description: 'Authentication failed', error: 'Invalid API key')
  else
    payload = begin
      JSON.parse(response.body.to_s)
    rescue JSON::ParserError
      nil
    end
    error = payload.is_a?(Hash) ? payload['error'] : nil
    error_msg = case error
                when Hash then error['message']
                when String then error
                end
    ContextResult.new(description: 'API error', error: error_msg || "HTTP #{response.code}")
  end
end
89
+ end
90
+ end
91
+ end