RubyGems - i18n-context-generator - Versions diffs - 0.3.0 - Mend

i18n-context-generator 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

checksums.yaml +7 -0
data/LICENSE +373 -0
data/README.md +282 -0
data/exe/i18n-context-generator +7 -0
data/lib/i18n_context_generator/cache.rb +49 -0
data/lib/i18n_context_generator/cli.rb +223 -0
data/lib/i18n_context_generator/config.rb +211 -0
data/lib/i18n_context_generator/context_extractor.rb +381 -0
data/lib/i18n_context_generator/git_diff.rb +159 -0
data/lib/i18n_context_generator/llm/anthropic.rb +91 -0
data/lib/i18n_context_generator/llm/client.rb +260 -0
data/lib/i18n_context_generator/llm/openai.rb +112 -0
data/lib/i18n_context_generator/parsers/android_xml_parser.rb +110 -0
data/lib/i18n_context_generator/parsers/base.rb +60 -0
data/lib/i18n_context_generator/parsers/json_parser.rb +21 -0
data/lib/i18n_context_generator/parsers/strings_parser.rb +28 -0
data/lib/i18n_context_generator/parsers/yaml_parser.rb +30 -0
data/lib/i18n_context_generator/platform_validator.rb +92 -0
data/lib/i18n_context_generator/searcher.rb +447 -0
data/lib/i18n_context_generator/version.rb +5 -0
data/lib/i18n_context_generator/writers/android_xml_writer.rb +117 -0
data/lib/i18n_context_generator/writers/csv_writer.rb +39 -0
data/lib/i18n_context_generator/writers/helpers.rb +58 -0
data/lib/i18n_context_generator/writers/json_writer.rb +34 -0
data/lib/i18n_context_generator/writers/strings_writer.rb +67 -0
data/lib/i18n_context_generator/writers/swift_writer.rb +160 -0
data/lib/i18n_context_generator.rb +36 -0
metadata +196 -0

data/lib/i18n_context_generator/llm/client.rb ADDED Viewed

@@ -0,0 +1,260 @@
+# frozen_string_literal: true
+require 'json'
+require 'net/http'
+module I18nContextGenerator
+  module LLM
+    # Result from LLM context generation
+    ContextResult = Data.define(:description, :ui_element, :tone, :max_length, :error) do
+      def initialize(description:, ui_element: nil, tone: nil, max_length: nil, error: nil)
+        super
+      end
+    end
+    # Base class for LLM clients
+    class Client
+      SYSTEM_PROMPT = 'You are a mobile app localization expert. Analyze only the provided evidence and provide concise, specific context for translators. Respond with only valid JSON.'
+      def self.for(provider)
+        case provider.to_s.downcase
+        when 'anthropic'
+          Anthropic.new
+        when 'openai'
+          OpenAI.new
+        when 'ollama'
+          raise Error, 'Ollama provider not yet implemented'
+        else
+          raise Error, "Unknown LLM provider: #{provider}"
+        end
+      end
+      # Abstract entry point: produce translator context for a single translation key.
+      # Concrete clients override this method; the base implementation always raises
+      # NotImplementedError. The OpenAI subclass returns a ContextResult — other
+      # subclasses presumably do the same (TODO confirm).
+      def generate_context(key:, text:, matches:, model: nil, comment: nil,
+                           include_file_paths: false, redact_prompts: true)
+        raise NotImplementedError, 'Subclasses must implement #generate_context'
+      end
+      protected
+      # Builds the user prompt sent to the LLM for one translation key.
+      #
+      # key:                the translation key, embedded verbatim in the prompt
+      # text:               the source string; sanitized via sanitize_prompt_text before embedding
+      # matches:            code-search hits; each is read for file, line, enclosing_scope and context
+      # comment:            optional developer comment; its section is emitted only when non-blank
+      # include_file_paths: when false, only the basename of each match's file is shown
+      # redact_prompts:     when true, text, comment and match context are passed through redaction
+      #
+      # Returns the prompt as a String (squiggly heredoc, so the common indentation is stripped).
+      def build_prompt(key:, text:, matches:, comment: nil,
+                       include_file_paths: false, redact_prompts: true)
+        # Platform label ('iOS', 'Android' or 'mobile') derived from the match file extensions.
+        platform = detect_platform(matches)
+        safe_text = sanitize_prompt_text(text, redact: redact_prompts)
+        safe_comment = sanitize_prompt_text(comment, redact: redact_prompts)
+        # Placeholders are detected on the raw text, not on the redacted copy.
+        placeholder_info = detect_placeholders(text)
+        <<~PROMPT
+          You are analyzing a localized string from a #{platform} mobile app to help translators understand its context.
+          ## Translation Key
+          `#{key}`
+          ## Original Text
+          "#{safe_text}"
+          #{"\n## Developer Comment\n\"#{safe_comment}\"\n" if safe_comment && !safe_comment.strip.empty?}#{"\n## Format Placeholders\n#{placeholder_info}\n" if placeholder_info}
+          ## Code Usage
+          #{format_matches(matches, include_file_paths: include_file_paths, redact_prompts: redact_prompts)}
+          ## Task
+          Analyze how this string is used in the mobile app code and provide context for translators.
+          **IMPORTANT - Avoid False Positives:**
+          - Look for ACTUAL UI USAGE, not coincidental code patterns
+          - Ignore method calls that happen to match the key (e.g., `.apply()`, `.close()`, `.clear()` are methods, not UI strings)
+          - Ignore boolean/string comparisons (e.g., `if value == "yes"` is not UI usage)
+          - Ignore analytics event names or tracking parameters
+          - Focus on localization patterns: getString(), NSLocalizedString(), Text(), @string/, R.string., etc.
+          - If no clear UI usage is found in the code, base your description only on the provided text, developer comment, and key name
+          - If evidence is limited, keep the description generic rather than inventing a specific screen, flow, or user action
+          Focus on:
+          1. **Where it appears**: What screen or view displays this text?
+          2. **UI element type**: Is it a button label, navigation title, alert message, placeholder, etc.?
+          3. **User action**: What action triggers this text or what happens when the user interacts with it?
+          4. **Constraints**: Are there any length constraints (e.g., button width, navigation bar)?
+          Write a concise context description (1-2 sentences) that helps a translator understand:
+          - The purpose of this text in the app
+          - The UI context where it appears
+          - Any important considerations for translation
+          **Quality Guidelines:**
+          - Be SPECIFIC about WHERE and HOW the text is used, not just what it means
+          - Avoid vague descriptions like "used throughout the app" - identify specific screens/features
+          - If the text is a common UI term (Save, Cancel, OK), describe its specific usage context in THIS app
+          - Do not speculate or hedge. Never use words like "likely", "probably", "appears", "seems", "may", or "might"
+          - Only mention screens, features, or actions when they are supported by the provided code, comment, text, or key name
+          - Only set `max_length` when there is explicit evidence for a concrete numeric limit; otherwise return null
+          - Do not infer `max_length` from general UI conventions like buttons, badges, placeholders, or navigation bars
+          - Don't mention code implementation details - focus on the user-facing experience
+          Respond with ONLY a JSON object (no markdown, no explanation):
+          {
+            "description": "Concise context for translators (1-2 sentences)",
+            "ui_element": "button|label|title|alert|toast|placeholder|navigation|menu|tab|error|confirmation|other",
+            "tone": "formal|casual|urgent|friendly|technical|neutral",
+            "max_length": null or a number only when explicit evidence gives a concrete numeric limit
+          }
+        PROMPT
+      end
+      def detect_platform(matches)
+        return 'mobile' if matches.empty?
+        extensions = matches.map { |m| File.extname(m.file).downcase }
+        if extensions.any? { |e| ['.swift', '.m', '.mm'].include?(e) }
+          'iOS'
+        elsif extensions.any? { |e| ['.kt', '.java'].include?(e) }
+          'Android'
+        else
+          'mobile'
+        end
+      end
+      def format_matches(matches, include_file_paths:, redact_prompts:)
+        matches.map.with_index do |match, i|
+          scope_info = match.enclosing_scope ? " (in #{match.enclosing_scope})" : ''
+          location = include_file_paths ? match.file : File.basename(match.file)
+          context = sanitize_prompt_text(match.context, redact: redact_prompts)
+          <<~MATCH
+            ### Match #{i + 1}: #{location}:#{match.line}#{scope_info}
+            ```
+            #{context}
+            ```
+          MATCH
+        end.join("\n")
+      end
+      # Masks sensitive-looking substrings before text is embedded in a prompt.
+      # Returns +text+ untouched when it is nil or when +redact+ is false.
+      #
+      # The substitutions run in this order, each on the output of the previous one:
+      #   1. http/https URLs                               -> [REDACTED_URL]
+      #   2. e-mail addresses                              -> [REDACTED_EMAIL]
+      #   3. "Bearer <token>" values                       -> Bearer [REDACTED_TOKEN]
+      #   4. double-quoted values assigned to api_key / access_token / refresh_token /
+      #      secret / password                             -> "[REDACTED_SECRET]"
+      #   5. the same assignments with single quotes       -> '[REDACTED_SECRET]'
+      #   6. three-segment tokens starting with "eyJ"      -> [REDACTED_TOKEN]
+      #   7. runs of 32 or more hex digits                 -> [REDACTED_TOKEN]
+      def sanitize_prompt_text(text, redact:)
+        return text if text.nil? || !redact
+        text
+          .gsub(%r{https?://\S+}i, '[REDACTED_URL]')
+          .gsub(/\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i, '[REDACTED_EMAIL]')
+          .gsub(%r{Bearer\s+[A-Za-z0-9\-._~+/]+=*}i, 'Bearer [REDACTED_TOKEN]')
+          .gsub(/((?:api[_-]?key|access[_-]?token|refresh[_-]?token|secret|password)\s*[:=]\s*)"[^"]*"/i,
+                '\1"[REDACTED_SECRET]"')
+          .gsub(/((?:api[_-]?key|access[_-]?token|refresh[_-]?token|secret|password)\s*[:=]\s*)'[^']*'/i,
+                "\\1'[REDACTED_SECRET]'")
+          .gsub(/\beyJ[A-Za-z0-9\-_]+(?:\.[A-Za-z0-9\-_]+){2}\b/, '[REDACTED_TOKEN]')
+          .gsub(/\b(?!\h{8}-\h{4}-\h{4}-\h{4}-\h{12}\b)[A-Fa-f0-9]{32,}\b/, '[REDACTED_TOKEN]')
+      end
+      def detect_placeholders(text)
+        # iOS: %@, %d, %f, %ld, %lld, %1$@, %2$d, etc.
+        # Android: %s, %d, %f, %1$s, %2$d, etc.
+        placeholders = text.scan(/%(?:(\d+)\$)?([#0 +'.-]*\d*(?:\.\d+)?(?:l{0,2}|h{0,2})?[diouxXeEfFgGaAcsSpn@])/)
+        return nil if placeholders.empty?
+        descriptions = []
+        # Also gather the raw matches for display
+        raw = text.scan(/%(?:\d+\$)?[#0 +'.-]*\d*(?:\.\d+)?(?:l{0,2}|h{0,2})?[diouxXeEfFgGaAcsSpn@]/)
+        raw.each_with_index do |placeholder, _i|
+          type_hint = case placeholder
+                      when /%.*[di]/ then 'a number'
+                      when /%.*[fFeEgGaA]/ then 'a decimal number'
+                      when /%.*[@sS]/ then 'a string value'
+                      else 'a value'
+                      end
+          descriptions << "#{placeholder} — #{type_hint}"
+        end
+        "This string contains #{raw.size} placeholder(s) that must be preserved in translation:\n" +
+          descriptions.map { |d| "- #{d}" }.join("\n")
+      end
+      def parse_response(text)
+        if text.nil? || text.empty?
+          return ContextResult.new(description: 'Failed to parse response',
+                                   error: 'Empty response')
+        end
+        # Try to extract JSON from the response
+        json_text = extract_json(text)
+        return ContextResult.new(description: text.strip, error: nil) unless json_text
+        data = JSON.parse(json_text, symbolize_names: true)
+        ContextResult.new(
+          description: data[:description] || 'No description provided',
+          ui_element: data[:ui_element],
+          tone: data[:tone],
+          max_length: data[:max_length]
+        )
+      rescue JSON::ParserError => e
+        ContextResult.new(description: text.strip, error: "JSON parse error: #{e.message}")
+      end
+      def extract_json(text)
+        # Try to find JSON object in the response
+        # Handle both raw JSON and markdown-wrapped JSON
+        if text.include?('```')
+          match = text.match(/```(?:json)?\s*(\{[^`]+\})\s*```/m)
+          return match[1] if match
+        end
+        # Find first { and try to parse valid JSON from it
+        start = text.index('{')
+        return nil unless start
+        # Walk backwards from end looking for matching }
+        text.length.downto(start + 1) do |i|
+          next unless text[i - 1] == '}'
+          candidate = text[start...i]
+          begin
+            JSON.parse(candidate) # validate it parses
+            return candidate
+          rescue JSON::ParserError
+            next
+          end
+        end
+        nil
+      end
+      def post_json(uri:, headers:, body:, open_timeout: 10, read_timeout: 60)
+        http = http_for(uri, open_timeout: open_timeout, read_timeout: read_timeout)
+        request = Net::HTTP::Post.new(
+          uri.request_uri,
+          { 'Content-Type' => 'application/json' }.merge(headers)
+        )
+        request.body = JSON.generate(body)
+        http.request(request)
+      end
+      # Returns a persistent Net::HTTP session scoped to the current thread.
+      # This preserves connection reuse without sharing a mutable Net::HTTP
+      # instance across the worker pool.
+      def http_for(uri, open_timeout:, read_timeout:)
+        key = [uri.scheme, uri.host, uri.port]
+        sessions = Thread.current.thread_variable_get(http_sessions_key) || {}
+        http = sessions[key]
+        if http&.started?
+          http.open_timeout = open_timeout
+          http.read_timeout = read_timeout
+          return http
+        end
+        http = Net::HTTP.new(uri.host, uri.port)
+        http.use_ssl = uri.scheme == 'https'
+        http.open_timeout = open_timeout
+        http.read_timeout = read_timeout
+        http.keep_alive_timeout = 30
+        http.start
+        sessions[key] = http
+        Thread.current.thread_variable_set(http_sessions_key, sessions)
+        http
+      end
+      def http_sessions_key
+        @http_sessions_key ||= :"i18n_context_generator_http_sessions_#{object_id}"
+      end
+    end
+  end
+end

data/lib/i18n_context_generator/llm/openai.rb ADDED Viewed

@@ -0,0 +1,112 @@
+# frozen_string_literal: true
+module I18nContextGenerator
+  module LLM
+    # OpenAI Responses API implementation of the LLM client.
+    class OpenAI < Client
+      # Endpoint every request is POSTed to.
+      API_URL = 'https://api.openai.com/v1/responses'
+      # Model used when the caller does not pass one to #generate_context.
+      DEFAULT_MODEL = 'gpt-5-mini'
+      # Maximum number of additional attempts after an HTTP 429 response.
+      MAX_RETRIES = 2
+      # JSON schema sent as the strict structured-output format of each request.
+      # All four fields are required; ui_element, tone and max_length may be null.
+      RESPONSE_SCHEMA = {
+        type: 'object',
+        additionalProperties: false,
+        required: %w[description ui_element tone max_length],
+        properties: {
+          description: { type: 'string' },
+          ui_element: { type: %w[string null], enum: %w[button label title alert toast placeholder navigation menu tab error confirmation other] + [nil] },
+          tone: { type: %w[string null], enum: %w[formal casual urgent friendly technical neutral] + [nil] },
+          max_length: { type: %w[integer null] }
+        }
+      }.freeze
+      def initialize
+        super
+        @api_key = ENV.fetch('OPENAI_API_KEY', nil)
+        raise Error, 'OPENAI_API_KEY environment variable is required' unless @api_key
+        @uri = URI(API_URL)
+      end
+      def generate_context(key:, text:, matches:, model: nil, comment: nil,
+                           include_file_paths: false, redact_prompts: true)
+        model ||= DEFAULT_MODEL
+        prompt = build_prompt(
+          key: key,
+          text: text,
+          matches: matches,
+          comment: comment,
+          include_file_paths: include_file_paths,
+          redact_prompts: redact_prompts
+        )
+        retries = 0
+        loop do
+          response = post_request(model: model, prompt: prompt)
+          if response.code.to_i == 429 && retries < MAX_RETRIES
+            retries += 1
+            delay = (response['retry-after']&.to_i || 2) * retries
+            sleep(delay)
+            next
+          end
+          return handle_response(response)
+        end
+      rescue StandardError => e
+        ContextResult.new(description: 'API request failed', error: e.message)
+      end
+      private
+      def post_request(model:, prompt:)
+        post_json(
+          uri: @uri,
+          headers: {
+            'Authorization' => "Bearer #{@api_key}"
+          },
+          body: {
+            model: model,
+            store: false,
+            instructions: SYSTEM_PROMPT,
+            input: prompt,
+            max_output_tokens: 500,
+            text: {
+              format: {
+                type: 'json_schema',
+                name: 'translation_context',
+                strict: true,
+                schema: RESPONSE_SCHEMA
+              }
+            }
+          }
+        )
+      end
+      def handle_response(response)
+        case response.code.to_i
+        when 200
+          body = JSON.parse(response.body)
+          parse_response(extract_output_text(body))
+        when 429
+          ContextResult.new(description: 'Rate limited', error: 'Rate limit exceeded - try reducing concurrency')
+        when 401
+          ContextResult.new(description: 'Authentication failed', error: 'Invalid API key')
+        else
+          error_body = begin
+            JSON.parse(response.body)
+          rescue StandardError
+            {}
+          end
+          error_msg = error_body.dig('error', 'message') || error_body['message'] || "HTTP #{response.code}"
+          ContextResult.new(description: 'API error', error: error_msg)
+        end
+      end
+      def extract_output_text(body)
+        output_item = Array(body['output']).find { |item| item['type'] == 'message' }
+        content_item = Array(output_item&.[]('content')).find { |item| item['type'] == 'output_text' }
+        content_item&.dig('text')
+      end
+    end
+  end
+end

data/lib/i18n_context_generator/parsers/android_xml_parser.rb ADDED Viewed

@@ -0,0 +1,110 @@
+# frozen_string_literal: true
+require 'rexml/document'
+module I18nContextGenerator
+  module Parsers
+    # Parser for Android strings.xml files
+    # Format: <string name="key">value</string>
+    # With optional comment: <!-- comment --> <string name="key">value</string>
+    class AndroidXmlParser < Base
+      def parse(path)
+        content = File.read(path, encoding: 'UTF-8')
+        doc = REXML::Document.new(content)
+        entries = []
+        doc.elements.each('resources/string') do |element|
+          next unless translatable?(element)
+          key = element.attributes['name']
+          text = inner_text(element)
+          # Look for preceding comment
+          comment = find_preceding_comment(element)
+          entries << TranslationEntry.new(
+            key: key,
+            text: unescape_android_string(text),
+            source_file: path,
+            metadata: { comment: comment }
+          )
+        end
+        # Also parse string arrays
+        doc.elements.each('resources/string-array') do |array_element|
+          next unless translatable?(array_element)
+          array_name = array_element.attributes['name']
+          array_element.elements.to_a('item').each_with_index do |item, index|
+            entries << TranslationEntry.new(
+              key: "#{array_name}[#{index}]",
+              text: unescape_android_string(inner_text(item)),
+              source_file: path,
+              metadata: { array: array_name, index: index }
+            )
+          end
+        end
+        # Also parse plurals
+        doc.elements.each('resources/plurals') do |plural_element|
+          next unless translatable?(plural_element)
+          plural_name = plural_element.attributes['name']
+          plural_element.elements.each('item') do |item|
+            quantity = item.attributes['quantity']
+            entries << TranslationEntry.new(
+              key: "#{plural_name}:#{quantity}",
+              text: unescape_android_string(inner_text(item)),
+              source_file: path,
+              metadata: { plural: plural_name, quantity: quantity }
+            )
+          end
+        end
+        entries
+      end
+      private
+      # Get the full inner content of an element, including inline markup like
+      # <b>, <i>, <u>, <xliff:g>. REXML::Element#text only returns the first
+      # text node, losing everything after a nested element.
+      def inner_text(element)
+        element.children.map do |child|
+          child.is_a?(REXML::Text) ? child.value : child.to_s
+        end.join
+      end
+      def find_preceding_comment(element)
+        # Look at the previous sibling
+        prev = element.previous_sibling
+        while prev
+          if prev.is_a?(REXML::Comment)
+            return prev.to_s.strip
+          elsif prev.is_a?(REXML::Element)
+            # Hit another element, stop looking
+            return nil
+          end
+          prev = prev.previous_sibling
+        end
+        nil
+      end
+      def translatable?(element)
+        element.attributes['translatable']&.downcase != 'false'
+      end
+      # Unescape Android string escapes
+      def unescape_android_string(str)
+        str
+          .gsub("\\'", "'")
+          .gsub('\\"', '"')
+          .gsub('\\n', "\n")
+          .gsub('\\t', "\t")
+          .gsub('\\@', '@')
+          .gsub('\\?', '?')
+      end
+    end
+  end
+end

data/lib/i18n_context_generator/parsers/base.rb ADDED Viewed

@@ -0,0 +1,60 @@
+# frozen_string_literal: true
+module I18nContextGenerator
+  module Parsers
+    # Represents a single translation entry
+    TranslationEntry = Data.define(:key, :text, :source_file, :metadata) do
+      def initialize(key:, text:, source_file:, metadata: {})
+        super
+      end
+    end
+    # Base class for translation file parsers
+    class Base
+      def self.for(path)
+        basename = File.basename(path).downcase
+        ext = File.extname(path).downcase
+        case ext
+        when '.json'
+          JsonParser.new
+        when '.yml', '.yaml'
+          YamlParser.new
+        when '.strings'
+          StringsParser.new
+        when '.xml'
+          # Check if it's an Android strings.xml
+          raise Error, "Unsupported XML format: #{path} (only Android strings.xml is supported)" unless basename == 'strings.xml' || path.include?('/res/values')
+          AndroidXmlParser.new
+        else
+          raise Error, "Unsupported translation file format: #{path}"
+        end
+      end
+      # Abstract: parse the translation file at +path+.
+      # The parsers in this library return an Array of TranslationEntry; the base
+      # implementation always raises NotImplementedError.
+      def parse(path)
+        raise NotImplementedError, 'Subclasses must implement #parse'
+      end
+      protected
+      # Flatten nested hashes: {"a" => {"b" => "c"}} -> {"a.b" => "c"}
+      def flatten_keys(hash, prefix = nil)
+        hash.each_with_object({}) do |(key, value), result|
+          full_key = [prefix, key].compact.join('.')
+          case value
+          when Hash
+            result.merge!(flatten_keys(value, full_key))
+          when Array
+            # Handle arrays (e.g., pluralization)
+            result[full_key] = value.join(' | ')
+          else
+            result[full_key] = value
+          end
+        end
+      end
+    end
+  end
+end

data/lib/i18n_context_generator/parsers/json_parser.rb ADDED Viewed

@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+module I18nContextGenerator
+  module Parsers
+    # Parses flat or nested JSON translation files into TranslationEntry objects.
+    class JsonParser < Base
+      def parse(path)
+        data = Oj.load_file(path)
+        flatten_keys(data).filter_map do |key, text|
+          next if text.nil? || text.to_s.strip.empty?
+          TranslationEntry.new(
+            key: key,
+            text: text.to_s,
+            source_file: path
+          )
+        end
+      end
+    end
+  end
+end

data/lib/i18n_context_generator/parsers/strings_parser.rb ADDED Viewed

@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+module I18nContextGenerator
+  module Parsers
+    # Reads Apple .strings files (iOS/macOS) through the dotstrings gem, which handles
+    # multi-line comments, Unicode and escaped characters, and reports parse errors.
+    class StringsParser < Base
+      # Returns one TranslationEntry per item in the file, carrying the item's comment as
+      # metadata. Parsing is non-strict so edge cases are tolerated; a
+      # DotStrings::ParsingError is re-raised as Error with the file path in the message.
+      def parse(path)
+        DotStrings.parse_file(path, strict: false).items.map do |entry|
+          TranslationEntry.new(
+            key: entry.key,
+            text: entry.value,
+            source_file: path,
+            metadata: { comment: entry.comment }
+          )
+        end
+      rescue DotStrings::ParsingError => e
+        raise Error, "Failed to parse .strings file #{path}: #{e.message}"
+      end
+    end
+  end
+end

data/lib/i18n_context_generator/parsers/yaml_parser.rb ADDED Viewed

@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+module I18nContextGenerator
+  module Parsers
+    # Parses YAML translation files (including Rails i18n style) into TranslationEntry objects.
+    class YamlParser < Base
+      def parse(path)
+        data = YAML.safe_load_file(path, permitted_classes: [])
+        # Skip top-level locale key if present (Rails i18n style)
+        # e.g., { "en" => { "hello" => "Hello" } } -> { "hello" => "Hello" }
+        if data.is_a?(Hash) && data.keys.size == 1 && data.values.first.is_a?(Hash)
+          locale_key = data.keys.first
+          # Only skip if it looks like a locale code (2-5 chars)
+          data = data.values.first if locale_key.match?(/\A[a-z]{2}(-[A-Z]{2})?\z/i)
+        end
+        flatten_keys(data).filter_map do |key, text|
+          next if text.nil? || text.to_s.strip.empty?
+          TranslationEntry.new(
+            key: key,
+            text: text.to_s,
+            source_file: path
+          )
+        end
+      end
+    end
+  end
+end