RubyGems - llm-docs-builder - Versions diffs - 0.3.0 - Mend

llm-docs-builder 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

checksums.yaml +7 -0
data/.dockerignore +44 -0
data/.github/workflows/ci.yml +71 -0
data/.github/workflows/docker.yml +102 -0
data/.github/workflows/push.yml +35 -0
data/.gitignore +66 -0
data/.rubocop.yml +74 -0
data/.ruby-version +1 -0
data/CHANGELOG.md +66 -0
data/CLAUDE.md +178 -0
data/Dockerfile +64 -0
data/Gemfile +10 -0
data/Gemfile.lock +88 -0
data/LICENSE +21 -0
data/README.md +684 -0
data/Rakefile +10 -0
data/bin/llm-docs-builder +7 -0
data/bin/rspecs +7 -0
data/lib/llm_docs_builder/bulk_transformer.rb +135 -0
data/lib/llm_docs_builder/cli.rb +434 -0
data/lib/llm_docs_builder/comparator.rb +238 -0
data/lib/llm_docs_builder/config.rb +116 -0
data/lib/llm_docs_builder/errors.rb +31 -0
data/lib/llm_docs_builder/generator.rb +234 -0
data/lib/llm_docs_builder/markdown_transformer.rb +90 -0
data/lib/llm_docs_builder/parser.rb +223 -0
data/lib/llm_docs_builder/validator.rb +216 -0
data/lib/llm_docs_builder/version.rb +6 -0
data/lib/llm_docs_builder.rb +130 -0
data/llm-docs-builder.gemspec +45 -0
data/llm-docs-builder.yml +7 -0
data/llm-docs-builder.yml.example +26 -0
data/renovate.json +33 -0
metadata +171 -0

data/lib/llm_docs_builder/comparator.rb ADDED Viewed

@@ -0,0 +1,238 @@
+# frozen_string_literal: true
+require 'net/http'
+require 'uri'
+module LlmDocsBuilder
+  # Compares content sizes between human and AI versions
+  #
+  # Helps quantify context window savings by comparing:
+  # - Remote URL with different User-Agents (human vs AI bot)
+  # - Remote URL with local markdown file
+  #
+  # @example Compare remote versions
+  #   comparator = LlmDocsBuilder::Comparator.new('https://example.com/docs/page.html')
+  #   result = comparator.compare
+  #   puts "Reduction: #{result[:reduction_percent]}%"
+  #
+  # @example Compare remote with local file
+  #   comparator = LlmDocsBuilder::Comparator.new('https://example.com/docs/page.html',
+  #     local_file: 'docs/page.md'
+  #   )
+  #   result = comparator.compare
+  #
+  # @api public
+  class Comparator
+    # Default User-Agent for simulating human browser
+    HUMAN_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0'
+    # Default User-Agent for simulating AI bot
+    AI_USER_AGENT = 'Claude-Web/1.0 (Anthropic AI Assistant)'
+    # Maximum number of redirects to follow before raising an error
+    MAX_REDIRECTS = 10
+    # @return [String] URL to compare
+    attr_reader :url
+    # @return [Hash] comparison options
+    attr_reader :options
+    # Initialize a new comparator
+    #
+    # @param url [String] URL to fetch and compare
+    # @param options [Hash] comparison options
+    # @option options [String] :local_file path to local markdown file for comparison
+    # @option options [String] :human_user_agent custom User-Agent for human version
+    # @option options [String] :ai_user_agent custom User-Agent for AI version
+    # @option options [Boolean] :verbose enable verbose output
+    def initialize(url, options = {})
+      @url = url
+      @options = {
+        human_user_agent: HUMAN_USER_AGENT,
+        ai_user_agent: AI_USER_AGENT
+      }.merge(options)
+    end
+    # Compare content sizes and calculate reduction
+    #
+    # @return [Hash] comparison results with keys:
+    #   - :human_size [Integer] size of human version in bytes
+    #   - :ai_size [Integer] size of AI version in bytes
+    #   - :reduction_bytes [Integer] bytes saved
+    #   - :reduction_percent [Integer] percentage reduction
+    #   - :factor [Float] compression factor
+    #   - :human_source [String] source description (URL or file)
+    #   - :ai_source [String] source description (URL or file)
+    def compare
+      if options[:local_file]
+        compare_with_local_file
+      else
+        compare_remote_versions
+      end
+    end
+    private
+    # Compare remote URL (human User-Agent) with remote URL (AI User-Agent)
+    #
+    # @return [Hash] comparison results
+    def compare_remote_versions
+      puts "Fetching human version from #{url}..." if options[:verbose]
+      human_content = fetch_url(url, options[:human_user_agent])
+      puts "Fetching AI version from #{url}..." if options[:verbose]
+      ai_content = fetch_url(url, options[:ai_user_agent])
+      calculate_results(
+        human_content.bytesize,
+        ai_content.bytesize,
+        "#{url} (User-Agent: human)",
+        "#{url} (User-Agent: AI)"
+      )
+    end
+    # Compare remote URL (human User-Agent) with local markdown file
+    #
+    # @return [Hash] comparison results
+    def compare_with_local_file
+      local_file = options[:local_file]
+      unless File.exist?(local_file)
+        raise(
+          Errors::GenerationError,
+          "Local file not found: #{local_file}"
+        )
+      end
+      puts "Fetching human version from #{url}..." if options[:verbose]
+      human_content = fetch_url(url, options[:human_user_agent])
+      puts "Reading local file #{local_file}..." if options[:verbose]
+      ai_content = File.read(local_file)
+      calculate_results(
+        human_content.bytesize,
+        ai_content.bytesize,
+        url,
+        local_file
+      )
+    end
+    # Fetch URL content with specified User-Agent
+    #
+    # Follows redirects (up to MAX_REDIRECTS) and handles HTTPS
+    #
+    # @param url_string [String] URL to fetch
+    # @param user_agent [String] User-Agent header value
+    # @param redirect_count [Integer] current redirect depth (internal use)
+    # @return [String] response body
+    # @raise [Errors::GenerationError] if fetch fails or too many redirects
+    def fetch_url(url_string, user_agent, redirect_count = 0)
+      if redirect_count >= MAX_REDIRECTS
+        raise(
+          Errors::GenerationError,
+          "Too many redirects (#{MAX_REDIRECTS}) when fetching #{url_string}"
+        )
+      end
+      uri = validate_and_parse_url(url_string)
+      http = Net::HTTP.new(uri.host, uri.port)
+      http.use_ssl = uri.scheme == 'https'
+      http.open_timeout = 10
+      http.read_timeout = 30
+      request = Net::HTTP::Get.new(uri.request_uri)
+      request['User-Agent'] = user_agent
+      response = http.request(request)
+      case response
+      when Net::HTTPSuccess
+        response.body
+      when Net::HTTPRedirection
+        # Follow redirect with incremented counter
+        redirect_url = response['location']
+        puts "  Redirecting to #{redirect_url}..." if options[:verbose] && redirect_count.positive?
+        fetch_url(redirect_url, user_agent, redirect_count + 1)
+      else
+        raise(
+          Errors::GenerationError,
+          "Failed to fetch #{url_string}: #{response.code} #{response.message}"
+        )
+      end
+    rescue Errors::GenerationError
+      raise
+    rescue StandardError => e
+      raise(
+        Errors::GenerationError,
+        "Error fetching #{url_string}: #{e.message}"
+      )
+    end
+    # Validates and parses URL to prevent malformed URLs
+    #
+    # @param url_string [String] URL to validate and parse
+    # @return [URI::HTTP, URI::HTTPS] parsed URI
+    # @raise [Errors::GenerationError] if URL is invalid or uses unsupported scheme
+    def validate_and_parse_url(url_string)
+      uri = URI.parse(url_string)
+      # Only allow HTTP and HTTPS schemes
+      unless %w[http https].include?(uri.scheme&.downcase)
+        raise(
+          Errors::GenerationError,
+          "Unsupported URL scheme: #{uri.scheme || 'none'} (only http/https allowed)"
+        )
+      end
+      # Ensure host is present
+      if uri.host.nil? || uri.host.empty?
+        raise(
+          Errors::GenerationError,
+          "Invalid URL: missing host in #{url_string}"
+        )
+      end
+      uri
+    rescue URI::InvalidURIError => e
+      raise(
+        Errors::GenerationError,
+        "Invalid URL format: #{e.message}"
+      )
+    end
+    # Calculate comparison statistics
+    #
+    # @param human_size [Integer] size of human version in bytes
+    # @param ai_size [Integer] size of AI version in bytes
+    # @param human_source [String] description of human source
+    # @param ai_source [String] description of AI source
+    # @return [Hash] comparison results
+    def calculate_results(human_size, ai_size, human_source, ai_source)
+      reduction_bytes = human_size - ai_size
+      reduction_percent = if human_size.positive?
+                            ((reduction_bytes.to_f / human_size) * 100).round
+                          else
+                            0
+                          end
+      factor = if ai_size.positive?
+                 (human_size.to_f / ai_size).round(1)
+               else
+                 Float::INFINITY
+               end
+      {
+        human_size: human_size,
+        ai_size: ai_size,
+        reduction_bytes: reduction_bytes,
+        reduction_percent: reduction_percent,
+        factor: factor,
+        human_source: human_source,
+        ai_source: ai_source
+      }
+    end
+  end
+end

data/lib/llm_docs_builder/config.rb ADDED Viewed

@@ -0,0 +1,116 @@
+# frozen_string_literal: true
+require 'yaml'
+module LlmDocsBuilder
+  # Simple configuration loader for llm-docs-builder.yml files
+  #
+  # Loads YAML configuration files and provides a simple interface for accessing configuration
+  # values. Automatically looks for config files in the current directory if none specified.
+  #
+  # @example Load default config file
+  #   config = LlmDocsBuilder::Config.new
+  #
+  # @example Load specific config file
+  #   config = LlmDocsBuilder::Config.new('my-config.yml')
+  #
+  # @example Access config values
+  #   config['base_url']        # => "https://myproject.io"
+  #   config.dig('output')      # => "llms.txt"
+  #
+  # @api public
+  class Config
+    # @return [Hash] the loaded configuration data
+    attr_reader :data
+    # Initialize a new configuration loader
+    #
+    # @param config_file [String, nil] path to YAML config file, or nil to auto-find
+    def initialize(config_file = nil)
+      @config_file = config_file || find_config_file
+      @data = load_config
+    end
+    # Access configuration value by key
+    #
+    # @param key [String, Symbol] configuration key
+    # @return [Object, nil] configuration value or nil if not found
+    def [](key)
+      data[key.to_s]
+    end
+    # Access nested configuration values
+    #
+    # @param keys [Array<String, Symbol>] nested keys to access
+    # @return [Object, nil] configuration value or nil if not found
+    def dig(*keys)
+      data.dig(*keys.map(&:to_s))
+    end
+    # Merge config file values with CLI options
+    #
+    # CLI options take precedence over config file values. Config file provides
+    # defaults for any options not specified via CLI.
+    #
+    # @param options [Hash] CLI options hash
+    # @return [Hash] merged configuration with CLI overrides applied
+    def merge_with_options(options)
+      # CLI options override config file, config file provides defaults
+      {
+        docs: options[:docs] || self['docs'] || '.',
+        base_url: options[:base_url] || self['base_url'],
+        title: options[:title] || self['title'],
+        description: options[:description] || self['description'],
+        output: options[:output] || self['output'] || 'llms.txt',
+        convert_urls: if options.key?(:convert_urls)
+                        options[:convert_urls]
+                      else
+                        self['convert_urls'] || false
+                      end,
+        verbose: options.key?(:verbose) ? options[:verbose] : (self['verbose'] || false),
+        # Bulk transformation options
+        suffix: options[:suffix] || self['suffix'] || '.llm',
+        excludes: options[:excludes] || self['excludes'] || [],
+        bulk: options.key?(:bulk) ? options[:bulk] : (self['bulk'] || false)
+      }
+    end
+    # Check if a config file was found and exists
+    #
+    # @return [Boolean] true if config file exists, false otherwise
+    def exists?
+      @config_file && File.exist?(@config_file)
+    end
+    private
+    # Find config file in current directory
+    #
+    # Looks for config files in order of preference:
+    # 1. llm-docs-builder.yml
+    # 2. llm-docs-builder.yaml
+    # 3. .llm-docs-builder.yml
+    #
+    # @return [String, nil] path to config file or nil if none found
+    def find_config_file
+      candidates = ['llm-docs-builder.yml', 'llm-docs-builder.yaml', '.llm-docs-builder.yml']
+      candidates.find { |file| File.exist?(file) }
+    end
+    # Load and parse YAML config file
+    #
+    # @return [Hash] parsed config data, empty hash if no file
+    # @raise [Errors::GenerationError] if YAML is invalid or file cannot be read
+    def load_config
+      return {} unless @config_file && File.exist?(@config_file)
+      begin
+        YAML.load_file(@config_file) || {}
+      rescue Psych::SyntaxError => e
+        raise Errors::GenerationError, "Invalid YAML in config file #{@config_file}: #{e.message}"
+      rescue StandardError => e
+        raise Errors::GenerationError, "Failed to load config file #{@config_file}: #{e.message}"
+      end
+    end
+  end
+end

data/lib/llm_docs_builder/errors.rb ADDED Viewed

@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+module LlmDocsBuilder
+  # Namespace used to encapsulate all the internal errors of LlmDocsBuilder
+  module Errors
+    # Base class for all the LlmDocsBuilder internal errors
+    BaseError = Class.new(StandardError)
+    # Raised when llms.txt generation fails due to configuration issues,
+    # missing directories, invalid YAML, or file access problems
+    #
+    # @example When directory doesn't exist
+    #   LlmDocsBuilder.bulk_transform('/nonexistent/path')
+    #   # => raises GenerationError: "Directory not found: /nonexistent/path"
+    #
+    # @example When config YAML is invalid
+    #   LlmDocsBuilder.generate_from_docs(config_file: 'invalid.yml')
+    #   # => raises GenerationError: "Invalid YAML in config file..."
+    GenerationError = Class.new(BaseError)
+    # Raised when llms.txt content validation fails
+    #
+    # This error is intended for validation failures but currently not used.
+    # The Validator class returns boolean results instead of raising errors.
+    #
+    # @example Future usage (when validation raises)
+    #   LlmDocsBuilder.validate!(invalid_content)
+    #   # => raises ValidationError: "Missing required H1 title"
+    ValidationError = Class.new(BaseError)
+  end
+end

data/lib/llm_docs_builder/generator.rb ADDED Viewed

@@ -0,0 +1,234 @@
+# frozen_string_literal: true
+module LlmDocsBuilder
+  # Simple generator that creates llms.txt from existing markdown documentation
+  #
+  # Takes a documentation directory or file and generates a properly formatted llms.txt file by
+  # analyzing markdown files, extracting titles and descriptions, and organizing them by priority.
+  #
+  # @example Generate from docs directory
+  #   generator = LlmDocsBuilder::Generator.new('./docs', base_url: 'https://myproject.io')
+  #   content = generator.generate
+  #
+  # @api public
+  class Generator
+    # @return [String] path to documentation directory or file
+    attr_reader :docs_path
+    # @return [Hash] generation options
+    attr_reader :options
+    # Initialize a new generator
+    #
+    # @param docs_path [String] path to documentation directory or file
+    # @param options [Hash] generation options
+    # @option options [String] :base_url base URL for expanding relative links
+    # @option options [String] :title project title (overrides auto-detection)
+    # @option options [String] :description project description (overrides auto-detection)
+    # @option options [String] :output output file path for saving
+    # @option options [Boolean] :verbose enable verbose output
+    def initialize(docs_path, options = {})
+      @docs_path = docs_path
+      @options = options
+    end
+    # Generate llms.txt content from documentation
+    #
+    # Scans documentation files, extracts metadata, prioritizes them, and builds a formatted
+    # llms.txt file.
+    #
+    # @return [String] generated llms.txt content
+    def generate
+      docs = find_documentation_files
+      content = build_llms_txt(docs)
+      if (output_path = options[:output])
+        File.write(output_path, content)
+      end
+      content
+    end
+    private
+    # Locates and analyzes documentation files from docs_path
+    #
+    # Handles both single file and directory paths
+    #
+    # @return [Array<Hash>] array of analyzed file metadata
+    def find_documentation_files
+      return [] unless File.exist?(docs_path)
+      if File.file?(docs_path)
+        [analyze_file(docs_path)]
+      else
+        find_markdown_files_in_directory
+      end
+    end
+    # Recursively finds and analyzes markdown files in directory
+    #
+    # Sorts by priority (README, guides, etc.) and skips hidden files
+    #
+    # @return [Array<Hash>] sorted array of analyzed file metadata
+    def find_markdown_files_in_directory
+      files = []
+      Find.find(docs_path) do |path|
+        next unless File.file?(path)
+        next unless path.match?(/\.md$/i)
+        next if File.basename(path).start_with?('.')
+        files << analyze_file(path)
+      end
+      files.sort_by { |f| f[:priority] }
+    end
+    # Extracts metadata from a documentation file
+    #
+    # Analyzes file content to extract title, description, and priority
+    #
+    # @param file_path [String] path to file to analyze
+    # @return [Hash] file metadata with :path, :title, :description, :priority
+    def analyze_file(file_path)
+      # Handle single file case differently
+      relative_path = if File.file?(docs_path)
+                        File.basename(file_path)
+                      else
+                        Pathname.new(file_path).relative_path_from(Pathname.new(docs_path)).to_s
+                      end
+      content = File.read(file_path)
+      {
+        path: relative_path,
+        title: extract_title(content, file_path),
+        description: extract_description(content),
+        priority: calculate_priority(file_path)
+      }
+    end
+    # Extracts title from file content or generates from filename
+    #
+    # Prefers first H1 header, falls back to formatted filename
+    #
+    # @param content [String] file content
+    # @param file_path [String] path to file
+    # @return [String] extracted or generated title
+    def extract_title(content, file_path)
+      # Try to extract title from first # header
+      if content.match(/^#\s+(.+)/)
+        ::Regexp.last_match(1).strip
+      else
+        # Use filename as fallback
+        File.basename(file_path, '.md').gsub(/[_-]/, ' ').split.map(&:capitalize).join(' ')
+      end
+    end
+    # Extracts description from file content
+    #
+    # Takes first paragraph after title, truncated to 200 characters
+    #
+    # @param content [String] file content
+    # @return [String] extracted description
+    def extract_description(content)
+      lines = content.lines
+      # Skip title line and empty lines
+      description_lines = lines.drop_while { |line| line.start_with?('#') || line.strip.empty? }
+      # Get first paragraph
+      first_paragraph = description_lines.take_while { |line| !line.strip.empty? }
+      first_paragraph.join(' ').strip.slice(0, 200)
+    end
+    # Assigns priority to file based on filename patterns
+    #
+    # README gets highest priority, followed by guides, tutorials, API docs
+    #
+    # @param file_path [String] path to file
+    # @return [Integer] priority value (1-7, lower is higher priority)
+    def calculate_priority(file_path)
+      basename = File.basename(file_path).downcase
+      return 1 if basename.start_with?('readme')
+      return 2 if basename.include?('getting')
+      return 3 if basename.include?('guide')
+      return 4 if basename.include?('tutorial')
+      return 5 if basename.include?('api')
+      return 6 if basename.include?('reference')
+      7 # default priority
+    end
+    # Constructs llms.txt content from analyzed documentation files
+    #
+    # Combines title, description, and documentation links into formatted output
+    #
+    # @param docs [Array<Hash>] analyzed file metadata
+    # @return [String] formatted llms.txt content
+    def build_llms_txt(docs)
+      title = options[:title] || detect_project_title(docs)
+      description = options[:description] || detect_project_description(docs)
+      content = []
+      content << "# #{title}"
+      content << ''
+      content << "> #{description}" if description
+      content << ''
+      if docs.any?
+        content << '## Documentation'
+        content << ''
+        docs.each do |doc|
+          url = build_url(doc[:path])
+          content << if doc[:description] && !doc[:description].empty?
+                       "- [#{doc[:title]}](#{url}): #{doc[:description]}"
+                     else
+                       "- [#{doc[:title]}](#{url})"
+                     end
+        end
+      end
+      "#{content.join("\n")}\n"
+    end
+    # Attempts to detect project title from README or directory name
+    #
+    # @param docs [Array<Hash>] analyzed file metadata
+    # @return [String] detected project title
+    def detect_project_title(docs)
+      readme = docs.find { |doc| doc[:path].downcase.include?('readme') }
+      return readme[:title] if readme
+      File.basename(File.expand_path('.'))
+    end
+    # Attempts to extract project description from README
+    #
+    # @param docs [Array<Hash>] analyzed file metadata
+    # @return [String, nil] detected project description or nil
+    def detect_project_description(docs)
+      readme = docs.find { |doc| doc[:path].downcase.include?('readme') }
+      return readme[:description] if readme&.fetch(:description, nil)
+      nil
+    end
+    # Constructs full URL from path using base_url option if provided
+    #
+    # @param path [String] relative path to file
+    # @return [String] full URL or relative path
+    def build_url(path)
+      if (base_url = options[:base_url])
+        File.join(base_url, path)
+      else
+        path
+      end
+    end
+  end
+end