RubyGems - llm-docs-builder - Versions diffs - 0.3.0 - Mend

llm-docs-builder 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

checksums.yaml +7 -0
data/.dockerignore +44 -0
data/.github/workflows/ci.yml +71 -0
data/.github/workflows/docker.yml +102 -0
data/.github/workflows/push.yml +35 -0
data/.gitignore +66 -0
data/.rubocop.yml +74 -0
data/.ruby-version +1 -0
data/CHANGELOG.md +66 -0
data/CLAUDE.md +178 -0
data/Dockerfile +64 -0
data/Gemfile +10 -0
data/Gemfile.lock +88 -0
data/LICENSE +21 -0
data/README.md +684 -0
data/Rakefile +10 -0
data/bin/llm-docs-builder +7 -0
data/bin/rspecs +7 -0
data/lib/llm_docs_builder/bulk_transformer.rb +135 -0
data/lib/llm_docs_builder/cli.rb +434 -0
data/lib/llm_docs_builder/comparator.rb +238 -0
data/lib/llm_docs_builder/config.rb +116 -0
data/lib/llm_docs_builder/errors.rb +31 -0
data/lib/llm_docs_builder/generator.rb +234 -0
data/lib/llm_docs_builder/markdown_transformer.rb +90 -0
data/lib/llm_docs_builder/parser.rb +223 -0
data/lib/llm_docs_builder/validator.rb +216 -0
data/lib/llm_docs_builder/version.rb +6 -0
data/lib/llm_docs_builder.rb +130 -0
data/llm-docs-builder.gemspec +45 -0
data/llm-docs-builder.yml +7 -0
data/llm-docs-builder.yml.example +26 -0
data/renovate.json +33 -0
metadata +171 -0

data/lib/llm_docs_builder/markdown_transformer.rb ADDED Viewed

@@ -0,0 +1,90 @@
+# frozen_string_literal: true
module LlmDocsBuilder
  # Transforms markdown files to be AI-friendly
  #
  # Processes individual markdown files to make them more suitable for LLM consumption by
  # expanding relative links to absolute URLs and converting HTML URLs to markdown-friendly
  # formats.
  #
  # @example Transform with base URL
  #   transformer = LlmDocsBuilder::MarkdownTransformer.new('README.md',
  #     base_url: 'https://myproject.io'
  #   )
  #   content = transformer.transform
  #
  # @api public
  class MarkdownTransformer
    # Inline markdown link `[text](target)`; captures the text and the target
    INLINE_LINK = /\[([^\]]+)\]\(([^)]+)\)/

    # Targets that must not be prefixed with the base URL: anything carrying a URI scheme
    # (`https:`, `mailto:`, `tel:`, `ftp:`, ...), protocol-relative URLs (`//host`) and
    # in-page anchors (`#section`)
    NON_RELATIVE_TARGET = %r{\A(?:[a-z][a-z0-9+.-]*:|//|#)}i

    # Absolute http(s) URL ending in `.html`/`.htm`, followed by `)`, whitespace or end of line
    HTML_URL = %r{https?://[^\s<>]+\.html?(?=[)\s]|$)}

    private_constant :INLINE_LINK, :NON_RELATIVE_TARGET, :HTML_URL

    # @return [String] path to markdown file
    attr_reader :file_path

    # @return [Hash] transformation options
    attr_reader :options

    # Initialize a new markdown transformer
    #
    # @param file_path [String] path to markdown file to transform
    # @param options [Hash] transformation options
    # @option options [String] :base_url base URL for expanding relative links
    # @option options [Boolean] :convert_urls convert HTML URLs to markdown format
    def initialize(file_path, options = {})
      @file_path = file_path
      @options = options
    end

    # Transform markdown content to be AI-friendly
    #
    # Reads the file and applies, in this order:
    # - relative link expansion (only when :base_url is set)
    # - `.html`/`.htm` to `.md` URL conversion (only when :convert_urls is truthy)
    #
    # @return [String] transformed markdown content
    def transform
      content = File.read(file_path)
      content = expand_relative_links(content) if options[:base_url]
      content = convert_html_urls(content) if options[:convert_urls]
      content
    end

    private

    # Expand relative links to absolute URLs
    #
    # Converts markdown links like `[text](./path.md)` to `[text](https://base.url/path.md)`.
    # Links that already carry a scheme, are protocol-relative, or are pure anchors are
    # returned untouched, so `mailto:` or `tel:` targets are never glued onto the base URL.
    #
    # @param content [String] markdown content to process
    # @return [String] content with expanded links
    def expand_relative_links(content)
      base_url = options[:base_url]

      content.gsub(INLINE_LINK) do |link|
        text = ::Regexp.last_match(1)
        target = ::Regexp.last_match(2)
        next link if target.match?(NON_RELATIVE_TARGET)

        # Drop a leading './' so the joined URL does not contain a '/./' segment
        "[#{text}](#{join_url(base_url, target.sub(%r{\A\./}, ''))})"
      end
    end

    # Joins base URL and path with exactly one slash between them
    #
    # @param base [String] base URL, with or without trailing slash
    # @param path [String] relative or root-relative path
    # @return [String] absolute URL
    def join_url(base, path)
      "#{base.to_s.sub(%r{/+\z}, '')}/#{path.sub(%r{\A/+}, '')}"
    end

    # Convert HTML URLs to markdown-friendly format
    #
    # Changes absolute http(s) URLs ending in .html or .htm so they end in .md
    #
    # @param content [String] markdown content to process
    # @return [String] content with converted URLs
    def convert_html_urls(content)
      content.gsub(HTML_URL) { |url| url.sub(/\.html?\z/, '.md') }
    end
  end
end

data/lib/llm_docs_builder/parser.rb ADDED Viewed

@@ -0,0 +1,223 @@
+# frozen_string_literal: true
module LlmDocsBuilder
  # Parses llms.txt files into structured data
  #
  # Reads and parses llms.txt files according to the llms.txt specification,
  # extracting the title, description, and structured sections (Documentation,
  # Examples, Optional) with their links.
  #
  # @example Parse an llms.txt file
  #   parser = LlmDocsBuilder::Parser.new('llms.txt')
  #   parsed = parser.parse
  #   parsed.title              # => "My Project"
  #   parsed.description        # => "Project description"
  #   parsed.documentation_links # => [{title: "README", url: "...", description: "..."}]
  #
  # @api public
  class Parser
    # A single list item holding a link plus an optional `: description` tail.
    # Deliberately NOT multiline (`/m`): with `/m` the dot matches newlines and the greedy
    # description capture would swallow every following list item.
    LINK_ITEM = /^[-*][ \t]*\[([^\]]+)\]\(([^)]+)\)(?::[ \t]*(.*))?[ \t\r]*$/
    private_constant :LINK_ITEM

    # @return [String] path to the llms.txt file
    attr_reader :file_path

    # @return [String] raw content of the llms.txt file
    attr_reader :content

    # Initialize a new parser
    #
    # Reads the file eagerly, so a missing file raises here rather than in {#parse}.
    #
    # @param file_path [String] path to the llms.txt file to parse
    def initialize(file_path)
      @file_path = file_path
      @content = File.read(file_path)
    end

    # Parse the llms.txt file
    #
    # Walks the content line by line: the first H1 becomes the title, the first blockquote
    # after the title becomes the description, every H2 opens a new section (keyed by its
    # downcased, underscored name) and non-blank lines are collected into the open section.
    #
    # @return [ParsedContent] parsed content with title, description, and sections
    def parse
      sections = {}
      current_section = nil
      current_content = []

      content.each_line do |line|
        if line.start_with?('# ')
          save_section(sections, current_section, current_content) if current_section
          sections[:title] = line[2..].strip if sections.empty?
          current_content = []
        elsif line.start_with?('> ') && sections[:title] && !sections[:description]
          sections[:description] = line[2..].strip
        elsif line.start_with?('## ')
          save_section(sections, current_section, current_content) if current_section
          current_section = line[3..].strip.downcase.gsub(/\s+/, '_').to_sym
          current_content = []
        elsif !line.strip.empty?
          current_content << line
        end
      end

      save_section(sections, current_section, current_content) if current_section
      ParsedContent.new(sections)
    end

    private

    # Parses and stores section content in the sections hash
    #
    # Sections without any collected lines are skipped.
    #
    # @param sections [Hash] accumulator hash for sections
    # @param section_name [Symbol] name of the section
    # @param content [Array<String>] raw content lines
    def save_section(sections, section_name, content)
      return if content.empty?

      sections[section_name] = parse_section_content(content.join)
    end

    # Extracts markdown links from section content into structured format
    #
    # Every list item of the form `- [title](url)` or `- [title](url): description` yields
    # one hash; a missing description becomes an empty string. Returns the stripped raw
    # content if no list item matches.
    #
    # @param content [String] raw section content
    # @return [Array<Hash>, String] array of link hashes or raw content if no links found
    def parse_section_content(content)
      links = content.scan(LINK_ITEM).map do |title, url, description|
        { title: title, url: url, description: description.to_s.strip }
      end

      links.empty? ? content.strip : links
    end
  end

  # Represents parsed llms.txt content with structured access to sections
  #
  # Provides convenient access to parsed llms.txt sections including title,
  # description, and link collections. Can be converted to Hash or XML formats.
  #
  # @example Access parsed content
  #   parsed.title              # => "My Project"
  #   parsed.description        # => "A description"
  #   parsed.documentation_links # => [{title: "...", url: "...", description: "..."}]
  #   parsed.to_h               # => Hash representation
  #   parsed.to_xml             # => XML string
  #
  # @api public
  class ParsedContent
    # Characters that are not allowed verbatim in XML text, with their entity replacements
    XML_ESCAPES = {
      '&' => '&amp;',
      '<' => '&lt;',
      '>' => '&gt;',
      '"' => '&quot;',
      "'" => '&apos;'
    }.freeze
    private_constant :XML_ESCAPES

    # @return [Hash] the parsed sections hash
    attr_reader :sections

    # Initialize parsed content
    #
    # @param sections [Hash] hash containing parsed sections (:title, :description, :documentation, etc.)
    def initialize(sections)
      @sections = sections
    end

    # Get the project title
    #
    # @return [String, nil] the H1 title or nil if not present
    def title
      sections[:title]
    end

    # Get the project description
    #
    # @return [String, nil] the description blockquote or nil if not present
    def description
      sections[:description]
    end

    # Get documentation links
    #
    # @return [Array<Hash>, String] documentation links with :title, :url, and :description,
    #   or the raw section text when the section contained no link list
    def documentation_links
      sections[:documentation] || []
    end

    # Get example links
    #
    # @return [Array<Hash>, String] example links with :title, :url, and :description,
    #   or the raw section text when the section contained no link list
    def example_links
      sections[:examples] || []
    end

    # Get optional links
    #
    # @return [Array<Hash>, String] optional links with :title, :url, and :description,
    #   or the raw section text when the section contained no link list
    def optional_links
      sections[:optional] || []
    end

    # Convert to hash representation
    #
    # @return [Hash] hash containing all parsed sections
    def to_h
      sections
    end

    # Convert to XML representation
    #
    # Generates an XML document with the title, description and the documentation,
    # examples and optional sections. All text is entity-escaped so characters such as
    # `&` in query strings do not produce malformed XML.
    #
    # @return [String] XML string representation
    def to_xml
      builder = ['<?xml version="1.0" encoding="UTF-8"?>', '<llms_context>']
      builder << "  <title>#{escape_xml(title)}</title>" if title
      builder << "  <description>#{escape_xml(description)}</description>" if description
      add_xml_section(builder, 'documentation', documentation_links)
      add_xml_section(builder, 'examples', example_links)
      add_xml_section(builder, 'optional', optional_links)
      builder << '</llms_context>'
      builder.join("\n")
    end

    private

    # Appends section XML elements to builder array
    #
    # Handles both array of link hashes and raw string content; empty sections add nothing.
    #
    # @param builder [Array<String>] XML lines accumulator
    # @param name [String] section name
    # @param links [Array<Hash>, String] section links or content
    def add_xml_section(builder, name, links)
      return if links.empty?

      builder << "  <#{name}>"
      if links.is_a?(Array)
        links.each do |link|
          builder << '    <link>'
          builder << "      <title>#{escape_xml(link[:title])}</title>"
          builder << "      <url>#{escape_xml(link[:url])}</url>"
          builder << "      <description>#{escape_xml(link[:description])}</description>"
          builder << '    </link>'
        end
      else
        builder << "    #{escape_xml(links)}"
      end
      builder << "  </#{name}>"
    end

    # Replaces XML-significant characters with their entities
    #
    # @param text [#to_s] value to embed in an XML text node
    # @return [String] escaped text
    def escape_xml(text)
      text.to_s.gsub(/[&<>"']/, XML_ESCAPES)
    end
  end
end

data/lib/llm_docs_builder/validator.rb ADDED Viewed

@@ -0,0 +1,216 @@
+# frozen_string_literal: true
module LlmDocsBuilder
  # Validates llms.txt content against the llms.txt specification
  #
  # Ensures that llms.txt content follows proper formatting rules including:
  # - Required H1 title header
  # - Optional description blockquote
  # - Proper section ordering (Documentation, Examples, Optional)
  # - Valid markdown syntax and link formats
  # - File size and line length limits
  #
  # @example Validate llms.txt content
  #   validator = LlmDocsBuilder::Validator.new(content)
  #   validator.valid? # => true or false
  #   validator.errors # => Array of error messages
  #
  # @api public
  class Validator
    # @return [String] the llms.txt content being validated
    attr_reader :content

    # @return [Array<String>] array of validation error messages
    attr_reader :errors

    # Required sections that must appear in llms.txt
    REQUIRED_SECTIONS = ['# '].freeze

    # Optional sections that may appear in llms.txt
    OPTIONAL_SECTIONS = ['> ', '## Documentation', '## Examples', '## Optional'].freeze

    # Maximum length for a single line in characters
    MAX_LINE_LENGTH = 120

    # Maximum file size in bytes
    MAX_FILE_SIZE = 50_000

    # Maximum length of the H1 title text (without the leading "# ")
    MAX_TITLE_LENGTH = 80

    # Maximum length of the description text (without the leading "> ")
    MAX_DESCRIPTION_LENGTH = 200

    # Known sections in the order they must appear
    SECTION_ORDER = %w[Documentation Examples Optional].freeze

    # Any inline markdown link, including ones with empty text or URL
    ANY_LINK = /\[([^\]]*)\]\(([^)]*)\)/

    # Inline markdown link with non-empty text and URL
    FILLED_LINK = /\[([^\]]+)\]\(([^)]+)\)/

    # Accepted URL beginnings: HTTP(S) URLs, absolute paths, ./ and ../ paths, and plain
    # relative names. Built once here instead of on every scanned link.
    URL_PATTERN = %r{
      \A(?:
        https?://|
        /|
        \.\.?/|
        [a-zA-Z0-9_.-]+(?:/|\.md|\.txt|\.rb|\.html)?|
        [A-Z]+[a-zA-Z]*|
        docs/|
        examples/|
        lib/
      )
    }x

    # Header line: hashes, horizontal whitespace, then the (possibly empty) text.
    # Uses [ \t] rather than \s so the match can never run into the following line.
    HEADER_LINE = /\A(#+)[ \t]+(.*)$/

    private_constant :SECTION_ORDER, :ANY_LINK, :FILLED_LINK, :URL_PATTERN, :HEADER_LINE

    # Initialize a new validator
    #
    # @param content [String] the llms.txt content to validate
    def initialize(content)
      @content = content
      @errors = []
    end

    # Check if content is valid
    #
    # Runs all validation checks and returns whether the content is valid.
    # Use {#errors} to access validation error messages.
    #
    # @return [Boolean] true if content is valid, false otherwise
    def valid?
      validate!
    end

    # Validate content and return result
    #
    # Clears previous results, runs all validation checks, populates {#errors} and returns
    # whether the content is valid. Does not raise on invalid content.
    #
    # @return [Boolean] true if content is valid, false otherwise
    def validate!
      @errors = []
      validate_required_sections
      validate_structure
      validate_markdown_syntax
      validate_links
      validate_file_size
      errors.empty?
    end

    private

    # Checks for required H1 title header and validates title length
    #
    # The length limit applies to the title text itself (the "# " marker is not counted)
    # and is only checked when the first line actually is an H1.
    def validate_required_sections
      first_line = content.lines.first
      unless first_line&.start_with?('# ')
        errors << 'Missing required H1 title (must start with "# ")'
        return
      end
      return unless first_line[2..].strip.length > MAX_TITLE_LENGTH

      errors << "Title is too long (max #{MAX_TITLE_LENGTH} characters)"
    end

    # Validates H1 uniqueness, description length, and section ordering
    #
    # The description is the blockquote on the second line; its "> " marker is not counted
    # towards the length limit.
    def validate_structure
      lines = content.lines
      errors << 'Multiple H1 headers found (only one allowed)' if lines.count { |line| line.start_with?('# ') } > 1

      description = lines[1]
      if description&.start_with?('> ') && description[2..].strip.length > MAX_DESCRIPTION_LENGTH
        errors << "Description blockquote is too long (max #{MAX_DESCRIPTION_LENGTH} characters)"
      end

      validate_section_order
    end

    # Verifies sections appear in correct order: Documentation, Examples, Optional
    #
    # Unknown H2 sections are ignored. Names are stripped so CRLF line endings do not hide
    # a known section.
    def validate_section_order
      highest_seen = -1
      content.scan(/^## (.+)$/).flatten.map(&:strip).each do |section|
        position = SECTION_ORDER.index(section)
        next unless position

        errors << "Section '#{section}' is out of order" if position < highest_seen
        highest_seen = position
      end
    end

    # Validates markdown syntax including links, lists, and headers
    #
    # Delegates to specialized validators for different markdown elements
    def validate_markdown_syntax
      validate_link_format
      validate_list_format
      validate_headers
    end

    # Checks markdown links for empty text/URLs and valid URL formats
    #
    # An empty URL is reported once as empty; it is not additionally reported as having an
    # invalid format.
    def validate_link_format
      content.scan(ANY_LINK) do |text, url|
        errors << 'Empty link text found' if text.empty?
        if url.empty?
          errors << 'Empty link URL found'
        elsif !url.match?(URL_PATTERN)
          errors << "Invalid URL format: #{url}"
        end
      end
    end

    # Validates list items match expected markdown link format
    #
    # Only list items that start with a link are checked; both `- [t](u)` and
    # `- [t](u): description` are accepted.
    def validate_list_format
      content.each_line.with_index(1) do |line, number|
        next unless line.match?(/^[-*]\s+\[/)
        next if line.match?(/^[-*]\s+\[.+\]\(.+\)(?::\s*.+)?$/)

        errors << "Invalid list item format at line #{number}"
      end
    end

    # Validates header levels and content
    #
    # Reports H1 headers without text and headers deeper than H2. Works line by line so an
    # empty "# " header cannot borrow the following line as its text.
    def validate_headers
      content.each_line do |line|
        header = HEADER_LINE.match(line)
        next unless header

        level = header[1].length
        if level == 1 && header[2].strip.empty?
          errors << 'Empty H1 header text'
        elsif level > 2
          errors << "Headers deeper than H2 not recommended (found H#{level})"
        end
      end
    end

    # Validates link security and format requirements
    #
    # Reports plain `http://` URLs and URLs containing spaces. Only the literal `http://`
    # scheme counts as non-HTTPS, so relative files such as `httpd.md` are not flagged.
    def validate_links
      content.scan(FILLED_LINK).each do |(_text, url)|
        errors << "Non-HTTPS URL found: #{url} (consider using HTTPS)" if url.start_with?('http://')
        errors << "URL contains spaces: #{url}" if url.include?(' ')
      end
    end

    # Checks file size and individual line lengths against limits
    #
    # Enforces the MAX_FILE_SIZE byte limit and the MAX_LINE_LENGTH character limit
    def validate_file_size
      errors << "File size exceeds maximum (#{MAX_FILE_SIZE} bytes)" if content.bytesize > MAX_FILE_SIZE

      content.each_line.with_index(1) do |line, number|
        next unless line.chomp.length > MAX_LINE_LENGTH

        errors << "Line #{number} exceeds maximum length (#{MAX_LINE_LENGTH} characters)"
      end
    end
  end
end

data/lib/llm_docs_builder/version.rb ADDED Viewed

@@ -0,0 +1,6 @@
+# frozen_string_literal: true
+module LlmDocsBuilder
+  # Version string of the LlmDocsBuilder gem (a String such as '0.3.0')
+  VERSION = '0.3.0'
+end

data/lib/llm_docs_builder.rb ADDED Viewed

@@ -0,0 +1,130 @@
+# frozen_string_literal: true
+require 'zeitwerk'
+require 'pathname'
+require 'find'
+loader = Zeitwerk::Loader.for_gem # autoloader for this gem's own files (Zeitwerk gem convention)
+loader.inflector.inflect('cli' => 'CLI') # load cli.rb as the constant CLI rather than the default Cli
+loader.setup # NOTE(review): presumably must run before the module below references its classes - confirm
module LlmDocsBuilder
  class << self
    # Generates llms.txt from existing markdown documentation
    #
    # @param docs_path [String, Hash, nil] path to documentation directory or file (optional if
    #   the config provides a :docs setting). When a Hash is passed as the first argument it
    #   is treated as the options, which supports calls such as
    #   `generate_from_docs(config_file: 'x.yml')` or `generate_from_docs(title: 'X')`.
    # @param options [Hash] generation options
    # @option options [String] :config_file path to YAML config file (auto-finds llm-docs-builder.yml)
    # @option options [String] :base_url base URL for converting relative links (overrides config)
    # @option options [String] :title project title (auto-detected if not provided, overrides
    #   config)
    # @option options [String] :description project description (auto-detected if not provided,
    #   overrides config)
    # @option options [String] :output output file path (default: 'llms.txt', overrides config)
    # @option options [Boolean] :convert_urls convert HTML URLs to markdown format (overrides
    #   config)
    # @option options [Boolean] :verbose enable verbose output (overrides config)
    # @return [String] generated llms.txt content
    #
    # @example Generate from docs directory
    #   LlmDocsBuilder.generate_from_docs('./docs')
    #
    # @example Generate using config file
    #   LlmDocsBuilder.generate_from_docs(config_file: 'llm-docs-builder.yml')
    #
    # @example Generate with config file and overrides
    #   LlmDocsBuilder.generate_from_docs('./docs',
    #     config_file: 'my-config.yml',
    #     title: 'Override Title'
    #   )
    def generate_from_docs(docs_path = nil, options = {})
      # Options-first usage: without keyword parameters Ruby binds a trailing
      # `key: value` list to the first positional parameter, so any Hash that arrives
      # here is really the options, whether or not it names a :config_file.
      if docs_path.is_a?(Hash)
        options = docs_path.merge(options)
        docs_path = nil
      end

      merged_options = resolve_options(options)
      # Explicit docs_path wins over the :docs setting from the merged options
      final_docs_path = docs_path || merged_options[:docs]
      Generator.new(final_docs_path, merged_options).generate
    end

    # Transforms a markdown file to be AI-friendly
    #
    # @param file_path [String] path to markdown file
    # @param options [Hash] transformation options
    # @option options [String] :config_file path to YAML config file (auto-finds llm-docs-builder.yml)
    # @option options [String] :base_url base URL for expanding relative links (overrides config)
    # @option options [Boolean] :convert_urls convert HTML URLs to markdown format (overrides
    #   config)
    # @option options [Boolean] :verbose enable verbose output (overrides config)
    # @return [String] transformed markdown content
    #
    # @example Transform with direct options
    #   LlmDocsBuilder.transform_markdown('README.md',
    #     base_url: 'https://myproject.io',
    #     convert_urls: true
    #   )
    #
    # @example Transform using config file
    #   LlmDocsBuilder.transform_markdown('README.md', config_file: 'llm-docs-builder.yml')
    def transform_markdown(file_path, options = {})
      MarkdownTransformer.new(file_path, resolve_options(options)).transform
    end

    # Bulk transforms multiple markdown files to be AI-friendly
    #
    # @param docs_path [String] path to documentation directory
    # @param options [Hash] transformation options
    # @option options [String] :config_file path to YAML config file (auto-finds llm-docs-builder.yml)
    # @option options [String] :base_url base URL for expanding relative links (overrides config)
    # @option options [Boolean] :convert_urls convert HTML URLs to markdown format (overrides
    #   config)
    # @option options [String] :suffix suffix for transformed files (default: '.llm', overrides
    #   config)
    # @option options [Array<String>] :excludes glob patterns for files to exclude (overrides
    #   config)
    # @option options [Boolean] :verbose enable verbose output (overrides config)
    # @return [Array<String>] paths of transformed files
    #
    # @example Bulk transform with direct options
    #   LlmDocsBuilder.bulk_transform('./docs',
    #     base_url: 'https://myproject.io',
    #     suffix: '.ai',
    #     excludes: ['**/private/**', 'draft-*.md']
    #   )
    #
    # @example Bulk transform using config file
    #   LlmDocsBuilder.bulk_transform('./docs', config_file: 'llm-docs-builder.yml')
    def bulk_transform(docs_path, options = {})
      BulkTransformer.new(docs_path, resolve_options(options)).transform_all
    end

    # Parses an existing llms.txt file
    #
    # @param file_path [String] path to the llms.txt file to parse
    # @return [ParsedContent] parsed llms.txt content
    def parse(file_path)
      Parser.new(file_path).parse
    end

    # Validates llms.txt content
    #
    # @param content [String] the llms.txt content to validate
    # @return [Boolean] true if content is valid, false otherwise
    def validate(content)
      Validator.new(content).valid?
    end

    private

    # Builds the effective options by handing the caller's options to the config object
    # created from options[:config_file]
    #
    # @param options [Hash] options passed by the caller
    # @return [Hash] result of Config#merge_with_options
    def resolve_options(options)
      Config.new(options[:config_file]).merge_with_options(options)
    end
  end
end