RubyGems - llm-docs-builder - Versions diffs - 0.6.0 → 0.8.0 - Mend

llm-docs-builder 0.6.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

checksums.yaml +4 -4
data/.rspec +3 -0
data/CHANGELOG.md +59 -0
data/Gemfile.lock +1 -1
data/README.md +241 -541
data/bin/rspecs +2 -1
data/lib/llm_docs_builder/cli.rb +1 -62
data/lib/llm_docs_builder/comparator.rb +4 -16
data/lib/llm_docs_builder/config.rb +74 -5
data/lib/llm_docs_builder/generator.rb +67 -8
data/lib/llm_docs_builder/markdown_transformer.rb +61 -126
data/lib/llm_docs_builder/output_formatter.rb +93 -0
data/lib/llm_docs_builder/parser.rb +1 -59
data/lib/llm_docs_builder/text_compressor.rb +164 -0
data/lib/llm_docs_builder/token_estimator.rb +52 -0
data/lib/llm_docs_builder/transformers/base_transformer.rb +30 -0
data/lib/llm_docs_builder/transformers/content_cleanup_transformer.rb +106 -0
data/lib/llm_docs_builder/transformers/enhancement_transformer.rb +95 -0
data/lib/llm_docs_builder/transformers/heading_transformer.rb +72 -0
data/lib/llm_docs_builder/transformers/link_transformer.rb +84 -0
data/lib/llm_docs_builder/transformers/whitespace_transformer.rb +44 -0
data/lib/llm_docs_builder/version.rb +1 -1
metadata +11 -3
data/CLAUDE.md +0 -178
data/llm-docs-builder.yml +0 -7

data/lib/llm_docs_builder/parser.rb CHANGED Viewed

@@ -108,14 +108,12 @@ module LlmDocsBuilder
   # Represents parsed llms.txt content with structured access to sections
   #
   # Provides convenient access to parsed llms.txt sections including title,
-  # description, and link collections. Can be converted to Hash or XML formats.
+  # description, and link collections.
   #
   # @example Access parsed content
   #   parsed.title              # => "My Project"
   #   parsed.description        # => "A description"
   #   parsed.documentation_links # => [{title: "...", url: "...", description: "..."}]
-  #   parsed.to_h               # => Hash representation
-  #   parsed.to_xml             # => XML string
   #
   # @api public
   class ParsedContent
@@ -163,61 +161,5 @@ module LlmDocsBuilder
     def optional_links
       sections[:optional] || []
     end
-    # Convert to hash representation
-    #
-    # @return [Hash] hash containing all parsed sections
-    def to_h
-      sections
-    end
-    # Convert to XML representation
-    #
-    # Generates an XML document with all parsed sections and links.
-    #
-    # @return [String] XML string representation
-    def to_xml
-      builder = []
-      builder << '<?xml version="1.0" encoding="UTF-8"?>'
-      builder << '<llms_context>'
-      builder << "  <title>#{title}</title>" if title
-      builder << "  <description>#{description}</description>" if description
-      add_xml_section(builder, 'documentation', documentation_links)
-      add_xml_section(builder, 'examples', example_links)
-      add_xml_section(builder, 'optional', optional_links) if sections[:optional]
-      builder << '</llms_context>'
-      builder.join("\n")
-    end
-    private
-    # Appends section XML elements to builder array
-    #
-    # Handles both array of link hashes and raw string content
-    #
-    # @param builder [Array<String>] XML lines accumulator
-    # @param name [String] section name
-    # @param links [Array<Hash>, String] section links or content
-    def add_xml_section(builder, name, links)
-      return if links.empty?
-      builder << "  <#{name}>"
-      if links.is_a?(Array)
-        links.each do |link|
-          builder << '    <link>'
-          builder << "      <title>#{link[:title]}</title>"
-          builder << "      <url>#{link[:url]}</url>"
-          builder << "      <description>#{link[:description]}</description>"
-          builder << '    </link>'
-        end
-      else
-        builder << "    #{links}"
-      end
-      builder << "  </#{name}>"
-    end
   end
 end

data/lib/llm_docs_builder/text_compressor.rb ADDED Viewed

@@ -0,0 +1,164 @@
+# frozen_string_literal: true
module LlmDocsBuilder
  # Advanced text compression techniques for reducing token count
  #
  # Provides opt-in, aggressive compression on top of basic markdown cleanup:
  # stopword removal and duplicate-paragraph removal. Fenced code blocks are
  # never split or rewritten by either method, and inline code is additionally
  # shielded from stopword removal.
  #
  # @example Basic usage
  #   compressor = LlmDocsBuilder::TextCompressor.new
  #   compressed = compressor.compress("Your text here", remove_stopwords: true)
  #
  # @api public
  class TextCompressor
    # Common English stopwords that can be safely removed from documentation
    # Excludes words that might be important in technical contexts (like "not", "no")
    STOPWORDS = %w[
      a an the this that these those
      is am are was were be being been
      have has had do does did
      will would shall should may might must can could
      i me my mine we us our ours
      you your yours
      he him his she her hers it its
      they them their theirs
      what which who whom whose where when why how
      all both each few more most other some such
      and or but if then else
      at by for from in into of on to with
      as so than
      very really quite
      there here
      about above across after against along among around because before behind below
      beneath beside besides between beyond during except inside near off since through
      throughout under until up upon within without
    ].freeze

    # Fenced code block: an opening ``` at line start through the next ``` at line start
    FENCED_CODE = /^```.*?^```/m
    # Inline code span delimited by single backticks
    INLINE_CODE = /`[^`]+`/
    # Placeholder tokens substituted for code while the prose is processed
    PLACEHOLDER = /___(?:CODE_BLOCK|INLINE_CODE)_\d+___/
    # Lines left untouched by stopword removal
    HEADING_LINE = /\A#+\s/
    LIST_ITEM_LINE = /\A[*-]\s/
    MARKDOWN_LINK = /\[[^\]]+\]\([^)]+\)/

    # @return [Hash] compression options
    attr_reader :options

    # Initialize a new text compressor
    #
    # @param options [Hash] compression options
    # @option options [Array<String>] :custom_stopwords additional stopwords to remove
    #   (matched case-insensitively; nil is treated as an empty list)
    # @option options [Boolean] :preserve_technical stored in {#options}; no method in
    #   this class reads it (code is always preserved)
    def initialize(options = {})
      @options = {
        preserve_technical: true,
        custom_stopwords: []
      }.merge(options)
    end

    # Compress text using configured methods
    #
    # @param content [String, nil] text to compress
    # @param methods [Hash] compression methods to apply
    # @option methods [Boolean] :remove_stopwords remove common filler words
    # @option methods [Boolean] :remove_duplicates remove duplicate paragraphs
    # @return [String, nil] compressed copy of the text (nil when content is nil)
    def compress(content, methods = {})
      return content if content.nil?

      result = content.dup
      result = remove_stopwords(result) if methods[:remove_stopwords]
      result = remove_duplicate_paragraphs(result) if methods[:remove_duplicates]
      result
    end

    # Remove stopwords from text while preserving technical content
    #
    # Lowercase stopwords are dropped from regular prose lines. Headings, `*`/`-`
    # list items, lines containing a markdown link, fenced code blocks, inline code
    # and capitalized words are left as they are. Line endings (including a trailing
    # newline) are preserved; whitespace around a removed word is not collapsed.
    #
    # WARNING: This is an aggressive optimization that may affect readability.
    # Use with caution and test results carefully.
    #
    # @param content [String] text to process
    # @return [String] text with stopwords removed
    def remove_stopwords(content)
      return content if content.nil? || content.empty?

      shielded, preserved = protect_code(content, inline: true)
      lookup = stopword_lookup

      # each_line keeps every line terminator, so trailing newlines survive the round trip
      stripped = shielded.each_line.map do |line|
        structural_line?(line) ? line : strip_stopwords(line, lookup)
      end.join

      restore_code(stripped, preserved)
    end

    # Remove duplicate paragraphs from text
    #
    # Paragraphs are separated by blank lines and compared after whitespace
    # squeezing and lowercasing; the first occurrence wins. Fenced code blocks are
    # treated as indivisible, so blank lines inside a block never split it.
    #
    # @param content [String] text to process
    # @return [String] text with duplicate paragraphs removed
    def remove_duplicate_paragraphs(content)
      return content if content.nil? || content.empty?

      shielded, preserved = protect_code(content)
      seen = {}

      kept = shielded.split(/\n\s*\n/).select do |paragraph|
        # Compare on the real text so identical code blocks still count as duplicates
        key = restore_code(paragraph, preserved).gsub(/\s+/, ' ').strip.downcase
        next false if key.empty? || seen.key?(key)

        seen[key] = true
      end

      restore_code(kept.join("\n\n"), preserved)
    end

    private

    # Swap code for opaque placeholder tokens
    #
    # @param content [String] markdown text
    # @param inline [Boolean] also shield inline code spans
    # @return [Array(String, Hash)] shielded text and placeholder => original code map
    def protect_code(content, inline: false)
      preserved = {}
      stash = lambda do |kind, snippet|
        token = "___#{kind}_#{preserved.size}___"
        preserved[token] = snippet
        token
      end

      shielded = content.gsub(FENCED_CODE) { |snippet| stash.call('CODE_BLOCK', snippet) }
      shielded = shielded.gsub(INLINE_CODE) { |snippet| stash.call('INLINE_CODE', snippet) } if inline
      [shielded, preserved]
    end

    # Put the original code back in a single pass
    #
    # The block form of gsub inserts the code verbatim; a replacement *string*
    # would interpret backslash sequences such as "\\1" found in the code.
    #
    # @param text [String] text containing placeholder tokens
    # @param preserved [Hash] placeholder => original code map
    # @return [String] text with code restored
    def restore_code(text, preserved)
      return text if preserved.empty?

      text.gsub(PLACEHOLDER) { |token| preserved.fetch(token, token) }
    end

    # Hash-backed lookup of built-in plus custom stopwords (all lowercase)
    #
    # @return [Hash{String => true}]
    def stopword_lookup
      custom = Array(options[:custom_stopwords]).map { |word| word.to_s.downcase }
      (STOPWORDS + custom).each_with_object({}) { |word, table| table[word] = true }
    end

    # Whether a line is a heading, list item or contains a markdown link
    #
    # @param line [String] a single line of text
    # @return [Boolean]
    def structural_line?(line)
      line.match?(HEADING_LINE) || line.match?(LIST_ITEM_LINE) || line.match?(MARKDOWN_LINK)
    end

    # Drop lowercase stopwords from one line; capitalized words are kept
    #
    # @param line [String] a single line of text
    # @param lookup [Hash{String => true}] stopword table
    # @return [String] line without stopwords
    def strip_stopwords(line, lookup)
      line.split(/\b/).reject do |token|
        lookup.key?(token.downcase) && !token.match?(/\A[A-Z]/)
      end.join
    end
  end
end

data/lib/llm_docs_builder/token_estimator.rb ADDED Viewed

@@ -0,0 +1,52 @@
+# frozen_string_literal: true
module LlmDocsBuilder
  # Estimates token count for text content using character-based approximation
  #
  # Provides token estimation without requiring external tokenizer dependencies.
  # Uses the common heuristic that ~4 characters equals 1 token for English text,
  # which works reasonably well for documentation and markdown content.
  #
  # @example Basic usage
  #   estimator = LlmDocsBuilder::TokenEstimator.new
  #   token_count = estimator.estimate("This is a sample text.")
  #
  # @example With custom characters per token
  #   estimator = LlmDocsBuilder::TokenEstimator.new(chars_per_token: 3.5)
  #   token_count = estimator.estimate(content)
  #
  # @api public
  class TokenEstimator
    # Default number of characters per token
    DEFAULT_CHARS_PER_TOKEN = 4.0

    # @return [Float] characters per token ratio
    attr_reader :chars_per_token

    # Initialize a new token estimator
    #
    # The ratio is validated here so that a zero, negative or non-numeric value
    # fails immediately instead of surfacing later as a FloatDomainError or a
    # negative token count inside {#estimate}.
    #
    # @param chars_per_token [Numeric, #to_f] number of characters per token (default: 4.0)
    # @raise [ArgumentError] if the ratio is not a positive, finite number
    def initialize(chars_per_token: DEFAULT_CHARS_PER_TOKEN)
      ratio = chars_per_token.respond_to?(:to_f) ? chars_per_token.to_f : Float::NAN
      unless ratio.finite? && ratio.positive?
        raise ArgumentError,
              "chars_per_token must be a positive, finite number (got #{chars_per_token.inspect})"
      end

      @chars_per_token = ratio
    end

    # Estimate token count for given content
    #
    # @param content [String, nil] text content to estimate tokens for
    # @return [Integer] estimated number of tokens (0 for nil or empty content)
    def estimate(content)
      return 0 if content.nil? || content.empty?

      (content.length / chars_per_token).round
    end

    # Estimate token count (class method for convenience)
    #
    # @param content [String, nil] text content to estimate tokens for
    # @param chars_per_token [Numeric] number of characters per token (default: 4.0)
    # @return [Integer] estimated number of tokens
    # @raise [ArgumentError] if chars_per_token is not a positive, finite number
    def self.estimate(content, chars_per_token: DEFAULT_CHARS_PER_TOKEN)
      new(chars_per_token: chars_per_token).estimate(content)
    end
  end
end

data/lib/llm_docs_builder/transformers/base_transformer.rb ADDED Viewed

@@ -0,0 +1,30 @@
+# frozen_string_literal: true
module LlmDocsBuilder
  module Transformers
    # Shared contract mixed into every transformer
    #
    # Including classes are expected to override `transform`; the default
    # implementation only signals that the override is missing.
    #
    # @api public
    module BaseTransformer
      # Produce the transformed version of the given content
      #
      # @param content [String] content to transform
      # @param options [Hash] transformation options
      # @return [String] transformed content
      # @raise [NotImplementedError] always, unless overridden by the including class
      def transform(content, options = {})
        message = "#{self.class} must implement #transform"
        raise(NotImplementedError, message)
      end

      # Tell whether the transformation applies for the given options
      #
      # The default answer is unconditionally true.
      #
      # @param options [Hash] transformation options
      # @return [Boolean] true if transformation should be applied
      def should_transform?(options)
        true
      end
    end
  end
end

data/lib/llm_docs_builder/transformers/content_cleanup_transformer.rb ADDED Viewed

@@ -0,0 +1,106 @@
+# frozen_string_literal: true
module LlmDocsBuilder
  module Transformers
    # Transformer for content cleanup operations
    #
    # Handles removal of various markdown elements that don't provide
    # value for LLM consumption (frontmatter, comments, badges, etc.).
    #
    # @api public
    class ContentCleanupTransformer
      include BaseTransformer

      # Cleanup steps in the order they are applied. Each symbol is both the
      # option key that enables the step and the private method implementing it.
      CLEANUP_STEPS = %i[
        remove_frontmatter
        remove_comments
        remove_badges
        remove_code_examples
        remove_images
        remove_blockquotes
      ].freeze

      # Transform content by removing unwanted elements
      #
      # @param content [String] markdown content
      # @param options [Hash] transformation options
      # @option options [Boolean] :remove_frontmatter remove YAML/TOML frontmatter
      # @option options [Boolean] :remove_comments remove HTML comments
      # @option options [Boolean] :remove_badges remove badge images
      # @option options [Boolean] :remove_code_examples remove code blocks
      # @option options [Boolean] :remove_images remove image syntax
      # @option options [Boolean] :remove_blockquotes remove blockquote formatting
      # @return [String] transformed content (the input string is not mutated)
      def transform(content, options = {})
        CLEANUP_STEPS.reduce(content.dup) do |text, step|
          options[step] ? send(step, text) : text
        end
      end

      private

      # Remove YAML or TOML frontmatter
      #
      # Only a block that starts on the very first line of the document is removed.
      #
      # @param content [String] markdown content
      # @return [String] content without frontmatter
      def remove_frontmatter(content)
        content
          .sub(/\A---\s*$.*?^---\s*$/m, '')
          .sub(/\A\+\+\+\s*$.*?^\+\+\+\s*$/m, '')
      end

      # Remove HTML comments
      #
      # @param content [String] markdown content
      # @return [String] content without comments
      def remove_comments(content)
        content.gsub(/<!--.*?-->/m, '')
      end

      # Remove badge images
      #
      # An image counts as a badge when its URL contains one of the keywords
      # badge, shield, svg, travis, coveralls or fury (case-insensitive).
      #
      # @param content [String] markdown content
      # @return [String] content without badges
      def remove_badges(content)
        content
          # Linked badges first, so the surrounding link markup goes too
          .gsub(/\[!\[([^\]]*)\]\([^)]*(?:badge|shield|svg|travis|coveralls|fury)[^)]*\)\]\([^)]*\)/i, '')
          # Then standalone badges
          .gsub(/!\[([^\]]*)\]\([^)]*(?:badge|shield|svg|travis|coveralls|fury)[^)]*\)/i, '')
      end

      # Remove code blocks and inline code
      #
      # @param content [String] markdown content
      # @return [String] content without code
      def remove_code_examples(content)
        content
          # Fenced blocks (``` and ~~~); /m lets the lazy match span lines
          .gsub(/^```.*?^```/m, '')
          .gsub(/^~~~.*?^~~~/m, '')
          # Indented code: deliberately NOT multiline, so `.+` stops at the end of
          # the indented line instead of swallowing the rest of the document
          .gsub(/^(?: {4,}|\t).+$/, '')
          # Inline code spans
          .gsub(/`[^`]+`/, '')
      end

      # Remove image syntax
      #
      # @param content [String] markdown content
      # @return [String] content without images
      def remove_images(content)
        content
          .gsub(/!\[([^\]]*)\]\([^)]+\)/, '')   # inline images
          .gsub(/!\[([^\]]*)\]\[[^\]]+\]/, '')  # reference-style images
      end

      # Remove blockquote formatting
      #
      # Strips every leading `>` marker (nested quotes included) together with one
      # optional space or tab after each. Newlines are never consumed, so an empty
      # quote line stays as a blank line and paragraph breaks survive.
      #
      # @param content [String] markdown content
      # @return [String] content without blockquote markers
      def remove_blockquotes(content)
        content.gsub(/^(?:>[ \t]?)+/, '')
      end
    end
  end
end

data/lib/llm_docs_builder/transformers/enhancement_transformer.rb ADDED Viewed

@@ -0,0 +1,95 @@
+# frozen_string_literal: true
module LlmDocsBuilder
  module Transformers
    # Transformer for document enhancements
    #
    # Adds helpful features like table of contents and custom instructions
    # to improve LLM navigation and context understanding.
    #
    # @api public
    class EnhancementTransformer
      include BaseTransformer

      # ATX heading on a single line: 1-6 hashes, blanks, then the title
      HEADING_LINE = /\A([#]{1,6})[ \t]+(.+)$/
      # Opening or closing line of a fenced code block
      FENCE_LINE = /\A {0,3}(```|~~~)/

      # Transform content by adding enhancements
      #
      # The custom instruction is injected first, then the table of contents, so
      # the TOC ends up between the title and the instruction.
      #
      # @param content [String] markdown content
      # @param options [Hash] transformation options
      # @option options [Boolean] :generate_toc generate table of contents
      # @option options [String] :custom_instruction custom instruction text
      # @option options [Boolean] :remove_blockquotes whether blockquotes are being removed
      # @return [String] transformed content
      def transform(content, options = {})
        result = content.dup
        instruction = options[:custom_instruction]
        result = inject_custom_instruction(result, instruction, options[:remove_blockquotes]) if instruction
        result = generate_table_of_contents(result) if options[:generate_toc]
        result
      end

      private

      # Generate table of contents from headings
      #
      # Headings inside fenced code blocks are ignored. A leading H1 is treated as
      # the document title and left out of the list. When nothing remains to list,
      # the content is returned unchanged.
      #
      # @param content [String] markdown content
      # @return [String] content with TOC placed after the first H1, or prepended
      #   when the document has no H1
      def generate_table_of_contents(content)
        listed = collect_headings(content.lines)
        listed.shift if listed.first && listed.first[:level] == 1
        return content if listed.empty?

        entries = listed.map do |heading|
          indent = '  ' * (heading[:level] - 1)
          "#{indent}- [#{heading[:title]}](##{heading[:anchor]})"
        end

        toc_lines = ["## Table of Contents\n", *entries, "\n---\n"]
        toc_block = "#{toc_lines.join("\n")}\n"

        insert_after_title(content, toc_block) || "#{toc_block}\n#{content}"
      end

      # Inject custom instruction at document top
      #
      # @param content [String] markdown content
      # @param instruction [String] instruction text
      # @param remove_blockquotes [Boolean] whether to avoid blockquote formatting
      # @return [String] content with the instruction placed after the first H1, or
      #   prepended when the document has no H1
      def inject_custom_instruction(content, instruction, remove_blockquotes = false)
        return content if instruction.nil? || instruction.empty?

        quote_marker = remove_blockquotes ? '' : '> '
        formatted_instruction = "#{quote_marker}**AI Context**: #{instruction}\n\n---\n\n"

        insert_after_title(content, formatted_instruction) || "#{formatted_instruction}#{content}"
      end

      # Collect ATX headings that are not inside a fenced code block
      #
      # @param lines [Array<String>] document lines, terminators included
      # @return [Array<Hash>] hashes with :level, :title, :anchor and :line (index)
      def collect_headings(lines)
        found = []
        open_fence = nil

        lines.each_with_index do |line, index|
          marker = line[FENCE_LINE, 1]
          if marker
            # A fence is closed only by the same marker type that opened it
            if open_fence.nil?
              open_fence = marker
            elsif open_fence == marker
              open_fence = nil
            end
            next
          end
          next if open_fence

          match = HEADING_LINE.match(line)
          next unless match

          title = match[2].strip
          next if title.empty?

          found << { level: match[1].length, title: title, anchor: anchor_for(title), line: index }
        end

        found
      end

      # Build a link anchor from a heading title
      #
      # @param title [String] heading text
      # @return [String] lowercase title without punctuation, whitespace runs as hyphens
      def anchor_for(title)
        title.downcase.gsub(/[^\w\s-]/, '').gsub(/\s+/, '-')
      end

      # Insert a text block after the first H1 line, separated by a blank line
      #
      # Works on lines rather than a regex replacement string, so backslashes in
      # the inserted text are kept verbatim and an H1 on the final line (without a
      # trailing newline) is still handled.
      #
      # @param content [String] markdown content
      # @param block [String] text to insert
      # @return [String, nil] new content, or nil when the document has no H1
      def insert_after_title(content, block)
        lines = content.lines
        title = collect_headings(lines).find { |heading| heading[:level] == 1 }
        return nil unless title

        index = title[:line]
        title_line = lines[index]
        title_line = "#{title_line}\n" unless title_line.end_with?("\n")

        [*lines.take(index), title_line, "\n", block, *lines.drop(index + 1)].join
      end
    end
  end
end

data/lib/llm_docs_builder/transformers/heading_transformer.rb ADDED Viewed

@@ -0,0 +1,72 @@
+# frozen_string_literal: true
module LlmDocsBuilder
  module Transformers
    # Normalizes headings to include hierarchical context
    #
    # Transforms markdown headings to include parent context, making each section
    # self-contained for RAG systems. This is particularly useful when documents
    # are chunked and retrieved independently.
    #
    # @example Basic heading normalization
    #   # Configuration
    #   ## Consumer Settings
    #   ### auto_offset_reset
    #
    #   Becomes:
    #   # Configuration
    #   ## Configuration / Consumer Settings
    #   ### Configuration / Consumer Settings / auto_offset_reset
    #
    # @api public
    class HeadingTransformer
      include BaseTransformer

      # ATX heading on a single line: 1-6 hashes, blanks, then the title
      HEADING_LINE = /\A([#]{1,6})[ \t]+(.+)$/
      # Opening or closing line of a fenced code block
      FENCE_LINE = /\A {0,3}(```|~~~)/

      # Transform content by normalizing heading hierarchy
      #
      # Parses markdown headings and adds parent context to each heading,
      # making sections self-documenting when retrieved independently.
      # Lines inside fenced code blocks are never treated as headings, and each
      # rewritten heading keeps the line ending of the original line.
      #
      # @param content [String] markdown content to transform
      # @param options [Hash] transformation options
      # @option options [Boolean] :normalize_headings enable heading normalization
      # @option options [String] :heading_separator separator between heading levels (default: ' / ')
      # @return [String] transformed content with normalized headings
      def transform(content, options = {})
        return content unless options[:normalize_headings]

        separator = options[:heading_separator] || ' / '
        trail = []        # trail[n] holds the current title at level n + 1 (nil if skipped)
        open_fence = nil

        content.lines.map do |line|
          marker = line[FENCE_LINE, 1]
          if marker
            open_fence = toggle_fence(open_fence, marker)
            next line
          end
          next line if open_fence

          match = HEADING_LINE.match(line)
          next line unless match

          level = match[1].length
          # Slot the title by level so that, after a skipped level (H1 -> H3),
          # a following sibling H3 replaces the previous one instead of nesting
          trail = trail.first(level - 1)
          trail[level - 1] = match[2].strip

          # H1 stays as-is (top level)
          next line if level == 1

          # H2+ gets parent context; keep the original terminator (or none)
          "#{'#' * level} #{trail.compact.join(separator)}#{line[/\r?\n\z/]}"
        end.join
      end

      private

      # Compute fence state after seeing a fence marker line
      #
      # @param current [String, nil] marker of the currently open fence, if any
      # @param marker [String] marker found on this line ("```" or "~~~")
      # @return [String, nil] marker of the open fence afterwards, nil when closed
      def toggle_fence(current, marker)
        return marker if current.nil?

        current == marker ? nil : current
      end
    end
  end
end

data/lib/llm_docs_builder/transformers/link_transformer.rb ADDED Viewed

@@ -0,0 +1,84 @@
+# frozen_string_literal: true
module LlmDocsBuilder
  module Transformers
    # Transformer for link-related operations
    #
    # Handles expansion of relative links to absolute URLs,
    # conversion of HTML URLs to markdown format, and
    # simplification of verbose link text.
    #
    # @api public
    class LinkTransformer
      include BaseTransformer

      # Inline markdown link: [text](url)
      MARKDOWN_LINK = /\[([^\]]+)\]\(([^)]+)\)/
      # Targets that must not be prefixed with the base URL: anything with a URI
      # scheme (http:, https:, mailto:, tel:, ftp:, ...), protocol-relative URLs
      # and in-page anchors
      NON_RELATIVE_TARGET = %r{\A(?:[a-z][a-z0-9+.-]*:|//|#)}i

      # Transform links in content
      #
      # @param content [String] markdown content
      # @param options [Hash] transformation options
      # @option options [String] :base_url base URL for expanding relative links
      # @option options [Boolean] :convert_urls convert HTML URLs to markdown format
      # @option options [Boolean] :simplify_links strip filler phrases from link text
      # @return [String] transformed content
      def transform(content, options = {})
        result = content.dup
        result = expand_relative_links(result, options[:base_url]) if options[:base_url]
        result = convert_html_urls(result) if options[:convert_urls]
        result = simplify_links(result) if options[:simplify_links]
        result
      end

      private

      # Expand relative links to absolute URLs
      #
      # Links whose target already has a scheme, is protocol-relative or is an
      # in-page anchor are left untouched. An empty base URL disables expansion.
      #
      # @param content [String] markdown content
      # @param base_url [String] base URL for expansion
      # @return [String] content with expanded links
      def expand_relative_links(content, base_url)
        return content if base_url.to_s.empty?

        content.gsub(MARKDOWN_LINK) do |link|
          text = ::Regexp.last_match(1)
          url = ::Regexp.last_match(2)
          next link if url.match?(NON_RELATIVE_TARGET)

          relative_path = url.sub(%r{\A\./}, '')
          "[#{text}](#{File.join(base_url, relative_path)})"
        end
      end

      # Convert HTML URLs to markdown format
      #
      # Rewrites absolute http(s) URLs ending in .html/.htm to end in .md when the
      # URL is followed by `)`, whitespace or the end of a line.
      #
      # @param content [String] markdown content
      # @return [String] content with converted URLs
      def convert_html_urls(content)
        content.gsub(%r{https?://[^\s<>]+\.html?(?=[)\s]|$)}) do |url|
          url.sub(/\.html?\z/, '.md')
        end
      end

      # Simplify verbose link text
      #
      # Drops leading call-to-action phrases ("click here to", "see", ...) and
      # trailing "here"/"documentation"/"docs". The original text is kept when
      # simplification would leave nothing.
      #
      # @param content [String] markdown content
      # @return [String] content with simplified links
      def simplify_links(content)
        content.gsub(MARKDOWN_LINK) do
          text = ::Regexp.last_match(1)
          url = ::Regexp.last_match(2)

          simplified_text = text
                            .gsub(/^(click here to|see|read more about|check out|visit)\s+(the\s+)?/i, '')
                            .gsub(/\s+(here|documentation|docs)$/i, '')
                            .strip
          simplified_text = text if simplified_text.empty?

          "[#{simplified_text}](#{url})"
        end
      end
    end
  end
end