RubyGems - llm-docs-builder - Versions diffs - 0.3.0 → 0.7.0 - Mend

llm-docs-builder 0.3.0 → 0.7.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

checksums.yaml +4 -4
data/.github/workflows/docker.yml +6 -6
data/.rspec +3 -0
data/CHANGELOG.md +38 -1
data/Gemfile.lock +1 -1
data/README.md +190 -519
data/bin/rspecs +2 -1
data/lib/llm_docs_builder/cli.rb +1 -50
data/lib/llm_docs_builder/comparator.rb +30 -7
data/lib/llm_docs_builder/config.rb +58 -1
data/lib/llm_docs_builder/markdown_transformer.rb +65 -36
data/lib/llm_docs_builder/output_formatter.rb +93 -0
data/lib/llm_docs_builder/parser.rb +1 -59
data/lib/llm_docs_builder/text_compressor.rb +164 -0
data/lib/llm_docs_builder/token_estimator.rb +52 -0
data/lib/llm_docs_builder/transformers/base_transformer.rb +30 -0
data/lib/llm_docs_builder/transformers/content_cleanup_transformer.rb +106 -0
data/lib/llm_docs_builder/transformers/enhancement_transformer.rb +95 -0
data/lib/llm_docs_builder/transformers/link_transformer.rb +84 -0
data/lib/llm_docs_builder/transformers/whitespace_transformer.rb +44 -0
data/lib/llm_docs_builder/version.rb +1 -1
metadata +10 -3
data/CLAUDE.md +0 -178
data/llm-docs-builder.yml +0 -7

data/bin/rspecs CHANGED

@@ -4,4 +4,5 @@
 set -e
 echo "Running all tests..."
-bundle exec rspec --format documentation
+# Explicitly specify the spec directory to ensure all tests are discovered
+bundle exec rspec spec/ --format documentation

data/lib/llm_docs_builder/cli.rb CHANGED

@@ -295,8 +295,6 @@ module LlmDocsBuilder
         puts "Documentation Links: #{parsed.documentation_links.size}"
         puts "Example Links: #{parsed.example_links.size}" if parsed.respond_to?(:example_links)
         puts "Optional Links: #{parsed.optional_links.size}" if parsed.respond_to?(:optional_links)
-      elsif parsed.respond_to?(:to_xml)
-        puts parsed.to_xml
       end
     end
@@ -335,60 +333,13 @@ module LlmDocsBuilder
       begin
         result = comparator.compare
-        display_comparison_results(result)
+        OutputFormatter.display_comparison_results(result)
       rescue LlmDocsBuilder::Errors::BaseError => e
         puts "Error during comparison: #{e.message}"
         exit 1
       end
     end
-    # Display formatted comparison results
-    #
-    # @param result [Hash] comparison results from Comparator
-    def display_comparison_results(result)
-      puts ''
-      puts '=' * 60
-      puts 'Context Window Comparison'
-      puts '=' * 60
-      puts ''
-      puts "Human version:  #{format_bytes(result[:human_size])}"
-      puts "  Source: #{result[:human_source]}"
-      puts ''
-      puts "AI version:     #{format_bytes(result[:ai_size])}"
-      puts "  Source: #{result[:ai_source]}"
-      puts ''
-      puts '-' * 60
-      if result[:reduction_bytes].positive?
-        puts "Reduction:      #{format_bytes(result[:reduction_bytes])} (#{result[:reduction_percent]}%)"
-        puts "Factor:         #{result[:factor]}x smaller"
-      elsif result[:reduction_bytes].negative?
-        increase_bytes = result[:reduction_bytes].abs
-        increase_percent = result[:reduction_percent].abs
-        puts "Increase:       #{format_bytes(increase_bytes)} (#{increase_percent}%)"
-        puts "Factor:         #{result[:factor]}x larger"
-      else
-        puts 'Same size'
-      end
-      puts '=' * 60
-      puts ''
-    end
-    # Format bytes into human-readable string
-    #
-    # @param bytes [Integer] number of bytes
-    # @return [String] formatted string with units
-    def format_bytes(bytes)
-      if bytes < 1024
-        "#{bytes} bytes"
-      elsif bytes < 1024 * 1024
-        "#{(bytes / 1024.0).round(1)} KB"
-      else
-        "#{(bytes / (1024.0 * 1024)).round(2)} MB"
-      end
-    end
     # Validate llms.txt file format
     #
     # Checks if llms.txt file follows proper format with title, description, and documentation links.

data/lib/llm_docs_builder/comparator.rb CHANGED

@@ -62,6 +62,10 @@ module LlmDocsBuilder
     #   - :reduction_bytes [Integer] bytes saved
     #   - :reduction_percent [Integer] percentage reduction
     #   - :factor [Float] compression factor
+    #   - :human_tokens [Integer] estimated tokens in human version
+    #   - :ai_tokens [Integer] estimated tokens in AI version
+    #   - :token_reduction [Integer] estimated tokens saved
+    #   - :token_reduction_percent [Integer] percentage of tokens saved
     #   - :human_source [String] source description (URL or file)
     #   - :ai_source [String] source description (URL or file)
     def compare
@@ -85,8 +89,8 @@ module LlmDocsBuilder
       ai_content = fetch_url(url, options[:ai_user_agent])
       calculate_results(
-        human_content.bytesize,
-        ai_content.bytesize,
+        human_content,
+        ai_content,
         "#{url} (User-Agent: human)",
         "#{url} (User-Agent: AI)"
       )
@@ -112,8 +116,8 @@ module LlmDocsBuilder
       ai_content = File.read(local_file)
       calculate_results(
-        human_content.bytesize,
-        ai_content.bytesize,
+        human_content,
+        ai_content,
         url,
         local_file
       )
@@ -205,12 +209,15 @@ module LlmDocsBuilder
     # Calculate comparison statistics
     #
-    # @param human_size [Integer] size of human version in bytes
-    # @param ai_size [Integer] size of AI version in bytes
+    # @param human_content [String] content of human version
+    # @param ai_content [String] content of AI version
     # @param human_source [String] description of human source
     # @param ai_source [String] description of AI source
     # @return [Hash] comparison results
-    def calculate_results(human_size, ai_size, human_source, ai_source)
+    def calculate_results(human_content, ai_content, human_source, ai_source)
+      human_size = human_content.bytesize
+      ai_size = ai_content.bytesize
       reduction_bytes = human_size - ai_size
       reduction_percent = if human_size.positive?
                             ((reduction_bytes.to_f / human_size) * 100).round
@@ -224,15 +231,31 @@ module LlmDocsBuilder
                  Float::INFINITY
                end
+      # Estimate tokens using TokenEstimator
+      estimator = TokenEstimator.new
+      human_tokens = estimator.estimate(human_content)
+      ai_tokens = estimator.estimate(ai_content)
+      token_reduction = human_tokens - ai_tokens
+      token_reduction_percent = if human_tokens.positive?
+                                  ((token_reduction.to_f / human_tokens) * 100).round
+                                else
+                                  0
+                                end
       {
         human_size: human_size,
         ai_size: ai_size,
         reduction_bytes: reduction_bytes,
         reduction_percent: reduction_percent,
         factor: factor,
+        human_tokens: human_tokens,
+        ai_tokens: ai_tokens,
+        token_reduction: token_reduction,
+        token_reduction_percent: token_reduction_percent,
         human_source: human_source,
         ai_source: ai_source
       }
     end
   end
 end

data/lib/llm_docs_builder/config.rb CHANGED

@@ -67,11 +67,68 @@ module LlmDocsBuilder
                       else
                         self['convert_urls'] || false
                       end,
+        remove_comments: if options.key?(:remove_comments)
+                           options[:remove_comments]
+                         else
+                           self['remove_comments'] || true
+                         end,
+        normalize_whitespace: if options.key?(:normalize_whitespace)
+                                options[:normalize_whitespace]
+                              else
+                                self['normalize_whitespace'] || true
+                              end,
+        remove_badges: if options.key?(:remove_badges)
+                         options[:remove_badges]
+                       else
+                         self['remove_badges'] || true
+                       end,
+        remove_frontmatter: if options.key?(:remove_frontmatter)
+                              options[:remove_frontmatter]
+                            else
+                              self['remove_frontmatter'] || true
+                            end,
         verbose: options.key?(:verbose) ? options[:verbose] : (self['verbose'] || false),
         # Bulk transformation options
         suffix: options[:suffix] || self['suffix'] || '.llm',
         excludes: options[:excludes] || self['excludes'] || [],
-        bulk: options.key?(:bulk) ? options[:bulk] : (self['bulk'] || false)
+        bulk: options.key?(:bulk) ? options[:bulk] : (self['bulk'] || false),
+        # New compression options
+        remove_code_examples: if options.key?(:remove_code_examples)
+                                options[:remove_code_examples]
+                              else
+                                self['remove_code_examples'] || false
+                              end,
+        remove_images: if options.key?(:remove_images)
+                         options[:remove_images]
+                       else
+                         self['remove_images'] || false
+                       end,
+        simplify_links: if options.key?(:simplify_links)
+                          options[:simplify_links]
+                        else
+                          self['simplify_links'] || false
+                        end,
+        remove_blockquotes: if options.key?(:remove_blockquotes)
+                              options[:remove_blockquotes]
+                            else
+                              self['remove_blockquotes'] || false
+                            end,
+        generate_toc: if options.key?(:generate_toc)
+                        options[:generate_toc]
+                      else
+                        self['generate_toc'] || false
+                      end,
+        custom_instruction: options[:custom_instruction] || self['custom_instruction'],
+        remove_stopwords: if options.key?(:remove_stopwords)
+                            options[:remove_stopwords]
+                          else
+                            self['remove_stopwords'] || false
+                          end,
+        remove_duplicates: if options.key?(:remove_duplicates)
+                             options[:remove_duplicates]
+                           else
+                             self['remove_duplicates'] || false
+                           end
       }
     end

data/lib/llm_docs_builder/markdown_transformer.rb CHANGED

@@ -3,9 +3,8 @@
 module LlmDocsBuilder
   # Transforms markdown files to be AI-friendly
   #
-  # Processes individual markdown files to make them more suitable for LLM consumption by
-  # expanding relative links to absolute URLs and converting HTML URLs to markdown-friendly
-  # formats.
+  # Orchestrates a pipeline of specialized transformers to process markdown content.
+  # Each transformer is responsible for a specific aspect of the transformation.
   #
   # @example Transform with base URL
   #   transformer = LlmDocsBuilder::MarkdownTransformer.new('README.md',
@@ -27,64 +26,94 @@ module LlmDocsBuilder
     # @param options [Hash] transformation options
     # @option options [String] :base_url base URL for expanding relative links
     # @option options [Boolean] :convert_urls convert HTML URLs to markdown format
+    # @option options [Boolean] :remove_comments remove HTML comments from markdown
+    # @option options [Boolean] :normalize_whitespace normalize excessive whitespace
+    # @option options [Boolean] :remove_badges remove badge/shield images
+    # @option options [Boolean] :remove_frontmatter remove YAML/TOML frontmatter
+    # @option options [Boolean] :remove_code_examples remove code blocks and inline code
+    # @option options [Boolean] :remove_images remove image syntax
+    # @option options [Boolean] :simplify_links shorten verbose link text
+    # @option options [Boolean] :remove_blockquotes remove blockquote formatting
+    # @option options [Boolean] :generate_toc generate table of contents at the top
+    # @option options [String] :custom_instruction custom instruction text to inject at top
+    # @option options [Boolean] :remove_stopwords remove common stopwords (aggressive)
+    # @option options [Boolean] :remove_duplicates remove duplicate paragraphs
     def initialize(file_path, options = {})
       @file_path = file_path
       @options = options
     end
-    # Transform markdown content to be AI-friendly
+    # Transform markdown content using a pipeline of transformers
     #
-    # Applies transformations to make the markdown more suitable for LLM processing:
-    # - Expands relative links to absolute URLs (if base_url provided)
-    # - Converts HTML URLs to markdown format (if convert_urls enabled)
+    # Processes content through specialized transformers in order:
+    # 1. ContentCleanupTransformer - Removes unwanted elements
+    # 2. LinkTransformer - Processes links
+    # 3. TextCompressor - Advanced compression (if enabled)
+    # 4. EnhancementTransformer - Adds TOC and instructions
+    # 5. WhitespaceTransformer - Normalizes whitespace
     #
     # @return [String] transformed markdown content
     def transform
       content = File.read(file_path)
-      content = expand_relative_links(content) if options[:base_url]
-      content = convert_html_urls(content) if options[:convert_urls]
+      # Build and execute transformation pipeline
+      content = cleanup_transformer.transform(content, options)
+      content = link_transformer.transform(content, options)
+      content = compress_content(content) if should_compress?
+      content = enhancement_transformer.transform(content, options)
+      content = whitespace_transformer.transform(content, options)
       content
     end
     private
-    # Expand relative links to absolute URLs
+    # Get content cleanup transformer instance
     #
-    # Converts markdown links like `[text](./path.md)` to `[text](https://base.url/path.md)`.
-    # Leaves absolute URLs and anchors unchanged.
+    # @return [Transformers::ContentCleanupTransformer]
+    def cleanup_transformer
+      @cleanup_transformer ||= Transformers::ContentCleanupTransformer.new
+    end
+    # Get link transformer instance
     #
-    # @param content [String] markdown content to process
-    # @return [String] content with expanded links
-    def expand_relative_links(content)
-      base_url = options[:base_url]
+    # @return [Transformers::LinkTransformer]
+    def link_transformer
+      @link_transformer ||= Transformers::LinkTransformer.new
+    end
-      content.gsub(/\[([^\]]+)\]\(([^)]+)\)/) do |match|
-        text = ::Regexp.last_match(1)
-        url = ::Regexp.last_match(2)
+    # Get enhancement transformer instance
+    #
+    # @return [Transformers::EnhancementTransformer]
+    def enhancement_transformer
+      @enhancement_transformer ||= Transformers::EnhancementTransformer.new
+    end
-        if url.start_with?('http://', 'https://', '//', '#')
-          match # Already absolute or anchor
-        else
-          # Clean up relative path
-          clean_url = url.gsub(%r{^\./}, '') # Remove leading './'
-          expanded_url = File.join(base_url, clean_url)
-          "[#{text}](#{expanded_url})"
-        end
-      end
+    # Get whitespace transformer instance
+    #
+    # @return [Transformers::WhitespaceTransformer]
+    def whitespace_transformer
+      @whitespace_transformer ||= Transformers::WhitespaceTransformer.new
     end
-    # Convert HTML URLs to markdown-friendly format
+    # Check if content compression should be applied
     #
-    # Changes URLs ending in .html or .htm to .md for better LLM understanding
+    # @return [Boolean]
+    def should_compress?
+      options[:remove_stopwords] || options[:remove_duplicates]
+    end
+    # Compress content using TextCompressor
     #
-    # @param content [String] markdown content to process
-    # @return [String] content with converted URLs
-    def convert_html_urls(content)
-      content.gsub(%r{https?://[^\s<>]+\.html?(?=[)\s]|$)}) do |url|
-        url.sub(/\.html?$/, '.md')
-      end
+    # @param content [String] content to compress
+    # @return [String] compressed content
+    def compress_content(content)
+      compressor = TextCompressor.new
+      compression_methods = {
+        remove_stopwords: options[:remove_stopwords],
+        remove_duplicates: options[:remove_duplicates]
+      }
+      compressor.compress(content, compression_methods)
     end
   end
 end

data/lib/llm_docs_builder/output_formatter.rb ADDED

@@ -0,0 +1,93 @@
+# frozen_string_literal: true
+module LlmDocsBuilder
+  # Formats output for CLI display
+  #
+  # Provides formatting utilities for displaying comparison results,
+  # byte sizes, and numbers in a user-friendly way.
+  #
+  # @api private
+  class OutputFormatter
+    # Format bytes into human-readable string
+    #
+    # @param bytes [Integer] number of bytes
+    # @return [String] formatted string with units (bytes/KB/MB)
+    #
+    # @example
+    #   OutputFormatter.format_bytes(1024)      #=> "1.0 KB"
+    #   OutputFormatter.format_bytes(1048576)   #=> "1.0 MB"
+    def self.format_bytes(bytes)
+      if bytes < 1024
+        "#{bytes} bytes"
+      elsif bytes < 1024 * 1024
+        "#{(bytes / 1024.0).round(1)} KB"
+      else
+        "#{(bytes / (1024.0 * 1024)).round(2)} MB"
+      end
+    end
+    # Format number with comma separators for readability
+    #
+    # @param number [Integer] number to format
+    # @return [String] formatted number with commas
+    #
+    # @example
+    #   OutputFormatter.format_number(1234567)  #=> "1,234,567"
+    def self.format_number(number)
+      number.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
+    end
+    # Display formatted comparison results
+    #
+    # @param result [Hash] comparison results from Comparator
+    def self.display_comparison_results(result)
+      puts ''
+      puts '=' * 60
+      puts 'Context Window Comparison'
+      puts '=' * 60
+      puts ''
+      puts "Human version:  #{format_bytes(result[:human_size])} (~#{format_number(result[:human_tokens])} tokens)"
+      puts "  Source: #{result[:human_source]}"
+      puts ''
+      puts "AI version:     #{format_bytes(result[:ai_size])} (~#{format_number(result[:ai_tokens])} tokens)"
+      puts "  Source: #{result[:ai_source]}"
+      puts ''
+      puts '-' * 60
+      if result[:reduction_bytes].positive?
+        display_reduction(result)
+      elsif result[:reduction_bytes].negative?
+        display_increase(result)
+      else
+        puts 'Same size'
+      end
+      puts '=' * 60
+      puts ''
+    end
+    # Display reduction statistics
+    #
+    # @param result [Hash] comparison results
+    # @api private
+    def self.display_reduction(result)
+      puts "Reduction:      #{format_bytes(result[:reduction_bytes])} (#{result[:reduction_percent]}%)"
+      puts "Token savings:  #{format_number(result[:token_reduction])} tokens (#{result[:token_reduction_percent]}%)"
+      puts "Factor:         #{result[:factor]}x smaller"
+    end
+    # Display increase statistics
+    #
+    # @param result [Hash] comparison results
+    # @api private
+    def self.display_increase(result)
+      increase_bytes = result[:reduction_bytes].abs
+      increase_percent = result[:reduction_percent].abs
+      token_increase = result[:token_reduction].abs
+      token_increase_percent = result[:token_reduction_percent].abs
+      puts "Increase:       #{format_bytes(increase_bytes)} (#{increase_percent}%)"
+      puts "Token increase: #{format_number(token_increase)} tokens (#{token_increase_percent}%)"
+      puts "Factor:         #{result[:factor]}x larger"
+    end
+  end
+end

data/lib/llm_docs_builder/parser.rb CHANGED

@@ -108,14 +108,12 @@ module LlmDocsBuilder
   # Represents parsed llms.txt content with structured access to sections
   #
   # Provides convenient access to parsed llms.txt sections including title,
-  # description, and link collections. Can be converted to Hash or XML formats.
+  # description, and link collections.
   #
   # @example Access parsed content
   #   parsed.title              # => "My Project"
   #   parsed.description        # => "A description"
   #   parsed.documentation_links # => [{title: "...", url: "...", description: "..."}]
-  #   parsed.to_h               # => Hash representation
-  #   parsed.to_xml             # => XML string
   #
   # @api public
   class ParsedContent
@@ -163,61 +161,5 @@ module LlmDocsBuilder
     def optional_links
       sections[:optional] || []
     end
-    # Convert to hash representation
-    #
-    # @return [Hash] hash containing all parsed sections
-    def to_h
-      sections
-    end
-    # Convert to XML representation
-    #
-    # Generates an XML document with all parsed sections and links.
-    #
-    # @return [String] XML string representation
-    def to_xml
-      builder = []
-      builder << '<?xml version="1.0" encoding="UTF-8"?>'
-      builder << '<llms_context>'
-      builder << "  <title>#{title}</title>" if title
-      builder << "  <description>#{description}</description>" if description
-      add_xml_section(builder, 'documentation', documentation_links)
-      add_xml_section(builder, 'examples', example_links)
-      add_xml_section(builder, 'optional', optional_links) if sections[:optional]
-      builder << '</llms_context>'
-      builder.join("\n")
-    end
-    private
-    # Appends section XML elements to builder array
-    #
-    # Handles both array of link hashes and raw string content
-    #
-    # @param builder [Array<String>] XML lines accumulator
-    # @param name [String] section name
-    # @param links [Array<Hash>, String] section links or content
-    def add_xml_section(builder, name, links)
-      return if links.empty?
-      builder << "  <#{name}>"
-      if links.is_a?(Array)
-        links.each do |link|
-          builder << '    <link>'
-          builder << "      <title>#{link[:title]}</title>"
-          builder << "      <url>#{link[:url]}</url>"
-          builder << "      <description>#{link[:description]}</description>"
-          builder << '    </link>'
-        end
-      else
-        builder << "    #{links}"
-      end
-      builder << "  </#{name}>"
-    end
   end
 end