RubyGems - llm-docs-builder - Versions diffs - 0.6.0 → 0.7.0 - Mend

llm-docs-builder 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

checksums.yaml +4 -4
data/.rspec +3 -0
data/CHANGELOG.md +37 -0
data/Gemfile.lock +1 -1
data/README.md +182 -555
data/bin/rspecs +2 -1
data/lib/llm_docs_builder/cli.rb +1 -62
data/lib/llm_docs_builder/comparator.rb +4 -16
data/lib/llm_docs_builder/config.rb +42 -5
data/lib/llm_docs_builder/markdown_transformer.rb +54 -128
data/lib/llm_docs_builder/output_formatter.rb +93 -0
data/lib/llm_docs_builder/parser.rb +1 -59
data/lib/llm_docs_builder/text_compressor.rb +164 -0
data/lib/llm_docs_builder/token_estimator.rb +52 -0
data/lib/llm_docs_builder/transformers/base_transformer.rb +30 -0
data/lib/llm_docs_builder/transformers/content_cleanup_transformer.rb +106 -0
data/lib/llm_docs_builder/transformers/enhancement_transformer.rb +95 -0
data/lib/llm_docs_builder/transformers/link_transformer.rb +84 -0
data/lib/llm_docs_builder/transformers/whitespace_transformer.rb +44 -0
data/lib/llm_docs_builder/version.rb +1 -1
metadata +10 -3
data/CLAUDE.md +0 -178
data/llm-docs-builder.yml +0 -7

data/bin/rspecs CHANGED Viewed

@@ -4,4 +4,5 @@
 set -e
 echo "Running all tests..."
-bundle exec rspec --format documentation
+# Explicitly specify the spec directory to ensure all tests are discovered
+bundle exec rspec spec/ --format documentation

data/lib/llm_docs_builder/cli.rb CHANGED Viewed

@@ -295,8 +295,6 @@ module LlmDocsBuilder
         puts "Documentation Links: #{parsed.documentation_links.size}"
         puts "Example Links: #{parsed.example_links.size}" if parsed.respond_to?(:example_links)
         puts "Optional Links: #{parsed.optional_links.size}" if parsed.respond_to?(:optional_links)
-      elsif parsed.respond_to?(:to_xml)
-        puts parsed.to_xml
       end
     end
@@ -335,72 +333,13 @@ module LlmDocsBuilder
       begin
         result = comparator.compare
-        display_comparison_results(result)
+        OutputFormatter.display_comparison_results(result)
       rescue LlmDocsBuilder::Errors::BaseError => e
         puts "Error during comparison: #{e.message}"
         exit 1
       end
     end
-    # Display formatted comparison results
-    #
-    # @param result [Hash] comparison results from Comparator
-    def display_comparison_results(result)
-      puts ''
-      puts '=' * 60
-      puts 'Context Window Comparison'
-      puts '=' * 60
-      puts ''
-      puts "Human version:  #{format_bytes(result[:human_size])} (~#{format_number(result[:human_tokens])} tokens)"
-      puts "  Source: #{result[:human_source]}"
-      puts ''
-      puts "AI version:     #{format_bytes(result[:ai_size])} (~#{format_number(result[:ai_tokens])} tokens)"
-      puts "  Source: #{result[:ai_source]}"
-      puts ''
-      puts '-' * 60
-      if result[:reduction_bytes].positive?
-        puts "Reduction:      #{format_bytes(result[:reduction_bytes])} (#{result[:reduction_percent]}%)"
-        puts "Token savings:  #{format_number(result[:token_reduction])} tokens (#{result[:token_reduction_percent]}%)"
-        puts "Factor:         #{result[:factor]}x smaller"
-      elsif result[:reduction_bytes].negative?
-        increase_bytes = result[:reduction_bytes].abs
-        increase_percent = result[:reduction_percent].abs
-        token_increase = result[:token_reduction].abs
-        token_increase_percent = result[:token_reduction_percent].abs
-        puts "Increase:       #{format_bytes(increase_bytes)} (#{increase_percent}%)"
-        puts "Token increase: #{format_number(token_increase)} tokens (#{token_increase_percent}%)"
-        puts "Factor:         #{result[:factor]}x larger"
-      else
-        puts 'Same size'
-      end
-      puts '=' * 60
-      puts ''
-    end
-    # Format bytes into human-readable string
-    #
-    # @param bytes [Integer] number of bytes
-    # @return [String] formatted string with units
-    def format_bytes(bytes)
-      if bytes < 1024
-        "#{bytes} bytes"
-      elsif bytes < 1024 * 1024
-        "#{(bytes / 1024.0).round(1)} KB"
-      else
-        "#{(bytes / (1024.0 * 1024)).round(2)} MB"
-      end
-    end
-    # Format number with comma separators for readability
-    #
-    # @param number [Integer] number to format
-    # @return [String] formatted number with commas
-    def format_number(number)
-      number.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
-    end
     # Validate llms.txt file format
     #
     # Checks if llms.txt file follows proper format with title, description, and documentation links.

data/lib/llm_docs_builder/comparator.rb CHANGED Viewed

@@ -231,9 +231,10 @@ module LlmDocsBuilder
                  Float::INFINITY
                end
-      # Estimate tokens
-      human_tokens = estimate_tokens(human_content)
-      ai_tokens = estimate_tokens(ai_content)
+      # Estimate tokens using TokenEstimator
+      estimator = TokenEstimator.new
+      human_tokens = estimator.estimate(human_content)
+      ai_tokens = estimator.estimate(ai_content)
       token_reduction = human_tokens - ai_tokens
       token_reduction_percent = if human_tokens.positive?
                                   ((token_reduction.to_f / human_tokens) * 100).round
@@ -256,18 +257,5 @@ module LlmDocsBuilder
       }
     end
-    # Estimate token count using character-based approximation
-    #
-    # Uses the common heuristic that ~4 characters equals 1 token for English text.
-    # This provides reasonable estimates for documentation content without requiring
-    # external tokenizer dependencies.
-    #
-    # @param content [String] text content to estimate tokens for
-    # @return [Integer] estimated number of tokens
-    def estimate_tokens(content)
-      # Use 4 characters per token as a reasonable approximation
-      # This is a common heuristic for English text and works well for documentation
-      (content.length / 4.0).round
-    end
   end
 end

data/lib/llm_docs_builder/config.rb CHANGED Viewed

@@ -70,28 +70,65 @@ module LlmDocsBuilder
         remove_comments: if options.key?(:remove_comments)
                            options[:remove_comments]
                          else
-                           self['remove_comments'] || false
+                           self['remove_comments'] || true
                          end,
         normalize_whitespace: if options.key?(:normalize_whitespace)
                                 options[:normalize_whitespace]
                               else
-                                self['normalize_whitespace'] || false
+                                self['normalize_whitespace'] || true
                               end,
         remove_badges: if options.key?(:remove_badges)
                          options[:remove_badges]
                        else
-                         self['remove_badges'] || false
+                         self['remove_badges'] || true
                        end,
         remove_frontmatter: if options.key?(:remove_frontmatter)
                               options[:remove_frontmatter]
                             else
-                              self['remove_frontmatter'] || false
+                              self['remove_frontmatter'] || true
                             end,
         verbose: options.key?(:verbose) ? options[:verbose] : (self['verbose'] || false),
         # Bulk transformation options
         suffix: options[:suffix] || self['suffix'] || '.llm',
         excludes: options[:excludes] || self['excludes'] || [],
-        bulk: options.key?(:bulk) ? options[:bulk] : (self['bulk'] || false)
+        bulk: options.key?(:bulk) ? options[:bulk] : (self['bulk'] || false),
+        # New compression options
+        remove_code_examples: if options.key?(:remove_code_examples)
+                                options[:remove_code_examples]
+                              else
+                                self['remove_code_examples'] || false
+                              end,
+        remove_images: if options.key?(:remove_images)
+                         options[:remove_images]
+                       else
+                         self['remove_images'] || false
+                       end,
+        simplify_links: if options.key?(:simplify_links)
+                          options[:simplify_links]
+                        else
+                          self['simplify_links'] || false
+                        end,
+        remove_blockquotes: if options.key?(:remove_blockquotes)
+                              options[:remove_blockquotes]
+                            else
+                              self['remove_blockquotes'] || false
+                            end,
+        generate_toc: if options.key?(:generate_toc)
+                        options[:generate_toc]
+                      else
+                        self['generate_toc'] || false
+                      end,
+        custom_instruction: options[:custom_instruction] || self['custom_instruction'],
+        remove_stopwords: if options.key?(:remove_stopwords)
+                            options[:remove_stopwords]
+                          else
+                            self['remove_stopwords'] || false
+                          end,
+        remove_duplicates: if options.key?(:remove_duplicates)
+                             options[:remove_duplicates]
+                           else
+                             self['remove_duplicates'] || false
+                           end
       }
     end

data/lib/llm_docs_builder/markdown_transformer.rb CHANGED Viewed

@@ -3,9 +3,8 @@
 module LlmDocsBuilder
   # Transforms markdown files to be AI-friendly
   #
-  # Processes individual markdown files to make them more suitable for LLM consumption by
-  # expanding relative links to absolute URLs and converting HTML URLs to markdown-friendly
-  # formats.
+  # Orchestrates a pipeline of specialized transformers to process markdown content.
+  # Each transformer is responsible for a specific aspect of the transformation.
   #
   # @example Transform with base URL
   #   transformer = LlmDocsBuilder::MarkdownTransformer.new('README.md',
@@ -31,163 +30,90 @@ module LlmDocsBuilder
     # @option options [Boolean] :normalize_whitespace normalize excessive whitespace
     # @option options [Boolean] :remove_badges remove badge/shield images
     # @option options [Boolean] :remove_frontmatter remove YAML/TOML frontmatter
+    # @option options [Boolean] :remove_code_examples remove code blocks and inline code
+    # @option options [Boolean] :remove_images remove image syntax
+    # @option options [Boolean] :simplify_links shorten verbose link text
+    # @option options [Boolean] :remove_blockquotes remove blockquote formatting
+    # @option options [Boolean] :generate_toc generate table of contents at the top
+    # @option options [String] :custom_instruction custom instruction text to inject at top
+    # @option options [Boolean] :remove_stopwords remove common stopwords (aggressive)
+    # @option options [Boolean] :remove_duplicates remove duplicate paragraphs
     def initialize(file_path, options = {})
       @file_path = file_path
       @options = options
     end
-    # Transform markdown content to be AI-friendly
+    # Transform markdown content using a pipeline of transformers
     #
-    # Applies transformations to make the markdown more suitable for LLM processing:
-    # - Removes YAML/TOML frontmatter (if remove_frontmatter enabled)
-    # - Expands relative links to absolute URLs (if base_url provided)
-    # - Converts HTML URLs to markdown format (if convert_urls enabled)
-    # - Removes HTML comments (if remove_comments enabled)
-    # - Removes badge/shield images (if remove_badges enabled)
-    # - Normalizes excessive whitespace (if normalize_whitespace enabled)
+    # Processes content through specialized transformers in order:
+    # 1. ContentCleanupTransformer - Removes unwanted elements
+    # 2. LinkTransformer - Processes links
+    # 3. TextCompressor - Advanced compression (if enabled)
+    # 4. EnhancementTransformer - Adds TOC and instructions
+    # 5. WhitespaceTransformer - Normalizes whitespace
     #
     # @return [String] transformed markdown content
     def transform
       content = File.read(file_path)
-      # Remove frontmatter first (before any other processing)
-      content = remove_frontmatter(content) if options[:remove_frontmatter]
-      # Link transformations
-      content = expand_relative_links(content) if options[:base_url]
-      content = convert_html_urls(content) if options[:convert_urls]
-      # Content cleanup
-      content = remove_comments(content) if options[:remove_comments]
-      content = remove_badges(content) if options[:remove_badges]
-      # Whitespace normalization last (after all other transformations)
-      content = normalize_whitespace(content) if options[:normalize_whitespace]
+      # Build and execute transformation pipeline
+      content = cleanup_transformer.transform(content, options)
+      content = link_transformer.transform(content, options)
+      content = compress_content(content) if should_compress?
+      content = enhancement_transformer.transform(content, options)
+      content = whitespace_transformer.transform(content, options)
       content
     end
     private
-    # Expand relative links to absolute URLs
+    # Get content cleanup transformer instance
     #
-    # Converts markdown links like `[text](./path.md)` to `[text](https://base.url/path.md)`.
-    # Leaves absolute URLs and anchors unchanged.
-    #
-    # @param content [String] markdown content to process
-    # @return [String] content with expanded links
-    def expand_relative_links(content)
-      base_url = options[:base_url]
-      content.gsub(/\[([^\]]+)\]\(([^)]+)\)/) do |match|
-        text = ::Regexp.last_match(1)
-        url = ::Regexp.last_match(2)
-        if url.start_with?('http://', 'https://', '//', '#')
-          match # Already absolute or anchor
-        else
-          # Clean up relative path
-          clean_url = url.gsub(%r{^\./}, '') # Remove leading './'
-          expanded_url = File.join(base_url, clean_url)
-          "[#{text}](#{expanded_url})"
-        end
-      end
+    # @return [Transformers::ContentCleanupTransformer]
+    def cleanup_transformer
+      @cleanup_transformer ||= Transformers::ContentCleanupTransformer.new
     end
-    # Convert HTML URLs to markdown-friendly format
+    # Get link transformer instance
     #
-    # Changes URLs ending in .html or .htm to .md for better LLM understanding
-    #
-    # @param content [String] markdown content to process
-    # @return [String] content with converted URLs
-    def convert_html_urls(content)
-      content.gsub(%r{https?://[^\s<>]+\.html?(?=[)\s]|$)}) do |url|
-        url.sub(/\.html?$/, '.md')
-      end
+    # @return [Transformers::LinkTransformer]
+    def link_transformer
+      @link_transformer ||= Transformers::LinkTransformer.new
     end
-    # Remove HTML comments from markdown content
-    #
-    # Strips out HTML comments (<!-- ... -->) which are typically metadata for developers
-    # and not relevant for LLM consumption. This reduces token usage and improves clarity.
-    #
-    # Handles:
-    # - Single-line comments: <!-- comment -->
-    # - Multi-line comments spanning multiple lines
-    # - Multiple comments in the same content
+    # Get enhancement transformer instance
     #
-    # @param content [String] markdown content to process
-    # @return [String] content with comments removed
-    def remove_comments(content)
-      # Remove HTML comments (single and multi-line)
-      # The .*? makes it non-greedy so it stops at the first -->
-      content.gsub(/<!--.*?-->/m, '')
+    # @return [Transformers::EnhancementTransformer]
+    def enhancement_transformer
+      @enhancement_transformer ||= Transformers::EnhancementTransformer.new
     end
-    # Remove badge and shield images from markdown
+    # Get whitespace transformer instance
     #
-    # Strips out badge/shield images (typically from shields.io, badge.fury.io, etc.)
-    # which are visual indicators for humans but provide no value to LLMs.
-    #
-    # Recognizes common patterns:
-    # - [![Badge](badge.svg)](link) (linked badges)
-    # - ![Badge](badge.svg) (unlinked badges)
-    # - Common badge domains: shields.io, badge.fury.io, travis-ci.org, etc.
-    #
-    # @param content [String] markdown content to process
-    # @return [String] content with badges removed
-    def remove_badges(content)
-      # Remove linked badges: [![...](badge-url)](link-url)
-      content = content.gsub(/\[\!\[([^\]]*)\]\([^\)]*(?:badge|shield|svg|travis|coveralls|fury)[^\)]*\)\]\([^\)]*\)/i, '')
-      # Remove standalone badges: ![...](badge-url)
-      content = content.gsub(/!\[([^\]]*)\]\([^\)]*(?:badge|shield|svg|travis|coveralls|fury)[^\)]*\)/i, '')
-      content
+    # @return [Transformers::WhitespaceTransformer]
+    def whitespace_transformer
+      @whitespace_transformer ||= Transformers::WhitespaceTransformer.new
     end
-    # Remove YAML or TOML frontmatter from markdown
-    #
-    # Strips out frontmatter blocks which are metadata used by static site generators
-    # (Jekyll, Hugo, etc.) but not relevant for LLM consumption.
+    # Check if content compression should be applied
     #
-    # Recognizes:
-    # - YAML frontmatter: --- ... ---
-    # - TOML frontmatter: +++ ... +++
-    #
-    # @param content [String] markdown content to process
-    # @return [String] content with frontmatter removed
-    def remove_frontmatter(content)
-      # Remove YAML frontmatter (--- ... ---)
-      content = content.sub(/\A---\s*$.*?^---\s*$/m, '')
-      # Remove TOML frontmatter (+++ ... +++)
-      content = content.sub(/\A\+\+\+\s*$.*?^\+\+\+\s*$/m, '')
-      content
+    # @return [Boolean]
+    def should_compress?
+      options[:remove_stopwords] || options[:remove_duplicates]
     end
-    # Normalize excessive whitespace in markdown
-    #
-    # Reduces excessive blank lines and trailing whitespace to make content more compact
-    # for LLM consumption without affecting readability.
-    #
-    # Transformations:
-    # - Multiple consecutive blank lines (3+) → 2 blank lines max
-    # - Trailing whitespace on lines → removed
-    # - Leading/trailing whitespace in file → trimmed
-    #
-    # @param content [String] markdown content to process
-    # @return [String] content with normalized whitespace
-    def normalize_whitespace(content)
-      # Remove trailing whitespace from each line
-      content = content.gsub(/ +$/, '')
-      # Reduce multiple consecutive blank lines to maximum of 2
-      content = content.gsub(/\n{4,}/, "\n\n\n")
-      # Trim leading and trailing whitespace from the entire content
-      content.strip
+    # Compress content using TextCompressor
+    #
+    # @param content [String] content to compress
+    # @return [String] compressed content
+    def compress_content(content)
+      compressor = TextCompressor.new
+      compression_methods = {
+        remove_stopwords: options[:remove_stopwords],
+        remove_duplicates: options[:remove_duplicates]
+      }
+      compressor.compress(content, compression_methods)
     end
   end
 end

data/lib/llm_docs_builder/output_formatter.rb ADDED Viewed

@@ -0,0 +1,93 @@
+# frozen_string_literal: true
+module LlmDocsBuilder
+  # Formats output for CLI display
+  #
+  # Provides formatting utilities for displaying comparison results,
+  # byte sizes, and numbers in a user-friendly way.
+  #
+  # @api private
+  class OutputFormatter
+    # Format bytes into human-readable string
+    #
+    # @param bytes [Integer] number of bytes
+    # @return [String] formatted string with units (bytes/KB/MB)
+    #
+    # @example
+    #   OutputFormatter.format_bytes(1024)      #=> "1.0 KB"
+    #   OutputFormatter.format_bytes(1048576)   #=> "1.0 MB"
+    def self.format_bytes(bytes)
+      if bytes < 1024
+        "#{bytes} bytes"
+      elsif bytes < 1024 * 1024
+        "#{(bytes / 1024.0).round(1)} KB"
+      else
+        "#{(bytes / (1024.0 * 1024)).round(2)} MB"
+      end
+    end
+    # Format number with comma separators for readability
+    #
+    # @param number [Integer] number to format
+    # @return [String] formatted number with commas
+    #
+    # @example
+    #   OutputFormatter.format_number(1234567)  #=> "1,234,567"
+    def self.format_number(number)
+      number.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
+    end
+    # Display formatted comparison results
+    #
+    # @param result [Hash] comparison results from Comparator
+    def self.display_comparison_results(result)
+      puts ''
+      puts '=' * 60
+      puts 'Context Window Comparison'
+      puts '=' * 60
+      puts ''
+      puts "Human version:  #{format_bytes(result[:human_size])} (~#{format_number(result[:human_tokens])} tokens)"
+      puts "  Source: #{result[:human_source]}"
+      puts ''
+      puts "AI version:     #{format_bytes(result[:ai_size])} (~#{format_number(result[:ai_tokens])} tokens)"
+      puts "  Source: #{result[:ai_source]}"
+      puts ''
+      puts '-' * 60
+      if result[:reduction_bytes].positive?
+        display_reduction(result)
+      elsif result[:reduction_bytes].negative?
+        display_increase(result)
+      else
+        puts 'Same size'
+      end
+      puts '=' * 60
+      puts ''
+    end
+    # Display reduction statistics
+    #
+    # @param result [Hash] comparison results
+    # @api private
+    def self.display_reduction(result)
+      puts "Reduction:      #{format_bytes(result[:reduction_bytes])} (#{result[:reduction_percent]}%)"
+      puts "Token savings:  #{format_number(result[:token_reduction])} tokens (#{result[:token_reduction_percent]}%)"
+      puts "Factor:         #{result[:factor]}x smaller"
+    end
+    # Display increase statistics
+    #
+    # @param result [Hash] comparison results
+    # @api private
+    def self.display_increase(result)
+      increase_bytes = result[:reduction_bytes].abs
+      increase_percent = result[:reduction_percent].abs
+      token_increase = result[:token_reduction].abs
+      token_increase_percent = result[:token_reduction_percent].abs
+      puts "Increase:       #{format_bytes(increase_bytes)} (#{increase_percent}%)"
+      puts "Token increase: #{format_number(token_increase)} tokens (#{token_increase_percent}%)"
+      puts "Factor:         #{result[:factor]}x larger"
+    end
+  end
+end

data/lib/llm_docs_builder/parser.rb CHANGED Viewed

@@ -108,14 +108,12 @@ module LlmDocsBuilder
   # Represents parsed llms.txt content with structured access to sections
   #
   # Provides convenient access to parsed llms.txt sections including title,
-  # description, and link collections. Can be converted to Hash or XML formats.
+  # description, and link collections.
   #
   # @example Access parsed content
   #   parsed.title              # => "My Project"
   #   parsed.description        # => "A description"
   #   parsed.documentation_links # => [{title: "...", url: "...", description: "..."}]
-  #   parsed.to_h               # => Hash representation
-  #   parsed.to_xml             # => XML string
   #
   # @api public
   class ParsedContent
@@ -163,61 +161,5 @@ module LlmDocsBuilder
     def optional_links
       sections[:optional] || []
     end
-    # Convert to hash representation
-    #
-    # @return [Hash] hash containing all parsed sections
-    def to_h
-      sections
-    end
-    # Convert to XML representation
-    #
-    # Generates an XML document with all parsed sections and links.
-    #
-    # @return [String] XML string representation
-    def to_xml
-      builder = []
-      builder << '<?xml version="1.0" encoding="UTF-8"?>'
-      builder << '<llms_context>'
-      builder << "  <title>#{title}</title>" if title
-      builder << "  <description>#{description}</description>" if description
-      add_xml_section(builder, 'documentation', documentation_links)
-      add_xml_section(builder, 'examples', example_links)
-      add_xml_section(builder, 'optional', optional_links) if sections[:optional]
-      builder << '</llms_context>'
-      builder.join("\n")
-    end
-    private
-    # Appends section XML elements to builder array
-    #
-    # Handles both array of link hashes and raw string content
-    #
-    # @param builder [Array<String>] XML lines accumulator
-    # @param name [String] section name
-    # @param links [Array<Hash>, String] section links or content
-    def add_xml_section(builder, name, links)
-      return if links.empty?
-      builder << "  <#{name}>"
-      if links.is_a?(Array)
-        links.each do |link|
-          builder << '    <link>'
-          builder << "      <title>#{link[:title]}</title>"
-          builder << "      <url>#{link[:url]}</url>"
-          builder << "      <description>#{link[:description]}</description>"
-          builder << '    </link>'
-        end
-      else
-        builder << "    #{links}"
-      end
-      builder << "  </#{name}>"
-    end
   end
 end