RubyGems - llm-docs-builder - Versions diffs - 0.10.0 → 0.12.0 - Mend

llm-docs-builder 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

checksums.yaml +4 -4
data/.github/workflows/ci.yml +13 -0
data/.github/workflows/docker.yml +2 -2
data/.github/workflows/push.yml +2 -2
data/.gitignore +8 -0
data/CHANGELOG.md +13 -0
data/Gemfile +4 -0
data/Gemfile.lock +47 -18
data/README.md +19 -0
data/lib/llm_docs_builder/cli.rb +32 -10
data/lib/llm_docs_builder/comparator.rb +5 -75
data/lib/llm_docs_builder/config.rb +42 -2
data/lib/llm_docs_builder/helpers/prune_trailing_unsafe_link_separator.rb +31 -0
data/lib/llm_docs_builder/helpers/squeeze_blank_lines_outside_fences.rb +71 -0
data/lib/llm_docs_builder/helpers.rb +9 -0
data/lib/llm_docs_builder/html_detector.rb +159 -0
data/lib/llm_docs_builder/html_to_markdown/figure_code_block_renderer.rb +181 -0
data/lib/llm_docs_builder/html_to_markdown/table_markup_renderer.rb +597 -0
data/lib/llm_docs_builder/html_to_markdown_converter.rb +792 -0
data/lib/llm_docs_builder/markdown_transformer.rb +30 -5
data/lib/llm_docs_builder/output_formatter.rb +1 -1
data/lib/llm_docs_builder/transformers/base_transformer.rb +13 -1
data/lib/llm_docs_builder/url_fetcher.rb +138 -0
data/lib/llm_docs_builder/version.rb +1 -1
data/lib/llm_docs_builder.rb +11 -0
data/llm-docs-builder.gemspec +1 -0
metadata +23 -1

data/lib/llm_docs_builder/html_detector.rb ADDED Viewed

@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+module LlmDocsBuilder
+  # Detects whether input should be treated as HTML and offers related snippet checks.
+  #
+  # Detection is two-staged: a regex check on a short leading snippet, followed by
+  # a full Nokogiri parse that inspects text nodes sitting directly under the
+  # document root and under <body>.
+  class HtmlDetector
+    # Maximum number of characters kept in a detection snippet
+    DETECTION_SNIPPET_LENGTH = 500
+    # Matches a single leading HTML comment together with the whitespace that follows it
+    LEADING_COMMENT_PATTERN = /\A<!--.*?-->\s*/m
+    # Detect if loaded content is HTML instead of markdown
+    #
+    # The snippet check runs first, so the full parse only happens for content
+    # whose beginning already looks like HTML.
+    #
+    # @param content [String] raw content
+    # @param snippet [String, nil] optional precomputed snippet
+    # @return [Boolean]
+    def html_content?(content, snippet = detection_snippet(content))
+      return false unless html_content_snippet?(snippet)
+      full_html_document?(content)
+    end
+    # Prepare a snippet of content for HTML detection by removing leading whitespace
+    # and build metadata comments.
+    #
+    # @param content [String, nil]
+    # @return [String, nil] nil when content is nil; otherwise at most
+    #   DETECTION_SNIPPET_LENGTH characters, empty when nothing is left after
+    #   stripping whitespace and leading comments
+    def detection_snippet(content)
+      return unless content
+      # lstrip returns a copy, so the in-place sub! below never touches the caller's string.
+      snippet = content.lstrip
+      # Remote docs often include build metadata comments; skip them before tag detection.
+      # sub! returns nil once no leading comment remains, which ends the loop.
+      while snippet.sub!(LEADING_COMMENT_PATTERN, '')
+        return '' if snippet.empty?
+      end
+      snippet.lstrip[0, DETECTION_SNIPPET_LENGTH]
+    end
+    # Determine whether a snippet should be treated as HTML.
+    #
+    # A snippet containing a markdown heading line is never treated as HTML,
+    # even when it starts with a known tag.
+    #
+    # @param snippet [String, nil]
+    # @return [Boolean]
+    def html_content_snippet?(snippet)
+      return false unless snippet && !snippet.empty?
+      return false if markdown_heading_snippet?(snippet)
+      html_candidate_snippet?(snippet)
+    end
+    # Determine whether a snippet appears to start with HTML markup.
+    #
+    # Only the very beginning of the snippet is examined (the pattern is anchored
+    # with \A) and the tag name comparison is case-insensitive.
+    #
+    # @param snippet [String]
+    # @return [Boolean]
+    def html_candidate_snippet?(snippet)
+      snippet.match?(/\A<\s*(?:!DOCTYPE\s+html|html\b|body\b|head\b|article\b|section\b|main\b|p\b|div\b|table\b|thead\b|tbody\b|tr\b|td\b|th\b|meta\b|link\b|h[1-6]\b|ul\b|ol\b|li\b|blockquote\b)/i)
+    end
+    # Check if the full document should be treated as HTML by parsing it and
+    # ensuring we do not observe unwrapped markdown constructs like plain text or lists.
+    #
+    # Returns false when the parsed document has no <body>, when a meaningful text
+    # node sits directly under the document root, or when a meaningful text node
+    # directly under <body> is rejected by #allow_inline_body_text?.
+    #
+    # @param content [String]
+    # @return [Boolean] false as well when Nokogiri raises a syntax error
+    def full_html_document?(content)
+      document = Nokogiri::HTML::Document.parse(content)
+      body = document.at('body')
+      return false unless body
+      return false if document.xpath('/text()').any? { |node| meaningful_text?(node.text) }
+      # Only direct children of <body> are inspected; text nested in elements is ignored.
+      body.xpath('./text()').each do |node|
+        text = node.text
+        next unless meaningful_text?(text)
+        return false unless allow_inline_body_text?(content, text)
+      end
+      true
+    rescue Nokogiri::XML::SyntaxError
+      false
+    end
+    # Checks if text contains meaningful non-whitespace content
+    #
+    # @param text [String, nil]
+    # @return [Boolean] true if text contains non-whitespace characters
+    def meaningful_text?(text)
+      return false if text.nil?
+      stripped = text.strip
+      stripped.match?(/\S/)
+    end
+    # Checks if text looks like markdown syntax
+    #
+    # Lines that are blank or start with "<" are skipped. The remaining lines are
+    # tested for list bullets, numbered items, blockquotes, code fences and
+    # horizontal-rule style lines.
+    #
+    # @param text [String, nil]
+    # @return [Boolean] true if text contains markdown-like patterns
+    def markdown_like_text?(text)
+      return false if text.nil?
+      return true if markdown_heading_snippet?(text)
+      text.each_line do |line|
+        trimmed = line.lstrip
+        next if trimmed.empty?
+        next if trimmed.start_with?('<')
+        return true if trimmed.match?(/\A[*+-]\s+\S/)
+        return true if trimmed.match?(/\A\d+\.\s+\S/)
+        return true if trimmed.match?(/\A>\s+\S/)
+        return true if trimmed.start_with?('```', '~~~')
+        return true if trimmed.strip.match?(/\A(?:-{3,}|_{3,}|={3,})\z/)
+      end
+      false
+    end
+    # Determines if inline body text should be allowed in HTML context
+    #
+    # Text is allowed only when it does not look like markdown and the content
+    # carries explicit document wrapper tags.
+    #
+    # @param content [String] full content being processed
+    # @param text [String] specific text to check
+    # @return [Boolean] true if inline body text is acceptable
+    def allow_inline_body_text?(content, text)
+      return false if markdown_like_text?(text)
+      html_with_body_wrapper?(content)
+    end
+    # Checks if content has HTML document structure wrapper tags
+    #
+    # The tags are searched anywhere in the content, not only at its start.
+    #
+    # @param content [String] content to check for HTML wrapper tags
+    # @return [Boolean] true if content contains DOCTYPE, html, or body tags
+    def html_with_body_wrapper?(content)
+      content.match?(/<\s*!DOCTYPE\s+html/i) ||
+        content.match?(/<\s*html\b/i) ||
+        content.match?(/<\s*body\b/i)
+    end
+    # Detect whether the snippet represents a table fragment we should preserve.
+    #
+    # @param snippet [String, nil]
+    # @return [Boolean] true when the snippet starts with a table, thead, tbody, tr, td or th tag
+    def table_fragment?(snippet)
+      return false unless snippet && !snippet.empty?
+      snippet.match?(/\A<\s*(?:table|thead|tbody|tr|td|th)\b/i)
+    end
+    # Detect common markdown heading syntax within the snippet.
+    #
+    # Every line is checked, not just the first one; blank lines and lines that
+    # start with "<" are skipped.
+    #
+    # @param snippet [String]
+    # @return [Boolean] true when any remaining line starts with one or more "#" followed by whitespace
+    def markdown_heading_snippet?(snippet)
+      snippet.each_line do |line|
+        trimmed = line.lstrip
+        next if trimmed.empty?
+        next if trimmed.start_with?('<')
+        return true if trimmed.match?(/\A#+\s+/)
+      end
+      false
+    end
+  end
+end

data/lib/llm_docs_builder/html_to_markdown/figure_code_block_renderer.rb ADDED Viewed

@@ -0,0 +1,181 @@
+# frozen_string_literal: true
+module LlmDocsBuilder
+  # Namespace for the HTML to Markdown conversion helpers
+  #
+  # Holds the specialized renderers that turn HTML elements into Markdown,
+  # covering complex structures such as tables, figures, and
+  # syntax-highlighted code blocks.
+  #
+  # @api private
+  module HtmlToMarkdown
+    # Turns a <figure> element that wraps syntax-highlighted code into a fenced Markdown block.
+    class FigureCodeBlockRenderer
+      # Class names that only describe code layout and therefore never name a language
+      GENERIC_CODE_CLASSES = %w[highlight code main gutter numbers line-numbers line-number line wrap table].freeze
+      # @return [Nokogiri::XML::Node, nil] the <pre> node selected by the last render, if any
+      attr_reader :code_block_node
+      # Build a renderer for a single figure element
+      #
+      # @param element [Nokogiri::XML::Node] the figure element to render
+      # @param inline_collapser [Proc] called with the figcaption node to produce caption text
+      # @param fence_calculator [Proc] called with the code body to produce the fence string
+      def initialize(element, inline_collapser:, fence_calculator:)
+        @element = element
+        @inline_collapser = inline_collapser
+        @fence_calculator = fence_calculator
+      end
+      # Produce the fenced code block for the figure
+      #
+      # The opening fence is followed by the detected language and the caption,
+      # separated by a space, whenever either of them is present.
+      #
+      # @return [String, nil] markdown fenced code block, or nil when the figure
+      #   is not a code figure or yields no code lines
+      def render
+        @code_block_node = nil
+        return unless code_figure?
+        code_lines = extract_figure_code_lines
+        return if code_lines.empty?
+        info_parts = [detect_code_language, caption_text].compact.reject(&:empty?)
+        body = code_lines.join("\n")
+        fence = fence_calculator.call(body)
+        "#{fence}#{info_parts.join(' ')}\n#{body}\n#{fence}"
+      end
+      private
+      # @!attribute [r] element
+      #   @return [Nokogiri::XML::Node] the figure element being processed
+      # @!attribute [r] inline_collapser
+      #   @return [Proc] callable for collapsing inline content
+      # @!attribute [r] fence_calculator
+      #   @return [Proc] callable for calculating the fence
+      attr_reader :element, :inline_collapser, :fence_calculator
+      # Collapse the figcaption, when there is one, into caption text
+      #
+      # @return [String, nil] caption text or nil if the figure has no figcaption
+      def caption_text
+        figcaption = element.at_css('figcaption')
+        inline_collapser.call(figcaption) unless figcaption.nil?
+      end
+      # Tell whether the figure is marked as code
+      #
+      # @return [Boolean] true if one of the figure's classes equals "code", ignoring case
+      def code_figure?
+        class_tokens(element).any? do |name|
+          name.casecmp('code').zero?
+        end
+      end
+      # Locate the <pre> holding the code and split its content into lines
+      #
+      # The selected <pre> (or nil) is remembered in @code_block_node.
+      #
+      # @return [Array<String>] cleaned code lines, empty when no <pre> is found
+      def extract_figure_code_lines
+        # Ordered from the most specific layout to the plain fallback; first hit wins.
+        pre_node = nil
+        ['td.main pre', 'td:not(.line-numbers) pre', 'div.highlight pre', 'pre'].each do |selector|
+          pre_node = element.at_css(selector)
+          break if pre_node
+        end
+        @code_block_node = pre_node
+        return [] unless pre_node
+        line_nodes = pre_node.css('.line')
+        if line_nodes.any?
+          return clean_code_lines(line_nodes.map { |node| extract_code_line_text(node) })
+        end
+        # No per-line markup: fall back to the raw text of <code>, or of <pre> itself.
+        code_node = pre_node.at_css('code')
+        raw_text = code_node ? code_node.text : pre_node.text
+        clean_code_lines(raw_text.to_s.gsub(/\r\n?/, "\n").split("\n", -1))
+      end
+      # Read the text of one ".line" node
+      #
+      # Non-breaking spaces become regular spaces, carriage returns (with an
+      # optional following line feed) are removed, and trailing whitespace is dropped.
+      #
+      # @param line_node [Nokogiri::XML::Element] line element
+      # @return [String] extracted text
+      def extract_code_line_text(line_node)
+        joined = line_node.xpath('.//text()').map(&:text).join
+        joined.tr("\u00a0", ' ').gsub(/\r\n?/, '').rstrip
+      end
+      # Normalize line endings and drop blank lines from both ends
+      #
+      # @param lines [Array<String>] raw code lines
+      # @return [Array<String>] lines with leading and trailing blank entries removed
+      def clean_code_lines(lines)
+        normalized = lines.map { |line| line.to_s.gsub(/\r\n?/, "\n") }
+        first_kept = normalized.index { |line| !line.strip.empty? }
+        # Every line is blank (or there are none at all).
+        return [] if first_kept.nil?
+        last_kept = normalized.rindex { |line| !line.strip.empty? }
+        normalized[first_kept..last_kept]
+      end
+      # Find the programming language advertised by the figure's markup
+      #
+      # The first code, pre, td.main and div.highlight nodes and the figure itself
+      # are consulted first, then every descendant carrying a language-related
+      # attribute or a class.
+      #
+      # @return [String, nil] detected language or nil
+      def detect_code_language
+        nodes = %w[code pre td.main div.highlight].map { |selector| element.at_css(selector) }
+        nodes << element
+        nodes.compact!
+        nodes.concat(element.css('[data-language], [data-lang], [lang], [class]'))
+        nodes.each do |node|
+          found = extract_language_from_node(node)
+          next if found.nil? || found.empty?
+          return found
+        end
+        nil
+      end
+      # Read a language identifier from a single node
+      #
+      # The data-language, data-lang and lang attributes take precedence. Failing
+      # that, the class list is scanned: a "language-x" or "lang-x" token yields
+      # "x", and any other token outside GENERIC_CODE_CLASSES is returned as is.
+      #
+      # @param node [Nokogiri::XML::Element] element to examine
+      # @return [String, nil] language identifier or nil
+      def extract_language_from_node(node)
+        %w[data-language data-lang lang].each do |attr|
+          stripped = node[attr].to_s.strip
+          return stripped unless stripped.empty?
+        end
+        class_attr = node['class']
+        return nil if class_attr.nil? || class_attr.strip.empty?
+        class_tokens(node).each do |token|
+          prefixed = token.match(/\A(?:language|lang)-(.*)\z/i)
+          if prefixed
+            candidate = prefixed[1].to_s.strip
+            return candidate unless candidate.empty?
+          end
+          return token unless GENERIC_CODE_CLASSES.include?(token.downcase)
+        end
+        nil
+      end
+      # Split a node's class attribute into individual class names
+      #
+      # @param node [Nokogiri::XML::Element] element to examine
+      # @return [Array<String>] class names, empty when the attribute is missing
+      def class_tokens(node)
+        node['class'].to_s.split(/\s+/).reject(&:empty?)
+      end
+    end
+  end
+end