RubyGems - llm-docs-builder - Versions diffs - 0.11.0 → 0.12.0 - Mend

llm-docs-builder 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

checksums.yaml +4 -4
data/.github/workflows/ci.yml +13 -0
data/.github/workflows/docker.yml +2 -2
data/.github/workflows/push.yml +2 -2
data/.gitignore +8 -0
data/CHANGELOG.md +7 -0
data/Gemfile +4 -0
data/Gemfile.lock +34 -5
data/README.md +16 -0
data/lib/llm_docs_builder/config.rb +33 -0
data/lib/llm_docs_builder/helpers/prune_trailing_unsafe_link_separator.rb +31 -0
data/lib/llm_docs_builder/helpers/squeeze_blank_lines_outside_fences.rb +71 -0
data/lib/llm_docs_builder/helpers.rb +9 -0
data/lib/llm_docs_builder/html_detector.rb +159 -0
data/lib/llm_docs_builder/html_to_markdown/figure_code_block_renderer.rb +181 -0
data/lib/llm_docs_builder/html_to_markdown/table_markup_renderer.rb +597 -0
data/lib/llm_docs_builder/html_to_markdown_converter.rb +792 -0
data/lib/llm_docs_builder/markdown_transformer.rb +23 -9
data/lib/llm_docs_builder/output_formatter.rb +1 -1
data/lib/llm_docs_builder/transformers/base_transformer.rb +13 -1
data/lib/llm_docs_builder/url_fetcher.rb +18 -0
data/lib/llm_docs_builder/version.rb +1 -1
data/lib/llm_docs_builder.rb +10 -0
data/llm-docs-builder.gemspec +1 -0
metadata +22 -2
data/AGENTS.md +0 -20

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: deeae74a329018b4a43d7845a3be8b7c31347699c3ff7abd93d7b697a48982a3
-  data.tar.gz: f9c842caa93a45b4d75c45a15e116e6f98d8463e5268e2de32e498c725877e4f
+  metadata.gz: ac257dad79f49ed6993f784f8a28ee1e996e735fef4581449ad521ea9414a5d4
+  data.tar.gz: 29e1d2d578d57ea6f17aafca070c61b6161b6313d6614f0e4f798933ceae082d
 SHA512:
-  metadata.gz: 94575eced147bd6740b5395acd41d3f46ffcadf40908831df081d5e03f56b35a2e1e9acfdfc7642af775b2aa86fe48ea322dd11baf48512d2f2ef43a1a491079
-  data.tar.gz: 52b7d40d4a95acd20a408f4d453f32c7154637ce42ec210cf67010ce10dbe14ad711c4cbfd4060d8ce5018b91668315d552732ea7118374e69228a869792ff0f
+  metadata.gz: f82216cca621e942c0e6ad3d92aba5d099159cc9c0d10c1d010a85e2a740511103cebd0198c0056195775064853e749472dcb7f0939b8d3fda7753d291a5b0da
+  data.tar.gz: 31aa5737e215439b11a2e79d793dabb9ff342206b660a2ecd846920bc2f6501c3d5910da4cdc52ecfcfa9f7b9acef14213b17936edd23d86808c0bcb2f391952

data/.github/workflows/ci.yml CHANGED Viewed

@@ -54,6 +54,18 @@ jobs:
           GITHUB_COVERAGE: ${{ matrix.coverage }}
         run: bin/rspecs
+  yard-lint:
+    timeout-minutes: 5
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5
+      - name: Set up Ruby
+        uses: ruby/setup-ruby@v1
+        with:
+          ruby-version: '3.4.7'
+          bundler-cache: true
+      - name: Run yard-lint
+        run: bundle exec yard-lint lib/
   ci-success:
     name: CI Success
@@ -61,6 +73,7 @@ jobs:
     if: always()
     needs:
       - specs
+      - yard-lint
     steps:
       - name: Check all jobs passed
         if: |

data/.github/workflows/docker.yml CHANGED Viewed

@@ -31,7 +31,7 @@ jobs:
       - name: Docker meta
         id: meta
-        uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f # v5
+        uses: docker/metadata-action@318604b99e75e41977312d83839a89be02ca4893 # v5
         with:
           images: |
             mensfeld/llm-docs-builder
@@ -45,7 +45,7 @@ jobs:
             type=raw,value=latest,enable={{is_default_branch}}
       - name: Set up QEMU
-        uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3
+        uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3

data/.github/workflows/push.yml CHANGED Viewed

@@ -24,7 +24,7 @@ jobs:
           fetch-depth: 0
       - name: Set up Ruby
-        uses: ruby/setup-ruby@4ff6f3611a42bc75eee1e5138240eb1613f48c8f # v1.266.0
+        uses: ruby/setup-ruby@d5126b9b3579e429dd52e51e68624dda2e05be25 # v1.267.0
         with:
           bundler-cache: false
@@ -32,4 +32,4 @@ jobs:
         run: |
           bundle install --jobs 4 --retry 3
-      - uses: rubygems/release-gem@a25424ba2ba8b387abc8ef40807c2c85b96cbe32 # v1.1.1
+      - uses: rubygems/release-gem@1c162a739e8b4cb21a676e97b087e8268d8fc40b # v1.1.2

data/.gitignore CHANGED Viewed

@@ -10,6 +10,8 @@
 /test/version_tmp/
 /tmp/
 mise.toml
+.DS_Store
+.vscode/launch.json
 # Used by dotenv library to load environment variables.
 .env
@@ -64,3 +66,9 @@ llms.txt
 # Config files that might contain sensitive data
 llms-txt.yml
 .llms-txt.yml
+# AI coding agent
+AGENTS.md
+CLAUDE.md
+GEMINI.md

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,12 @@
 # Changelog
+## 0.12.0 (2025-11-12)
+- [Feature] **HTML to Markdown Reverse Converter** — Added support for converting HTML content to markdown format.
+  - Enables processing of HTML documentation sources
+  - Integrates seamlessly with the transformer pipeline
+  - Useful for converting web-based docs to markdown for further processing
+  - By @Eric-Guo in PR #32.
 ## 0.11.0 (2025-11-03)
 - [Feature] **Transform from URL** — The `transform` command now accepts a remote URL via `--url` and processes fetched content through the standard transformer pipeline.
   - Example: `llm-docs-builder transform --url https://example.com/docs/page.html`

data/Gemfile CHANGED Viewed

@@ -7,4 +7,8 @@ gemspec
 group :development do
   gem 'pry'
   gem 'pry-byebug'
+  gem 'yard-lint'
+end
+group :test do
 end

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,8 @@
 PATH
   remote: .
   specs:
-    llm-docs-builder (0.11.0)
+    llm-docs-builder (0.12.0)
+      nokogiri (~> 1.17)
       zeitwerk (~> 2.6)
 GEM
@@ -10,16 +11,28 @@ GEM
     ast (2.4.3)
     byebug (12.0.0)
     coderay (1.1.3)
+    date (3.5.0)
     diff-lcs (1.6.2)
     docile (1.4.1)
-    json (2.15.2)
+    erb (5.1.3)
+    io-console (0.8.1)
+    irb (1.15.3)
+      pp (>= 0.6.0)
+      rdoc (>= 4.0.0)
+      reline (>= 0.4.2)
+    json (2.16.0)
     language_server-protocol (3.17.0.5)
     lint_roller (1.1.0)
     method_source (1.1.0)
+    nokogiri (1.18.10-x86_64-linux-gnu)
+      racc (~> 1.4)
     parallel (1.27.0)
     parser (3.3.10.0)
       ast (~> 2.4.1)
       racc
+    pp (0.6.3)
+      prettyprint
+    prettyprint (0.2.0)
     prism (1.6.0)
     pry (0.15.2)
       coderay (~> 1.1)
@@ -27,10 +40,19 @@ GEM
     pry-byebug (3.11.0)
       byebug (~> 12.0)
       pry (>= 0.13, < 0.16)
+    psych (5.2.6)
+      date
+      stringio
     racc (1.8.1)
     rainbow (3.1.1)
     rake (13.3.1)
+    rdoc (6.15.1)
+      erb
+      psych (>= 4.0.0)
+      tsort
     regexp_parser (2.11.3)
+    reline (0.6.3)
+      io-console (~> 0.5)
     rspec (3.13.2)
       rspec-core (~> 3.13.0)
       rspec-expectations (~> 3.13.0)
@@ -40,11 +62,11 @@ GEM
     rspec-expectations (3.13.5)
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.13.0)
-    rspec-mocks (3.13.6)
+    rspec-mocks (3.13.7)
       diff-lcs (>= 1.2.0, < 2.0)
       rspec-support (~> 3.13.0)
     rspec-support (3.13.6)
-    rubocop (1.81.6)
+    rubocop (1.81.7)
       json (~> 2.3)
       language_server-protocol (~> 3.17.0.2)
       lint_roller (~> 1.1.0)
@@ -65,13 +87,19 @@ GEM
       simplecov_json_formatter (~> 0.1)
     simplecov-html (0.13.2)
     simplecov_json_formatter (0.1.4)
+    stringio (3.1.8)
+    tsort (0.2.0)
     unicode-display_width (3.2.0)
       unicode-emoji (~> 4.1)
     unicode-emoji (4.1.0)
+    yard (0.9.37)
+    yard-lint (1.1.0)
+      irb
+      yard (~> 0.9)
+      zeitwerk (~> 2.6)
     zeitwerk (2.7.3)
 PLATFORMS
-  ruby
   x86_64-linux
 DEPENDENCIES
@@ -83,6 +111,7 @@ DEPENDENCIES
   rspec (~> 3.0)
   rubocop (~> 1.0)
   simplecov (~> 0.21)
+  yard-lint
 BUNDLED WITH
    2.7.2

data/README.md CHANGED Viewed

@@ -68,6 +68,8 @@ llm-docs-builder transform --url https://yoursite.com/docs/page.html
 llm-docs-builder bulk-transform --config llm-docs-builder.yml
 ```
+**HTML to Markdown Conversion:** The transformer automatically detects and converts HTML content to clean markdown format. This works seamlessly with both local files and remote URLs, converting HTML tables, code blocks, and other elements into their markdown equivalents.
 ## Installation
 ### Docker (Recommended)
@@ -85,6 +87,20 @@ gem install llm-docs-builder
 ## Features
+### Automatic HTML to Markdown Conversion
+The tool automatically detects and converts HTML content to clean markdown:
+- **HTML Tables** → Markdown tables
+- **HTML Code Blocks** → Fenced code blocks
+- **Figures & Captions** → Clean markdown equivalents
+- **Seamless Integration** - Works with local files and remote URLs without special configuration
+```bash
+# Transform HTML content automatically
+llm-docs-builder transform --docs page-with-html.md
+llm-docs-builder transform --url https://site.com/docs/api.html
+```
 ### Measure and Compare
 ```bash

data/lib/llm_docs_builder/config.rb CHANGED Viewed

@@ -53,6 +53,39 @@ module LlmDocsBuilder
     # defaults for any options not specified via CLI.
     #
     # @param options [Hash] CLI options hash
+    # @option options [String] :docs path to documentation directory or file
+    # @option options [String] :base_url base URL for expanding relative links
+    # @option options [String] :title project title
+    # @option options [String] :description project description
+    # @option options [String] :body additional body content
+    # @option options [String] :output output file path
+    # @option options [Boolean] :convert_urls convert HTML URLs to markdown format
+    # @option options [Boolean] :remove_comments remove HTML comments
+    # @option options [Boolean] :normalize_whitespace normalize whitespace
+    # @option options [Boolean] :remove_badges remove badge images
+    # @option options [Boolean] :remove_frontmatter remove YAML/TOML frontmatter
+    # @option options [Boolean] :verbose enable verbose output
+    # @option options [String] :suffix suffix for transformed files
+    # @option options [Array<String>] :excludes glob patterns for files to exclude
+    # @option options [Boolean] :bulk enable bulk transformation mode
+    # @option options [Boolean] :include_hidden include hidden files
+    # @option options [Boolean] :remove_code_examples remove code blocks
+    # @option options [Boolean] :remove_images remove image syntax
+    # @option options [Boolean] :simplify_links simplify link text
+    # @option options [Boolean] :remove_blockquotes remove blockquote formatting
+    # @option options [Boolean] :generate_toc generate table of contents
+    # @option options [String] :custom_instruction custom instruction text
+    # @option options [Boolean] :remove_stopwords remove common stopwords
+    # @option options [Boolean] :remove_duplicates remove duplicate paragraphs
+    # @option options [Boolean] :normalize_headings normalize heading hierarchy
+    # @option options [String] :heading_separator separator for heading paths
+    # @option options [Boolean] :include_metadata include metadata in output
+    # @option options [Boolean] :include_tokens include token counts
+    # @option options [Boolean] :include_timestamps include timestamps
+    # @option options [Boolean] :include_priority include priority metadata
+    # @option options [Boolean] :calculate_compression calculate compression ratios
+    # @option options [String] :content raw markdown content
+    # @option options [String] :source_url source URL for content
     # @return [Hash] merged configuration with CLI overrides applied
     def merge_with_options(options)
       # CLI options override config file, config file provides defaults

data/lib/llm_docs_builder/helpers/prune_trailing_unsafe_link_separator.rb ADDED Viewed

@@ -0,0 +1,31 @@
+# frozen_string_literal: true
+module LlmDocsBuilder
+  # Helper methods for content transformation
+  #
+  # @api private
+  module Helpers
+    # Removes trailing pipe separators and whitespace from the end of an array of string parts, in place, dropping trailing parts that end up empty or blank
+    #
+    # @param parts [Array<String>] array of string parts to process
+    # @return [void]
+    def prune_trailing_unsafe_link_separator!(parts)
+      while parts.any?
+        last = parts.last
+        new_last = last.sub(/[ \t]*\|\s*\z/, '')
+        if new_last != last
+          trimmed = new_last.rstrip
+          parts[-1] = trimmed
+          parts.pop if trimmed.empty?
+        elsif last.strip.empty?
+          parts.pop
+        else
+          break
+        end
+      end
+    end
+    module_function :prune_trailing_unsafe_link_separator!
+  end
+end

data/lib/llm_docs_builder/helpers/squeeze_blank_lines_outside_fences.rb ADDED Viewed

@@ -0,0 +1,71 @@
+# frozen_string_literal: true
+module LlmDocsBuilder
+  module Helpers
+    # Reduces consecutive blank lines outside of code fences
+    #
+    # @param text [String] input text to process
+    # @param max_blank [Integer] maximum number of consecutive blank lines to allow
+    # @param fence_chars [Array<String>] characters that can be used for code fences
+    # @param min_fence [Integer] minimum length of fence character sequence
+    # @return [String] processed text with squeezed blank lines
+    def squeeze_blank_lines_outside_fences(text, max_blank: 2, fence_chars: %w[` ~], min_fence: 3)
+      return '' if text.to_s.empty?
+      lines = text.split("\n", -1)
+      inside_fence = false
+      fence_indent = ''.dup
+      fence_char   = nil
+      fence_len    = 0
+      # Build a fast “does this look like an opening fence?” regex
+      # e.g., leading spaces + ``` or ~~~ (length >= min_fence) + optional info string
+      fence_set = Regexp.escape(fence_chars.join)
+      open_re   = /\A(\s*)([#{fence_set}])\2{#{min_fence - 1},}.*\z/
+      out = []
+      blank_streak = 0
+      lines.each_with_index do |line, _idx|
+        if inside_fence
+          out << line
+          # Closing fence must match indent, char, and fence length
+          if line.match?(/\A#{Regexp.escape(fence_indent)}#{Regexp.escape(fence_char * fence_len)}\s*\z/)
+            inside_fence = false
+            fence_indent = ''.dup
+            fence_char   = nil
+            fence_len    = 0
+          end
+          next
+        end
+        if (m = line.match(open_re))
+          # Enter fenced block; compute the *actual* fence length from the line
+          fence_indent = m[1]
+          fence_char   = m[2]
+          after_indent = line[fence_indent.length..]
+          fence_len    = after_indent[/\A#{Regexp.escape(fence_char)}+/].length
+          inside_fence = true
+          blank_streak = 0
+          out << line
+          next
+        end
+        # Outside fences: squeeze blank lines
+        if line.strip.empty?
+          blank_streak += 1
+          # Keep at most max_blank blank lines; skip extras
+          out << line if blank_streak <= max_blank
+        else
+          blank_streak = 0
+          out << line
+        end
+      end
+      out.join("\n")
+    end
+    module_function :squeeze_blank_lines_outside_fences
+  end
+end

data/lib/llm_docs_builder/helpers.rb ADDED Viewed

@@ -0,0 +1,9 @@
+# frozen_string_literal: true
+module LlmDocsBuilder
+  module Helpers
+  end
+end
+require_relative 'helpers/squeeze_blank_lines_outside_fences'
+require_relative 'helpers/prune_trailing_unsafe_link_separator'

data/lib/llm_docs_builder/html_detector.rb ADDED Viewed

@@ -0,0 +1,159 @@
+# frozen_string_literal: true
+module LlmDocsBuilder
+  # Detects whether input should be treated as HTML and related snippet checks
+  class HtmlDetector
+    # Detect if loaded content is HTML instead of markdown
+    #
+    # @param content [String] raw content
+    # @param snippet [String, nil] optional precomputed snippet
+    # @return [Boolean]
+    def html_content?(content, snippet = detection_snippet(content))
+      return false unless html_content_snippet?(snippet)
+      full_html_document?(content)
+    end
+    # Prepare a snippet of content for HTML detection by stripping leading whitespace
+    # and any leading HTML comments (e.g. build metadata), then keeping the first 500 characters.
+    #
+    # @param content [String]
+    # @return [String, nil]
+    def detection_snippet(content)
+      return unless content
+      snippet = content.lstrip
+      return unless snippet
+      comment_prefix = /\A<!--.*?-->\s*/m
+      # Remote docs often include build metadata comments; skip them before tag detection.
+      return '' if snippet.empty? while snippet.sub!(comment_prefix, '')
+      snippet.lstrip[0, 500]
+    end
+    # Determine whether a snippet should be treated as HTML.
+    #
+    # @param snippet [String, nil]
+    # @return [Boolean]
+    def html_content_snippet?(snippet)
+      return false unless snippet && !snippet.empty?
+      return false if markdown_heading_snippet?(snippet)
+      html_candidate_snippet?(snippet)
+    end
+    # Determine whether a snippet appears to start with HTML markup.
+    #
+    # @param snippet [String]
+    # @return [Boolean]
+    def html_candidate_snippet?(snippet)
+      snippet.match?(/\A<\s*(?:!DOCTYPE\s+html|html\b|body\b|head\b|article\b|section\b|main\b|p\b|div\b|table\b|thead\b|tbody\b|tr\b|td\b|th\b|meta\b|link\b|h[1-6]\b|ul\b|ol\b|li\b|blockquote\b)/i)
+    end
+    # Check if the full document should be treated as HTML by parsing it and
+    # ensuring we do not observe unwrapped markdown constructs like plain text or lists.
+    #
+    # @param content [String]
+    # @return [Boolean]
+    def full_html_document?(content)
+      document = Nokogiri::HTML::Document.parse(content)
+      body = document.at('body')
+      return false unless body
+      return false if document.xpath('/text()').any? { |node| meaningful_text?(node.text) }
+      body.xpath('./text()').each do |node|
+        text = node.text
+        next unless meaningful_text?(text)
+        return false unless allow_inline_body_text?(content, text)
+      end
+      true
+    rescue Nokogiri::XML::SyntaxError
+      false
+    end
+    # Checks if text contains meaningful non-whitespace content
+    #
+    # @param text [String, nil]
+    # @return [Boolean] true if text contains non-whitespace characters
+    def meaningful_text?(text)
+      return false if text.nil?
+      stripped = text.strip
+      stripped.match?(/\S/)
+    end
+    # Checks if text looks like markdown syntax
+    #
+    # @param text [String, nil]
+    # @return [Boolean] true if text contains markdown-like patterns
+    def markdown_like_text?(text)
+      return false if text.nil?
+      return true if markdown_heading_snippet?(text)
+      text.each_line do |line|
+        trimmed = line.lstrip
+        next if trimmed.empty?
+        next if trimmed.start_with?('<')
+        return true if trimmed.match?(/\A[*+-]\s+\S/)
+        return true if trimmed.match?(/\A\d+\.\s+\S/)
+        return true if trimmed.match?(/\A>\s+\S/)
+        return true if trimmed.start_with?('```', '~~~')
+        return true if trimmed.strip.match?(/\A(?:-{3,}|_{3,}|={3,})\z/)
+      end
+      false
+    end
+    # Determines if inline body text should be allowed in HTML context
+    #
+    # @param content [String] full content being processed
+    # @param text [String] specific text to check
+    # @return [Boolean] true if inline body text is acceptable
+    def allow_inline_body_text?(content, text)
+      return false if markdown_like_text?(text)
+      html_with_body_wrapper?(content)
+    end
+    # Checks if content has HTML document structure wrapper tags
+    #
+    # @param content [String] content to check for HTML wrapper tags
+    # @return [Boolean] true if content contains DOCTYPE, html, or body tags
+    def html_with_body_wrapper?(content)
+      content.match?(/<\s*!DOCTYPE\s+html/i) ||
+        content.match?(/<\s*html\b/i) ||
+        content.match?(/<\s*body\b/i)
+    end
+    # Detect whether the snippet represents a table fragment we should preserve.
+    #
+    # @param snippet [String, nil]
+    # @return [Boolean]
+    def table_fragment?(snippet)
+      return false unless snippet && !snippet.empty?
+      snippet.match?(/\A<\s*(?:table|thead|tbody|tr|td|th)\b/i)
+    end
+    # Detect common markdown heading syntax within the snippet.
+    #
+    # @param snippet [String]
+    # @return [Boolean]
+    def markdown_heading_snippet?(snippet)
+      snippet.each_line do |line|
+        trimmed = line.lstrip
+        next if trimmed.empty?
+        next if trimmed.start_with?('<')
+        return true if trimmed.match?(/\A#+\s+/)
+      end
+      false
+    end
+  end
+end