llm-docs-builder 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: deeae74a329018b4a43d7845a3be8b7c31347699c3ff7abd93d7b697a48982a3
4
- data.tar.gz: f9c842caa93a45b4d75c45a15e116e6f98d8463e5268e2de32e498c725877e4f
3
+ metadata.gz: ac257dad79f49ed6993f784f8a28ee1e996e735fef4581449ad521ea9414a5d4
4
+ data.tar.gz: 29e1d2d578d57ea6f17aafca070c61b6161b6313d6614f0e4f798933ceae082d
5
5
  SHA512:
6
- metadata.gz: 94575eced147bd6740b5395acd41d3f46ffcadf40908831df081d5e03f56b35a2e1e9acfdfc7642af775b2aa86fe48ea322dd11baf48512d2f2ef43a1a491079
7
- data.tar.gz: 52b7d40d4a95acd20a408f4d453f32c7154637ce42ec210cf67010ce10dbe14ad711c4cbfd4060d8ce5018b91668315d552732ea7118374e69228a869792ff0f
6
+ metadata.gz: f82216cca621e942c0e6ad3d92aba5d099159cc9c0d10c1d010a85e2a740511103cebd0198c0056195775064853e749472dcb7f0939b8d3fda7753d291a5b0da
7
+ data.tar.gz: 31aa5737e215439b11a2e79d793dabb9ff342206b660a2ecd846920bc2f6501c3d5910da4cdc52ecfcfa9f7b9acef14213b17936edd23d86808c0bcb2f391952
@@ -54,6 +54,18 @@ jobs:
54
54
  GITHUB_COVERAGE: ${{ matrix.coverage }}
55
55
  run: bin/rspecs
56
56
 
57
+ yard-lint:
58
+ timeout-minutes: 5
59
+ runs-on: ubuntu-latest
60
+ steps:
61
+ - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5
62
+ - name: Set up Ruby
63
+ uses: ruby/setup-ruby@v1
64
+ with:
65
+ ruby-version: '3.4.7'
66
+ bundler-cache: true
67
+ - name: Run yard-lint
68
+ run: bundle exec yard-lint lib/
57
69
 
58
70
  ci-success:
59
71
  name: CI Success
@@ -61,6 +73,7 @@ jobs:
61
73
  if: always()
62
74
  needs:
63
75
  - specs
76
+ - yard-lint
64
77
  steps:
65
78
  - name: Check all jobs passed
66
79
  if: |
@@ -31,7 +31,7 @@ jobs:
31
31
 
32
32
  - name: Docker meta
33
33
  id: meta
34
- uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f # v5
34
+ uses: docker/metadata-action@318604b99e75e41977312d83839a89be02ca4893 # v5
35
35
  with:
36
36
  images: |
37
37
  mensfeld/llm-docs-builder
@@ -45,7 +45,7 @@ jobs:
45
45
  type=raw,value=latest,enable={{is_default_branch}}
46
46
 
47
47
  - name: Set up QEMU
48
- uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3
48
+ uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3
49
49
 
50
50
  - name: Set up Docker Buildx
51
51
  uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3
@@ -24,7 +24,7 @@ jobs:
24
24
  fetch-depth: 0
25
25
 
26
26
  - name: Set up Ruby
27
- uses: ruby/setup-ruby@4ff6f3611a42bc75eee1e5138240eb1613f48c8f # v1.266.0
27
+ uses: ruby/setup-ruby@d5126b9b3579e429dd52e51e68624dda2e05be25 # v1.267.0
28
28
  with:
29
29
  bundler-cache: false
30
30
 
@@ -32,4 +32,4 @@ jobs:
32
32
  run: |
33
33
  bundle install --jobs 4 --retry 3
34
34
 
35
- - uses: rubygems/release-gem@a25424ba2ba8b387abc8ef40807c2c85b96cbe32 # v1.1.1
35
+ - uses: rubygems/release-gem@1c162a739e8b4cb21a676e97b087e8268d8fc40b # v1.1.2
data/.gitignore CHANGED
@@ -10,6 +10,8 @@
10
10
  /test/version_tmp/
11
11
  /tmp/
12
12
  mise.toml
13
+ .DS_Store
14
+ .vscode/launch.json
13
15
 
14
16
  # Used by dotenv library to load environment variables.
15
17
  .env
@@ -64,3 +66,9 @@ llms.txt
64
66
  # Config files that might contain sensitive data
65
67
  llms-txt.yml
66
68
  .llms-txt.yml
69
+
70
+ # AI coding agent
71
+ AGENTS.md
72
+ CLAUDE.md
73
+ GEMINI.md
74
+
data/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.12.0 (2025-11-12)
4
+ - [Feature] **HTML to Markdown Reverse Converter** — Added support for converting HTML content to markdown format.
5
+ - Enables processing of HTML documentation sources
6
+ - Integrates seamlessly with the transformer pipeline
7
+ - Useful for converting web-based docs to markdown for further processing
8
+ - By @Eric-Guo in PR #32.
9
+
3
10
  ## 0.11.0 (2025-11-03)
4
11
  - [Feature] **Transform from URL** — The `transform` command now accepts a remote URL via `--url` and processes fetched content through the standard transformer pipeline.
5
12
  - Example: `llm-docs-builder transform --url https://example.com/docs/page.html`
data/Gemfile CHANGED
@@ -7,4 +7,8 @@ gemspec
7
7
  group :development do
8
8
  gem 'pry'
9
9
  gem 'pry-byebug'
10
+ gem 'yard-lint'
11
+ end
12
+
13
+ group :test do
10
14
  end
data/Gemfile.lock CHANGED
@@ -1,7 +1,8 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- llm-docs-builder (0.11.0)
4
+ llm-docs-builder (0.12.0)
5
+ nokogiri (~> 1.17)
5
6
  zeitwerk (~> 2.6)
6
7
 
7
8
  GEM
@@ -10,16 +11,28 @@ GEM
10
11
  ast (2.4.3)
11
12
  byebug (12.0.0)
12
13
  coderay (1.1.3)
14
+ date (3.5.0)
13
15
  diff-lcs (1.6.2)
14
16
  docile (1.4.1)
15
- json (2.15.2)
17
+ erb (5.1.3)
18
+ io-console (0.8.1)
19
+ irb (1.15.3)
20
+ pp (>= 0.6.0)
21
+ rdoc (>= 4.0.0)
22
+ reline (>= 0.4.2)
23
+ json (2.16.0)
16
24
  language_server-protocol (3.17.0.5)
17
25
  lint_roller (1.1.0)
18
26
  method_source (1.1.0)
27
+ nokogiri (1.18.10-x86_64-linux-gnu)
28
+ racc (~> 1.4)
19
29
  parallel (1.27.0)
20
30
  parser (3.3.10.0)
21
31
  ast (~> 2.4.1)
22
32
  racc
33
+ pp (0.6.3)
34
+ prettyprint
35
+ prettyprint (0.2.0)
23
36
  prism (1.6.0)
24
37
  pry (0.15.2)
25
38
  coderay (~> 1.1)
@@ -27,10 +40,19 @@ GEM
27
40
  pry-byebug (3.11.0)
28
41
  byebug (~> 12.0)
29
42
  pry (>= 0.13, < 0.16)
43
+ psych (5.2.6)
44
+ date
45
+ stringio
30
46
  racc (1.8.1)
31
47
  rainbow (3.1.1)
32
48
  rake (13.3.1)
49
+ rdoc (6.15.1)
50
+ erb
51
+ psych (>= 4.0.0)
52
+ tsort
33
53
  regexp_parser (2.11.3)
54
+ reline (0.6.3)
55
+ io-console (~> 0.5)
34
56
  rspec (3.13.2)
35
57
  rspec-core (~> 3.13.0)
36
58
  rspec-expectations (~> 3.13.0)
@@ -40,11 +62,11 @@ GEM
40
62
  rspec-expectations (3.13.5)
41
63
  diff-lcs (>= 1.2.0, < 2.0)
42
64
  rspec-support (~> 3.13.0)
43
- rspec-mocks (3.13.6)
65
+ rspec-mocks (3.13.7)
44
66
  diff-lcs (>= 1.2.0, < 2.0)
45
67
  rspec-support (~> 3.13.0)
46
68
  rspec-support (3.13.6)
47
- rubocop (1.81.6)
69
+ rubocop (1.81.7)
48
70
  json (~> 2.3)
49
71
  language_server-protocol (~> 3.17.0.2)
50
72
  lint_roller (~> 1.1.0)
@@ -65,13 +87,19 @@ GEM
65
87
  simplecov_json_formatter (~> 0.1)
66
88
  simplecov-html (0.13.2)
67
89
  simplecov_json_formatter (0.1.4)
90
+ stringio (3.1.8)
91
+ tsort (0.2.0)
68
92
  unicode-display_width (3.2.0)
69
93
  unicode-emoji (~> 4.1)
70
94
  unicode-emoji (4.1.0)
95
+ yard (0.9.37)
96
+ yard-lint (1.1.0)
97
+ irb
98
+ yard (~> 0.9)
99
+ zeitwerk (~> 2.6)
71
100
  zeitwerk (2.7.3)
72
101
 
73
102
  PLATFORMS
74
- ruby
75
103
  x86_64-linux
76
104
 
77
105
  DEPENDENCIES
@@ -83,6 +111,7 @@ DEPENDENCIES
83
111
  rspec (~> 3.0)
84
112
  rubocop (~> 1.0)
85
113
  simplecov (~> 0.21)
114
+ yard-lint
86
115
 
87
116
  BUNDLED WITH
88
117
  2.7.2
data/README.md CHANGED
@@ -68,6 +68,8 @@ llm-docs-builder transform --url https://yoursite.com/docs/page.html
68
68
  llm-docs-builder bulk-transform --config llm-docs-builder.yml
69
69
  ```
70
70
 
71
+ **HTML to Markdown Conversion:** The transformer automatically detects and converts HTML content to clean markdown format. This works seamlessly with both local files and remote URLs, converting HTML tables, code blocks, and other elements into their markdown equivalents.
72
+
71
73
  ## Installation
72
74
 
73
75
  ### Docker (Recommended)
@@ -85,6 +87,20 @@ gem install llm-docs-builder
85
87
 
86
88
  ## Features
87
89
 
90
+ ### Automatic HTML to Markdown Conversion
91
+
92
+ The tool automatically detects and converts HTML content to clean markdown:
93
+ - **HTML Tables** → Markdown tables
94
+ - **HTML Code Blocks** → Fenced code blocks
95
+ - **Figures & Captions** → Clean markdown equivalents
96
+ - **Seamless Integration** - Works with local files and remote URLs without special configuration
97
+
98
+ ```bash
99
+ # Transform HTML content automatically
100
+ llm-docs-builder transform --docs page-with-html.md
101
+ llm-docs-builder transform --url https://site.com/docs/api.html
102
+ ```
103
+
88
104
  ### Measure and Compare
89
105
 
90
106
  ```bash
@@ -53,6 +53,39 @@ module LlmDocsBuilder
53
53
  # defaults for any options not specified via CLI.
54
54
  #
55
55
  # @param options [Hash] CLI options hash
56
+ # @option options [String] :docs path to documentation directory or file
57
+ # @option options [String] :base_url base URL for expanding relative links
58
+ # @option options [String] :title project title
59
+ # @option options [String] :description project description
60
+ # @option options [String] :body additional body content
61
+ # @option options [String] :output output file path
62
+ # @option options [Boolean] :convert_urls convert HTML URLs to markdown format
63
+ # @option options [Boolean] :remove_comments remove HTML comments
64
+ # @option options [Boolean] :normalize_whitespace normalize whitespace
65
+ # @option options [Boolean] :remove_badges remove badge images
66
+ # @option options [Boolean] :remove_frontmatter remove YAML/TOML frontmatter
67
+ # @option options [Boolean] :verbose enable verbose output
68
+ # @option options [String] :suffix suffix for transformed files
69
+ # @option options [Array<String>] :excludes glob patterns for files to exclude
70
+ # @option options [Boolean] :bulk enable bulk transformation mode
71
+ # @option options [Boolean] :include_hidden include hidden files
72
+ # @option options [Boolean] :remove_code_examples remove code blocks
73
+ # @option options [Boolean] :remove_images remove image syntax
74
+ # @option options [Boolean] :simplify_links simplify link text
75
+ # @option options [Boolean] :remove_blockquotes remove blockquote formatting
76
+ # @option options [Boolean] :generate_toc generate table of contents
77
+ # @option options [String] :custom_instruction custom instruction text
78
+ # @option options [Boolean] :remove_stopwords remove common stopwords
79
+ # @option options [Boolean] :remove_duplicates remove duplicate paragraphs
80
+ # @option options [Boolean] :normalize_headings normalize heading hierarchy
81
+ # @option options [String] :heading_separator separator for heading paths
82
+ # @option options [Boolean] :include_metadata include metadata in output
83
+ # @option options [Boolean] :include_tokens include token counts
84
+ # @option options [Boolean] :include_timestamps include timestamps
85
+ # @option options [Boolean] :include_priority include priority metadata
86
+ # @option options [Boolean] :calculate_compression calculate compression ratios
87
+ # @option options [String] :content raw markdown content
88
+ # @option options [String] :source_url source URL for content
56
89
  # @return [Hash] merged configuration with CLI overrides applied
57
90
  def merge_with_options(options)
58
91
  # CLI options override config file, config file provides defaults
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  # Helper methods for content transformation
  #
  # @api private
  module Helpers
    # Removes trailing pipe characters and whitespace from an array of string parts.
    #
    # Works from the end of the array backwards and mutates +parts+ in place:
    # blank (or nil) trailing entries are dropped, and a trailing +|+ together
    # with the whitespace around it is cut off the last entry. Processing stops
    # at the first entry that ends in real content.
    #
    # @param parts [Array<String>] array of string parts to process (modified in place)
    # @return [void]
    def prune_trailing_unsafe_link_separator!(parts)
      # `empty?` rather than `any?`: `any?` tests element truthiness, so an array
      # holding only nil would look "empty" while a mixed one would crash below.
      until parts.empty?
        last = parts.last

        # Blank or missing trailing entries carry no content; drop them.
        if last.nil? || last.strip.empty?
          parts.pop
          next
        end

        without_pipe = last.sub(/[ \t]*\|\s*\z/, '')
        # No trailing pipe left: the last entry ends in real content, so stop.
        break if without_pipe == last

        # Store the shortened entry; if it became blank the next pass pops it,
        # and if it still ends in a pipe the next pass strips that one too.
        parts[-1] = without_pipe.rstrip
      end
    end

    module_function :prune_trailing_unsafe_link_separator!
  end
end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  module Helpers
    # Reduces consecutive blank lines outside of code fences.
    #
    # Lines inside a fenced code block are copied through untouched. Outside of
    # fences, every run of blank (whitespace-only) lines is capped at
    # +max_blank+ lines.
    #
    # Fence handling follows the CommonMark rules that matter here:
    # - an opening fence is at least +min_fence+ identical fence characters,
    #   optionally indented and optionally followed by an info string;
    # - a backtick fence whose info string contains a backtick is not a fence
    #   (it is inline code such as "```x``` text");
    # - a closing fence uses the same character, the same indentation as the
    #   opener, and is at least as long as the opener.
    #
    # @param text [String, nil] input text to process
    # @param max_blank [Integer] maximum number of consecutive blank lines to allow
    # @param fence_chars [Array<String>] characters that can be used for code fences
    # @param min_fence [Integer] minimum length of fence character sequence
    # @return [String] processed text with squeezed blank lines ('' for nil/empty input)
    def squeeze_blank_lines_outside_fences(text, max_blank: 2, fence_chars: %w[` ~], min_fence: 3)
      return '' if text.to_s.empty?

      # Group 1: indent, group 2: the whole fence run, group 3: the fence char,
      # group 4: info string. The repeat count is clamped so a min_fence below 1
      # cannot produce an invalid quantifier.
      fence_set = Regexp.escape(fence_chars.join)
      extra_chars = [min_fence - 1, 0].max
      open_re = /\A(\s*)(([#{fence_set}])\3{#{extra_chars},})(.*)\z/

      close_re = nil # non-nil while inside a fenced block
      blank_streak = 0
      out = []

      text.split("\n", -1).each do |line|
        if close_re
          out << line
          close_re = nil if line.match?(close_re)
          next
        end

        opener = line.match(open_re)
        # Backtick fences may not carry backticks in their info string.
        opener = nil if opener && opener[3] == '`' && opener[4].include?('`')

        if opener
          # Built once per fence: same indent and char, at least the opener's length.
          close_re = /\A#{Regexp.escape(opener[1])}#{Regexp.escape(opener[3])}{#{opener[2].length},}\s*\z/
          blank_streak = 0
          out << line
          next
        end

        if line.strip.empty?
          blank_streak += 1
          # Keep at most max_blank blank lines; skip extras
          out << line if blank_streak <= max_blank
        else
          blank_streak = 0
          out << line
        end
      end

      out.join("\n")
    end

    module_function :squeeze_blank_lines_outside_fences
  end
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmDocsBuilder
4
+ module Helpers
5
+ end
6
+ end
7
+
8
+ require_relative 'helpers/squeeze_blank_lines_outside_fences'
9
+ require_relative 'helpers/prune_trailing_unsafe_link_separator'
@@ -0,0 +1,159 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  # Detects whether input should be treated as HTML and related snippet checks
  class HtmlDetector
    # Maximum number of characters kept in a detection snippet
    SNIPPET_LENGTH = 500

    # One or more HTML comments at the very start, each with its trailing whitespace
    LEADING_COMMENTS = /\A(?:<!--.*?-->\s*)+/m

    # Snippet starts with a tag that marks the content as an HTML candidate
    HTML_CANDIDATE = /
      \A<\s*
      (?:!DOCTYPE\s+html|html\b|body\b|head\b|article\b|section\b|main\b|p\b|div\b|
         table\b|thead\b|tbody\b|tr\b|td\b|th\b|meta\b|link\b|h[1-6]\b|ul\b|ol\b|li\b|blockquote\b)
    /ix

    # Snippet starts with a table-related tag
    TABLE_FRAGMENT = /\A<\s*(?:table|thead|tbody|tr|td|th)\b/i

    # Content contains a DOCTYPE, <html> or <body> tag anywhere
    BODY_WRAPPER = /<\s*(?:!DOCTYPE\s+html|html\b|body\b)/i

    # Line starts with an ATX-style markdown heading
    MARKDOWN_HEADING = /\A#+\s+/

    # Detect if loaded content is HTML instead of markdown
    #
    # The cheap snippet check runs first; the document is only parsed when the
    # snippet already looks like HTML.
    #
    # @param content [String, nil] raw content
    # @param snippet [String, nil] optional precomputed snippet
    # @return [Boolean]
    def html_content?(content, snippet = detection_snippet(content))
      html_content_snippet?(snippet) && full_html_document?(content)
    end

    # Prepare a snippet of content for HTML detection: leading whitespace and
    # any leading HTML comments (such as build metadata) are removed, and the
    # result is cut to the first SNIPPET_LENGTH characters.
    #
    # @param content [String, nil]
    # @return [String, nil] the snippet ('' when nothing is left), nil for nil content
    def detection_snippet(content)
      return unless content

      # Remote docs often include build metadata comments; skip them before tag detection.
      content.lstrip.sub(LEADING_COMMENTS, '').lstrip[0, SNIPPET_LENGTH]
    end

    # Determine whether a snippet should be treated as HTML.
    #
    # @param snippet [String, nil]
    # @return [Boolean] false for nil/empty snippets and for snippets containing
    #   a markdown heading line
    def html_content_snippet?(snippet)
      return false unless snippet && !snippet.empty?
      return false if markdown_heading_snippet?(snippet)

      html_candidate_snippet?(snippet)
    end

    # Determine whether a snippet appears to start with HTML markup.
    #
    # @param snippet [String]
    # @return [Boolean]
    def html_candidate_snippet?(snippet)
      snippet.match?(HTML_CANDIDATE)
    end

    # Check if the full document should be treated as HTML by parsing it and
    # ensuring we do not observe unwrapped markdown constructs like plain text or lists.
    #
    # @param content [String]
    # @return [Boolean] false when there is no body, when meaningful text sits
    #   directly under the document root, or when a direct body text node is
    #   not acceptable per {#allow_inline_body_text?}
    def full_html_document?(content)
      document = Nokogiri::HTML::Document.parse(content)
      body = document.at('body')

      return false unless body
      return false if document.xpath('/text()').any? { |node| meaningful_text?(node.text) }

      # Whitespace-only text nodes are ignored; every other direct text child
      # of <body> must be acceptable inline text.
      body.xpath('./text()').all? do |node|
        text = node.text
        !meaningful_text?(text) || allow_inline_body_text?(content, text)
      end
    rescue Nokogiri::XML::SyntaxError
      false
    end

    # Checks if text contains meaningful non-whitespace content
    #
    # @param text [String, nil]
    # @return [Boolean] true if text contains non-whitespace characters
    def meaningful_text?(text)
      return false if text.nil?

      !text.strip.empty?
    end

    # Checks if text looks like markdown syntax
    #
    # Lines that are empty or start with "<" are ignored. A heading, list item,
    # blockquote, code fence or horizontal-rule style line counts as markdown.
    #
    # @param text [String, nil]
    # @return [Boolean] true if text contains markdown-like patterns
    def markdown_like_text?(text)
      return false if text.nil?
      return true if markdown_heading_snippet?(text)

      text.each_line.any? do |line|
        trimmed = line.lstrip
        next false if trimmed.empty? || trimmed.start_with?('<')

        trimmed.match?(/\A[*+-]\s+\S/) ||
          trimmed.match?(/\A\d+\.\s+\S/) ||
          trimmed.match?(/\A>\s+\S/) ||
          trimmed.start_with?('```', '~~~') ||
          trimmed.strip.match?(/\A(?:-{3,}|_{3,}|={3,})\z/)
      end
    end

    # Determines if inline body text should be allowed in HTML context
    #
    # @param content [String] full content being processed
    # @param text [String] specific text to check
    # @return [Boolean] true if the text is not markdown-like and the content
    #   has HTML wrapper tags
    def allow_inline_body_text?(content, text)
      return false if markdown_like_text?(text)

      html_with_body_wrapper?(content)
    end

    # Checks if content has HTML document structure wrapper tags
    #
    # @param content [String] content to check for HTML wrapper tags
    # @return [Boolean] true if content contains DOCTYPE, html, or body tags
    def html_with_body_wrapper?(content)
      content.match?(BODY_WRAPPER)
    end

    # Detect whether the snippet represents a table fragment we should preserve.
    #
    # @param snippet [String, nil]
    # @return [Boolean]
    def table_fragment?(snippet)
      return false unless snippet && !snippet.empty?

      snippet.match?(TABLE_FRAGMENT)
    end

    # Detect common markdown heading syntax within the snippet.
    #
    # Lines starting with "<" (after leading whitespace) are skipped.
    #
    # @param snippet [String]
    # @return [Boolean]
    def markdown_heading_snippet?(snippet)
      snippet.each_line.any? do |line|
        trimmed = line.lstrip
        !trimmed.start_with?('<') && trimmed.match?(MARKDOWN_HEADING)
      end
    end
  end
end