llm-docs-builder 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e6407836216f436b728247009f614ea4ea5c2b4de0edf855717129460df4b309
4
- data.tar.gz: 8a556fc0b6307529f5c615c05b082521bd021e9e9f34ca30c8bb21d22b21deb2
3
+ metadata.gz: ac257dad79f49ed6993f784f8a28ee1e996e735fef4581449ad521ea9414a5d4
4
+ data.tar.gz: 29e1d2d578d57ea6f17aafca070c61b6161b6313d6614f0e4f798933ceae082d
5
5
  SHA512:
6
- metadata.gz: 3a0b657545415c35187fa1595f3fcc5bd5c27a0c1a292bf00635d3c6f14221ac0a254f008f83acc1f7351ef76c6649e6b0540ac585cd64a670b05ef725b0308e
7
- data.tar.gz: 68e95142e374ebae3c292db724a9163c396fd423eaa04983bbdaefd6f6cbb0bc72822dfbc5af9fd8f357f1545320db70bf5a6c8a2413fc5f796e73195adcdd13
6
+ metadata.gz: f82216cca621e942c0e6ad3d92aba5d099159cc9c0d10c1d010a85e2a740511103cebd0198c0056195775064853e749472dcb7f0939b8d3fda7753d291a5b0da
7
+ data.tar.gz: 31aa5737e215439b11a2e79d793dabb9ff342206b660a2ecd846920bc2f6501c3d5910da4cdc52ecfcfa9f7b9acef14213b17936edd23d86808c0bcb2f391952
@@ -54,6 +54,18 @@ jobs:
54
54
  GITHUB_COVERAGE: ${{ matrix.coverage }}
55
55
  run: bin/rspecs
56
56
 
57
+ yard-lint:
58
+ timeout-minutes: 5
59
+ runs-on: ubuntu-latest
60
+ steps:
61
+ - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5
62
+ - name: Set up Ruby
63
+ uses: ruby/setup-ruby@v1
64
+ with:
65
+ ruby-version: '3.4.7'
66
+ bundler-cache: true
67
+ - name: Run yard-lint
68
+ run: bundle exec yard-lint lib/
57
69
 
58
70
  ci-success:
59
71
  name: CI Success
@@ -61,6 +73,7 @@ jobs:
61
73
  if: always()
62
74
  needs:
63
75
  - specs
76
+ - yard-lint
64
77
  steps:
65
78
  - name: Check all jobs passed
66
79
  if: |
@@ -31,7 +31,7 @@ jobs:
31
31
 
32
32
  - name: Docker meta
33
33
  id: meta
34
- uses: docker/metadata-action@c1e51972afc2121e065aed6d45c65596fe445f3f # v5
34
+ uses: docker/metadata-action@318604b99e75e41977312d83839a89be02ca4893 # v5
35
35
  with:
36
36
  images: |
37
37
  mensfeld/llm-docs-builder
@@ -45,7 +45,7 @@ jobs:
45
45
  type=raw,value=latest,enable={{is_default_branch}}
46
46
 
47
47
  - name: Set up QEMU
48
- uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3
48
+ uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3
49
49
 
50
50
  - name: Set up Docker Buildx
51
51
  uses: docker/setup-buildx-action@e468171a9de216ec08956ac3ada2f0791b6bd435 # v3
@@ -24,7 +24,7 @@ jobs:
24
24
  fetch-depth: 0
25
25
 
26
26
  - name: Set up Ruby
27
- uses: ruby/setup-ruby@ab177d40ee5483edb974554986f56b33477e21d0 # v1.265.0
27
+ uses: ruby/setup-ruby@d5126b9b3579e429dd52e51e68624dda2e05be25 # v1.267.0
28
28
  with:
29
29
  bundler-cache: false
30
30
 
@@ -32,4 +32,4 @@ jobs:
32
32
  run: |
33
33
  bundle install --jobs 4 --retry 3
34
34
 
35
- - uses: rubygems/release-gem@a25424ba2ba8b387abc8ef40807c2c85b96cbe32 # v1.1.1
35
+ - uses: rubygems/release-gem@1c162a739e8b4cb21a676e97b087e8268d8fc40b # v1.1.2
data/.gitignore CHANGED
@@ -10,6 +10,8 @@
10
10
  /test/version_tmp/
11
11
  /tmp/
12
12
  mise.toml
13
+ .DS_Store
14
+ .vscode/launch.json
13
15
 
14
16
  # Used by dotenv library to load environment variables.
15
17
  .env
@@ -64,3 +66,9 @@ llms.txt
64
66
  # Config files that might contain sensitive data
65
67
  llms-txt.yml
66
68
  .llms-txt.yml
69
+
70
+ # AI coding agent
71
+ AGENTS.md
72
+ CLAUDE.md
73
+ GEMINI.md
74
+
data/CHANGELOG.md CHANGED
@@ -1,5 +1,18 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.12.0 (2025-11-12)
4
+ - [Feature] **HTML to Markdown Reverse Converter** — Added support for converting HTML content to markdown format.
5
+ - Enables processing of HTML documentation sources
6
+ - Integrates seamlessly with the transformer pipeline
7
+ - Useful for converting web-based docs to markdown for further processing
8
+ - By @Eric-Guo in PR #32.
9
+
10
+ ## 0.11.0 (2025-11-03)
11
+ - [Feature] **Transform from URL** — The `transform` command now accepts a remote URL via `--url` and processes fetched content through the standard transformer pipeline.
12
+ - Example: `llm-docs-builder transform --url https://example.com/docs/page.html`
13
+ - Applies all configured transformations and output options identically to local files
14
+ - By @Eric-Guo and @codex in PR #28.
15
+
3
16
  ## 0.10.0 (2025-10-27)
4
17
  - [Feature] **llms.txt Specification Compliance** - Updated output format to fully comply with the llms.txt specification from llmstxt.org.
5
18
  - **Metadata Format**: Metadata now appears within the description field using parentheses and comma separators: `- [title](url): description (tokens:450, updated:2025-10-13, priority:high)`
data/Gemfile CHANGED
@@ -7,4 +7,8 @@ gemspec
7
7
  group :development do
8
8
  gem 'pry'
9
9
  gem 'pry-byebug'
10
+ gem 'yard-lint'
11
+ end
12
+
13
+ group :test do
10
14
  end
data/Gemfile.lock CHANGED
@@ -1,7 +1,8 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- llm-docs-builder (0.10.0)
4
+ llm-docs-builder (0.12.0)
5
+ nokogiri (~> 1.17)
5
6
  zeitwerk (~> 2.6)
6
7
 
7
8
  GEM
@@ -10,41 +11,62 @@ GEM
10
11
  ast (2.4.3)
11
12
  byebug (12.0.0)
12
13
  coderay (1.1.3)
14
+ date (3.5.0)
13
15
  diff-lcs (1.6.2)
14
16
  docile (1.4.1)
15
- json (2.13.2)
17
+ erb (5.1.3)
18
+ io-console (0.8.1)
19
+ irb (1.15.3)
20
+ pp (>= 0.6.0)
21
+ rdoc (>= 4.0.0)
22
+ reline (>= 0.4.2)
23
+ json (2.16.0)
16
24
  language_server-protocol (3.17.0.5)
17
25
  lint_roller (1.1.0)
18
26
  method_source (1.1.0)
27
+ nokogiri (1.18.10-x86_64-linux-gnu)
28
+ racc (~> 1.4)
19
29
  parallel (1.27.0)
20
- parser (3.3.9.0)
30
+ parser (3.3.10.0)
21
31
  ast (~> 2.4.1)
22
32
  racc
23
- prism (1.4.0)
33
+ pp (0.6.3)
34
+ prettyprint
35
+ prettyprint (0.2.0)
36
+ prism (1.6.0)
24
37
  pry (0.15.2)
25
38
  coderay (~> 1.1)
26
39
  method_source (~> 1.0)
27
40
  pry-byebug (3.11.0)
28
41
  byebug (~> 12.0)
29
42
  pry (>= 0.13, < 0.16)
43
+ psych (5.2.6)
44
+ date
45
+ stringio
30
46
  racc (1.8.1)
31
47
  rainbow (3.1.1)
32
- rake (13.3.0)
33
- regexp_parser (2.11.2)
34
- rspec (3.13.1)
48
+ rake (13.3.1)
49
+ rdoc (6.15.1)
50
+ erb
51
+ psych (>= 4.0.0)
52
+ tsort
53
+ regexp_parser (2.11.3)
54
+ reline (0.6.3)
55
+ io-console (~> 0.5)
56
+ rspec (3.13.2)
35
57
  rspec-core (~> 3.13.0)
36
58
  rspec-expectations (~> 3.13.0)
37
59
  rspec-mocks (~> 3.13.0)
38
- rspec-core (3.13.5)
60
+ rspec-core (3.13.6)
39
61
  rspec-support (~> 3.13.0)
40
62
  rspec-expectations (3.13.5)
41
63
  diff-lcs (>= 1.2.0, < 2.0)
42
64
  rspec-support (~> 3.13.0)
43
- rspec-mocks (3.13.5)
65
+ rspec-mocks (3.13.7)
44
66
  diff-lcs (>= 1.2.0, < 2.0)
45
67
  rspec-support (~> 3.13.0)
46
- rspec-support (3.13.5)
47
- rubocop (1.80.0)
68
+ rspec-support (3.13.6)
69
+ rubocop (1.81.7)
48
70
  json (~> 2.3)
49
71
  language_server-protocol (~> 3.17.0.2)
50
72
  lint_roller (~> 1.1.0)
@@ -52,10 +74,10 @@ GEM
52
74
  parser (>= 3.3.0.2)
53
75
  rainbow (>= 2.2.2, < 4.0)
54
76
  regexp_parser (>= 2.9.3, < 3.0)
55
- rubocop-ast (>= 1.46.0, < 2.0)
77
+ rubocop-ast (>= 1.47.1, < 2.0)
56
78
  ruby-progressbar (~> 1.7)
57
79
  unicode-display_width (>= 2.4.0, < 4.0)
58
- rubocop-ast (1.46.0)
80
+ rubocop-ast (1.47.1)
59
81
  parser (>= 3.3.7.2)
60
82
  prism (~> 1.4)
61
83
  ruby-progressbar (1.13.0)
@@ -65,13 +87,19 @@ GEM
65
87
  simplecov_json_formatter (~> 0.1)
66
88
  simplecov-html (0.13.2)
67
89
  simplecov_json_formatter (0.1.4)
68
- unicode-display_width (3.1.5)
69
- unicode-emoji (~> 4.0, >= 4.0.4)
70
- unicode-emoji (4.0.4)
90
+ stringio (3.1.8)
91
+ tsort (0.2.0)
92
+ unicode-display_width (3.2.0)
93
+ unicode-emoji (~> 4.1)
94
+ unicode-emoji (4.1.0)
95
+ yard (0.9.37)
96
+ yard-lint (1.1.0)
97
+ irb
98
+ yard (~> 0.9)
99
+ zeitwerk (~> 2.6)
71
100
  zeitwerk (2.7.3)
72
101
 
73
102
  PLATFORMS
74
- ruby
75
103
  x86_64-linux
76
104
 
77
105
  DEPENDENCIES
@@ -83,6 +111,7 @@ DEPENDENCIES
83
111
  rspec (~> 3.0)
84
112
  rubocop (~> 1.0)
85
113
  simplecov (~> 0.21)
114
+ yard-lint
86
115
 
87
116
  BUNDLED WITH
88
- 2.7.1
117
+ 2.7.2
data/README.md CHANGED
@@ -61,10 +61,15 @@ Factor: 2.8x smaller
61
61
  # Single file
62
62
  llm-docs-builder transform --docs README.md
63
63
 
64
+ # Fetch and transform a remote page
65
+ llm-docs-builder transform --url https://yoursite.com/docs/page.html
66
+
64
67
  # Bulk transform with config
65
68
  llm-docs-builder bulk-transform --config llm-docs-builder.yml
66
69
  ```
67
70
 
71
+ **HTML to Markdown Conversion:** The transformer automatically detects and converts HTML content to clean markdown format. This works seamlessly with both local files and remote URLs, converting HTML tables, code blocks, and other elements into their markdown equivalents.
72
+
68
73
  ## Installation
69
74
 
70
75
  ### Docker (Recommended)
@@ -82,6 +87,20 @@ gem install llm-docs-builder
82
87
 
83
88
  ## Features
84
89
 
90
+ ### Automatic HTML to Markdown Conversion
91
+
92
+ The tool automatically detects and converts HTML content to clean markdown:
93
+ - **HTML Tables** → Markdown tables
94
+ - **HTML Code Blocks** → Fenced code blocks
95
+ - **Figures & Captions** → Clean markdown equivalents
96
+ - **Seamless Integration** - Works with local files and remote URLs without special configuration
97
+
98
+ ```bash
99
+ # Transform HTML content automatically
100
+ llm-docs-builder transform --docs page-with-html.md
101
+ llm-docs-builder transform --url https://site.com/docs/api.html
102
+ ```
103
+
85
104
  ### Measure and Compare
86
105
 
87
106
  ```bash
@@ -68,8 +68,9 @@ module LlmDocsBuilder
68
68
  # @param argv [Array<String>] command-line arguments
69
69
  # @return [Hash] parsed options including :command, :config, :docs, :output, :verbose
70
70
  def parse_options(argv)
71
+ command_token = argv.first
71
72
  options = {
72
- command: argv.first&.match?(/^[a-z-]+$/) ? argv.shift : nil
73
+ command: command_token&.match?(/\A[a-z](?:[a-z-]*[a-z])?\z/) ? argv.shift : nil
73
74
  }
74
75
 
75
76
  OptionParser.new do |opts|
@@ -100,7 +101,7 @@ module LlmDocsBuilder
100
101
  options[:output] = path
101
102
  end
102
103
 
103
- opts.on('-u', '--url URL', 'URL to fetch for comparison') do |url|
104
+ opts.on('-u', '--url URL', 'URL to fetch for transform or comparison') do |url|
104
105
  options[:url] = url
105
106
  end
106
107
 
@@ -185,21 +186,42 @@ module LlmDocsBuilder
185
186
  config = LlmDocsBuilder::Config.new(options[:config])
186
187
  merged_options = config.merge_with_options(options)
187
188
 
188
- file_path = merged_options[:docs]
189
+ url = options[:url]
190
+ cli_file_path = options[:docs]
191
+ config_file_path = config['docs']
192
+ file_path = url ? cli_file_path : (cli_file_path || config_file_path)
189
193
 
190
- unless file_path
191
- puts 'File path required for transform command (use -d/--docs)'
194
+ if url && cli_file_path
195
+ puts 'Cannot use both --docs and --url for transform command'
192
196
  exit 1
193
197
  end
194
198
 
195
- unless File.exist?(file_path)
196
- puts "File not found: #{file_path}"
197
- exit 1
199
+ unless file_path
200
+ unless url
201
+ puts 'File path required for transform command (use -d/--docs)'
202
+ exit 1
203
+ end
198
204
  end
199
205
 
200
- puts "Transforming #{file_path}..." if merged_options[:verbose]
206
+ content =
207
+ if url
208
+ puts "Fetching #{url}..." if merged_options[:verbose]
209
+ fetcher = LlmDocsBuilder::UrlFetcher.new(verbose: merged_options[:verbose])
210
+ remote_content = fetcher.fetch(url)
211
+ puts "Transforming content from #{url}..." if merged_options[:verbose]
212
+ transform_options = merged_options.merge(content: remote_content, docs: nil, source_url: url)
213
+ LlmDocsBuilder.transform_markdown(nil, transform_options)
214
+ else
215
+ unless File.exist?(file_path)
216
+ puts "File not found: #{file_path}"
217
+ exit 1
218
+ end
201
219
 
202
- content = LlmDocsBuilder.transform_markdown(file_path, merged_options)
220
+ puts "Transforming #{file_path}..." if merged_options[:verbose]
221
+
222
+ merged_options[:docs] = file_path
223
+ LlmDocsBuilder.transform_markdown(file_path, merged_options)
224
+ end
203
225
 
204
226
  if merged_options[:output] && merged_options[:output] != 'llms.txt'
205
227
  File.write(merged_options[:output], content)
@@ -1,8 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'net/http'
4
- require 'uri'
5
-
6
3
  module LlmDocsBuilder
7
4
  # Compares content sizes between human and AI versions
8
5
  #
@@ -30,7 +27,7 @@ module LlmDocsBuilder
30
27
  AI_USER_AGENT = 'Claude-Web/1.0 (Anthropic AI Assistant)'
31
28
 
32
29
  # Maximum number of redirects to follow before raising an error
33
- MAX_REDIRECTS = 10
30
+ MAX_REDIRECTS = UrlFetcher::MAX_REDIRECTS
34
31
 
35
32
  # @return [String] URL to compare
36
33
  attr_reader :url
@@ -133,78 +130,11 @@ module LlmDocsBuilder
133
130
  # @return [String] response body
134
131
  # @raise [Errors::GenerationError] if fetch fails or too many redirects
135
132
  def fetch_url(url_string, user_agent, redirect_count = 0)
136
- if redirect_count >= MAX_REDIRECTS
137
- raise(
138
- Errors::GenerationError,
139
- "Too many redirects (#{MAX_REDIRECTS}) when fetching #{url_string}"
140
- )
141
- end
142
-
143
- uri = validate_and_parse_url(url_string)
144
-
145
- http = Net::HTTP.new(uri.host, uri.port)
146
- http.use_ssl = uri.scheme == 'https'
147
- http.open_timeout = 10
148
- http.read_timeout = 30
149
-
150
- request = Net::HTTP::Get.new(uri.request_uri)
151
- request['User-Agent'] = user_agent
152
-
153
- response = http.request(request)
154
-
155
- case response
156
- when Net::HTTPSuccess
157
- response.body
158
- when Net::HTTPRedirection
159
- # Follow redirect with incremented counter
160
- redirect_url = response['location']
161
- puts " Redirecting to #{redirect_url}..." if options[:verbose] && redirect_count.positive?
162
- fetch_url(redirect_url, user_agent, redirect_count + 1)
163
- else
164
- raise(
165
- Errors::GenerationError,
166
- "Failed to fetch #{url_string}: #{response.code} #{response.message}"
167
- )
168
- end
169
- rescue Errors::GenerationError
170
- raise
171
- rescue StandardError => e
172
- raise(
173
- Errors::GenerationError,
174
- "Error fetching #{url_string}: #{e.message}"
175
- )
176
- end
177
-
178
- # Validates and parses URL to prevent malformed URLs
179
- #
180
- # @param url_string [String] URL to validate and parse
181
- # @return [URI::HTTP, URI::HTTPS] parsed URI
182
- # @raise [Errors::GenerationError] if URL is invalid or uses unsupported scheme
183
- def validate_and_parse_url(url_string)
184
- uri = URI.parse(url_string)
185
-
186
- # Only allow HTTP and HTTPS schemes
187
- unless %w[http https].include?(uri.scheme&.downcase)
188
- raise(
189
- Errors::GenerationError,
190
- "Unsupported URL scheme: #{uri.scheme || 'none'} (only http/https allowed)"
191
- )
192
- end
193
-
194
- # Ensure host is present
195
- if uri.host.nil? || uri.host.empty?
196
- raise(
197
- Errors::GenerationError,
198
- "Invalid URL: missing host in #{url_string}"
199
- )
200
- end
201
-
202
- uri
203
- rescue URI::InvalidURIError => e
204
- raise(
205
- Errors::GenerationError,
206
- "Invalid URL format: #{e.message}"
133
+ fetcher = UrlFetcher.new(
134
+ user_agent: user_agent,
135
+ verbose: options[:verbose]
207
136
  )
137
+ fetcher.fetch(url_string, redirect_count)
208
138
  end
209
139
 
210
140
  # Calculate comparison statistics
@@ -53,11 +53,48 @@ module LlmDocsBuilder
53
53
  # defaults for any options not specified via CLI.
54
54
  #
55
55
  # @param options [Hash] CLI options hash
56
+ # @option options [String] :docs path to documentation directory or file
57
+ # @option options [String] :base_url base URL for expanding relative links
58
+ # @option options [String] :title project title
59
+ # @option options [String] :description project description
60
+ # @option options [String] :body additional body content
61
+ # @option options [String] :output output file path
62
+ # @option options [Boolean] :convert_urls convert HTML URLs to markdown format
63
+ # @option options [Boolean] :remove_comments remove HTML comments
64
+ # @option options [Boolean] :normalize_whitespace normalize whitespace
65
+ # @option options [Boolean] :remove_badges remove badge images
66
+ # @option options [Boolean] :remove_frontmatter remove YAML/TOML frontmatter
67
+ # @option options [Boolean] :verbose enable verbose output
68
+ # @option options [String] :suffix suffix for transformed files
69
+ # @option options [Array<String>] :excludes glob patterns for files to exclude
70
+ # @option options [Boolean] :bulk enable bulk transformation mode
71
+ # @option options [Boolean] :include_hidden include hidden files
72
+ # @option options [Boolean] :remove_code_examples remove code blocks
73
+ # @option options [Boolean] :remove_images remove image syntax
74
+ # @option options [Boolean] :simplify_links simplify link text
75
+ # @option options [Boolean] :remove_blockquotes remove blockquote formatting
76
+ # @option options [Boolean] :generate_toc generate table of contents
77
+ # @option options [String] :custom_instruction custom instruction text
78
+ # @option options [Boolean] :remove_stopwords remove common stopwords
79
+ # @option options [Boolean] :remove_duplicates remove duplicate paragraphs
80
+ # @option options [Boolean] :normalize_headings normalize heading hierarchy
81
+ # @option options [String] :heading_separator separator for heading paths
82
+ # @option options [Boolean] :include_metadata include metadata in output
83
+ # @option options [Boolean] :include_tokens include token counts
84
+ # @option options [Boolean] :include_timestamps include timestamps
85
+ # @option options [Boolean] :include_priority include priority metadata
86
+ # @option options [Boolean] :calculate_compression calculate compression ratios
87
+ # @option options [String] :content raw markdown content
88
+ # @option options [String] :source_url source URL for content
56
89
  # @return [Hash] merged configuration with CLI overrides applied
57
90
  def merge_with_options(options)
58
91
  # CLI options override config file, config file provides defaults
59
92
  {
60
- docs: options[:docs] || self['docs'] || '.',
93
+ docs: if options.key?(:docs)
94
+ options[:docs]
95
+ else
96
+ self['docs'] || '.'
97
+ end,
61
98
  base_url: options[:base_url] || self['base_url'],
62
99
  title: options[:title] || self['title'],
63
100
  description: options[:description] || self['description'],
@@ -171,7 +208,10 @@ module LlmDocsBuilder
171
208
  else
172
209
  self['calculate_compression'] || false
173
210
  end
174
- }
211
+ }.tap do |merged|
212
+ merged[:content] = options[:content] if options.key?(:content)
213
+ merged[:source_url] = options[:source_url] if options.key?(:source_url)
214
+ end
175
215
  end
176
216
 
177
217
  # Check if a config file was found and exists
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  # Helper methods for content transformation
  #
  # @api private
  module Helpers
    # Trims dangling pipe separators off the tail of +parts+, mutating the array in place.
    #
    # Working from the end of the array, each pass looks at the last element:
    # - if it ends with a `|` (optionally surrounded by whitespace), that pipe and the
    #   whitespace around it are removed; an element that becomes empty is dropped;
    # - if it is whitespace-only (or empty), it is dropped;
    # - otherwise pruning stops and the remaining elements are left untouched.
    #
    # @param parts [Array<String>] string fragments to prune; modified in place
    # @return [void]
    def prune_trailing_unsafe_link_separator!(parts)
      until parts.empty?
        tail = parts.last
        without_pipe = tail.sub(/[ \t]*\|\s*\z/, '')

        if without_pipe == tail
          # No trailing pipe: only blank fragments are discarded, real content ends the scan
          break unless tail.strip.empty?

          parts.pop
        else
          cleaned = without_pipe.rstrip

          # Keep scanning afterwards: the same fragment may carry another trailing pipe
          if cleaned.empty?
            parts.pop
          else
            parts[-1] = cleaned
          end
        end
      end
    end

    module_function :prune_trailing_unsafe_link_separator!
  end
end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  module Helpers
    # Limits runs of consecutive blank lines, leaving fenced code blocks untouched.
    #
    # The text is scanned line by line. Outside of fences, at most +max_blank+ blank
    # (empty or whitespace-only) lines in a row are kept and the extras are dropped.
    # Every line from an opening fence through its closing fence is copied verbatim.
    # An unterminated fence protects the remainder of the text.
    #
    # Fence recognition:
    # - an opening fence is optional leading whitespace, then at least +min_fence+
    #   repetitions of one of +fence_chars+, then an optional info string;
    # - for backtick fences the info string may not contain a backtick, so a line that
    #   merely starts with an inline code span (e.g. "```x``` text") is not a fence;
    # - a closing fence has the same indentation and character as the opening fence
    #   and is at least as long as it (a longer closing fence is accepted).
    #
    # @param text [String, nil] input text to process; nil is treated as empty
    # @param max_blank [Integer] maximum number of consecutive blank lines to allow
    # @param fence_chars [Array<String>] characters that can be used for code fences
    # @param min_fence [Integer] minimum length of fence character sequence
    # @return [String] processed text with squeezed blank lines
    def squeeze_blank_lines_outside_fences(text, max_blank: 2, fence_chars: %w[` ~], min_fence: 3)
      source = text.to_s
      return '' if source.empty?

      # Groups: 1 = indentation, 2 = whole fence run, 3 = fence character, 4 = info string
      fence_set = Regexp.escape(fence_chars.join)
      open_re = /\A(\s*)(([#{fence_set}])\3{#{min_fence - 1},})(.*)\z/

      close_re = nil # non-nil while inside a fenced block
      blank_streak = 0
      out = []

      # Limit -1 keeps trailing empty fields so trailing newlines survive the round trip
      source.split("\n", -1).each do |line|
        if close_re
          out << line
          close_re = nil if line.match?(close_re)
          next
        end

        opening = line.match(open_re)
        # Backticks in the info string mean this is inline code, not a fence opener
        opening = nil if opening && opening[3] == '`' && opening[4].include?('`')

        if opening
          # Built once per fence: same indent, same char, at least the opening length
          indent = Regexp.escape(opening[1])
          char = Regexp.escape(opening[3])
          close_re = /\A#{indent}#{char}{#{opening[2].length},}\s*\z/
          blank_streak = 0
          out << line
        elsif line.strip.empty?
          blank_streak += 1
          # Keep at most max_blank blank lines; skip extras
          out << line if blank_streak <= max_blank
        else
          blank_streak = 0
          out << line
        end
      end

      out.join("\n")
    end

    module_function :squeeze_blank_lines_outside_fences
  end
end
@@ -0,0 +1,9 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmDocsBuilder
4
+ module Helpers
5
+ end
6
+ end
7
+
8
+ require_relative 'helpers/squeeze_blank_lines_outside_fences'
9
+ require_relative 'helpers/prune_trailing_unsafe_link_separator'