llm-docs-builder 0.10.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -55,7 +55,7 @@ module LlmDocsBuilder
55
55
  #
56
56
  # @return [String] transformed markdown content
57
57
  def transform
58
- content = File.read(file_path)
58
+ content = load_content
59
59
 
60
60
  # Build and execute transformation pipeline
61
61
  content = cleanup_transformer.transform(content, options)
@@ -63,9 +63,7 @@ module LlmDocsBuilder
63
63
  content = heading_transformer.transform(content, options)
64
64
  content = compress_content(content) if should_compress?
65
65
  content = enhancement_transformer.transform(content, options)
66
- content = whitespace_transformer.transform(content, options)
67
-
68
- content
66
+ whitespace_transformer.transform(content, options)
69
67
  end
70
68
 
71
69
  private
@@ -114,7 +112,7 @@ module LlmDocsBuilder
114
112
 
115
113
  # Compress content using TextCompressor
116
114
  #
117
- # @param content [String] content to compress
115
+ # @param content [String] markdown content
118
116
  # @return [String] compressed content
119
117
  def compress_content(content)
120
118
  compressor = TextCompressor.new
@@ -124,5 +122,32 @@ module LlmDocsBuilder
124
122
  }
125
123
  compressor.compress(content, compression_methods)
126
124
  end
125
+
126
+ # Load source content either from provided string or file path
127
+ #
128
+ # @return [String] markdown content to transform
129
+ def load_content
130
+ content = options[:content] ? options[:content].dup : File.read(file_path)
131
+ snippet = html_detector.detection_snippet(content)
132
+
133
+ return content if html_detector.table_fragment?(snippet)
134
+ return html_to_markdown_converter.convert(content) if html_detector.html_content?(content, snippet)
135
+
136
+ content
137
+ end
138
+
139
+ # Memoized HTML to markdown converter
140
+ #
141
+ # @return [HtmlToMarkdownConverter]
142
+ def html_to_markdown_converter
143
+ @html_to_markdown_converter ||= HtmlToMarkdownConverter.new
144
+ end
145
+
146
+ # Memoized HTML detector
147
+ #
148
+ # @return [HtmlDetector]
149
+ def html_detector
150
+ @html_detector ||= HtmlDetector.new
151
+ end
127
152
  end
128
153
  end
@@ -30,7 +30,7 @@ module LlmDocsBuilder
30
30
 
31
31
  # Format number with comma separators for readability
32
32
  #
33
- # @param number [Integer] number to format
33
+ # @param number [Integer] integer value
34
34
  # @return [String] formatted number with commas
35
35
  #
36
36
  # @example
@@ -1,6 +1,13 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module LlmDocsBuilder
4
+ # Provides content transformation functionality
5
+ #
6
+ # This module contains specialized transformers for modifying markdown content,
7
+ # including cleanup operations, link processing, heading normalization, and
8
+ # content enhancement for AI consumption.
9
+ #
10
+ # @api private
4
11
  module Transformers
5
12
  # Base module for all transformers
6
13
  #
@@ -11,9 +18,12 @@ module LlmDocsBuilder
11
18
  module BaseTransformer
12
19
  # Transform content
13
20
  #
14
- # @param content [String] content to transform
21
+ # @abstract Subclasses must implement this method and document specific options
22
+ # @param content [String] markdown content
15
23
  # @param options [Hash] transformation options
24
+ # @option options [Object] :* options vary by implementation - see specific transformer classes
16
25
  # @return [String] transformed content
26
+ # @note Options vary by implementation - see specific transformer classes for supported keys
17
27
  def transform(content, options = {})
18
28
  raise NotImplementedError, "#{self.class} must implement #transform"
19
29
  end
@@ -21,7 +31,9 @@ module LlmDocsBuilder
21
31
  # Check if transformation should be applied
22
32
  #
23
33
  # @param options [Hash] transformation options
34
+ # @option options [Object] :* options vary by implementation - see specific transformer classes
24
35
  # @return [Boolean] true if transformation should be applied
36
+ # @note Options vary by implementation - see specific transformer classes for supported keys
25
37
  def should_transform?(options)
26
38
  true
27
39
  end
@@ -0,0 +1,138 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/http'
4
+ require 'uri'
5
+
6
+ module LlmDocsBuilder
7
+ # Lightweight HTTP client for fetching remote documentation pages.
8
+ #
9
+ # Provides common functionality needed by multiple commands (transform, compare)
10
+ # including strict scheme validation, redirect handling and sensible timeouts.
11
+ class UrlFetcher
12
+ # Default user agent string for HTTP requests
13
+ DEFAULT_USER_AGENT = 'llm-docs-builder/1.0 (+https://github.com/mensfeld/llm-docs-builder)'
14
+
15
+ # Maximum number of redirects to follow
16
+ MAX_REDIRECTS = 10
17
+
18
+ # @param user_agent [String] HTTP user agent header value
19
+ # @param verbose [Boolean] enable redirect logging
20
+ # @param output [IO] IO stream used for redirect logging
21
+ def initialize(user_agent: DEFAULT_USER_AGENT, verbose: false, output: $stdout)
22
+ @user_agent = user_agent
23
+ @verbose = verbose
24
+ @output = output
25
+ end
26
+
27
+ # Fetch remote URL content while following redirects.
28
+ #
29
+ # @param url_string [String] URL to fetch
30
+ # @param redirect_count [Integer] current redirect depth (internal use)
31
+ # @return [String] response body
32
+ # @raise [Errors::GenerationError] on invalid URLs, network failures, or redirect loops
33
+ def fetch(url_string, redirect_count = 0)
34
+ if redirect_count >= MAX_REDIRECTS
35
+ raise(
36
+ Errors::GenerationError,
37
+ "Too many redirects (#{MAX_REDIRECTS}) when fetching #{url_string}"
38
+ )
39
+ end
40
+
41
+ uri = validate_and_parse_url(url_string)
42
+
43
+ http = Net::HTTP.new(uri.host, uri.port)
44
+ http.use_ssl = uri.scheme == 'https'
45
+ http.open_timeout = 10
46
+ http.read_timeout = 30
47
+
48
+ request = Net::HTTP::Get.new(uri.request_uri)
49
+ request['User-Agent'] = @user_agent
50
+
51
+ response = http.request(request)
52
+
53
+ case response
54
+ when Net::HTTPSuccess
55
+ response.body
56
+ when Net::HTTPRedirection
57
+ redirect_url = absolute_redirect_url(uri, response['location'])
58
+ log_redirect(redirect_url)
59
+ fetch(redirect_url, redirect_count + 1)
60
+ else
61
+ raise(
62
+ Errors::GenerationError,
63
+ "Failed to fetch #{url_string}: #{response.code} #{response.message}"
64
+ )
65
+ end
66
+ rescue Errors::GenerationError
67
+ raise
68
+ rescue StandardError => e
69
+ raise(
70
+ Errors::GenerationError,
71
+ "Error fetching #{url_string}: #{e.message}"
72
+ )
73
+ end
74
+
75
+ private
76
+
77
+ # Validate and parse URL string
78
+ #
79
+ # @param url_string [String] URL to validate
80
+ # @return [URI::HTTP, URI::HTTPS] parsed URI
81
+ # @raise [Errors::GenerationError] if URL is invalid or unsupported
82
+ def validate_and_parse_url(url_string)
83
+ uri = URI.parse(url_string)
84
+
85
+ unless %w[http https].include?(uri.scheme&.downcase)
86
+ raise(
87
+ Errors::GenerationError,
88
+ "Unsupported URL scheme: #{uri.scheme || 'none'} (only http/https allowed)"
89
+ )
90
+ end
91
+
92
+ if uri.host.nil? || uri.host.empty?
93
+ raise(
94
+ Errors::GenerationError,
95
+ "Invalid URL: missing host in #{url_string}"
96
+ )
97
+ end
98
+
99
+ uri
100
+ rescue URI::InvalidURIError => e
101
+ raise(
102
+ Errors::GenerationError,
103
+ "Invalid URL format: #{e.message}"
104
+ )
105
+ end
106
+
107
+ # Convert redirect location to absolute URL
108
+ #
109
+ # @param base_uri [URI] base URI
110
+ # @param location [String] redirect location
111
+ # @return [String] absolute redirect URL
112
+ # @raise [Errors::GenerationError] if location is invalid
113
+ def absolute_redirect_url(base_uri, location)
114
+ raise(
115
+ Errors::GenerationError,
116
+ "Redirect missing location header for #{base_uri}"
117
+ ) if location.nil? || location.empty?
118
+
119
+ URI.join(base_uri, location).to_s
120
+ rescue URI::InvalidURIError => e
121
+ raise(
122
+ Errors::GenerationError,
123
+ "Invalid redirect URL from #{base_uri}: #{e.message}"
124
+ )
125
+ end
126
+
127
+ # Log redirect if verbose mode enabled
128
+ #
129
+ # @param url [String] redirect URL
130
+ # @return [void]
131
+ def log_redirect(url)
132
+ return unless @verbose
133
+
134
+ @output.puts(" Redirecting to #{url}...")
135
+ end
136
+ end
137
+ end
138
+
@@ -2,5 +2,5 @@
2
2
 
3
3
  module LlmDocsBuilder
4
4
  # Current version of the LlmDocsBuilder gem
5
- VERSION = '0.10.0'
5
+ VERSION = '0.12.0'
6
6
  end
@@ -4,10 +4,20 @@ require 'zeitwerk'
4
4
  require 'pathname'
5
5
  require 'find'
6
6
 
7
+ autoload(:Nokogiri, 'nokogiri')
8
+ autoload(:CGI, 'cgi')
9
+
7
10
  loader = Zeitwerk::Loader.for_gem
8
11
  loader.inflector.inflect('cli' => 'CLI')
9
12
  loader.setup
10
13
 
14
+ # Build and optimize documentation for LLMs
15
+ #
16
+ # This gem provides tools for generating llms.txt files and transforming markdown
17
+ # documentation to be AI-friendly. It can reduce token consumption by 67-95% while
18
+ # preserving essential documentation content.
19
+ #
20
+ # @api public
11
21
  module LlmDocsBuilder
12
22
  class << self
13
23
  # Generates llms.txt from existing markdown documentation
@@ -25,6 +35,7 @@ module LlmDocsBuilder
25
35
  # @option options [Boolean] :convert_urls convert HTML URLs to markdown format (overrides
26
36
  # config)
27
37
  # @option options [Boolean] :verbose enable verbose output (overrides config)
38
+ # @option options [String] :content raw markdown content (used for remote sources)
28
39
  # @return [String] generated llms.txt content
29
40
  #
30
41
  # @example Generate from docs directory
@@ -35,6 +35,7 @@ Gem::Specification.new do |spec|
35
35
  spec.executables = spec.files.grep(%r{\Abin/}) { |f| File.basename(f) }
36
36
  spec.require_paths = ['lib']
37
37
 
38
+ spec.add_dependency 'nokogiri', '~> 1.17'
38
39
  spec.add_dependency 'zeitwerk', '~> 2.6'
39
40
 
40
41
  spec.add_development_dependency 'bundler', '~> 2.0'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llm-docs-builder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.10.0
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maciej Mensfeld
@@ -9,6 +9,20 @@ bindir: bin
9
9
  cert_chain: []
10
10
  date: 1980-01-02 00:00:00.000000000 Z
11
11
  dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: nokogiri
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '1.17'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - "~>"
24
+ - !ruby/object:Gem::Version
25
+ version: '1.17'
12
26
  - !ruby/object:Gem::Dependency
13
27
  name: zeitwerk
14
28
  requirement: !ruby/object:Gem::Requirement
@@ -132,6 +146,13 @@ files:
132
146
  - lib/llm_docs_builder/config.rb
133
147
  - lib/llm_docs_builder/errors.rb
134
148
  - lib/llm_docs_builder/generator.rb
149
+ - lib/llm_docs_builder/helpers.rb
150
+ - lib/llm_docs_builder/helpers/prune_trailing_unsafe_link_separator.rb
151
+ - lib/llm_docs_builder/helpers/squeeze_blank_lines_outside_fences.rb
152
+ - lib/llm_docs_builder/html_detector.rb
153
+ - lib/llm_docs_builder/html_to_markdown/figure_code_block_renderer.rb
154
+ - lib/llm_docs_builder/html_to_markdown/table_markup_renderer.rb
155
+ - lib/llm_docs_builder/html_to_markdown_converter.rb
135
156
  - lib/llm_docs_builder/markdown_transformer.rb
136
157
  - lib/llm_docs_builder/output_formatter.rb
137
158
  - lib/llm_docs_builder/parser.rb
@@ -143,6 +164,7 @@ files:
143
164
  - lib/llm_docs_builder/transformers/heading_transformer.rb
144
165
  - lib/llm_docs_builder/transformers/link_transformer.rb
145
166
  - lib/llm_docs_builder/transformers/whitespace_transformer.rb
167
+ - lib/llm_docs_builder/url_fetcher.rb
146
168
  - lib/llm_docs_builder/validator.rb
147
169
  - lib/llm_docs_builder/version.rb
148
170
  - llm-docs-builder.gemspec