llm-docs-builder 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,238 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/http'
4
+ require 'uri'
5
+
6
+ module LlmDocsBuilder
7
+ # Compares content sizes between human and AI versions
8
+ #
9
+ # Helps quantify context window savings by comparing:
10
+ # - Remote URL with different User-Agents (human vs AI bot)
11
+ # - Remote URL with local markdown file
12
+ #
13
+ # @example Compare remote versions
14
+ # comparator = LlmDocsBuilder::Comparator.new('https://example.com/docs/page.html')
15
+ # result = comparator.compare
16
+ # puts "Reduction: #{result[:reduction_percent]}%"
17
+ #
18
+ # @example Compare remote with local file
19
+ # comparator = LlmDocsBuilder::Comparator.new('https://example.com/docs/page.html',
20
+ # local_file: 'docs/page.md'
21
+ # )
22
+ # result = comparator.compare
23
+ #
24
+ # @api public
25
+ class Comparator
26
+ # Default User-Agent for simulating human browser
27
+ HUMAN_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0'
28
+
29
+ # Default User-Agent for simulating AI bot
30
+ AI_USER_AGENT = 'Claude-Web/1.0 (Anthropic AI Assistant)'
31
+
32
+ # Maximum number of redirects to follow before raising an error
33
+ MAX_REDIRECTS = 10
34
+
35
+ # @return [String] URL to compare
36
+ attr_reader :url
37
+
38
+ # @return [Hash] comparison options
39
+ attr_reader :options
40
+
41
# Initialize a new comparator
#
# Caller-supplied options are layered over the default User-Agents. A
# User-Agent option passed explicitly as nil does not displace its default,
# so both :human_user_agent and :ai_user_agent are always populated.
#
# @param url [String] URL to fetch and compare
# @param options [Hash] comparison options
# @option options [String] :local_file path to local markdown file for comparison
# @option options [String] :human_user_agent custom User-Agent for human version
# @option options [String] :ai_user_agent custom User-Agent for AI version
# @option options [Boolean] :verbose enable verbose output
def initialize(url, options = {})
  defaults = {
    human_user_agent: HUMAN_USER_AGENT,
    ai_user_agent: AI_USER_AGENT
  }

  @url = url
  # The block only runs for keys present on both sides (the two User-Agents);
  # it keeps the default when the caller's value is nil.
  @options = defaults.merge(options) { |_key, default, given| given.nil? ? default : given }
end
56
+
57
# Run the comparison appropriate for the configured options
#
# When a :local_file option is set, the remote page is compared with that
# file; otherwise the same URL is fetched with both User-Agents.
#
# @return [Hash] comparison results with keys:
#   - :human_size [Integer] size of human version in bytes
#   - :ai_size [Integer] size of AI version in bytes
#   - :reduction_bytes [Integer] bytes saved
#   - :reduction_percent [Integer] percentage reduction
#   - :factor [Float] compression factor
#   - :human_source [String] source description (URL or file)
#   - :ai_source [String] source description (URL or file)
def compare
  return compare_with_local_file if options[:local_file]

  compare_remote_versions
end
74
+
75
+ private
76
+
77
# Fetch the URL twice - once per configured User-Agent - and compare sizes
#
# Progress lines are printed before each fetch when :verbose is set.
#
# @return [Hash] comparison results
def compare_remote_versions
  verbose = options[:verbose]

  puts "Fetching human version from #{url}..." if verbose
  human_body = fetch_url(url, options[:human_user_agent])

  puts "Fetching AI version from #{url}..." if verbose
  ai_body = fetch_url(url, options[:ai_user_agent])

  human_label = "#{url} (User-Agent: human)"
  ai_label = "#{url} (User-Agent: AI)"

  calculate_results(human_body.bytesize, ai_body.bytesize, human_label, ai_label)
end
94
+
95
# Compare remote URL (human User-Agent) with local markdown file
#
# The local path must be a regular file. A missing path or a directory is
# rejected up front with a GenerationError, before any network request.
#
# @return [Hash] comparison results
# @raise [Errors::GenerationError] if the local path is missing or not a file
def compare_with_local_file
  local_file = options[:local_file]

  # File.file? (unlike File.exist?) is false for directories, which File.read
  # would otherwise reject with a raw Errno::EISDIR.
  unless File.file?(local_file)
    raise(
      Errors::GenerationError,
      "Local file not found: #{local_file}"
    )
  end

  puts "Fetching human version from #{url}..." if options[:verbose]
  human_content = fetch_url(url, options[:human_user_agent])

  puts "Reading local file #{local_file}..." if options[:verbose]
  ai_content = File.read(local_file)

  calculate_results(
    human_content.bytesize,
    ai_content.bytesize,
    url,
    local_file
  )
end
121
+
122
# Fetch URL content with specified User-Agent
#
# Follows redirects (up to MAX_REDIRECTS) and handles HTTPS. Redirect
# targets are resolved against the URL that produced them, so relative
# Location headers (e.g. "/new/path") are followed as well.
#
# @param url_string [String] URL to fetch
# @param user_agent [String] User-Agent header value
# @param redirect_count [Integer] current redirect depth (internal use)
# @return [String] response body
# @raise [Errors::GenerationError] if fetch fails, a redirect has no
#   Location header, or there are too many redirects
def fetch_url(url_string, user_agent, redirect_count = 0)
  if redirect_count >= MAX_REDIRECTS
    raise(
      Errors::GenerationError,
      "Too many redirects (#{MAX_REDIRECTS}) when fetching #{url_string}"
    )
  end

  uri = validate_and_parse_url(url_string)

  http = Net::HTTP.new(uri.host, uri.port)
  http.use_ssl = uri.scheme == 'https'
  http.open_timeout = 10
  http.read_timeout = 30

  request = Net::HTTP::Get.new(uri.request_uri)
  request['User-Agent'] = user_agent

  response = http.request(request)

  case response
  when Net::HTTPSuccess
    response.body
  when Net::HTTPRedirection
    location = response['location'].to_s.strip
    if location.empty?
      raise(
        Errors::GenerationError,
        "Redirect without Location header when fetching #{url_string}: #{response.code} #{response.message}"
      )
    end

    # Location may be a relative reference; merge resolves it against the
    # current URI and leaves absolute URLs unchanged.
    redirect_url = uri.merge(location).to_s
    puts "  Redirecting to #{redirect_url}..." if options[:verbose] && redirect_count.positive?
    fetch_url(redirect_url, user_agent, redirect_count + 1)
  else
    raise(
      Errors::GenerationError,
      "Failed to fetch #{url_string}: #{response.code} #{response.message}"
    )
  end
rescue Errors::GenerationError
  # Already in the caller-facing error type; pass through untouched
  raise
rescue StandardError => e
  raise(
    Errors::GenerationError,
    "Error fetching #{url_string}: #{e.message}"
  )
end
173
+
174
# Parse a URL string and reject anything that is not a usable http(s) URL
#
# @param url_string [String] URL to validate and parse
# @return [URI::HTTP, URI::HTTPS] parsed URI
# @raise [Errors::GenerationError] if the URL cannot be parsed, uses a scheme
#   other than http/https, or has no host
def validate_and_parse_url(url_string)
  parsed = URI.parse(url_string)
  scheme = parsed.scheme

  # Scheme check comes first, so a scheme-less string reports "none"
  unless %w[http https].include?(scheme&.downcase)
    raise Errors::GenerationError,
          "Unsupported URL scheme: #{scheme || 'none'} (only http/https allowed)"
  end

  raise Errors::GenerationError, "Invalid URL: missing host in #{url_string}" if parsed.host.to_s.empty?

  parsed
rescue URI::InvalidURIError => e
  raise Errors::GenerationError, "Invalid URL format: #{e.message}"
end
205
+
206
# Derive size-comparison statistics from two byte counts
#
# The percentage is 0 when the human size is not positive; the factor is
# Float::INFINITY when the AI size is not positive.
#
# @param human_size [Integer] size of human version in bytes
# @param ai_size [Integer] size of AI version in bytes
# @param human_source [String] description of human source
# @param ai_source [String] description of AI source
# @return [Hash] comparison results
def calculate_results(human_size, ai_size, human_source, ai_source)
  saved = human_size - ai_size
  percent = human_size.positive? ? ((saved.to_f / human_size) * 100).round : 0
  ratio = ai_size.positive? ? (human_size.to_f / ai_size).round(1) : Float::INFINITY

  {
    human_size: human_size,
    ai_size: ai_size,
    reduction_bytes: saved,
    reduction_percent: percent,
    factor: ratio,
    human_source: human_source,
    ai_source: ai_source
  }
end
237
+ end
238
+ end
@@ -0,0 +1,116 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'yaml'
4
+
5
+ module LlmDocsBuilder
6
+ # Simple configuration loader for llm-docs-builder.yml files
7
+ #
8
+ # Loads YAML configuration files and provides a simple interface for accessing configuration
9
+ # values. Automatically looks for config files in the current directory if none specified.
10
+ #
11
+ # @example Load default config file
12
+ # config = LlmDocsBuilder::Config.new
13
+ #
14
+ # @example Load specific config file
15
+ # config = LlmDocsBuilder::Config.new('my-config.yml')
16
+ #
17
+ # @example Access config values
18
+ # config['base_url'] # => "https://myproject.io"
19
+ # config.dig('output') # => "llms.txt"
20
+ #
21
+ # @api public
22
+ class Config
23
+ # @return [Hash] the loaded configuration data
24
+ attr_reader :data
25
+
26
# Set up the loader and read the configuration immediately
#
# @param config_file [String, nil] path to YAML config file, or nil to auto-find
def initialize(config_file = nil)
  @config_file = if config_file
                   config_file
                 else
                   find_config_file
                 end
  @data = load_config
end
33
+
34
# Look up a top-level configuration value
#
# The key is converted with #to_s before the lookup, so symbols and strings
# are interchangeable.
#
# @param key [String, Symbol] configuration key
# @return [Object, nil] configuration value or nil if not found
def [](key)
  name = key.to_s
  data[name]
end
41
+
42
# Walk into nested configuration values
#
# Every key is converted with #to_s before being passed to the underlying
# data's #dig.
#
# @param keys [Array<String, Symbol>] nested keys to access
# @return [Object, nil] configuration value or nil if not found
def dig(*keys)
  path = keys.map { |key| key.to_s }
  data.dig(*path)
end
49
+
50
# Combine CLI options with config file values into one settings hash
#
# CLI options take precedence over config file values; the config file
# supplies defaults for anything the CLI left unset. Boolean flags honour an
# explicitly passed CLI value (including false) whenever the key is present.
#
# @param options [Hash] CLI options hash
# @return [Hash] merged configuration with CLI overrides applied
def merge_with_options(options)
  # First truthy of: CLI value, config file value
  value = ->(cli_key, config_key) { options[cli_key] || self[config_key] }
  # CLI value if the key was given at all, otherwise config value or false
  flag = lambda do |cli_key, config_key|
    options.key?(cli_key) ? options[cli_key] : (self[config_key] || false)
  end

  {
    docs: value.call(:docs, 'docs') || '.',
    base_url: value.call(:base_url, 'base_url'),
    title: value.call(:title, 'title'),
    description: value.call(:description, 'description'),
    output: value.call(:output, 'output') || 'llms.txt',
    convert_urls: flag.call(:convert_urls, 'convert_urls'),
    verbose: flag.call(:verbose, 'verbose'),
    # Bulk transformation options
    suffix: value.call(:suffix, 'suffix') || '.llm',
    excludes: value.call(:excludes, 'excludes') || [],
    bulk: flag.call(:bulk, 'bulk')
  }
end
77
+
78
# Check if a config file was found and exists
#
# Always answers with a real boolean: false when no config file path is set
# (none given and none auto-detected) or when the path is not on disk.
#
# @return [Boolean] true if config file exists, false otherwise
def exists?
  # The nil check keeps the result false (not nil) when no path is set
  !@config_file.nil? && File.exist?(@config_file)
end
84
+
85
+ private
86
+
87
# Locate a config file relative to the current working directory
#
# Candidates are checked in order of preference and the first one that
# exists wins:
# 1. llm-docs-builder.yml
# 2. llm-docs-builder.yaml
# 3. .llm-docs-builder.yml
#
# @return [String, nil] path to config file or nil if none found
def find_config_file
  ['llm-docs-builder.yml', 'llm-docs-builder.yaml', '.llm-docs-builder.yml'].each do |candidate|
    return candidate if File.exist?(candidate)
  end

  nil
end
99
+
100
# Load and parse YAML config file
#
# Returns an empty hash when no config file is set, the file is missing, or
# the file is empty. The document root must be a mapping: a list or scalar
# root is rejected here, because the accessors look values up by string key.
#
# @return [Hash] parsed config data, empty hash if no file
# @raise [Errors::GenerationError] if YAML is invalid, the file cannot be
#   read, or the top-level YAML value is not a mapping
def load_config
  return {} unless @config_file && File.exist?(@config_file)

  # NOTE(review): whether YAML.load_file restricts object deserialization
  # depends on the installed Psych version - consider YAML.safe_load if
  # config files may come from untrusted sources.
  loaded = begin
    YAML.load_file(@config_file)
  rescue Psych::SyntaxError => e
    raise Errors::GenerationError, "Invalid YAML in config file #{@config_file}: #{e.message}"
  rescue StandardError => e
    raise Errors::GenerationError, "Failed to load config file #{@config_file}: #{e.message}"
  end

  # An empty document parses to a falsy value; treat it as "no settings"
  return {} unless loaded

  unless loaded.is_a?(Hash)
    raise Errors::GenerationError,
          "Invalid config file #{@config_file}: expected a mapping at the top level, got #{loaded.class}"
  end

  loaded
end
115
+ end
116
+ end
@@ -0,0 +1,31 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmDocsBuilder
4
# Namespace holding every error class raised by LlmDocsBuilder itself
module Errors
  # Common ancestor of all LlmDocsBuilder errors; rescue this to catch any of them
  class BaseError < StandardError; end

  # Raised when llms.txt generation fails due to configuration issues,
  # missing directories, invalid YAML, or file access problems
  #
  # @example When directory doesn't exist
  #   LlmDocsBuilder.bulk_transform('/nonexistent/path')
  #   # => raises GenerationError: "Directory not found: /nonexistent/path"
  #
  # @example When config YAML is invalid
  #   LlmDocsBuilder.generate_from_docs(config_file: 'invalid.yml')
  #   # => raises GenerationError: "Invalid YAML in config file..."
  class GenerationError < BaseError; end

  # Raised when llms.txt content validation fails
  #
  # Reserved for validation failures; currently not used, since the
  # Validator class returns boolean results instead of raising errors.
  #
  # @example Future usage (when validation raises)
  #   LlmDocsBuilder.validate!(invalid_content)
  #   # => raises ValidationError: "Missing required H1 title"
  class ValidationError < BaseError; end
end
31
+ end
@@ -0,0 +1,234 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmDocsBuilder
4
+ # Simple generator that creates llms.txt from existing markdown documentation
5
+ #
6
+ # Takes a documentation directory or file and generates a properly formatted llms.txt file by
7
+ # analyzing markdown files, extracting titles and descriptions, and organizing them by priority.
8
+ #
9
+ # @example Generate from docs directory
10
+ # generator = LlmDocsBuilder::Generator.new('./docs', base_url: 'https://myproject.io')
11
+ # content = generator.generate
12
+ #
13
+ # @api public
14
+ class Generator
15
+ # @return [String] path to documentation directory or file
16
+ attr_reader :docs_path
17
+
18
+ # @return [Hash] generation options
19
+ attr_reader :options
20
+
21
# Store the documentation location and generation options
#
# @param docs_path [String] path to documentation directory or file
# @param options [Hash] generation options
# @option options [String] :base_url base URL for expanding relative links
# @option options [String] :title project title (overrides auto-detection)
# @option options [String] :description project description (overrides auto-detection)
# @option options [String] :output output file path for saving
# @option options [Boolean] :verbose enable verbose output
def initialize(docs_path, options = {})
  @docs_path, @options = docs_path, options
end
34
+
35
# Build the llms.txt content and optionally save it
#
# Collects the documentation files, renders them into llms.txt form and,
# when the :output option is set, writes the result to that path.
#
# @return [String] generated llms.txt content
def generate
  rendered = build_llms_txt(find_documentation_files)

  destination = options[:output]
  File.write(destination, rendered) if destination

  rendered
end
52
+
53
+ private
54
+
55
# Collect metadata for the documentation found at docs_path
#
# A non-existent path yields an empty list, a single file yields one entry,
# and a directory is searched for markdown files.
#
# @return [Array<Hash>] array of analyzed file metadata
def find_documentation_files
  return [] unless File.exist?(docs_path)
  return [analyze_file(docs_path)] if File.file?(docs_path)

  find_markdown_files_in_directory
end
69
+
70
# Recursively finds and analyzes markdown files in directory
#
# Skips files whose name starts with a dot. Results are ordered by priority
# (README, guides, etc.) and then by relative path, so the output does not
# depend on filesystem traversal order.
#
# @return [Array<Hash>] sorted array of analyzed file metadata
def find_markdown_files_in_directory
  files = []

  Find.find(docs_path) do |path|
    next unless File.file?(path)
    # \z anchors at the true end of the path
    next unless path.match?(/\.md\z/i)
    next if File.basename(path).start_with?('.')

    files << analyze_file(path)
  end

  # sort_by is not stable, so ties on priority need an explicit second key
  files.sort_by { |file| [file[:priority], file[:path].to_s] }
end
88
+
89
# Read one documentation file and assemble its metadata
#
# The reported path is the bare filename when docs_path is itself a file,
# otherwise the file's path relative to docs_path.
#
# @param file_path [String] path to file to analyze
# @return [Hash] file metadata with :path, :title, :description, :priority
def analyze_file(file_path)
  single_file_mode = File.file?(docs_path)
  relative_path =
    if single_file_mode
      File.basename(file_path)
    else
      root = Pathname.new(docs_path)
      Pathname.new(file_path).relative_path_from(root).to_s
    end

  text = File.read(file_path)
  title = extract_title(text, file_path)
  description = extract_description(text)
  priority = calculate_priority(file_path)

  { path: relative_path, title: title, description: description, priority: priority }
end
112
+
113
# Extracts title from file content or generates from filename
#
# Prefers the first H1 header ("# Title") that has text on the same line.
# Otherwise the filename is used: the extension is dropped (whatever its
# case), underscores and hyphens become spaces, and each word is capitalized.
#
# @param content [String] file content
# @param file_path [String] path to file
# @return [String] extracted or generated title
def extract_title(content, file_path)
  # [ \t]+ keeps the match on the heading line, so a bare "#" line cannot
  # capture the following paragraph as the title; \S requires real text.
  heading = content.match(/^#[ \t]+(\S.*)/)
  return heading[1].strip if heading

  # '.*' strips the extension regardless of case (.md, .MD, ...)
  File.basename(file_path, '.*').gsub(/[_-]/, ' ').split.map(&:capitalize).join(' ')
end
129
+
130
# Extracts description from file content
#
# Skips leading header lines and blank lines, then takes the first paragraph
# (up to the next blank line). The paragraph's lines are joined with single
# spaces, so the result is one line, and it is truncated to 200 characters.
#
# @param content [String] file content
# @return [String] extracted description (empty string if none)
def extract_description(content)
  # Skip title/header lines and empty lines
  body = content.lines.drop_while { |line| line.start_with?('#') || line.strip.empty? }

  # First paragraph ends at the next blank line
  paragraph = body.take_while { |line| !line.strip.empty? }

  # Strip each line before joining: String#lines keeps the trailing "\n",
  # which would otherwise end up inside the description.
  paragraph.map(&:strip).join(' ').slice(0, 200)
end
147
+
148
# Rank a file by keywords in its (lowercased) basename
#
# A name starting with "readme" ranks first; otherwise the first matching
# keyword in the table below decides, and anything else gets the default.
#
# @param file_path [String] path to file
# @return [Integer] priority value (1-7, lower is higher priority)
def calculate_priority(file_path)
  name = File.basename(file_path).downcase
  return 1 if name.start_with?('readme')

  # Checked in insertion order, so earlier keywords win when several match
  keyword_ranks = { 'getting' => 2, 'guide' => 3, 'tutorial' => 4, 'api' => 5, 'reference' => 6 }
  keyword_ranks.each do |keyword, rank|
    return rank if name.include?(keyword)
  end

  7 # default priority
end
166
+
167
# Constructs llms.txt content from analyzed documentation files
#
# Emits the H1 title, an optional blockquote description and, when there are
# docs, a "Documentation" section with one link per file. The blockquote is
# written only when the description has visible text, and sections are
# separated by exactly one blank line.
#
# @param docs [Array<Hash>] analyzed file metadata
# @return [String] formatted llms.txt content
def build_llms_txt(docs)
  title = options[:title] || detect_project_title(docs)
  description = options[:description] || detect_project_description(docs)

  content = ["# #{title}", '']

  # Skip nil/blank descriptions: they would otherwise produce a bare "> "
  # line or a doubled blank line.
  unless description.to_s.strip.empty?
    content << "> #{description}"
    content << ''
  end

  if docs.any?
    content << '## Documentation'
    content << ''

    docs.each do |doc|
      url = build_url(doc[:path])
      content << if doc[:description] && !doc[:description].empty?
                   "- [#{doc[:title]}](#{url}): #{doc[:description]}"
                 else
                   "- [#{doc[:title]}](#{url})"
                 end
    end
  end

  "#{content.join("\n")}\n"
end
199
+
200
# Pick a project title from the analyzed docs
#
# Uses the title of the first doc whose path contains "readme" (case
# insensitive); without one, falls back to the name of the current working
# directory.
#
# @param docs [Array<Hash>] analyzed file metadata
# @return [String] detected project title
def detect_project_title(docs)
  readme_doc = docs.find { |entry| entry[:path].downcase.include?('readme') }

  if readme_doc
    readme_doc[:title]
  else
    File.basename(File.expand_path('.'))
  end
end
210
+
211
# Pick a project description from the analyzed docs
#
# Uses the description of the first doc whose path contains "readme" (case
# insensitive), when that doc has a truthy :description.
#
# @param docs [Array<Hash>] analyzed file metadata
# @return [String, nil] detected project description or nil
def detect_project_description(docs)
  readme_doc = docs.find { |entry| entry[:path].downcase.include?('readme') }
  return nil unless readme_doc

  # Normalise any falsy description to nil
  readme_doc.fetch(:description, nil) || nil
end
221
+
222
# Turn a relative doc path into a link target
#
# With a :base_url option the path is appended to it via File.join;
# without one the path is returned unchanged.
#
# @param path [String] relative path to file
# @return [String] full URL or relative path
def build_url(path)
  base_url = options[:base_url]
  base_url ? File.join(base_url, path) : path
end
233
+ end
234
+ end