llm-docs-builder 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmDocsBuilder
4
# Transforms markdown files to be AI-friendly
#
# Reads a single markdown file and, depending on the options given, rewrites its links:
# relative markdown links are expanded against a base URL, and HTTP(S) URLs ending in
# .html/.htm are rewritten to end in .md.
#
# @example Transform with base URL
#   transformer = LlmDocsBuilder::MarkdownTransformer.new('README.md',
#     base_url: 'https://myproject.io'
#   )
#   content = transformer.transform
#
# @api public
class MarkdownTransformer
  # Inline markdown link; capture 1 is the link text, capture 2 the link target
  MARKDOWN_LINK = /\[([^\]]+)\]\(([^)]+)\)/.freeze

  # Targets that must never be prefixed with the base URL: anything carrying a URI scheme
  # (https:, mailto:, tel:, ftp:, ...), protocol-relative URLs (//host/path) and anchors (#id)
  NON_RELATIVE_TARGET = %r{\A(?:[a-z][a-z0-9+.-]*:|//|#)}i.freeze

  # @return [String] path to markdown file
  attr_reader :file_path

  # @return [Hash] transformation options
  attr_reader :options

  # Initialize a new markdown transformer
  #
  # @param file_path [String] path to markdown file to transform
  # @param options [Hash] transformation options
  # @option options [String] :base_url base URL for expanding relative links
  # @option options [Boolean] :convert_urls convert HTML URLs to markdown format
  def initialize(file_path, options = {})
    @file_path = file_path
    @options = options
  end

  # Transform markdown content to be AI-friendly
  #
  # Reads the file at {#file_path} and applies, in this order:
  # - relative link expansion (only when :base_url is set)
  # - .html/.htm to .md URL conversion (only when :convert_urls is truthy)
  #
  # @return [String] transformed markdown content
  def transform
    content = File.read(file_path)

    content = expand_relative_links(content) if options[:base_url]
    content = convert_html_urls(content) if options[:convert_urls]

    content
  end

  private

  # Expand relative links to absolute URLs
  #
  # Converts markdown links like `[text](./path.md)` to `[text](https://base.url/path.md)`.
  # Links whose target has a URI scheme, is protocol-relative, or is an in-page anchor are
  # returned untouched, so `mailto:` or `tel:` links are not glued onto the base URL.
  #
  # @param content [String] markdown content to process
  # @return [String] content with expanded links
  def expand_relative_links(content)
    base_url = options[:base_url]

    content.gsub(MARKDOWN_LINK) do |link|
      text = ::Regexp.last_match(1)
      target = ::Regexp.last_match(2)

      next link if target.match?(NON_RELATIVE_TARGET)

      # File.join collapses duplicate slashes at the seam between base URL and path
      expanded_url = File.join(base_url, target.delete_prefix('./'))
      "[#{text}](#{expanded_url})"
    end
  end

  # Convert HTML URLs to markdown-friendly format
  #
  # Rewrites absolute HTTP(S) URLs that end in .html or .htm so they end in .md instead.
  # A URL only qualifies when the extension is followed by `)`, whitespace or end of line.
  #
  # @param content [String] markdown content to process
  # @return [String] content with converted URLs
  def convert_html_urls(content)
    content.gsub(%r{https?://[^\s<>]+\.html?(?=[)\s]|$)}) do |url|
      url.sub(/\.html?$/, '.md')
    end
  end
end
90
+ end
@@ -0,0 +1,223 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmDocsBuilder
4
# Parses llms.txt files into structured data
#
# Reads an llms.txt file and extracts the H1 title, the blockquote description, and every
# `## ` section. A section made of `- [title](url): description` list items becomes an
# array of link hashes; any other section is kept as its raw text.
#
# @example Parse an llms.txt file
#   parser = LlmDocsBuilder::Parser.new('llms.txt')
#   parsed = parser.parse
#   parsed.title                # => "My Project"
#   parsed.description          # => "Project description"
#   parsed.documentation_links  # => [{title: "README", url: "...", description: "..."}]
#
# @api public
class Parser
  # One list item of the form `- [title](url): description`, confined to a single line.
  # Deliberately NOT multiline (/m): with /m the description group would run on to the end
  # of the section and swallow every following link.
  LINK_ITEM = /^[-*]\s*\[([^\]]+)\]\(([^)]+)\):[ \t]*(.*)$/.freeze

  # @return [String] path to the llms.txt file
  attr_reader :file_path

  # @return [String] raw content of the llms.txt file
  attr_reader :content

  # Initialize a new parser
  #
  # The file is read eagerly, so a missing or unreadable file raises here.
  #
  # @param file_path [String] path to the llms.txt file to parse
  def initialize(file_path)
    @file_path = file_path
    @content = File.read(file_path)
  end

  # Parse the llms.txt file
  #
  # Walks the content line by line:
  # - the first `# ` line becomes :title
  # - the first `> ` line after the title becomes :description
  # - each `## ` line opens a section keyed by its downcased, underscored name
  # - every other non-blank line is collected into the currently open section
  #
  # @return [ParsedContent] parsed content with title, description, and sections
  def parse
    sections = {}
    current_section = nil
    current_content = []

    content.each_line do |line|
      if line.start_with?('# ')
        save_section(sections, current_section, current_content) if current_section

        sections[:title] = line[2..].strip if sections.empty?
        current_content = []
      elsif line.start_with?('> ') && sections[:title] && !sections[:description]
        sections[:description] = line[2..].strip
      elsif line.start_with?('## ')
        save_section(sections, current_section, current_content) if current_section

        current_section = line[3..].strip.downcase.gsub(/\s+/, '_').to_sym
        current_content = []
      elsif !line.strip.empty?
        current_content << line
      end
    end

    save_section(sections, current_section, current_content) if current_section

    ParsedContent.new(sections)
  end

  private

  # Parses and stores section content in the sections hash
  #
  # Sections without any collected lines are skipped; a repeated section name overwrites
  # the earlier entry.
  #
  # @param sections [Hash] accumulator hash for sections
  # @param section_name [Symbol] name of the section
  # @param content [Array<String>] raw content lines
  def save_section(sections, section_name, content)
    return if content.empty?

    sections[section_name] = parse_section_content(content.join)
  end

  # Extracts markdown links from section content into structured format
  #
  # Scans for list items matching {LINK_ITEM}. Returns the stripped raw content when no
  # item matches that format.
  #
  # @param content [String] raw section content
  # @return [Array<Hash>, String] array of link hashes or raw content if no links found
  def parse_section_content(content)
    links = content.scan(LINK_ITEM).map do |title, url, description|
      { title: title, url: url, description: description.strip }
    end

    links.empty? ? content.strip : links
  end
end
107
+
108
# Represents parsed llms.txt content with structured access to sections
#
# Wraps the sections hash and exposes the title, the description and the link
# collections. Can be converted to a Hash or to an XML string.
#
# @example Access parsed content
#   parsed.title                # => "My Project"
#   parsed.description          # => "A description"
#   parsed.documentation_links  # => [{title: "...", url: "...", description: "..."}]
#   parsed.to_h                 # => Hash representation
#   parsed.to_xml               # => XML string
#
# @api public
class ParsedContent
  # Characters that may not appear literally in XML character data, with their entities
  XML_ESCAPES = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;' }.freeze

  # @return [Hash] the parsed sections hash
  attr_reader :sections

  # Initialize parsed content
  #
  # @param sections [Hash] hash containing parsed sections (:title, :description, :documentation, etc.)
  def initialize(sections)
    @sections = sections
  end

  # Get the project title
  #
  # @return [String, nil] the H1 title or nil if not present
  def title
    sections[:title]
  end

  # Get the project description
  #
  # @return [String, nil] the description blockquote or nil if not present
  def description
    sections[:description]
  end

  # Get documentation links
  #
  # @return [Array<Hash>] array of documentation links with :title, :url, and :description
  def documentation_links
    sections[:documentation] || []
  end

  # Get example links
  #
  # @return [Array<Hash>] array of example links with :title, :url, and :description
  def example_links
    sections[:examples] || []
  end

  # Get optional links
  #
  # @return [Array<Hash>] array of optional links with :title, :url, and :description
  def optional_links
    sections[:optional] || []
  end

  # Convert to hash representation
  #
  # @return [Hash] the same hash object returned by {#sections}
  def to_h
    sections
  end

  # Convert to XML representation
  #
  # Emits the title, the description and the documentation, examples and optional
  # sections (empty ones are omitted). All text is XML-escaped, so values containing
  # `&`, `<` or `>` (for instance URLs with query strings) still yield well-formed XML.
  #
  # @return [String] XML string representation
  def to_xml
    builder = ['<?xml version="1.0" encoding="UTF-8"?>', '<llms_context>']
    builder << " <title>#{escape_xml(title)}</title>" if title
    builder << " <description>#{escape_xml(description)}</description>" if description

    add_xml_section(builder, 'documentation', documentation_links)
    add_xml_section(builder, 'examples', example_links)
    add_xml_section(builder, 'optional', optional_links)

    builder << '</llms_context>'
    builder.join("\n")
  end

  private

  # Appends section XML elements to builder array
  #
  # Handles both an array of link hashes and raw string content; does nothing for an
  # empty section.
  #
  # @param builder [Array<String>] XML lines accumulator
  # @param name [String] section name
  # @param links [Array<Hash>, String] section links or content
  def add_xml_section(builder, name, links)
    return if links.empty?

    builder << " <#{name}>"

    if links.is_a?(Array)
      links.each do |link|
        builder << ' <link>'
        builder << " <title>#{escape_xml(link[:title])}</title>"
        builder << " <url>#{escape_xml(link[:url])}</url>"
        builder << " <description>#{escape_xml(link[:description])}</description>"
        builder << ' </link>'
      end
    else
      builder << " #{escape_xml(links)}"
    end

    builder << " </#{name}>"
  end

  # Replaces XML-significant characters with their predefined entities
  #
  # @param value [Object] value to embed as element text (converted with to_s)
  # @return [String] text that is safe to place between XML tags
  def escape_xml(value)
    value.to_s.gsub(/[&<>]/, XML_ESCAPES)
  end
end
223
+ end
@@ -0,0 +1,216 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmDocsBuilder
4
# Validates llms.txt content against the llms.txt specification
#
# Checks that the content follows the expected formatting rules:
# - Required H1 title header on the first line
# - Optional description blockquote on the second line
# - Section ordering (Documentation, Examples, Optional)
# - Markdown link, list item and header formats
# - File size and line length limits
#
# @example Validate llms.txt content
#   validator = LlmDocsBuilder::Validator.new(content)
#   validator.valid?  # => true or false
#   validator.errors  # => Array of error messages
#
# @api public
class Validator
  # @return [String] the llms.txt content being validated
  attr_reader :content

  # @return [Array<String>] array of validation error messages
  attr_reader :errors

  # Required sections that must appear in llms.txt
  REQUIRED_SECTIONS = ['# '].freeze

  # Optional sections that may appear in llms.txt
  OPTIONAL_SECTIONS = ['> ', '## Documentation', '## Examples', '## Optional'].freeze

  # Maximum length for a single line in characters
  MAX_LINE_LENGTH = 120

  # Maximum file size in bytes
  MAX_FILE_SIZE = 50_000

  # Order in which the well-known sections must appear
  SECTION_ORDER = %w[Documentation Examples Optional].freeze

  # Accepted beginnings of a link URL: HTTP(S) URLs, absolute paths, ./ and ../ paths, and
  # bare relative paths or file names. Anchored with \A so only the start of the URL counts.
  URL_PATTERN = %r{
    \A(?:
      https?://|
      /|
      \.\.?/|
      [a-zA-Z0-9_.-]+(?:/|\.md|\.txt|\.rb|\.html)?|
      [A-Z]+[a-zA-Z]*|
      docs/|
      examples/|
      lib/
    )
  }x.freeze

  # Initialize a new validator
  #
  # @param content [String] the llms.txt content to validate
  def initialize(content)
    @content = content
    @errors = []
  end

  # Check if content is valid
  #
  # Runs all validation checks and returns whether the content is valid.
  # Use {#errors} to access validation error messages.
  #
  # @return [Boolean] true if content is valid, false otherwise
  def valid?
    validate!
  end

  # Validate content and return result
  #
  # Resets {#errors}, runs every validation check, and returns whether the content
  # is valid.
  #
  # @return [Boolean] true if content is valid, false otherwise
  def validate!
    @errors = []

    validate_required_sections
    validate_structure
    validate_markdown_syntax
    validate_links
    validate_file_size

    errors.empty?
  end

  private

  # Checks for required H1 title header and validates title length
  #
  # The length limit applies to the whole stripped first line, including the
  # leading "# " marker.
  def validate_required_sections
    first_line = content.lines.first

    errors << 'Missing required H1 title (must start with "# ")' unless first_line&.start_with?('# ')
    errors << 'Title is too long (max 80 characters)' if first_line.to_s.strip.length > 80
  end

  # Validates H1 uniqueness, description length, and section ordering
  #
  # The description is only looked for on the second line; its length limit applies to
  # the whole stripped line, including the leading "> " marker.
  def validate_structure
    lines = content.lines
    h1_count = lines.count { |line| line.start_with?('# ') }

    errors << 'Multiple H1 headers found (only one allowed)' if h1_count > 1

    if lines[1]&.start_with?('> ') && lines[1].strip.length > 200
      errors << 'Description blockquote is too long (max 200 characters)'
    end

    validate_section_order
  end

  # Verifies sections appear in correct order: Documentation, Examples, Optional
  #
  # Sections with other names are ignored. Names are stripped first so headings with
  # trailing spaces or a carriage return (CRLF files) are still recognised.
  def validate_section_order
    sections = content.scan(/^## (.+)$/).flatten.map(&:strip)

    current_index = -1
    sections.each do |section|
      index = SECTION_ORDER.index(section)
      next unless index

      errors << "Section '#{section}' is out of order" if index < current_index
      current_index = index
    end
  end

  # Validates markdown syntax including links, lists, and headers
  #
  # Delegates to specialized validators for different markdown elements
  def validate_markdown_syntax
    validate_link_format
    validate_list_format
    validate_headers
  end

  # Checks markdown links for empty text/URLs and valid URL formats
  #
  # An empty URL is reported once as empty; it is not additionally reported as having
  # an invalid format.
  def validate_link_format
    content.scan(/\[([^\]]*)\]\(([^)]*)\)/) do |text, url|
      errors << 'Empty link text found' if text.empty?

      if url.empty?
        errors << 'Empty link URL found'
        next
      end

      errors << "Invalid URL format: #{url}" unless url.match?(URL_PATTERN)
    end
  end

  # Validates list items match expected markdown link format
  #
  # Only list items that start with a link are checked; the description after the
  # link is optional.
  def validate_list_format
    content.each_line.with_index(1) do |line, number|
      next unless line.match?(/^[-*]\s+\[/)

      # Allow both with and without descriptions
      next if line.match?(/^[-*]\s+\[.+\]\(.+\)(?::\s*.+)?$/)

      errors << "Invalid list item format at line #{number}"
    end
  end

  # Validates header levels and content
  #
  # Reports an H1 with no text and any header deeper than H2. Only spaces and tabs may
  # separate the hashes from the text, so an empty header cannot borrow the following
  # line as its text.
  def validate_headers
    content.scan(/^(#+)[ \t]+(.*)$/) do |hashes, text|
      level = hashes.length

      if level == 1 && text.strip.empty?
        errors << 'Empty H1 header text'
      elsif level > 2
        errors << "Headers deeper than H2 not recommended (found H#{level})"
      end
    end
  end

  # Validates link security and format requirements
  #
  # Reports plain http:// URLs and URLs containing spaces. Relative targets that merely
  # begin with the letters "http" (e.g. http-notes.md) are not treated as insecure.
  def validate_links
    content.scan(/\[([^\]]+)\]\(([^)]+)\)/) do |_text, url|
      errors << "Non-HTTPS URL found: #{url} (consider using HTTPS)" if url.start_with?('http://')

      errors << "URL contains spaces: #{url}" if url.include?(' ')
    end
  end

  # Checks file size and individual line lengths against limits
  #
  # Enforces the MAX_FILE_SIZE byte limit and the MAX_LINE_LENGTH character limit
  # (line terminators are not counted).
  def validate_file_size
    errors << "File size exceeds maximum (#{MAX_FILE_SIZE} bytes)" if content.bytesize > MAX_FILE_SIZE

    content.each_line.with_index(1) do |line, number|
      next unless line.chomp.length > MAX_LINE_LENGTH

      errors << "Line #{number} exceeds maximum length (#{MAX_LINE_LENGTH} characters)"
    end
  end
end
216
+ end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  # Version string of the LlmDocsBuilder gem
  VERSION = '0.3.0'
end
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'zeitwerk'
4
+ require 'pathname'
5
+ require 'find'
6
+
7
# Autoload the gem's constants with Zeitwerk; lib files named "cli" define CLI (not Cli)
Zeitwerk::Loader.for_gem.tap do |gem_loader|
  gem_loader.inflector.inflect('cli' => 'CLI')
  gem_loader.setup
end
10
+
11
module LlmDocsBuilder
  class << self
    # Generates llms.txt from existing markdown documentation
    #
    # Because this method takes no keyword parameters, a call made only with keywords
    # (for example `generate_from_docs(title: 'X')`) delivers them as a Hash in
    # `docs_path`. Any Hash received there is treated as the options, and the docs
    # path is then taken from the merged configuration.
    #
    # @param docs_path [String, Hash, nil] path to documentation directory or file (optional
    #   if the configuration supplies one), or the options hash when called with options only
    # @param options [Hash] generation options
    # @option options [String] :config_file path to YAML config file (auto-finds llm-docs-builder.yml)
    # @option options [String] :base_url base URL for converting relative links (overrides config)
    # @option options [String] :title project title (auto-detected if not provided, overrides
    #   config)
    # @option options [String] :description project description (auto-detected if not provided,
    #   overrides config)
    # @option options [String] :output output file path (default: 'llms.txt', overrides config)
    # @option options [Boolean] :convert_urls convert HTML URLs to markdown format (overrides
    #   config)
    # @option options [Boolean] :verbose enable verbose output (overrides config)
    # @return [String] generated llms.txt content
    #
    # @example Generate from docs directory
    #   LlmDocsBuilder.generate_from_docs('./docs')
    #
    # @example Generate using config file
    #   LlmDocsBuilder.generate_from_docs(config_file: 'llm-docs-builder.yml')
    #
    # @example Generate with config file and overrides
    #   LlmDocsBuilder.generate_from_docs('./docs',
    #     config_file: 'my-config.yml',
    #     title: 'Override Title'
    #   )
    def generate_from_docs(docs_path = nil, options = {})
      # Support options-first usage: generate_from_docs(config_file: 'path.yml')
      if docs_path.is_a?(Hash)
        options = docs_path.merge(options)
        docs_path = nil
      end

      config = Config.new(options[:config_file])
      merged_options = config.merge_with_options(options)

      # Use docs_path param or config file docs setting
      final_docs_path = docs_path || merged_options[:docs]

      Generator.new(final_docs_path, merged_options).generate
    end

    # Transforms a markdown file to be AI-friendly
    #
    # @param file_path [String] path to markdown file
    # @param options [Hash] transformation options
    # @option options [String] :config_file path to YAML config file (auto-finds llm-docs-builder.yml)
    # @option options [String] :base_url base URL for expanding relative links (overrides config)
    # @option options [Boolean] :convert_urls convert HTML URLs to markdown format (overrides
    #   config)
    # @option options [Boolean] :verbose enable verbose output (overrides config)
    # @return [String] transformed markdown content
    #
    # @example Transform with direct options
    #   LlmDocsBuilder.transform_markdown('README.md',
    #     base_url: 'https://myproject.io',
    #     convert_urls: true
    #   )
    #
    # @example Transform using config file
    #   LlmDocsBuilder.transform_markdown('README.md', config_file: 'llm-docs-builder.yml')
    def transform_markdown(file_path, options = {})
      config = Config.new(options[:config_file])
      merged_options = config.merge_with_options(options)

      MarkdownTransformer.new(file_path, merged_options).transform
    end

    # Bulk transforms multiple markdown files to be AI-friendly
    #
    # @param docs_path [String] path to documentation directory
    # @param options [Hash] transformation options
    # @option options [String] :config_file path to YAML config file (auto-finds llm-docs-builder.yml)
    # @option options [String] :base_url base URL for expanding relative links (overrides config)
    # @option options [Boolean] :convert_urls convert HTML URLs to markdown format (overrides
    #   config)
    # @option options [String] :suffix suffix for transformed files (default: '.llm', overrides
    #   config)
    # @option options [Array<String>] :excludes glob patterns for files to exclude (overrides
    #   config)
    # @option options [Boolean] :verbose enable verbose output (overrides config)
    # @return [Array<String>] paths of transformed files
    #
    # @example Bulk transform with direct options
    #   LlmDocsBuilder.bulk_transform('./docs',
    #     base_url: 'https://myproject.io',
    #     suffix: '.ai',
    #     excludes: ['**/private/**', 'draft-*.md']
    #   )
    #
    # @example Bulk transform using config file
    #   LlmDocsBuilder.bulk_transform('./docs', config_file: 'llm-docs-builder.yml')
    def bulk_transform(docs_path, options = {})
      config = Config.new(options[:config_file])
      merged_options = config.merge_with_options(options)

      BulkTransformer.new(docs_path, merged_options).transform_all
    end

    # Parses an existing llms.txt file
    #
    # @param file_path [String] path to the llms.txt file to parse
    # @return [Object] whatever Parser#parse returns for the file
    def parse(file_path)
      Parser.new(file_path).parse
    end

    # Validates llms.txt content
    #
    # @param content [String] the llms.txt content to validate
    # @return [Boolean] true if content is valid, false otherwise
    def validate(content)
      Validator.new(content).valid?
    end
  end
end