llms-txt-ruby 0.0.0 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,234 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmsTxt
4
+ # Simple generator that creates llms.txt from existing markdown documentation
5
+ #
6
+ # Takes a documentation directory or file and generates a properly formatted llms.txt file by
7
+ # analyzing markdown files, extracting titles and descriptions, and organizing them by priority.
8
+ #
9
+ # @example Generate from docs directory
10
+ # generator = LlmsTxt::Generator.new('./docs', base_url: 'https://myproject.io')
11
+ # content = generator.generate
12
+ #
13
+ # @api public
14
class Generator
  # Maximum number of characters kept from a file's first paragraph
  DESCRIPTION_LIMIT = 200

  # Priority assigned to files whose name matches none of the known patterns
  DEFAULT_PRIORITY = 7

  # Filename fragments and the priority they imply, checked in insertion order
  PRIORITY_KEYWORDS = {
    'getting' => 2,
    'guide' => 3,
    'tutorial' => 4,
    'api' => 5,
    'reference' => 6
  }.freeze

  # Matches paths with a markdown extension, regardless of case
  MARKDOWN_FILE = /\.md\z/i

  # @return [String] path to documentation directory or file
  attr_reader :docs_path

  # @return [Hash] generation options
  attr_reader :options

  # Initialize a new generator
  #
  # @param docs_path [String] path to documentation directory or file
  # @param options [Hash] generation options
  # @option options [String] :base_url base URL for expanding relative links
  # @option options [String] :title project title (overrides auto-detection)
  # @option options [String] :description project description (overrides auto-detection)
  # @option options [String] :output output file path for saving
  # @option options [Boolean] :verbose enable verbose output
  def initialize(docs_path, options = {})
    @docs_path = docs_path
    @options = options
  end

  # Generate llms.txt content from documentation
  #
  # Scans documentation files, extracts metadata, prioritizes them, and builds a formatted
  # llms.txt file. When the :output option is set, the content is also written to that path.
  #
  # @return [String] generated llms.txt content
  def generate
    content = build_llms_txt(find_documentation_files)

    output_path = options[:output]
    File.write(output_path, content) if output_path

    content
  end

  private

  # Locates and analyzes documentation files from docs_path
  #
  # Handles both single file and directory paths; a missing path yields no files.
  #
  # @return [Array<Hash>] array of analyzed file metadata
  def find_documentation_files
    return [] unless File.exist?(docs_path)
    return [analyze_file(docs_path)] if File.file?(docs_path)

    find_markdown_files_in_directory
  end

  # Recursively finds and analyzes markdown files in directory
  #
  # Skips files whose name starts with a dot. Results are ordered by priority and then by
  # relative path, so files sharing a priority always come out in the same order.
  #
  # @return [Array<Hash>] sorted array of analyzed file metadata
  def find_markdown_files_in_directory
    files = []

    Find.find(docs_path) do |path|
      next unless File.file?(path)
      next unless path.match?(MARKDOWN_FILE)
      next if File.basename(path).start_with?('.')

      files << analyze_file(path)
    end

    files.sort_by { |file| [file[:priority], file[:path]] }
  end

  # Extracts metadata from a documentation file
  #
  # @param file_path [String] path to file to analyze
  # @return [Hash] file metadata with :path, :title, :description, :priority
  def analyze_file(file_path)
    # A single-file docs_path has no directory to be relative to, so use the bare filename
    relative_path =
      if File.file?(docs_path)
        File.basename(file_path)
      else
        Pathname.new(file_path).relative_path_from(Pathname.new(docs_path)).to_s
      end

    content = File.read(file_path)

    {
      path: relative_path,
      title: extract_title(content, file_path),
      description: extract_description(content),
      priority: calculate_priority(file_path)
    }
  end

  # Extracts title from file content or generates from filename
  #
  # Prefers first H1 header, falls back to formatted filename
  #
  # @param content [String] file content
  # @param file_path [String] path to file
  # @return [String] extracted or generated title
  def extract_title(content, file_path)
    # Only spaces/tabs may follow the '#': a bare '#' line must not pull in the next line
    heading = content[/^#[ \t]+(.+)/, 1]
    return heading.strip if heading

    # '.*' strips the extension whatever its case (.md, .MD, ...)
    File.basename(file_path, '.*').tr('_-', ' ').split.map(&:capitalize).join(' ')
  end

  # Extracts description from file content
  #
  # Takes the first paragraph after any leading headers and blank lines, collapses it onto a
  # single line, and truncates it to DESCRIPTION_LIMIT characters.
  #
  # @param content [String] file content
  # @return [String] extracted description (empty when there is no paragraph)
  def extract_description(content)
    body = content.lines.drop_while { |line| line.start_with?('#') || line.strip.empty? }
    paragraph = body.take_while { |line| !line.strip.empty? }

    # Strip each line before joining so no newline leaks into the one-line link entry
    paragraph.map(&:strip).join(' ').slice(0, DESCRIPTION_LIMIT).rstrip
  end

  # Assigns priority to file based on filename patterns
  #
  # README gets highest priority, followed by guides, tutorials, API docs
  #
  # @param file_path [String] path to file
  # @return [Integer] priority value (1-7, lower is higher priority)
  def calculate_priority(file_path)
    basename = File.basename(file_path).downcase
    return 1 if basename.start_with?('readme')

    keyword_match = PRIORITY_KEYWORDS.find { |keyword, _priority| basename.include?(keyword) }
    keyword_match ? keyword_match.last : DEFAULT_PRIORITY
  end

  # Constructs llms.txt content from analyzed documentation files
  #
  # Combines title, description, and documentation links into formatted output. The
  # blockquote is omitted entirely when no non-blank description is available.
  #
  # @param docs [Array<Hash>] analyzed file metadata
  # @return [String] formatted llms.txt content
  def build_llms_txt(docs)
    title = options[:title] || detect_project_title(docs)
    description = options[:description] || detect_project_description(docs)

    lines = ["# #{title}", '']
    lines.push("> #{description}", '') unless description.to_s.strip.empty?

    if docs.any?
      lines.push('## Documentation', '')
      docs.each { |doc| lines << format_link(doc) }
    end

    lines.join("\n") + "\n"
  end

  # Formats one documentation entry as a markdown list item
  #
  # @param doc [Hash] analyzed file metadata
  # @return [String] list item, with the description appended when present
  def format_link(doc)
    link = "- [#{doc[:title]}](#{build_url(doc[:path])})"
    description = doc[:description]

    description.nil? || description.empty? ? link : "#{link}: #{description}"
  end

  # Finds the first analyzed file whose relative path mentions "readme"
  #
  # @param docs [Array<Hash>] analyzed file metadata
  # @return [Hash, nil] README metadata or nil
  def find_readme(docs)
    docs.find { |doc| doc[:path].downcase.include?('readme') }
  end

  # Attempts to detect project title from README or directory name
  #
  # Falls back to the name of the current working directory.
  #
  # @param docs [Array<Hash>] analyzed file metadata
  # @return [String] detected project title
  def detect_project_title(docs)
    readme = find_readme(docs)
    return readme[:title] if readme

    File.basename(File.expand_path('.'))
  end

  # Attempts to extract project description from README
  #
  # @param docs [Array<Hash>] analyzed file metadata
  # @return [String, nil] detected project description or nil when absent or empty
  def detect_project_description(docs)
    description = find_readme(docs)&.fetch(:description, nil)
    description unless description.to_s.empty?
  end

  # Constructs full URL from path using base_url option if provided
  #
  # @param path [String] relative path to file
  # @return [String] full URL or relative path
  def build_url(path)
    base_url = options[:base_url]
    return path unless base_url

    File.join(base_url, path)
  end
end
234
+ end
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmsTxt
4
+ # Transforms markdown files to be AI-friendly
5
+ #
6
+ # Processes individual markdown files to make them more suitable for LLM consumption by
7
+ # expanding relative links to absolute URLs and converting HTML URLs to markdown-friendly
8
+ # formats.
9
+ #
10
+ # @example Transform with base URL
11
+ # transformer = LlmsTxt::MarkdownTransformer.new('README.md',
12
+ # base_url: 'https://myproject.io'
13
+ # )
14
+ # content = transformer.transform
15
+ #
16
+ # @api public
17
class MarkdownTransformer
  # Inline markdown link: captures the link text and the link target
  MARKDOWN_LINK = /\[([^\]]+)\]\(([^)]+)\)/

  # Targets that must not be prefixed with the base URL: anything carrying a URI scheme
  # (http:, https:, mailto:, tel:, ftp:, ...), protocol-relative URLs, and in-page anchors
  NON_RELATIVE_TARGET = %r{\A(?:[a-z][a-z0-9+.-]*:|//|#)}i

  # Absolute http(s) URL ending in .html/.htm, followed by ')', whitespace or end of line
  HTML_URL = %r{https?://[^\s<>]+\.html?(?=[)\s]|$)}

  # @return [String] path to markdown file
  attr_reader :file_path

  # @return [Hash] transformation options
  attr_reader :options

  # Initialize a new markdown transformer
  #
  # @param file_path [String] path to markdown file to transform
  # @param options [Hash] transformation options
  # @option options [String] :base_url base URL for expanding relative links
  # @option options [Boolean] :convert_urls convert HTML URLs to markdown format
  def initialize(file_path, options = {})
    @file_path = file_path
    @options = options
  end

  # Transform markdown content to be AI-friendly
  #
  # Reads the file and applies, in order:
  # - expansion of relative links to absolute URLs (if base_url provided)
  # - conversion of HTML URLs to markdown format (if convert_urls enabled)
  #
  # @return [String] transformed markdown content
  def transform
    content = File.read(file_path)

    content = expand_relative_links(content) if options[:base_url]
    content = convert_html_urls(content) if options[:convert_urls]

    content
  end

  private

  # Expand relative links to absolute URLs
  #
  # Converts markdown links like `[text](./path.md)` to `[text](https://base.url/path.md)`.
  # Leaves links with a URI scheme, protocol-relative URLs and anchors unchanged.
  #
  # @param content [String] markdown content to process
  # @return [String] content with expanded links
  def expand_relative_links(content)
    base_url = options[:base_url]

    content.gsub(MARKDOWN_LINK) do |link|
      text = Regexp.last_match(1)
      target = Regexp.last_match(2)
      next link if target.match?(NON_RELATIVE_TARGET)

      # Drop a leading './' so the joined URL has no '/./' segment
      "[#{text}](#{File.join(base_url, target.delete_prefix('./'))})"
    end
  end

  # Convert HTML URLs to markdown-friendly format
  #
  # Changes absolute URLs ending in .html or .htm to .md for better LLM understanding
  #
  # @param content [String] markdown content to process
  # @return [String] content with converted URLs
  def convert_html_urls(content)
    content.gsub(HTML_URL) { |url| url.sub(/\.html?\z/, '.md') }
  end
end
90
+ end
@@ -0,0 +1,223 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmsTxt
4
+ # Parses llms.txt files into structured data
5
+ #
6
+ # Reads and parses llms.txt files according to the llms.txt specification,
7
+ # extracting the title, description, and structured sections (Documentation,
8
+ # Examples, Optional) with their links.
9
+ #
10
+ # @example Parse an llms.txt file
11
+ # parser = LlmsTxt::Parser.new('llms.txt')
12
+ # parsed = parser.parse
13
+ # parsed.title # => "My Project"
14
+ # parsed.description # => "Project description"
15
+ # parsed.documentation_links # => [{title: "README", url: "...", description: "..."}]
16
+ #
17
+ # @api public
18
class Parser
  # One markdown list item holding a link, with an optional ": description" tail.
  # Deliberately not multiline: '.' must stop at the end of the line so that each item is
  # matched on its own, and only spaces/tabs are skipped after the colon so an empty
  # description cannot swallow the following item.
  LINK_ITEM = /^[-*][ \t]*\[([^\]]+)\]\(([^)]+)\)(?::[ \t]*(.*?))?[ \t\r]*$/

  # @return [String] path to the llms.txt file
  attr_reader :file_path

  # @return [String] raw content of the llms.txt file
  attr_reader :content

  # Initialize a new parser
  #
  # Reads the file immediately, so a missing or unreadable file raises here.
  #
  # @param file_path [String] path to the llms.txt file to parse
  def initialize(file_path)
    @file_path = file_path
    @content = File.read(file_path)
  end

  # Parse the llms.txt file
  #
  # Parses the file content and returns a {ParsedContent} object containing
  # the extracted title, description, and structured sections with links.
  #
  # The first `# ` line is the title and the first `> ` line after it is the description.
  # Each `## ` heading opens a section keyed by its downcased, underscored name; when a
  # heading is repeated, the later section replaces the earlier one. Text that appears
  # before the first `## ` heading is not stored.
  #
  # @return [ParsedContent] parsed content with title, description, and sections
  def parse
    sections = {}
    current_section = nil
    current_content = []

    content.each_line do |line|
      if line.start_with?('# ')
        # Only the first H1 is the title; later ones must not disturb the open section
        sections[:title] ||= line[2..].strip
      elsif line.start_with?('> ') && sections[:title] && !sections[:description]
        sections[:description] = line[2..].strip
      elsif line.start_with?('## ')
        save_section(sections, current_section, current_content) if current_section

        current_section = line[3..].strip.downcase.gsub(/\s+/, '_').to_sym
        current_content = []
      elsif !line.strip.empty?
        current_content << line
      end
    end

    save_section(sections, current_section, current_content) if current_section

    ParsedContent.new(sections)
  end

  private

  # Parses and stores section content in the sections hash
  #
  # Skips empty sections and delegates to parse_section_content for processing
  #
  # @param sections [Hash] accumulator hash for sections
  # @param section_name [Symbol] name of the section
  # @param content [Array<String>] raw content lines
  def save_section(sections, section_name, content)
    return if content.empty?

    sections[section_name] = parse_section_content(content.join)
  end

  # Extracts markdown links from section content into structured format
  #
  # Scans for markdown list items of the form `- [title](url)` or
  # `- [title](url): description`. A link without a description gets an empty string.
  # Returns raw content if no links are found in the expected format.
  #
  # @param content [String] raw section content
  # @return [Array<Hash>, String] array of link hashes or raw content if no links found
  def parse_section_content(content)
    links = content.scan(LINK_ITEM).map do |title, url, description|
      { title: title, url: url, description: description.to_s.strip }
    end

    links.empty? ? content.strip : links
  end
end
107
+
108
+ # Represents parsed llms.txt content with structured access to sections
109
+ #
110
+ # Provides convenient access to parsed llms.txt sections including title,
111
+ # description, and link collections. Can be converted to Hash or XML formats.
112
+ #
113
+ # @example Access parsed content
114
+ # parsed.title # => "My Project"
115
+ # parsed.description # => "A description"
116
+ # parsed.documentation_links # => [{title: "...", url: "...", description: "..."}]
117
+ # parsed.to_h # => Hash representation
118
+ # parsed.to_xml # => XML string
119
+ #
120
+ # @api public
121
class ParsedContent
  # Characters that are unsafe in XML text content and their entity replacements
  XML_ESCAPES = { '&' => '&amp;', '<' => '&lt;', '>' => '&gt;' }.freeze
  private_constant :XML_ESCAPES

  # Sections rendered by #to_xml, in output order
  XML_SECTIONS = %i[documentation examples optional].freeze
  private_constant :XML_SECTIONS

  # @return [Hash] the parsed sections hash
  attr_reader :sections

  # Initialize parsed content
  #
  # @param sections [Hash] hash containing parsed sections (:title, :description, :documentation, etc.)
  def initialize(sections)
    @sections = sections
  end

  # Get the project title
  #
  # @return [String, nil] the H1 title or nil if not present
  def title
    sections[:title]
  end

  # Get the project description
  #
  # @return [String, nil] the description blockquote or nil if not present
  def description
    sections[:description]
  end

  # Get documentation links
  #
  # @return [Array<Hash>] array of documentation links with :title, :url, and :description;
  #   empty when the section is missing or holds raw text instead of links
  def documentation_links
    links_for(:documentation)
  end

  # Get example links
  #
  # @return [Array<Hash>] array of example links with :title, :url, and :description;
  #   empty when the section is missing or holds raw text instead of links
  def example_links
    links_for(:examples)
  end

  # Get optional links
  #
  # @return [Array<Hash>] array of optional links with :title, :url, and :description;
  #   empty when the section is missing or holds raw text instead of links
  def optional_links
    links_for(:optional)
  end

  # Convert to hash representation
  #
  # @return [Hash] hash containing all parsed sections
  def to_h
    sections
  end

  # Convert to XML representation
  #
  # Generates an XML document with the title, description, and the documentation,
  # examples and optional sections. All text is XML-escaped. A section that holds raw
  # text rather than links is emitted as escaped text inside its element.
  #
  # @return [String] XML string representation
  def to_xml
    builder = ['<?xml version="1.0" encoding="UTF-8"?>', '<llms_context>']
    builder << " <title>#{xml_escape(title)}</title>" if title
    builder << " <description>#{xml_escape(description)}</description>" if description

    # Read the raw section values so raw-text sections are still rendered
    XML_SECTIONS.each { |name| add_xml_section(builder, name.to_s, sections[name] || []) }

    builder << '</llms_context>'
    builder.join("\n")
  end

  private

  # Returns a section's links, or an empty array when it is absent or raw text
  #
  # @param name [Symbol] section key
  # @return [Array<Hash>] link hashes
  def links_for(name)
    value = sections[name]
    value.is_a?(Array) ? value : []
  end

  # Escapes the characters that would break XML text content
  #
  # @param value [Object] value to render; converted with to_s
  # @return [String] escaped text
  def xml_escape(value)
    value.to_s.gsub(/[&<>]/, XML_ESCAPES)
  end

  # Appends section XML elements to builder array
  #
  # Handles both array of link hashes and raw string content; empty sections add nothing
  #
  # @param builder [Array<String>] XML lines accumulator
  # @param name [String] section name
  # @param links [Array<Hash>, String] section links or content
  def add_xml_section(builder, name, links)
    return if links.empty?

    builder << " <#{name}>"

    if links.is_a?(Array)
      links.each do |link|
        builder << ' <link>'
        builder << " <title>#{xml_escape(link[:title])}</title>"
        builder << " <url>#{xml_escape(link[:url])}</url>"
        builder << " <description>#{xml_escape(link[:description])}</description>"
        builder << ' </link>'
      end
    else
      builder << " #{xml_escape(links)}"
    end

    builder << " </#{name}>"
  end
end
223
+ end