llm-docs-builder 0.3.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/rspecs CHANGED
@@ -4,4 +4,5 @@
4
4
  set -e
5
5
 
6
6
  echo "Running all tests..."
7
- bundle exec rspec --format documentation
7
+ # Explicitly specify the spec directory to ensure all tests are discovered
8
+ bundle exec rspec spec/ --format documentation
@@ -295,8 +295,6 @@ module LlmDocsBuilder
295
295
  puts "Documentation Links: #{parsed.documentation_links.size}"
296
296
  puts "Example Links: #{parsed.example_links.size}" if parsed.respond_to?(:example_links)
297
297
  puts "Optional Links: #{parsed.optional_links.size}" if parsed.respond_to?(:optional_links)
298
- elsif parsed.respond_to?(:to_xml)
299
- puts parsed.to_xml
300
298
  end
301
299
  end
302
300
 
@@ -335,60 +333,13 @@ module LlmDocsBuilder
335
333
 
336
334
  begin
337
335
  result = comparator.compare
338
- display_comparison_results(result)
336
+ OutputFormatter.display_comparison_results(result)
339
337
  rescue LlmDocsBuilder::Errors::BaseError => e
340
338
  puts "Error during comparison: #{e.message}"
341
339
  exit 1
342
340
  end
343
341
  end
344
342
 
345
- # Display formatted comparison results
346
- #
347
- # @param result [Hash] comparison results from Comparator
348
- def display_comparison_results(result)
349
- puts ''
350
- puts '=' * 60
351
- puts 'Context Window Comparison'
352
- puts '=' * 60
353
- puts ''
354
- puts "Human version: #{format_bytes(result[:human_size])}"
355
- puts " Source: #{result[:human_source]}"
356
- puts ''
357
- puts "AI version: #{format_bytes(result[:ai_size])}"
358
- puts " Source: #{result[:ai_source]}"
359
- puts ''
360
- puts '-' * 60
361
-
362
- if result[:reduction_bytes].positive?
363
- puts "Reduction: #{format_bytes(result[:reduction_bytes])} (#{result[:reduction_percent]}%)"
364
- puts "Factor: #{result[:factor]}x smaller"
365
- elsif result[:reduction_bytes].negative?
366
- increase_bytes = result[:reduction_bytes].abs
367
- increase_percent = result[:reduction_percent].abs
368
- puts "Increase: #{format_bytes(increase_bytes)} (#{increase_percent}%)"
369
- puts "Factor: #{result[:factor]}x larger"
370
- else
371
- puts 'Same size'
372
- end
373
-
374
- puts '=' * 60
375
- puts ''
376
- end
377
-
378
- # Format bytes into human-readable string
379
- #
380
- # @param bytes [Integer] number of bytes
381
- # @return [String] formatted string with units
382
- def format_bytes(bytes)
383
- if bytes < 1024
384
- "#{bytes} bytes"
385
- elsif bytes < 1024 * 1024
386
- "#{(bytes / 1024.0).round(1)} KB"
387
- else
388
- "#{(bytes / (1024.0 * 1024)).round(2)} MB"
389
- end
390
- end
391
-
392
343
  # Validate llms.txt file format
393
344
  #
394
345
  # Checks if llms.txt file follows proper format with title, description, and documentation links.
@@ -62,6 +62,10 @@ module LlmDocsBuilder
62
62
  # - :reduction_bytes [Integer] bytes saved
63
63
  # - :reduction_percent [Integer] percentage reduction
64
64
  # - :factor [Float] compression factor
65
+ # - :human_tokens [Integer] estimated tokens in human version
66
+ # - :ai_tokens [Integer] estimated tokens in AI version
67
+ # - :token_reduction [Integer] estimated tokens saved
68
+ # - :token_reduction_percent [Integer] percentage of tokens saved
65
69
  # - :human_source [String] source description (URL or file)
66
70
  # - :ai_source [String] source description (URL or file)
67
71
  def compare
@@ -85,8 +89,8 @@ module LlmDocsBuilder
85
89
  ai_content = fetch_url(url, options[:ai_user_agent])
86
90
 
87
91
  calculate_results(
88
- human_content.bytesize,
89
- ai_content.bytesize,
92
+ human_content,
93
+ ai_content,
90
94
  "#{url} (User-Agent: human)",
91
95
  "#{url} (User-Agent: AI)"
92
96
  )
@@ -112,8 +116,8 @@ module LlmDocsBuilder
112
116
  ai_content = File.read(local_file)
113
117
 
114
118
  calculate_results(
115
- human_content.bytesize,
116
- ai_content.bytesize,
119
+ human_content,
120
+ ai_content,
117
121
  url,
118
122
  local_file
119
123
  )
@@ -205,12 +209,15 @@ module LlmDocsBuilder
205
209
 
206
210
  # Calculate comparison statistics
207
211
  #
208
- # @param human_size [Integer] size of human version in bytes
209
- # @param ai_size [Integer] size of AI version in bytes
212
+ # @param human_content [String] content of human version
213
+ # @param ai_content [String] content of AI version
210
214
  # @param human_source [String] description of human source
211
215
  # @param ai_source [String] description of AI source
212
216
  # @return [Hash] comparison results
213
- def calculate_results(human_size, ai_size, human_source, ai_source)
217
+ def calculate_results(human_content, ai_content, human_source, ai_source)
218
+ human_size = human_content.bytesize
219
+ ai_size = ai_content.bytesize
220
+
214
221
  reduction_bytes = human_size - ai_size
215
222
  reduction_percent = if human_size.positive?
216
223
  ((reduction_bytes.to_f / human_size) * 100).round
@@ -224,15 +231,31 @@ module LlmDocsBuilder
224
231
  Float::INFINITY
225
232
  end
226
233
 
234
+ # Estimate tokens using TokenEstimator
235
+ estimator = TokenEstimator.new
236
+ human_tokens = estimator.estimate(human_content)
237
+ ai_tokens = estimator.estimate(ai_content)
238
+ token_reduction = human_tokens - ai_tokens
239
+ token_reduction_percent = if human_tokens.positive?
240
+ ((token_reduction.to_f / human_tokens) * 100).round
241
+ else
242
+ 0
243
+ end
244
+
227
245
  {
228
246
  human_size: human_size,
229
247
  ai_size: ai_size,
230
248
  reduction_bytes: reduction_bytes,
231
249
  reduction_percent: reduction_percent,
232
250
  factor: factor,
251
+ human_tokens: human_tokens,
252
+ ai_tokens: ai_tokens,
253
+ token_reduction: token_reduction,
254
+ token_reduction_percent: token_reduction_percent,
233
255
  human_source: human_source,
234
256
  ai_source: ai_source
235
257
  }
236
258
  end
259
+
237
260
  end
238
261
  end
@@ -67,11 +67,68 @@ module LlmDocsBuilder
67
67
  else
68
68
  self['convert_urls'] || false
69
69
  end,
70
+ remove_comments: if options.key?(:remove_comments)
71
+ options[:remove_comments]
72
+ else
73
+ self['remove_comments'] || true
74
+ end,
75
+ normalize_whitespace: if options.key?(:normalize_whitespace)
76
+ options[:normalize_whitespace]
77
+ else
78
+ self['normalize_whitespace'] || true
79
+ end,
80
+ remove_badges: if options.key?(:remove_badges)
81
+ options[:remove_badges]
82
+ else
83
+ self['remove_badges'] || true
84
+ end,
85
+ remove_frontmatter: if options.key?(:remove_frontmatter)
86
+ options[:remove_frontmatter]
87
+ else
88
+ self['remove_frontmatter'] || true
89
+ end,
70
90
  verbose: options.key?(:verbose) ? options[:verbose] : (self['verbose'] || false),
71
91
  # Bulk transformation options
72
92
  suffix: options[:suffix] || self['suffix'] || '.llm',
73
93
  excludes: options[:excludes] || self['excludes'] || [],
74
- bulk: options.key?(:bulk) ? options[:bulk] : (self['bulk'] || false)
94
+ bulk: options.key?(:bulk) ? options[:bulk] : (self['bulk'] || false),
95
+ # New compression options
96
+ remove_code_examples: if options.key?(:remove_code_examples)
97
+ options[:remove_code_examples]
98
+ else
99
+ self['remove_code_examples'] || false
100
+ end,
101
+ remove_images: if options.key?(:remove_images)
102
+ options[:remove_images]
103
+ else
104
+ self['remove_images'] || false
105
+ end,
106
+ simplify_links: if options.key?(:simplify_links)
107
+ options[:simplify_links]
108
+ else
109
+ self['simplify_links'] || false
110
+ end,
111
+ remove_blockquotes: if options.key?(:remove_blockquotes)
112
+ options[:remove_blockquotes]
113
+ else
114
+ self['remove_blockquotes'] || false
115
+ end,
116
+ generate_toc: if options.key?(:generate_toc)
117
+ options[:generate_toc]
118
+ else
119
+ self['generate_toc'] || false
120
+ end,
121
+ custom_instruction: options[:custom_instruction] || self['custom_instruction'],
122
+ remove_stopwords: if options.key?(:remove_stopwords)
123
+ options[:remove_stopwords]
124
+ else
125
+ self['remove_stopwords'] || false
126
+ end,
127
+ remove_duplicates: if options.key?(:remove_duplicates)
128
+ options[:remove_duplicates]
129
+ else
130
+ self['remove_duplicates'] || false
131
+ end
75
132
  }
76
133
  end
77
134
 
@@ -3,9 +3,8 @@
3
3
  module LlmDocsBuilder
4
4
  # Transforms markdown files to be AI-friendly
5
5
  #
6
- # Processes individual markdown files to make them more suitable for LLM consumption by
7
- # expanding relative links to absolute URLs and converting HTML URLs to markdown-friendly
8
- # formats.
6
+ # Orchestrates a pipeline of specialized transformers to process markdown content.
7
+ # Each transformer is responsible for a specific aspect of the transformation.
9
8
  #
10
9
  # @example Transform with base URL
11
10
  # transformer = LlmDocsBuilder::MarkdownTransformer.new('README.md',
@@ -27,64 +26,94 @@ module LlmDocsBuilder
27
26
  # @param options [Hash] transformation options
28
27
  # @option options [String] :base_url base URL for expanding relative links
29
28
  # @option options [Boolean] :convert_urls convert HTML URLs to markdown format
29
+ # @option options [Boolean] :remove_comments remove HTML comments from markdown
30
+ # @option options [Boolean] :normalize_whitespace normalize excessive whitespace
31
+ # @option options [Boolean] :remove_badges remove badge/shield images
32
+ # @option options [Boolean] :remove_frontmatter remove YAML/TOML frontmatter
33
+ # @option options [Boolean] :remove_code_examples remove code blocks and inline code
34
+ # @option options [Boolean] :remove_images remove image syntax
35
+ # @option options [Boolean] :simplify_links shorten verbose link text
36
+ # @option options [Boolean] :remove_blockquotes remove blockquote formatting
37
+ # @option options [Boolean] :generate_toc generate table of contents at the top
38
+ # @option options [String] :custom_instruction custom instruction text to inject at top
39
+ # @option options [Boolean] :remove_stopwords remove common stopwords (aggressive)
40
+ # @option options [Boolean] :remove_duplicates remove duplicate paragraphs
30
41
  def initialize(file_path, options = {})
31
42
  @file_path = file_path
32
43
  @options = options
33
44
  end
34
45
 
35
- # Transform markdown content to be AI-friendly
46
+ # Transform markdown content using a pipeline of transformers
36
47
  #
37
- # Applies transformations to make the markdown more suitable for LLM processing:
38
- # - Expands relative links to absolute URLs (if base_url provided)
39
- # - Converts HTML URLs to markdown format (if convert_urls enabled)
48
+ # Processes content through specialized transformers in order:
49
+ # 1. ContentCleanupTransformer - Removes unwanted elements
50
+ # 2. LinkTransformer - Processes links
51
+ # 3. TextCompressor - Advanced compression (if enabled)
52
+ # 4. EnhancementTransformer - Adds TOC and instructions
53
+ # 5. WhitespaceTransformer - Normalizes whitespace
40
54
  #
41
55
  # @return [String] transformed markdown content
42
56
  def transform
43
57
  content = File.read(file_path)
44
58
 
45
- content = expand_relative_links(content) if options[:base_url]
46
- content = convert_html_urls(content) if options[:convert_urls]
59
+ # Build and execute transformation pipeline
60
+ content = cleanup_transformer.transform(content, options)
61
+ content = link_transformer.transform(content, options)
62
+ content = compress_content(content) if should_compress?
63
+ content = enhancement_transformer.transform(content, options)
64
+ content = whitespace_transformer.transform(content, options)
47
65
 
48
66
  content
49
67
  end
50
68
 
51
69
  private
52
70
 
53
- # Expand relative links to absolute URLs
71
+ # Get content cleanup transformer instance
54
72
  #
55
- # Converts markdown links like `[text](./path.md)` to `[text](https://base.url/path.md)`.
56
- # Leaves absolute URLs and anchors unchanged.
73
+ # @return [Transformers::ContentCleanupTransformer]
74
+ def cleanup_transformer
75
+ @cleanup_transformer ||= Transformers::ContentCleanupTransformer.new
76
+ end
77
+
78
+ # Get link transformer instance
57
79
  #
58
- # @param content [String] markdown content to process
59
- # @return [String] content with expanded links
60
- def expand_relative_links(content)
61
- base_url = options[:base_url]
80
+ # @return [Transformers::LinkTransformer]
81
+ def link_transformer
82
+ @link_transformer ||= Transformers::LinkTransformer.new
83
+ end
62
84
 
63
- content.gsub(/\[([^\]]+)\]\(([^)]+)\)/) do |match|
64
- text = ::Regexp.last_match(1)
65
- url = ::Regexp.last_match(2)
85
+ # Get enhancement transformer instance
86
+ #
87
+ # @return [Transformers::EnhancementTransformer]
88
+ def enhancement_transformer
89
+ @enhancement_transformer ||= Transformers::EnhancementTransformer.new
90
+ end
66
91
 
67
- if url.start_with?('http://', 'https://', '//', '#')
68
- match # Already absolute or anchor
69
- else
70
- # Clean up relative path
71
- clean_url = url.gsub(%r{^\./}, '') # Remove leading './'
72
- expanded_url = File.join(base_url, clean_url)
73
- "[#{text}](#{expanded_url})"
74
- end
75
- end
92
+ # Get whitespace transformer instance
93
+ #
94
+ # @return [Transformers::WhitespaceTransformer]
95
+ def whitespace_transformer
96
+ @whitespace_transformer ||= Transformers::WhitespaceTransformer.new
76
97
  end
77
98
 
78
- # Convert HTML URLs to markdown-friendly format
99
+ # Check if content compression should be applied
79
100
  #
80
- # Changes URLs ending in .html or .htm to .md for better LLM understanding
101
+ # @return [Boolean]
102
+ def should_compress?
103
+ options[:remove_stopwords] || options[:remove_duplicates]
104
+ end
105
+
106
+ # Compress content using TextCompressor
81
107
  #
82
- # @param content [String] markdown content to process
83
- # @return [String] content with converted URLs
84
- def convert_html_urls(content)
85
- content.gsub(%r{https?://[^\s<>]+\.html?(?=[)\s]|$)}) do |url|
86
- url.sub(/\.html?$/, '.md')
87
- end
108
+ # @param content [String] content to compress
109
+ # @return [String] compressed content
110
+ def compress_content(content)
111
+ compressor = TextCompressor.new
112
+ compression_methods = {
113
+ remove_stopwords: options[:remove_stopwords],
114
+ remove_duplicates: options[:remove_duplicates]
115
+ }
116
+ compressor.compress(content, compression_methods)
88
117
  end
89
118
  end
90
119
  end
@@ -0,0 +1,93 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LlmDocsBuilder
4
+ # Formats output for CLI display
5
+ #
6
+ # Provides formatting utilities for displaying comparison results,
7
+ # byte sizes, and numbers in a user-friendly way.
8
+ #
9
+ # @api private
10
+ class OutputFormatter
11
+ # Format bytes into human-readable string
12
+ #
13
+ # @param bytes [Integer] number of bytes
14
+ # @return [String] formatted string with units (bytes/KB/MB)
15
+ #
16
+ # @example
17
+ # OutputFormatter.format_bytes(1024) #=> "1.0 KB"
18
+ # OutputFormatter.format_bytes(1048576) #=> "1.0 MB"
19
+ def self.format_bytes(bytes)
20
+ if bytes < 1024
21
+ "#{bytes} bytes"
22
+ elsif bytes < 1024 * 1024
23
+ "#{(bytes / 1024.0).round(1)} KB"
24
+ else
25
+ "#{(bytes / (1024.0 * 1024)).round(2)} MB"
26
+ end
27
+ end
28
+
29
+ # Format number with comma separators for readability
30
+ #
31
+ # @param number [Integer] number to format
32
+ # @return [String] formatted number with commas
33
+ #
34
+ # @example
35
+ # OutputFormatter.format_number(1234567) #=> "1,234,567"
36
+ def self.format_number(number)
37
+ number.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
38
+ end
39
+
40
+ # Display formatted comparison results
41
+ #
42
+ # @param result [Hash] comparison results from Comparator
43
+ def self.display_comparison_results(result)
44
+ puts ''
45
+ puts '=' * 60
46
+ puts 'Context Window Comparison'
47
+ puts '=' * 60
48
+ puts ''
49
+ puts "Human version: #{format_bytes(result[:human_size])} (~#{format_number(result[:human_tokens])} tokens)"
50
+ puts " Source: #{result[:human_source]}"
51
+ puts ''
52
+ puts "AI version: #{format_bytes(result[:ai_size])} (~#{format_number(result[:ai_tokens])} tokens)"
53
+ puts " Source: #{result[:ai_source]}"
54
+ puts ''
55
+ puts '-' * 60
56
+
57
+ if result[:reduction_bytes].positive?
58
+ display_reduction(result)
59
+ elsif result[:reduction_bytes].negative?
60
+ display_increase(result)
61
+ else
62
+ puts 'Same size'
63
+ end
64
+
65
+ puts '=' * 60
66
+ puts ''
67
+ end
68
+
69
+ # Display reduction statistics
70
+ #
71
+ # @param result [Hash] comparison results
72
+ # @api private
73
+ def self.display_reduction(result)
74
+ puts "Reduction: #{format_bytes(result[:reduction_bytes])} (#{result[:reduction_percent]}%)"
75
+ puts "Token savings: #{format_number(result[:token_reduction])} tokens (#{result[:token_reduction_percent]}%)"
76
+ puts "Factor: #{result[:factor]}x smaller"
77
+ end
78
+
79
+ # Display increase statistics
80
+ #
81
+ # @param result [Hash] comparison results
82
+ # @api private
83
+ def self.display_increase(result)
84
+ increase_bytes = result[:reduction_bytes].abs
85
+ increase_percent = result[:reduction_percent].abs
86
+ token_increase = result[:token_reduction].abs
87
+ token_increase_percent = result[:token_reduction_percent].abs
88
+ puts "Increase: #{format_bytes(increase_bytes)} (#{increase_percent}%)"
89
+ puts "Token increase: #{format_number(token_increase)} tokens (#{token_increase_percent}%)"
90
+ puts "Factor: #{result[:factor]}x larger"
91
+ end
92
+ end
93
+ end
@@ -108,14 +108,12 @@ module LlmDocsBuilder
108
108
  # Represents parsed llms.txt content with structured access to sections
109
109
  #
110
110
  # Provides convenient access to parsed llms.txt sections including title,
111
- # description, and link collections. Can be converted to Hash or XML formats.
111
+ # description, and link collections.
112
112
  #
113
113
  # @example Access parsed content
114
114
  # parsed.title # => "My Project"
115
115
  # parsed.description # => "A description"
116
116
  # parsed.documentation_links # => [{title: "...", url: "...", description: "..."}]
117
- # parsed.to_h # => Hash representation
118
- # parsed.to_xml # => XML string
119
117
  #
120
118
  # @api public
121
119
  class ParsedContent
@@ -163,61 +161,5 @@ module LlmDocsBuilder
163
161
  def optional_links
164
162
  sections[:optional] || []
165
163
  end
166
-
167
- # Convert to hash representation
168
- #
169
- # @return [Hash] hash containing all parsed sections
170
- def to_h
171
- sections
172
- end
173
-
174
- # Convert to XML representation
175
- #
176
- # Generates an XML document with all parsed sections and links.
177
- #
178
- # @return [String] XML string representation
179
- def to_xml
180
- builder = []
181
- builder << '<?xml version="1.0" encoding="UTF-8"?>'
182
- builder << '<llms_context>'
183
- builder << " <title>#{title}</title>" if title
184
- builder << " <description>#{description}</description>" if description
185
-
186
- add_xml_section(builder, 'documentation', documentation_links)
187
- add_xml_section(builder, 'examples', example_links)
188
- add_xml_section(builder, 'optional', optional_links) if sections[:optional]
189
-
190
- builder << '</llms_context>'
191
- builder.join("\n")
192
- end
193
-
194
- private
195
-
196
- # Appends section XML elements to builder array
197
- #
198
- # Handles both array of link hashes and raw string content
199
- #
200
- # @param builder [Array<String>] XML lines accumulator
201
- # @param name [String] section name
202
- # @param links [Array<Hash>, String] section links or content
203
- def add_xml_section(builder, name, links)
204
- return if links.empty?
205
-
206
- builder << " <#{name}>"
207
-
208
- if links.is_a?(Array)
209
- links.each do |link|
210
- builder << ' <link>'
211
- builder << " <title>#{link[:title]}</title>"
212
- builder << " <url>#{link[:url]}</url>"
213
- builder << " <description>#{link[:description]}</description>"
214
- builder << ' </link>'
215
- end
216
- else
217
- builder << " #{links}"
218
- end
219
-
220
- builder << " </#{name}>"
221
- end
222
164
  end
223
165
  end