llm-docs-builder 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec +3 -0
- data/CHANGELOG.md +59 -0
- data/Gemfile.lock +1 -1
- data/README.md +241 -541
- data/bin/rspecs +2 -1
- data/lib/llm_docs_builder/cli.rb +1 -62
- data/lib/llm_docs_builder/comparator.rb +4 -16
- data/lib/llm_docs_builder/config.rb +74 -5
- data/lib/llm_docs_builder/generator.rb +67 -8
- data/lib/llm_docs_builder/markdown_transformer.rb +61 -126
- data/lib/llm_docs_builder/output_formatter.rb +93 -0
- data/lib/llm_docs_builder/parser.rb +1 -59
- data/lib/llm_docs_builder/text_compressor.rb +164 -0
- data/lib/llm_docs_builder/token_estimator.rb +52 -0
- data/lib/llm_docs_builder/transformers/base_transformer.rb +30 -0
- data/lib/llm_docs_builder/transformers/content_cleanup_transformer.rb +106 -0
- data/lib/llm_docs_builder/transformers/enhancement_transformer.rb +95 -0
- data/lib/llm_docs_builder/transformers/heading_transformer.rb +72 -0
- data/lib/llm_docs_builder/transformers/link_transformer.rb +84 -0
- data/lib/llm_docs_builder/transformers/whitespace_transformer.rb +44 -0
- data/lib/llm_docs_builder/version.rb +1 -1
- metadata +11 -3
- data/CLAUDE.md +0 -178
- data/llm-docs-builder.yml +0 -7
data/bin/rspecs
CHANGED
data/lib/llm_docs_builder/cli.rb
CHANGED
@@ -295,8 +295,6 @@ module LlmDocsBuilder
|
|
295
295
|
puts "Documentation Links: #{parsed.documentation_links.size}"
|
296
296
|
puts "Example Links: #{parsed.example_links.size}" if parsed.respond_to?(:example_links)
|
297
297
|
puts "Optional Links: #{parsed.optional_links.size}" if parsed.respond_to?(:optional_links)
|
298
|
-
elsif parsed.respond_to?(:to_xml)
|
299
|
-
puts parsed.to_xml
|
300
298
|
end
|
301
299
|
end
|
302
300
|
|
@@ -335,72 +333,13 @@ module LlmDocsBuilder
|
|
335
333
|
|
336
334
|
begin
|
337
335
|
result = comparator.compare
|
338
|
-
display_comparison_results(result)
|
336
|
+
OutputFormatter.display_comparison_results(result)
|
339
337
|
rescue LlmDocsBuilder::Errors::BaseError => e
|
340
338
|
puts "Error during comparison: #{e.message}"
|
341
339
|
exit 1
|
342
340
|
end
|
343
341
|
end
|
344
342
|
|
345
|
-
# Display formatted comparison results
|
346
|
-
#
|
347
|
-
# @param result [Hash] comparison results from Comparator
|
348
|
-
def display_comparison_results(result)
|
349
|
-
puts ''
|
350
|
-
puts '=' * 60
|
351
|
-
puts 'Context Window Comparison'
|
352
|
-
puts '=' * 60
|
353
|
-
puts ''
|
354
|
-
puts "Human version: #{format_bytes(result[:human_size])} (~#{format_number(result[:human_tokens])} tokens)"
|
355
|
-
puts " Source: #{result[:human_source]}"
|
356
|
-
puts ''
|
357
|
-
puts "AI version: #{format_bytes(result[:ai_size])} (~#{format_number(result[:ai_tokens])} tokens)"
|
358
|
-
puts " Source: #{result[:ai_source]}"
|
359
|
-
puts ''
|
360
|
-
puts '-' * 60
|
361
|
-
|
362
|
-
if result[:reduction_bytes].positive?
|
363
|
-
puts "Reduction: #{format_bytes(result[:reduction_bytes])} (#{result[:reduction_percent]}%)"
|
364
|
-
puts "Token savings: #{format_number(result[:token_reduction])} tokens (#{result[:token_reduction_percent]}%)"
|
365
|
-
puts "Factor: #{result[:factor]}x smaller"
|
366
|
-
elsif result[:reduction_bytes].negative?
|
367
|
-
increase_bytes = result[:reduction_bytes].abs
|
368
|
-
increase_percent = result[:reduction_percent].abs
|
369
|
-
token_increase = result[:token_reduction].abs
|
370
|
-
token_increase_percent = result[:token_reduction_percent].abs
|
371
|
-
puts "Increase: #{format_bytes(increase_bytes)} (#{increase_percent}%)"
|
372
|
-
puts "Token increase: #{format_number(token_increase)} tokens (#{token_increase_percent}%)"
|
373
|
-
puts "Factor: #{result[:factor]}x larger"
|
374
|
-
else
|
375
|
-
puts 'Same size'
|
376
|
-
end
|
377
|
-
|
378
|
-
puts '=' * 60
|
379
|
-
puts ''
|
380
|
-
end
|
381
|
-
|
382
|
-
# Format bytes into human-readable string
|
383
|
-
#
|
384
|
-
# @param bytes [Integer] number of bytes
|
385
|
-
# @return [String] formatted string with units
|
386
|
-
def format_bytes(bytes)
|
387
|
-
if bytes < 1024
|
388
|
-
"#{bytes} bytes"
|
389
|
-
elsif bytes < 1024 * 1024
|
390
|
-
"#{(bytes / 1024.0).round(1)} KB"
|
391
|
-
else
|
392
|
-
"#{(bytes / (1024.0 * 1024)).round(2)} MB"
|
393
|
-
end
|
394
|
-
end
|
395
|
-
|
396
|
-
# Format number with comma separators for readability
|
397
|
-
#
|
398
|
-
# @param number [Integer] number to format
|
399
|
-
# @return [String] formatted number with commas
|
400
|
-
def format_number(number)
|
401
|
-
number.to_s.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
|
402
|
-
end
|
403
|
-
|
404
343
|
# Validate llms.txt file format
|
405
344
|
#
|
406
345
|
# Checks if llms.txt file follows proper format with title, description, and documentation links.
|
@@ -231,9 +231,10 @@ module LlmDocsBuilder
|
|
231
231
|
Float::INFINITY
|
232
232
|
end
|
233
233
|
|
234
|
-
# Estimate tokens
|
235
|
-
|
236
|
-
|
234
|
+
# Estimate tokens using TokenEstimator
|
235
|
+
estimator = TokenEstimator.new
|
236
|
+
human_tokens = estimator.estimate(human_content)
|
237
|
+
ai_tokens = estimator.estimate(ai_content)
|
237
238
|
token_reduction = human_tokens - ai_tokens
|
238
239
|
token_reduction_percent = if human_tokens.positive?
|
239
240
|
((token_reduction.to_f / human_tokens) * 100).round
|
@@ -256,18 +257,5 @@ module LlmDocsBuilder
|
|
256
257
|
}
|
257
258
|
end
|
258
259
|
|
259
|
-
# Estimate token count using character-based approximation
|
260
|
-
#
|
261
|
-
# Uses the common heuristic that ~4 characters equals 1 token for English text.
|
262
|
-
# This provides reasonable estimates for documentation content without requiring
|
263
|
-
# external tokenizer dependencies.
|
264
|
-
#
|
265
|
-
# @param content [String] text content to estimate tokens for
|
266
|
-
# @return [Integer] estimated number of tokens
|
267
|
-
def estimate_tokens(content)
|
268
|
-
# Use 4 characters per token as a reasonable approximation
|
269
|
-
# This is a common heuristic for English text and works well for documentation
|
270
|
-
(content.length / 4.0).round
|
271
|
-
end
|
272
260
|
end
|
273
261
|
end
|
@@ -70,28 +70,97 @@ module LlmDocsBuilder
|
|
70
70
|
remove_comments: if options.key?(:remove_comments)
|
71
71
|
options[:remove_comments]
|
72
72
|
else
|
73
|
-
self['remove_comments'] ||
|
73
|
+
self['remove_comments'] || true
|
74
74
|
end,
|
75
75
|
normalize_whitespace: if options.key?(:normalize_whitespace)
|
76
76
|
options[:normalize_whitespace]
|
77
77
|
else
|
78
|
-
self['normalize_whitespace'] ||
|
78
|
+
self['normalize_whitespace'] || true
|
79
79
|
end,
|
80
80
|
remove_badges: if options.key?(:remove_badges)
|
81
81
|
options[:remove_badges]
|
82
82
|
else
|
83
|
-
self['remove_badges'] ||
|
83
|
+
self['remove_badges'] || true
|
84
84
|
end,
|
85
85
|
remove_frontmatter: if options.key?(:remove_frontmatter)
|
86
86
|
options[:remove_frontmatter]
|
87
87
|
else
|
88
|
-
self['remove_frontmatter'] ||
|
88
|
+
self['remove_frontmatter'] || true
|
89
89
|
end,
|
90
90
|
verbose: options.key?(:verbose) ? options[:verbose] : (self['verbose'] || false),
|
91
91
|
# Bulk transformation options
|
92
92
|
suffix: options[:suffix] || self['suffix'] || '.llm',
|
93
93
|
excludes: options[:excludes] || self['excludes'] || [],
|
94
|
-
bulk: options.key?(:bulk) ? options[:bulk] : (self['bulk'] || false)
|
94
|
+
bulk: options.key?(:bulk) ? options[:bulk] : (self['bulk'] || false),
|
95
|
+
# New compression options
|
96
|
+
remove_code_examples: if options.key?(:remove_code_examples)
|
97
|
+
options[:remove_code_examples]
|
98
|
+
else
|
99
|
+
self['remove_code_examples'] || false
|
100
|
+
end,
|
101
|
+
remove_images: if options.key?(:remove_images)
|
102
|
+
options[:remove_images]
|
103
|
+
else
|
104
|
+
self['remove_images'] || false
|
105
|
+
end,
|
106
|
+
simplify_links: if options.key?(:simplify_links)
|
107
|
+
options[:simplify_links]
|
108
|
+
else
|
109
|
+
self['simplify_links'] || false
|
110
|
+
end,
|
111
|
+
remove_blockquotes: if options.key?(:remove_blockquotes)
|
112
|
+
options[:remove_blockquotes]
|
113
|
+
else
|
114
|
+
self['remove_blockquotes'] || false
|
115
|
+
end,
|
116
|
+
generate_toc: if options.key?(:generate_toc)
|
117
|
+
options[:generate_toc]
|
118
|
+
else
|
119
|
+
self['generate_toc'] || false
|
120
|
+
end,
|
121
|
+
custom_instruction: options[:custom_instruction] || self['custom_instruction'],
|
122
|
+
remove_stopwords: if options.key?(:remove_stopwords)
|
123
|
+
options[:remove_stopwords]
|
124
|
+
else
|
125
|
+
self['remove_stopwords'] || false
|
126
|
+
end,
|
127
|
+
remove_duplicates: if options.key?(:remove_duplicates)
|
128
|
+
options[:remove_duplicates]
|
129
|
+
else
|
130
|
+
self['remove_duplicates'] || false
|
131
|
+
end,
|
132
|
+
# New RAG enhancement options
|
133
|
+
normalize_headings: if options.key?(:normalize_headings)
|
134
|
+
options[:normalize_headings]
|
135
|
+
else
|
136
|
+
self['normalize_headings'] || false
|
137
|
+
end,
|
138
|
+
heading_separator: options[:heading_separator] || self['heading_separator'] || ' / ',
|
139
|
+
include_metadata: if options.key?(:include_metadata)
|
140
|
+
options[:include_metadata]
|
141
|
+
else
|
142
|
+
self['include_metadata'] || false
|
143
|
+
end,
|
144
|
+
include_tokens: if options.key?(:include_tokens)
|
145
|
+
options[:include_tokens]
|
146
|
+
else
|
147
|
+
self['include_tokens'] || false
|
148
|
+
end,
|
149
|
+
include_timestamps: if options.key?(:include_timestamps)
|
150
|
+
options[:include_timestamps]
|
151
|
+
else
|
152
|
+
self['include_timestamps'] || false
|
153
|
+
end,
|
154
|
+
include_priority: if options.key?(:include_priority)
|
155
|
+
options[:include_priority]
|
156
|
+
else
|
157
|
+
self['include_priority'] || false
|
158
|
+
end,
|
159
|
+
calculate_compression: if options.key?(:calculate_compression)
|
160
|
+
options[:calculate_compression]
|
161
|
+
else
|
162
|
+
self['calculate_compression'] || false
|
163
|
+
end
|
95
164
|
}
|
96
165
|
end
|
97
166
|
|
@@ -88,10 +88,10 @@ module LlmDocsBuilder
|
|
88
88
|
|
89
89
|
# Extracts metadata from a documentation file
|
90
90
|
#
|
91
|
-
# Analyzes file content to extract title, description, and
|
91
|
+
# Analyzes file content to extract title, description, priority, and optional metadata
|
92
92
|
#
|
93
93
|
# @param file_path [String] path to file to analyze
|
94
|
-
# @return [Hash] file metadata with :path, :title, :description, :priority
|
94
|
+
# @return [Hash] file metadata with :path, :title, :description, :priority, :tokens, :updated
|
95
95
|
def analyze_file(file_path)
|
96
96
|
# Handle single file case differently
|
97
97
|
relative_path = if File.file?(docs_path)
|
@@ -102,12 +102,28 @@ module LlmDocsBuilder
|
|
102
102
|
|
103
103
|
content = File.read(file_path)
|
104
104
|
|
105
|
-
{
|
105
|
+
metadata = {
|
106
106
|
path: relative_path,
|
107
107
|
title: extract_title(content, file_path),
|
108
108
|
description: extract_description(content),
|
109
109
|
priority: calculate_priority(file_path)
|
110
110
|
}
|
111
|
+
|
112
|
+
# Add optional enhanced metadata
|
113
|
+
if options[:include_metadata]
|
114
|
+
metadata[:tokens] = TokenEstimator.estimate(content) if options[:include_tokens]
|
115
|
+
metadata[:updated] = File.mtime(file_path).strftime('%Y-%m-%d') if options[:include_timestamps]
|
116
|
+
|
117
|
+
# Calculate compression ratio if transformation is enabled
|
118
|
+
if options[:calculate_compression]
|
119
|
+
transformed = apply_transformations(content, file_path)
|
120
|
+
original_tokens = TokenEstimator.estimate(content)
|
121
|
+
transformed_tokens = TokenEstimator.estimate(transformed)
|
122
|
+
metadata[:compression] = (transformed_tokens.to_f / original_tokens).round(2)
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
metadata
|
111
127
|
end
|
112
128
|
|
113
129
|
# Extracts title from file content or generates from filename
|
@@ -164,6 +180,21 @@ module LlmDocsBuilder
|
|
164
180
|
7 # default priority
|
165
181
|
end
|
166
182
|
|
183
|
+
# Produces the transformed version of a file so its compression ratio can be calculated
#
# @param content [String] original content, used only as the fallback result
# @param file_path [String] path to file
# @return [String] transformed content, or the original content if transformation raises
def apply_transformations(content, file_path)
  begin
    # The transformer reads the file itself from file_path, so +content+ is not passed in
    MarkdownTransformer.new(file_path, options).transform
  rescue StandardError
    # Best effort: a failed transformation falls back to the untouched content
    content
  end
end
|
197
|
+
|
167
198
|
# Constructs llms.txt content from analyzed documentation files
|
168
199
|
#
|
169
200
|
# Combines title, description, and documentation links into formatted output
|
@@ -186,11 +217,24 @@ module LlmDocsBuilder
|
|
186
217
|
|
187
218
|
docs.each do |doc|
|
188
219
|
url = build_url(doc[:path])
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
220
|
+
line = if doc[:description] && !doc[:description].empty?
|
221
|
+
"- [#{doc[:title]}](#{url}): #{doc[:description]}"
|
222
|
+
else
|
223
|
+
"- [#{doc[:title]}](#{url})"
|
224
|
+
end
|
225
|
+
|
226
|
+
# Append metadata if enabled
|
227
|
+
if options[:include_metadata]
|
228
|
+
metadata_parts = []
|
229
|
+
metadata_parts << "tokens:#{doc[:tokens]}" if doc[:tokens]
|
230
|
+
metadata_parts << "compression:#{doc[:compression]}" if doc[:compression]
|
231
|
+
metadata_parts << "updated:#{doc[:updated]}" if doc[:updated]
|
232
|
+
metadata_parts << priority_label(doc[:priority]) if options[:include_priority]
|
233
|
+
|
234
|
+
line += " #{metadata_parts.join(' ')}" unless metadata_parts.empty?
|
235
|
+
end
|
236
|
+
|
237
|
+
content << line
|
194
238
|
end
|
195
239
|
end
|
196
240
|
|
@@ -230,5 +274,20 @@ module LlmDocsBuilder
|
|
230
274
|
path
|
231
275
|
end
|
232
276
|
end
|
277
|
+
|
278
|
+
# Converts numeric priority to human-readable label
#
# Values 1-2 map to high and 3-5 to medium. Everything else, including
# values outside the documented 1-7 range and nil, maps to low so that
# callers always receive a String and never append nil to the metadata
# they build from this label.
#
# @param priority [Integer] priority value (1-7)
# @return [String] priority label ('priority:high', 'priority:medium' or 'priority:low')
def priority_label(priority)
  case priority
  when 1..2 then 'priority:high'
  when 3..5 then 'priority:medium'
  else 'priority:low'
  end
end
|
233
292
|
end
|
234
293
|
end
|
@@ -3,9 +3,8 @@
|
|
3
3
|
module LlmDocsBuilder
|
4
4
|
# Transforms markdown files to be AI-friendly
|
5
5
|
#
|
6
|
-
#
|
7
|
-
#
|
8
|
-
# formats.
|
6
|
+
# Orchestrates a pipeline of specialized transformers to process markdown content.
|
7
|
+
# Each transformer is responsible for a specific aspect of the transformation.
|
9
8
|
#
|
10
9
|
# @example Transform with base URL
|
11
10
|
# transformer = LlmDocsBuilder::MarkdownTransformer.new('README.md',
|
@@ -31,163 +30,99 @@ module LlmDocsBuilder
|
|
31
30
|
# @option options [Boolean] :normalize_whitespace normalize excessive whitespace
|
32
31
|
# @option options [Boolean] :remove_badges remove badge/shield images
|
33
32
|
# @option options [Boolean] :remove_frontmatter remove YAML/TOML frontmatter
|
33
|
+
# @option options [Boolean] :remove_code_examples remove code blocks and inline code
|
34
|
+
# @option options [Boolean] :remove_images remove image syntax
|
35
|
+
# @option options [Boolean] :simplify_links shorten verbose link text
|
36
|
+
# @option options [Boolean] :remove_blockquotes remove blockquote formatting
|
37
|
+
# @option options [Boolean] :generate_toc generate table of contents at the top
|
38
|
+
# @option options [String] :custom_instruction custom instruction text to inject at top
|
39
|
+
# @option options [Boolean] :remove_stopwords remove common stopwords (aggressive)
|
40
|
+
# @option options [Boolean] :remove_duplicates remove duplicate paragraphs
|
34
41
|
def initialize(file_path, options = {})
|
35
42
|
@file_path = file_path
|
36
43
|
@options = options
|
37
44
|
end
|
38
45
|
|
39
|
-
# Transform markdown content
|
46
|
+
# Transform markdown content using a pipeline of transformers
|
40
47
|
#
|
41
|
-
#
|
42
|
-
# - Removes
|
43
|
-
# -
|
44
|
-
#
|
45
|
-
# -
|
46
|
-
#
|
47
|
-
# - Normalizes
|
48
|
+
# Processes content through specialized transformers in order:
|
49
|
+
# 1. ContentCleanupTransformer - Removes unwanted elements
|
50
|
+
# 2. LinkTransformer - Processes links
|
51
|
+
# 3. HeadingTransformer - Normalizes heading hierarchy (if enabled)
|
52
|
+
# 4. TextCompressor - Advanced compression (if enabled)
|
53
|
+
# 5. EnhancementTransformer - Adds TOC and instructions
|
54
|
+
# 6. WhitespaceTransformer - Normalizes whitespace
|
48
55
|
#
|
49
56
|
# @return [String] transformed markdown content
|
50
57
|
def transform
|
51
58
|
content = File.read(file_path)
|
52
59
|
|
53
|
-
#
|
54
|
-
content =
|
55
|
-
|
56
|
-
|
57
|
-
content =
|
58
|
-
content =
|
59
|
-
|
60
|
-
# Content cleanup
|
61
|
-
content = remove_comments(content) if options[:remove_comments]
|
62
|
-
content = remove_badges(content) if options[:remove_badges]
|
63
|
-
|
64
|
-
# Whitespace normalization last (after all other transformations)
|
65
|
-
content = normalize_whitespace(content) if options[:normalize_whitespace]
|
60
|
+
# Build and execute transformation pipeline
|
61
|
+
content = cleanup_transformer.transform(content, options)
|
62
|
+
content = link_transformer.transform(content, options)
|
63
|
+
content = heading_transformer.transform(content, options)
|
64
|
+
content = compress_content(content) if should_compress?
|
65
|
+
content = enhancement_transformer.transform(content, options)
|
66
|
+
content = whitespace_transformer.transform(content, options)
|
66
67
|
|
67
68
|
content
|
68
69
|
end
|
69
70
|
|
70
71
|
private
|
71
72
|
|
72
|
-
#
|
73
|
+
# Get content cleanup transformer instance
|
73
74
|
#
|
74
|
-
#
|
75
|
-
|
76
|
-
|
77
|
-
# @param content [String] markdown content to process
|
78
|
-
# @return [String] content with expanded links
|
79
|
-
def expand_relative_links(content)
|
80
|
-
base_url = options[:base_url]
|
81
|
-
|
82
|
-
content.gsub(/\[([^\]]+)\]\(([^)]+)\)/) do |match|
|
83
|
-
text = ::Regexp.last_match(1)
|
84
|
-
url = ::Regexp.last_match(2)
|
85
|
-
|
86
|
-
if url.start_with?('http://', 'https://', '//', '#')
|
87
|
-
match # Already absolute or anchor
|
88
|
-
else
|
89
|
-
# Clean up relative path
|
90
|
-
clean_url = url.gsub(%r{^\./}, '') # Remove leading './'
|
91
|
-
expanded_url = File.join(base_url, clean_url)
|
92
|
-
"[#{text}](#{expanded_url})"
|
93
|
-
end
|
94
|
-
end
|
75
|
+
# @return [Transformers::ContentCleanupTransformer]
|
76
|
+
def cleanup_transformer
|
77
|
+
@cleanup_transformer ||= Transformers::ContentCleanupTransformer.new
|
95
78
|
end
|
96
79
|
|
97
|
-
#
|
80
|
+
# Get link transformer instance
|
98
81
|
#
|
99
|
-
#
|
100
|
-
|
101
|
-
|
102
|
-
# @return [String] content with converted URLs
|
103
|
-
def convert_html_urls(content)
|
104
|
-
content.gsub(%r{https?://[^\s<>]+\.html?(?=[)\s]|$)}) do |url|
|
105
|
-
url.sub(/\.html?$/, '.md')
|
106
|
-
end
|
82
|
+
# @return [Transformers::LinkTransformer]
|
83
|
+
def link_transformer
|
84
|
+
@link_transformer ||= Transformers::LinkTransformer.new
|
107
85
|
end
|
108
86
|
|
109
|
-
#
|
110
|
-
#
|
111
|
-
# Strips out HTML comments (<!-- ... -->) which are typically metadata for developers
|
112
|
-
# and not relevant for LLM consumption. This reduces token usage and improves clarity.
|
113
|
-
#
|
114
|
-
# Handles:
|
115
|
-
# - Single-line comments: <!-- comment -->
|
116
|
-
# - Multi-line comments spanning multiple lines
|
117
|
-
# - Multiple comments in the same content
|
87
|
+
# Get heading transformer instance
|
118
88
|
#
|
119
|
-
# @
|
120
|
-
|
121
|
-
|
122
|
-
# Remove HTML comments (single and multi-line)
|
123
|
-
# The .*? makes it non-greedy so it stops at the first -->
|
124
|
-
content.gsub(/<!--.*?-->/m, '')
|
89
|
+
# @return [Transformers::HeadingTransformer]
|
90
|
+
def heading_transformer
|
91
|
+
@heading_transformer ||= Transformers::HeadingTransformer.new
|
125
92
|
end
|
126
93
|
|
127
|
-
#
|
94
|
+
# Get enhancement transformer instance
|
128
95
|
#
|
129
|
-
#
|
130
|
-
|
131
|
-
|
132
|
-
# Recognizes common patterns:
|
133
|
-
# - [](link) (linked badges)
|
134
|
-
# -  (unlinked badges)
|
135
|
-
# - Common badge domains: shields.io, badge.fury.io, travis-ci.org, etc.
|
136
|
-
#
|
137
|
-
# @param content [String] markdown content to process
|
138
|
-
# @return [String] content with badges removed
|
139
|
-
def remove_badges(content)
|
140
|
-
# Remove linked badges: [](link-url)
|
141
|
-
content = content.gsub(/\[\!\[([^\]]*)\]\([^\)]*(?:badge|shield|svg|travis|coveralls|fury)[^\)]*\)\]\([^\)]*\)/i, '')
|
142
|
-
|
143
|
-
# Remove standalone badges: 
|
144
|
-
content = content.gsub(/!\[([^\]]*)\]\([^\)]*(?:badge|shield|svg|travis|coveralls|fury)[^\)]*\)/i, '')
|
145
|
-
|
146
|
-
content
|
96
|
+
# @return [Transformers::EnhancementTransformer]
|
97
|
+
def enhancement_transformer
|
98
|
+
@enhancement_transformer ||= Transformers::EnhancementTransformer.new
|
147
99
|
end
|
148
100
|
|
149
|
-
#
|
150
|
-
#
|
151
|
-
# Strips out frontmatter blocks which are metadata used by static site generators
|
152
|
-
# (Jekyll, Hugo, etc.) but not relevant for LLM consumption.
|
101
|
+
# Get whitespace transformer instance
|
153
102
|
#
|
154
|
-
#
|
155
|
-
|
156
|
-
|
157
|
-
#
|
158
|
-
# @param content [String] markdown content to process
|
159
|
-
# @return [String] content with frontmatter removed
|
160
|
-
def remove_frontmatter(content)
|
161
|
-
# Remove YAML frontmatter (--- ... ---)
|
162
|
-
content = content.sub(/\A---\s*$.*?^---\s*$/m, '')
|
163
|
-
|
164
|
-
# Remove TOML frontmatter (+++ ... +++)
|
165
|
-
content = content.sub(/\A\+\+\+\s*$.*?^\+\+\+\s*$/m, '')
|
166
|
-
|
167
|
-
content
|
103
|
+
# @return [Transformers::WhitespaceTransformer]
|
104
|
+
def whitespace_transformer
|
105
|
+
@whitespace_transformer ||= Transformers::WhitespaceTransformer.new
|
168
106
|
end
|
169
107
|
|
170
|
-
#
|
171
|
-
#
|
172
|
-
# Reduces excessive blank lines and trailing whitespace to make content more compact
|
173
|
-
# for LLM consumption without affecting readability.
|
174
|
-
#
|
175
|
-
# Transformations:
|
176
|
-
# - Multiple consecutive blank lines (3+) → 2 blank lines max
|
177
|
-
# - Trailing whitespace on lines → removed
|
178
|
-
# - Leading/trailing whitespace in file → trimmed
|
108
|
+
# Check if content compression should be applied
|
179
109
|
#
|
180
|
-
# @
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
content = content.gsub(/ +$/, '')
|
185
|
-
|
186
|
-
# Reduce multiple consecutive blank lines to maximum of 2
|
187
|
-
content = content.gsub(/\n{4,}/, "\n\n\n")
|
110
|
+
# @return [Boolean]
|
111
|
+
def should_compress?
|
112
|
+
options[:remove_stopwords] || options[:remove_duplicates]
|
113
|
+
end
|
188
114
|
|
189
|
-
|
190
|
-
|
115
|
+
# Compress content using TextCompressor
|
116
|
+
#
|
117
|
+
# @param content [String] content to compress
|
118
|
+
# @return [String] compressed content
|
119
|
+
def compress_content(content)
|
120
|
+
compressor = TextCompressor.new
|
121
|
+
compression_methods = {
|
122
|
+
remove_stopwords: options[:remove_stopwords],
|
123
|
+
remove_duplicates: options[:remove_duplicates]
|
124
|
+
}
|
125
|
+
compressor.compress(content, compression_methods)
|
191
126
|
end
|
192
127
|
end
|
193
128
|
end
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module LlmDocsBuilder
  # Formats output for CLI display
  #
  # Provides formatting utilities for displaying comparison results,
  # byte sizes, and numbers in a user-friendly way.
  #
  # @api private
  class OutputFormatter
    # Format bytes into human-readable string
    #
    # Units are binary (1 KB = 1024 bytes, 1 MB = 1024 KB). The unit is chosen
    # from the rounded kilobyte figure, so sizes just below 1 MB are reported
    # as "1.0 MB" rather than "1024.0 KB".
    #
    # @param bytes [Integer] number of bytes
    # @return [String] formatted string with units (bytes/KB/MB)
    #
    # @example
    #   OutputFormatter.format_bytes(1024)    #=> "1.0 KB"
    #   OutputFormatter.format_bytes(1048576) #=> "1.0 MB"
    def self.format_bytes(bytes)
      return "#{bytes} bytes" if bytes < 1024

      kilobytes = (bytes / 1024.0).round(1)
      return "#{kilobytes} KB" if kilobytes < 1024

      "#{(bytes / (1024.0 * 1024)).round(2)} MB"
    end

    # Format number with comma separators for readability
    #
    # Only the integer part is grouped; a fractional part (when a non-integer
    # is passed) is appended unchanged.
    #
    # @param number [Integer] number to format
    # @return [String] formatted number with commas
    #
    # @example
    #   OutputFormatter.format_number(1234567) #=> "1,234,567"
    def self.format_number(number)
      whole, fraction = number.to_s.split('.', 2)
      # Reverse so that groups of three are counted from the least significant digit
      grouped = whole.reverse.gsub(/(\d{3})(?=\d)/, '\\1,').reverse
      fraction ? "#{grouped}.#{fraction}" : grouped
    end

    # Display formatted comparison results
    #
    # Prints a banner, the size and token estimate of both versions, and then
    # either the reduction, the increase, or 'Same size' depending on the sign
    # of result[:reduction_bytes].
    #
    # @param result [Hash] comparison results from Comparator
    def self.display_comparison_results(result)
      puts ''
      puts '=' * 60
      puts 'Context Window Comparison'
      puts '=' * 60
      puts ''
      puts "Human version: #{format_bytes(result[:human_size])} (~#{format_number(result[:human_tokens])} tokens)"
      puts " Source: #{result[:human_source]}"
      puts ''
      puts "AI version: #{format_bytes(result[:ai_size])} (~#{format_number(result[:ai_tokens])} tokens)"
      puts " Source: #{result[:ai_source]}"
      puts ''
      puts '-' * 60

      if result[:reduction_bytes].positive?
        display_reduction(result)
      elsif result[:reduction_bytes].negative?
        display_increase(result)
      else
        puts 'Same size'
      end

      puts '=' * 60
      puts ''
    end

    # Display reduction statistics
    #
    # @param result [Hash] comparison results
    # @api private
    def self.display_reduction(result)
      puts "Reduction: #{format_bytes(result[:reduction_bytes])} (#{result[:reduction_percent]}%)"
      puts "Token savings: #{format_number(result[:token_reduction])} tokens (#{result[:token_reduction_percent]}%)"
      puts "Factor: #{result[:factor]}x smaller"
    end

    # Display increase statistics
    #
    # The reduction figures are negative in this case, so their absolute
    # values are shown as increases.
    #
    # @param result [Hash] comparison results
    # @api private
    def self.display_increase(result)
      increase_bytes = result[:reduction_bytes].abs
      increase_percent = result[:reduction_percent].abs
      token_increase = result[:token_reduction].abs
      token_increase_percent = result[:token_reduction_percent].abs
      puts "Increase: #{format_bytes(increase_bytes)} (#{increase_percent}%)"
      puts "Token increase: #{format_number(token_increase)} tokens (#{token_increase_percent}%)"
      puts "Factor: #{result[:factor]}x larger"
    end
  end
end
|