llm-docs-builder 0.3.0 → 0.7.0
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between those versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/docker.yml +6 -6
- data/.rspec +3 -0
- data/CHANGELOG.md +38 -1
- data/Gemfile.lock +1 -1
- data/README.md +190 -519
- data/bin/rspecs +2 -1
- data/lib/llm_docs_builder/cli.rb +1 -50
- data/lib/llm_docs_builder/comparator.rb +30 -7
- data/lib/llm_docs_builder/config.rb +58 -1
- data/lib/llm_docs_builder/markdown_transformer.rb +65 -36
- data/lib/llm_docs_builder/output_formatter.rb +93 -0
- data/lib/llm_docs_builder/parser.rb +1 -59
- data/lib/llm_docs_builder/text_compressor.rb +164 -0
- data/lib/llm_docs_builder/token_estimator.rb +52 -0
- data/lib/llm_docs_builder/transformers/base_transformer.rb +30 -0
- data/lib/llm_docs_builder/transformers/content_cleanup_transformer.rb +106 -0
- data/lib/llm_docs_builder/transformers/enhancement_transformer.rb +95 -0
- data/lib/llm_docs_builder/transformers/link_transformer.rb +84 -0
- data/lib/llm_docs_builder/transformers/whitespace_transformer.rb +44 -0
- data/lib/llm_docs_builder/version.rb +1 -1
- metadata +10 -3
- data/CLAUDE.md +0 -178
- data/llm-docs-builder.yml +0 -7
data/lib/llm_docs_builder/text_compressor.rb
ADDED

```diff
@@ -0,0 +1,164 @@
+# frozen_string_literal: true
+
+module LlmDocsBuilder
+  # Advanced text compression techniques for reducing token count
+  #
+  # Provides more aggressive text compression methods including stopword removal,
+  # duplicate content detection, and sentence deduplication. These methods are more
+  # aggressive than basic markdown cleanup and should be used carefully.
+  #
+  # @example Basic usage
+  #   compressor = LlmDocsBuilder::TextCompressor.new
+  #   compressed = compressor.compress("Your text here", remove_stopwords: true)
+  #
+  # @api public
+  class TextCompressor
+    # Common English stopwords that can be safely removed from documentation
+    # Excludes words that might be important in technical contexts (like "not", "no")
+    STOPWORDS = %w[
+      a an the this that these those
+      is am are was were be being been
+      have has had do does did
+      will would shall should may might must can could
+      i me my mine we us our ours
+      you your yours
+      he him his she her hers it its
+      they them their theirs
+      what which who whom whose where when why how
+      all both each few more most other some such
+      and or but if then else
+      at by for from in into of on to with
+      as so than
+      very really quite
+      there here
+      about above across after against along among around because before behind below
+      beneath beside besides between beyond during except inside near off since through
+      throughout under until up upon within without
+    ].freeze
+
+    # @return [Hash] compression options
+    attr_reader :options
+
+    # Initialize a new text compressor
+    #
+    # @param options [Hash] compression options
+    # @option options [Array<String>] :custom_stopwords additional stopwords to remove
+    # @option options [Boolean] :preserve_technical preserve technical terms and code
+    def initialize(options = {})
+      @options = {
+        preserve_technical: true,
+        custom_stopwords: []
+      }.merge(options)
+    end
+
+    # Compress text using configured methods
+    #
+    # @param content [String] text to compress
+    # @param methods [Hash] compression methods to apply
+    # @option methods [Boolean] :remove_stopwords remove common filler words
+    # @option methods [Boolean] :remove_duplicates remove duplicate sentences/paragraphs
+    # @return [String] compressed text
+    def compress(content, methods = {})
+      result = content.dup
+
+      result = remove_stopwords(result) if methods[:remove_stopwords]
+      result = remove_duplicate_paragraphs(result) if methods[:remove_duplicates]
+
+      result
+    end
+
+    # Remove stopwords from text while preserving technical content
+    #
+    # Removes common English stopwords that don't carry significant meaning.
+    # Preserves code blocks, inline code, and technical terms.
+    #
+    # WARNING: This is an aggressive optimization that may affect readability.
+    # Use with caution and test results carefully.
+    #
+    # @param content [String] text to process
+    # @return [String] text with stopwords removed
+    def remove_stopwords(content)
+      # Preserve code blocks by temporarily replacing them
+      code_blocks = {}
+      code_counter = 0
+
+      # Extract and preserve fenced code blocks
+      content = content.gsub(/^```.*?^```/m) do |match|
+        placeholder = "___CODE_BLOCK_#{code_counter}___"
+        code_blocks[placeholder] = match
+        code_counter += 1
+        placeholder
+      end
+
+      # Extract and preserve inline code
+      content = content.gsub(/`[^`]+`/) do |match|
+        placeholder = "___INLINE_CODE_#{code_counter}___"
+        code_blocks[placeholder] = match
+        code_counter += 1
+        placeholder
+      end
+
+      # Get combined stopwords list
+      stopwords_list = STOPWORDS + options[:custom_stopwords]
+
+      # Process each line
+      content = content.split("\n").map do |line|
+        # Skip markdown headers, lists, and links
+        if line.match?(/^#+\s/) || line.match?(/^[\*\-]\s/) || line.match?(/\[[^\]]+\]\([^)]+\)/)
+          line
+        else
+          # Remove stopwords from regular text
+          words = line.split(/\b/)
+          words.map do |word|
+            # Preserve the word if it's not a stopword or if we should preserve technical terms
+            if stopwords_list.include?(word.downcase) && !word.match?(/^[A-Z]/) # Don't remove capitalized words
+              ''
+            else
+              word
+            end
+          end.join
+        end
+      end.join("\n")
+
+      # Restore code blocks
+      code_blocks.each do |placeholder, original|
+        content = content.gsub(placeholder, original)
+      end
+
+      content
+    end
+
+    # Remove duplicate paragraphs from text
+    #
+    # Detects and removes paragraphs that are duplicates or near-duplicates.
+    # Documentation often repeats concepts across different sections.
+    #
+    # @param content [String] text to process
+    # @return [String] text with duplicate paragraphs removed
+    def remove_duplicate_paragraphs(content)
+      # Split into paragraphs (separated by blank lines)
+      paragraphs = content.split(/\n\s*\n/)
+
+      # Track seen paragraphs with normalized comparison
+      seen = {}
+      unique_paragraphs = []
+
+      paragraphs.each do |para|
+        # Normalize for comparison (remove extra whitespace, lowercase)
+        normalized = para.gsub(/\s+/, ' ').strip.downcase
+
+        # Skip empty paragraphs
+        next if normalized.empty?
+
+        # Check if we've seen this or similar paragraph
+        unless seen[normalized]
+          seen[normalized] = true
+          unique_paragraphs << para
+        end
+      end
+
+      unique_paragraphs.join("\n\n")
+    end
+
+  end
+end
```
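A minimal usage sketch of the new `TextCompressor`, based only on the API shown in this hunk; the `require 'llm_docs_builder'` entry point and the sample text are assumptions, since the gem's main file is not part of this diff.

```ruby
require 'llm_docs_builder' # assumed entry point; not shown in this diff

text = <<~MD
  The cache is a very simple layer.

  The cache is a very simple layer.

  Use `fetch` to read through the cache.
MD

# Custom stopwords are merged with the built-in STOPWORDS list.
compressor = LlmDocsBuilder::TextCompressor.new(custom_stopwords: %w[layer])

# Both compression methods are opt-in; code spans such as `fetch` are preserved.
puts compressor.compress(text, remove_stopwords: true, remove_duplicates: true)
```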
data/lib/llm_docs_builder/token_estimator.rb
ADDED

```diff
@@ -0,0 +1,52 @@
+# frozen_string_literal: true
+
+module LlmDocsBuilder
+  # Estimates token count for text content using character-based approximation
+  #
+  # Provides token estimation without requiring external tokenizer dependencies.
+  # Uses the common heuristic that ~4 characters equals 1 token for English text,
+  # which works reasonably well for documentation and markdown content.
+  #
+  # @example Basic usage
+  #   estimator = LlmDocsBuilder::TokenEstimator.new
+  #   token_count = estimator.estimate("This is a sample text.")
+  #
+  # @example With custom characters per token
+  #   estimator = LlmDocsBuilder::TokenEstimator.new(chars_per_token: 3.5)
+  #   token_count = estimator.estimate(content)
+  #
+  # @api public
+  class TokenEstimator
+    # Default number of characters per token
+    DEFAULT_CHARS_PER_TOKEN = 4.0
+
+    # @return [Float] characters per token ratio
+    attr_reader :chars_per_token
+
+    # Initialize a new token estimator
+    #
+    # @param chars_per_token [Float] number of characters per token (default: 4.0)
+    def initialize(chars_per_token: DEFAULT_CHARS_PER_TOKEN)
+      @chars_per_token = chars_per_token.to_f
+    end
+
+    # Estimate token count for given content
+    #
+    # @param content [String] text content to estimate tokens for
+    # @return [Integer] estimated number of tokens
+    def estimate(content)
+      return 0 if content.nil? || content.empty?
+
+      (content.length / chars_per_token).round
+    end
+
+    # Estimate token count (class method for convenience)
+    #
+    # @param content [String] text content to estimate tokens for
+    # @param chars_per_token [Float] number of characters per token (default: 4.0)
+    # @return [Integer] estimated number of tokens
+    def self.estimate(content, chars_per_token: DEFAULT_CHARS_PER_TOKEN)
+      new(chars_per_token: chars_per_token).estimate(content)
+    end
+  end
+end
```
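A short sketch of both entry points defined above, the instance API and the class-level convenience method; the `require` path, the file path, and the 3.5 ratio are illustrative assumptions.

```ruby
require 'llm_docs_builder' # assumed entry point; not shown in this diff

content = File.read('docs/README.md') # any markdown file

# Default heuristic: roughly 4 characters per token.
puts LlmDocsBuilder::TokenEstimator.estimate(content)

# A custom ratio for tokenizers known to pack characters more densely.
estimator = LlmDocsBuilder::TokenEstimator.new(chars_per_token: 3.5)
puts estimator.estimate(content)
```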
data/lib/llm_docs_builder/transformers/base_transformer.rb
ADDED

```diff
@@ -0,0 +1,30 @@
+# frozen_string_literal: true
+
+module LlmDocsBuilder
+  module Transformers
+    # Base module for all transformers
+    #
+    # Provides a common interface for content transformation operations.
+    # Each transformer should implement the `transform` method.
+    #
+    # @api public
+    module BaseTransformer
+      # Transform content
+      #
+      # @param content [String] content to transform
+      # @param options [Hash] transformation options
+      # @return [String] transformed content
+      def transform(content, options = {})
+        raise NotImplementedError, "#{self.class} must implement #transform"
+      end
+
+      # Check if transformation should be applied
+      #
+      # @param options [Hash] transformation options
+      # @return [Boolean] true if transformation should be applied
+      def should_transform?(options)
+        true
+      end
+    end
+  end
+end
```
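`BaseTransformer` is a mixin rather than a superclass, so a concrete transformer includes it and overrides `transform`. A hypothetical sketch follows; `UppercaseHeadingsTransformer` is an invented name used only to illustrate the contract, and the `require` path is an assumption.

```ruby
require 'llm_docs_builder' # assumed entry point; not shown in this diff

# Hypothetical transformer illustrating the BaseTransformer interface.
class UppercaseHeadingsTransformer
  include LlmDocsBuilder::Transformers::BaseTransformer

  # Upcase ATX heading text; pass everything else through unchanged.
  def transform(content, options = {})
    return content unless should_transform?(options)

    content.gsub(/^([#]{1,6}\s+)(.+)$/) { "#{$1}#{$2.upcase}" }
  end
end

puts UppercaseHeadingsTransformer.new.transform("# intro\nBody text stays as-is.\n")
```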
data/lib/llm_docs_builder/transformers/content_cleanup_transformer.rb
ADDED

```diff
@@ -0,0 +1,106 @@
+# frozen_string_literal: true
+
+module LlmDocsBuilder
+  module Transformers
+    # Transformer for content cleanup operations
+    #
+    # Handles removal of various markdown elements that don't provide
+    # value for LLM consumption (frontmatter, comments, badges, etc.).
+    #
+    # @api public
+    class ContentCleanupTransformer
+      include BaseTransformer
+
+      # Transform content by removing unwanted elements
+      #
+      # @param content [String] markdown content
+      # @param options [Hash] transformation options
+      # @option options [Boolean] :remove_frontmatter remove YAML/TOML frontmatter
+      # @option options [Boolean] :remove_comments remove HTML comments
+      # @option options [Boolean] :remove_badges remove badge images
+      # @option options [Boolean] :remove_code_examples remove code blocks
+      # @option options [Boolean] :remove_images remove image syntax
+      # @option options [Boolean] :remove_blockquotes remove blockquote formatting
+      # @return [String] transformed content
+      def transform(content, options = {})
+        result = content.dup
+
+        result = remove_frontmatter(result) if options[:remove_frontmatter]
+        result = remove_comments(result) if options[:remove_comments]
+        result = remove_badges(result) if options[:remove_badges]
+        result = remove_code_examples(result) if options[:remove_code_examples]
+        result = remove_images(result) if options[:remove_images]
+        result = remove_blockquotes(result) if options[:remove_blockquotes]
+
+        result
+      end
+
+      private
+
+      # Remove YAML or TOML frontmatter
+      #
+      # @param content [String] markdown content
+      # @return [String] content without frontmatter
+      def remove_frontmatter(content)
+        content = content.sub(/\A---\s*$.*?^---\s*$/m, '')
+        content = content.sub(/\A\+\+\+\s*$.*?^\+\+\+\s*$/m, '')
+        content
+      end
+
+      # Remove HTML comments
+      #
+      # @param content [String] markdown content
+      # @return [String] content without comments
+      def remove_comments(content)
+        content.gsub(/<!--.*?-->/m, '')
+      end
+
+      # Remove badge images
+      #
+      # @param content [String] markdown content
+      # @return [String] content without badges
+      def remove_badges(content)
+        # Remove linked badges
+        content = content.gsub(/\[\!\[([^\]]*)\]\([^\)]*(?:badge|shield|svg|travis|coveralls|fury)[^\)]*\)\]\([^\)]*\)/i, '')
+        # Remove standalone badges
+        content = content.gsub(/!\[([^\]]*)\]\([^\)]*(?:badge|shield|svg|travis|coveralls|fury)[^\)]*\)/i, '')
+        content
+      end
+
+      # Remove code blocks and inline code
+      #
+      # @param content [String] markdown content
+      # @return [String] content without code
+      def remove_code_examples(content)
+        # Remove fenced code blocks
+        content = content.gsub(/^```.*?^```/m, '')
+        content = content.gsub(/^~~~.*?^~~~/m, '')
+        # Remove indented code blocks
+        content = content.gsub(/^([ ]{4,}|\t).+$/m, '')
+        # Remove inline code
+        content = content.gsub(/`[^`]+`/, '')
+        content
+      end
+
+      # Remove image syntax
+      #
+      # @param content [String] markdown content
+      # @return [String] content without images
+      def remove_images(content)
+        # Remove inline images
+        content = content.gsub(/!\[([^\]]*)\]\([^\)]+\)/, '')
+        # Remove reference-style images
+        content = content.gsub(/!\[([^\]]*)\]\[[^\]]+\]/, '')
+        content
+      end
+
+      # Remove blockquote formatting
+      #
+      # @param content [String] markdown content
+      # @return [String] content without blockquote markers
+      def remove_blockquotes(content)
+        content.gsub(/^>\s?/, '')
+      end
+    end
+  end
+end
```
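A sketch of the cleanup transformer applied on its own; every removal is opt-in via the options hash, and the sample markdown, badge URL, and `require` path are invented for illustration.

```ruby
require 'llm_docs_builder' # assumed entry point; not shown in this diff

markdown = <<~MD
  ---
  title: Install guide
  ---

  [![Build](https://img.shields.io/badge/build-passing.svg)](https://example.com/ci)

  # Install

  <!-- reviewers: keep this short -->
  Run the installer and restart the service.
MD

cleaner = LlmDocsBuilder::Transformers::ContentCleanupTransformer.new

# Strips the YAML frontmatter, the badge, and the HTML comment; code and
# images could also be removed via :remove_code_examples / :remove_images.
puts cleaner.transform(
  markdown,
  remove_frontmatter: true,
  remove_comments: true,
  remove_badges: true
)
```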
data/lib/llm_docs_builder/transformers/enhancement_transformer.rb
ADDED

```diff
@@ -0,0 +1,95 @@
+# frozen_string_literal: true
+
+module LlmDocsBuilder
+  module Transformers
+    # Transformer for document enhancements
+    #
+    # Adds helpful features like table of contents and custom instructions
+    # to improve LLM navigation and context understanding.
+    #
+    # @api public
+    class EnhancementTransformer
+      include BaseTransformer
+
+      # Transform content by adding enhancements
+      #
+      # @param content [String] markdown content
+      # @param options [Hash] transformation options
+      # @option options [Boolean] :generate_toc generate table of contents
+      # @option options [String] :custom_instruction custom instruction text
+      # @option options [Boolean] :remove_blockquotes whether blockquotes are being removed
+      # @return [String] transformed content
+      def transform(content, options = {})
+        result = content.dup
+
+        if options[:custom_instruction]
+          result = inject_custom_instruction(result, options[:custom_instruction], options[:remove_blockquotes])
+        end
+        result = generate_table_of_contents(result) if options[:generate_toc]
+
+        result
+      end
+
+      private
+
+      # Generate table of contents from headings
+      #
+      # @param content [String] markdown content
+      # @return [String] content with TOC prepended
+      def generate_table_of_contents(content)
+        headings = []
+        content.scan(/^(#{Regexp.escape('#')}{1,6})\s+(.+)$/) do
+          level = ::Regexp.last_match(1).length
+          title = ::Regexp.last_match(2).strip
+
+          anchor = title.downcase
+                        .gsub(/[^\w\s-]/, '')
+                        .gsub(/\s+/, '-')
+
+          headings << { level: level, title: title, anchor: anchor }
+        end
+
+        return content if headings.empty?
+
+        toc = ["## Table of Contents\n"]
+
+        headings.each do |heading|
+          next if heading[:level] == 1 && headings.first == heading
+
+          indent = ' ' * (heading[:level] - 1)
+          toc << "#{indent}- [#{heading[:title]}](##{heading[:anchor]})"
+        end
+
+        toc << "\n---\n"
+
+        if content.match(/^#\s+(.+)$/)
+          content.sub(/^(#\s+.+\n)/, "\\1\n#{toc.join("\n")}\n")
+        else
+          "#{toc.join("\n")}\n\n#{content}"
+        end
+      end
+
+      # Inject custom instruction at document top
+      #
+      # @param content [String] markdown content
+      # @param instruction [String] instruction text
+      # @param remove_blockquotes [Boolean] whether to avoid blockquote formatting
+      # @return [String] content with instruction prepended
+      def inject_custom_instruction(content, instruction, remove_blockquotes = false)
+        return content if instruction.nil? || instruction.empty?
+
+        formatted_instruction = if remove_blockquotes
+                                  "**AI Context**: #{instruction}\n\n---\n\n"
+                                else
+                                  "> **AI Context**: #{instruction}\n\n---\n\n"
+                                end
+
+        if content.match(/^#\s+(.+?)\n/)
+          content.sub(/^(#\s+.+?\n)/, "\\1\n#{formatted_instruction}")
+        else
+          "#{formatted_instruction}#{content}"
+        end
+      end
+    end
+  end
+end
```
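A sketch of the enhancement transformer adding an AI-context note and a table of contents; the document content, instruction text, and `require` path are illustrative assumptions.

```ruby
require 'llm_docs_builder' # assumed entry point; not shown in this diff

doc = <<~MD
  # My Library

  ## Installation

  Install the gem and require it.

  ## Usage

  See the examples directory.
MD

enhancer = LlmDocsBuilder::Transformers::EnhancementTransformer.new

# Injects a "> **AI Context**: ..." note and a "## Table of Contents"
# section directly after the top-level heading.
puts enhancer.transform(
  doc,
  generate_toc: true,
  custom_instruction: 'Condensed docs for the my-library gem, optimized for LLM consumption.'
)
```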
data/lib/llm_docs_builder/transformers/link_transformer.rb
ADDED

```diff
@@ -0,0 +1,84 @@
+# frozen_string_literal: true
+
+module LlmDocsBuilder
+  module Transformers
+    # Transformer for link-related operations
+    #
+    # Handles expansion of relative links to absolute URLs and
+    # conversion of HTML URLs to markdown format.
+    #
+    # @api public
+    class LinkTransformer
+      include BaseTransformer
+
+      # Transform links in content
+      #
+      # @param content [String] markdown content
+      # @param options [Hash] transformation options
+      # @option options [String] :base_url base URL for expanding relative links
+      # @option options [Boolean] :convert_urls convert HTML URLs to markdown format
+      # @return [String] transformed content
+      def transform(content, options = {})
+        result = content.dup
+
+        result = expand_relative_links(result, options[:base_url]) if options[:base_url]
+        result = convert_html_urls(result) if options[:convert_urls]
+        result = simplify_links(result) if options[:simplify_links]
+
+        result
+      end
+
+      private
+
+      # Expand relative links to absolute URLs
+      #
+      # @param content [String] markdown content
+      # @param base_url [String] base URL for expansion
+      # @return [String] content with expanded links
+      def expand_relative_links(content, base_url)
+        content.gsub(/\[([^\]]+)\]\(([^)]+)\)/) do |match|
+          text = ::Regexp.last_match(1)
+          url = ::Regexp.last_match(2)
+
+          if url.start_with?('http://', 'https://', '//', '#')
+            match
+          else
+            clean_url = url.gsub(%r{^\./}, '')
+            expanded_url = File.join(base_url, clean_url)
+            "[#{text}](#{expanded_url})"
+          end
+        end
+      end
+
+      # Convert HTML URLs to markdown format
+      #
+      # @param content [String] markdown content
+      # @return [String] content with converted URLs
+      def convert_html_urls(content)
+        content.gsub(%r{https?://[^\s<>]+\.html?(?=[)\s]|$)}) do |url|
+          url.sub(/\.html?$/, '.md')
+        end
+      end
+
+      # Simplify verbose link text
+      #
+      # @param content [String] markdown content
+      # @return [String] content with simplified links
+      def simplify_links(content)
+        content.gsub(/\[([^\]]+)\]\(([^)]+)\)/) do
+          text = ::Regexp.last_match(1)
+          url = ::Regexp.last_match(2)
+
+          simplified_text = text
+                            .gsub(/^(click here to|see|read more about|check out|visit)\s+(the\s+)?/i, '')
+                            .gsub(/\s+(here|documentation|docs)$/i, '')
+                            .strip
+
+          simplified_text = text if simplified_text.empty?
+
+          "[#{simplified_text}](#{url})"
+        end
+      end
+    end
+  end
+end
```
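A sketch of the link transformer with all three options enabled; the base URL, link targets, and `require` path are invented for illustration.

```ruby
require 'llm_docs_builder' # assumed entry point; not shown in this diff

markdown = <<~MD
  See [click here to read the setup guide](./setup.md) and the
  [API reference](https://example.com/api/index.html).
MD

linker = LlmDocsBuilder::Transformers::LinkTransformer.new

# :base_url expands ./setup.md to an absolute URL, :convert_urls rewrites
# .html targets to .md, and :simplify_links trims filler like "click here to".
puts linker.transform(
  markdown,
  base_url: 'https://example.com/docs',
  convert_urls: true,
  simplify_links: true
)
```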
data/lib/llm_docs_builder/transformers/whitespace_transformer.rb
ADDED

```diff
@@ -0,0 +1,44 @@
+# frozen_string_literal: true
+
+module LlmDocsBuilder
+  module Transformers
+    # Transformer for whitespace normalization
+    #
+    # Reduces excessive blank lines and trailing whitespace to make
+    # content more compact for LLM consumption.
+    #
+    # @api public
+    class WhitespaceTransformer
+      include BaseTransformer
+
+      # Transform content by normalizing whitespace
+      #
+      # @param content [String] markdown content
+      # @param options [Hash] transformation options
+      # @option options [Boolean] :normalize_whitespace enable normalization
+      # @return [String] transformed content
+      def transform(content, options = {})
+        return content unless options[:normalize_whitespace]
+
+        normalize_whitespace(content)
+      end
+
+      private
+
+      # Normalize excessive whitespace
+      #
+      # @param content [String] markdown content
+      # @return [String] content with normalized whitespace
+      def normalize_whitespace(content)
+        # Remove trailing whitespace from each line
+        content = content.gsub(/ +$/, '')
+
+        # Reduce multiple consecutive blank lines to maximum of 2
+        content = content.gsub(/\n{4,}/, "\n\n\n")
+
+        # Trim leading and trailing whitespace
+        content.strip
+      end
+    end
+  end
+end
```
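A sketch of the whitespace transformer; it returns the input untouched unless `:normalize_whitespace` is passed explicitly. The `require` path and input string are illustrative.

```ruby
require 'llm_docs_builder' # assumed entry point; not shown in this diff

messy = "# Title   \n\n\n\n\nSome text.   \n\n\n\n"

normalizer = LlmDocsBuilder::Transformers::WhitespaceTransformer.new

# Trailing spaces are stripped, runs of blank lines are capped at two,
# and the result is trimmed at both ends.
puts normalizer.transform(messy, normalize_whitespace: true).inspect
# => "# Title\n\n\nSome text."
```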
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: llm-docs-builder
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Maciej Mensfeld
|
@@ -113,10 +113,10 @@ files:
|
|
113
113
|
- ".github/workflows/docker.yml"
|
114
114
|
- ".github/workflows/push.yml"
|
115
115
|
- ".gitignore"
|
116
|
+
- ".rspec"
|
116
117
|
- ".rubocop.yml"
|
117
118
|
- ".ruby-version"
|
118
119
|
- CHANGELOG.md
|
119
|
-
- CLAUDE.md
|
120
120
|
- Dockerfile
|
121
121
|
- Gemfile
|
122
122
|
- Gemfile.lock
|
@@ -133,11 +133,18 @@ files:
|
|
133
133
|
- lib/llm_docs_builder/errors.rb
|
134
134
|
- lib/llm_docs_builder/generator.rb
|
135
135
|
- lib/llm_docs_builder/markdown_transformer.rb
|
136
|
+
- lib/llm_docs_builder/output_formatter.rb
|
136
137
|
- lib/llm_docs_builder/parser.rb
|
138
|
+
- lib/llm_docs_builder/text_compressor.rb
|
139
|
+
- lib/llm_docs_builder/token_estimator.rb
|
140
|
+
- lib/llm_docs_builder/transformers/base_transformer.rb
|
141
|
+
- lib/llm_docs_builder/transformers/content_cleanup_transformer.rb
|
142
|
+
- lib/llm_docs_builder/transformers/enhancement_transformer.rb
|
143
|
+
- lib/llm_docs_builder/transformers/link_transformer.rb
|
144
|
+
- lib/llm_docs_builder/transformers/whitespace_transformer.rb
|
137
145
|
- lib/llm_docs_builder/validator.rb
|
138
146
|
- lib/llm_docs_builder/version.rb
|
139
147
|
- llm-docs-builder.gemspec
|
140
|
-
- llm-docs-builder.yml
|
141
148
|
- llm-docs-builder.yml.example
|
142
149
|
- renovate.json
|
143
150
|
homepage: https://github.com/mensfeld/llm-docs-builder
|