llm-docs-builder 0.3.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,164 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  # Advanced text compression techniques for reducing token count
  #
  # Provides more aggressive text compression methods including stopword removal
  # and duplicate paragraph removal. These methods are more aggressive than basic
  # markdown cleanup and should be used carefully.
  #
  # @example Basic usage
  #   compressor = LlmDocsBuilder::TextCompressor.new
  #   compressed = compressor.compress("Your text here", remove_stopwords: true)
  #
  # @api public
  class TextCompressor
    # Common English stopwords that can be safely removed from documentation
    # Excludes words that might be important in technical contexts (like "not", "no")
    STOPWORDS = %w[
      a an the this that these those
      is am are was were be being been
      have has had do does did
      will would shall should may might must can could
      i me my mine we us our ours
      you your yours
      he him his she her hers it its
      they them their theirs
      what which who whom whose where when why how
      all both each few more most other some such
      and or but if then else
      at by for from in into of on to with
      as so than
      very really quite
      there here
      about above across after against along among around because before behind below
      beneath beside besides between beyond during except inside near off since through
      throughout under until up upon within without
    ].freeze

    # @return [Hash] compression options
    attr_reader :options

    # Initialize a new text compressor
    #
    # @param options [Hash] compression options
    # @option options [Array<String>] :custom_stopwords additional stopwords to remove
    # @option options [Boolean] :preserve_technical preserve technical terms and code
    def initialize(options = {})
      @options = {
        preserve_technical: true,
        custom_stopwords: []
      }.merge(options)
    end

    # Compress text using configured methods
    #
    # @param content [String] text to compress
    # @param methods [Hash] compression methods to apply
    # @option methods [Boolean] :remove_stopwords remove common filler words
    # @option methods [Boolean] :remove_duplicates remove duplicate paragraphs
    # @return [String] compressed text
    def compress(content, methods = {})
      result = content.dup

      result = remove_stopwords(result) if methods[:remove_stopwords]
      result = remove_duplicate_paragraphs(result) if methods[:remove_duplicates]

      result
    end

    # Remove stopwords from text while preserving technical content
    #
    # Removes common English stopwords that don't carry significant meaning.
    # Preserves fenced code blocks, inline code, markdown headers, list items,
    # and lines containing links; capitalized words are never removed.
    #
    # WARNING: This is an aggressive optimization that may affect readability.
    # Use with caution and test results carefully.
    #
    # @param content [String] text to process
    # @return [String] text with stopwords removed
    def remove_stopwords(content)
      # Preserve code by temporarily swapping it out for unique placeholders
      code_blocks = {}
      code_counter = 0

      # Extract and preserve fenced code blocks
      content = content.gsub(/^```.*?^```/m) do |match|
        placeholder = "___CODE_BLOCK_#{code_counter}___"
        code_blocks[placeholder] = match
        code_counter += 1
        placeholder
      end

      # Extract and preserve inline code
      content = content.gsub(/`[^`]+`/) do |match|
        placeholder = "___INLINE_CODE_#{code_counter}___"
        code_blocks[placeholder] = match
        code_counter += 1
        placeholder
      end

      # Get combined stopwords list
      stopwords_list = STOPWORDS + options[:custom_stopwords]

      # Process each line
      content = content.split("\n").map do |line|
        # Skip markdown headers, lists, and lines containing links
        if line.match?(/^#+\s/) || line.match?(/^[\*\-]\s/) || line.match?(/\[[^\]]+\]\([^)]+\)/)
          line
        else
          # Remove stopwords from regular text
          words = line.split(/\b/)
          words.map do |word|
            # Drop the word only if it is a stopword and not capitalized
            # (capitalized words may be proper nouns / technical terms)
            if stopwords_list.include?(word.downcase) && !word.match?(/^[A-Z]/)
              ''
            else
              word
            end
          end.join
        end
      end.join("\n")

      # Restore code blocks. The block form of String#gsub is used deliberately:
      # a plain string replacement argument would interpret backslash sequences
      # such as "\0" or "\1" inside the preserved code and corrupt it on restore.
      code_blocks.each do |placeholder, original|
        content = content.gsub(placeholder) { original }
      end

      content
    end

    # Remove duplicate paragraphs from text
    #
    # Detects and removes paragraphs that are exact duplicates after whitespace
    # and case normalization. Documentation often repeats concepts across
    # different sections. The first occurrence of each paragraph is kept.
    #
    # @param content [String] text to process
    # @return [String] text with duplicate paragraphs removed
    def remove_duplicate_paragraphs(content)
      # Split into paragraphs (separated by blank lines)
      paragraphs = content.split(/\n\s*\n/)

      # Track seen paragraphs with normalized comparison
      seen = {}
      unique_paragraphs = []

      paragraphs.each do |para|
        # Normalize for comparison (collapse whitespace, lowercase)
        normalized = para.gsub(/\s+/, ' ').strip.downcase

        # Skip empty paragraphs
        next if normalized.empty?

        # Keep only the first occurrence of each normalized paragraph
        unless seen[normalized]
          seen[normalized] = true
          unique_paragraphs << para
        end
      end

      unique_paragraphs.join("\n\n")
    end
  end
end
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  # Estimates token count for text content using character-based approximation
  #
  # Provides token estimation without requiring external tokenizer dependencies.
  # Relies on the common heuristic that roughly 4 characters correspond to one
  # token for English text, which works reasonably well for documentation and
  # markdown content.
  #
  # @example Basic usage
  #   estimator = LlmDocsBuilder::TokenEstimator.new
  #   token_count = estimator.estimate("This is a sample text.")
  #
  # @example With custom characters per token
  #   estimator = LlmDocsBuilder::TokenEstimator.new(chars_per_token: 3.5)
  #   token_count = estimator.estimate(content)
  #
  # @api public
  class TokenEstimator
    # Default number of characters per token
    DEFAULT_CHARS_PER_TOKEN = 4.0

    # @return [Float] characters per token ratio
    attr_reader :chars_per_token

    # One-shot estimation without keeping an estimator around
    #
    # @param content [String] text content to estimate tokens for
    # @param chars_per_token [Float] number of characters per token (default: 4.0)
    # @return [Integer] estimated number of tokens
    def self.estimate(content, chars_per_token: DEFAULT_CHARS_PER_TOKEN)
      new(chars_per_token: chars_per_token).estimate(content)
    end

    # Initialize a new token estimator
    #
    # @param chars_per_token [Float] number of characters per token (default: 4.0)
    def initialize(chars_per_token: DEFAULT_CHARS_PER_TOKEN)
      @chars_per_token = chars_per_token.to_f
    end

    # Estimate token count for given content
    #
    # Nil or empty content yields zero tokens; otherwise the character count
    # is divided by the configured ratio and rounded to the nearest integer.
    #
    # @param content [String, nil] text content to estimate tokens for
    # @return [Integer] estimated number of tokens
    def estimate(content)
      return 0 unless content && !content.empty?

      (content.length / chars_per_token).round
    end
  end
end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  module Transformers
    # Base module for all transformers
    #
    # Defines the common interface shared by content transformation
    # operations. Including classes are expected to override +transform+.
    #
    # @api public
    module BaseTransformer
      # Check if transformation should be applied
      #
      # Defaults to always applying; including classes may override.
      #
      # @param opts [Hash] transformation options
      # @return [Boolean] true if transformation should be applied
      def should_transform?(opts)
        true
      end

      # Transform content
      #
      # @param content [String] content to transform
      # @param opts [Hash] transformation options
      # @return [String] transformed content
      # @raise [NotImplementedError] unless overridden by the including class
      def transform(content, opts = {})
        raise NotImplementedError, "#{self.class} must implement #transform"
      end
    end
  end
end
@@ -0,0 +1,106 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  module Transformers
    # Transformer for content cleanup operations
    #
    # Handles removal of various markdown elements that don't provide
    # value for LLM consumption (frontmatter, comments, badges, etc.).
    #
    # @api public
    class ContentCleanupTransformer
      include BaseTransformer

      # Transform content by removing unwanted elements
      #
      # @param content [String] markdown content
      # @param options [Hash] transformation options
      # @option options [Boolean] :remove_frontmatter remove YAML/TOML frontmatter
      # @option options [Boolean] :remove_comments remove HTML comments
      # @option options [Boolean] :remove_badges remove badge images
      # @option options [Boolean] :remove_code_examples remove code blocks
      # @option options [Boolean] :remove_images remove image syntax
      # @option options [Boolean] :remove_blockquotes remove blockquote formatting
      # @return [String] transformed content
      def transform(content, options = {})
        result = content.dup

        result = remove_frontmatter(result) if options[:remove_frontmatter]
        result = remove_comments(result) if options[:remove_comments]
        result = remove_badges(result) if options[:remove_badges]
        result = remove_code_examples(result) if options[:remove_code_examples]
        result = remove_images(result) if options[:remove_images]
        result = remove_blockquotes(result) if options[:remove_blockquotes]

        result
      end

      private

      # Remove YAML (---) or TOML (+++) frontmatter from the document start
      #
      # @param content [String] markdown content
      # @return [String] content without frontmatter
      def remove_frontmatter(content)
        content = content.sub(/\A---\s*$.*?^---\s*$/m, '')
        content = content.sub(/\A\+\+\+\s*$.*?^\+\+\+\s*$/m, '')
        content
      end

      # Remove HTML comments
      #
      # @param content [String] markdown content
      # @return [String] content without comments
      def remove_comments(content)
        content.gsub(/<!--.*?-->/m, '')
      end

      # Remove badge images (shields, CI badges, coverage badges, etc.)
      #
      # @param content [String] markdown content
      # @return [String] content without badges
      def remove_badges(content)
        # Remove linked badges
        content = content.gsub(/\[\!\[([^\]]*)\]\([^\)]*(?:badge|shield|svg|travis|coveralls|fury)[^\)]*\)\]\([^\)]*\)/i, '')
        # Remove standalone badges
        content = content.gsub(/!\[([^\]]*)\]\([^\)]*(?:badge|shield|svg|travis|coveralls|fury)[^\)]*\)/i, '')
        content
      end

      # Remove code blocks and inline code
      #
      # @param content [String] markdown content
      # @return [String] content without code
      def remove_code_examples(content)
        # Remove fenced code blocks
        content = content.gsub(/^```.*?^```/m, '')
        content = content.gsub(/^~~~.*?^~~~/m, '')
        # Remove indented code blocks line by line. NOTE: no /m flag here.
        # In Ruby, ^ and $ already anchor at line boundaries; with /m the
        # greedy ".+$" lets "." cross newlines, so a single indented line
        # would delete everything from that line to the end of the document.
        content = content.gsub(/^([ ]{4,}|\t).+$/, '')
        # Remove inline code
        content = content.gsub(/`[^`]+`/, '')
        content
      end

      # Remove image syntax (inline and reference-style)
      #
      # @param content [String] markdown content
      # @return [String] content without images
      def remove_images(content)
        # Remove inline images
        content = content.gsub(/!\[([^\]]*)\]\([^\)]+\)/, '')
        # Remove reference-style images
        content = content.gsub(/!\[([^\]]*)\]\[[^\]]+\]/, '')
        content
      end

      # Remove blockquote markers (">") from line starts, keeping the text
      #
      # @param content [String] markdown content
      # @return [String] content without blockquote markers
      def remove_blockquotes(content)
        content.gsub(/^>\s?/, '')
      end
    end
  end
end
@@ -0,0 +1,95 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  module Transformers
    # Transformer for document enhancements
    #
    # Adds helpful features like table of contents and custom instructions
    # to improve LLM navigation and context understanding.
    #
    # @api public
    class EnhancementTransformer
      include BaseTransformer

      # Transform content by adding enhancements
      #
      # @param content [String] markdown content
      # @param options [Hash] transformation options
      # @option options [Boolean] :generate_toc generate table of contents
      # @option options [String] :custom_instruction custom instruction text
      # @option options [Boolean] :remove_blockquotes whether blockquotes are being removed
      # @return [String] transformed content
      def transform(content, options = {})
        result = content.dup

        if options[:custom_instruction]
          result = inject_custom_instruction(result, options[:custom_instruction], options[:remove_blockquotes])
        end
        result = generate_table_of_contents(result) if options[:generate_toc]

        result
      end

      private

      # Generate table of contents from markdown headings
      #
      # Headings found inside fenced code blocks are ignored, so language
      # comments such as "# foo" in code examples do not produce TOC entries.
      #
      # @param content [String] markdown content
      # @return [String] content with TOC inserted after the title (or prepended)
      def generate_table_of_contents(content)
        # Scan a copy with fenced code blocks blanked out; the original
        # content is returned untouched apart from the inserted TOC.
        scannable = content.gsub(/^```.*?^```/m, '')

        headings = []
        scannable.scan(/^(\#{1,6})\s+(.+)$/) do
          level = ::Regexp.last_match(1).length
          title = ::Regexp.last_match(2).strip

          # GitHub-style anchor: lowercase, strip punctuation, hyphenate spaces
          anchor = title.downcase
                        .gsub(/[^\w\s-]/, '')
                        .gsub(/\s+/, '-')

          headings << { level: level, title: title, anchor: anchor }
        end

        return content if headings.empty?

        toc = ["## Table of Contents\n"]

        headings.each do |heading|
          # Skip the document title (a leading H1) in the TOC itself
          next if heading[:level] == 1 && headings.first == heading

          indent = ' ' * (heading[:level] - 1)
          toc << "#{indent}- [#{heading[:title]}](##{heading[:anchor]})"
        end

        toc << "\n---\n"

        if content.match(/^#\s+(.+)$/)
          # Insert the TOC right after the first H1 title line
          content.sub(/^(#\s+.+\n)/, "\\1\n#{toc.join("\n")}\n")
        else
          "#{toc.join("\n")}\n\n#{content}"
        end
      end

      # Inject custom instruction at document top
      #
      # @param content [String] markdown content
      # @param instruction [String] instruction text
      # @param remove_blockquotes [Boolean] when true, format without a
      #   blockquote so the note survives later blockquote stripping
      # @return [String] content with instruction prepended
      def inject_custom_instruction(content, instruction, remove_blockquotes = false)
        return content if instruction.nil? || instruction.empty?

        formatted_instruction = if remove_blockquotes
                                  "**AI Context**: #{instruction}\n\n---\n\n"
                                else
                                  "> **AI Context**: #{instruction}\n\n---\n\n"
                                end

        if content.match(/^#\s+(.+?)\n/)
          # Place the instruction right after the document title
          content.sub(/^(#\s+.+?\n)/, "\\1\n#{formatted_instruction}")
        else
          "#{formatted_instruction}#{content}"
        end
      end
    end
  end
end
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  module Transformers
    # Transformer for link-related operations
    #
    # Handles expansion of relative links to absolute URLs and
    # conversion of HTML URLs to markdown format.
    #
    # @api public
    class LinkTransformer
      include BaseTransformer

      # Transform links in content
      #
      # @param content [String] markdown content
      # @param options [Hash] transformation options
      # @option options [String] :base_url base URL for expanding relative links
      # @option options [Boolean] :convert_urls convert HTML URLs to markdown format
      # @option options [Boolean] :simplify_links strip boilerplate phrases from link text
      # @return [String] transformed content
      def transform(content, options = {})
        result = content.dup

        result = expand_relative_links(result, options[:base_url]) if options[:base_url]
        result = convert_html_urls(result) if options[:convert_urls]
        result = simplify_links(result) if options[:simplify_links]

        result
      end

      private

      # Expand relative links to absolute URLs
      #
      # Already-absolute links are left untouched: protocol-relative ("//"),
      # in-page anchors ("#"), and any URL carrying a scheme (http:, https:,
      # mailto:, ftp:, ...) per RFC 3986 scheme syntax. Checking for a scheme
      # rather than only http(s) prevents mangling of mailto:/ftp: links.
      #
      # @param content [String] markdown content
      # @param base_url [String] base URL for expansion
      # @return [String] content with expanded links
      def expand_relative_links(content, base_url)
        content.gsub(/\[([^\]]+)\]\(([^)]+)\)/) do |match|
          text = ::Regexp.last_match(1)
          url = ::Regexp.last_match(2)

          if url.start_with?('//', '#') || url.match?(/\A[a-z][a-z0-9+.\-]*:/i)
            match
          else
            clean_url = url.gsub(%r{^\./}, '')
            # File.join always joins with "/", which is what URLs need
            expanded_url = File.join(base_url, clean_url)
            "[#{text}](#{expanded_url})"
          end
        end
      end

      # Convert HTML URLs to markdown format
      #
      # Rewrites any http(s) URL ending in .html/.htm to end in .md instead.
      #
      # @param content [String] markdown content
      # @return [String] content with converted URLs
      def convert_html_urls(content)
        content.gsub(%r{https?://[^\s<>]+\.html?(?=[)\s]|$)}) do |url|
          url.sub(/\.html?$/, '.md')
        end
      end

      # Simplify verbose link text
      #
      # Strips filler phrases like "click here to", "see", "read more about"
      # and trailing "here"/"documentation"/"docs". Falls back to the
      # original text if stripping would leave it empty.
      #
      # @param content [String] markdown content
      # @return [String] content with simplified links
      def simplify_links(content)
        content.gsub(/\[([^\]]+)\]\(([^)]+)\)/) do
          text = ::Regexp.last_match(1)
          url = ::Regexp.last_match(2)

          simplified_text = text
                            .gsub(/^(click here to|see|read more about|check out|visit)\s+(the\s+)?/i, '')
                            .gsub(/\s+(here|documentation|docs)$/i, '')
                            .strip

          simplified_text = text if simplified_text.empty?

          "[#{simplified_text}](#{url})"
        end
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
module LlmDocsBuilder
  module Transformers
    # Transformer for whitespace normalization
    #
    # Reduces excessive blank lines and trailing whitespace to make
    # content more compact for LLM consumption.
    #
    # @api public
    class WhitespaceTransformer
      include BaseTransformer

      # Transform content by normalizing whitespace
      #
      # @param content [String] markdown content
      # @param options [Hash] transformation options
      # @option options [Boolean] :normalize_whitespace enable normalization
      # @return [String] transformed content (unchanged unless enabled)
      def transform(content, options = {})
        return content unless options[:normalize_whitespace]

        normalize_whitespace(content)
      end

      private

      # Normalize excessive whitespace
      #
      # @param content [String] markdown content
      # @return [String] content with normalized whitespace
      def normalize_whitespace(content)
        # Remove trailing whitespace from each line. The character class
        # includes \t so tab-padded line endings are cleaned as well as
        # space-padded ones.
        content = content.gsub(/[ \t]+$/, '')

        # Reduce runs of 3+ consecutive blank lines to a maximum of 2
        content = content.gsub(/\n{4,}/, "\n\n\n")

        # Trim leading and trailing whitespace around the whole document
        content.strip
      end
    end
  end
end
@@ -2,5 +2,5 @@
2
2
 
3
3
  module LlmDocsBuilder
4
4
  # Current version of the LlmDocsBuilder gem
5
- VERSION = '0.3.0'
5
+ VERSION = '0.7.0'
6
6
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: llm-docs-builder
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Maciej Mensfeld
@@ -113,10 +113,10 @@ files:
113
113
  - ".github/workflows/docker.yml"
114
114
  - ".github/workflows/push.yml"
115
115
  - ".gitignore"
116
+ - ".rspec"
116
117
  - ".rubocop.yml"
117
118
  - ".ruby-version"
118
119
  - CHANGELOG.md
119
- - CLAUDE.md
120
120
  - Dockerfile
121
121
  - Gemfile
122
122
  - Gemfile.lock
@@ -133,11 +133,18 @@ files:
133
133
  - lib/llm_docs_builder/errors.rb
134
134
  - lib/llm_docs_builder/generator.rb
135
135
  - lib/llm_docs_builder/markdown_transformer.rb
136
+ - lib/llm_docs_builder/output_formatter.rb
136
137
  - lib/llm_docs_builder/parser.rb
138
+ - lib/llm_docs_builder/text_compressor.rb
139
+ - lib/llm_docs_builder/token_estimator.rb
140
+ - lib/llm_docs_builder/transformers/base_transformer.rb
141
+ - lib/llm_docs_builder/transformers/content_cleanup_transformer.rb
142
+ - lib/llm_docs_builder/transformers/enhancement_transformer.rb
143
+ - lib/llm_docs_builder/transformers/link_transformer.rb
144
+ - lib/llm_docs_builder/transformers/whitespace_transformer.rb
137
145
  - lib/llm_docs_builder/validator.rb
138
146
  - lib/llm_docs_builder/version.rb
139
147
  - llm-docs-builder.gemspec
140
- - llm-docs-builder.yml
141
148
  - llm-docs-builder.yml.example
142
149
  - renovate.json
143
150
  homepage: https://github.com/mensfeld/llm-docs-builder