smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
@@ -0,0 +1,154 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "digest"
5
+ require "json"
6
+ require_relative "common"
7
+
8
+ include Examples::Common
9
+
10
# Runs hybrid searches whose query text and filters are derived from a
# caller-supplied context hash.
class ContextualSearch
  # Extra keywords appended to the query for each recognised :domain value.
  DOMAIN_KEYWORDS = {
    "healthcare" => "medical clinical",
    "finance" => "finance banking",
  }.freeze

  # @param [#search] smart_rag Client that performs the actual search
  def initialize(smart_rag)
    @smart_rag = smart_rag
  end

  # Search with a query enriched by the context's domain and restricted by
  # the context's document/tag lists.
  #
  # @param [String] query Raw user query
  # @param [Hash] user_context Optional :domain, :document_ids and
  #   :preferred_tags entries
  # @return [Object] Whatever the client's #search returns
  def search_with_context(query, user_context = {})
    @smart_rag.search(
      enhance_query(query, user_context),
      search_type: "hybrid",
      limit: 10,
      filters: build_filters(user_context),
    )
  end

  private

  # Append domain keywords to the query; unknown or missing domains leave the
  # query untouched.
  def enhance_query(query, context)
    keywords = DOMAIN_KEYWORDS[context[:domain]]
    keywords ? "#{query} #{keywords}" : query
  end

  # Translate context entries into search filters.
  #
  # Empty collections are skipped as well as missing ones: `[]` is truthy in
  # Ruby, so a plain truthiness check would forward an empty id list as an
  # active filter instead of meaning "no restriction".
  def build_filters(context)
    candidates = {
      document_ids: context[:document_ids],
      tag_ids: context[:preferred_tags],
    }
    candidates.reject do |_name, ids|
      !ids || (ids.respond_to?(:empty?) && ids.empty?)
    end
  end
end
47
+
48
# Runs a search and then passes the result through a chain of
# post-processing steps, in the order they were registered.
class SearchPipeline
  # @param [#search] smart_rag Client that performs the actual search
  def initialize(smart_rag)
    @smart_rag = smart_rag
    @processors = []
  end

  # Register a post-processing step. The block receives
  # (results, query, options) and must return the (possibly modified) results.
  #
  # @return [SearchPipeline] self, so registrations can be chained
  def add_processor(&block)
    @processors.push(block)
    self
  end

  # Execute the search and fold the result through every registered step.
  #
  # @param [String] query Search query
  # @param [Hash] options Options forwarded to the client and to each step
  # @return [Object] Output of the last step (or the raw search result)
  def search(query, options = {})
    raw = @smart_rag.search(query, options)
    @processors.reduce(raw) { |current, step| step.call(current, query, options) }
  end
end
65
+
66
# In-process cache for search results, keyed by query and options, with a
# per-entry time-to-live.
class MemorySearchCache
  # @param [Numeric] ttl_seconds Lifetime of an entry, in seconds
  def initialize(ttl_seconds: 300)
    @ttl_seconds = ttl_seconds
    @store = {}
  end

  # Return the cached payload for (query, options) while it is still fresh;
  # otherwise run the block, store its value and return it.
  #
  # @param [String] query Search query
  # @param [Hash] options Search options (key order does not matter)
  # @yieldreturn [Object] Freshly computed payload
  # @return [Object] Cached or freshly computed payload
  def fetch(query, options)
    key = cache_key(query, options)
    entry = @store[key]
    return entry[:payload] if entry && entry[:expires_at] > Time.now

    yield.tap do |fresh|
      @store[key] = { payload: fresh, expires_at: Time.now + @ttl_seconds }
    end
  end

  private

  # Digest of the query plus the options serialised with keys in sorted order.
  def cache_key(query, options)
    normalized = options.sort_by { |name, _value| name.to_s }.to_h
    Digest::MD5.hexdigest("#{query}:#{normalized.to_json}")
  end
end
90
+
91
# Minimal question-answering facade: retrieves context with a hybrid search
# and assembles a draft answer, its sources and a confidence value.
class QASystem
  # @param [#search] smart_rag Client that performs the actual search
  def initialize(smart_rag)
    @smart_rag = smart_rag
  end

  # @param [String] question Question to answer
  # @param [Integer] context_limit Maximum number of sections to retrieve
  # @return [Hash] :question, :answer, :sources and :confidence
  def answer(question, context_limit: 5)
    response = @smart_rag.search(
      question,
      search_type: "hybrid",
      limit: context_limit,
      include_content: true,
    )
    hits = response.fetch(:results, [])
    draft = generate_answer(hits)
    sources = hits.map { |hit| { section_id: hit[:section_id], title: hit[:section_title] } }

    # Confidence is the top hit's combined score, capped at 1.0.
    confidence =
      if hits.empty?
        0.0
      else
        [hits.first[:combined_score].to_f, 1.0].min
      end

    { question: question, answer: draft, sources: sources, confidence: confidence }
  end

  private

  # Build the draft answer from the first 400 characters of the joined
  # section contents.
  def generate_answer(results)
    if results.empty?
      return "I do not have enough information in the current knowledge base."
    end

    excerpt = results.map { |hit| hit[:content].to_s }.join("\n---\n")[0, 400]
    "Draft answer from retrieved context: #{excerpt}..."
  end
end
122
+
123
print_header("Advanced Patterns")
smart_rag = build_client

# 1. Context-aware search: the domain widens the query, the id lists become filters.
contextual_results = ContextualSearch.new(smart_rag).search_with_context(
  "risk assessment",
  domain: "finance",
  document_ids: [],
  preferred_tags: [],
)
puts "Contextual search results: #{contextual_results.fetch(:results, []).length}"

# 2. Pipeline with a single step that drops hits scoring below :min_score.
pipeline = SearchPipeline.new(smart_rag)
pipeline.add_processor do |payload, _query, opts|
  threshold = opts[:min_score] || 0.5
  kept = payload.fetch(:results, []).select do |hit|
    (hit[:combined_score] || hit[:similarity] || 0.0) >= threshold
  end
  payload[:results] = kept
  payload
end
pipeline_results = pipeline.search("neural networks", search_type: "hybrid", limit: 20, min_score: 0.7)
puts "Pipeline filtered results: #{pipeline_results.fetch(:results, []).length}"

# 3. Cache: the second identical fetch is served from memory.
cache = MemorySearchCache.new(ttl_seconds: 600)
cached_1, cached_2 = Array.new(2) do
  cache.fetch("deep learning", { limit: 5 }) { smart_rag.search("deep learning", limit: 5) }
end
puts "Cache demo result sizes: #{cached_1.fetch(:results, []).length}, #{cached_2.fetch(:results, []).length}"

# 4. Question answering on top of retrieval.
qa_response = QASystem.new(smart_rag).answer(
  "What are common applications of transformers in NLP?",
  context_limit: 3,
)
print_json("Q&A Response", qa_response)
154
+
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require_relative "common"
5
+
6
+ include Examples::Common
7
+
8
# Decorator around a SmartRAG client that retries selected failures with
# exponential backoff.
class RetryableSmartRAG
  # @param [Object] smart_rag Wrapped client
  # @param [Integer] retries Maximum number of attempts per call
  # @param [Numeric] base_interval Delay before the first retry, in seconds;
  #   doubled for each further retry
  def initialize(smart_rag, retries: 3, base_interval: 0.5)
    @smart_rag = smart_rag
    @retries = retries
    @base_interval = base_interval
  end

  # Add a document, retrying when embedding generation fails.
  def add_document(path, **options)
    with_retry(on: [SmartRAG::Errors::EmbeddingGenerationError]) { @smart_rag.add_document(path, options) }
  end

  # Search, retrying on database errors.
  def search(query, **options)
    with_retry(on: [SmartRAG::Errors::DatabaseError]) { @smart_rag.search(query, options) }
  end

  private

  # Run the block, retrying on the listed exception classes until @retries
  # attempts have been made; the last failure is re-raised. Other exceptions
  # propagate immediately.
  def with_retry(on:)
    attempt = 1
    begin
      yield
    rescue *on
      raise if attempt >= @retries

      # Backoff: base, 2 * base, 4 * base, ...
      sleep(@base_interval * (2**(attempt - 1)))
      attempt += 1
      retry
    end
  end
end
42
+
43
print_header("Error Handling And Retry")
smart_rag = build_client
retryable = RetryableSmartRAG.new(smart_rag)

begin
  # Intentional invalid call to show argument error handling.
  smart_rag.search("", search_type: "hybrid")
rescue StandardError => e
  # Classify the failure, most specific categories first.
  report =
    case e
    when SmartRAG::Errors::InvalidParameterError, SmartRAG::Errors::InvalidQueryError, ArgumentError
      "Argument error: #{e.message}"
    when SmartRAG::Errors::DatabaseError
      "Database error: #{e.message}"
    when SmartRAG::Errors::EmbeddingGenerationError
      "Embedding generation error: #{e.message}"
    when SmartRAG::Errors::DocumentProcessingError
      "Document processing error: #{e.message}"
    else
      "Unexpected error: #{e.class} - #{e.message}"
    end
  puts report
end

# A valid query (first CLI argument, if given) run through the retrying wrapper.
safe_query = ARGV[0] || "machine learning"
safe_result = retryable.search(safe_query, search_type: "hybrid", limit: 5)
puts "Retry search result count: #{safe_result.fetch(:results, []).length}"
@@ -0,0 +1,42 @@
1
+ # SmartRAG Examples
2
+
3
+ These examples are extracted and organized from `docs/USAGE_EXAMPLES.md`.
4
+
5
+ ## Prerequisites
6
+
7
+ 1. Install dependencies:
8
+ - `bundle install`
9
+ 2. Configure environment variables (or rely on defaults in `examples/common.rb`):
10
+ - `SMARTRAG_DB_HOST`
11
+ - `SMARTRAG_DB_NAME`
12
+ - `SMARTRAG_DB_USER`
13
+ - `SMARTRAG_DB_PASSWORD`
14
+ - `OPENAI_API_KEY`
15
+ 3. Ensure your database is migrated and has test data if needed.
16
+
17
+ ## Run Examples
18
+
19
+ - Quick start:
20
+ - `ruby examples/01_quick_start.rb`
21
+ - Document management:
22
+ - `ruby examples/02_document_management.rb test/python_basics.md`
23
+ - Optional delete demo:
24
+ - `DELETE=1 ruby examples/02_document_management.rb test/python_basics.md`
25
+ - Search operations:
26
+ - `ruby examples/03_search_operations.rb`
27
+ - Topics and tags:
28
+ - `ruby examples/04_topics_and_tags.rb`
29
+ - Advanced patterns:
30
+ - `ruby examples/05_advanced_patterns.rb`
31
+ - Error handling and retry:
32
+ - `ruby examples/06_error_handling_and_retry.rb`
33
+
34
+ ## Mapping To `docs/USAGE_EXAMPLES.md`
35
+
36
+ - `01_quick_start.rb`: Quick Start
37
+ - `02_document_management.rb`: Document Management
38
+ - `03_search_operations.rb`: Search Operations
39
+ - `04_topics_and_tags.rb`: Research Topic Management + Tag Management
40
+ - `05_advanced_patterns.rb`: Advanced Usage Patterns + Q&A + Caching
41
+ - `06_error_handling_and_retry.rb`: Error Handling + Retry Logic
42
+ - `common.rb`: shared setup/config/logging utilities
@@ -0,0 +1,57 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "logger"
5
+ require "dotenv"
6
+ require_relative "../lib/smart_rag"
7
+
8
module Examples
  # Setup, configuration and output helpers shared by the example scripts.
  module Common
    module_function

    # Load .env.local and .env through Dotenv.
    def load_env!
      Dotenv.load(".env.local", ".env")
    rescue StandardError
      # Keep examples usable even if dotenv loading fails unexpectedly.
      nil
    end

    # Build the client configuration from environment variables, falling back
    # to local defaults.
    #
    # @return [Hash] Configuration with :database and :llm sections
    def default_config
      load_env!

      database = {
        adapter: "postgresql",
        host: ENV.fetch("SMARTRAG_DB_HOST", "localhost"),
        port: ENV.fetch("SMARTRAG_DB_PORT", "5432").to_i,
        database: ENV.fetch("SMARTRAG_DB_NAME", "smart_rag_development"),
        user: ENV.fetch("SMARTRAG_DB_USER", "smart_rag_user"),
        password: ENV["SMARTRAG_DB_PASSWORD"],
      }
      llm = {
        provider: ENV.fetch("SMARTRAG_LLM_PROVIDER") { ENV.fetch("LLM_PROVIDER", "openai") },
        api_key: ENV.fetch("OPENAI_API_KEY") { ENV.fetch("LLM_API_KEY", "ollama-local") },
        endpoint: ENV.fetch("LLM_ENDPOINT", "http://localhost:11434/v1/chat/completions"),
        model: ENV.fetch("LLM_MODEL", "qwen3"),
      }

      { database: database, llm: llm }
    end

    # Create a client from default_config that logs to standard output.
    #
    # @param [Integer] log_level Logger severity threshold
    # @return [SmartRAG::SmartRAG] Configured client
    def build_client(log_level: Logger::INFO)
      SmartRAG::SmartRAG.new(default_config).tap do |client|
        client.logger = Logger.new($stdout)
        client.logger.level = log_level
      end
    end

    # Print a title framed by two 72-character rules, preceded by a blank line.
    def print_header(title)
      rule = "=" * 72
      puts "\n#{rule}", title, rule
    end

    # Print a label followed by the data rendered as pretty JSON.
    def print_json(name, data)
      require "json"
      puts "#{name}:", JSON.pretty_generate(data)
    end
  end
end
@@ -0,0 +1,315 @@
1
+ module SmartRAG
2
+ module Chunker
3
+ # MarkdownChunker splits markdown content into chunks based on headings
4
+ class MarkdownChunker
5
+ attr_reader :chunk_size, :overlap, :heading_levels
6
+
7
# Configure the chunker.
#
# @param [Integer] chunk_size Target chunk length, in characters
# @param [Integer] overlap Number of characters shared by consecutive size-based chunks
# @param [Array<Integer>] heading_levels Heading depths that open a new chunk (default: [1, 2, 3])
def initialize(chunk_size: 2000, overlap: 200, heading_levels: [1, 2, 3])
  @heading_levels = heading_levels
  @overlap = overlap
  @chunk_size = chunk_size
end
15
+
16
# Break markdown into chunks, preferring heading boundaries.
#
# Heading-based sections are used when the first one carries a title; any
# section longer than 1.5 x chunk_size is split further by size. Without
# titled sections the whole input is split purely by size.
#
# @param [String] markdown_content Content to chunk
# @return [Array<Hash>] Array of chunk hashes with :title and :content
def chunk(markdown_content)
  return [] if markdown_content.nil? || markdown_content.empty?

  sections = split_by_headings(markdown_content)
  titled = !sections.empty? && sections.first[:title]
  return split_by_size(markdown_content) unless titled

  size_limit = chunk_size * 1.5
  sections.flat_map do |section|
    section[:content].length > size_limit ? split_large_chunk(section) : [section]
  end
end
45
+
46
+ private
47
+
48
# Split content into sections at every heading whose level is listed in
# @heading_levels.
#
# The first level-1 heading is treated as the document title: it opens no
# section of its own, and text found between it and the first section heading
# is prepended to the first emitted section. When no section is emitted at
# all, everything after the first line is returned as one chunk titled after
# the document title.
#
# @param [String] content Markdown text (a leading UTF-8 BOM is ignored)
# @return [Array<Hash>] Chunks as { title:, content: }. When no configured
#   heading is present the result is a single chunk with a nil title (only if
#   more than 10 characters remain after cleaning), otherwise an empty array.
def split_by_headings(content)
  content = content.sub(/\A\uFEFF/, '')
  lines = content.lines
  return [] if lines.empty?

  heading_pattern = /^(#{@heading_levels.map { |level| '#' * level }.join('|')})\s+(.+)$/

  # No configured heading anywhere: hand back one untitled chunk so the
  # caller can fall back to size-based splitting.
  unless lines.any? { |line| line.match?(heading_pattern) }
    cleaned = clean_chunk_content(content)
    return cleaned.length > 10 ? [{ title: nil, content: cleaned }] : []
  end

  chunks = []
  section_start = nil   # index of the heading line of the open section
  section_title = nil
  document_title = nil  # title of the first H1, once seen
  intro_lines = []      # text between the document title and the first section
  intro_text = nil
  intro_pending = false # intro captured but not yet attached to a chunk

  # Close the open section, whose body runs up to (not including) `stop`.
  flush_section = lambda do |stop|
    body_lines = section_start.nil? ? [] : lines[(section_start + 1)...stop]
    unless body_lines.empty?
      body = clean_chunk_content(body_lines.join)
      if intro_pending
        body = body.empty? ? intro_text : [intro_text, body].join("\n\n")
        intro_pending = false
      end
      chunks << { title: section_title, content: body } unless body.empty?
    end
  end

  lines.each_with_index do |line, idx|
    match = line.match(heading_pattern)

    unless match
      intro_lines << line if document_title && section_start.nil?
      next
    end

    level = match[1].length
    title = match[2].strip

    # The first H1 names the document instead of opening a section.
    if document_title.nil? && level == 1
      document_title = title
      next
    end

    # Only relevant for an empty @heading_levels, where the pattern can match
    # lines that are not headings at all.
    next unless @heading_levels.include?(level)

    flush_section.call(idx)

    # First section heading after the document title: freeze the intro text.
    if document_title && section_start.nil? && intro_text.nil?
      cleaned_intro = clean_chunk_content(intro_lines.join)
      unless cleaned_intro.empty?
        intro_text = cleaned_intro
        intro_pending = true
      end
    end

    section_start = idx
    section_title = title
  end

  # The last section always runs to the end of the input. Stopping at the
  # next heading position instead would silently drop everything from a
  # document-title H1 that appears after the first section heading, because
  # that H1 never opens a section of its own.
  flush_section.call(lines.length)

  # Only the document title produced anything: emit the text after the first
  # line under that title.
  if document_title && chunks.empty? && lines.length > 1
    remainder = clean_chunk_content(lines[1..-1].join)
    chunks << { title: document_title, content: remainder } unless remainder.empty?
  end

  chunks
end
170
+
171
# Trim surrounding whitespace and drop any heading lines at the very start of
# the content. Headings further down are left in place.
#
# @param [String, nil] content Raw section text
# @return [String] Cleaned text; "" for nil, empty or whitespace-only input
def clean_chunk_content(content)
  trimmed = content.nil? ? "" : content.strip
  return "" if trimmed.empty?

  body = trimmed.lines.drop_while { |line| line =~ /^#+\s+/ }
  body.join.strip
end
188
+
189
+ # SPLIT_CONTENT
190
+
191
# Cut content into overlapping windows of roughly chunk_size characters,
# ignoring headings.
#
# Each window prefers to end just after a sentence terminator (., ! or ?)
# followed by whitespace, searched for in the last 20% of the window.
# Windows whose stripped text is 50 characters or shorter are discarded.
#
# @param [String, nil] content Text to split
# @return [Array<Hash>] Chunks as { title:, content: }
def split_by_size(content)
  return [] if content.nil? || content.empty?

  pieces = []
  total = content.length
  cursor = 0

  until cursor >= total
    stop = [cursor + chunk_size, total].min

    if stop < total
      # Walk back from the window end, but no further than 80% of the window.
      earliest = [cursor + chunk_size * 0.8, total].min.to_i
      terminator = (stop - 1).downto(earliest).find do |i|
        content[i] =~ /[.!?]/ && (i + 1 >= total || content[i + 1] =~ /\s/)
      end
      stop = terminator + 1 if terminator
    end

    text = content[cursor...stop].strip
    if text.length > 50
      pieces << { title: generate_chunk_title(text, pieces.length), content: text }
    end

    cursor =
      if stop < total
        # Step back by the overlap, but always advance at least 50 characters.
        [stop - [overlap, text.length].min, cursor + 50].max
      else
        total
      end
  end

  pieces
end
246
+
247
# Split an oversized heading section by size and label the pieces with the
# section's heading: the first piece keeps the heading as-is, later pieces
# get a "(Part N)" suffix.
#
# @param [Hash] chunk Section with :title and :content
# @return [Array<Hash>] Size-based sub-chunks carrying the derived titles
def split_large_chunk(chunk)
  heading = chunk[:title]
  pieces = split_by_size(chunk[:content])

  pieces.each.with_index(1) do |piece, part_number|
    piece[:title] = part_number == 1 ? heading : "#{heading} (Part #{part_number})"
  end

  pieces
end
263
+
264
# Derive a title for a size-based chunk.
#
# Preference order: the text of the first markdown heading anywhere in the
# chunk; the first sentence of the first line; the first line itself; and
# finally "Section N". Sentence and line titles are cut to 100 characters
# followed by "...".
#
# @param [String, nil] content Chunk text
# @param [Integer] index Zero-based chunk position, used for the fallback title
# @return [String, nil] Title, or nil for nil/empty content
def generate_chunk_title(content, index)
  return nil if content.nil? || content.empty?

  lines = content.strip.lines

  heading_line = lines.find { |line| line.match?(/^#+\s+(.+)$/) }
  return heading_line[/^#+\s+(.+)$/, 1].strip if heading_line

  opening = lines.first&.strip
  return "Section #{index + 1}" if opening.nil? || opening.empty?

  # Up to and including the first sentence terminator, else the whole line.
  label = (opening[/^(.*?[.!?])/, 1] || opening).strip
  label.length > 100 ? label[0...100] + '...' : label
end
289
+ end
290
+ end
291
+
292
# Monkey-patch String to add a backwards-scanning match helper
class ::String
  # Match the pattern against successively longer suffixes of the string,
  # starting with the suffix at start_pos and moving towards the beginning,
  # and return the first match found.
  #
  # With the default start_pos this yields the match that starts latest in
  # the string. With an explicit start_pos the suffix beginning there is
  # tried first, so a match located after start_pos is returned if one
  # exists. Offsets in the returned MatchData are relative to the suffix that
  # matched, not to the whole string.
  #
  # @param [Regexp] pattern Pattern to match
  # @param [Integer] start_pos Index of the first suffix to try (default: end
  #   of string; negative values count from the end)
  # @return [MatchData] Match, or nil when nothing matches or any error is raised
  def rmatch(pattern, start_pos = nil)
    origin = start_pos || length
    origin += length if origin < 0
    origin = length if origin > length

    origin.downto(0) do |offset|
      found = pattern.match(self[offset...length])
      return found if found
    end

    nil
  rescue
    nil
  end
end
315
+ end