smart_rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +33 -0
- data/README.en.md +115 -0
- data/README.md +144 -0
- data/config/database.yml +42 -0
- data/config/fulltext_search.yml +111 -0
- data/config/llm_config.yml +15 -0
- data/config/smart_rag.yml +156 -0
- data/db/fix_search_issues.sql +81 -0
- data/db/migrations/001_create_source_documents.rb +26 -0
- data/db/migrations/002_create_source_sections.rb +20 -0
- data/db/migrations/003_create_tags.rb +17 -0
- data/db/migrations/004_create_research_topics.rb +16 -0
- data/db/migrations/005_create_relationship_tables.rb +42 -0
- data/db/migrations/006_create_text_search_configs.rb +28 -0
- data/db/migrations/007_create_section_fts.rb +109 -0
- data/db/migrations/008_create_embeddings.rb +28 -0
- data/db/migrations/009_create_search_logs.rb +30 -0
- data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
- data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
- data/db/rebuild_fts_complete.sql +51 -0
- data/db/seeds/text_search_configs.sql +28 -0
- data/examples/01_quick_start.rb +32 -0
- data/examples/02_document_management.rb +41 -0
- data/examples/03_search_operations.rb +46 -0
- data/examples/04_topics_and_tags.rb +38 -0
- data/examples/05_advanced_patterns.rb +154 -0
- data/examples/06_error_handling_and_retry.rb +64 -0
- data/examples/README.md +42 -0
- data/examples/common.rb +57 -0
- data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
- data/lib/smart_rag/config.rb +126 -0
- data/lib/smart_rag/core/document_processor.rb +537 -0
- data/lib/smart_rag/core/embedding.rb +340 -0
- data/lib/smart_rag/core/fulltext_manager.rb +483 -0
- data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
- data/lib/smart_rag/core/query_processor.rb +577 -0
- data/lib/smart_rag/errors.rb +88 -0
- data/lib/smart_rag/models/embedding.rb +140 -0
- data/lib/smart_rag/models/model_base.rb +106 -0
- data/lib/smart_rag/models/research_topic.rb +171 -0
- data/lib/smart_rag/models/research_topic_section.rb +86 -0
- data/lib/smart_rag/models/research_topic_tag.rb +89 -0
- data/lib/smart_rag/models/search_log.rb +198 -0
- data/lib/smart_rag/models/section_fts.rb +170 -0
- data/lib/smart_rag/models/section_tag.rb +81 -0
- data/lib/smart_rag/models/source_document.rb +204 -0
- data/lib/smart_rag/models/source_section.rb +201 -0
- data/lib/smart_rag/models/tag.rb +214 -0
- data/lib/smart_rag/models/text_search_config.rb +168 -0
- data/lib/smart_rag/models.rb +116 -0
- data/lib/smart_rag/parsers/query_parser.rb +291 -0
- data/lib/smart_rag/retrieve.rb +745 -0
- data/lib/smart_rag/services/embedding_service.rb +278 -0
- data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
- data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
- data/lib/smart_rag/services/summarization_service.rb +322 -0
- data/lib/smart_rag/services/tag_service.rb +614 -0
- data/lib/smart_rag/services/vector_search_service.rb +347 -0
- data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
- data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
- data/lib/smart_rag/smart_chunking/merger.rb +94 -0
- data/lib/smart_rag/smart_chunking/parser.rb +75 -0
- data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
- data/lib/smart_rag/smart_chunking/section.rb +11 -0
- data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
- data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
- data/lib/smart_rag/version.rb +3 -0
- data/lib/smart_rag.rb +986 -0
- data/workers/analyze_content.rb +6 -0
- data/workers/get_embedding.rb +7 -0
- metadata +311 -0
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require "digest"
|
|
5
|
+
require "json"
|
|
6
|
+
require_relative "common"
|
|
7
|
+
|
|
8
|
+
include Examples::Common
|
|
9
|
+
|
|
10
|
+
# Wraps a SmartRAG client and augments queries and filters with caller
# context before delegating to a hybrid search.
class ContextualSearch
  # Domain-specific keywords appended to a query to bias retrieval.
  DOMAIN_SUFFIXES = {
    "healthcare" => "medical clinical",
    "finance" => "finance banking",
  }.freeze

  def initialize(smart_rag)
    @smart_rag = smart_rag
  end

  # Runs a hybrid search for +query+, enriched by +user_context+
  # (recognized keys: :domain, :document_ids, :preferred_tags).
  def search_with_context(query, user_context = {})
    @smart_rag.search(
      enhance_query(query, user_context),
      search_type: "hybrid",
      limit: 10,
      filters: build_filters(user_context),
    )
  end

  private

  # Appends domain keywords when the context names a known :domain;
  # otherwise returns the query untouched.
  def enhance_query(query, context)
    suffix = DOMAIN_SUFFIXES[context[:domain]]
    suffix ? "#{query} #{suffix}" : query
  end

  # Translates context keys into the filter hash the search API expects,
  # keeping only entries whose values are truthy.
  def build_filters(context)
    {
      document_ids: context[:document_ids],
      tag_ids: context[:preferred_tags],
    }.select { |_key, value| value }
  end
end
|
|
47
|
+
|
|
48
|
+
# Composable post-processing around a SmartRAG search: each registered
# processor receives (results, query, options) and returns new results.
class SearchPipeline
  def initialize(smart_rag)
    @smart_rag = smart_rag
    @processors = []
  end

  # Registers a post-processor block; returns self so calls can be chained.
  def add_processor(&block)
    @processors.push(block)
    self
  end

  # Executes the underlying search, then threads the result through every
  # processor in registration order.
  def search(query, options = {})
    initial = @smart_rag.search(query, options)
    @processors.reduce(initial) do |acc, processor|
      processor.call(acc, query, options)
    end
  end
end
|
|
65
|
+
|
|
66
|
+
# Simple in-process TTL cache for search results, keyed by query + options.
#
# Not thread-safe; intended for single-threaded example scripts.
class MemorySearchCache
  # @param [Numeric] ttl_seconds How long a cached payload stays valid.
  def initialize(ttl_seconds: 300)
    @ttl_seconds = ttl_seconds
    @store = {}
  end

  # Returns the cached payload for (query, options) when still fresh;
  # otherwise invokes the block, caches its result for @ttl_seconds,
  # and returns it.
  def fetch(query, options)
    key = cache_key(query, options)
    cached = @store[key]
    if cached
      return cached[:payload] if cached[:expires_at] > Time.now

      # Fix: drop stale entries instead of leaving them in the store forever,
      # so distinct expired keys do not accumulate without bound.
      @store.delete(key)
    end

    payload = yield
    @store[key] = { payload: payload, expires_at: Time.now + @ttl_seconds }
    payload
  end

  private

  # Stable digest of the query plus options sorted by key name, so hashes
  # with the same entries in a different order share one cache slot.
  def cache_key(query, options)
    Digest::MD5.hexdigest("#{query}:#{options.sort_by { |k, _| k.to_s }.to_h.to_json}")
  end
end
|
|
90
|
+
|
|
91
|
+
# Minimal retrieval-augmented Q&A: searches for supporting sections and
# assembles a draft answer plus source attributions and a confidence score.
class QASystem
  def initialize(smart_rag)
    @smart_rag = smart_rag
  end

  # Answers +question+ from up to +context_limit+ retrieved sections.
  # Returns a hash with :question, :answer, :sources and :confidence.
  def answer(question, context_limit: 5)
    response = @smart_rag.search(
      question,
      search_type: "hybrid",
      limit: context_limit,
      include_content: true,
    )
    hits = response.fetch(:results, [])

    {
      question: question,
      answer: generate_answer(hits),
      sources: hits.map { |hit| { section_id: hit[:section_id], title: hit[:section_title] } },
      confidence: confidence_for(hits),
    }
  end

  private

  # Top hit's combined score capped at 1.0; 0.0 when there are no hits.
  def confidence_for(hits)
    return 0.0 if hits.empty?

    [hits.first[:combined_score].to_f, 1.0].min
  end

  # Concatenates retrieved section bodies and truncates to a short draft.
  def generate_answer(hits)
    return "I do not have enough information in the current knowledge base." if hits.empty?

    context = hits.map { |hit| hit[:content].to_s }.join("\n---\n")
    "Draft answer from retrieved context: #{context[0, 400]}..."
  end
end
|
|
122
|
+
|
|
123
|
+
print_header("Advanced Patterns")
client = build_client

# Context-aware search: bias the query and filters by caller context.
context_search = ContextualSearch.new(client)
context_hits = context_search.search_with_context(
  "risk assessment",
  domain: "finance",
  document_ids: [],
  preferred_tags: [],
)
puts "Contextual search results: #{context_hits.fetch(:results, []).length}"

# Post-processing pipeline: drop hits scoring below options[:min_score].
score_pipeline = SearchPipeline.new(client)
score_pipeline.add_processor do |payload, _query, options|
  threshold = options[:min_score] || 0.5
  payload[:results] = payload.fetch(:results, []).select do |hit|
    (hit[:combined_score] || hit[:similarity] || 0.0) >= threshold
  end
  payload
end
filtered = score_pipeline.search("neural networks", search_type: "hybrid", limit: 20, min_score: 0.7)
puts "Pipeline filtered results: #{filtered.fetch(:results, []).length}"

# TTL cache demo: the second fetch is served from memory, not the client.
result_cache = MemorySearchCache.new(ttl_seconds: 600)
hit_a = result_cache.fetch("deep learning", { limit: 5 }) { client.search("deep learning", limit: 5) }
hit_b = result_cache.fetch("deep learning", { limit: 5 }) { client.search("deep learning", limit: 5) }
puts "Cache demo result sizes: #{hit_a.fetch(:results, []).length}, #{hit_b.fetch(:results, []).length}"

# Retrieval-augmented Q&A over the indexed corpus.
qa_system = QASystem.new(client)
qa_response = qa_system.answer("What are common applications of transformers in NLP?", context_limit: 3)
print_json("Q&A Response", qa_response)
|
|
154
|
+
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require_relative "common"
|
|
5
|
+
|
|
6
|
+
include Examples::Common
|
|
7
|
+
|
|
8
|
+
# Decorator around a SmartRAG client that retries transient failures with
# exponential backoff (base_interval doubling on each attempt).
class RetryableSmartRAG
  # @param [Object] smart_rag Underlying client to delegate to.
  # @param [Integer] retries Maximum total attempts (not re-attempts).
  # @param [Numeric] base_interval Seconds slept before the first retry.
  def initialize(smart_rag, retries: 3, base_interval: 0.5)
    @smart_rag = smart_rag
    @retries = retries
    @base_interval = base_interval
  end

  # Adds a document, retrying on embedding-generation failures.
  def add_document(path, **options)
    with_retry(on: [SmartRAG::Errors::EmbeddingGenerationError]) do
      # Fix: forward with ** so options captured as keywords arrive as
      # keywords. Under Ruby 3's keyword separation, passing the bare hash
      # positionally would break a callee that declares keyword parameters;
      # for a callee taking a trailing options hash the result is unchanged.
      @smart_rag.add_document(path, **options)
    end
  end

  # Searches, retrying on transient database failures.
  def search(query, **options)
    with_retry(on: [SmartRAG::Errors::DatabaseError]) do
      @smart_rag.search(query, **options)
    end
  end

  private

  # Runs the block, retrying when one of the exception classes in +on+ is
  # raised, up to @retries total attempts. Sleeps with exponential backoff
  # between attempts and re-raises once the attempt budget is exhausted.
  def with_retry(on:)
    attempt = 0
    begin
      attempt += 1
      yield
    rescue *on => e
      raise e if attempt >= @retries

      sleep @base_interval * (2**(attempt - 1))
      retry
    end
  end
end
|
|
42
|
+
|
|
43
|
+
print_header("Error Handling And Retry")
client = build_client
retry_client = RetryableSmartRAG.new(client)

begin
  # Intentional invalid call to show argument error handling.
  client.search("", search_type: "hybrid")
rescue SmartRAG::Errors::InvalidParameterError, SmartRAG::Errors::InvalidQueryError, ArgumentError => e
  puts "Argument error: #{e.message}"
rescue SmartRAG::Errors::DatabaseError => e
  puts "Database error: #{e.message}"
rescue SmartRAG::Errors::EmbeddingGenerationError => e
  puts "Embedding generation error: #{e.message}"
rescue SmartRAG::Errors::DocumentProcessingError => e
  puts "Document processing error: #{e.message}"
rescue StandardError => e
  puts "Unexpected error: #{e.class} - #{e.message}"
end

# A valid query goes through the retrying wrapper.
query = ARGV[0] || "machine learning"
outcome = retry_client.search(query, search_type: "hybrid", limit: 5)
puts "Retry search result count: #{outcome.fetch(:results, []).length}"
|
data/examples/README.md
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# SmartRAG Examples
|
|
2
|
+
|
|
3
|
+
These examples are extracted and organized from `docs/USAGE_EXAMPLES.md`.
|
|
4
|
+
|
|
5
|
+
## Prerequisites
|
|
6
|
+
|
|
7
|
+
1. Install dependencies:
|
|
8
|
+
- `bundle install`
|
|
9
|
+
2. Configure environment variables (or rely on defaults in `examples/common.rb`):
|
|
10
|
+
- `SMARTRAG_DB_HOST`
|
|
11
|
+
- `SMARTRAG_DB_NAME`
|
|
12
|
+
- `SMARTRAG_DB_USER`
|
|
13
|
+
- `SMARTRAG_DB_PASSWORD`
|
|
14
|
+
- `OPENAI_API_KEY`
|
|
15
|
+
3. Ensure your database is migrated and has test data if needed.
|
|
16
|
+
|
|
17
|
+
## Run Examples
|
|
18
|
+
|
|
19
|
+
- Quick start:
|
|
20
|
+
- `ruby examples/01_quick_start.rb`
|
|
21
|
+
- Document management:
|
|
22
|
+
- `ruby examples/02_document_management.rb test/python_basics.md`
|
|
23
|
+
- Optional delete demo:
|
|
24
|
+
- `DELETE=1 ruby examples/02_document_management.rb test/python_basics.md`
|
|
25
|
+
- Search operations:
|
|
26
|
+
- `ruby examples/03_search_operations.rb`
|
|
27
|
+
- Topics and tags:
|
|
28
|
+
- `ruby examples/04_topics_and_tags.rb`
|
|
29
|
+
- Advanced patterns:
|
|
30
|
+
- `ruby examples/05_advanced_patterns.rb`
|
|
31
|
+
- Error handling and retry:
|
|
32
|
+
- `ruby examples/06_error_handling_and_retry.rb`
|
|
33
|
+
|
|
34
|
+
## Mapping To `docs/USAGE_EXAMPLES.md`
|
|
35
|
+
|
|
36
|
+
- `01_quick_start.rb`: Quick Start
|
|
37
|
+
- `02_document_management.rb`: Document Management
|
|
38
|
+
- `03_search_operations.rb`: Search Operations
|
|
39
|
+
- `04_topics_and_tags.rb`: Research Topic Management + Tag Management
|
|
40
|
+
- `05_advanced_patterns.rb`: Advanced Usage Patterns + Q&A + Caching
|
|
41
|
+
- `06_error_handling_and_retry.rb`: Error Handling + Retry Logic
|
|
42
|
+
- `common.rb`: shared setup/config/logging utilities
|
data/examples/common.rb
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require "logger"
|
|
5
|
+
require "dotenv"
|
|
6
|
+
require_relative "../lib/smart_rag"
|
|
7
|
+
|
|
8
|
+
module Examples
  # Shared helpers for the example scripts: environment loading, client
  # construction, and console output formatting.
  module Common
    module_function

    # Loads .env.local then .env; failures are deliberately swallowed so the
    # examples still run when no dotenv files are present.
    def load_env!
      Dotenv.load(".env.local", ".env")
    rescue StandardError
      # Keep examples usable even if dotenv loading fails unexpectedly.
    end

    # Builds the SmartRAG configuration hash from environment variables,
    # falling back to local-development defaults.
    def default_config
      load_env!
      { database: database_config, llm: llm_config }
    end

    # Creates a SmartRAG client that logs to stdout at the given level.
    def build_client(log_level: Logger::INFO)
      SmartRAG::SmartRAG.new(default_config).tap do |client|
        client.logger = Logger.new($stdout)
        client.logger.level = log_level
      end
    end

    # Prints a banner: blank line, rule, title, rule.
    def print_header(title)
      rule = "=" * 72
      puts "\n#{rule}"
      puts title
      puts rule
    end

    # Pretty-prints +data+ as JSON under a labelled heading.
    def print_json(name, data)
      require "json"
      puts "#{name}:"
      puts JSON.pretty_generate(data)
    end

    # PostgreSQL connection settings sourced from the environment.
    def database_config
      {
        adapter: "postgresql",
        host: ENV["SMARTRAG_DB_HOST"] || "localhost",
        port: (ENV["SMARTRAG_DB_PORT"] || "5432").to_i,
        database: ENV["SMARTRAG_DB_NAME"] || "smart_rag_development",
        user: ENV["SMARTRAG_DB_USER"] || "smart_rag_user",
        password: ENV["SMARTRAG_DB_PASSWORD"],
      }
    end

    # LLM provider settings; defaults target a local Ollama endpoint.
    def llm_config
      {
        provider: ENV["SMARTRAG_LLM_PROVIDER"] || ENV["LLM_PROVIDER"] || "openai",
        api_key: ENV["OPENAI_API_KEY"] || ENV["LLM_API_KEY"] || "ollama-local",
        endpoint: ENV["LLM_ENDPOINT"] || "http://localhost:11434/v1/chat/completions",
        model: ENV["LLM_MODEL"] || "qwen3",
      }
    end
  end
end
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
module SmartRAG
  module Chunker
    # MarkdownChunker splits markdown content into chunks based on headings,
    # falling back to size-based splitting (with sentence-boundary snapping
    # and overlap) when no usable headings exist.
    class MarkdownChunker
      attr_reader :chunk_size, :overlap, :heading_levels

      # @param [Integer] chunk_size Target size for each chunk (in characters)
      # @param [Integer] overlap Overlap between chunks (in characters)
      # @param [Array<Integer>] heading_levels Which heading levels to split on (default: [1, 2, 3])
      def initialize(chunk_size: 2000, overlap: 200, heading_levels: [1, 2, 3])
        @chunk_size = chunk_size
        @overlap = overlap
        @heading_levels = heading_levels
      end

      # Split markdown content into chunks
      # @param [String] markdown_content Content to chunk
      # @return [Array<Hash>] Array of chunk hashes with :title and :content
      def chunk(markdown_content)
        return [] if markdown_content.nil? || markdown_content.empty?

        # First, try to split by headings
        heading_chunks = split_by_headings(markdown_content)

        # If we have heading-based chunks with actual headings, use them
        # Otherwise, fall back to size-based splitting
        if heading_chunks.length > 0 && heading_chunks.first[:title]
          # Further split large heading chunks by size.
          # NOTE: the block parameter shadows this method's name; it is a
          # per-section hash, not a recursive call.
          result = []
          heading_chunks.each do |chunk|
            if chunk[:content].length > chunk_size * 1.5
              # Split large chunk further
              sub_chunks = split_large_chunk(chunk)
              result.concat(sub_chunks)
            else
              result << chunk
            end
          end
          result
        else
          # No headings found, use size-based splitting
          split_by_size(markdown_content)
        end
      end

      private

      # Split content by markdown headings - Handles nested headings properly.
      # Returns [{ title:, content: }, ...]; the first H1 is treated as the
      # document title and intro text before the first section heading is
      # folded into the first chunk.
      def split_by_headings(content)
        chunks = []
        # Strip a UTF-8 BOM so a heading on the first line still matches.
        content = content.sub(/\A\uFEFF/, '')
        lines = content.lines
        return chunks if lines.empty?

        # Check if we have any headings.
        # Alternation order ('#'|'##'|'###') is safe because the required
        # \s+ after the capture forces backtracking to the longest prefix.
        heading_pattern = /^(#{@heading_levels.map { |l| '#' * l }.join('|')})\s+(.+)$/

        # Find all heading positions first
        heading_positions = []
        lines.each_with_index do |line, idx|
          if line.match?(heading_pattern)
            heading_positions << idx
          end
        end

        # If no headings, use fallback to size-based splitting.
        # (Content shorter than ~10 chars after cleaning is dropped.)
        if heading_positions.empty?
          cleaned = clean_chunk_content(content)
          return cleaned.length > 10 ? [{ title: nil, content: cleaned }] : []
        end

        # Build chunks based on heading hierarchy
        # Split at any heading that matches our configured heading levels
        current_chunk_start = nil
        current_chunk_title = nil
        first_h1_processed = false
        first_h1_title = nil
        first_h1_skipped = false
        intro_lines = []
        intro_text = nil
        intro_pending = false

        lines.each_with_index do |line, idx|
          # Check if this line is a heading
          if (match = line.match(heading_pattern))
            heading_level = match[1].length
            heading_title = match[2].strip

            # Handle the first H1 - treat as document title (skip for chunking)
            if !first_h1_processed && heading_level == 1
              first_h1_title = heading_title
              first_h1_processed = true
              first_h1_skipped = true
              next
            end

            # Check if this heading level should create a chunk
            if @heading_levels.include?(heading_level) && !(idx == 0 && heading_level == 1)
              # Save previous chunk if it exists
              if current_chunk_start && current_chunk_title
                # Extract content from start of this section to current line
                content_lines = lines[current_chunk_start...idx]
                if content_lines && content_lines.length > 1
                  # Remove the heading line
                  section_content = clean_chunk_content(content_lines[1..-1].join)
                  # Prepend any captured pre-section intro text exactly once.
                  if intro_pending && intro_text && section_content.length > 0
                    section_content = [intro_text, section_content].join("\n\n")
                    intro_pending = false
                  elsif intro_pending && intro_text && section_content.empty?
                    section_content = intro_text
                    intro_pending = false
                  end
                  if section_content.length > 0
                    chunks << { title: current_chunk_title, content: section_content }
                  end
                end
              end

              # Capture intro text (lines between the document-title H1 and
              # the first section heading) the first time we reach a heading.
              if first_h1_skipped && current_chunk_start.nil? && intro_text.nil?
                cleaned_intro = clean_chunk_content(intro_lines.join)
                if cleaned_intro.length > 0
                  intro_text = cleaned_intro
                  intro_pending = true
                end
              end

              # Start new chunk
              current_chunk_start = idx
              current_chunk_title = heading_title
            end
          elsif first_h1_skipped && current_chunk_start.nil?
            intro_lines << line
          end
        end

        # Save the last chunk
        if current_chunk_start && current_chunk_title
          end_idx = heading_positions[heading_positions.index(current_chunk_start) + 1] || lines.length
          content_lines = lines[current_chunk_start...end_idx]
          if content_lines && content_lines.length > 1
            section_content = clean_chunk_content(content_lines[1..-1].join)
            if intro_pending && intro_text && section_content.length > 0
              section_content = [intro_text, section_content].join("\n\n")
              intro_pending = false
            elsif intro_pending && intro_text && section_content.empty?
              section_content = intro_text
              intro_pending = false
            end
            if section_content.length > 0
              chunks << { title: current_chunk_title, content: section_content }
            end
          end
        end

        # If we skipped the first h1 as a document title but no chunks were created,
        # go back and create a chunk for it
        if first_h1_skipped && chunks.empty? && first_h1_title
          # Get content after the first h1
          if lines.length > 1
            content_after_h1 = lines[1..-1].join
            cleaned_content = clean_chunk_content(content_after_h1)
            if cleaned_content.length > 0
              chunks << { title: first_h1_title, content: cleaned_content }
            end
          end
        end

        chunks
      end

      # Clean chunk content by removing heading lines.
      # Strips surrounding whitespace and drops any leading markdown headings.
      def clean_chunk_content(content)
        return "" if content.nil? || content.empty?

        content = content.strip
        return "" if content.empty?

        lines = content.lines

        # Remove leading heading lines
        while lines.first && lines.first =~ /^#+\s+/
          lines.shift
        end

        result = lines.join.strip
        result
      end

      # SPLIT_CONTENT

      # Split content by size without considering headings.
      # Chunks end at a sentence boundary when one exists within the last
      # ~20% of the window, and consecutive chunks overlap by up to @overlap
      # characters. Fragments of 50 chars or fewer are discarded.
      def split_by_size(content)
        chunks = []
        return chunks if content.nil? || content.empty?

        position = 0
        chunk_index = 0
        content_length = content.length

        while position < content_length
          # Calculate the end position for this chunk
          end_pos = [position + chunk_size, content_length].min

          # Look backward for a sentence boundary if we're not at the end
          if end_pos < content_length
            # Search backward for a sentence boundary (., !, ?)
            search_start = end_pos
            search_end = [position + chunk_size * 0.8, content_length].min # Don't look too far back

            (search_start - 1).downto(search_end.to_i) do |i|
              if content[i] =~ /[.!?]/
                # Found a sentence boundary
                # Make sure it's followed by whitespace or end of content
                if i + 1 >= content_length || content[i + 1] =~ /\s/
                  end_pos = i + 1
                  break
                end
              end
            end
          end

          # Extract the chunk
          chunk_content = content[position...end_pos].strip

          if chunk_content.length > 50
            chunks << {
              title: generate_chunk_title(chunk_content, chunk_index),
              content: chunk_content
            }
            chunk_index += 1
          end

          # Move position forward, keeping overlap
          next_position = end_pos
          if next_position < content_length
            # Add overlap
            overlap_chars = [overlap, chunk_content.length].min
            position = [next_position - overlap_chars, position + 50].max # Don't overlap too much
          else
            position = content_length
          end
        end

        chunks
      end

      # Split a large chunk that exceeded size limits
      def split_large_chunk(chunk)
        # Use the heading as title and split the content
        sub_chunks = split_by_size(chunk[:content])

        # Prepend the original heading to each sub-chunk
        sub_chunks.each_with_index do |sub_chunk, i|
          sub_chunk[:title] = if i == 0
                                chunk[:title]
                              else
                                "#{chunk[:title]} (Part #{i + 1})"
                              end
        end

        sub_chunks
      end

      # Generate a title for a chunk based on its content.
      # Preference order: first embedded heading, then first sentence,
      # then the first line, finally a numbered fallback. Results longer
      # than 100 chars are truncated with an ellipsis.
      def generate_chunk_title(content, index)
        return nil if content.nil? || content.empty?

        # Try to extract first heading (if present)
        lines = content.strip.lines
        lines.each do |line|
          if line =~ /^#+\s+(.+)$/
            return $1.strip
          end
        end

        # Use first sentence
        first_line = lines.first&.strip
        if first_line && first_line =~ /^(.*?[.!?])/
          sentence = $1.strip
          return sentence.length > 100 ? sentence[0...100] + '...' : sentence
        elsif first_line && !first_line.empty?
          # Just use first line (without period)
          return first_line.length > 100 ? first_line[0...100] + '...' : first_line
        end

        # Fallback to chunk number
        "Section #{index + 1}"
      end
    end
  end

  # Monkey-patch String to add reverse match method.
  # NOTE(review): this reopens the global String class; not referenced
  # elsewhere in this file — confirm it has callers before relying on it,
  # and consider a Refinement instead of a global patch.
  class ::String
    # Find last match of pattern in string
    # @param [Regexp] pattern Pattern to match
    # @param [Integer] start_pos Position to start searching from (default: end of string)
    # @return [MatchData] Last match or nil
    #
    # NOTE(review): this scans every suffix, so it is O(n^2) in string
    # length; the bare rescue silently converts any StandardError to nil.
    def rmatch(pattern, start_pos = nil)
      start_pos ||= length
      start_pos = length + start_pos if start_pos < 0
      start_pos = length if start_pos > length

      # Search backwards from start_pos
      (0..start_pos).reverse_each do |i|
        # Try to match at position i
        substring = self[i...length]
        match = pattern.match(substring)
        return match if match
      end
      nil
    rescue
      nil
    end
  end
end
|