deepsearch-rb 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/README.md +2 -8
- data/lib/deepsearch/configuration.rb +2 -1
- data/lib/deepsearch/engine/pipeline.rb +11 -14
- data/lib/deepsearch/engine/steps/data_aggregation/parsed_website.rb +4 -4
- data/lib/deepsearch/engine/steps/data_aggregation/result.rb +1 -1
- data/lib/deepsearch/engine/steps/parallel_search/process.rb +3 -3
- data/lib/deepsearch/engine/steps/parallel_search/result.rb +1 -1
- data/lib/deepsearch/engine/steps/parallel_search/search.rb +5 -6
- data/lib/deepsearch/engine/steps/prepare_subqueries/process.rb +28 -32
- data/lib/deepsearch/engine/steps/prepare_subqueries/result.rb +1 -1
- data/lib/deepsearch/engine/steps/rag/chunker.rb +1 -1
- data/lib/deepsearch/engine/steps/rag/process.rb +68 -36
- data/lib/deepsearch/engine/steps/rag/similarity.rb +4 -5
- data/lib/deepsearch/engine/steps/rag/values/chunk.rb +2 -2
- data/lib/deepsearch/engine/steps/rag/values/query.rb +1 -1
- data/lib/deepsearch/engine/steps/rag/values/result.rb +1 -1
- data/lib/deepsearch/engine/steps/summarization/process.rb +9 -8
- data/lib/deepsearch/engine/steps/summarization/result.rb +29 -0
- data/lib/deepsearch/logger.rb +1 -1
- data/lib/deepsearch/prompts_config.rb +1 -1
- data/lib/deepsearch/version.rb +1 -1
- data/lib/deepsearch.rb +1 -1
- data/lib/search_adapters/mock_adapter.rb +3 -2
- data/lib/search_adapters/serper_adapter.rb +1 -1
- data/lib/search_adapters/tavily_adapter.rb +5 -9
- metadata +2 -2
- data/lib/deepsearch/engine/steps/summarization/values/result.rb +0 -31
checksums.yaml
CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 51ef2b5c1780ff68752d82960a651443f6b0d5f7de38f9d0313acd17bdaaa144
+  data.tar.gz: 16af241a4eff83e5a5290832dc4ad14f4f8ed724f7c91abbfa060b4bf8c44ddb
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3172fe8597368c0ba863f6593f6c080a41b73de0890a3a02ca49f8e1dbcbf33bc17ab4c4a06a1b557dc324a949e9cabb2ee307752b8af8afd16872398c1dcaeb
+  data.tar.gz: eb185bd46ca110d9bbcb9134872dae8dab0c76db52841d4c85e38ec4a98f1c41033ab620930c4f9a22ec2238f2fa6f6d421b65bb7811492fccd8097c03cc099a
```
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
```diff
@@ -13,14 +13,8 @@ The only runtime dependencies are [ruby_llm](https://github.com/crmne/ruby_llm?t
 
 ---
 
-**NOTE**: You can implement your own chains in the way it works for you, BFS/DFS search on any topic.
-
-```
-Deepsearch.search(initial search) ->
-LLM(Generate additional queries) ->
-Async [for each additional query]{ Deepsearch.search(sub-query) } ->
-Aggregate()
-```
+**NOTE**: You can also implement your own chains in the way it works for you, BFS/DFS search on any topic.
+See the draft implementation of multi-chain flow in `examples/multi-step-chain/script.rb`
 
 ## Installation
 
```
data/lib/deepsearch/configuration.rb
CHANGED
```diff
@@ -47,7 +47,8 @@ module Deepsearch
 # end
 # end
 # Deepsearch.configure { |c| c.listener = MyListener.new
-attr_accessor :tavily_api_key, :serper_api_key, :search_adapter, :custom_search_adapter_class, :logger, :listener,
+attr_accessor :tavily_api_key, :serper_api_key, :search_adapter, :custom_search_adapter_class, :logger, :listener,
+:prompts
 attr_reader :ruby_llm
 
 def initialize
```
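The new `:prompts` accessor is what the later hunks read when building LLM prompts (`Deepsearch.configuration.prompts.subquery_prompt` and `.summarization_prompt` both appear further down in this diff). A minimal sketch of plugging in a custom prompt object — the `MyPrompts` class and its wording are hypothetical; only the two method names and their keyword arguments come from this diff:

```ruby
# Hypothetical prompt object; only the method names and keyword arguments
# below are taken from the hunks in this diff.
class MyPrompts
  def subquery_prompt(query:)
    "List up to 5 short web-search queries that would help answer: #{query}"
  end

  def summarization_prompt(query:, context_text:, sources_list:)
    "Answer '#{query}' using only:\n#{context_text}\n\nSources:\n#{sources_list}"
  end
end

Deepsearch.configure do |config|
  config.prompts = MyPrompts.new
end
```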
data/lib/deepsearch/engine/pipeline.rb
CHANGED
```diff
@@ -31,7 +31,6 @@ module Deepsearch
 # - original_query [String] The unmodified input query
 # - sub_queries [Array<String>] Generated subqueries (empty array on error)
 # - error [String, nil] Error message if processing failed
-
 
 parallel_search_options = {
 initial_query: query_preprocessing_result.cleaned_query,
@@ -42,7 +41,7 @@ module Deepsearch
 
 parallel_search_result = with_retry { Steps::ParallelSearch::Process.new(**parallel_search_options).execute }
 notify_listener(:step_completed, step: :parallel_search, result: parallel_search_result)
-# [parallel_search_result] Contains:
+# [parallel_search_result] Contains:
 # - websites [Array<ParallelSearch::Result>] Search results
 # - ParallelSearch::Result objects with:
 # - websites [Array<Hash#url>] Array of website URLs
@@ -51,18 +50,18 @@ module Deepsearch
 
 data_aggregation_result = with_retry do
 Steps::DataAggregation::Process.new(
-websites: parallel_search_result.websites
+websites: parallel_search_result.websites
 ).execute
 end
 notify_listener(:step_completed, step: :data_aggregation, result: data_aggregation_result)
-# [data_aggregation_result] Contains:
+# [data_aggregation_result] Contains:
 # - parsed_websites [Array<DataAggregation::Result>]
 # - DataAggregation::Result objects with:
 # - url [String] Website URL
 # - content [String] Parsed content from the website
 # - success [Boolean] Whether search succeeded
 # - error [String, nil] Error message if search failed
-
+
 
 rag_result = with_retry do
 Steps::Rag::Process.new(
 query: query_preprocessing_result.cleaned_query,
@@ -92,7 +91,7 @@ module Deepsearch
 
 def notify_listener(event, **payload)
 listener = Deepsearch.configuration.listener
-
+unless listener.respond_to?(:on_deepsearch_event)
 Deepsearch.configuration.logger.debug("Attached listener does not respond to on_deepsearch_event, skipping notification")
 return
 end
@@ -110,15 +109,13 @@ module Deepsearch
 result = block.call
 # Handle "soft" failures from steps that return a result object with a #failure? method
 raise "Operation failed: #{result.error}" if result.respond_to?(:failure?) && result.failure?
-
+
 result
-rescue => e
-
-
-
-
-raise e
-end
+rescue StandardError => e
+raise e unless (retries += 1) <= 1
+
+Deepsearch.configuration.logger.debug("Retrying after error: #{e.message}")
+retry
 end
 end
 end
```
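The rewritten rescue gives each step exactly one retry before the error propagates: `(retries += 1) <= 1` passes on the first failure and fails on the second. The `retries` counter is initialized above the hunk, so it is not visible here; a self-contained sketch of the same pattern, assuming that initialization sits outside the `begin` block (so `retry` cannot reset it):

```ruby
# Self-contained sketch of the retry shape above; `retries = 0` is assumed
# to live just before the begin block, outside the retried region.
def with_retry(&block)
  retries = 0
  begin
    result = block.call
    # Soft failures become exceptions so they go through the same retry path.
    raise "Operation failed: #{result.error}" if result.respond_to?(:failure?) && result.failure?

    result
  rescue StandardError => e
    raise e unless (retries += 1) <= 1 # first failure retries, second re-raises

    retry
  end
end
```

Because `retry` re-enters the `begin` block rather than the whole method, the counter survives across attempts.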
data/lib/deepsearch/engine/steps/data_aggregation/parsed_website.rb
CHANGED
```diff
@@ -42,7 +42,7 @@ module Deepsearch
 
 def fetch_content!
 uri = URI.parse(@url)
-
+
 unless %w[http https].include?(uri.scheme)
 @error = "Invalid URL scheme: #{uri.scheme}"
 return
@@ -65,7 +65,7 @@ module Deepsearch
 else
 @error = "HTTP #{response.code}"
 end
-rescue => e
+rescue StandardError => e
 @error = e.message
 end
 
@@ -111,7 +111,7 @@ module Deepsearch
 rescue StandardError
 # Fallback if Nokogiri fails. The raw_content is the problem. Sanitize it from binary to UTF-8.
 fallback_text = content.to_s.encode("UTF-8", "binary", invalid: :replace, undef: :replace, replace: "")
-fallback_text.gsub(
+fallback_text.gsub(%r{<script\b[^>]*>.*?</script>}mi, "").gsub(%r{<style\b[^>]*>.*?</style>}mi, "").gsub(
 /[[:space:]]+/, " "
 ).strip
 end
@@ -119,4 +119,4 @@ module Deepsearch
 end
 end
 end
-end
+end
```
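The fallback branch now drops `<script>` and `<style>` bodies before collapsing whitespace, so a Nokogiri failure no longer leaks JavaScript or CSS into the aggregated content. A quick trace with made-up input:

```ruby
# Made-up input traced through the strengthened fallback above.
content = "<p>Hello</p><script>alert(1)</script>\n<style>p { color: red }</style>  world"

fallback_text = content.to_s.encode("UTF-8", "binary", invalid: :replace, undef: :replace, replace: "")
fallback_text.gsub(%r{<script\b[^>]*>.*?</script>}mi, "").gsub(%r{<style\b[^>]*>.*?</style>}mi, "").gsub(
  /[[:space:]]+/, " "
).strip
# => "<p>Hello</p> world"
```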
data/lib/deepsearch/engine/steps/parallel_search/process.rb
CHANGED
```diff
@@ -14,9 +14,9 @@
 attr_reader :initial_query, :sub_queries, :search_adapter, :options
 
 def initialize(initial_query:,
-
-
-
+sub_queries:,
+search_adapter:,
+**options)
 @initial_query = initial_query
 @sub_queries = sub_queries
 @search_adapter = search_adapter
```
data/lib/deepsearch/engine/steps/parallel_search/search.rb
CHANGED
```diff
@@ -44,11 +44,11 @@ module Deepsearch
 
 Sync do |task|
 semaphore = Async::Semaphore.new(MAX_CONCURRENCY, parent: task)
-
+
 tasks = @all_queries.each_with_index.map do |query, index|
 # Add a small delay for subsequent tasks to avoid overwhelming the search api
 sleep(1) if index > 0
-
+
 semaphore.async do |sub_task|
 sub_task.annotate("query ##{index + 1}: #{query}")
 perform_search_with_retries(query, index + 1)
@@ -62,15 +62,14 @@ module Deepsearch
 def perform_search_with_retries(query, query_number)
 (MAX_RETRIES + 1).times do |attempt|
 @logger.debug("Task #{query_number}: Searching '#{query}' (Attempt #{attempt + 1})")
-
+
 results = @search_adapter.search(query, @search_options)
 extracted = extract_results(results)
 @logger.debug("✓ Task #{query_number} completed with #{extracted.size} results for '#{query}'")
 return extracted
-
 rescue StandardError => e
 @logger.debug("✗ Task #{query_number} error for '#{query}': #{e.message}")
-
+
 break if attempt >= MAX_RETRIES
 
 sleep_duration = (INITIAL_BACKOFF * (2**attempt)) + rand(0.1..0.5)
@@ -92,4 +91,4 @@ module Deepsearch
 end
 end
 end
-end
+end
```
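The retry path sleeps on an exponential schedule with random jitter. Assuming `INITIAL_BACKOFF` is 1 second (the constant's value is defined outside these hunks), the delays work out as follows:

```ruby
# Backoff schedule from the hunk above; INITIAL_BACKOFF = 1 is an assumption,
# since the constant's actual value is not part of this diff.
INITIAL_BACKOFF = 1

3.times do |attempt|
  sleep_duration = (INITIAL_BACKOFF * (2**attempt)) + rand(0.1..0.5)
  puts format("attempt %d: sleep %.2fs", attempt + 1, sleep_duration)
end
# attempt 1: ~1.1-1.5s, attempt 2: ~2.1-2.5s, attempt 3: ~4.1-4.5s
```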
data/lib/deepsearch/engine/steps/prepare_subqueries/process.rb
CHANGED
```diff
@@ -5,7 +5,7 @@ require_relative 'result'
 module Deepsearch
 class Engine
 module Steps
-module PrepareSubqueries
+module PrepareSubqueries
 class Process
 def initialize(original_query)
 @original_query = original_query
@@ -26,15 +26,15 @@ module Deepsearch
 private
 
 def validate_input
-
-
-
+return if @original_query && !@original_query.strip.empty?
+
+raise StandardError, "Original query is required for preprocessing"
 end
 
 def process_query
 cleaned_query = clean_query(@original_query)
 subqueries = generate_subqueries(cleaned_query)
-
+
 PrepareSubqueries::Result.new(
 cleaned_query: cleaned_query,
 original_query: @original_query,
@@ -47,38 +47,34 @@ module Deepsearch
 end
 
 def generate_subqueries(query)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-[]
-end
+Deepsearch.configuration.logger.debug("Attempting to generate subqueries using LLM...")
+chat = RubyLLM.chat
+
+prompt = Deepsearch.configuration.prompts.subquery_prompt(query: query)
+Deepsearch.configuration.logger.debug("Sending prompt to LLM...")
+response = chat.ask(prompt)
+
+Deepsearch.configuration.logger.debug("Received response from LLM")
+subqueries = parse_subqueries(response.content)
+Deepsearch.configuration.logger.debug("Generated #{subqueries.size} subqueries")
+subqueries
+rescue StandardError => e
+Deepsearch.configuration.logger.debug("Error generating subqueries: #{e.message}")
+Deepsearch.configuration.logger.debug("Error class: #{e.class}")
+Deepsearch.configuration.logger.debug("Backtrace: #{e.backtrace.first(3).join('\n')}")
+[]
 end
 
 def parse_subqueries(response_content)
 return [] unless response_content
 
-
-
-
-
-
-
-
-
-subqueries
+response_content.split("\n")
+.map(&:strip)
+.reject(&:empty?)
+.map { |line| line.gsub(/^\d+\.\s*|^[-*]\s*/, '') }
+.map { |query| query.gsub(/^["']|["']$/, '') }
+.reject(&:empty?)
+.first(5)
 end
 end
 end
```
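The new `parse_subqueries` is a plain text pipeline: split the LLM response into lines, strip list markers and surrounding quotes, and cap the result at five queries. Tracing it with an invented response:

```ruby
# Invented LLM response, traced through the parse_subqueries chain above.
response_content = <<~TEXT
  1. "ruby async gem tutorial"
  2. ruby fibers vs threads

  - 'structured concurrency in ruby'
TEXT

response_content.split("\n")
                .map(&:strip)
                .reject(&:empty?)
                .map { |line| line.gsub(/^\d+\.\s*|^[-*]\s*/, '') }
                .map { |query| query.gsub(/^["']|["']$/, '') }
                .reject(&:empty?)
                .first(5)
# => ["ruby async gem tutorial", "ruby fibers vs threads", "structured concurrency in ruby"]
```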
data/lib/deepsearch/engine/steps/rag/process.rb
CHANGED
```diff
@@ -1,5 +1,8 @@
 # frozen_string_literal: true
 
+require 'async'
+require 'async/semaphore'
+
 require_relative 'values/chunk'
 require_relative 'values/query'
 require_relative 'values/result'
@@ -13,63 +16,92 @@ module Deepsearch
 # Implements the core Retrieval-Augmented Generation (RAG) logic.
 # It takes a query and a set of parsed websites, then:
 # 1. Chunks the website content into smaller pieces.
-# 2. Generates embeddings for all text chunks in batches.
+# 2. Generates embeddings for all text chunks concurrently in batches.
 # 3. Uses a similarity search to find the chunks most relevant to the query.
 # 4. Returns a result containing the relevant chunks.
 class Process
 CHUNK_BATCH_SIZE = 100
 MAX_TOTAL_CHUNKS = 500
 MAX_CHUNKS_PER_WEBSITE = 15
+MAX_EMBEDDING_CONCURRENCY = 3
 
 def initialize(query:, parsed_websites:)
 @query = Values::Query.new(text: query)
 @documents = parsed_websites.map do |website|
 { url: website.url, content: website.content }
 end
+@logger = Deepsearch.configuration.logger
 end
 
 def execute
-
-
-
-
-
-
-
-
-doc_chunks = doc_chunks.first(MAX_CHUNKS_PER_WEBSITE)
-end
-doc_chunks.each { |chunk| chunk.document_url = doc[:url] }
-chunks.concat(doc_chunks)
+chunker = Chunker.new
+all_chunks = @documents.each_with_object([]) do |doc, chunks|
+next if doc[:content].to_s.strip.empty?
+
+doc_chunks = chunker.chunk(doc[:content])
+if doc_chunks.count > MAX_CHUNKS_PER_WEBSITE
+@logger.debug("Truncating chunks for #{doc[:url]} from #{doc_chunks.count} to #{MAX_CHUNKS_PER_WEBSITE}")
+doc_chunks = doc_chunks.first(MAX_CHUNKS_PER_WEBSITE)
 end
+doc_chunks.each { |chunk| chunk.document_url = doc[:url] }
+chunks.concat(doc_chunks)
+end
 
-
+@logger.debug("Chunked #{@documents.count} documents into #{all_chunks.count} chunks")
 
-
-
-
-
+if all_chunks.count > MAX_TOTAL_CHUNKS
+@logger.debug("Chunk count (#{all_chunks.count}) exceeds limit of #{MAX_TOTAL_CHUNKS}. Truncating.")
+all_chunks = all_chunks.first(MAX_TOTAL_CHUNKS)
+end
+
+generate_embeddings_in_parallel(all_chunks)
+
+@logger.debug('Finished embedding generation, initiating similarity match..')
+chunks_with_embeddings = all_chunks.select(&:embedding)
+relevant_chunks = Similarity.new.find_relevant(@query, chunks_with_embeddings)
+@logger.debug("Found #{relevant_chunks.count} relevant chunks for query: '#{@query.text}'")
 
-
-
-
-
+Values::Result.new(
+query: @query,
+relevant_chunks: relevant_chunks
+)
+rescue StandardError => e
+Values::Result.new(
+query: @query,
+relevant_chunks: [],
+error: e.message
+)
+end
+
+private
+
+def generate_embeddings_in_parallel(chunks)
+return if chunks.empty?
+
+num_batches = (chunks.count.to_f / CHUNK_BATCH_SIZE).ceil
+@logger.debug("Starting parallel embedding generation for #{num_batches} batches with max concurrency of #{MAX_EMBEDDING_CONCURRENCY}")
+
+Sync do |task|
+semaphore = Async::Semaphore.new(MAX_EMBEDDING_CONCURRENCY, parent: task)
+
+tasks = chunks.each_slice(CHUNK_BATCH_SIZE).with_index.map do |batch, index|
+semaphore.async do |sub_task|
+task_number = index + 1
+sub_task.annotate("Embedding batch #{task_number}/#{num_batches}")
+@logger.debug("Task #{task_number}: Generating embeddings for batch of #{batch.size} chunks")
+
+begin
+texts = batch.map(&:text)
+embeddings = RubyLLM.embed(texts).vectors
+batch.each_with_index { |chunk, i| chunk.embedding = embeddings[i] }
+@logger.debug("✓ Task #{task_number} completed.")
+rescue StandardError => e
+@logger.error("✗ Task #{task_number} error: #{e.message}")
+end
+end
 end
 
-
-relevant_chunks = Similarity.new.find_relevant(@query, all_chunks)
-Deepsearch.configuration.logger.debug("Found #{relevant_chunks.count} relevant chunks for query: '#{@query.text}'")
-
-Values::Result.new(
-query: @query,
-relevant_chunks: relevant_chunks
-)
-rescue StandardError => e
-Values::Result.new(
-query: @query,
-relevant_chunks: [],
-error: e.message
-)
+tasks.map(&:wait)
 end
 end
 end
```
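The embedding step borrows the same `Sync` + `Async::Semaphore` pattern the parallel search already uses: queue every batch at once, but let at most `MAX_EMBEDDING_CONCURRENCY` run concurrently, then wait for all of them. A stripped-down sketch of that shape, with `sleep` standing in for the `RubyLLM.embed` call:

```ruby
require 'async'
require 'async/semaphore'

# Stripped-down version of the concurrency shape above: ten queued jobs,
# at most three in flight, all awaited before Sync returns.
Sync do |task|
  semaphore = Async::Semaphore.new(3, parent: task)

  tasks = (1..10).map do |i|
    semaphore.async do
      sleep(0.1) # stand-in for one RubyLLM.embed batch
      puts "batch #{i} embedded"
    end
  end

  tasks.map(&:wait)
end
```

Rescuing inside each task, as the diff does, means one failed batch leaves its chunks without embeddings instead of aborting the whole run; the later `select(&:embedding)` then filters those chunks out.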
data/lib/deepsearch/engine/steps/rag/similarity.rb
CHANGED
```diff
@@ -24,10 +24,8 @@ module Deepsearch
 best_score = top_candidates.first.first
 cutoff_score = best_score * threshold
 
-
-
-
-relevant_chunks
+top_candidates.select { |score, _| score >= cutoff_score }
+.map { |_, index| chunks[index] }
 end
 
 private
@@ -51,10 +49,11 @@ module Deepsearch
 magnitude_b = Math.sqrt(vec_b.sum { |v| v**2 })
 
 return 0.0 if magnitude_a.zero? || magnitude_b.zero?
+
 dot_product / (magnitude_a * magnitude_b)
 end
 end
 end
 end
-end
+end
```
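The selection is now relative: anything scoring at least `threshold` times the best match survives, rather than being compared to an absolute cutoff. A worked example, assuming `threshold` is 0.8 (its actual default lives outside this hunk):

```ruby
# Worked example of the relative cutoff above; threshold = 0.8 is assumed.
threshold = 0.8
chunks = %w[a b c d]
top_candidates = [[0.92, 0], [0.80, 3], [0.71, 1], [0.40, 2]] # [score, index] pairs

best_score = top_candidates.first.first # 0.92
cutoff_score = best_score * threshold   # 0.736

top_candidates.select { |score, _| score >= cutoff_score }
              .map { |_, index| chunks[index] }
# => ["a", "d"]
```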
data/lib/deepsearch/engine/steps/rag/values/chunk.rb
CHANGED
```diff
@@ -9,7 +9,7 @@
 # This is the fundamental unit of data used in the RAG process.
 class Chunk
 attr_accessor :text, :embedding, :document_url
-
+
 def initialize(text:, embedding: nil, document_url: nil)
 @text = text
 @embedding = embedding
@@ -20,4 +20,4 @@ module Deepsearch
 end
 end
 end
-end
+end
```
data/lib/deepsearch/engine/steps/summarization/process.rb
CHANGED
```diff
@@ -1,6 +1,6 @@
 # frozen_string_literal: true
 
-require_relative '
+require_relative 'result'
 
 module Deepsearch
 class Engine
@@ -18,17 +18,17 @@ module Deepsearch
 end
 
 def execute
-return
+return Result.new(summary: "No relevant content found to summarize.") if relevant_chunks.empty?
 
 prompt = build_summary_prompt
 Deepsearch.configuration.logger.debug("Summarizing content with LLM...")
 response = RubyLLM.chat.ask(prompt)
 Deepsearch.configuration.logger.debug("Summarization complete.")
 
-
+Result.new(summary: response.content)
 rescue StandardError => e
 Deepsearch.configuration.logger.debug("Error during summarization: #{e.message}")
-
+Result.new(summary: nil, error: e.message)
 end
 
 private
@@ -36,18 +36,19 @@ module Deepsearch
 def build_summary_prompt
 chunks_by_url = relevant_chunks.group_by(&:document_url)
 citation_map = chunks_by_url.keys.each_with_index.to_h { |url, i| [url, i + 1] }
-
+
 context_text = chunks_by_url.map do |url, chunks|
 citation_number = citation_map[url]
 chunk_contents = chunks.map(&:text).join("\n\n")
 "Source [#{citation_number}]:\n#{chunk_contents}"
 end.join("\n\n---\n\n")
-
+
 sources_list = citation_map.map { |url, number| "[#{number}]: #{url}" }.join("\n")
-Deepsearch.configuration.prompts.summarization_prompt(query: @query.text, context_text: context_text,
+Deepsearch.configuration.prompts.summarization_prompt(query: @query.text, context_text: context_text,
+sources_list: sources_list)
 end
 end
 end
 end
 end
-end
+end
```
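`build_summary_prompt` numbers each distinct source URL once and reuses that number for every chunk from the same page, so the prompt's inline citations line up with the sources list. With illustrative data:

```ruby
# Illustrative data run through the grouping logic above.
Chunk = Struct.new(:text, :document_url, keyword_init: true)
relevant_chunks = [
  Chunk.new(text: "Ruby is dynamic.", document_url: "https://a.example"),
  Chunk.new(text: "Rails is a web framework.", document_url: "https://b.example"),
  Chunk.new(text: "Ruby favors simplicity.", document_url: "https://a.example")
]

chunks_by_url = relevant_chunks.group_by(&:document_url)
citation_map  = chunks_by_url.keys.each_with_index.to_h { |url, i| [url, i + 1] }
citation_map.map { |url, number| "[#{number}]: #{url}" }.join("\n")
# => "[1]: https://a.example\n[2]: https://b.example"
```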
data/lib/deepsearch/engine/steps/summarization/result.rb
ADDED
```diff
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+module Deepsearch
+class Engine
+module Steps
+module Summarization
+# Represents the result of the summarization step.
+# It holds the final, synthesized summary and any potential error message.
+class Result
+attr_reader :summary, :error, :success
+
+def initialize(summary: nil, error: nil)
+@summary = summary
+@success = error.nil?
+@error = error
+end
+
+def success?
+@success
+end
+
+def failure?
+!success?
+end
+end
+end
+end
+end
+end
```
data/lib/deepsearch/logger.rb
CHANGED
data/lib/deepsearch/version.rb
CHANGED
data/lib/deepsearch.rb
CHANGED
data/lib/search_adapters/mock_adapter.rb
CHANGED
```diff
@@ -9,7 +9,7 @@ module Deepsearch
 def initialize(api_key = nil); end
 
 def search(query, options = {})
-
+mock_results(query, options)
 end
 
 private
@@ -63,7 +63,8 @@ module Deepsearch
 }
 
 if include_answer
-response["answer"] =
+response["answer"] =
+"Ruby is a dynamic, open-source programming language with a focus on simplicity and productivity. It was created by Yukihiro Matsumoto in the mid-1990s and follows the principle that everything is an object. Ruby is particularly popular for web development, especially with the Ruby on Rails framework, but it's also used for automation, data processing, and various other applications."
 end
 
 response
```
data/lib/search_adapters/tavily_adapter.rb
CHANGED
```diff
@@ -40,13 +40,11 @@
 private
 
 def validate_api_key!
-if @api_key.nil? || @api_key.strip.empty?
-raise TavilyError, "API key is required"
-end
+raise TavilyError, "API key is required" if @api_key.nil? || @api_key.strip.empty?
 
-
-
-
+return if @api_key.start_with?('tvly-')
+
+raise TavilyError, "Invalid API key format. Expected format: tvly-YOUR_API_KEY"
 end
 
 def build_payload(query, options)
@@ -78,9 +76,7 @@
 
 response = http.request(request)
 
-unless response.is_a?(Net::HTTPSuccess)
-handle_error_response(response)
-end
+handle_error_response(response) unless response.is_a?(Net::HTTPSuccess)
 
 response
 end
```
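The flattened guard clauses read top to bottom: presence first, then the `tvly-` prefix. A standalone sketch, re-declaring `TavilyError` only so the snippet runs on its own:

```ruby
# Standalone sketch of the validation above; TavilyError is redeclared here
# only to make the snippet self-contained.
class TavilyError < StandardError; end

def validate_api_key!(api_key)
  raise TavilyError, "API key is required" if api_key.nil? || api_key.strip.empty?

  return if api_key.start_with?('tvly-')

  raise TavilyError, "Invalid API key format. Expected format: tvly-YOUR_API_KEY"
end

validate_api_key!("tvly-abc123") # passes
validate_api_key!("abc123")      # raises TavilyError (invalid format)
```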
metadata
CHANGED
```diff
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: deepsearch-rb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Alexander Shagov
@@ -150,7 +150,7 @@ files:
 - lib/deepsearch/engine/steps/rag/values/query.rb
 - lib/deepsearch/engine/steps/rag/values/result.rb
 - lib/deepsearch/engine/steps/summarization/process.rb
-- lib/deepsearch/engine/steps/summarization/values/result.rb
+- lib/deepsearch/engine/steps/summarization/result.rb
 - lib/deepsearch/logger.rb
 - lib/deepsearch/prompts_config.rb
 - lib/deepsearch/version.rb
```
data/lib/deepsearch/engine/steps/summarization/values/result.rb
DELETED
```diff
@@ -1,31 +0,0 @@
-# frozen_string_literal: true
-
-module Deepsearch
-class Engine
-module Steps
-module Summarization
-module Values
-# Represents the result of the summarization step.
-# It holds the final, synthesized summary and any potential error message.
-class Result
-attr_reader :summary, :error, :success
-
-def initialize(summary: nil, error: nil)
-@summary = summary
-@success = error.nil?
-@error = error
-end
-
-def success?
-@success
-end
-
-def failure?
-!success?
-end
-end
-end
-end
-end
-end
-end
```