deepsearch-rb 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +15 -0
  3. data/README.md +2 -8
  4. data/lib/deepsearch/configuration.rb +34 -19
  5. data/lib/deepsearch/engine/pipeline.rb +11 -14
  6. data/lib/deepsearch/engine/steps/data_aggregation/parsed_website.rb +4 -4
  7. data/lib/deepsearch/engine/steps/data_aggregation/result.rb +1 -1
  8. data/lib/deepsearch/engine/steps/parallel_search/process.rb +3 -3
  9. data/lib/deepsearch/engine/steps/parallel_search/result.rb +1 -1
  10. data/lib/deepsearch/engine/steps/parallel_search/search.rb +5 -6
  11. data/lib/deepsearch/engine/steps/prepare_subqueries/process.rb +28 -32
  12. data/lib/deepsearch/engine/steps/prepare_subqueries/result.rb +1 -1
  13. data/lib/deepsearch/engine/steps/rag/chunker.rb +1 -1
  14. data/lib/deepsearch/engine/steps/rag/process.rb +68 -36
  15. data/lib/deepsearch/engine/steps/rag/similarity.rb +4 -5
  16. data/lib/deepsearch/engine/steps/rag/values/chunk.rb +2 -2
  17. data/lib/deepsearch/engine/steps/rag/values/query.rb +1 -1
  18. data/lib/deepsearch/engine/steps/rag/values/result.rb +1 -1
  19. data/lib/deepsearch/engine/steps/summarization/process.rb +9 -8
  20. data/lib/deepsearch/engine/steps/summarization/result.rb +29 -0
  21. data/lib/deepsearch/logger.rb +1 -1
  22. data/lib/deepsearch/prompts_config.rb +1 -1
  23. data/lib/deepsearch/version.rb +1 -1
  24. data/lib/deepsearch.rb +1 -1
  25. data/lib/search_adapters/mock_adapter.rb +3 -2
  26. data/lib/search_adapters/serper_adapter.rb +1 -1
  27. data/lib/search_adapters/tavily_adapter.rb +5 -9
  28. metadata +6 -6
  29. data/lib/deepsearch/engine/steps/summarization/values/result.rb +0 -31
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 435d04eb1b8c5a7d2d8e86cd17b863766559f82ffe4f194597e9b05a087b5ae0
-  data.tar.gz: 1f394f33996d8f85b17e17709a3178debb8ccad84d8268bd7590b532ac0d9fa7
+  metadata.gz: 91d4780990957a409d46cf941454353f54645dcdda52e7dcd5a89e393dd58a27
+  data.tar.gz: b1761828c3b3c263ed9970cfc7ed44bb99883f8f08bce1a3a5309c3135f0bc5a
 SHA512:
-  metadata.gz: 30634e66e50d377e755fe82354255d8b1e265578aca7e8f4c4833e8959ff33277a9a120c56b96c93efb13ea6e890cec8ddbf06d5952c0ef384efaceb3f8f8839
-  data.tar.gz: fca34ac34b09e4a8c5558c8114853dfec7c982fdf4bb9f630068c8ebf60bc398b1f711d5443b4e30aedc04c78539de24dcbf8a33761a719eb9815d8b38e37499
+  metadata.gz: 5750aeb0a5c696adde1466eaa8f103f15774bd57c92ca2c93bcd933b47d2238b2e9a975eea2a20eadfa6ae517e625e8fbe0c1d0f130020ed532c264426b32209
+  data.tar.gz: 472b52a34a5a07209693ff92f40d03ffcd7b90d2d01df31d608b3947c3ac39a29e04745f2de7d6ef051104c27762a072f7e5ed8a8b17e5ef17c316553dc61276
data/CHANGELOG.md CHANGED
@@ -2,6 +2,21 @@
 
 ## [Released]
 
+## [0.1.2] - 2025-09-17
+
+### Fixed
+- Fixed compatibility with a new version of RubyLLM
+
+## [Released]
+
+## [0.1.1] - 2025-07-20
+
+### Added
+- Calls to embedding API are now executed async
+- Added multi-step-chain example
+
+## [Released]
+
 ## [0.1.0] - 2025-07-12
 
 ### Added
data/README.md CHANGED
@@ -13,14 +13,8 @@ The only runtime dependencies are [ruby_llm](https://github.com/crmne/ruby_llm?t
 
 ---
 
-**NOTE**: You can implement your own chains in the way it works for you, BFS/DFS search on any topic. A draft code might look like:
-
-```
-Deepsearch.search(initial search) ->
-  LLM(Generate additional queries) ->
-  Async [for each additional query]{ Deepsearch.search(sub-query) } ->
-  Aggregate()
-```
+**NOTE**: You can also implement your own chains in the way it works for you, BFS/DFS search on any topic.
+See the draft implementation of multi-chain flow in `examples/multi-step-chain/script.rb`
 
 ## Installation
 
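A minimal Ruby sketch of the chain the removed README snippet described (fan out LLM-generated sub-queries, then aggregate) could look as follows. This is illustrative only: it assumes `Deepsearch.search` returns a result responding to `#summary` (as the summarization `Result` later in this diff does), and the prompt text is made up; the maintained version lives in `examples/multi-step-chain/script.rb`.

```ruby
require "deepsearch"
require "async"

# Hypothetical sketch of the BFS-style chain the README describes.
initial = Deepsearch.search("ruby 3.4 fiber scheduler")

# Ask the LLM for follow-up queries (RubyLLM.chat / #ask / #content are the
# calls this gem uses elsewhere in this diff).
followups = RubyLLM.chat
                   .ask("Suggest 3 short follow-up search queries for: ruby 3.4 fiber scheduler")
                   .content.split("\n").map(&:strip).reject(&:empty?)

# Fan out the sub-searches concurrently, then aggregate the summaries.
summaries = Sync do
  followups.map { |q| Async { Deepsearch.search(q).summary } }.map(&:wait)
end

puts ([initial.summary] + summaries).join("\n\n---\n\n")
```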
data/lib/deepsearch/configuration.rb CHANGED
@@ -13,21 +13,38 @@ module Deepsearch
   #   config.ruby_llm.default_model = "gpt-4o-mini"
   #   config.ruby_llm.request_timeout = 90
   # end
+  #
   class RubyLLMConfig
-    SUPPORTED_ATTRIBUTES = %i[
-      openai_api_key openai_organization_id openai_project_id
-      anthropic_api_key gemini_api_key deepseek_api_key openrouter_api_key
-      ollama_api_base bedrock_api_key bedrock_secret_key bedrock_region
-      bedrock_session_token openai_api_base default_model
-      default_embedding_model default_image_model request_timeout max_retries
-      retry_interval retry_backoff_factor retry_interval_randomness
-      http_proxy logger log_file log_level log_assume_model_exists
-    ].freeze
+    def self.supported_attributes
+      @supported_attributes ||= discover_attributes
+    end
+
+    def self.reset_supported_attributes!
+      @supported_attributes = nil
+    end
+
+    private
+
+    def self.discover_attributes
+      if defined?(RubyLLM::Configuration)
+        config_instance = RubyLLM::Configuration.new
+      else
+        require "ruby_llm"
+        config_instance = RubyLLM::Configuration.new
+      end
+
+      # Getting all setter methods (ending with =) and remove the = suffix
+      config_instance.public_methods(false)
+                     .select { |method| method.to_s.end_with?('=') }
+                     .map { |method| method.to_s.chomp('=').to_sym }
+                     .reject { |attr| [:configuration].include?(attr) }
+    end
+
+    public
 
-    attr_accessor(*SUPPORTED_ATTRIBUTES)
+    attr_accessor(*supported_attributes)
 
     def initialize
-      # Set some sensible defaults for Deepsearch's use case
       @default_model = "gpt-4o-mini"
       @default_embedding_model = "text-embedding-3-small"
       @request_timeout = 30 # seconds
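The hard-coded `SUPPORTED_ATTRIBUTES` list is replaced with reflection over `RubyLLM::Configuration`'s setter methods, so the gem picks up whatever settings the installed RubyLLM version actually supports instead of breaking when that list drifts. A standalone sketch of the same technique (the class name is illustrative):

```ruby
# Sketch of the discovery technique above: public_methods(false) lists methods
# defined directly on the object's class, so setters generated by attr_accessor
# show up as :name= symbols, which we strip back to attribute names.
class ExampleConfig
  attr_accessor :api_key, :timeout
end

setters = ExampleConfig.new.public_methods(false)
                       .select { |m| m.to_s.end_with?("=") }
                       .map { |m| m.to_s.chomp("=").to_sym }

p setters # => [:api_key, :timeout] (order may vary)
```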
@@ -35,7 +52,6 @@ module Deepsearch
     end
   end
 
-  # Configuration class for managing gem settings
   class Configuration
     # @!attribute listener
     #   An object that can listen to events from the Deepsearch pipeline.
@@ -47,7 +63,8 @@ module Deepsearch
     #     end
     #   end
     #   Deepsearch.configure { |c| c.listener = MyListener.new
-    attr_accessor :tavily_api_key, :serper_api_key, :search_adapter, :custom_search_adapter_class, :logger, :listener, :prompts
+    attr_accessor :tavily_api_key, :serper_api_key, :search_adapter, :custom_search_adapter_class, :logger, :listener,
+                  :prompts
     attr_reader :ruby_llm
 
     def initialize
@@ -61,7 +78,6 @@ module Deepsearch
       @prompts = PromptsConfig.new
     end
 
-    # Reset configuration to default values
    def reset!
       @tavily_api_key = nil
       @serper_api_key = nil
@@ -72,14 +88,13 @@ module Deepsearch
       @prompts = PromptsConfig.new
     end
 
-    # Configure RubyLLM with current settings from the `ruby_llm` config object.
+    # Configure RubyLLM with current settings from the `RubyLLMConfig` config object.
     def configure_llm!
-      require "ruby_llm"
+      require "ruby_llm" unless defined?(RubyLLM)
 
       RubyLLM.configure do |config|
-        RubyLLMConfig::SUPPORTED_ATTRIBUTES.each do |attr|
-          value = @ruby_llm.public_send(attr)
-          # Only set the value if it's not nil to avoid overriding RubyLLM's internal defaults.
+        RubyLLMConfig.supported_attributes.each do |attr|
+          value = @ruby_llm.public_send(attr)
           config.public_send("#{attr}=", value) unless value.nil?
         end
       end
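In use, the configuration surface shown above looks roughly like this. `LoggingListener` is illustrative, and the exact arguments handed to `on_deepsearch_event` are an assumption based on the pipeline's `notify_listener(event, **payload)` call:

```ruby
require "deepsearch"

# Illustrative listener; anything responding to #on_deepsearch_event works.
class LoggingListener
  def on_deepsearch_event(event, **payload)
    puts "deepsearch event: #{event} (#{payload.keys.join(', ')})"
  end
end

Deepsearch.configure do |config|
  config.tavily_api_key = ENV["TAVILY_API_KEY"]
  config.listener = LoggingListener.new
  config.ruby_llm.default_model = "gpt-4o-mini" # forwarded to RubyLLM by configure_llm!
end
```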
data/lib/deepsearch/engine/pipeline.rb CHANGED
@@ -31,7 +31,6 @@ module Deepsearch
         # - original_query [String] The unmodified input query
         # - sub_queries [Array<String>] Generated subqueries (empty array on error)
         # - error [String, nil] Error message if processing failed
-
 
         parallel_search_options = {
           initial_query: query_preprocessing_result.cleaned_query,
@@ -42,7 +41,7 @@ module Deepsearch
 
         parallel_search_result = with_retry { Steps::ParallelSearch::Process.new(**parallel_search_options).execute }
         notify_listener(:step_completed, step: :parallel_search, result: parallel_search_result)
-        # [parallel_search_result] Contains:
+        # [parallel_search_result] Contains:
         # - websites [Array<ParallelSearch::Result>] Search results
         # - ParallelSearch::Result objects with:
         #   - websites [Array<Hash#url>] Array of website URLs
@@ -51,18 +50,18 @@ module Deepsearch
 
         data_aggregation_result = with_retry do
           Steps::DataAggregation::Process.new(
-            websites: parallel_search_result.websites,
+            websites: parallel_search_result.websites
           ).execute
         end
         notify_listener(:step_completed, step: :data_aggregation, result: data_aggregation_result)
-        # [data_aggregation_result] Contains:
+        # [data_aggregation_result] Contains:
         # - parsed_websites [Array<DataAggregation::Result>]
         # - DataAggregation::Result objects with:
         #   - url [String] Website URL
         #   - content [String] Parsed content from the website
         #   - success [Boolean] Whether search succeeded
         #   - error [String, nil] Error message if search failed
-
+
         rag_result = with_retry do
           Steps::Rag::Process.new(
             query: query_preprocessing_result.cleaned_query,
@@ -92,7 +91,7 @@ module Deepsearch
 
       def notify_listener(event, **payload)
         listener = Deepsearch.configuration.listener
-        if !listener.respond_to?(:on_deepsearch_event)
+        unless listener.respond_to?(:on_deepsearch_event)
           Deepsearch.configuration.logger.debug("Attached listener does not respond to on_deepsearch_event, skipping notification")
           return
         end
@@ -110,15 +109,13 @@ module Deepsearch
          result = block.call
          # Handle "soft" failures from steps that return a result object with a #failure? method
          raise "Operation failed: #{result.error}" if result.respond_to?(:failure?) && result.failure?
-
+
          result
-        rescue => e
-          if (retries += 1) <= 1
-            Deepsearch.configuration.logger.debug("Retrying after error: #{e.message}")
-            retry
-          else
-            raise e
-          end
+        rescue StandardError => e
+          raise e unless (retries += 1) <= 1
+
+          Deepsearch.configuration.logger.debug("Retrying after error: #{e.message}")
+          retry
        end
      end
    end
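The reworked `with_retry` is a retry-once pattern with a guard clause: re-raise unless the single allowed retry is still available, otherwise log and `retry`. Distilled into a self-contained sketch (names are illustrative):

```ruby
# Retry-once helper mirroring the flow of with_retry above.
def with_one_retry
  retries = 0
  begin
    yield
  rescue StandardError => e
    raise e unless (retries += 1) <= 1

    puts "Retrying after error: #{e.message}"
    retry
  end
end

attempts = 0
result = with_one_retry do
  attempts += 1
  raise "transient failure" if attempts == 1

  "ok on attempt #{attempts}"
end
puts result # => "ok on attempt 2"
```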
data/lib/deepsearch/engine/steps/data_aggregation/parsed_website.rb CHANGED
@@ -42,7 +42,7 @@ module Deepsearch
 
          def fetch_content!
            uri = URI.parse(@url)
-
+
            unless %w[http https].include?(uri.scheme)
              @error = "Invalid URL scheme: #{uri.scheme}"
              return
@@ -65,7 +65,7 @@ module Deepsearch
            else
              @error = "HTTP #{response.code}"
            end
-          rescue => e
+          rescue StandardError => e
            @error = e.message
          end
 
@@ -111,7 +111,7 @@ module Deepsearch
          rescue StandardError
            # Fallback if Nokogiri fails. The raw_content is the problem. Sanitize it from binary to UTF-8.
            fallback_text = content.to_s.encode("UTF-8", "binary", invalid: :replace, undef: :replace, replace: "")
-            fallback_text.gsub(/<script\b[^>]*>.*?<\/script>/mi, "").gsub(/<style\b[^>]*>.*?<\/style>/mi, "").gsub(
+            fallback_text.gsub(%r{<script\b[^>]*>.*?</script>}mi, "").gsub(%r{<style\b[^>]*>.*?</style>}mi, "").gsub(
              /[[:space:]]+/, " "
            ).strip
          end
@@ -119,4 +119,4 @@ module Deepsearch
       end
     end
   end
-end
+end
data/lib/deepsearch/engine/steps/data_aggregation/result.rb CHANGED
@@ -25,4 +25,4 @@ module Deepsearch
       end
     end
   end
-end
+end
data/lib/deepsearch/engine/steps/parallel_search/process.rb CHANGED
@@ -14,9 +14,9 @@ module Deepsearch
          attr_reader :initial_query, :sub_queries, :search_adapter, :options
 
          def initialize(initial_query:,
-                          sub_queries:,
-                          search_adapter:,
-                          **options)
+                         sub_queries:,
+                         search_adapter:,
+                         **options)
            @initial_query = initial_query
            @sub_queries = sub_queries
            @search_adapter = search_adapter
data/lib/deepsearch/engine/steps/parallel_search/result.rb CHANGED
@@ -25,4 +25,4 @@ module Deepsearch
       end
     end
   end
-end
+end
data/lib/deepsearch/engine/steps/parallel_search/search.rb CHANGED
@@ -44,11 +44,11 @@ module Deepsearch
 
          Sync do |task|
            semaphore = Async::Semaphore.new(MAX_CONCURRENCY, parent: task)
-
+
            tasks = @all_queries.each_with_index.map do |query, index|
              # Add a small delay for subsequent tasks to avoid overwhelming the search api
              sleep(1) if index > 0
-
+
              semaphore.async do |sub_task|
                sub_task.annotate("query ##{index + 1}: #{query}")
                perform_search_with_retries(query, index + 1)
@@ -62,15 +62,14 @@ module Deepsearch
        def perform_search_with_retries(query, query_number)
          (MAX_RETRIES + 1).times do |attempt|
            @logger.debug("Task #{query_number}: Searching '#{query}' (Attempt #{attempt + 1})")
-
+
            results = @search_adapter.search(query, @search_options)
            extracted = extract_results(results)
            @logger.debug("✓ Task #{query_number} completed with #{extracted.size} results for '#{query}'")
            return extracted
-
          rescue StandardError => e
            @logger.debug("✗ Task #{query_number} error for '#{query}': #{e.message}")
-
+
            break if attempt >= MAX_RETRIES
 
            sleep_duration = (INITIAL_BACKOFF * (2**attempt)) + rand(0.1..0.5)
@@ -92,4 +91,4 @@ module Deepsearch
       end
     end
   end
-end
+end
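The search step combines a concurrency cap (`Async::Semaphore`) with per-task exponential backoff plus jitter (`INITIAL_BACKOFF * 2**attempt + rand(0.1..0.5)`). A self-contained sketch of that shape, with a simulated flaky call standing in for the search adapter:

```ruby
require "async"
require "async/semaphore"

MAX_CONCURRENCY = 4
MAX_RETRIES = 2
INITIAL_BACKOFF = 0.5 # seconds

queries = %w[one two three four five]

results = Sync do |task|
  semaphore = Async::Semaphore.new(MAX_CONCURRENCY, parent: task)

  tasks = queries.map do |query|
    semaphore.async do
      (MAX_RETRIES + 1).times do |attempt|
        # Simulated flaky backend; a real adapter call would go here.
        raise "transient error" if attempt.zero? && query == "three"

        break "result for #{query}"
      rescue StandardError
        break nil if attempt >= MAX_RETRIES

        sleep((INITIAL_BACKOFF * (2**attempt)) + rand(0.1..0.5))
      end
    end
  end

  tasks.map(&:wait)
end

p results # => ["result for one", "result for two", "result for three", ...]
```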
data/lib/deepsearch/engine/steps/prepare_subqueries/process.rb CHANGED
@@ -5,7 +5,7 @@ require_relative 'result'
 module Deepsearch
   class Engine
     module Steps
-      module PrepareSubqueries
+      module PrepareSubqueries
        class Process
          def initialize(original_query)
            @original_query = original_query
@@ -26,15 +26,15 @@ module Deepsearch
          private
 
          def validate_input
-            unless @original_query && !@original_query.strip.empty?
-              raise StandardError, "Original query is required for preprocessing"
-            end
+            return if @original_query && !@original_query.strip.empty?
+
+            raise StandardError, "Original query is required for preprocessing"
          end
 
          def process_query
            cleaned_query = clean_query(@original_query)
            subqueries = generate_subqueries(cleaned_query)
-
+
            PrepareSubqueries::Result.new(
              cleaned_query: cleaned_query,
              original_query: @original_query,
@@ -47,38 +47,34 @@ module Deepsearch
          end
 
          def generate_subqueries(query)
-            begin
-              Deepsearch.configuration.logger.debug("Attempting to generate subqueries using LLM...")
-              chat = RubyLLM.chat
-
-              prompt = Deepsearch.configuration.prompts.subquery_prompt(query: query)
-              Deepsearch.configuration.logger.debug("Sending prompt to LLM...")
-              response = chat.ask(prompt)
-
-              Deepsearch.configuration.logger.debug("Received response from LLM")
-              subqueries = parse_subqueries(response.content)
-              Deepsearch.configuration.logger.debug("Generated #{subqueries.size} subqueries")
-              subqueries
-            rescue StandardError => e
-              Deepsearch.configuration.logger.debug("Error generating subqueries: #{e.message}")
-              Deepsearch.configuration.logger.debug("Error class: #{e.class}")
-              Deepsearch.configuration.logger.debug("Backtrace: #{e.backtrace.first(3).join('\n')}")
-              []
-            end
+            Deepsearch.configuration.logger.debug("Attempting to generate subqueries using LLM...")
+            chat = RubyLLM.chat
+
+            prompt = Deepsearch.configuration.prompts.subquery_prompt(query: query)
+            Deepsearch.configuration.logger.debug("Sending prompt to LLM...")
+            response = chat.ask(prompt)
+
+            Deepsearch.configuration.logger.debug("Received response from LLM")
+            subqueries = parse_subqueries(response.content)
+            Deepsearch.configuration.logger.debug("Generated #{subqueries.size} subqueries")
+            subqueries
+          rescue StandardError => e
+            Deepsearch.configuration.logger.debug("Error generating subqueries: #{e.message}")
+            Deepsearch.configuration.logger.debug("Error class: #{e.class}")
+            Deepsearch.configuration.logger.debug("Backtrace: #{e.backtrace.first(3).join('\n')}")
+            []
          end
 
          def parse_subqueries(response_content)
            return [] unless response_content
 
-            subqueries = response_content.split("\n")
-                                         .map(&:strip)
-                                         .reject(&:empty?)
-                                         .map { |line| line.gsub(/^\d+\.\s*|^[-*]\s*/, '') }
-                                         .map { |query| query.gsub(/^["']|["']$/, '') }
-                                         .reject(&:empty?)
-                                         .first(5)
-
-            subqueries
+            response_content.split("\n")
+                            .map(&:strip)
+                            .reject(&:empty?)
+                            .map { |line| line.gsub(/^\d+\.\s*|^[-*]\s*/, '') }
+                            .map { |query| query.gsub(/^["']|["']$/, '') }
+                            .reject(&:empty?)
+                            .first(5)
          end
        end
      end
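`parse_subqueries` normalizes an LLM's free-form list output: strip whitespace, drop numbering and bullet markers, drop surrounding quotes, and cap the result at five queries. Applied to a typical response:

```ruby
response = <<~TEXT
  1. "ruby fiber scheduler"
  2. ruby async gem internals
  - ruby 3 concurrency primitives
TEXT

queries = response.split("\n")
                  .map(&:strip)
                  .reject(&:empty?)
                  .map { |line| line.gsub(/^\d+\.\s*|^[-*]\s*/, '') }
                  .map { |query| query.gsub(/^["']|["']$/, '') }
                  .reject(&:empty?)
                  .first(5)

p queries
# => ["ruby fiber scheduler", "ruby async gem internals", "ruby 3 concurrency primitives"]
```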
data/lib/deepsearch/engine/steps/prepare_subqueries/result.rb CHANGED
@@ -27,4 +27,4 @@ module Deepsearch
       end
     end
   end
-end
+end
data/lib/deepsearch/engine/steps/rag/chunker.rb CHANGED
@@ -28,4 +28,4 @@ module Deepsearch
       end
     end
   end
-end
+end
data/lib/deepsearch/engine/steps/rag/process.rb CHANGED
@@ -1,5 +1,8 @@
 # frozen_string_literal: true
 
+require 'async'
+require 'async/semaphore'
+
 require_relative 'values/chunk'
 require_relative 'values/query'
 require_relative 'values/result'
@@ -13,63 +16,92 @@ module Deepsearch
        # Implements the core Retrieval-Augmented Generation (RAG) logic.
        # It takes a query and a set of parsed websites, then:
        # 1. Chunks the website content into smaller pieces.
-        # 2. Generates embeddings for all text chunks in batches.
+        # 2. Generates embeddings for all text chunks concurrently in batches.
        # 3. Uses a similarity search to find the chunks most relevant to the query.
        # 4. Returns a result containing the relevant chunks.
        class Process
          CHUNK_BATCH_SIZE = 100
          MAX_TOTAL_CHUNKS = 500
          MAX_CHUNKS_PER_WEBSITE = 15
+          MAX_EMBEDDING_CONCURRENCY = 3
 
          def initialize(query:, parsed_websites:)
            @query = Values::Query.new(text: query)
            @documents = parsed_websites.map do |website|
              { url: website.url, content: website.content }
            end
+            @logger = Deepsearch.configuration.logger
          end
 
          def execute
-            begin
-              chunker = Chunker.new
-              all_chunks = @documents.each_with_object([]) do |doc, chunks|
-                next if doc[:content].to_s.strip.empty?
-
-                doc_chunks = chunker.chunk(doc[:content])
-                if doc_chunks.count > MAX_CHUNKS_PER_WEBSITE
-                  Deepsearch.configuration.logger.debug("Truncating chunks for #{doc[:url]} from #{doc_chunks.count} to #{MAX_CHUNKS_PER_WEBSITE}")
-                  doc_chunks = doc_chunks.first(MAX_CHUNKS_PER_WEBSITE)
-                end
-                doc_chunks.each { |chunk| chunk.document_url = doc[:url] }
-                chunks.concat(doc_chunks)
+            chunker = Chunker.new
+            all_chunks = @documents.each_with_object([]) do |doc, chunks|
+              next if doc[:content].to_s.strip.empty?
+
+              doc_chunks = chunker.chunk(doc[:content])
+              if doc_chunks.count > MAX_CHUNKS_PER_WEBSITE
+                @logger.debug("Truncating chunks for #{doc[:url]} from #{doc_chunks.count} to #{MAX_CHUNKS_PER_WEBSITE}")
+                doc_chunks = doc_chunks.first(MAX_CHUNKS_PER_WEBSITE)
              end
+              doc_chunks.each { |chunk| chunk.document_url = doc[:url] }
+              chunks.concat(doc_chunks)
+            end
 
-              Deepsearch.configuration.logger.debug("Chunked #{@documents.count} documents into #{all_chunks.count} chunks")
+            @logger.debug("Chunked #{@documents.count} documents into #{all_chunks.count} chunks")
 
-              if all_chunks.count > MAX_TOTAL_CHUNKS
-                Deepsearch.configuration.logger.debug("Chunk count (#{all_chunks.count}) exceeds limit of #{MAX_TOTAL_CHUNKS}. Truncating.")
-                all_chunks = all_chunks.first(MAX_TOTAL_CHUNKS)
-              end
+            if all_chunks.count > MAX_TOTAL_CHUNKS
+              @logger.debug("Chunk count (#{all_chunks.count}) exceeds limit of #{MAX_TOTAL_CHUNKS}. Truncating.")
+              all_chunks = all_chunks.first(MAX_TOTAL_CHUNKS)
+            end
+
+            generate_embeddings_in_parallel(all_chunks)
+
+            @logger.debug('Finished embedding generation, initiating similarity match..')
+            chunks_with_embeddings = all_chunks.select(&:embedding)
+            relevant_chunks = Similarity.new.find_relevant(@query, chunks_with_embeddings)
+            @logger.debug("Found #{relevant_chunks.count} relevant chunks for query: '#{@query.text}'")
 
-              all_chunks.each_slice(CHUNK_BATCH_SIZE) do |batch|
-                texts = batch.map(&:text)
-                embeddings = RubyLLM.embed(texts).vectors
-                batch.each_with_index { |chunk, i| chunk.embedding = embeddings[i] }
+            Values::Result.new(
+              query: @query,
+              relevant_chunks: relevant_chunks
+            )
+          rescue StandardError => e
+            Values::Result.new(
+              query: @query,
+              relevant_chunks: [],
+              error: e.message
+            )
+          end
+
+          private
+
+          def generate_embeddings_in_parallel(chunks)
+            return if chunks.empty?
+
+            num_batches = (chunks.count.to_f / CHUNK_BATCH_SIZE).ceil
+            @logger.debug("Starting parallel embedding generation for #{num_batches} batches with max concurrency of #{MAX_EMBEDDING_CONCURRENCY}")
+
+            Sync do |task|
+              semaphore = Async::Semaphore.new(MAX_EMBEDDING_CONCURRENCY, parent: task)
+
+              tasks = chunks.each_slice(CHUNK_BATCH_SIZE).with_index.map do |batch, index|
+                semaphore.async do |sub_task|
+                  task_number = index + 1
+                  sub_task.annotate("Embedding batch #{task_number}/#{num_batches}")
+                  @logger.debug("Task #{task_number}: Generating embeddings for batch of #{batch.size} chunks")
+
+                  begin
+                    texts = batch.map(&:text)
+                    embeddings = RubyLLM.embed(texts).vectors
+                    batch.each_with_index { |chunk, i| chunk.embedding = embeddings[i] }
+                    @logger.debug("✓ Task #{task_number} completed.")
+                  rescue StandardError => e
+                    @logger.error("✗ Task #{task_number} error: #{e.message}")
+                  end
+                end
              end
 
-              Deepsearch.configuration.logger.debug("Generated embeddings for #{all_chunks.count} chunks, initiating similarity match..")
-              relevant_chunks = Similarity.new.find_relevant(@query, all_chunks)
-              Deepsearch.configuration.logger.debug("Found #{relevant_chunks.count} relevant chunks for query: '#{@query.text}'")
-
-              Values::Result.new(
-                query: @query,
-                relevant_chunks: relevant_chunks
-              )
-            rescue StandardError => e
-              Values::Result.new(
-                query: @query,
-                relevant_chunks: [],
-                error: e.message
-              )
+              tasks.map(&:wait)
            end
          end
        end
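The new private method slices chunks into batches of `CHUNK_BATCH_SIZE` and embeds up to three batches at a time; a failed batch only logs, leaving its chunks with `nil` embeddings, which is why `execute` now filters with `all_chunks.select(&:embedding)` before the similarity step. The batching arithmetic on its own:

```ruby
CHUNK_BATCH_SIZE = 100

chunks = Array.new(250) { |i| "chunk-#{i}" }
num_batches = (chunks.count.to_f / CHUNK_BATCH_SIZE).ceil # => 3

chunks.each_slice(CHUNK_BATCH_SIZE).with_index do |batch, index|
  puts "batch #{index + 1}/#{num_batches}: #{batch.size} chunks"
end
# batch 1/3: 100 chunks
# batch 2/3: 100 chunks
# batch 3/3: 50 chunks
```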
data/lib/deepsearch/engine/steps/rag/similarity.rb CHANGED
@@ -24,10 +24,8 @@ module Deepsearch
            best_score = top_candidates.first.first
            cutoff_score = best_score * threshold
 
-            relevant_chunks = top_candidates.select { |score, _| score >= cutoff_score }
-                                            .map { |_, index| chunks[index] }
-
-            relevant_chunks
+            top_candidates.select { |score, _| score >= cutoff_score }
+                          .map { |_, index| chunks[index] }
          end
 
          private
@@ -51,10 +49,11 @@ module Deepsearch
            magnitude_b = Math.sqrt(vec_b.sum { |v| v**2 })
 
            return 0.0 if magnitude_a.zero? || magnitude_b.zero?
+
            dot_product / (magnitude_a * magnitude_b)
          end
        end
      end
    end
  end
-end
+end
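`find_relevant` keeps every chunk scoring at least `threshold` times the best cosine score. The cosine computation itself, applied to two small vectors (the dot-product line is not shown in the hunk above, so its exact form here is an assumption consistent with the magnitude lines that are):

```ruby
def cosine_similarity(vec_a, vec_b)
  dot_product = vec_a.zip(vec_b).sum { |a, b| a * b } # assumed form of the dot product
  magnitude_a = Math.sqrt(vec_a.sum { |v| v**2 })
  magnitude_b = Math.sqrt(vec_b.sum { |v| v**2 })

  return 0.0 if magnitude_a.zero? || magnitude_b.zero?

  dot_product / (magnitude_a * magnitude_b)
end

puts cosine_similarity([1.0, 2.0, 3.0], [2.0, 4.0, 6.0]) # => 1.0 (parallel vectors)
puts cosine_similarity([1.0, 0.0], [0.0, 1.0])           # => 0.0 (orthogonal vectors)
```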
data/lib/deepsearch/engine/steps/rag/values/chunk.rb CHANGED
@@ -9,7 +9,7 @@ module Deepsearch
          # This is the fundamental unit of data used in the RAG process.
          class Chunk
            attr_accessor :text, :embedding, :document_url
-
+
            def initialize(text:, embedding: nil, document_url: nil)
              @text = text
              @embedding = embedding
@@ -20,4 +20,4 @@ module Deepsearch
       end
     end
   end
-end
+end
data/lib/deepsearch/engine/steps/rag/values/query.rb CHANGED
@@ -41,4 +41,4 @@ module Deepsearch
       end
     end
   end
-end
+end
data/lib/deepsearch/engine/steps/rag/values/result.rb CHANGED
@@ -30,4 +30,4 @@ module Deepsearch
       end
     end
   end
-end
+end
data/lib/deepsearch/engine/steps/summarization/process.rb CHANGED
@@ -1,6 +1,6 @@
 # frozen_string_literal: true
 
-require_relative 'values/result'
+require_relative 'result'
 
 module Deepsearch
   class Engine
@@ -18,17 +18,17 @@ module Deepsearch
          end
 
          def execute
-            return Values::Result.new(summary: "No relevant content found to summarize.") if relevant_chunks.empty?
+            return Result.new(summary: "No relevant content found to summarize.") if relevant_chunks.empty?
 
            prompt = build_summary_prompt
            Deepsearch.configuration.logger.debug("Summarizing content with LLM...")
            response = RubyLLM.chat.ask(prompt)
            Deepsearch.configuration.logger.debug("Summarization complete.")
 
-            Values::Result.new(summary: response.content)
+            Result.new(summary: response.content)
          rescue StandardError => e
            Deepsearch.configuration.logger.debug("Error during summarization: #{e.message}")
-            Values::Result.new(summary: nil, error: e.message)
+            Result.new(summary: nil, error: e.message)
          end
 
          private
@@ -36,18 +36,19 @@ module Deepsearch
          def build_summary_prompt
            chunks_by_url = relevant_chunks.group_by(&:document_url)
            citation_map = chunks_by_url.keys.each_with_index.to_h { |url, i| [url, i + 1] }
-
+
            context_text = chunks_by_url.map do |url, chunks|
              citation_number = citation_map[url]
              chunk_contents = chunks.map(&:text).join("\n\n")
              "Source [#{citation_number}]:\n#{chunk_contents}"
            end.join("\n\n---\n\n")
-
+
            sources_list = citation_map.map { |url, number| "[#{number}]: #{url}" }.join("\n")
-            Deepsearch.configuration.prompts.summarization_prompt(query: @query.text, context_text: context_text, sources_list: sources_list)
+            Deepsearch.configuration.prompts.summarization_prompt(query: @query.text, context_text: context_text,
+                                                                  sources_list: sources_list)
          end
        end
      end
    end
  end
-end
+end
data/lib/deepsearch/engine/steps/summarization/result.rb ADDED
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+module Deepsearch
+  class Engine
+    module Steps
+      module Summarization
+        # Represents the result of the summarization step.
+        # It holds the final, synthesized summary and any potential error message.
+        class Result
+          attr_reader :summary, :error, :success
+
+          def initialize(summary: nil, error: nil)
+            @summary = summary
+            @success = error.nil?
+            @error = error
+          end
+
+          def success?
+            @success
+          end
+
+          def failure?
+            !success?
+          end
+        end
+      end
+    end
+  end
+end
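Success is derived from the absence of an error, which is what the pipeline's `with_retry` soft-failure check (`result.respond_to?(:failure?) && result.failure?`) relies on:

```ruby
require "deepsearch"

ok = Deepsearch::Engine::Steps::Summarization::Result.new(summary: "Ruby is ...")
ok.success?      # => true

failed = Deepsearch::Engine::Steps::Summarization::Result.new(error: "LLM timeout")
failed.failure?  # => true
failed.summary   # => nil
```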
data/lib/deepsearch/logger.rb CHANGED
@@ -29,4 +29,4 @@ module Deepsearch
       end
     end
   end
-end
+end
data/lib/deepsearch/prompts_config.rb CHANGED
@@ -79,4 +79,4 @@ module Deepsearch
      PROMPT
     end
   end
-end
+end
data/lib/deepsearch/version.rb CHANGED
@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 
 module Deepsearch
-  VERSION = "0.1.0"
+  VERSION = "0.1.2"
 end
data/lib/deepsearch.rb CHANGED
@@ -12,7 +12,7 @@ module Deepsearch
   # A generic error error class for exceptions raised by the Deepsearch gem,
   # from which more specific errors can inherit.
   class Error < StandardError; end
-
+
   class << self
     def configuration
       @configuration ||= Configuration.new
data/lib/search_adapters/mock_adapter.rb CHANGED
@@ -9,7 +9,7 @@ module Deepsearch
     def initialize(api_key = nil); end
 
     def search(query, options = {})
-      return mock_results(query, options)
+      mock_results(query, options)
     end
 
     private
@@ -63,7 +63,8 @@ module Deepsearch
       }
 
       if include_answer
-        response["answer"] = "Ruby is a dynamic, open-source programming language with a focus on simplicity and productivity. It was created by Yukihiro Matsumoto in the mid-1990s and follows the principle that everything is an object. Ruby is particularly popular for web development, especially with the Ruby on Rails framework, but it's also used for automation, data processing, and various other applications."
+        response["answer"] =
+          "Ruby is a dynamic, open-source programming language with a focus on simplicity and productivity. It was created by Yukihiro Matsumoto in the mid-1990s and follows the principle that everything is an object. Ruby is particularly popular for web development, especially with the Ruby on Rails framework, but it's also used for automation, data processing, and various other applications."
       end
 
       response
data/lib/search_adapters/serper_adapter.rb CHANGED
@@ -103,4 +103,4 @@ module Deepsearch
    # Custom error class for exceptions raised by the SerperAdapter.
    class SerperError < StandardError; end
  end
-end
+end
data/lib/search_adapters/tavily_adapter.rb CHANGED
@@ -40,13 +40,11 @@ module Deepsearch
     private
 
     def validate_api_key!
-      if @api_key.nil? || @api_key.strip.empty?
-        raise TavilyError, "API key is required"
-      end
+      raise TavilyError, "API key is required" if @api_key.nil? || @api_key.strip.empty?
 
-      unless @api_key.start_with?('tvly-')
-        raise TavilyError, "Invalid API key format. Expected format: tvly-YOUR_API_KEY"
-      end
+      return if @api_key.start_with?('tvly-')
+
+      raise TavilyError, "Invalid API key format. Expected format: tvly-YOUR_API_KEY"
     end
 
     def build_payload(query, options)
@@ -78,9 +76,7 @@ module Deepsearch
 
       response = http.request(request)
 
-      unless response.is_a?(Net::HTTPSuccess)
-        handle_error_response(response)
-      end
+      handle_error_response(response) unless response.is_a?(Net::HTTPSuccess)
 
       response
     end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: deepsearch-rb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.2
 platform: ruby
 authors:
 - Alexander Shagov
@@ -41,16 +41,16 @@ dependencies:
   name: ruby_llm
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '1.0'
+        version: '1.6'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '1.0'
+        version: '1.6'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement
@@ -150,7 +150,7 @@ files:
 - lib/deepsearch/engine/steps/rag/values/query.rb
 - lib/deepsearch/engine/steps/rag/values/result.rb
 - lib/deepsearch/engine/steps/summarization/process.rb
-- lib/deepsearch/engine/steps/summarization/values/result.rb
+- lib/deepsearch/engine/steps/summarization/result.rb
 - lib/deepsearch/logger.rb
 - lib/deepsearch/prompts_config.rb
 - lib/deepsearch/version.rb
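The `ruby_llm` constraint moves from a pessimistic `~> 1.0` (any 1.x release) to an open-ended `>= 1.6`, matching the 0.1.2 compatibility fix. Expressed as Gemfile lines for comparison:

```ruby
# Before (deepsearch-rb 0.1.0): any ruby_llm 1.x
gem "ruby_llm", "~> 1.0"

# After (deepsearch-rb 0.1.2): at least 1.6, no upper bound
gem "ruby_llm", ">= 1.6"
```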
data/lib/deepsearch/engine/steps/summarization/values/result.rb DELETED
@@ -1,31 +0,0 @@
-# frozen_string_literal: true
-
-module Deepsearch
-  class Engine
-    module Steps
-      module Summarization
-        module Values
-          # Represents the result of the summarization step.
-          # It holds the final, synthesized summary and any potential error message.
-          class Result
-            attr_reader :summary, :error, :success
-
-            def initialize(summary: nil, error: nil)
-              @summary = summary
-              @success = error.nil?
-              @error = error
-            end
-
-            def success?
-              @success
-            end
-
-            def failure?
-              !success?
-            end
-          end
-        end
-      end
-    end
-  end
-end