deepsearch-rb 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +8 -0
- data/LICENSE.txt +21 -0
- data/README.md +138 -0
- data/lib/deepsearch/configuration.rb +88 -0
- data/lib/deepsearch/engine/pipeline.rb +126 -0
- data/lib/deepsearch/engine/steps/data_aggregation/parsed_website.rb +122 -0
- data/lib/deepsearch/engine/steps/data_aggregation/process.rb +56 -0
- data/lib/deepsearch/engine/steps/data_aggregation/result.rb +28 -0
- data/lib/deepsearch/engine/steps/parallel_search/process.rb +42 -0
- data/lib/deepsearch/engine/steps/parallel_search/result.rb +28 -0
- data/lib/deepsearch/engine/steps/parallel_search/search.rb +95 -0
- data/lib/deepsearch/engine/steps/prepare_subqueries/process.rb +87 -0
- data/lib/deepsearch/engine/steps/prepare_subqueries/result.rb +30 -0
- data/lib/deepsearch/engine/steps/rag/chunker.rb +31 -0
- data/lib/deepsearch/engine/steps/rag/process.rb +79 -0
- data/lib/deepsearch/engine/steps/rag/similarity.rb +60 -0
- data/lib/deepsearch/engine/steps/rag/values/chunk.rb +23 -0
- data/lib/deepsearch/engine/steps/rag/values/query.rb +44 -0
- data/lib/deepsearch/engine/steps/rag/values/result.rb +33 -0
- data/lib/deepsearch/engine/steps/summarization/process.rb +53 -0
- data/lib/deepsearch/engine/steps/summarization/values/result.rb +31 -0
- data/lib/deepsearch/engine.rb +25 -0
- data/lib/deepsearch/logger.rb +32 -0
- data/lib/deepsearch/prompts_config.rb +82 -0
- data/lib/deepsearch/version.rb +5 -0
- data/lib/deepsearch.rb +39 -0
- data/lib/search_adapters/mock_adapter.rb +73 -0
- data/lib/search_adapters/serper_adapter.rb +106 -0
- data/lib/search_adapters/tavily_adapter.rb +113 -0
- data/lib/search_adapters.rb +24 -0
- metadata +186 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA256:
+  metadata.gz: 435d04eb1b8c5a7d2d8e86cd17b863766559f82ffe4f194597e9b05a087b5ae0
+  data.tar.gz: 1f394f33996d8f85b17e17709a3178debb8ccad84d8268bd7590b532ac0d9fa7
+SHA512:
+  metadata.gz: 30634e66e50d377e755fe82354255d8b1e265578aca7e8f4c4833e8959ff33277a9a120c56b96c93efb13ea6e890cec8ddbf06d5952c0ef384efaceb3f8f8839
+  data.tar.gz: fca34ac34b09e4a8c5558c8114853dfec7c982fdf4bb9f630068c8ebf60bc398b1f711d5443b4e30aedc04c78539de24dcbf8a33761a719eb9815d8b38e37499
data/CHANGELOG.md
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2025 Alexander Shagov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,138 @@
+# deepsearch-rb
+
+A Ruby gem for performing LLM-powered automated web search.
+The only runtime dependencies are [ruby_llm](https://github.com/crmne/ruby_llm?tab=readme-ov-file) and [async](https://github.com/socketry/async).
+
+
+
+<details>
+<summary>🎥 A simple demo within a Sinatra app showing the basic one-cycle flow (`examples/simple_webapp`) ▶️</summary>
+
+
+</details>
+
+---
+
+**NOTE**: You can implement your own chains in whatever way works for you, e.g. BFS/DFS search on any topic. A draft might look like:
+
+```
+Deepsearch.search(initial search) ->
+  LLM(Generate additional queries) ->
+  Async [for each additional query]{ Deepsearch.search(sub-query) } ->
+  Aggregate()
+```
+
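A hedged Ruby sketch of that chain, assuming `Deepsearch.search` returns a result object exposing a `summary` string (as the gem's summarization step suggests); `generate_subqueries` is a hypothetical helper you would back with your own LLM call:

```ruby
require "async"
require "deepsearch"

# Sketch only: `generate_subqueries` is a hypothetical LLM-backed helper,
# and error handling is omitted for brevity.
def deep_dive(topic)
  first = Deepsearch.search(topic)
  sub_queries = generate_subqueries(first.summary) # hypothetical helper

  # Fan the sub-queries out concurrently via the async gem.
  followups = Sync do
    sub_queries.map { |q| Async { Deepsearch.search(q) } }.map(&:wait)
  end

  # Naive aggregation: concatenate every summary.
  ([first] + followups).map(&:summary).join("\n\n")
end
```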
+## Installation
+
+[](https://github.com/alexshagov/deepsearch-rb/actions/workflows/ruby.yml)
+
+### Quick installation via Nix flakes
+
+If you're using Nix flakes, just run `nix develop` to enter a fully prepared dev environment.
+
+Otherwise, make sure you have Ruby >= 3.2.0 installed (the gem's `required_ruby_version`).
+
+```ruby
+# Gemfile
+gem 'deepsearch-rb'
+# ..
+```
+
+```bash
+bundle install
+```
+
+Or install from a local folder:
+```ruby
+gem 'deepsearch-rb', path: '<path_to_gem>'
+```
+
+## Examples
+
+See the `examples/` folder for a simple test as well as a mini Sinatra
+application streaming gem events over a WebSocket interface.
+
+## Configuration Options
+
+### Minimal config
+
+- The LLM configuration is fully based on the [ruby_llm](https://github.com/crmne/ruby_llm?tab=readme-ov-file) gem.
+- There are two built-in search adapters you can use out of the box:
+  - [tavily](http://tavily.com/)
+  - [serper](http://serper.dev/playground)
+
+```ruby
+Deepsearch.configure do |config|
+  config.search_adapter = :tavily # or :serper, :mock
+  config.tavily_api_key = "your_tavily_api_key"
+  config.serper_api_key = "your_serper_api_key"
+
+  # LLM configuration (all options come from the `ruby_llm` gem);
+  # you can experiment with any model you like
+  config.ruby_llm.gemini_api_key = ENV['GEMINI_API_KEY']
+  config.ruby_llm.default_model = 'gemini-2.0-flash-lite'
+  config.ruby_llm.default_embedding_model = 'text-embedding-004'
+  # ..
+end
+```
+
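Once configured, a single call runs the whole pipeline (sub-query generation, parallel search, aggregation, RAG, summarization). A minimal usage sketch, assuming the returned result exposes `summary`, `success?`, and `error` as the pipeline's summarization step indicates:

```ruby
result = Deepsearch.search("What's new in Ruby 3.3?")

if result.success?
  puts result.summary # the final answer with citations
else
  warn result.error
end
```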
+### Advanced config
+
+#### Specifying your own search adapter
+
+```ruby
+class MyCustomAdapter
+  def initialize; end
+
+  def search(query, options = {})
+    # Implement your search logic here
+    {
+      "results" => [
+        { "url" => "https://example.com/result1", "content" => "Content 1 from custom search" }
+      ]
+    }
+  end
+end
+
+Deepsearch.configure { |config| config.custom_search_adapter_class = MyCustomAdapter }
+```
+
+#### Specifying your own prompts
+
+See the `Deepsearch::PromptsConfig` interface to learn what methods and arguments are expected.
+
+```ruby
+class MyPromptsConfig < Deepsearch::PromptsConfig
+  def subquery_prompt(query:)
+    <<~PROMPT
+      You are a search expert. Given this query: "#{query}",
+      generate 3 alternative search queries that would help find more information.
+      Return them as a simple list, one per line.
+    PROMPT
+  end
+
+  # Override other prompt methods as needed
+end
+
+Deepsearch.configure { |config| config.prompts_config = MyPromptsConfig.new }
+```
+
+#### Specifying an event listener
+
+```ruby
+class MyListener
+  def on_deepsearch_event(event, step:, result:)
+    puts "Event: #{event}, Step: #{step}, Success: #{result.success?}"
+  end
+end
+
+Deepsearch.configure { |c| c.listener = MyListener.new }
+```
+
+#### Specifying max total search results
+
+```ruby
+result = Deepsearch.search("Ruby 3 unknown features", max_total_search_results: 25)
+```
+
+# License
+
+The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
data/lib/deepsearch/configuration.rb
ADDED
@@ -0,0 +1,88 @@
+# frozen_string_literal: true
+
+require_relative "prompts_config"
+
+module Deepsearch
+  # Encapsulates configuration options for the underlying `ruby_llm` gem.
+  # This provides a clean namespace for LLM settings within Deepsearch's configuration.
+  #
+  # @example
+  #   Deepsearch.configure do |config|
+  #     # Configure LLM settings via the `ruby_llm` accessor
+  #     config.ruby_llm.openai_api_key = "sk-..."
+  #     config.ruby_llm.default_model = "gpt-4o-mini"
+  #     config.ruby_llm.request_timeout = 90
+  #   end
+  class RubyLLMConfig
+    SUPPORTED_ATTRIBUTES = %i[
+      openai_api_key openai_organization_id openai_project_id
+      anthropic_api_key gemini_api_key deepseek_api_key openrouter_api_key
+      ollama_api_base bedrock_api_key bedrock_secret_key bedrock_region
+      bedrock_session_token openai_api_base default_model
+      default_embedding_model default_image_model request_timeout max_retries
+      retry_interval retry_backoff_factor retry_interval_randomness
+      http_proxy logger log_file log_level log_assume_model_exists
+    ].freeze
+
+    attr_accessor(*SUPPORTED_ATTRIBUTES)
+
+    def initialize
+      # Set some sensible defaults for Deepsearch's use case
+      @default_model = "gpt-4o-mini"
+      @default_embedding_model = "text-embedding-3-small"
+      @request_timeout = 30 # seconds
+      @log_assume_model_exists = false
+    end
+  end
+
+  # Configuration class for managing gem settings
+  class Configuration
+    # @!attribute listener
+    #   An object that can listen to events from the Deepsearch pipeline.
+    #   The object must respond to `on_deepsearch_event(event_name, **payload)`.
+    #   @example
+    #     class MyListener
+    #       def on_deepsearch_event(event, step:, result:)
+    #         puts "Event: #{event}, Step: #{step}, Success: #{result.success?}"
+    #       end
+    #     end
+    #     Deepsearch.configure { |c| c.listener = MyListener.new }
+    attr_accessor :tavily_api_key, :serper_api_key, :search_adapter, :custom_search_adapter_class, :logger, :listener, :prompts
+    attr_reader :ruby_llm
+
+    def initialize
+      @tavily_api_key = nil
+      @serper_api_key = nil
+      @search_adapter = :tavily
+      @custom_search_adapter_class = nil
+      @listener = nil
+      @logger = Logger.new($stdout, level: Logger::DEBUG)
+      @ruby_llm = RubyLLMConfig.new
+      @prompts = PromptsConfig.new
+    end
+
+    # Reset configuration to default values
+    def reset!
+      @tavily_api_key = nil
+      @serper_api_key = nil
+      @search_adapter = :tavily
+      @custom_search_adapter_class = nil
+      @listener = nil
+      @logger = Logger.new($stdout, level: Logger::DEBUG)
+      @ruby_llm = RubyLLMConfig.new
+      @prompts = PromptsConfig.new
+    end
+
+    # Configure RubyLLM with current settings from the `ruby_llm` config object.
+    def configure_llm!
+      require "ruby_llm"
+
+      RubyLLM.configure do |config|
+        RubyLLMConfig::SUPPORTED_ATTRIBUTES.each do |attr|
+          value = @ruby_llm.public_send(attr)
+          # Only set the value if it's not nil to avoid overriding RubyLLM's internal defaults.
+          config.public_send("#{attr}=", value) unless value.nil?
+        end
+      end
+    end
+  end
+end
data/lib/deepsearch/engine/pipeline.rb
ADDED
@@ -0,0 +1,126 @@
+# frozen_string_literal: true
+
+require_relative "steps/prepare_subqueries/process"
+require_relative "steps/parallel_search/process"
+require_relative "steps/data_aggregation/process"
+require_relative "steps/rag/process"
+require_relative "steps/summarization/process"
+
+module Deepsearch
+  class Engine
+    # Orchestrates the entire multi-step search and summarization process.
+    # The pipeline executes a sequence of steps:
+    # 1. Prepares sub-queries from the initial query.
+    # 2. Performs parallel searches to gather website links.
+    # 3. Aggregates and parses content from the found websites.
+    # 4. Uses RAG to find text chunks relevant to the query.
+    # 5. Summarizes the relevant chunks into a final answer.
+    # It includes retry logic for each step to enhance robustness.
+    class Pipeline
+      def initialize(search_adapter)
+        @search_adapter = search_adapter
+      end
+
+      def execute(query, **options)
+        query_preprocessing_result = with_retry do
+          Steps::PrepareSubqueries::Process.new(query).execute
+        end
+        notify_listener(:step_completed, step: :prepare_subqueries, result: query_preprocessing_result)
+        # [query_preprocessing_result] Contains:
+        # - cleaned_query [String] The sanitized version of the original query
+        # - original_query [String] The unmodified input query
+        # - sub_queries [Array<String>] Generated sub-queries (empty array on error)
+        # - error [String, nil] Error message if processing failed
+
+        parallel_search_options = {
+          initial_query: query_preprocessing_result.cleaned_query,
+          sub_queries: query_preprocessing_result.sub_queries,
+          search_adapter: @search_adapter,
+          **options
+        }
+
+        parallel_search_result = with_retry { Steps::ParallelSearch::Process.new(**parallel_search_options).execute }
+        notify_listener(:step_completed, step: :parallel_search, result: parallel_search_result)
+        # [parallel_search_result] A ParallelSearch::Result with:
+        # - websites [Array<Hash>] Website hashes, each with a "url" key
+        # - success [Boolean] Whether the search succeeded
+        # - error [String, nil] Error message if the search failed
+
+        data_aggregation_result = with_retry do
+          Steps::DataAggregation::Process.new(
+            websites: parallel_search_result.websites
+          ).execute
+        end
+        notify_listener(:step_completed, step: :data_aggregation, result: data_aggregation_result)
+        # [data_aggregation_result] A DataAggregation::Result with:
+        # - parsed_websites [Array<ParsedWebsite>], each with:
+        #   - url [String] Website URL
+        #   - content [String] Parsed content from the website
+        #   - success [Boolean] Whether fetching/parsing succeeded
+        #   - error [String, nil] Error message if it failed
+
+        rag_result = with_retry do
+          Steps::Rag::Process.new(
+            query: query_preprocessing_result.cleaned_query,
+            parsed_websites: data_aggregation_result.parsed_websites
+          ).execute
+        end
+        notify_listener(:step_completed, step: :rag, result: rag_result)
+        # [rag_result] Contains:
+        # - query [::Deepsearch::Engine::Steps::Rag::Values::Query]
+        # - relevant_chunks [Array<::Deepsearch::Engine::Steps::Rag::Values::Chunk>]
+
+        summarization_result = with_retry do
+          Steps::Summarization::Process.new(
+            query: rag_result.query,
+            relevant_chunks: rag_result.relevant_chunks
+          ).execute
+        end
+        notify_listener(:step_completed, step: :summarization, result: summarization_result)
+        # [summarization_result] Contains:
+        # - summary [String] The final answer with citations
+        # - success [Boolean]
+        # - error [String, nil]
+
+        summarization_result
+      end
+
+      private
+
+      def notify_listener(event, **payload)
+        listener = Deepsearch.configuration.listener
+        unless listener.respond_to?(:on_deepsearch_event)
+          Deepsearch.configuration.logger.debug("Attached listener does not respond to on_deepsearch_event, skipping notification")
+          return
+        end
+
+        begin
+          listener.on_deepsearch_event(event, **payload)
+        rescue StandardError => e
+          Deepsearch.configuration.logger.debug("Deepsearch listener failed: #{e.message}")
+        end
+      end
+
+      def with_retry(&block)
+        retries = 0
+        begin
+          result = block.call
+          # Handle "soft" failures from steps that return a result object with a #failure? method
+          raise "Operation failed: #{result.error}" if result.respond_to?(:failure?) && result.failure?
+
+          result
+        rescue StandardError => e
+          if (retries += 1) <= 1
+            Deepsearch.configuration.logger.debug("Retrying after error: #{e.message}")
+            retry
+          else
+            raise e
+          end
+        end
+      end
+    end
+  end
+end
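Given the `notify_listener` calls above, a configured listener receives one `:step_completed` event per pipeline step. A hedged sketch of a progress-logging listener (class and method names below are illustrative; only the `on_deepsearch_event(event, step:, result:)` contract comes from the gem):

```ruby
# Logs pipeline progress as each step completes.
class ProgressLogger
  STEPS = %i[prepare_subqueries parallel_search data_aggregation rag summarization].freeze

  def on_deepsearch_event(event, step:, result:)
    return unless event == :step_completed

    position = (STEPS.index(step) || 0) + 1
    status = result.respond_to?(:success?) && result.success? ? "ok" : "failed"
    puts format("[%d/%d] %s: %s", position, STEPS.size, step, status)
  end
end

Deepsearch.configure { |c| c.listener = ProgressLogger.new }
```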
data/lib/deepsearch/engine/steps/data_aggregation/parsed_website.rb
ADDED
@@ -0,0 +1,122 @@
+# frozen_string_literal: true
+
+require 'net/http'
+require 'uri'
+require 'nokogiri'
+
+module Deepsearch
+  class Engine
+    module Steps
+      module DataAggregation
+        # Fetches content from a URL, parses it, and cleans it to extract meaningful text.
+        # It handles HTTP requests, content type detection, and removal of unwanted HTML elements.
+        class ParsedWebsite
+          attr_reader :url, :content, :success, :error, :metadata, :timestamp
+
+          def initialize(url:)
+            @url = url
+            @content = nil
+            @success = false
+            @error = nil
+            fetch_content!
+          end
+
+          def success?
+            @success
+          end
+
+          def size
+            content.to_s.size
+          end
+
+          def to_h
+            {
+              url: url,
+              success: success?,
+              error: error,
+              content: content
+            }
+          end
+
+          protected
+
+          def fetch_content!
+            uri = URI.parse(@url)
+
+            unless %w[http https].include?(uri.scheme)
+              @error = "Invalid URL scheme: #{uri.scheme}"
+              return
+            end
+
+            http = Net::HTTP.new(uri.host, uri.port)
+            http.use_ssl = uri.scheme == 'https'
+            http.read_timeout = 10
+            http.open_timeout = 5
+
+            request = Net::HTTP::Get.new(uri.request_uri)
+            request['User-Agent'] = random_user_agent
+
+            response = http.request(request)
+
+            if response.is_a?(Net::HTTPSuccess)
+              body = response.body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
+              @content = clean_content(body)
+              @success = true
+            else
+              @error = "HTTP #{response.code}"
+            end
+          rescue StandardError => e
+            @error = e.message
+          end
+
+          private
+
+          def random_user_agent
+            [
+              'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 DeepSearch/1.0',
+              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 DeepSearch/1.0',
+              'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.15 DeepSearch/1.0',
+              'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0 DeepSearch/1.0'
+            ].sample
+          end
+
+          def clean_content(content)
+            raw_content = content.to_s
+            # If not HTML, we still need to make sure it's valid UTF-8 for JSON serialization.
+            unless raw_content =~ /<html[\s>]|<!DOCTYPE html/i
+              return raw_content.encode("UTF-8", "binary", invalid: :replace, undef: :replace, replace: "")
+            end
+
+            # Let Nokogiri parse the raw bytes and detect encoding
+            doc = Nokogiri::HTML(raw_content)
+
+            # Remove unwanted elements
+            doc.css('script, style, head, meta, link, noscript, iframe, svg, img').remove
+
+            # Remove comments
+            doc.xpath('//comment()').remove
+
+            # Remove inline styles and event handlers
+            doc.css('*').each do |node|
+              node.remove_attribute('style')
+              node.remove_attribute('onclick')
+              node.remove_attribute('onload')
+              node.remove_attribute('onerror')
+            end
+
+            # Get text content and clean it up
+            text = (doc.at('body')&.text || doc.text).to_s
+            utf8_text = text.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
+            utf8_text.gsub(/[[:space:]]+/, " ").strip
+          rescue StandardError
+            # Fallback if Nokogiri fails. The raw_content is the problem. Sanitize it from binary to UTF-8.
+            fallback_text = content.to_s.encode("UTF-8", "binary", invalid: :replace, undef: :replace, replace: "")
+            fallback_text.gsub(%r{<script\b[^>]*>.*?</script>}mi, "")
+                         .gsub(%r{<style\b[^>]*>.*?</style>}mi, "")
+                         .gsub(/[[:space:]]+/, " ")
+                         .strip
+          end
+        end
+      end
+    end
+  end
+end
data/lib/deepsearch/engine/steps/data_aggregation/process.rb
ADDED
@@ -0,0 +1,56 @@
+# frozen_string_literal: true
+
+require "async"
+require "async/semaphore"
+require_relative "parsed_website"
+require_relative "result"
+
+module Deepsearch
+  class Engine
+    module Steps
+      module DataAggregation
+        # Takes a list of website URLs from a previous search step and processes them in parallel.
+        # For each URL, it fetches, parses, and cleans the content using `ParsedWebsite`.
+        # It aggregates the successfully parsed websites into a `Result` object.
+        class Process
+          MAX_CONCURRENCY = 30
+
+          attr_reader :websites
+
+          def initialize(websites: [])
+            @websites = websites
+          end
+
+          def execute
+            Deepsearch.configuration.logger.debug("Starting data aggregation for #{@websites.size} websites")
+
+            parsed_websites = process_in_parallel
+            parsed_websites.filter!(&:success?)
+
+            Result.new(
+              parsed_websites: parsed_websites
+            )
+          rescue StandardError => e
+            Result.new(
+              parsed_websites: [],
+              error: e.message
+            )
+          end
+
+          private
+
+          def process_in_parallel
+            Sync do |task|
+              semaphore = Async::Semaphore.new(MAX_CONCURRENCY, parent: task)
+              websites.map do |website|
+                semaphore.async do
+                  ParsedWebsite.new(url: website['url'])
+                end
+              end.map(&:wait)
+            end
+          end
+        end
+      end
+    end
+  end
+end
data/lib/deepsearch/engine/steps/data_aggregation/result.rb
ADDED
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+module Deepsearch
+  class Engine
+    module Steps
+      module DataAggregation
+        # Represents the result of the data aggregation step.
+        # It holds the collection of successfully parsed websites and any potential error message.
+        class Result
+          attr_reader :parsed_websites, :error
+
+          def initialize(parsed_websites: [], error: nil)
+            @parsed_websites = parsed_websites
+            @error = error
+          end
+
+          def success?
+            error.nil?
+          end
+
+          def failure?
+            !success?
+          end
+        end
+      end
+    end
+  end
+end
data/lib/deepsearch/engine/steps/parallel_search/process.rb
ADDED
@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+
+require_relative 'result'
+require_relative 'search'
+
+module Deepsearch
+  class Engine
+    module Steps
+      module ParallelSearch
+        # Orchestrates the parallel execution of multiple search queries (initial query + sub-queries).
+        # It uses the `Search` class to perform the actual concurrent searches via a search adapter
+        # and wraps the outcome in a `Result` object.
+        class Process
+          attr_reader :initial_query, :sub_queries, :search_adapter, :options
+
+          def initialize(initial_query:,
+                         sub_queries:,
+                         search_adapter:,
+                         **options)
+            @initial_query = initial_query
+            @sub_queries = sub_queries
+            @search_adapter = search_adapter
+            @options = options
+          end
+
+          def execute
+            websites = Search.new(initial_query, sub_queries, search_adapter, **@options).output
+            Deepsearch.configuration.logger.debug("Parallel search completed with #{websites.size} results")
+            ParallelSearch::Result.new(
+              websites: websites
+            )
+          rescue StandardError => e
+            ParallelSearch::Result.new(
+              websites: [],
+              error: e.message
+            )
+          end
+        end
+      end
+    end
+  end
+end
data/lib/deepsearch/engine/steps/parallel_search/result.rb
ADDED
@@ -0,0 +1,28 @@
+# frozen_string_literal: true
+
+module Deepsearch
+  class Engine
+    module Steps
+      module ParallelSearch
+        # Represents the result of the parallel search step.
+        # It holds the aggregated list of websites found and any potential error message.
+        class Result
+          attr_reader :query, :websites, :error, :search_duration
+
+          def initialize(websites: [], error: nil)
+            @websites = websites || []
+            @error = error
+          end
+
+          def success?
+            error.nil?
+          end
+
+          def failure?
+            !success?
+          end
+        end
+      end
+    end
+  end
+end