duck_search 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/duck_search/client.rb +45 -27
- data/lib/duck_search/errors.rb +2 -0
- data/lib/duck_search.rb +1 -1
- metadata +43 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e559999c0129d8e3fb2ff6a49977e9987a892661853c0dad3511440779be9d7a
|
|
4
|
+
data.tar.gz: e5ad02f0123335f105a51d36695e40e2d0db84e3d6f931a686071cf22a71e151
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a5e75d35d6f5e2a31f4f4c5a5f788957746cb9ef43111b059437146e5ff537ca6c54c812fe6e2da21b79eeba2d291d3063a424f6ff2e5f29d81fed89bb26ee89
|
|
7
|
+
data.tar.gz: b476c7a66a9ffd84132cf22ee1ed77499d8b52348f478e16c525c87c0a7147f0d09a6a6c2d204406c04565d5a694ff412ab233d16085555abb8889c2fbcb31d6
|
data/lib/duck_search/client.rb
CHANGED
|
@@ -1,46 +1,59 @@
|
|
|
1
1
|
require "uri"
|
|
2
2
|
require "cgi"
|
|
3
3
|
require "faraday/retry"
|
|
4
|
+
require "faraday/gzip"
|
|
4
5
|
|
|
5
6
|
module DuckSearch
|
|
6
7
|
class Client
|
|
7
|
-
BASE_URL
|
|
8
|
-
SEARCH_PATH
|
|
9
|
-
|
|
10
|
-
|
|
8
|
+
BASE_URL = "https://html.duckduckgo.com"
|
|
9
|
+
SEARCH_PATH = "/html"
|
|
10
|
+
RESULT_CAP = 5
|
|
11
|
+
|
|
12
|
+
# Match ddgr's actual User-Agent exactly
|
|
13
|
+
DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " \
|
|
14
|
+
"(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
|
11
15
|
|
|
12
16
|
attr_reader :proxy, :timeout, :open_timeout, :user_agent
|
|
13
17
|
|
|
14
18
|
def initialize(proxy: nil, timeout: 15, open_timeout: 10, user_agent: DEFAULT_USER_AGENT)
|
|
15
|
-
@proxy
|
|
16
|
-
@timeout
|
|
19
|
+
@proxy = proxy
|
|
20
|
+
@timeout = timeout
|
|
17
21
|
@open_timeout = open_timeout
|
|
18
|
-
@user_agent
|
|
22
|
+
@user_agent = user_agent
|
|
19
23
|
end
|
|
20
24
|
|
|
21
25
|
def search(query)
|
|
26
|
+
# POST form body matching ddgr's page-0 payload
|
|
27
|
+
form_data = {
|
|
28
|
+
q: query,
|
|
29
|
+
b: "", # required blank field
|
|
30
|
+
kf: "-1", # disable favicons
|
|
31
|
+
kh: "1", # HTTPS always on
|
|
32
|
+
kl: "us-en", # region
|
|
33
|
+
kp: "1", # safe search (use -2 to disable)
|
|
34
|
+
k1: "-1", # ads off
|
|
35
|
+
}
|
|
36
|
+
|
|
22
37
|
response = connection.post(SEARCH_PATH) do |req|
|
|
23
|
-
req.headers["User-Agent"]
|
|
24
|
-
req.headers["
|
|
25
|
-
req.headers["
|
|
26
|
-
req.
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
kh: "1",
|
|
31
|
-
kp: "1",
|
|
32
|
-
k1: "-1"
|
|
33
|
-
)
|
|
38
|
+
req.headers["User-Agent"] = user_agent
|
|
39
|
+
req.headers["Accept-Encoding"] = "gzip, deflate"
|
|
40
|
+
req.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
|
|
41
|
+
req.headers["Accept-Language"] = "en-US,en;q=0.9"
|
|
42
|
+
req.headers["DNT"] = "1"
|
|
43
|
+
req.headers["Content-Type"] = "application/x-www-form-urlencoded"
|
|
44
|
+
req.body = URI.encode_www_form(form_data)
|
|
34
45
|
end
|
|
35
46
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
47
|
+
unless response.success?
|
|
48
|
+
raise HttpError.new("DuckDuckGo returned HTTP #{response.status}",
|
|
49
|
+
status: response.status,
|
|
50
|
+
url: "#{BASE_URL}#{SEARCH_PATH}")
|
|
51
|
+
end
|
|
39
52
|
|
|
40
53
|
parse_html(response.body)
|
|
41
54
|
rescue Faraday::Error => e
|
|
42
55
|
raise HttpError.new("DuckDuckGo connection failed: #{e.message}",
|
|
43
|
-
|
|
56
|
+
url: "#{BASE_URL}#{SEARCH_PATH}")
|
|
44
57
|
end
|
|
45
58
|
|
|
46
59
|
private
|
|
@@ -48,10 +61,12 @@ module DuckSearch
|
|
|
48
61
|
def connection
|
|
49
62
|
@connection ||= Faraday.new(url: BASE_URL) do |f|
|
|
50
63
|
f.proxy = proxy if proxy
|
|
51
|
-
f.request :
|
|
64
|
+
f.request :gzip
|
|
65
|
+
# Don't auto-retry on bot-detection responses — back off manually instead
|
|
66
|
+
f.request :retry, max: 2, interval: 1.0, backoff_factor: 2,
|
|
52
67
|
retry_statuses: [429, 500, 502, 503, 504],
|
|
53
68
|
methods: [:post]
|
|
54
|
-
f.options.timeout
|
|
69
|
+
f.options.timeout = timeout
|
|
55
70
|
f.options.open_timeout = open_timeout
|
|
56
71
|
f.adapter Faraday.default_adapter
|
|
57
72
|
end
|
|
@@ -62,10 +77,13 @@ module DuckSearch
|
|
|
62
77
|
|
|
63
78
|
doc = Nokogiri::HTML(html_body)
|
|
64
79
|
|
|
80
|
+
if doc.at_css(".anomaly-modal__mask") || html_body.include?("anomaly-modal")
|
|
81
|
+
raise BotError, "DuckDuckGo returned an anti-bot challenge page"
|
|
82
|
+
end
|
|
83
|
+
|
|
65
84
|
results = doc.css(".result").map do |node|
|
|
66
|
-
anchor
|
|
85
|
+
anchor = node.at_css(".result__a")
|
|
67
86
|
snippet_node = node.at_css(".result__snippet")
|
|
68
|
-
|
|
69
87
|
next unless anchor
|
|
70
88
|
|
|
71
89
|
DuckSearch::Result.new(
|
|
@@ -84,7 +102,7 @@ module DuckSearch
|
|
|
84
102
|
if href.include?("uddg=")
|
|
85
103
|
parsed = URI.parse(href.start_with?("http") ? href : "https:#{href}")
|
|
86
104
|
params = URI.decode_www_form(parsed.query || "")
|
|
87
|
-
uddg
|
|
105
|
+
uddg = params.find { |k, _| k == "uddg" }
|
|
88
106
|
uddg ? CGI.unescape(uddg[1]) : href
|
|
89
107
|
else
|
|
90
108
|
href.strip
|
data/lib/duck_search/errors.rb
CHANGED
data/lib/duck_search.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: duck_search
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.
|
|
4
|
+
version: 0.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Wenmar Pro
|
|
@@ -37,6 +37,20 @@ dependencies:
|
|
|
37
37
|
- - ">="
|
|
38
38
|
- !ruby/object:Gem::Version
|
|
39
39
|
version: '2.0'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: faraday-gzip
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '3'
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '3'
|
|
40
54
|
- !ruby/object:Gem::Dependency
|
|
41
55
|
name: nokogiri
|
|
42
56
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -93,6 +107,34 @@ dependencies:
|
|
|
93
107
|
- - "~>"
|
|
94
108
|
- !ruby/object:Gem::Version
|
|
95
109
|
version: '3.0'
|
|
110
|
+
- !ruby/object:Gem::Dependency
|
|
111
|
+
name: vcr
|
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
|
113
|
+
requirements:
|
|
114
|
+
- - "~>"
|
|
115
|
+
- !ruby/object:Gem::Version
|
|
116
|
+
version: '6.0'
|
|
117
|
+
type: :development
|
|
118
|
+
prerelease: false
|
|
119
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
120
|
+
requirements:
|
|
121
|
+
- - "~>"
|
|
122
|
+
- !ruby/object:Gem::Version
|
|
123
|
+
version: '6.0'
|
|
124
|
+
- !ruby/object:Gem::Dependency
|
|
125
|
+
name: base64
|
|
126
|
+
requirement: !ruby/object:Gem::Requirement
|
|
127
|
+
requirements:
|
|
128
|
+
- - ">="
|
|
129
|
+
- !ruby/object:Gem::Version
|
|
130
|
+
version: '0'
|
|
131
|
+
type: :development
|
|
132
|
+
prerelease: false
|
|
133
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
134
|
+
requirements:
|
|
135
|
+
- - ">="
|
|
136
|
+
- !ruby/object:Gem::Version
|
|
137
|
+
version: '0'
|
|
96
138
|
- !ruby/object:Gem::Dependency
|
|
97
139
|
name: irb
|
|
98
140
|
requirement: !ruby/object:Gem::Requirement
|