duck_search 0.1.0 → 0.1.1

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b30f0ccc51c4d05deef7e46119c461ab3f4d1267c7ba617e7385f24aa61ea4c4
4
- data.tar.gz: eb98b71458dea2141e75467a942bf5ccf5cbfb3c72c3165cfd20af81ad9cfa7e
3
+ metadata.gz: e559999c0129d8e3fb2ff6a49977e9987a892661853c0dad3511440779be9d7a
4
+ data.tar.gz: e5ad02f0123335f105a51d36695e40e2d0db84e3d6f931a686071cf22a71e151
5
5
  SHA512:
6
- metadata.gz: e512526190e909700a253240d032a77db2502301eb1599328c4f1bd4b46047f67b5ef3339c6db68e3e557fe7741b6b85f126f56c316c4b4a1afa683e3aa0b4ab
7
- data.tar.gz: 34d234bab472e6e3db41deda79047f841cf3d443ab01c11b1c27948b0b450810be8f6acd582ff95bb0aebd569bad096b6e678b82775faf9610db12e7e8d95f1a
6
+ metadata.gz: a5e75d35d6f5e2a31f4f4c5a5f788957746cb9ef43111b059437146e5ff537ca6c54c812fe6e2da21b79eeba2d291d3063a424f6ff2e5f29d81fed89bb26ee89
7
+ data.tar.gz: b476c7a66a9ffd84132cf22ee1ed77499d8b52348f478e16c525c87c0a7147f0d09a6a6c2d204406c04565d5a694ff412ab233d16085555abb8889c2fbcb31d6
@@ -1,46 +1,59 @@
1
1
  require "uri"
2
2
  require "cgi"
3
3
  require "faraday/retry"
4
+ require "faraday/gzip"
4
5
 
5
6
  module DuckSearch
6
7
  class Client
7
- BASE_URL = "https://html.duckduckgo.com"
8
- SEARCH_PATH = "/html"
9
- DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
10
- RESULT_CAP = 5
8
+ BASE_URL = "https://html.duckduckgo.com"
9
+ SEARCH_PATH = "/html"
10
+ RESULT_CAP = 5
11
+
12
+ # Match ddgr's actual User-Agent exactly
13
+ DEFAULT_USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " \
14
+ "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
11
15
 
12
16
  attr_reader :proxy, :timeout, :open_timeout, :user_agent
13
17
 
14
18
  def initialize(proxy: nil, timeout: 15, open_timeout: 10, user_agent: DEFAULT_USER_AGENT)
15
- @proxy = proxy
16
- @timeout = timeout
19
+ @proxy = proxy
20
+ @timeout = timeout
17
21
  @open_timeout = open_timeout
18
- @user_agent = user_agent
22
+ @user_agent = user_agent
19
23
  end
20
24
 
21
25
  def search(query)
26
+ # POST form body matching ddgr's page-0 payload
27
+ form_data = {
28
+ q: query,
29
+ b: "", # required blank field
30
+ kf: "-1", # disable favicons
31
+ kh: "1", # HTTPS always on
32
+ kl: "us-en", # region
33
+ kp: "1", # safe search (use -2 to disable)
34
+ k1: "-1", # ads off
35
+ }
36
+
22
37
  response = connection.post(SEARCH_PATH) do |req|
23
- req.headers["User-Agent"] = user_agent
24
- req.headers["DNT"] = "1"
25
- req.headers["Content-Type"] = "application/x-www-form-urlencoded"
26
- req.body = URI.encode_www_form(
27
- q: query,
28
- b: "",
29
- kf: "-1",
30
- kh: "1",
31
- kp: "1",
32
- k1: "-1"
33
- )
38
+ req.headers["User-Agent"] = user_agent
39
+ req.headers["Accept-Encoding"] = "gzip, deflate"
40
+ req.headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
41
+ req.headers["Accept-Language"] = "en-US,en;q=0.9"
42
+ req.headers["DNT"] = "1"
43
+ req.headers["Content-Type"] = "application/x-www-form-urlencoded"
44
+ req.body = URI.encode_www_form(form_data)
34
45
  end
35
46
 
36
- raise HttpError.new("DuckDuckGo returned HTTP #{response.status}",
37
- status: response.status,
38
- url: "#{BASE_URL}#{SEARCH_PATH}") unless response.success?
47
+ unless response.success?
48
+ raise HttpError.new("DuckDuckGo returned HTTP #{response.status}",
49
+ status: response.status,
50
+ url: "#{BASE_URL}#{SEARCH_PATH}")
51
+ end
39
52
 
40
53
  parse_html(response.body)
41
54
  rescue Faraday::Error => e
42
55
  raise HttpError.new("DuckDuckGo connection failed: #{e.message}",
43
- url: "#{BASE_URL}#{SEARCH_PATH}")
56
+ url: "#{BASE_URL}#{SEARCH_PATH}")
44
57
  end
45
58
 
46
59
  private
@@ -48,10 +61,12 @@ module DuckSearch
48
61
  def connection
49
62
  @connection ||= Faraday.new(url: BASE_URL) do |f|
50
63
  f.proxy = proxy if proxy
51
- f.request :retry, max: 2, interval: 0.5, backoff_factor: 2,
64
+ f.request :gzip
65
+ # Don't auto-retry on bot-detection responses — back off manually instead
66
+ f.request :retry, max: 2, interval: 1.0, backoff_factor: 2,
52
67
  retry_statuses: [429, 500, 502, 503, 504],
53
68
  methods: [:post]
54
- f.options.timeout = timeout
69
+ f.options.timeout = timeout
55
70
  f.options.open_timeout = open_timeout
56
71
  f.adapter Faraday.default_adapter
57
72
  end
@@ -62,10 +77,13 @@ module DuckSearch
62
77
 
63
78
  doc = Nokogiri::HTML(html_body)
64
79
 
80
+ if doc.at_css(".anomaly-modal__mask") || html_body.include?("anomaly-modal")
81
+ raise BotError, "DuckDuckGo returned an anti-bot challenge page"
82
+ end
83
+
65
84
  results = doc.css(".result").map do |node|
66
- anchor = node.at_css(".result__a")
85
+ anchor = node.at_css(".result__a")
67
86
  snippet_node = node.at_css(".result__snippet")
68
-
69
87
  next unless anchor
70
88
 
71
89
  DuckSearch::Result.new(
@@ -84,7 +102,7 @@ module DuckSearch
84
102
  if href.include?("uddg=")
85
103
  parsed = URI.parse(href.start_with?("http") ? href : "https:#{href}")
86
104
  params = URI.decode_www_form(parsed.query || "")
87
- uddg = params.find { |k, _| k == "uddg" }
105
+ uddg = params.find { |k, _| k == "uddg" }
88
106
  uddg ? CGI.unescape(uddg[1]) : href
89
107
  else
90
108
  href.strip
@@ -12,4 +12,6 @@ module DuckSearch
12
12
  end
13
13
 
14
14
  class ParseError < Error; end
15
+
16
+ class BotError < Error; end
15
17
  end
data/lib/duck_search.rb CHANGED
@@ -6,5 +6,5 @@ require_relative "duck_search/result"
6
6
  require_relative "duck_search/client"
7
7
 
8
8
  module DuckSearch
9
- VERSION = "0.1.0"
9
+ VERSION = "0.1.1"
10
10
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: duck_search
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Wenmar Pro
@@ -37,6 +37,20 @@ dependencies:
37
37
  - - ">="
38
38
  - !ruby/object:Gem::Version
39
39
  version: '2.0'
40
+ - !ruby/object:Gem::Dependency
41
+ name: faraday-gzip
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '3'
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '3'
40
54
  - !ruby/object:Gem::Dependency
41
55
  name: nokogiri
42
56
  requirement: !ruby/object:Gem::Requirement
@@ -93,6 +107,34 @@ dependencies:
93
107
  - - "~>"
94
108
  - !ruby/object:Gem::Version
95
109
  version: '3.0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: vcr
112
+ requirement: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - "~>"
115
+ - !ruby/object:Gem::Version
116
+ version: '6.0'
117
+ type: :development
118
+ prerelease: false
119
+ version_requirements: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - "~>"
122
+ - !ruby/object:Gem::Version
123
+ version: '6.0'
124
+ - !ruby/object:Gem::Dependency
125
+ name: base64
126
+ requirement: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ type: :development
132
+ prerelease: false
133
+ version_requirements: !ruby/object:Gem::Requirement
134
+ requirements:
135
+ - - ">="
136
+ - !ruby/object:Gem::Version
137
+ version: '0'
96
138
  - !ruby/object:Gem::Dependency
97
139
  name: irb
98
140
  requirement: !ruby/object:Gem::Requirement