baidu 1.1.2 → 1.1.3
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/baidu.rb +19 -25
- metadata +1 -1
data/lib/baidu.rb
CHANGED
@@ -4,7 +4,14 @@ require 'nokogiri'
|
|
4
4
|
require 'json'
|
5
5
|
require 'addressable/uri'
|
6
6
|
require 'httparty'
|
7
|
-
|
7
|
+
class SearchEngine
|
8
|
+
# Whether the given URL is indexed: runs `query(url)` and returns the
# result's `has_result?`.
# NOTE(review): `query` is not defined in this class; it is expected to come
# from a subclass (e.g. Qihoo#query further down in this diff).
|
9
|
+
def indexed?(url)
|
10
|
+
# NOTE(review): the return value is discarded; presumably this call is here
# only so that a malformed URL raises before any query is made - confirm.
URI(url)
|
11
|
+
result = query(url)
|
12
|
+
# NOTE(review): a rescue branch visible under `class Baidu` returns false;
# if that branch belongs to `query`, this line would call `has_result?` on
# false - confirm.
return result.has_result?
|
13
|
+
end
|
14
|
+
end
|
8
15
|
class SearchResult
|
9
16
|
def initialize(body,baseuri,pagenumber=nil)
|
10
17
|
@body = Nokogiri::HTML body
|
@@ -42,26 +49,17 @@ class SearchResult
|
|
42
49
|
return nil
|
43
50
|
end
|
44
51
|
end
|
45
|
-
|
52
|
+
|
53
|
+
class Qihoo < SearchEngine
|
46
54
|
Host = 'www.so.com'
|
47
55
|
# Basic query, equivalent to typing the keyword directly into the search box.
# In the added (+) lines below it builds the search URL from Host and `wd`,
# fetches it with HTTParty.get, and returns a QihooResult built from the
# response and the page URI.
|
48
56
|
def query(wd)
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
return QihooResult.new(body,uri)
|
56
|
-
rescue Exception => e
|
57
|
-
warn "#{uri} fetch error: #{e.to_s}"
|
58
|
-
return false
|
59
|
-
end
|
60
|
-
end
|
61
|
-
# Whether the URL is indexed
|
62
|
-
def indexed?(url)
|
63
|
-
URI(url)
|
64
|
-
query(url).has_result?
|
57
|
+
# Request using the original path
|
58
|
+
# NOTE(review): URI.encode is obsolete and, as far as I know, was removed in
# Ruby 3.0 - confirm against the Ruby versions this gem targets.
uri = URI.join("http://#{Host}/",URI.encode('s?q='+wd)).to_s
|
59
|
+
# NOTE(review): the removed (-) lines above rescued Exception and returned
# false; the added lines contain no rescue, so a fetch error is no longer
# caught inside this method.
body = HTTParty.get(uri)
|
60
|
+
# If the request was redirected, re-derive the current page's URI; this avoids pagination errors
|
61
|
+
uri = URI.join("http://#{Host}/",body.request.path).to_s
|
62
|
+
QihooResult.new(body,uri)
|
65
63
|
end
|
66
64
|
end
|
67
65
|
|
@@ -101,7 +99,7 @@ class QihooResult < SearchResult
|
|
101
99
|
end
|
102
100
|
end
|
103
101
|
|
104
|
-
class Mbaidu
|
102
|
+
class Mbaidu < SearchEngine
|
105
103
|
BaseUri = 'http://m.baidu.com/s?'
|
106
104
|
headers = {
|
107
105
|
"User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
|
@@ -210,7 +208,7 @@ class MbaiduResult < SearchResult
|
|
210
208
|
end
|
211
209
|
|
212
210
|
end
|
213
|
-
class Baidu
|
211
|
+
class Baidu < SearchEngine
|
214
212
|
BaseUri = 'http://www.baidu.com/s?'
|
215
213
|
PerPage = 100
|
216
214
|
|
@@ -266,7 +264,7 @@ class Baidu
|
|
266
264
|
@page = @a.get uri
|
267
265
|
BaiduResult.new(@page)
|
268
266
|
rescue Net::HTTP::Persistent::Error
|
269
|
-
warn "#{uri}
|
267
|
+
warn "[timeout] #{uri}"
|
270
268
|
return false
|
271
269
|
end
|
272
270
|
=begin
|
@@ -295,10 +293,6 @@ class Baidu
|
|
295
293
|
def how_many_pages_with(host,string)
|
296
294
|
query("site:#{host} inurl:#{string}").how_many
|
297
295
|
end
|
298
|
-
# Whether the URL is indexed
|
299
|
-
def indexed?(url)
|
300
|
-
query(url).has_result?
|
301
|
-
end
|
302
296
|
|
303
297
|
=begin
|
304
298
|
private
|