baidu 1.1.1 → 1.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/baidu.rb +232 -35
- metadata +1 -1
data/lib/baidu.rb
CHANGED
@@ -1,8 +1,215 @@
|
|
1
1
|
#coding:UTF-8
|
2
|
-
require 'rubygems'
|
3
2
|
require 'mechanize'
|
3
|
+
require 'nokogiri'
|
4
4
|
require 'json'
|
5
5
|
require 'addressable/uri'
|
6
|
+
require 'httparty'
|
7
|
+
|
8
|
+
# Base class for one page of search-engine results.
#
# The methods below call #ranks, which this class does not define; a subclass
# must provide it and return a Hash mapping a rank number to a result Hash
# that carries at least a 'host' key.
class SearchResult
  # body       - raw HTML of the result page; it is parsed with Nokogiri.
  # baseuri    - URI of the page, stored as given.
  # pagenumber - page index; nil is treated as page 1.
  def initialize(body, baseuri, pagenumber = nil)
    @body = Nokogiri::HTML(body)
    @baseuri = baseuri
    # @host = URI(baseuri).host
    @pagenumber = pagenumber.nil? ? 1 : pagenumber
  end

  # Returns the results of the current page whose host satisfies the
  # condition, as a Hash with the same keys and values as #ranks.
  #
  # specific_host - a String (exact match) or a Regexp (pattern match).
  #                 Any other kind of object matches nothing.
  def ranks_for(specific_host)
    ranks.select { |_id, line| host_match?(line['host'], specific_host) }
  end

  # Returns the rank number (as an Integer) of the first entry of #ranks
  # whose host matches +host+, or nil when no entry matches.
  #
  # host - a String (exact match) or a Regexp (pattern match).
  def rank(host) # on base of ranks
    ranks.each do |id, line|
      return id.to_i if host_match?(line['host'], host)
    end
    nil
  end

  private

  # Shared matching rule for #ranks_for and #rank. `case/when` is used rather
  # than comparing `.class`, so subclasses of String and Regexp are accepted.
  def host_match?(actual, wanted)
    case wanted
    when Regexp then !(actual =~ wanted).nil?
    when String then actual == wanted
    else false
    end
  end
end
|
45
|
+
# Client for the search engine hosted at www.so.com.
class Qihoo
  Host = 'www.so.com'

  # Basic query, equivalent to typing the keyword into the search box.
  #
  # wd - the keyword to search for.
  #
  # Returns a QihooResult for the first result page, or false when the page
  # could not be fetched (the error is reported with `warn`).
  def query(wd)
    # Escape the keyword before building the URL: URI parsing rejects raw
    # non-ASCII characters and spaces, so the keyword must already be encoded.
    uri = "http://#{Host}/s?q=#{URI.encode_www_form_component(wd)}"
    body = HTTParty.get(uri)
    # If the request was redirected, rebuild the URI of the page actually served.
    uri = URI.join("http://#{Host}/", body.request.path).to_s
    QihooResult.new(body, uri)
  rescue StandardError => e
    # StandardError only: interrupts and exit requests must not be swallowed.
    warn "#{uri} fetch error: #{e.to_s}"
    false
  end

  # Whether the given URL is indexed.
  #
  # url - the URL to look up; URI(url) raises if it is not a valid URI.
  #
  # Returns the result page's has_result? value, or false when the query
  # itself failed.
  def indexed?(url)
    URI(url)
    result = query(url)
    result ? result.has_result? : false
  end
end
|
67
|
+
|
68
|
+
# One page of results from www.so.com.
class QihooResult < SearchResult
  Host = 'www.so.com'

  # Returns all ranked results of the current page as a Hash
  #   position => { 'href' => ..., 'text' => ..., 'host' => ... }
  # The Hash is built once and memoized in @ranks.
  #
  # Positions continue from (@pagenumber - 1) * 10, i.e. the numbering
  # assumes ten results per page -- TODO confirm against the live markup.
  def ranks
    return @ranks unless @ranks.nil?

    @ranks = {}
    id = (@pagenumber - 1) * 10
    @body.xpath('//li[@class="res-list"]').each do |li|
      a = li.search('h3/a').first
      # Items without a title link, or whose link has no data-pos attribute,
      # are not counted as ranked results.
      next if a.nil? || a['data-pos'].nil?

      id += 1
      cite = li.search('cite').first
      # The host is derived from the text of the <cite> element; an item
      # without one is still recorded, with a nil host.
      # NOTE(review): URI.encode no longer exists on Ruby 3.x.
      host = cite.nil? ? nil : Addressable::URI.parse(URI.encode("http://#{cite.text}")).host
      @ranks[id] = { 'href' => "http://so.com#{a['href']}", 'text' => a.text.strip, 'host' => host }
    end
    @ranks
  end

  # Next page.
  #
  # Follows the link with id "snext" and returns a QihooResult for it, with
  # the page number increased by one. Returns nil when the page has no such
  # link.
  def next
    link = @body.xpath('//a[@id="snext"]').first
    return nil if link.nil?

    next_href = URI.join(@baseuri, link['href']).to_s
    next_body = HTTParty.get(next_href).body
    QihooResult.new(next_body, next_href, @pagenumber + 1)
  end

  # Whether the page has results: false when the heading under div#main
  # contains the "URL not found" message, true otherwise.
  def has_result?
    !@body.xpath('//div[@id="main"]/h3').text.include?('没有找到该URL')
  end
end
|
103
|
+
|
104
|
+
# Client for Baidu's mobile search at m.baidu.com.
class Mbaidu
  BaseUri = 'http://m.baidu.com/s?'
  # Requests are sent with an iPhone Safari User-Agent.
  headers = {
    "User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
  }
  Options = {:headers => headers}

  # Basic query, equivalent to typing the keyword into the search box.
  #
  # wd - the keyword to search for.
  #
  # Returns a MbaiduResult for the first result page, or false when the page
  # could not be fetched (the error is reported with `warn`).
  def query(wd)
    # Form-encode the keyword so that reserved characters such as '&', '#'
    # and '=' stay inside the "word" parameter instead of altering the query.
    uri = BaseUri + URI.encode_www_form('word' => wd)
    res = HTTParty.get(uri, Options)
    MbaiduResult.new(res, uri)
  rescue StandardError => e
    # StandardError only: interrupts and exit requests must not be swallowed.
    warn "#{uri} fetch error: #{e.to_s}"
    false
  end
end
|
124
|
+
# One page of results from m.baidu.com.
#
# The constructor (body, baseuri, pagenumber = nil) is inherited from
# SearchResult, as are #ranks_for and #rank.
class MbaiduResult < SearchResult
  # Returns all results of the current page as a Hash
  #   id => { 'href' => ..., 'text' => ..., 'is_mobile' => ..., 'host' => ... }
  # The Hash is built once and memoized in @ranks, so the page is parsed at
  # most one time.
  #
  # The id is the number found in the "&order=N&" part of the result link,
  # offset by (@pagenumber - 1) * 10 (assumes ten results per page -- TODO
  # confirm).
  # NOTE(review): results whose link has no order parameter are all stored
  # under the nil key, so only the last of them is kept -- confirm whether
  # that is intended.
  def ranks
    return @ranks unless @ranks.nil?

    @ranks = {}
    @body.xpath('//div[@class="result"]').each do |result|
      a = result.search('a').first
      # A result block without any link carries nothing to record.
      next if a.nil?

      site = result.search('span[@class="site"]').first
      host = site.nil? ? '' : site.text
      href = a['href']
      order = href.to_s[/&order=(\d+)&/, 1]
      id = order.nil? ? nil : (@pagenumber - 1) * 10 + order.to_i
      @ranks[id] = {
        'href' => href,
        'text' => a.text,
        # A result is flagged as mobile when its link contains an <img>.
        'is_mobile' => !a.search('img').empty?,
        # Drop the first non-breaking space from the site label.
        'host' => host.sub(/\u00A0/, '')
      }
    end
    @ranks
  end

  # Next page.
  #
  # Follows the link whose text is "下一页" and returns a MbaiduResult for
  # it, with the page number increased by one. Returns nil when the page has
  # no such link.
  def next
    link = @body.xpath('//a[text()="下一页"]').first
    return nil if link.nil?

    url = URI.join(@baseuri, link['href']).to_s
    # Send the same request options (mobile User-Agent) that Mbaidu#query
    # uses for the first page.
    body = HTTParty.get(url, Mbaidu::Options)
    MbaiduResult.new(body, url, @pagenumber + 1)
  end
end
|
6
213
|
class Baidu
|
7
214
|
BaseUri = 'http://www.baidu.com/s?'
|
8
215
|
PerPage = 100
|
@@ -15,7 +222,7 @@ class Baidu
|
|
15
222
|
end
|
16
223
|
|
17
224
|
# Fetches search suggestions for a keyword from suggestion.baidu.com.
#
# wd - the keyword to get suggestions for.
#
# The response body is re-labelled as GBK and converted to UTF-8, then the
# first bracketed list in it is parsed as JSON.
#
# Returns the parsed list, or an empty Array when the response contains no
# bracketed list.
def suggestions(wd)
  # Escape the keyword as a query component so characters such as '&' cannot
  # break out of the "wd" parameter.
  url = "http://suggestion.baidu.com/su?wd=#{URI.encode_www_form_component(wd)}&cb=callback"
  json = HTTParty.get(url).body.force_encoding('GBK').encode("UTF-8")
  m = /\[([^\]]*)\]/.match(json)
  return [] if m.nil?

  JSON.parse(m[0])
end
|
@@ -74,12 +281,6 @@ class Baidu
|
|
74
281
|
=end
|
75
282
|
end
|
76
283
|
|
77
|
-
=begin
|
78
|
-
def maxpage
|
79
|
-
@maxpage ||= (how_many / PerPage.to_f).round
|
80
|
-
end
|
81
|
-
=end
|
82
|
-
|
83
284
|
#site:xxx.yyy.com
|
84
285
|
def how_many_pages(host)
|
85
286
|
query("site:#{host}").how_many
|
@@ -94,6 +295,10 @@ class Baidu
|
|
94
295
|
# Runs the query "site:<host> inurl:<string>" and returns the how_many value
# of the result.
#
# host   - interpolated after "site:".
# string - interpolated after "inurl:".
def how_many_pages_with(host,string)
|
95
296
|
query("site:#{host} inurl:#{string}").how_many
|
96
297
|
end
|
298
|
+
# Whether the given URL is indexed: queries for the URL itself and returns
# the has_result? value of the result.
# NOTE(review): query is defined outside this view -- confirm what it returns
# when the fetch fails, since has_result? is called on it unconditionally.
|
299
|
+
def indexed?(url)
|
300
|
+
query(url).has_result?
|
301
|
+
end
|
97
302
|
|
98
303
|
=begin
|
99
304
|
private
|
@@ -105,13 +310,13 @@ class Baidu
|
|
105
310
|
=end
|
106
311
|
end
|
107
312
|
|
108
|
-
class BaiduResult
|
313
|
+
class BaiduResult < SearchResult
|
109
314
|
# page - the Mechanize::Page holding the result page.
#
# Raises ArgumentError when +page+ is not a Mechanize::Page.
def initialize(page)
  # `raise Class, message` -- without the comma Ruby looks for a method named
  # ArgumentError and fails with NoMethodError instead.
  raise ArgumentError, 'should be Mechanize::Page' unless page.is_a?(Mechanize::Page)

  @page = page
end
|
113
318
|
|
114
|
-
def ranks
|
319
|
+
def ranks
|
115
320
|
return @ranks unless @ranks.nil?
|
116
321
|
@ranks = Hash.new
|
117
322
|
@page.search("//table[@class=\"result\"]").each do |table|
|
@@ -129,33 +334,21 @@ class BaiduResult
|
|
129
334
|
end
|
130
335
|
end
|
131
336
|
#@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
|
132
|
-
|
133
|
-
@ranks
|
134
|
-
else
|
135
|
-
host_ranks = Hash.new
|
136
|
-
@ranks.each do |id,line|
|
137
|
-
if host.class == Regexp
|
138
|
-
host_ranks[id] = line if line['host'] =~ host
|
139
|
-
elsif host.class == String
|
140
|
-
host_ranks[id] = line if line['host'] == host
|
141
|
-
end
|
142
|
-
end
|
143
|
-
host_ranks
|
144
|
-
#'not finished'#@ranks.each_with_index.map{|h,i| i if !h.nil? and h==host}.compact
|
145
|
-
end
|
337
|
+
@ranks
|
146
338
|
end
|
147
339
|
|
148
340
|
#return the top rank number from @ranks with the input host
|
149
|
-
def rank(host)#on base of ranks
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
341
|
+
# def rank(host)#on base of ranks
|
342
|
+
# ranks.each do |id,line|
|
343
|
+
# id = id.to_i
|
344
|
+
# if host.class == Regexp
|
345
|
+
# return id if line['host'] =~ host
|
346
|
+
# elsif host.class == String
|
347
|
+
# return id if line['host'] == host
|
348
|
+
# end
|
349
|
+
# end
|
350
|
+
# return nil
|
351
|
+
# end
|
159
352
|
|
160
353
|
def how_many
|
161
354
|
@how_many ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
|
@@ -168,5 +361,9 @@ class BaiduResult
|
|
168
361
|
# Next page.
#
# Clicks the link whose text matches "下一页" with a fresh Mechanize agent
# and returns a BaiduResult wrapping the page it leads to. Returns nil when
# the current page has no such link.
#
# @page is left untouched: it must stay a page object because the other
# methods of this object keep calling @page.search / @page.link_with on it.
def next
  link = @page.link_with(:text=>/下一页/)
  return nil if link.nil?

  BaiduResult.new(Mechanize.new.click(link))
end
|
364
|
+
|
365
|
+
# Whether the page has results: true when the page contains no
# <div class="nors"> element (presumably the "no results" notice -- TODO
# confirm against the live markup).
def has_result?
|
366
|
+
@page.search('//div[@class="nors"]').empty?
|
367
|
+
end
|
171
368
|
|
172
|
-
end
|
369
|
+
end
|