baidu 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/baidu.rb +232 -35
- metadata +1 -1
data/lib/baidu.rb
CHANGED
@@ -1,8 +1,215 @@
|
|
1
1
|
#coding:UTF-8
|
2
|
-
require 'rubygems'
|
3
2
|
require 'mechanize'
|
3
|
+
require 'nokogiri'
|
4
4
|
require 'json'
|
5
5
|
require 'addressable/uri'
|
6
|
+
require 'httparty'
|
7
|
+
|
8
|
+
# Base class for one page of search-engine results.
#
# Subclasses provide #ranks, which must return a Hash mapping a rank id to a
# Hash that carries (at least) a 'host' entry. #ranks_for and #rank are built
# on top of that method.
class SearchResult
  # body       - HTML of the result page; it is parsed with Nokogiri::HTML.
  # baseuri    - URI the page was fetched from.
  # pagenumber - page index of this result page; nil means the first page (1).
  def initialize(body, baseuri, pagenumber = nil)
    @body = Nokogiri::HTML body
    @baseuri = baseuri
    @pagenumber = pagenumber.nil? ? 1 : pagenumber
  end

  # Returns the entries of the current page whose host satisfies
  # +specific_host+.
  #
  # specific_host - a String (exact host comparison) or a Regexp (pattern
  #                 match). Any other kind of object matches nothing.
  #
  # Returns a new Hash of rank id => entry.
  def ranks_for(specific_host)
    ranks.each_with_object({}) do |(id, line), host_ranks|
      host_ranks[id] = line if host_match?(line['host'], specific_host)
    end
  end

  # Returns the rank id (converted with to_i) of the first entry in #ranks
  # whose host satisfies +host+, or nil when no entry matches.
  #
  # host - a String (exact host comparison) or a Regexp (pattern match).
  def rank(host) # on base of ranks
    ranks.each do |id, line|
      return id.to_i if host_match?(line['host'], host)
    end
    nil
  end

  private

  # True when +actual+ satisfies +expected+. `case` dispatches with ===, so
  # subclasses of String and Regexp are accepted as well (an exact
  # `.class ==` comparison would silently ignore them).
  def host_match?(actual, expected)
    case expected
    when Regexp then !(actual =~ expected).nil?
    when String then actual == expected
    else false
    end
  end
end
|
45
|
+
# Client for the search engine served from www.so.com.
class Qihoo
  Host = 'www.so.com'

  # Basic query: equivalent to typing +wd+ into the search box.
  #
  # wd - the keyword(s) to search for.
  #
  # Returns a QihooResult for the first result page, or false when the page
  # could not be fetched (a warning is written to stderr in that case).
  def query(wd)
    # Escape the keyword *before* it becomes part of the URI: spaces and
    # non-ASCII characters are not valid in a URI, and URI.encode no longer
    # exists on Ruby 3.0+.
    uri = "http://#{Host}/s?q=#{URI.encode_www_form_component(wd)}"
    response = HTTParty.get(uri)
    # If the request was redirected, rebuild the URI of the page actually served.
    uri = URI.join("http://#{Host}/", response.request.path).to_s
    QihooResult.new(response, uri)
  rescue StandardError => e
    # StandardError only: interrupts and exit requests must not be swallowed.
    warn "#{uri} fetch error: #{e}"
    false
  end

  # Whether +url+ is indexed, i.e. whether querying for it yields a result.
  #
  # Raises URI::InvalidURIError when +url+ is not a parseable URI.
  # Returns false when the query itself failed.
  def indexed?(url)
    URI(url) # validation only; the parsed value is not used
    result = query(url)
    result ? result.has_result? : false
  end
end
|
67
|
+
|
68
|
+
# One page of results from www.so.com.
class QihooResult < SearchResult
  Host = 'www.so.com'

  # Returns every ranked result of the current page as a Hash of
  # rank id => {'href' => ..., 'text' => ..., 'host' => ...}.
  # Ids continue from (pagenumber - 1) * 10. The Hash is built once and cached.
  def ranks
    return @ranks unless @ranks.nil?
    @ranks = Hash.new
    id = (@pagenumber - 1) * 10
    @body.xpath('//li[@class="res-list"]').each do |li|
      a = li.search("h3/a").first
      # Items without a title link or without a data-pos attribute are not counted.
      next if a.nil? || a['data-pos'].nil?
      id += 1
      cite = li.search("cite").first
      @ranks[id] = {'href'=>"http://so.com#{a['href']}",'text'=>a.text.strip,'host'=>host_of(cite)}
    end
    @ranks
  end

  # Fetches the next page.
  #
  # Returns a QihooResult for the following page, or nil when the current page
  # has no usable "snext" link.
  def next
    link = @body.xpath('//a[@id="snext"]').first
    return nil if link.nil? || link['href'].nil?
    next_href = URI.join(@baseuri, link['href']).to_s
    next_body = HTTParty.get(next_href).body
    QihooResult.new(next_body, next_href, @pagenumber + 1)
  end

  # True unless the page heading carries the "URL not found" notice.
  def has_result?
    !@body.xpath('//div[@id="main"]/h3').text().include?'没有找到该URL'
  end

  private

  # Host part of the address shown in a result's <cite> node, or nil when the
  # node is missing. The cite text has no scheme, so "http://" is prefixed
  # before parsing. Addressable::URI.encode stands in for URI.encode, which
  # was removed in Ruby 3.0.
  def host_of(cite)
    return nil if cite.nil?
    Addressable::URI.parse(Addressable::URI.encode("http://#{cite.text}")).host
  end
end
|
103
|
+
|
104
|
+
# Client for Baidu's mobile search page (m.baidu.com).
class Mbaidu
  BaseUri = 'http://m.baidu.com/s?'
  # Requests are sent with an iPhone Safari User-Agent.
  headers = {
    "User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
  }
  Options = {:headers => headers}

  # Basic query: equivalent to typing +wd+ into the search box.
  #
  # wd - the keyword(s) to search for.
  #
  # Returns a MbaiduResult for the first result page, or false when the page
  # could not be fetched (a warning is written to stderr in that case).
  def query(wd)
    # Escape only the keyword: characters such as "&" or "#" must not leak into
    # the query string, and URI.encode no longer exists on Ruby 3.0+.
    uri = "#{BaseUri}word=#{URI.encode_www_form_component(wd)}"
    res = HTTParty.get(uri, Options)
    MbaiduResult.new(res, uri)
  rescue StandardError => e
    # StandardError only: interrupts and exit requests must not be swallowed.
    warn "#{uri} fetch error: #{e}"
    false
  end
end
|
124
|
+
# One page of results from m.baidu.com.
#
# Construction (body, baseuri, pagenumber = nil) and the host lookups
# #ranks_for / #rank are inherited from SearchResult.
class MbaiduResult < SearchResult
  # Returns every result of the current page as a Hash of
  # rank id => {'href' => ..., 'text' => ..., 'is_mobile' => ..., 'host' => ...}.
  # The Hash is built once and cached, so the page is not parsed again.
  def ranks
    return @ranks unless @ranks.nil?
    @ranks = Hash.new
    @body.xpath('//div[@class="result"]').each do |result|
      a = result.search("a").first
      next if a.nil? # a result block without any link has nothing to record

      site = result.search('span[@class="site"]').first
      host = site.nil? ? '' : site.text
      href = a['href'] || ''
      # A result is flagged as mobile when its first link contains an <img>.
      is_mobile = !a.search("img").empty?

      # The position within the page is read from the "&order=N&" part of the
      # link and offset by the page number.
      # NOTE(review): links without an order parameter are all stored under
      # the key nil and overwrite each other — confirm whether that is intended.
      order = href.scan(/&order=(\d+)&/)
      id = order.empty? ? nil : (@pagenumber - 1) * 10 + order.first.first.to_i

      @ranks[id] = {'href'=>href,'text'=>a.text,'is_mobile'=>is_mobile,'host'=>host.sub(/\u00A0/,'')}
    end
    @ranks
  end

  # Fetches the next page.
  #
  # Returns a MbaiduResult for the following page, or nil when the current
  # page has no usable "下一页" (next page) link.
  # NOTE(review): this request is sent without any request options, unlike
  # the first-page request in Mbaidu#query — confirm that is intended.
  def next
    link = @body.xpath('//a[text()="下一页"]').first
    return nil if link.nil? || link['href'].nil?
    url = URI.join(@baseuri, link['href']).to_s
    body = HTTParty.get(url)
    MbaiduResult.new(body, url, @pagenumber + 1)
  end
end
|
6
213
|
class Baidu
|
7
214
|
BaseUri = 'http://www.baidu.com/s?'
|
8
215
|
PerPage = 100
|
@@ -15,7 +222,7 @@ class Baidu
|
|
15
222
|
end
|
16
223
|
|
17
224
|
# Returns the search suggestions offered for +wd+ as an Array.
#
# The response body is treated as GBK text and transcoded to UTF-8; the first
# [...] array literal found in it is parsed as JSON. Returns an empty Array
# when the response contains no such array.
def suggestions(wd)
  # URI.encode was removed in Ruby 3.0; escape the keyword as a form component
  # so that "&" and "#" cannot break out of the wd parameter.
  url = "http://suggestion.baidu.com/su?wd=#{URI.encode_www_form_component(wd)}&cb=callback"
  json = HTTParty.get(url).body.force_encoding('GBK').encode("UTF-8")
  m = /\[([^\]]*)\]/.match json
  return [] if m.nil?
  JSON.parse m[0]
end
|
@@ -74,12 +281,6 @@ class Baidu
|
|
74
281
|
=end
|
75
282
|
end
|
76
283
|
|
77
|
-
=begin
|
78
|
-
def maxpage
|
79
|
-
@maxpage ||= (how_many / PerPage.to_f).round
|
80
|
-
end
|
81
|
-
=end
|
82
|
-
|
83
284
|
#site:xxx.yyy.com
|
84
285
|
def how_many_pages(host)
|
85
286
|
query("site:#{host}").how_many
|
@@ -94,6 +295,10 @@ class Baidu
|
|
94
295
|
# Runs the query "site:<host> inurl:<string>" and returns the how_many value
# of its result.
def how_many_pages_with(host,string)
  site_query = "site:#{host} inurl:#{string}"
  query(site_query).how_many
end
|
298
|
+
#是否收录
|
299
|
+
# Whether +url+ is indexed: queries for the URL itself and reports whether the
# result page has any result.
def indexed?(url)
  result = query(url)
  result.has_result?
end
|
97
302
|
|
98
303
|
=begin
|
99
304
|
private
|
@@ -105,13 +310,13 @@ class Baidu
|
|
105
310
|
=end
|
106
311
|
end
|
107
312
|
|
108
|
-
class BaiduResult
|
313
|
+
class BaiduResult < SearchResult
|
109
314
|
# page - the Mechanize::Page holding one page of results.
#
# Raises ArgumentError when +page+ is not a Mechanize::Page.
def initialize(page)
  # The comma matters: without it Ruby parses `ArgumentError '...'` as a call
  # to a method named ArgumentError and raises NoMethodError instead.
  raise ArgumentError, 'should be Mechanize::Page' unless page.is_a?(Mechanize::Page)
  @page = page
end
|
113
318
|
|
114
|
-
def ranks
|
319
|
+
def ranks
|
115
320
|
return @ranks unless @ranks.nil?
|
116
321
|
@ranks = Hash.new
|
117
322
|
@page.search("//table[@class=\"result\"]").each do |table|
|
@@ -129,33 +334,21 @@ class BaiduResult
|
|
129
334
|
end
|
130
335
|
end
|
131
336
|
#@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
|
132
|
-
|
133
|
-
@ranks
|
134
|
-
else
|
135
|
-
host_ranks = Hash.new
|
136
|
-
@ranks.each do |id,line|
|
137
|
-
if host.class == Regexp
|
138
|
-
host_ranks[id] = line if line['host'] =~ host
|
139
|
-
elsif host.class == String
|
140
|
-
host_ranks[id] = line if line['host'] == host
|
141
|
-
end
|
142
|
-
end
|
143
|
-
host_ranks
|
144
|
-
#'not finished'#@ranks.each_with_index.map{|h,i| i if !h.nil? and h==host}.compact
|
145
|
-
end
|
337
|
+
@ranks
|
146
338
|
end
|
147
339
|
|
148
340
|
#return the top rank number from @ranks with the input host
|
149
|
-
def rank(host)#on base of ranks
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
341
|
+
# def rank(host)#on base of ranks
|
342
|
+
# ranks.each do |id,line|
|
343
|
+
# id = id.to_i
|
344
|
+
# if host.class == Regexp
|
345
|
+
# return id if line['host'] =~ host
|
346
|
+
# elsif host.class == String
|
347
|
+
# return id if line['host'] == host
|
348
|
+
# end
|
349
|
+
# end
|
350
|
+
# return nil
|
351
|
+
# end
|
159
352
|
|
160
353
|
def how_many
|
161
354
|
@how_many ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
|
@@ -168,5 +361,9 @@ class BaiduResult
|
|
168
361
|
# Follows the "下一页" (next page) link of the current page.
#
# Returns a new BaiduResult for the following page, or nil when the current
# page has no such link. The receiver is left untouched: assigning the new
# BaiduResult to @page would replace the page with a wrapper object and break
# every later @page lookup on this result.
def next
  link = @page.link_with(:text=>/下一页/)
  return nil if link.nil?
  BaiduResult.new(Mechanize.new.click(link))
end
|
364
|
+
|
365
|
+
# True when the page contains no <div class="nors"> element.
def has_result?
  no_result_nodes = @page.search('//div[@class="nors"]')
  no_result_nodes.empty?
end
|
171
368
|
|
172
|
-
end
|
369
|
+
end
|