baidu 1.2.4 → 1.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/baidu.rb +68 -70
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8b31dd8b631dbab567991e53f1a6b54a4d7fd9ce
|
4
|
+
data.tar.gz: 2bdc0cd46017d2b02176fae221895b1d0e7db9bc
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cf1c086b498c805a1dd497fea7d2467e036c1b5d66f8a773e992a819735447d90244661947dd5f9b081a2a272057c69df14759566d25e2d70798c2d27cc27c3e
|
7
|
+
data.tar.gz: 596c6098e190ff0e5835bc1f2daff641123805ee2894bc6d9f431d110b587fbb6e17d96822d56147822757c998ef393b702555ff30d1bae48b6c1e7d4de1ad61
|
data/lib/baidu.rb
CHANGED
@@ -5,8 +5,8 @@ require 'addressable/uri'
|
|
5
5
|
require 'httparty'
|
6
6
|
class SearchEngine
|
7
7
|
#是否收录
|
8
|
-
def initialize(
|
9
|
-
@
|
8
|
+
def initialize(pagesize = 100)
|
9
|
+
@pagesize = pagesize#只允许10或100
|
10
10
|
end
|
11
11
|
def indexed?(url)
|
12
12
|
URI(url)
|
@@ -15,15 +15,12 @@ class SearchEngine
|
|
15
15
|
end
|
16
16
|
end
|
17
17
|
class SearchResult
|
18
|
-
def initialize(
|
19
|
-
@
|
18
|
+
def initialize(page,baseuri,pagenumber=1,pagesize=100)
|
19
|
+
@page = Nokogiri::HTML page
|
20
20
|
@baseuri = baseuri
|
21
21
|
# @host = URI(baseuri).host
|
22
|
-
|
23
|
-
|
24
|
-
else
|
25
|
-
@pagenumber = pagenumber
|
26
|
-
end
|
22
|
+
@pagenumber = pagenumber
|
23
|
+
@pagesize = pagesize
|
27
24
|
end
|
28
25
|
def whole
|
29
26
|
{
|
@@ -65,10 +62,10 @@ class Qihoo < SearchEngine
|
|
65
62
|
def query(wd)
|
66
63
|
#用原始路径请求
|
67
64
|
uri = URI.join("http://#{Host}/",URI.encode('s?q='+wd)).to_s
|
68
|
-
|
65
|
+
page = HTTParty.get(uri)
|
69
66
|
#如果请求地址被跳转,重新获取当前页的URI,可避免翻页错误
|
70
|
-
uri = URI.join("http://#{Host}/",
|
71
|
-
QihooResult.new(
|
67
|
+
uri = URI.join("http://#{Host}/",page.request.path).to_s
|
68
|
+
QihooResult.new(page,uri)
|
72
69
|
end
|
73
70
|
end
|
74
71
|
|
@@ -79,7 +76,7 @@ class QihooResult < SearchResult
|
|
79
76
|
return @ranks unless @ranks.nil?
|
80
77
|
@ranks = Hash.new
|
81
78
|
id = (@pagenumber - 1) * 10
|
82
|
-
@
|
79
|
+
@page.search('//li[@class="res-list"]').each do |li|
|
83
80
|
a = li.search("h3/a").first
|
84
81
|
url = li.search("cite")
|
85
82
|
next if a['data-pos'].nil?
|
@@ -95,7 +92,7 @@ class QihooResult < SearchResult
|
|
95
92
|
def ads_top
|
96
93
|
id = 0
|
97
94
|
result = []
|
98
|
-
@
|
95
|
+
@page.search("//ul[@id='djbox']/li").each do |li|
|
99
96
|
id+=1
|
100
97
|
title = li.search("a").first.text
|
101
98
|
href = li.search("cite").first.text.downcase
|
@@ -110,7 +107,7 @@ class QihooResult < SearchResult
|
|
110
107
|
def ads_right
|
111
108
|
id = 0
|
112
109
|
result = []
|
113
|
-
@
|
110
|
+
@page.search("//ul[@id='rightbox']/li").each do |li|
|
114
111
|
id += 1
|
115
112
|
title = li.search("a").first.text
|
116
113
|
href = li.search("cite").first.text.downcase
|
@@ -124,18 +121,18 @@ class QihooResult < SearchResult
|
|
124
121
|
end
|
125
122
|
#下一页
|
126
123
|
def next
|
127
|
-
next_href = @
|
124
|
+
next_href = @page.xpath('//a[@id="snext"]')
|
128
125
|
return false if next_href.empty?
|
129
126
|
next_href = next_href.first['href']
|
130
127
|
next_href = URI.join(@baseuri,next_href).to_s
|
131
128
|
# next_href = URI.join("http://#{@host}",next_href).to_s
|
132
|
-
|
133
|
-
return QihooResult.new(
|
129
|
+
next_page = HTTParty.get(next_href).page
|
130
|
+
return QihooResult.new(next_page,next_href,@pagenumber+1)
|
134
131
|
#@page = MbaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
|
135
132
|
end
|
136
133
|
#有结果
|
137
134
|
def has_result?
|
138
|
-
!@
|
135
|
+
!@page.search('//div[@id="main"]/h3').text().include?'没有找到该URL'
|
139
136
|
end
|
140
137
|
end
|
141
138
|
|
@@ -160,22 +157,22 @@ class Mbaidu < SearchEngine
|
|
160
157
|
end
|
161
158
|
end
|
162
159
|
class MbaiduResult < SearchResult
|
163
|
-
def initialize(
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
end
|
160
|
+
# def initialize(page,baseuri,pagenumber=nil)
|
161
|
+
# @page= Nokogiri::HTML page
|
162
|
+
# @baseuri = baseuri
|
163
|
+
# if pagenumber.nil?
|
164
|
+
# @pagenumber = 1
|
165
|
+
# else
|
166
|
+
# @pagenumber = pagenumber
|
167
|
+
# end
|
168
|
+
# end
|
172
169
|
|
173
170
|
#返回当前页所有查询结果
|
174
171
|
def ranks
|
175
172
|
#如果已经赋值说明解析过,不需要重新解析,直接返回结果
|
176
173
|
return @ranks unless @ranks.nil?
|
177
174
|
@ranks = Hash.new
|
178
|
-
@
|
175
|
+
@page.xpath('//div[@class="result"]').each do |result|
|
179
176
|
href,text,host,is_mobile = '','','',false
|
180
177
|
a = result.search("a").first
|
181
178
|
is_mobile = true unless a.search("img").empty?
|
@@ -218,7 +215,7 @@ class MbaiduResult < SearchResult
|
|
218
215
|
def ads_top
|
219
216
|
id = 0
|
220
217
|
result = []
|
221
|
-
@
|
218
|
+
@page.search("div[@class='ec_wise_ad']/div").each do |div|
|
222
219
|
id += 1
|
223
220
|
href = div.search("span[@class='ec_site']").first.text
|
224
221
|
href = "http://#{href}"
|
@@ -265,19 +262,19 @@ class MbaiduResult < SearchResult
|
|
265
262
|
=end
|
266
263
|
#下一页
|
267
264
|
def next
|
268
|
-
nextbutton = @
|
265
|
+
nextbutton = @page.xpath('//a[text()="下一页"]').first
|
269
266
|
return nil if nextbutton.nil?
|
270
267
|
url = nextbutton['href']
|
271
268
|
url = URI.join(@baseuri,url).to_s
|
272
|
-
|
273
|
-
return MbaiduResult.new(
|
269
|
+
page = HTTParty.get(url)
|
270
|
+
return MbaiduResult.new(page,url,@pagenumber+1)
|
274
271
|
end
|
275
272
|
|
276
273
|
end
|
277
274
|
class Baidu < SearchEngine
|
278
275
|
BaseUri = 'http://www.baidu.com/s?'
|
279
276
|
def suggestions(wd)
|
280
|
-
json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").
|
277
|
+
json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").page.force_encoding('GBK').encode("UTF-8")
|
281
278
|
m = /\[([^\]]*)\]/.match json
|
282
279
|
return JSON.parse m[0]
|
283
280
|
end
|
@@ -307,7 +304,7 @@ class Baidu < SearchEngine
|
|
307
304
|
=end
|
308
305
|
|
309
306
|
def popular?(wd)
|
310
|
-
return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").
|
307
|
+
return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").page.include?"boxFlash"
|
311
308
|
end
|
312
309
|
|
313
310
|
def query(wd)
|
@@ -320,9 +317,9 @@ class Baidu < SearchEngine
|
|
320
317
|
begin
|
321
318
|
# @page = @a.get uri
|
322
319
|
@page = HTTParty.get uri
|
323
|
-
BaiduResult.new(@page,uri)
|
324
|
-
rescue
|
325
|
-
warn
|
320
|
+
BaiduResult.new(@page,uri,1,@pagesize)
|
321
|
+
rescue Exception => e
|
322
|
+
warn e.to_s
|
326
323
|
return false
|
327
324
|
end
|
328
325
|
=begin
|
@@ -351,31 +348,27 @@ class Baidu < SearchEngine
|
|
351
348
|
def how_many_pages_with(host,string)
|
352
349
|
query("site:#{host} inurl:#{string}").how_many
|
353
350
|
end
|
354
|
-
|
355
|
-
=begin
|
356
|
-
private
|
357
|
-
def clean
|
358
|
-
@page.body.force_encoding('GBK')
|
359
|
-
@page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
|
360
|
-
@page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
|
361
|
-
end
|
362
|
-
=end
|
363
351
|
end
|
364
352
|
|
365
353
|
class BaiduResult < SearchResult
|
366
|
-
def initialize(page,baseuri,pagenumber=1)
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
354
|
+
# def initialize(page,baseuri,pagenumber=1,pagesize=100)
|
355
|
+
# @page = Nokogiri::HTML page
|
356
|
+
# @baseuri = baseuri
|
357
|
+
# @pagenumber = pagenumber
|
358
|
+
# @pagesize = pagesize
|
359
|
+
# # raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
|
360
|
+
# # @page = page
|
361
|
+
# end
|
374
362
|
def ranks
|
375
363
|
return @ranks unless @ranks.nil?
|
376
364
|
@ranks = Hash.new
|
377
365
|
@page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
|
378
366
|
id = table['id']
|
367
|
+
if @pagesize == 10
|
368
|
+
id = table['id'][-1,1]
|
369
|
+
id = '10' if id == '0'
|
370
|
+
end
|
371
|
+
|
379
372
|
@ranks[id] = Hash.new
|
380
373
|
url = table.search("[@class=\"g\"]").first
|
381
374
|
url = url.text unless url.nil?
|
@@ -395,23 +388,28 @@ class BaiduResult < SearchResult
|
|
395
388
|
end
|
396
389
|
|
397
390
|
def ads_bottom
|
398
|
-
|
399
|
-
|
400
|
-
@page.search("//table[@bgcolor='f5f5f5']").
|
401
|
-
next unless table['id'].nil?
|
402
|
-
id += 1
|
403
|
-
ads[id]= parse_ad(table)
|
404
|
-
end
|
405
|
-
ads
|
391
|
+
return {} if @page.search("//table[@bgcolor='f5f5f5']").empty?
|
392
|
+
return ads_top
|
393
|
+
# p @page.search("//table[@bgcolor='f5f5f5']").empty?
|
406
394
|
end
|
407
395
|
def ads_top
|
408
|
-
|
409
|
-
ads =
|
410
|
-
@page.search("//table[@bgcolor='f5f5f5']").each do |table|
|
396
|
+
#灰色底推广,上下都有
|
397
|
+
ads = Hash.new
|
398
|
+
@page.search("//table[@bgcolor='#f5f5f5']").each do |table|
|
399
|
+
id = table['id']
|
411
400
|
next if id.nil?
|
412
|
-
id
|
401
|
+
id = id[2,3].to_i.to_s
|
413
402
|
ads[id]= parse_ad(table)
|
414
403
|
end
|
404
|
+
#白色底推广,只有上部分
|
405
|
+
if ads.empty?
|
406
|
+
@page.search("//table").each do |table|
|
407
|
+
id = table['id']
|
408
|
+
next if id.nil? or id.to_i<3000
|
409
|
+
id = id[2,3].to_i.to_s
|
410
|
+
ads[id]= parse_ad(table)
|
411
|
+
end
|
412
|
+
end
|
415
413
|
ads
|
416
414
|
end
|
417
415
|
def parse_ad(table)
|
@@ -462,8 +460,8 @@ class BaiduResult < SearchResult
|
|
462
460
|
return if url.nil?
|
463
461
|
url = url['href']
|
464
462
|
url = URI.join(@baseuri,url).to_s
|
465
|
-
|
466
|
-
return BaiduResult.new(
|
463
|
+
page = HTTParty.get(url)
|
464
|
+
return BaiduResult.new(page,url,@pagenumber+1,@pagesize)
|
467
465
|
# @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
|
468
466
|
end
|
469
467
|
def has_result?
|
@@ -471,4 +469,4 @@ class BaiduResult < SearchResult
|
|
471
469
|
return false if submit and submit['href'].include?'sitesubmit'
|
472
470
|
return true
|
473
471
|
end
|
474
|
-
end
|
472
|
+
end
|