baidu 1.2.4 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/baidu.rb +68 -70
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 8b31dd8b631dbab567991e53f1a6b54a4d7fd9ce
|
|
4
|
+
data.tar.gz: 2bdc0cd46017d2b02176fae221895b1d0e7db9bc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cf1c086b498c805a1dd497fea7d2467e036c1b5d66f8a773e992a819735447d90244661947dd5f9b081a2a272057c69df14759566d25e2d70798c2d27cc27c3e
|
|
7
|
+
data.tar.gz: 596c6098e190ff0e5835bc1f2daff641123805ee2894bc6d9f431d110b587fbb6e17d96822d56147822757c998ef393b702555ff30d1bae48b6c1e7d4de1ad61
|
data/lib/baidu.rb
CHANGED
|
@@ -5,8 +5,8 @@ require 'addressable/uri'
|
|
|
5
5
|
require 'httparty'
|
|
6
6
|
class SearchEngine
|
|
7
7
|
#是否收录
|
|
8
|
-
def initialize(
|
|
9
|
-
@
|
|
8
|
+
def initialize(pagesize = 100)
|
|
9
|
+
@pagesize = pagesize#只允许10或100
|
|
10
10
|
end
|
|
11
11
|
def indexed?(url)
|
|
12
12
|
URI(url)
|
|
@@ -15,15 +15,12 @@ class SearchEngine
|
|
|
15
15
|
end
|
|
16
16
|
end
|
|
17
17
|
class SearchResult
|
|
18
|
-
def initialize(
|
|
19
|
-
@
|
|
18
|
+
def initialize(page,baseuri,pagenumber=1,pagesize=100)
|
|
19
|
+
@page = Nokogiri::HTML page
|
|
20
20
|
@baseuri = baseuri
|
|
21
21
|
# @host = URI(baseuri).host
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
else
|
|
25
|
-
@pagenumber = pagenumber
|
|
26
|
-
end
|
|
22
|
+
@pagenumber = pagenumber
|
|
23
|
+
@pagesize = pagesize
|
|
27
24
|
end
|
|
28
25
|
def whole
|
|
29
26
|
{
|
|
@@ -65,10 +62,10 @@ class Qihoo < SearchEngine
|
|
|
65
62
|
def query(wd)
|
|
66
63
|
#用原始路径请求
|
|
67
64
|
uri = URI.join("http://#{Host}/",URI.encode('s?q='+wd)).to_s
|
|
68
|
-
|
|
65
|
+
page = HTTParty.get(uri)
|
|
69
66
|
#如果请求地址被跳转,重新获取当前页的URI,可避免翻页错误
|
|
70
|
-
uri = URI.join("http://#{Host}/",
|
|
71
|
-
QihooResult.new(
|
|
67
|
+
uri = URI.join("http://#{Host}/",page.request.path).to_s
|
|
68
|
+
QihooResult.new(page,uri)
|
|
72
69
|
end
|
|
73
70
|
end
|
|
74
71
|
|
|
@@ -79,7 +76,7 @@ class QihooResult < SearchResult
|
|
|
79
76
|
return @ranks unless @ranks.nil?
|
|
80
77
|
@ranks = Hash.new
|
|
81
78
|
id = (@pagenumber - 1) * 10
|
|
82
|
-
@
|
|
79
|
+
@page.search('//li[@class="res-list"]').each do |li|
|
|
83
80
|
a = li.search("h3/a").first
|
|
84
81
|
url = li.search("cite")
|
|
85
82
|
next if a['data-pos'].nil?
|
|
@@ -95,7 +92,7 @@ class QihooResult < SearchResult
|
|
|
95
92
|
def ads_top
|
|
96
93
|
id = 0
|
|
97
94
|
result = []
|
|
98
|
-
@
|
|
95
|
+
@page.search("//ul[@id='djbox']/li").each do |li|
|
|
99
96
|
id+=1
|
|
100
97
|
title = li.search("a").first.text
|
|
101
98
|
href = li.search("cite").first.text.downcase
|
|
@@ -110,7 +107,7 @@ class QihooResult < SearchResult
|
|
|
110
107
|
def ads_right
|
|
111
108
|
id = 0
|
|
112
109
|
result = []
|
|
113
|
-
@
|
|
110
|
+
@page.search("//ul[@id='rightbox']/li").each do |li|
|
|
114
111
|
id += 1
|
|
115
112
|
title = li.search("a").first.text
|
|
116
113
|
href = li.search("cite").first.text.downcase
|
|
@@ -124,18 +121,18 @@ class QihooResult < SearchResult
|
|
|
124
121
|
end
|
|
125
122
|
#下一页
|
|
126
123
|
def next
|
|
127
|
-
next_href = @
|
|
124
|
+
next_href = @page.xpath('//a[@id="snext"]')
|
|
128
125
|
return false if next_href.empty?
|
|
129
126
|
next_href = next_href.first['href']
|
|
130
127
|
next_href = URI.join(@baseuri,next_href).to_s
|
|
131
128
|
# next_href = URI.join("http://#{@host}",next_href).to_s
|
|
132
|
-
|
|
133
|
-
return QihooResult.new(
|
|
129
|
+
next_page = HTTParty.get(next_href).page
|
|
130
|
+
return QihooResult.new(next_page,next_href,@pagenumber+1)
|
|
134
131
|
#@page = MbaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
|
|
135
132
|
end
|
|
136
133
|
#有结果
|
|
137
134
|
def has_result?
|
|
138
|
-
!@
|
|
135
|
+
!@page.search('//div[@id="main"]/h3').text().include?'没有找到该URL'
|
|
139
136
|
end
|
|
140
137
|
end
|
|
141
138
|
|
|
@@ -160,22 +157,22 @@ class Mbaidu < SearchEngine
|
|
|
160
157
|
end
|
|
161
158
|
end
|
|
162
159
|
class MbaiduResult < SearchResult
|
|
163
|
-
def initialize(
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
end
|
|
160
|
+
# def initialize(page,baseuri,pagenumber=nil)
|
|
161
|
+
# @page= Nokogiri::HTML page
|
|
162
|
+
# @baseuri = baseuri
|
|
163
|
+
# if pagenumber.nil?
|
|
164
|
+
# @pagenumber = 1
|
|
165
|
+
# else
|
|
166
|
+
# @pagenumber = pagenumber
|
|
167
|
+
# end
|
|
168
|
+
# end
|
|
172
169
|
|
|
173
170
|
#返回当前页所有查询结果
|
|
174
171
|
def ranks
|
|
175
172
|
#如果已经赋值说明解析过,不需要重新解析,直接返回结果
|
|
176
173
|
return @ranks unless @ranks.nil?
|
|
177
174
|
@ranks = Hash.new
|
|
178
|
-
@
|
|
175
|
+
@page.xpath('//div[@class="result"]').each do |result|
|
|
179
176
|
href,text,host,is_mobile = '','','',false
|
|
180
177
|
a = result.search("a").first
|
|
181
178
|
is_mobile = true unless a.search("img").empty?
|
|
@@ -218,7 +215,7 @@ class MbaiduResult < SearchResult
|
|
|
218
215
|
def ads_top
|
|
219
216
|
id = 0
|
|
220
217
|
result = []
|
|
221
|
-
@
|
|
218
|
+
@page.search("div[@class='ec_wise_ad']/div").each do |div|
|
|
222
219
|
id += 1
|
|
223
220
|
href = div.search("span[@class='ec_site']").first.text
|
|
224
221
|
href = "http://#{href}"
|
|
@@ -265,19 +262,19 @@ class MbaiduResult < SearchResult
|
|
|
265
262
|
=end
|
|
266
263
|
#下一页
|
|
267
264
|
def next
|
|
268
|
-
nextbutton = @
|
|
265
|
+
nextbutton = @page.xpath('//a[text()="下一页"]').first
|
|
269
266
|
return nil if nextbutton.nil?
|
|
270
267
|
url = nextbutton['href']
|
|
271
268
|
url = URI.join(@baseuri,url).to_s
|
|
272
|
-
|
|
273
|
-
return MbaiduResult.new(
|
|
269
|
+
page = HTTParty.get(url)
|
|
270
|
+
return MbaiduResult.new(page,url,@pagenumber+1)
|
|
274
271
|
end
|
|
275
272
|
|
|
276
273
|
end
|
|
277
274
|
class Baidu < SearchEngine
|
|
278
275
|
BaseUri = 'http://www.baidu.com/s?'
|
|
279
276
|
def suggestions(wd)
|
|
280
|
-
json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").
|
|
277
|
+
json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").page.force_encoding('GBK').encode("UTF-8")
|
|
281
278
|
m = /\[([^\]]*)\]/.match json
|
|
282
279
|
return JSON.parse m[0]
|
|
283
280
|
end
|
|
@@ -307,7 +304,7 @@ class Baidu < SearchEngine
|
|
|
307
304
|
=end
|
|
308
305
|
|
|
309
306
|
def popular?(wd)
|
|
310
|
-
return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").
|
|
307
|
+
return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").page.include?"boxFlash"
|
|
311
308
|
end
|
|
312
309
|
|
|
313
310
|
def query(wd)
|
|
@@ -320,9 +317,9 @@ class Baidu < SearchEngine
|
|
|
320
317
|
begin
|
|
321
318
|
# @page = @a.get uri
|
|
322
319
|
@page = HTTParty.get uri
|
|
323
|
-
BaiduResult.new(@page,uri)
|
|
324
|
-
rescue
|
|
325
|
-
warn
|
|
320
|
+
BaiduResult.new(@page,uri,1,@pagesize)
|
|
321
|
+
rescue Exception => e
|
|
322
|
+
warn e.to_s
|
|
326
323
|
return false
|
|
327
324
|
end
|
|
328
325
|
=begin
|
|
@@ -351,31 +348,27 @@ class Baidu < SearchEngine
|
|
|
351
348
|
def how_many_pages_with(host,string)
|
|
352
349
|
query("site:#{host} inurl:#{string}").how_many
|
|
353
350
|
end
|
|
354
|
-
|
|
355
|
-
=begin
|
|
356
|
-
private
|
|
357
|
-
def clean
|
|
358
|
-
@page.body.force_encoding('GBK')
|
|
359
|
-
@page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
|
|
360
|
-
@page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
|
|
361
|
-
end
|
|
362
|
-
=end
|
|
363
351
|
end
|
|
364
352
|
|
|
365
353
|
class BaiduResult < SearchResult
|
|
366
|
-
def initialize(page,baseuri,pagenumber=1)
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
354
|
+
# def initialize(page,baseuri,pagenumber=1,pagesize=100)
|
|
355
|
+
# @page = Nokogiri::HTML page
|
|
356
|
+
# @baseuri = baseuri
|
|
357
|
+
# @pagenumber = pagenumber
|
|
358
|
+
# @pagesize = pagesize
|
|
359
|
+
# # raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
|
|
360
|
+
# # @page = page
|
|
361
|
+
# end
|
|
374
362
|
def ranks
|
|
375
363
|
return @ranks unless @ranks.nil?
|
|
376
364
|
@ranks = Hash.new
|
|
377
365
|
@page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
|
|
378
366
|
id = table['id']
|
|
367
|
+
if @pagesize == 10
|
|
368
|
+
id = table['id'][-1,1]
|
|
369
|
+
id = '10' if id == '0'
|
|
370
|
+
end
|
|
371
|
+
|
|
379
372
|
@ranks[id] = Hash.new
|
|
380
373
|
url = table.search("[@class=\"g\"]").first
|
|
381
374
|
url = url.text unless url.nil?
|
|
@@ -395,23 +388,28 @@ class BaiduResult < SearchResult
|
|
|
395
388
|
end
|
|
396
389
|
|
|
397
390
|
def ads_bottom
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
@page.search("//table[@bgcolor='f5f5f5']").
|
|
401
|
-
next unless table['id'].nil?
|
|
402
|
-
id += 1
|
|
403
|
-
ads[id]= parse_ad(table)
|
|
404
|
-
end
|
|
405
|
-
ads
|
|
391
|
+
return {} if @page.search("//table[@bgcolor='f5f5f5']").empty?
|
|
392
|
+
return ads_top
|
|
393
|
+
# p @page.search("//table[@bgcolor='f5f5f5']").empty?
|
|
406
394
|
end
|
|
407
395
|
def ads_top
|
|
408
|
-
|
|
409
|
-
ads =
|
|
410
|
-
@page.search("//table[@bgcolor='f5f5f5']").each do |table|
|
|
396
|
+
#灰色底推广,上下都有
|
|
397
|
+
ads = Hash.new
|
|
398
|
+
@page.search("//table[@bgcolor='#f5f5f5']").each do |table|
|
|
399
|
+
id = table['id']
|
|
411
400
|
next if id.nil?
|
|
412
|
-
id
|
|
401
|
+
id = id[2,3].to_i.to_s
|
|
413
402
|
ads[id]= parse_ad(table)
|
|
414
403
|
end
|
|
404
|
+
#白色底推广,只有上部分
|
|
405
|
+
if ads.empty?
|
|
406
|
+
@page.search("//table").each do |table|
|
|
407
|
+
id = table['id']
|
|
408
|
+
next if id.nil? or id.to_i<3000
|
|
409
|
+
id = id[2,3].to_i.to_s
|
|
410
|
+
ads[id]= parse_ad(table)
|
|
411
|
+
end
|
|
412
|
+
end
|
|
415
413
|
ads
|
|
416
414
|
end
|
|
417
415
|
def parse_ad(table)
|
|
@@ -462,8 +460,8 @@ class BaiduResult < SearchResult
|
|
|
462
460
|
return if url.nil?
|
|
463
461
|
url = url['href']
|
|
464
462
|
url = URI.join(@baseuri,url).to_s
|
|
465
|
-
|
|
466
|
-
return BaiduResult.new(
|
|
463
|
+
page = HTTParty.get(url)
|
|
464
|
+
return BaiduResult.new(page,url,@pagenumber+1,@pagesize)
|
|
467
465
|
# @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
|
|
468
466
|
end
|
|
469
467
|
def has_result?
|
|
@@ -471,4 +469,4 @@ class BaiduResult < SearchResult
|
|
|
471
469
|
return false if submit and submit['href'].include?'sitesubmit'
|
|
472
470
|
return true
|
|
473
471
|
end
|
|
474
|
-
end
|
|
472
|
+
end
|