baidu 1.2.4 → 1.2.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/baidu.rb +68 -70
  3. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4ad277bccec5ed902f59dbaab5e18aaa5fa5af8f
4
- data.tar.gz: 08a52511a1952bc3d0ab3398cca376d7f8960bed
3
+ metadata.gz: 8b31dd8b631dbab567991e53f1a6b54a4d7fd9ce
4
+ data.tar.gz: 2bdc0cd46017d2b02176fae221895b1d0e7db9bc
5
5
  SHA512:
6
- metadata.gz: 8b063ef6a9d4d85dea496b1b28fabe9ce184384452991de9872551cd39535e1bb55084156205bbdafadda7ee4f5be80f28517004d2ec5be683d7949647f4dbd6
7
- data.tar.gz: b0950ebbb8c6aa2fd49388d92bc21306cbf323f5a2971380194525ee834f07a2abe862da3f10a87b3d375d1728699a087229eaceb221617f265f10b3c9fbd12e
6
+ metadata.gz: cf1c086b498c805a1dd497fea7d2467e036c1b5d66f8a773e992a819735447d90244661947dd5f9b081a2a272057c69df14759566d25e2d70798c2d27cc27c3e
7
+ data.tar.gz: 596c6098e190ff0e5835bc1f2daff641123805ee2894bc6d9f431d110b587fbb6e17d96822d56147822757c998ef393b702555ff30d1bae48b6c1e7d4de1ad61
data/lib/baidu.rb CHANGED
@@ -5,8 +5,8 @@ require 'addressable/uri'
5
5
  require 'httparty'
6
6
  class SearchEngine
7
7
  #是否收录
8
- def initialize(perpage = 100)
9
- @perpage = perpage
8
+ def initialize(pagesize = 100)
9
+ @pagesize = pagesize#只允许10或100
10
10
  end
11
11
  def indexed?(url)
12
12
  URI(url)
@@ -15,15 +15,12 @@ class SearchEngine
15
15
  end
16
16
  end
17
17
  class SearchResult
18
- def initialize(body,baseuri,pagenumber=nil)
19
- @body = Nokogiri::HTML body
18
+ def initialize(page,baseuri,pagenumber=1,pagesize=100)
19
+ @page = Nokogiri::HTML page
20
20
  @baseuri = baseuri
21
21
  # @host = URI(baseuri).host
22
- if pagenumber.nil?
23
- @pagenumber = 1
24
- else
25
- @pagenumber = pagenumber
26
- end
22
+ @pagenumber = pagenumber
23
+ @pagesize = pagesize
27
24
  end
28
25
  def whole
29
26
  {
@@ -65,10 +62,10 @@ class Qihoo < SearchEngine
65
62
  def query(wd)
66
63
  #用原始路径请求
67
64
  uri = URI.join("http://#{Host}/",URI.encode('s?q='+wd)).to_s
68
- body = HTTParty.get(uri)
65
+ page = HTTParty.get(uri)
69
66
  #如果请求地址被跳转,重新获取当前页的URI,可避免翻页错误
70
- uri = URI.join("http://#{Host}/",body.request.path).to_s
71
- QihooResult.new(body,uri)
67
+ uri = URI.join("http://#{Host}/",page.request.path).to_s
68
+ QihooResult.new(page,uri)
72
69
  end
73
70
  end
74
71
 
@@ -79,7 +76,7 @@ class QihooResult < SearchResult
79
76
  return @ranks unless @ranks.nil?
80
77
  @ranks = Hash.new
81
78
  id = (@pagenumber - 1) * 10
82
- @body.xpath('//li[@class="res-list"]').each do |li|
79
+ @page.search('//li[@class="res-list"]').each do |li|
83
80
  a = li.search("h3/a").first
84
81
  url = li.search("cite")
85
82
  next if a['data-pos'].nil?
@@ -95,7 +92,7 @@ class QihooResult < SearchResult
95
92
  def ads_top
96
93
  id = 0
97
94
  result = []
98
- @body.search("//ul[@id='djbox']/li").each do |li|
95
+ @page.search("//ul[@id='djbox']/li").each do |li|
99
96
  id+=1
100
97
  title = li.search("a").first.text
101
98
  href = li.search("cite").first.text.downcase
@@ -110,7 +107,7 @@ class QihooResult < SearchResult
110
107
  def ads_right
111
108
  id = 0
112
109
  result = []
113
- @body.search("//ul[@id='rightbox']/li").each do |li|
110
+ @page.search("//ul[@id='rightbox']/li").each do |li|
114
111
  id += 1
115
112
  title = li.search("a").first.text
116
113
  href = li.search("cite").first.text.downcase
@@ -124,18 +121,18 @@ class QihooResult < SearchResult
124
121
  end
125
122
  #下一页
126
123
  def next
127
- next_href = @body.xpath('//a[@id="snext"]')
124
+ next_href = @page.xpath('//a[@id="snext"]')
128
125
  return false if next_href.empty?
129
126
  next_href = next_href.first['href']
130
127
  next_href = URI.join(@baseuri,next_href).to_s
131
128
  # next_href = URI.join("http://#{@host}",next_href).to_s
132
- next_body = HTTParty.get(next_href).body
133
- return QihooResult.new(next_body,next_href,@pagenumber+1)
129
+ next_page = HTTParty.get(next_href).page
130
+ return QihooResult.new(next_page,next_href,@pagenumber+1)
134
131
  #@page = MbaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
135
132
  end
136
133
  #有结果
137
134
  def has_result?
138
- !@body.search('//div[@id="main"]/h3').text().include?'没有找到该URL'
135
+ !@page.search('//div[@id="main"]/h3').text().include?'没有找到该URL'
139
136
  end
140
137
  end
141
138
 
@@ -160,22 +157,22 @@ class Mbaidu < SearchEngine
160
157
  end
161
158
  end
162
159
  class MbaiduResult < SearchResult
163
- def initialize(body,baseuri,pagenumber=nil)
164
- @body = Nokogiri::HTML body
165
- @baseuri = baseuri
166
- if pagenumber.nil?
167
- @pagenumber = 1
168
- else
169
- @pagenumber = pagenumber
170
- end
171
- end
160
+ # def initialize(page,baseuri,pagenumber=nil)
161
+ # @page= Nokogiri::HTML page
162
+ # @baseuri = baseuri
163
+ # if pagenumber.nil?
164
+ # @pagenumber = 1
165
+ # else
166
+ # @pagenumber = pagenumber
167
+ # end
168
+ # end
172
169
 
173
170
  #返回当前页所有查询结果
174
171
  def ranks
175
172
  #如果已经赋值说明解析过,不需要重新解析,直接返回结果
176
173
  return @ranks unless @ranks.nil?
177
174
  @ranks = Hash.new
178
- @body.xpath('//div[@class="result"]').each do |result|
175
+ @page.xpath('//div[@class="result"]').each do |result|
179
176
  href,text,host,is_mobile = '','','',false
180
177
  a = result.search("a").first
181
178
  is_mobile = true unless a.search("img").empty?
@@ -218,7 +215,7 @@ class MbaiduResult < SearchResult
218
215
  def ads_top
219
216
  id = 0
220
217
  result = []
221
- @body.search("div[@class='ec_wise_ad']/div").each do |div|
218
+ @page.search("div[@class='ec_wise_ad']/div").each do |div|
222
219
  id += 1
223
220
  href = div.search("span[@class='ec_site']").first.text
224
221
  href = "http://#{href}"
@@ -265,19 +262,19 @@ class MbaiduResult < SearchResult
265
262
  =end
266
263
  #下一页
267
264
  def next
268
- nextbutton = @body.xpath('//a[text()="下一页"]').first
265
+ nextbutton = @page.xpath('//a[text()="下一页"]').first
269
266
  return nil if nextbutton.nil?
270
267
  url = nextbutton['href']
271
268
  url = URI.join(@baseuri,url).to_s
272
- body = HTTParty.get(url)
273
- return MbaiduResult.new(body,url,@pagenumber+1)
269
+ page = HTTParty.get(url)
270
+ return MbaiduResult.new(page,url,@pagenumber+1)
274
271
  end
275
272
 
276
273
  end
277
274
  class Baidu < SearchEngine
278
275
  BaseUri = 'http://www.baidu.com/s?'
279
276
  def suggestions(wd)
280
- json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").body.force_encoding('GBK').encode("UTF-8")
277
+ json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").page.force_encoding('GBK').encode("UTF-8")
281
278
  m = /\[([^\]]*)\]/.match json
282
279
  return JSON.parse m[0]
283
280
  end
@@ -307,7 +304,7 @@ class Baidu < SearchEngine
307
304
  =end
308
305
 
309
306
  def popular?(wd)
310
- return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").body.include?"boxFlash"
307
+ return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").page.include?"boxFlash"
311
308
  end
312
309
 
313
310
  def query(wd)
@@ -320,9 +317,9 @@ class Baidu < SearchEngine
320
317
  begin
321
318
  # @page = @a.get uri
322
319
  @page = HTTParty.get uri
323
- BaiduResult.new(@page,uri)
324
- rescue Net::HTTP::Persistent::Error
325
- warn "[timeout] #{uri}"
320
+ BaiduResult.new(@page,uri,1,@pagesize)
321
+ rescue Exception => e
322
+ warn e.to_s
326
323
  return false
327
324
  end
328
325
  =begin
@@ -351,31 +348,27 @@ class Baidu < SearchEngine
351
348
  def how_many_pages_with(host,string)
352
349
  query("site:#{host} inurl:#{string}").how_many
353
350
  end
354
-
355
- =begin
356
- private
357
- def clean
358
- @page.body.force_encoding('GBK')
359
- @page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
360
- @page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
361
- end
362
- =end
363
351
  end
364
352
 
365
353
  class BaiduResult < SearchResult
366
- def initialize(page,baseuri,pagenumber=1)
367
- @page = Nokogiri::HTML page
368
- @baseuri = baseuri
369
- @pagenumber = pagenumber
370
- # raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
371
- # @page = page
372
- end
373
-
354
+ # def initialize(page,baseuri,pagenumber=1,pagesize=100)
355
+ # @page = Nokogiri::HTML page
356
+ # @baseuri = baseuri
357
+ # @pagenumber = pagenumber
358
+ # @pagesize = pagesize
359
+ # # raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
360
+ # # @page = page
361
+ # end
374
362
  def ranks
375
363
  return @ranks unless @ranks.nil?
376
364
  @ranks = Hash.new
377
365
  @page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
378
366
  id = table['id']
367
+ if @pagesize == 10
368
+ id = table['id'][-1,1]
369
+ id = '10' if id == '0'
370
+ end
371
+
379
372
  @ranks[id] = Hash.new
380
373
  url = table.search("[@class=\"g\"]").first
381
374
  url = url.text unless url.nil?
@@ -395,23 +388,28 @@ class BaiduResult < SearchResult
395
388
  end
396
389
 
397
390
  def ads_bottom
398
- id = 0
399
- ads = {}
400
- @page.search("//table[@bgcolor='f5f5f5']").each do |table|
401
- next unless table['id'].nil?
402
- id += 1
403
- ads[id]= parse_ad(table)
404
- end
405
- ads
391
+ return {} if @page.search("//table[@bgcolor='f5f5f5']").empty?
392
+ return ads_top
393
+ # p @page.search("//table[@bgcolor='f5f5f5']").empty?
406
394
  end
407
395
  def ads_top
408
- id = 0
409
- ads = {}
410
- @page.search("//table[@bgcolor='f5f5f5']").each do |table|
396
+ #灰色底推广,上下都有
397
+ ads = Hash.new
398
+ @page.search("//table[@bgcolor='#f5f5f5']").each do |table|
399
+ id = table['id']
411
400
  next if id.nil?
412
- id += 1
401
+ id = id[2,3].to_i.to_s
413
402
  ads[id]= parse_ad(table)
414
403
  end
404
+ #白色底推广,只有上部分
405
+ if ads.empty?
406
+ @page.search("//table").each do |table|
407
+ id = table['id']
408
+ next if id.nil? or id.to_i<3000
409
+ id = id[2,3].to_i.to_s
410
+ ads[id]= parse_ad(table)
411
+ end
412
+ end
415
413
  ads
416
414
  end
417
415
  def parse_ad(table)
@@ -462,8 +460,8 @@ class BaiduResult < SearchResult
462
460
  return if url.nil?
463
461
  url = url['href']
464
462
  url = URI.join(@baseuri,url).to_s
465
- body = HTTParty.get(url)
466
- return BaiduResult.new(body,url,@pagenumber+1)
463
+ page = HTTParty.get(url)
464
+ return BaiduResult.new(page,url,@pagenumber+1,@pagesize)
467
465
  # @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
468
466
  end
469
467
  def has_result?
@@ -471,4 +469,4 @@ class BaiduResult < SearchResult
471
469
  return false if submit and submit['href'].include?'sitesubmit'
472
470
  return true
473
471
  end
474
- end
472
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baidu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.4
4
+ version: 1.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - seoaqua