baidu 1.2.4 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/baidu.rb +68 -70
  3. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 4ad277bccec5ed902f59dbaab5e18aaa5fa5af8f
4
- data.tar.gz: 08a52511a1952bc3d0ab3398cca376d7f8960bed
3
+ metadata.gz: 8b31dd8b631dbab567991e53f1a6b54a4d7fd9ce
4
+ data.tar.gz: 2bdc0cd46017d2b02176fae221895b1d0e7db9bc
5
5
  SHA512:
6
- metadata.gz: 8b063ef6a9d4d85dea496b1b28fabe9ce184384452991de9872551cd39535e1bb55084156205bbdafadda7ee4f5be80f28517004d2ec5be683d7949647f4dbd6
7
- data.tar.gz: b0950ebbb8c6aa2fd49388d92bc21306cbf323f5a2971380194525ee834f07a2abe862da3f10a87b3d375d1728699a087229eaceb221617f265f10b3c9fbd12e
6
+ metadata.gz: cf1c086b498c805a1dd497fea7d2467e036c1b5d66f8a773e992a819735447d90244661947dd5f9b081a2a272057c69df14759566d25e2d70798c2d27cc27c3e
7
+ data.tar.gz: 596c6098e190ff0e5835bc1f2daff641123805ee2894bc6d9f431d110b587fbb6e17d96822d56147822757c998ef393b702555ff30d1bae48b6c1e7d4de1ad61
data/lib/baidu.rb CHANGED
@@ -5,8 +5,8 @@ require 'addressable/uri'
5
5
  require 'httparty'
6
6
  class SearchEngine
7
7
  #是否收录
8
- def initialize(perpage = 100)
9
- @perpage = perpage
8
+ def initialize(pagesize = 100)
9
+ @pagesize = pagesize#只允许10或100
10
10
  end
11
11
  def indexed?(url)
12
12
  URI(url)
@@ -15,15 +15,12 @@ class SearchEngine
15
15
  end
16
16
  end
17
17
  class SearchResult
18
- def initialize(body,baseuri,pagenumber=nil)
19
- @body = Nokogiri::HTML body
18
+ def initialize(page,baseuri,pagenumber=1,pagesize=100)
19
+ @page = Nokogiri::HTML page
20
20
  @baseuri = baseuri
21
21
  # @host = URI(baseuri).host
22
- if pagenumber.nil?
23
- @pagenumber = 1
24
- else
25
- @pagenumber = pagenumber
26
- end
22
+ @pagenumber = pagenumber
23
+ @pagesize = pagesize
27
24
  end
28
25
  def whole
29
26
  {
@@ -65,10 +62,10 @@ class Qihoo < SearchEngine
65
62
  def query(wd)
66
63
  #用原始路径请求
67
64
  uri = URI.join("http://#{Host}/",URI.encode('s?q='+wd)).to_s
68
- body = HTTParty.get(uri)
65
+ page = HTTParty.get(uri)
69
66
  #如果请求地址被跳转,重新获取当前页的URI,可避免翻页错误
70
- uri = URI.join("http://#{Host}/",body.request.path).to_s
71
- QihooResult.new(body,uri)
67
+ uri = URI.join("http://#{Host}/",page.request.path).to_s
68
+ QihooResult.new(page,uri)
72
69
  end
73
70
  end
74
71
 
@@ -79,7 +76,7 @@ class QihooResult < SearchResult
79
76
  return @ranks unless @ranks.nil?
80
77
  @ranks = Hash.new
81
78
  id = (@pagenumber - 1) * 10
82
- @body.xpath('//li[@class="res-list"]').each do |li|
79
+ @page.search('//li[@class="res-list"]').each do |li|
83
80
  a = li.search("h3/a").first
84
81
  url = li.search("cite")
85
82
  next if a['data-pos'].nil?
@@ -95,7 +92,7 @@ class QihooResult < SearchResult
95
92
  def ads_top
96
93
  id = 0
97
94
  result = []
98
- @body.search("//ul[@id='djbox']/li").each do |li|
95
+ @page.search("//ul[@id='djbox']/li").each do |li|
99
96
  id+=1
100
97
  title = li.search("a").first.text
101
98
  href = li.search("cite").first.text.downcase
@@ -110,7 +107,7 @@ class QihooResult < SearchResult
110
107
  def ads_right
111
108
  id = 0
112
109
  result = []
113
- @body.search("//ul[@id='rightbox']/li").each do |li|
110
+ @page.search("//ul[@id='rightbox']/li").each do |li|
114
111
  id += 1
115
112
  title = li.search("a").first.text
116
113
  href = li.search("cite").first.text.downcase
@@ -124,18 +121,18 @@ class QihooResult < SearchResult
124
121
  end
125
122
  #下一页
126
123
  def next
127
- next_href = @body.xpath('//a[@id="snext"]')
124
+ next_href = @page.xpath('//a[@id="snext"]')
128
125
  return false if next_href.empty?
129
126
  next_href = next_href.first['href']
130
127
  next_href = URI.join(@baseuri,next_href).to_s
131
128
  # next_href = URI.join("http://#{@host}",next_href).to_s
132
- next_body = HTTParty.get(next_href).body
133
- return QihooResult.new(next_body,next_href,@pagenumber+1)
129
+ next_page = HTTParty.get(next_href).page
130
+ return QihooResult.new(next_page,next_href,@pagenumber+1)
134
131
  #@page = MbaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
135
132
  end
136
133
  #有结果
137
134
  def has_result?
138
- !@body.search('//div[@id="main"]/h3').text().include?'没有找到该URL'
135
+ !@page.search('//div[@id="main"]/h3').text().include?'没有找到该URL'
139
136
  end
140
137
  end
141
138
 
@@ -160,22 +157,22 @@ class Mbaidu < SearchEngine
160
157
  end
161
158
  end
162
159
  class MbaiduResult < SearchResult
163
- def initialize(body,baseuri,pagenumber=nil)
164
- @body = Nokogiri::HTML body
165
- @baseuri = baseuri
166
- if pagenumber.nil?
167
- @pagenumber = 1
168
- else
169
- @pagenumber = pagenumber
170
- end
171
- end
160
+ # def initialize(page,baseuri,pagenumber=nil)
161
+ # @page= Nokogiri::HTML page
162
+ # @baseuri = baseuri
163
+ # if pagenumber.nil?
164
+ # @pagenumber = 1
165
+ # else
166
+ # @pagenumber = pagenumber
167
+ # end
168
+ # end
172
169
 
173
170
  #返回当前页所有查询结果
174
171
  def ranks
175
172
  #如果已经赋值说明解析过,不需要重新解析,直接返回结果
176
173
  return @ranks unless @ranks.nil?
177
174
  @ranks = Hash.new
178
- @body.xpath('//div[@class="result"]').each do |result|
175
+ @page.xpath('//div[@class="result"]').each do |result|
179
176
  href,text,host,is_mobile = '','','',false
180
177
  a = result.search("a").first
181
178
  is_mobile = true unless a.search("img").empty?
@@ -218,7 +215,7 @@ class MbaiduResult < SearchResult
218
215
  def ads_top
219
216
  id = 0
220
217
  result = []
221
- @body.search("div[@class='ec_wise_ad']/div").each do |div|
218
+ @page.search("div[@class='ec_wise_ad']/div").each do |div|
222
219
  id += 1
223
220
  href = div.search("span[@class='ec_site']").first.text
224
221
  href = "http://#{href}"
@@ -265,19 +262,19 @@ class MbaiduResult < SearchResult
265
262
  =end
266
263
  #下一页
267
264
  def next
268
- nextbutton = @body.xpath('//a[text()="下一页"]').first
265
+ nextbutton = @page.xpath('//a[text()="下一页"]').first
269
266
  return nil if nextbutton.nil?
270
267
  url = nextbutton['href']
271
268
  url = URI.join(@baseuri,url).to_s
272
- body = HTTParty.get(url)
273
- return MbaiduResult.new(body,url,@pagenumber+1)
269
+ page = HTTParty.get(url)
270
+ return MbaiduResult.new(page,url,@pagenumber+1)
274
271
  end
275
272
 
276
273
  end
277
274
  class Baidu < SearchEngine
278
275
  BaseUri = 'http://www.baidu.com/s?'
279
276
  def suggestions(wd)
280
- json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").body.force_encoding('GBK').encode("UTF-8")
277
+ json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").page.force_encoding('GBK').encode("UTF-8")
281
278
  m = /\[([^\]]*)\]/.match json
282
279
  return JSON.parse m[0]
283
280
  end
@@ -307,7 +304,7 @@ class Baidu < SearchEngine
307
304
  =end
308
305
 
309
306
  def popular?(wd)
310
- return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").body.include?"boxFlash"
307
+ return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").page.include?"boxFlash"
311
308
  end
312
309
 
313
310
  def query(wd)
@@ -320,9 +317,9 @@ class Baidu < SearchEngine
320
317
  begin
321
318
  # @page = @a.get uri
322
319
  @page = HTTParty.get uri
323
- BaiduResult.new(@page,uri)
324
- rescue Net::HTTP::Persistent::Error
325
- warn "[timeout] #{uri}"
320
+ BaiduResult.new(@page,uri,1,@pagesize)
321
+ rescue Exception => e
322
+ warn e.to_s
326
323
  return false
327
324
  end
328
325
  =begin
@@ -351,31 +348,27 @@ class Baidu < SearchEngine
351
348
  def how_many_pages_with(host,string)
352
349
  query("site:#{host} inurl:#{string}").how_many
353
350
  end
354
-
355
- =begin
356
- private
357
- def clean
358
- @page.body.force_encoding('GBK')
359
- @page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
360
- @page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
361
- end
362
- =end
363
351
  end
364
352
 
365
353
  class BaiduResult < SearchResult
366
- def initialize(page,baseuri,pagenumber=1)
367
- @page = Nokogiri::HTML page
368
- @baseuri = baseuri
369
- @pagenumber = pagenumber
370
- # raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
371
- # @page = page
372
- end
373
-
354
+ # def initialize(page,baseuri,pagenumber=1,pagesize=100)
355
+ # @page = Nokogiri::HTML page
356
+ # @baseuri = baseuri
357
+ # @pagenumber = pagenumber
358
+ # @pagesize = pagesize
359
+ # # raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
360
+ # # @page = page
361
+ # end
374
362
  def ranks
375
363
  return @ranks unless @ranks.nil?
376
364
  @ranks = Hash.new
377
365
  @page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
378
366
  id = table['id']
367
+ if @pagesize == 10
368
+ id = table['id'][-1,1]
369
+ id = '10' if id == '0'
370
+ end
371
+
379
372
  @ranks[id] = Hash.new
380
373
  url = table.search("[@class=\"g\"]").first
381
374
  url = url.text unless url.nil?
@@ -395,23 +388,28 @@ class BaiduResult < SearchResult
395
388
  end
396
389
 
397
390
  def ads_bottom
398
- id = 0
399
- ads = {}
400
- @page.search("//table[@bgcolor='f5f5f5']").each do |table|
401
- next unless table['id'].nil?
402
- id += 1
403
- ads[id]= parse_ad(table)
404
- end
405
- ads
391
+ return {} if @page.search("//table[@bgcolor='f5f5f5']").empty?
392
+ return ads_top
393
+ # p @page.search("//table[@bgcolor='f5f5f5']").empty?
406
394
  end
407
395
  def ads_top
408
- id = 0
409
- ads = {}
410
- @page.search("//table[@bgcolor='f5f5f5']").each do |table|
396
+ #灰色底推广,上下都有
397
+ ads = Hash.new
398
+ @page.search("//table[@bgcolor='#f5f5f5']").each do |table|
399
+ id = table['id']
411
400
  next if id.nil?
412
- id += 1
401
+ id = id[2,3].to_i.to_s
413
402
  ads[id]= parse_ad(table)
414
403
  end
404
+ #白色底推广,只有上部分
405
+ if ads.empty?
406
+ @page.search("//table").each do |table|
407
+ id = table['id']
408
+ next if id.nil? or id.to_i<3000
409
+ id = id[2,3].to_i.to_s
410
+ ads[id]= parse_ad(table)
411
+ end
412
+ end
415
413
  ads
416
414
  end
417
415
  def parse_ad(table)
@@ -462,8 +460,8 @@ class BaiduResult < SearchResult
462
460
  return if url.nil?
463
461
  url = url['href']
464
462
  url = URI.join(@baseuri,url).to_s
465
- body = HTTParty.get(url)
466
- return BaiduResult.new(body,url,@pagenumber+1)
463
+ page = HTTParty.get(url)
464
+ return BaiduResult.new(page,url,@pagenumber+1,@pagesize)
467
465
  # @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
468
466
  end
469
467
  def has_result?
@@ -471,4 +469,4 @@ class BaiduResult < SearchResult
471
469
  return false if submit and submit['href'].include?'sitesubmit'
472
470
  return true
473
471
  end
474
- end
472
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baidu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.4
4
+ version: 1.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - seoaqua