baidu 1.1.4 → 1.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/baidu.rb +76 -21
  3. metadata +52 -12
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6e5e2be5751728aa5aab6cc36fc9552a74aadba1
4
+ data.tar.gz: 28a8bb12fad2e0d3164908a61f13a9fc1d1d03e2
5
+ SHA512:
6
+ metadata.gz: d2b99a90386ebad73fac60db3331d0ea3e8021c657ef7d6d4c0b703023df86f8b3bfc2df434862a07dcc7b1496f026779bcbd592b9159e11ae2790ccdb9d0481
7
+ data.tar.gz: ea3191864319521de14ab735a770e08204a64fc40e23d836be4e196669f24e2b0f1c994f2ab9ed2e5f924ebb650f8dba6f6ae318a472bdc46d64c27b3afb684d
@@ -1,5 +1,4 @@
1
- #coding:UTF-8
2
- require 'mechanize'
1
+ # encoding: utf-8
3
2
  require 'nokogiri'
4
3
  require 'json'
5
4
  require 'addressable/uri'
@@ -10,7 +9,7 @@ class SearchEngine
10
9
  URI(url)
11
10
  result = query(url)
12
11
  return result.has_result?
13
- end
12
+ end
14
13
  end
15
14
  class SearchResult
16
15
  def initialize(body,baseuri,pagenumber=nil)
@@ -141,7 +140,9 @@ class MbaiduResult < SearchResult
141
140
  href,text,host,is_mobile = '','','',false
142
141
  a = result.search("a").first
143
142
  is_mobile = true unless a.search("img").empty?
144
- host = result.search('[@class="site"]').first.text
143
+ host = result.search('[@class="site"]').first
144
+ next if host.nil?
145
+ host = host.text
145
146
  href = a['href']
146
147
  text = a.text
147
148
  id = href.scan(/&order=(\d+)&/)
@@ -237,7 +238,7 @@ class Baidu < SearchEngine
237
238
  def extend(words,level=3,sleeptime=1)
238
239
  level = level.to_i - 1
239
240
  words = [words] unless words.respond_to? 'each'
240
-
241
+
241
242
  extensions = Array.new
242
243
  words.each do |word|
243
244
  self.query(word)
@@ -254,7 +255,7 @@ class Baidu < SearchEngine
254
255
  def popular?(wd)
255
256
  return @a.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").body.include?"boxFlash"
256
257
  end
257
-
258
+
258
259
  def query(wd)
259
260
  q = Array.new
260
261
  q << "wd=#{wd}"
@@ -263,8 +264,9 @@ class Baidu < SearchEngine
263
264
  #uri = URI.encode((BaseUri + queryStr).encode('GBK'))
264
265
  uri = URI.encode((BaseUri + queryStr))
265
266
  begin
266
- @page = @a.get uri
267
- BaiduResult.new(@page)
267
+ # @page = @a.get uri
268
+ @page = HTTParty.get uri
269
+ BaiduResult.new(@page,uri)
268
270
  rescue Net::HTTP::Persistent::Error
269
271
  warn "[timeout] #{uri}"
270
272
  return false
@@ -307,23 +309,28 @@ class Baidu < SearchEngine
307
309
  end
308
310
 
309
311
  class BaiduResult < SearchResult
310
- def initialize(page)
311
- raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
312
- @page = page
312
+ def initialize(page,baseuri,pagenumber=1)
313
+ File.open('/tmp/file','w'){|f|f.puts page}
314
+ @page = Nokogiri::HTML page
315
+ @baseuri = baseuri
316
+ @pagenumber = pagenumber
317
+ # raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
318
+ # @page = page
313
319
  end
314
-
320
+
315
321
  def ranks
316
322
  return @ranks unless @ranks.nil?
317
323
  @ranks = Hash.new
318
- @page.search("//table[@class=\"result\"]").each do |table|
324
+ @page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
319
325
  id = table['id']
320
326
  @ranks[id] = Hash.new
321
- url = @page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first
322
- a = @page.search("//table[@id=\"#{table['id']}\"]//h3/a")
327
+ url = table.search("[@class=\"g\"]").first
328
+ url = url.text unless url.nil?
329
+ a = table.search("a").first
323
330
  @ranks[id]['text'] = a.text
324
- @ranks[id]['href'] = a.first['href'].sub('http://www.baidu.com/link?url=','').strip
331
+ @ranks[id]['href'] = url #a.first['href'].sub('http://www.baidu.com/link?url=','').strip
325
332
  unless url.nil?
326
- url = url.text.strip
333
+ url = url.strip
327
334
  @ranks[id]['host'] = Addressable::URI.parse(URI.encode("http://#{url}")).host
328
335
  else
329
336
  @ranks[id]['host'] = nil
@@ -332,7 +339,49 @@ class BaiduResult < SearchResult
332
339
  #@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
333
340
  @ranks
334
341
  end
335
-
342
+
343
+ def ads_top
344
+ ads = {}
345
+ id=0
346
+ @page.search("//table[@class='EC_mr15']|//table[@class='ec_pp_f']").each do |table|
347
+ table_id = table['id']
348
+ next if table_id.nil?
349
+ id += 1
350
+ href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
351
+ title = table.search("a").first.text.strip
352
+ ads[id.to_s]= {'title'=>title,'href' => href,'host'=>href}
353
+ end
354
+ ads
355
+ end
356
+ def ads_bottom
357
+ ads = {}
358
+ @page.search("//table[@class='EC_mr15']|//table[@class='ec_pp_f']").each do |table|
359
+ id = table['id']
360
+ next unless id.nil?
361
+ id = id[-1,1]
362
+ href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
363
+ title = table.search("a").first.text.strip
364
+ ads[id]= {'title'=>title,'href' => href,'host'=>href}
365
+ end
366
+ ads
367
+ end
368
+ def ads_right
369
+ ads = {}
370
+ @page.search("//div[@id='ec_im_container']").each do |table|
371
+ table.search("div[@id]").each do |div|
372
+ id = div['id'][-1,1].to_i+1
373
+ title = div.search("a").first
374
+ next if title.nil?
375
+ title = title.text
376
+ url = div.search("font[@color='#008000']").first
377
+ next if url.nil?
378
+ url = url.text
379
+ ads[id.to_s] = {'title'=>title,'href'=>url,'host'=>url}
380
+ end
381
+ end
382
+ ads
383
+ end
384
+
336
385
  #return the top rank number from @ranks with the input host
337
386
  # def rank(host)#on base of ranks
338
387
  # ranks.each do |id,line|
@@ -353,13 +402,19 @@ class BaiduResult < SearchResult
353
402
  def related_keywords
354
403
  @related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
355
404
  end
356
-
405
+
357
406
  def next
358
- @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
407
+ url = @page.xpath('//a[text()="下一页>"]').first
408
+ return if url.nil?
409
+ url = url['href']
410
+ url = URI.join(@baseuri,url).to_s
411
+ body = HTTParty.get(url)
412
+ return BaiduResult.new(body,url,@pagenumber+1)
413
+ # @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
359
414
  end
360
415
 
361
416
  def has_result?
362
417
  @page.search('//div[@class="nors"]').empty?
363
418
  end
364
-
419
+
365
420
  end
metadata CHANGED
@@ -1,8 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baidu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.4
5
- prerelease:
4
+ version: 1.1.5
6
5
  platform: ruby
7
6
  authors:
8
7
  - seoaqua
@@ -10,8 +9,50 @@ autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
11
  date: 2012-06-13 00:00:00.000000000 Z
13
- dependencies: []
14
- description: to get keyword ranking,related queries and popularity from baidu.com.
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: addressable
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: httparty
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: to get keyword ranking,related queries and popularity from www.baidu.com,www.so.com,m.baidu.com.
15
56
  this is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
16
57
  email: seoaqua@qq.com
17
58
  executables: []
@@ -21,27 +62,26 @@ files:
21
62
  - lib/baidu.rb
22
63
  homepage: https://github.com/seoaqua/ruby-baidu
23
64
  licenses: []
65
+ metadata: {}
24
66
  post_install_message:
25
67
  rdoc_options: []
26
68
  require_paths:
27
69
  - lib
28
70
  required_ruby_version: !ruby/object:Gem::Requirement
29
- none: false
30
71
  requirements:
31
- - - ! '>='
72
+ - - '>='
32
73
  - !ruby/object:Gem::Version
33
74
  version: '0'
34
75
  required_rubygems_version: !ruby/object:Gem::Requirement
35
- none: false
36
76
  requirements:
37
- - - ! '>='
77
+ - - '>='
38
78
  - !ruby/object:Gem::Version
39
79
  version: '0'
40
80
  requirements: []
41
81
  rubyforge_project:
42
- rubygems_version: 1.8.24
82
+ rubygems_version: 2.0.3
43
83
  signing_key:
44
- specification_version: 3
45
- summary: to get keyword ranking,related queries and popularity from baidu.com. this
46
- is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
84
+ specification_version: 4
85
+ summary: to get keyword ranking,related queries and popularity from www.baidu.com,www.so.com,m.baidu.com.
86
+ this is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
47
87
  test_files: []