baidu 1.1.4 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +7 -0
  2. data/lib/baidu.rb +76 -21
  3. metadata +52 -12
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 6e5e2be5751728aa5aab6cc36fc9552a74aadba1
4
+ data.tar.gz: 28a8bb12fad2e0d3164908a61f13a9fc1d1d03e2
5
+ SHA512:
6
+ metadata.gz: d2b99a90386ebad73fac60db3331d0ea3e8021c657ef7d6d4c0b703023df86f8b3bfc2df434862a07dcc7b1496f026779bcbd592b9159e11ae2790ccdb9d0481
7
+ data.tar.gz: ea3191864319521de14ab735a770e08204a64fc40e23d836be4e196669f24e2b0f1c994f2ab9ed2e5f924ebb650f8dba6f6ae318a472bdc46d64c27b3afb684d
@@ -1,5 +1,4 @@
1
- #coding:UTF-8
2
- require 'mechanize'
1
+ # encoding: utf-8
3
2
  require 'nokogiri'
4
3
  require 'json'
5
4
  require 'addressable/uri'
@@ -10,7 +9,7 @@ class SearchEngine
10
9
  URI(url)
11
10
  result = query(url)
12
11
  return result.has_result?
13
- end
12
+ end
14
13
  end
15
14
  class SearchResult
16
15
  def initialize(body,baseuri,pagenumber=nil)
@@ -141,7 +140,9 @@ class MbaiduResult < SearchResult
141
140
  href,text,host,is_mobile = '','','',false
142
141
  a = result.search("a").first
143
142
  is_mobile = true unless a.search("img").empty?
144
- host = result.search('[@class="site"]').first.text
143
+ host = result.search('[@class="site"]').first
144
+ next if host.nil?
145
+ host = host.text
145
146
  href = a['href']
146
147
  text = a.text
147
148
  id = href.scan(/&order=(\d+)&/)
@@ -237,7 +238,7 @@ class Baidu < SearchEngine
237
238
  def extend(words,level=3,sleeptime=1)
238
239
  level = level.to_i - 1
239
240
  words = [words] unless words.respond_to? 'each'
240
-
241
+
241
242
  extensions = Array.new
242
243
  words.each do |word|
243
244
  self.query(word)
@@ -254,7 +255,7 @@ class Baidu < SearchEngine
254
255
  def popular?(wd)
255
256
  return @a.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").body.include?"boxFlash"
256
257
  end
257
-
258
+
258
259
  def query(wd)
259
260
  q = Array.new
260
261
  q << "wd=#{wd}"
@@ -263,8 +264,9 @@ class Baidu < SearchEngine
263
264
  #uri = URI.encode((BaseUri + queryStr).encode('GBK'))
264
265
  uri = URI.encode((BaseUri + queryStr))
265
266
  begin
266
- @page = @a.get uri
267
- BaiduResult.new(@page)
267
+ # @page = @a.get uri
268
+ @page = HTTParty.get uri
269
+ BaiduResult.new(@page,uri)
268
270
  rescue Net::HTTP::Persistent::Error
269
271
  warn "[timeout] #{uri}"
270
272
  return false
@@ -307,23 +309,28 @@ class Baidu < SearchEngine
307
309
  end
308
310
 
309
311
  class BaiduResult < SearchResult
310
- def initialize(page)
311
- raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
312
- @page = page
312
+ def initialize(page,baseuri,pagenumber=1)
313
+ File.open('/tmp/file','w'){|f|f.puts page}
314
+ @page = Nokogiri::HTML page
315
+ @baseuri = baseuri
316
+ @pagenumber = pagenumber
317
+ # raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
318
+ # @page = page
313
319
  end
314
-
320
+
315
321
  def ranks
316
322
  return @ranks unless @ranks.nil?
317
323
  @ranks = Hash.new
318
- @page.search("//table[@class=\"result\"]").each do |table|
324
+ @page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
319
325
  id = table['id']
320
326
  @ranks[id] = Hash.new
321
- url = @page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first
322
- a = @page.search("//table[@id=\"#{table['id']}\"]//h3/a")
327
+ url = table.search("[@class=\"g\"]").first
328
+ url = url.text unless url.nil?
329
+ a = table.search("a").first
323
330
  @ranks[id]['text'] = a.text
324
- @ranks[id]['href'] = a.first['href'].sub('http://www.baidu.com/link?url=','').strip
331
+ @ranks[id]['href'] = url #a.first['href'].sub('http://www.baidu.com/link?url=','').strip
325
332
  unless url.nil?
326
- url = url.text.strip
333
+ url = url.strip
327
334
  @ranks[id]['host'] = Addressable::URI.parse(URI.encode("http://#{url}")).host
328
335
  else
329
336
  @ranks[id]['host'] = nil
@@ -332,7 +339,49 @@ class BaiduResult < SearchResult
332
339
  #@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
333
340
  @ranks
334
341
  end
335
-
342
+
343
+ def ads_top
344
+ ads = {}
345
+ id=0
346
+ @page.search("//table[@class='EC_mr15']|//table[@class='ec_pp_f']").each do |table|
347
+ table_id = table['id']
348
+ next if table_id.nil?
349
+ id += 1
350
+ href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
351
+ title = table.search("a").first.text.strip
352
+ ads[id.to_s]= {'title'=>title,'href' => href,'host'=>href}
353
+ end
354
+ ads
355
+ end
356
+ def ads_bottom
357
+ ads = {}
358
+ @page.search("//table[@class='EC_mr15']|//table[@class='ec_pp_f']").each do |table|
359
+ id = table['id']
360
+ next unless id.nil?
361
+ id = id[-1,1]
362
+ href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
363
+ title = table.search("a").first.text.strip
364
+ ads[id]= {'title'=>title,'href' => href,'host'=>href}
365
+ end
366
+ ads
367
+ end
368
+ def ads_right
369
+ ads = {}
370
+ @page.search("//div[@id='ec_im_container']").each do |table|
371
+ table.search("div[@id]").each do |div|
372
+ id = div['id'][-1,1].to_i+1
373
+ title = div.search("a").first
374
+ next if title.nil?
375
+ title = title.text
376
+ url = div.search("font[@color='#008000']").first
377
+ next if url.nil?
378
+ url = url.text
379
+ ads[id.to_s] = {'title'=>title,'href'=>url,'host'=>url}
380
+ end
381
+ end
382
+ ads
383
+ end
384
+
336
385
  #return the top rank number from @ranks with the input host
337
386
  # def rank(host)#on base of ranks
338
387
  # ranks.each do |id,line|
@@ -353,13 +402,19 @@ class BaiduResult < SearchResult
353
402
  def related_keywords
354
403
  @related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
355
404
  end
356
-
405
+
357
406
  def next
358
- @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
407
+ url = @page.xpath('//a[text()="下一页>"]').first
408
+ return if url.nil?
409
+ url = url['href']
410
+ url = URI.join(@baseuri,url).to_s
411
+ body = HTTParty.get(url)
412
+ return BaiduResult.new(body,url,@pagenumber+1)
413
+ # @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
359
414
  end
360
415
 
361
416
  def has_result?
362
417
  @page.search('//div[@class="nors"]').empty?
363
418
  end
364
-
419
+
365
420
  end
metadata CHANGED
@@ -1,8 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baidu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.4
5
- prerelease:
4
+ version: 1.1.5
6
5
  platform: ruby
7
6
  authors:
8
7
  - seoaqua
@@ -10,8 +9,50 @@ autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
11
  date: 2012-06-13 00:00:00.000000000 Z
13
- dependencies: []
14
- description: to get keyword ranking,related queries and popularity from baidu.com.
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: addressable
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: httparty
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ description: to get keyword ranking,related queries and popularity from www.baidu.com,www.so.com,m.baidu.com.
15
56
  this is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
16
57
  email: seoaqua@qq.com
17
58
  executables: []
@@ -21,27 +62,26 @@ files:
21
62
  - lib/baidu.rb
22
63
  homepage: https://github.com/seoaqua/ruby-baidu
23
64
  licenses: []
65
+ metadata: {}
24
66
  post_install_message:
25
67
  rdoc_options: []
26
68
  require_paths:
27
69
  - lib
28
70
  required_ruby_version: !ruby/object:Gem::Requirement
29
- none: false
30
71
  requirements:
31
- - - ! '>='
72
+ - - '>='
32
73
  - !ruby/object:Gem::Version
33
74
  version: '0'
34
75
  required_rubygems_version: !ruby/object:Gem::Requirement
35
- none: false
36
76
  requirements:
37
- - - ! '>='
77
+ - - '>='
38
78
  - !ruby/object:Gem::Version
39
79
  version: '0'
40
80
  requirements: []
41
81
  rubyforge_project:
42
- rubygems_version: 1.8.24
82
+ rubygems_version: 2.0.3
43
83
  signing_key:
44
- specification_version: 3
45
- summary: to get keyword ranking,related queries and popularity from baidu.com. this
46
- is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
84
+ specification_version: 4
85
+ summary: to get keyword ranking,related queries and popularity from www.baidu.com,www.so.com,m.baidu.com.
86
+ this is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
47
87
  test_files: []