baidu 0.2.2 → 0.2.3

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (2):
  1. data/lib/baidu.rb +11 -6
  2. metadata +2 -2
@@ -3,30 +3,33 @@ require 'mechanize'
3
3
  class Baidu
4
4
  def initialize(offset100=false)
5
5
  @a = Mechanize.new {|agent| agent.user_agent_alias = 'Linux Mozilla'}
6
+ @a.max_history = 1
6
7
  @perpage = 10
7
8
  @perpage = 100 if offset100==true
8
9
  @baseuri = "http://www.baidu.com/s?rn=#{@perpage}&wd="
9
10
  end
10
11
  def query(query)
12
+ query = "#{query}"
11
13
  @uri = @baseuri+URI.encode(query.encode('GBK'))
12
14
  @page = @a.get @uri
15
+ #File.open('/tmp/testpage','w'){|f|f.puts @page.body} if query=='1458 Italia'
13
16
  self.clean
14
- @number = self.number
17
+ @number = self.how_many
15
18
  @maxpage = (@number / @perpage.to_f).round
16
19
  @maxpage =10 if @maxpage>10
17
20
  @currpage =0
18
21
  end
19
22
  def how_many_pages(uri)
20
23
  self.query("site:#{uri}")
21
- return self.number
24
+ return self.how_many
22
25
  end
23
26
  def how_many_links(uri)
24
27
  self.query("domain:\"#{uri}\"")
25
- return self.number
28
+ return self.how_many
26
29
  end
27
30
  def how_many_pages_with(url,string)
28
31
  self.query("site:#{url} inurl:#{string}")
29
- return self.number
32
+ return self.how_many
30
33
  end
31
34
  def rank(host)
32
35
  @page.search("//table[@class=\"result\"]").each do |table|
@@ -39,7 +42,8 @@ class Baidu
39
42
  end
40
43
  return false
41
44
  end
42
- def number
45
+ def how_many
46
+ return false if @page.search("//span[@class='nums']").first.nil?
43
47
  return @page.search("//span[@class='nums']").first.content.gsub(/\D/,'').to_i
44
48
  end
45
49
  def next
@@ -50,7 +54,8 @@ class Baidu
50
54
  return true
51
55
  end
52
56
  def clean
53
- @page.body.encode!('UTF-8','GBK')
57
+ @page.body.force_encoding('GBK')
58
+ @page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
54
59
  @page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
55
60
  end
56
61
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baidu
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -39,7 +39,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
39
39
  version: '0'
40
40
  requirements: []
41
41
  rubyforge_project:
42
- rubygems_version: 1.8.11
42
+ rubygems_version: 1.8.15
43
43
  signing_key:
44
44
  specification_version: 3
45
45
  summary: to get data from www.baidu.com. this is built by a newbie, so please be careful