baidu 0.2.3 → 0.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2):
  1. data/lib/baidu.rb +123 -57
  2. metadata +2 -2
@@ -1,61 +1,127 @@
1
1
  #coding:UTF-8
2
2
  require 'mechanize'
3
3
  class Baidu
4
- def initialize(offset100=false)
5
- @a = Mechanize.new {|agent| agent.user_agent_alias = 'Linux Mozilla'}
6
- @a.max_history = 1
7
- @perpage = 10
8
- @perpage = 100 if offset100==true
9
- @baseuri = "http://www.baidu.com/s?rn=#{@perpage}&wd="
10
- end
11
- def query(query)
12
- query = "#{query}"
13
- @uri = @baseuri+URI.encode(query.encode('GBK'))
14
- @page = @a.get @uri
15
- #File.open('/tmp/testpage','w'){|f|f.puts @page.body} if query=='1458 Italia'
16
- self.clean
17
- @number = self.how_many
18
- @maxpage = (@number / @perpage.to_f).round
19
- @maxpage =10 if @maxpage>10
20
- @currpage =0
21
- end
22
- def how_many_pages(uri)
23
- self.query("site:#{uri}")
24
- return self.how_many
25
- end
26
- def how_many_links(uri)
27
- self.query("domain:\"#{uri}\"")
28
- return self.how_many
29
- end
30
- def how_many_pages_with(url,string)
31
- self.query("site:#{url} inurl:#{string}")
32
- return self.how_many
33
- end
34
- def rank(host)
35
- @page.search("//table[@class=\"result\"]").each do |table|
36
- href = @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
37
- begin
38
- return table['id'] if host==URI.parse(URI.encode(href)).host
39
- rescue URI::InvalidURIError
40
- puts "invalid uri:#{href}"
41
- end
42
- end
43
- return false
44
- end
45
- def how_many
46
- return false if @page.search("//span[@class='nums']").first.nil?
47
- return @page.search("//span[@class='nums']").first.content.gsub(/\D/,'').to_i
48
- end
49
- def next
50
- nextbtn = @page.link_with(:text=>/下一页/)
51
- return false if (nextbtn.nil? or @currpage >= @maxpage)
52
- @page = @a.click(nextbtn)
53
- self.clean
54
- return true
55
- end
56
- def clean
57
- @page.body.force_encoding('GBK')
58
- @page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
59
- @page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
60
- end
4
+ attr_accessor :perpage,:pagenumber,:debug
5
+ attr_reader :page,:wd,:data
6
+ BaseUri = 'http://www.baidu.com/s?'
7
+ def initialize
8
+ @a = Mechanize.new {|agent| agent.user_agent_alias = 'Linux Mozilla'}
9
+ @a.idle_timeout = 2
10
+ @a.max_history = 1
11
+ @perpage = 100
12
+ @page = nil
13
+ @debug = false
14
+ @data = Hash.new
15
+ #@baseuri = "http://www.baidu.com/s?rn=#{@perpage}&wd="
16
+ end
17
+
18
+ public
19
+ def query(wd)
20
+ @data.clear
21
+ @wd = wd
22
+ @data.clear
23
+ q = Array.new
24
+ q << "wd=#{wd}"
25
+ q << "rn=#{@perpage}"
26
+ queryStr = q.join("&")
27
+ uri = URI.encode((BaseUri + queryStr).encode('GBK'))
28
+ begin
29
+ @page = @a.get uri
30
+ rescue SocketError => e
31
+ puts e
32
+ end
33
+ clean
34
+ @number = self.how_many
35
+ @maxpage = (@number / @perpage.to_f).round
36
+ @currpage =0
37
+ =begin
38
+ query = "#{query}"
39
+ @uri = @baseuri+URI.encode(query.encode('GBK'))
40
+ @page = @a.get @uri
41
+ self.clean
42
+ @number = self.how_many
43
+ @maxpage = (@number / @perpage.to_f).round
44
+ @maxpage =10 if @maxpage>10
45
+ @currpage =0
46
+ =end
47
+ end
48
+
49
+ #site:xxx.yyy.com
50
+ def how_many_pages(host)
51
+ return @data['how_many']if @data.has_key?'how_many'
52
+ query("site:#{host}")
53
+ return how_many
54
+ end
55
+
56
+ #domain:xxx.yyy.com/path/file.html
57
+ def how_many_links(uri)
58
+ return @data['how_many']if @data.has_key?'how_many'
59
+ query("domain:\"#{uri}\"")
60
+ return how_many
61
+ end
62
+
63
+ #site:xxx.yyy.com inurl:zzz
64
+ def how_many_pages_with(host,string)
65
+ return @data['how_many']if @data.has_key?'how_many'
66
+ query("site:#{host} inurl:#{string}")
67
+ return how_many
68
+ end
69
+ ########################################################################################################################
70
+ #look up a word and get the rank of a uri with $host
71
+ def rank(host)#on base of ranks
72
+ return @data['rank'][host] if @data.has_key?'rank' and @data['rank'].has_key?host
73
+ ranks.each_with_index do |uri,index|
74
+ if URI.parse(URI.encode(uri).host)
75
+ @data << {'rank'=>{host=>index+1}}
76
+ return index+1
77
+ end
78
+ end
79
+ =begin
80
+ @page.search("//table[@class=\"result\"]").each do |table|
81
+ href = @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
82
+ begin
83
+ return table['id'] if host==URI.parse(URI.encode(href)).host
84
+ rescue URI::InvalidURIError
85
+ puts "invalid uri:#{href}" if @debug
86
+ end
87
+ end
88
+ return false
89
+ =end
90
+ end
91
+
92
+ def ranks#(keyword=false)
93
+ return @data['ranks'] if @data.has_key?'ranks'
94
+ raise StandardError,'wrong with @page' unless @page.instance_of? Mechanize::Page
95
+ #self.query(keyword) if keyword
96
+ ranks = Array.new
97
+ @page.search("//table[@class=\"result\"]").each do |table|
98
+ ranks << @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
99
+ end
100
+ @data['ranks'] = ranks
101
+ return ranks
102
+ end
103
+
104
+ def how_many
105
+ return @data['how_many'] if @data.has_key?'how_many'
106
+ raise StandardError,'wrong with @page' unless @page.instance_of? Mechanize::Page
107
+ numSpan = @page.search("//span[@class='nums']").first
108
+ return false if numSpan.nil?
109
+ return numSpan.content.gsub(/\D/,'').to_i
110
+ #return false if @page.search("//span[@class='nums']").first.nil?
111
+ #return @page.search("//span[@class='nums']").first.content.gsub(/\D/,'').to_i
112
+ end
113
+
114
+ def next
115
+ nextbtn = @page.link_with(:text=>/下一页/)
116
+ return false if (nextbtn.nil? or @currpage >= @maxpage)
117
+ @page = @a.click(nextbtn)
118
+ self.clean
119
+ return true
120
+ end
121
+ private
122
+ def clean
123
+ @page.body.force_encoding('GBK')
124
+ @page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
125
+ @page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
126
+ end
61
127
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baidu
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -39,7 +39,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
39
39
  version: '0'
40
40
  requirements: []
41
41
  rubyforge_project:
42
- rubygems_version: 1.8.15
42
+ rubygems_version: 1.8.17
43
43
  signing_key:
44
44
  specification_version: 3
45
45
  summary: to get data from www.baidu.com. this is built by a newbie, so please be careful