baidu 0.2.3 → 0.2.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/baidu.rb +123 -57
  2. metadata +2 -2
@@ -1,61 +1,127 @@
1
1
  #coding:UTF-8
2
2
  require 'mechanize'
3
3
  class Baidu
4
- def initialize(offset100=false)
5
- @a = Mechanize.new {|agent| agent.user_agent_alias = 'Linux Mozilla'}
6
- @a.max_history = 1
7
- @perpage = 10
8
- @perpage = 100 if offset100==true
9
- @baseuri = "http://www.baidu.com/s?rn=#{@perpage}&wd="
10
- end
11
- def query(query)
12
- query = "#{query}"
13
- @uri = @baseuri+URI.encode(query.encode('GBK'))
14
- @page = @a.get @uri
15
- #File.open('/tmp/testpage','w'){|f|f.puts @page.body} if query=='1458 Italia'
16
- self.clean
17
- @number = self.how_many
18
- @maxpage = (@number / @perpage.to_f).round
19
- @maxpage =10 if @maxpage>10
20
- @currpage =0
21
- end
22
- def how_many_pages(uri)
23
- self.query("site:#{uri}")
24
- return self.how_many
25
- end
26
- def how_many_links(uri)
27
- self.query("domain:\"#{uri}\"")
28
- return self.how_many
29
- end
30
- def how_many_pages_with(url,string)
31
- self.query("site:#{url} inurl:#{string}")
32
- return self.how_many
33
- end
34
- def rank(host)
35
- @page.search("//table[@class=\"result\"]").each do |table|
36
- href = @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
37
- begin
38
- return table['id'] if host==URI.parse(URI.encode(href)).host
39
- rescue URI::InvalidURIError
40
- puts "invalid uri:#{href}"
41
- end
42
- end
43
- return false
44
- end
45
- def how_many
46
- return false if @page.search("//span[@class='nums']").first.nil?
47
- return @page.search("//span[@class='nums']").first.content.gsub(/\D/,'').to_i
48
- end
49
- def next
50
- nextbtn = @page.link_with(:text=>/下一页/)
51
- return false if (nextbtn.nil? or @currpage >= @maxpage)
52
- @page = @a.click(nextbtn)
53
- self.clean
54
- return true
55
- end
56
- def clean
57
- @page.body.force_encoding('GBK')
58
- @page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
59
- @page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
60
- end
4
+ attr_accessor :perpage,:pagenumber,:debug
5
+ attr_reader :page,:wd,:data
6
+ BaseUri = 'http://www.baidu.com/s?'
7
# Creates the Mechanize agent and puts the instance into its initial,
# "nothing fetched yet" state.
#
# The agent identifies itself with the 'Linux Mozilla' user-agent alias and
# keeps a history of a single page. Results are requested 100 per page
# unless +perpage+ is changed through its accessor before querying.
def initialize
  @a = Mechanize.new do |agent|
    agent.user_agent_alias = 'Linux Mozilla'
  end
  @a.idle_timeout = 2
  @a.max_history = 1

  @perpage = 100
  @page = nil      # no result page has been fetched yet
  @debug = false
  @data = {}       # per-query cache, emptied by #query
end
17
+
18
+ public
19
# Runs a Baidu search for +wd+ and makes its first result page the current
# page.
#
# Side effects: empties the @data cache, stores the search term in @wd and
# the fetched page in @page, and recomputes @number (value of #how_many),
# @maxpage and @currpage.
#
# If the request fails with a SocketError the error is printed, @page is
# reset to nil and the counters are zeroed, so results of an earlier query
# can no longer be mistaken for the results of this one.
#
# @param wd [String] the search expression
# @return [Integer] 0, the value @currpage is reset to
def query(wd)
  @data.clear
  @wd = wd
  @currpage = 0

  begin
    @page = @a.get(build_query_uri(wd))
  rescue SocketError => e
    puts e
    @page = nil
    @number = false
    @maxpage = 0
    return @currpage
  end

  clean
  @number = how_many
  # how_many returns false when the page carries no result counter.
  # ceil (not round) so that a partially filled last page is counted.
  @maxpage = @number ? (@number / @perpage.to_f).ceil : 0
  @currpage
end

# Builds the search URL: every parameter value is converted to GBK and then
# percent-encoded on its own, so characters such as '&', '=' or '#' inside
# the search term cannot be mistaken for URL syntax.
def build_query_uri(wd)
  params = [['wd', wd], ['rn', @perpage]].map do |key, value|
    "#{key}=#{URI.encode_www_form_component(value.to_s.encode('GBK'))}"
  end
  BaseUri + params.join('&')
end
private :build_query_uri
48
+
49
+ #site:xxx.yyy.com
50
# Searches for "site:<host>" and returns the result count Baidu reports.
#
# The search always runs: a count left over from an earlier query would
# belong to a different search expression, so it is never reused here.
# Like every query, this replaces the current page.
#
# @param host [String] host name, e.g. "xxx.yyy.com"
# @return [Integer, false] whatever #how_many returns for the new page
def how_many_pages(host)
  query("site:#{host}")
  how_many
end
55
+
56
+ #domain:xxx.yyy.com/path/file.html
57
# Searches for domain:"<uri>" and returns the result count Baidu reports.
#
# The search always runs: a count left over from an earlier query would
# belong to a different search expression, so it is never reused here.
# Like every query, this replaces the current page.
#
# @param uri [String] e.g. "xxx.yyy.com/path/file.html"
# @return [Integer, false] whatever #how_many returns for the new page
def how_many_links(uri)
  query("domain:\"#{uri}\"")
  how_many
end
62
+
63
+ #site:xxx.yyy.com inurl:zzz
64
# Searches for "site:<host> inurl:<string>" and returns the result count
# Baidu reports.
#
# The search always runs: a count left over from an earlier query would
# belong to a different search expression, so it is never reused here.
# Like every query, this replaces the current page.
#
# @param host [String] host name, e.g. "xxx.yyy.com"
# @param string [String] text that must occur in the result URL
# @return [Integer, false] whatever #how_many returns for the new page
def how_many_pages_with(host, string)
  query("site:#{host} inurl:#{string}")
  how_many
end
69
+ ########################################################################################################################
70
+ #look up a word and get the rank of a uri with $host
71
# Position of +host+ among the links of the current result page.
#
# Walks the list returned by #ranks and compares the host part of every
# link with +host+. The first match is remembered in @data['rank'], so
# asking again for the same host does not re-parse the links.
#
# Links that cannot be parsed as a URI are skipped (and reported on stdout
# when @debug is set).
#
# @param host [String] host name to look for, e.g. "www.example.com"
# @return [Integer, false] 1-based position of the first matching link, or
#   false when no link on the page points at +host+
def rank(host)
  cached = @data['rank']
  return cached[host] if cached && cached.key?(host)

  ranks.each_with_index do |uri, index|
    begin
      # Escape first so that links containing non-ASCII characters parse.
      next unless URI.parse(URI::DEFAULT_PARSER.escape(uri)).host == host
    rescue URI::InvalidURIError
      puts "invalid uri:#{uri}" if @debug
      next
    end

    position = index + 1
    (@data['rank'] ||= {})[host] = position
    return position
  end

  false
end
91
+
92
# Links of the current result page, in page order.
#
# For every <table class="result"> on the current page the href of the
# first anchor inside that table is collected. Tables without an anchor (or
# whose first anchor has no href) are skipped instead of raising. The list
# is memoized in @data['ranks'].
#
# @return [Array<String>] the collected href values
# @raise [StandardError] when no Mechanize::Page is loaded in @page
def ranks
  return @data['ranks'] if @data.key?('ranks')
  raise StandardError, 'wrong with @page' unless @page.instance_of?(Mechanize::Page)

  links = []
  @page.search('//table[@class="result"]').each do |table|
    # Search relative to the table itself rather than re-querying the whole
    # document by the table's id.
    anchor = table.at_xpath('.//a')
    href = anchor && anchor['href']
    links << href if href
  end
  @data['ranks'] = links
end
103
+
104
# Result count shown on the current page.
#
# A value stored under @data['how_many'] takes precedence. Otherwise the
# first <span class="nums"> of the current page is read and every non-digit
# character is stripped from its text before conversion.
#
# @return [Integer, false] the count, or false when the page has no such span
# @raise [StandardError] when no Mechanize::Page is loaded in @page
def how_many
  return @data['how_many'] if @data.has_key?('how_many')

  unless @page.instance_of?(Mechanize::Page)
    raise StandardError, 'wrong with @page'
  end

  counter = @page.search("//span[@class='nums']").first
  counter.nil? ? false : counter.content.gsub(/\D/, '').to_i
end
113
+
114
+ def next
115
+ nextbtn = @page.link_with(:text=>/下一页/)
116
+ return false if (nextbtn.nil? or @currpage >= @maxpage)
117
+ @page = @a.click(nextbtn)
118
+ self.clean
119
+ return true
120
+ end
121
+ private
122
# Normalizes the body of the current page in place, before it is parsed.
#
# The raw bytes are tagged as GBK and transcoded to UTF-8; byte sequences
# that are invalid or have no UTF-8 counterpart are dropped. Afterwards all
# characters in the range U+0080..U+2C77 are removed, which is the cleanup
# the original call intended (it was written with "\U" escapes and without
# a replacement argument, so it never changed the body). Does nothing when
# no page is loaded.
def clean
  return if @page.nil?

  body = @page.body
  body.force_encoding('GBK')
  body.encode!('UTF-8', :invalid => :replace, :undef => :replace, :replace => '')
  # mechanize gets confused unless these characters are removed
  body.gsub!(/[\u0080-\u2C77]+/, '')
  nil
end
61
127
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baidu
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.2.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -39,7 +39,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
39
39
  version: '0'
40
40
  requirements: []
41
41
  rubyforge_project:
42
- rubygems_version: 1.8.15
42
+ rubygems_version: 1.8.17
43
43
  signing_key:
44
44
  specification_version: 3
45
45
  summary: to get data from www.baidu.com. this is built by a newbie, so please be careful