baidu 0.2.6 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/baidu.rb +47 -87
  2. metadata +5 -5
data/lib/baidu.rb CHANGED
@@ -4,27 +4,23 @@ require 'mechanize'
4
4
  require 'json'
5
5
  require 'uri'
6
6
  class Baidu
7
- attr_accessor :perpage,:pagenumber,:debug
8
- attr_reader :page,:wd,:data
9
7
  BaseUri = 'http://www.baidu.com/s?'
8
+ PerPage = 100
9
+
10
10
  def initialize
11
11
  @a = Mechanize.new {|agent| agent.user_agent_alias = 'Linux Mozilla'}
12
12
  @a.idle_timeout = 2
13
13
  @a.max_history = 1
14
- @perpage = 100
15
14
  @page = nil
16
- @debug = false
17
- @data = Hash.new
18
- #@baseuri = "http://www.baidu.com/s?rn=#{@perpage}&wd="
19
15
  end
20
16
 
21
- public
22
17
  def suggestions(wd)
23
18
  json = @a.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").body.force_encoding('GBK').encode("UTF-8")
24
19
  m = /\[([^\]]*)\]/.match json
25
20
  return JSON.parse m[0]
26
21
  end
27
22
 
23
+ =begin
28
24
  def extend(words,level=3,sleeptime=1)
29
25
  level = level.to_i - 1
30
26
  words = [words] unless words.respond_to? 'each'
@@ -40,32 +36,29 @@ class Baidu
40
36
  return extensions if level < 1
41
37
  return extensions + extend(extensions,level)
42
38
  end
39
+ =end
43
40
 
44
41
  def popular?(wd)
45
42
  return @a.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").body.include?"boxFlash"
46
43
  end
47
44
 
48
45
  def query(wd)
49
- @data.clear
50
- @wd = wd
51
- @data.clear
52
46
  q = Array.new
53
47
  q << "wd=#{wd}"
54
- q << "rn=#{@perpage}"
48
+ q << "rn=#{PerPage}"
55
49
  queryStr = q.join("&")
56
- uri = URI.encode((BaseUri + queryStr).encode('GBK'))
57
- begin
50
+ #uri = URI.encode((BaseUri + queryStr).encode('GBK'))
51
+ uri = URI.encode((BaseUri + queryStr))
52
+ begin
58
53
  @page = @a.get uri
54
+ BaiduResult.new(@page)
59
55
  rescue Net::HTTP::Persistent::Error
60
56
  warn "#{uri}timeout"
57
+ return false
61
58
  end
62
- clean
63
- @number = self.how_many
64
- @maxpage = (@number / @perpage.to_f).round
65
- @currpage =0
66
59
  =begin
67
60
  query = "#{query}"
68
- @uri = @baseuri+URI.encode(query.encode('GBK'))
61
+ @uri = BaseUri+URI.encode(query.encode('GBK'))
69
62
  @page = @a.get @uri
70
63
  self.clean
71
64
  @number = self.how_many
@@ -75,96 +68,63 @@ class Baidu
75
68
  =end
76
69
  end
77
70
 
71
+ =begin
72
+ def maxpage
73
+ @maxpage ||= (how_many / PerPage.to_f).round
74
+ end
75
+ =end
76
+
78
77
  #site:xxx.yyy.com
79
78
  def how_many_pages(host)
80
- return @data['how_many']if @data.has_key?'how_many'
81
- query("site:#{host}")
82
- return how_many
79
+ query("site:#{host}").how_many
83
80
  end
84
81
 
85
82
  #domain:xxx.yyy.com/path/file.html
86
83
  def how_many_links(uri)
87
- return @data['how_many']if @data.has_key?'how_many'
88
- query("domain:\"#{uri}\"")
89
- return how_many
84
+ query("domain:\"#{uri}\"").how_many
90
85
  end
91
86
 
92
87
  #site:xxx.yyy.com inurl:zzz
93
88
  def how_many_pages_with(host,string)
94
- return @data['how_many']if @data.has_key?'how_many'
95
- query("site:#{host} inurl:#{string}")
96
- return how_many
89
+ query("site:#{host} inurl:#{string}").how_many
97
90
  end
98
- ########################################################################################################################
99
- #look up a word and get the rank of a uri with $host
100
- def rank(host)#on base of ranks
101
- return @data[:rank][host] if @data.has_key?:rank and @data[:rank].has_key?host
102
- ranks.each_with_index do |uri,index|
103
- if URI.parse(URI.encode(uri).host)
104
- @data << {:rank=>{host=>index+1}}
105
- return index+1
106
- end
107
- end
91
+
108
92
  =begin
109
- @page.search("//table[@class=\"result\"]").each do |table|
110
- href = @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
111
- begin
112
- return table['id'] if host==URI.parse(URI.encode(href)).host
113
- rescue URI::InvalidURIError
114
- puts "invalid uri:#{href}" if @debug
115
- end
116
- end
117
- return false
118
- =end
93
+ private
94
+ def clean
95
+ @page.body.force_encoding('GBK')
96
+ @page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
97
+ @page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
119
98
  end
99
+ =end
100
+ end
120
101
 
121
- def ranks#(keyword=false)
122
- return @data[:ranks] if @data.has_key?:ranks
123
- raise StandardError,'wrong with @page' unless @page.instance_of? Mechanize::Page
124
- #self.query(keyword) if keyword
125
- ranks = Array.new
126
- @page.search("//table[@class=\"result\"]").each do |table|
127
- ranks << @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
128
- end
129
- @data[:ranks] = ranks
130
- return ranks
102
# Wraps a single fetched Baidu result page and extracts the ranked result
# links, the reported result count and the related keywords from it.
class BaiduResult
  # @param page [Mechanize::Page] the fetched result page
  # @raise [ArgumentError] when page is not a Mechanize::Page
  def initialize(page)
    # The comma matters: `raise ArgumentError 'msg'` would try to call a
    # method named ArgumentError and fail with NoMethodError instead.
    raise ArgumentError, 'should be Mechanize::Page' unless page.is_a?(Mechanize::Page)

    @page = page
  end

  # Hrefs of the first link inside every table with class "result", in
  # document order. The list is computed once and memoized.
  #
  # @return [Array<String>]
  def ranks
    @ranks ||= @page.search("//table[@class=\"result\"]").map do |table|
      @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
    end
  end

  # Looks up the 1-based position of the first result whose host equals
  # the given host.
  #
  # @param host [String] host name to look for, e.g. "www.example.com"
  # @return [Integer, nil] the position, or nil when no result matches
  def rank(host)
    position = ranks.index do |uri|
      found = host_of(uri)
      found && found == host
    end
    position && position + 1
  end

  # Result count taken from the first span with class "nums", with every
  # non-digit character stripped.
  #
  # @return [Integer, nil] nil when the page has no such span
  def how_many
    @how_many ||= begin
      span = @page.search("//span[@class='nums']").first
      span && span.content.gsub(/\D/, '').to_i
    end
  end

  # Texts of the links found under the div with id "rs".
  #
  # @return [Array<String>]
  def related_keywords
    @related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map { |keyword| keyword.text }
  end

  # Follows the page's "下一页" (next page) link.
  #
  # The current object is left untouched: replacing @page with a
  # BaiduResult would break every later @page.search call and leave the
  # memoized values stale.
  #
  # @return [BaiduResult, nil] the next page's result, or nil when the page
  #   has no such link
  def next
    link = @page.link_with(:text => /下一页/)
    # Click through the link itself so the request is made by the agent that
    # fetched this page rather than by a brand-new Mechanize instance.
    link && BaiduResult.new(link.click)
  end

  private

  # Host part of a result href, or nil when the href is missing or cannot be
  # parsed. The href is escaped first so non-ASCII characters do not make
  # URI.parse fail (URI.encode itself was removed in Ruby 3.0).
  def host_of(uri)
    return nil if uri.nil?

    URI.parse(URI::DEFAULT_PARSER.escape(uri)).host
  rescue URI::InvalidURIError
    nil
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baidu
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 1.0.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,17 +9,17 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-11 00:00:00.000000000 Z
12
+ date: 2012-06-13 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: to get keyword ranking,related queries and popularity from baidu.com.
15
- this is built by a newbie, so please be careful
15
+ this is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
16
16
  email: seoaqua@qq.com
17
17
  executables: []
18
18
  extensions: []
19
19
  extra_rdoc_files: []
20
20
  files:
21
21
  - lib/baidu.rb
22
- homepage: http://seoaqua.com
22
+ homepage: https://github.com/seoaqua/ruby-baidu
23
23
  licenses: []
24
24
  post_install_message:
25
25
  rdoc_options: []
@@ -43,5 +43,5 @@ rubygems_version: 1.8.21
43
43
  signing_key:
44
44
  specification_version: 3
45
45
  summary: to get keyword ranking,related queries and popularity from baidu.com. this
46
- is built by a newbie, so please be careful
46
+ is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
47
47
  test_files: []