baidu 0.2.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. data/lib/baidu.rb +47 -87
  2. metadata +5 -5
data/lib/baidu.rb CHANGED
@@ -4,27 +4,23 @@ require 'mechanize'
4
4
  require 'json'
5
5
  require 'uri'
6
6
  class Baidu
7
- attr_accessor :perpage,:pagenumber,:debug
8
- attr_reader :page,:wd,:data
9
7
  BaseUri = 'http://www.baidu.com/s?'
8
+ PerPage = 100
9
+
10
10
  def initialize
11
11
  @a = Mechanize.new {|agent| agent.user_agent_alias = 'Linux Mozilla'}
12
12
  @a.idle_timeout = 2
13
13
  @a.max_history = 1
14
- @perpage = 100
15
14
  @page = nil
16
- @debug = false
17
- @data = Hash.new
18
- #@baseuri = "http://www.baidu.com/s?rn=#{@perpage}&wd="
19
15
  end
20
16
 
21
- public
22
17
  def suggestions(wd)
23
18
  json = @a.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").body.force_encoding('GBK').encode("UTF-8")
24
19
  m = /\[([^\]]*)\]/.match json
25
20
  return JSON.parse m[0]
26
21
  end
27
22
 
23
+ =begin
28
24
  def extend(words,level=3,sleeptime=1)
29
25
  level = level.to_i - 1
30
26
  words = [words] unless words.respond_to? 'each'
@@ -40,32 +36,29 @@ class Baidu
40
36
  return extensions if level < 1
41
37
  return extensions + extend(extensions,level)
42
38
  end
39
+ =end
43
40
 
44
41
  def popular?(wd)
45
42
  return @a.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").body.include?"boxFlash"
46
43
  end
47
44
 
48
45
  def query(wd)
49
- @data.clear
50
- @wd = wd
51
- @data.clear
52
46
  q = Array.new
53
47
  q << "wd=#{wd}"
54
- q << "rn=#{@perpage}"
48
+ q << "rn=#{PerPage}"
55
49
  queryStr = q.join("&")
56
- uri = URI.encode((BaseUri + queryStr).encode('GBK'))
57
- begin
50
+ #uri = URI.encode((BaseUri + queryStr).encode('GBK'))
51
+ uri = URI.encode((BaseUri + queryStr))
52
+ begin
58
53
  @page = @a.get uri
54
+ BaiduResult.new(@page)
59
55
  rescue Net::HTTP::Persistent::Error
60
56
  warn "#{uri}timeout"
57
+ return false
61
58
  end
62
- clean
63
- @number = self.how_many
64
- @maxpage = (@number / @perpage.to_f).round
65
- @currpage =0
66
59
  =begin
67
60
  query = "#{query}"
68
- @uri = @baseuri+URI.encode(query.encode('GBK'))
61
+ @uri = BaseUri+URI.encode(query.encode('GBK'))
69
62
  @page = @a.get @uri
70
63
  self.clean
71
64
  @number = self.how_many
@@ -75,96 +68,63 @@ class Baidu
75
68
  =end
76
69
  end
77
70
 
71
+ =begin
72
+ def maxpage
73
+ @maxpage ||= (how_many / PerPage.to_f).round
74
+ end
75
+ =end
76
+
78
77
  #site:xxx.yyy.com
79
78
  def how_many_pages(host)
80
- return @data['how_many']if @data.has_key?'how_many'
81
- query("site:#{host}")
82
- return how_many
79
+ query("site:#{host}").how_many
83
80
  end
84
81
 
85
82
  #domain:xxx.yyy.com/path/file.html
86
83
  def how_many_links(uri)
87
- return @data['how_many']if @data.has_key?'how_many'
88
- query("domain:\"#{uri}\"")
89
- return how_many
84
+ query("domain:\"#{uri}\"").how_many
90
85
  end
91
86
 
92
87
  #site:xxx.yyy.com inurl:zzz
93
88
  def how_many_pages_with(host,string)
94
- return @data['how_many']if @data.has_key?'how_many'
95
- query("site:#{host} inurl:#{string}")
96
- return how_many
89
+ query("site:#{host} inurl:#{string}").how_many
97
90
  end
98
- ########################################################################################################################
99
- #look up a word and get the rank of a uri with $host
100
- def rank(host)#on base of ranks
101
- return @data[:rank][host] if @data.has_key?:rank and @data[:rank].has_key?host
102
- ranks.each_with_index do |uri,index|
103
- if URI.parse(URI.encode(uri).host)
104
- @data << {:rank=>{host=>index+1}}
105
- return index+1
106
- end
107
- end
91
+
108
92
  =begin
109
- @page.search("//table[@class=\"result\"]").each do |table|
110
- href = @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
111
- begin
112
- return table['id'] if host==URI.parse(URI.encode(href)).host
113
- rescue URI::InvalidURIError
114
- puts "invalid uri:#{href}" if @debug
115
- end
116
- end
117
- return false
118
- =end
93
+ private
94
+ def clean
95
+ @page.body.force_encoding('GBK')
96
+ @page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
97
+ @page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
119
98
  end
99
+ =end
100
+ end
120
101
 
121
- def ranks#(keyword=false)
122
- return @data[:ranks] if @data.has_key?:ranks
123
- raise StandardError,'wrong with @page' unless @page.instance_of? Mechanize::Page
124
- #self.query(keyword) if keyword
125
- ranks = Array.new
126
- @page.search("//table[@class=\"result\"]").each do |table|
127
- ranks << @page.search("//table[@id=\"#{table['id']}\"]//a").first['href']
128
- end
129
- @data[:ranks] = ranks
130
- return ranks
102
+ class BaiduResult
103
+ def initialize(page)
104
+ raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
105
+ @page = page
131
106
  end
132
-
133
- def related_keywords
134
- return @data[:realated_keywords] if @data.has_key?:realated_keywords
135
- raise StandardError,'wrong with @page' unless @page.instance_of? Mechanize::Page
136
- keywords = Array.new
137
- div = @page.search("//div[@id=\"rs\"]//tr//a")
138
- return false if div.nil?
139
- div.each do |keyword|
140
- keywords << keyword.text
107
+
108
+ def ranks
109
+ @ranks ||= @page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//a").first['href'] }
110
+ end
111
+
112
+ #look up a word and get the rank of a uri with $host
113
+ def rank(host)#on base of ranks
114
+ ranks.each_with_index do |uri,index|
115
+ index+1 if URI.parse(URI.encode(uri)).host == host
141
116
  end
142
- @data[:realated_keywords] = keywords
143
- return keywords
144
- #m = /href="[^"]+">([^<]+)<\/a>/.match(related.content)
145
117
  end
146
118
 
147
119
  def how_many
148
- return @data['how_many'] if @data.has_key?'how_many'
149
- raise StandardError,'wrong with @page' unless @page.instance_of? Mechanize::Page
150
- numSpan = @page.search("//span[@class='nums']").first
151
- return false if numSpan.nil?
152
- return numSpan.content.gsub(/\D/,'').to_i
153
- #return false if @page.search("//span[@class='nums']").first.nil?
154
- #return @page.search("//span[@class='nums']").first.content.gsub(/\D/,'').to_i
120
+ @how_many ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
155
121
  end
156
122
 
157
- def next
158
- nextbtn = @page.link_with(:text=>/下一页/)
159
- return false if (nextbtn.nil? or @currpage >= @maxpage)
160
- @page = @a.click(nextbtn)
161
- self.clean
162
- return true
123
+ def related_keywords
124
+ @related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
163
125
  end
164
- private
165
- def clean
166
- @page.body.force_encoding('GBK')
167
- @page.body.encode!('UTF-8',:invalid => :replace, :undef => :replace, :replace => "")
168
- @page.body.gsub! ("[\U0080-\U2C77]+") #mechanize will be confuzed without removing the few characters
126
+
127
+ def next
128
+ @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
169
129
  end
170
130
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baidu
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 1.0.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,17 +9,17 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2011-11-11 00:00:00.000000000 Z
12
+ date: 2012-06-13 00:00:00.000000000 Z
13
13
  dependencies: []
14
14
  description: to get keyword ranking,related queries and popularity from baidu.com.
15
- this is built by a newbie, so please be careful
15
+ this is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
16
16
  email: seoaqua@qq.com
17
17
  executables: []
18
18
  extensions: []
19
19
  extra_rdoc_files: []
20
20
  files:
21
21
  - lib/baidu.rb
22
- homepage: http://seoaqua.com
22
+ homepage: https://github.com/seoaqua/ruby-baidu
23
23
  licenses: []
24
24
  post_install_message:
25
25
  rdoc_options: []
@@ -43,5 +43,5 @@ rubygems_version: 1.8.21
43
43
  signing_key:
44
44
  specification_version: 3
45
45
  summary: to get keyword ranking,related queries and popularity from baidu.com. this
46
- is built by a newbie, so please be careful
46
+ is built by a newbie, so please be careful. welcome to check my homepage, http://seoaqua.com
47
47
  test_files: []