baidu 1.1.1 → 1.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (2) hide show
  1. data/lib/baidu.rb +232 -35
  2. metadata +1 -1
@@ -1,8 +1,215 @@
1
1
  #coding:UTF-8
2
- require 'rubygems'
3
2
  require 'mechanize'
3
+ require 'nokogiri'
4
4
  require 'json'
5
5
  require 'addressable/uri'
6
+ require 'httparty'
7
+
8
# Base class for one parsed page of search-engine results.
#
# Subclasses are expected to provide #ranks, returning a Hash that maps a
# position on the page to a Hash of result attributes containing a 'host' key.
class SearchResult
  # @param body [String] HTML of the result page (anything Nokogiri::HTML accepts)
  # @param baseuri [String] URI the page was fetched from
  # @param pagenumber [Integer, nil] page index; nil is treated as page 1
  def initialize(body, baseuri, pagenumber = nil)
    @body = Nokogiri::HTML body
    @baseuri = baseuri
    @pagenumber = pagenumber.nil? ? 1 : pagenumber
  end

  # Returns the results on the current page whose host satisfies the condition.
  #
  # @param specific_host [String, Regexp] exact host name, or a pattern to match
  # @return [Hash] the matching subset of #ranks (empty for any other argument type)
  def ranks_for(specific_host)
    ranks.select { |_id, line| host_match?(line['host'], specific_host) }
  end

  # Returns the position of the first entry in #ranks whose host matches.
  #
  # @param host [String, Regexp] exact host name, or a pattern to match
  # @return [Integer, nil] the position (converted with to_i), or nil when nothing matches
  def rank(host)
    ranks.each do |id, line|
      return id.to_i if host_match?(line['host'], host)
    end
    nil
  end

  private

  # Single place for the String/Regexp matching rule shared by #rank and
  # #ranks_for. `case` uses ===, so subclasses of String and Regexp are
  # accepted too; any other argument type never matches.
  def host_match?(actual, expected)
    case expected
    when Regexp then !(expected =~ actual).nil?
    when String then actual == expected
    else false
    end
  end
end
45
# Client for the Qihoo search engine at www.so.com.
class Qihoo
  Host = 'www.so.com'

  # Basic query, equivalent to typing the keyword into the search box.
  #
  # @param wd [String] search keyword
  # @return [QihooResult, false] the parsed result page, or false when the
  #   request failed (the error is reported through warn)
  def query(wd)
    # The keyword is escaped BEFORE it is joined into a URI: URI.join rejects
    # raw non-ASCII characters and spaces, and URI.encode (used previously on
    # the joined result) no longer exists in Ruby 3.
    uri = URI.join("http://#{Host}/", "s?q=#{URI.encode_www_form_component(wd)}").to_s
    body = HTTParty.get(uri)
    # If the request was redirected, pick up the URI of the page actually served.
    uri = URI.join("http://#{Host}/", body.request.path).to_s
    QihooResult.new(body, uri)
  rescue StandardError => e
    # StandardError rather than Exception, so Interrupt/SystemExit still propagate.
    warn "#{uri} fetch error: #{e}"
    false
  end

  # Whether the given URL is indexed by the search engine.
  #
  # @param url [String] URL to look up
  # @return [Boolean] false when there is no result or the query itself failed
  # @raise [URI::InvalidURIError] when url cannot be parsed as a URI
  def indexed?(url)
    URI(url) # validation only: raises before any request is made
    result = query(url)
    # query returns false on failure; do not call has_result? on it.
    result ? result.has_result? : false
  end
end
67
+
68
# One page of Qihoo (so.com) search results.
class QihooResult < SearchResult
  Host = 'www.so.com'

  # Returns all ranked results of the current page; parsed once, then cached.
  #
  # Positions continue across pages: the first entry of page N is numbered
  # (N - 1) * 10 + 1.
  #
  # @return [Hash{Integer => Hash}] position => {'href', 'text', 'host'};
  #   'host' is nil when the entry has no <cite> element
  def ranks
    return @ranks unless @ranks.nil?
    @ranks = Hash.new
    id = (@pagenumber - 1) * 10
    @body.xpath('//li[@class="res-list"]').each do |li|
      a = li.search("h3/a").first
      # Entries without a title link or a data-pos attribute are not counted.
      next if a.nil? || a['data-pos'].nil?
      id += 1
      cite = li.search("cite").first
      # URI.encode was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape applies
      # the same escaping to the displayed URL before the host is extracted.
      host = cite && Addressable::URI.parse(URI::DEFAULT_PARSER.escape("http://#{cite.text}")).host
      @ranks[id] = {'href'=>"http://so.com#{a['href']}",'text'=>a.text.strip,'host'=>host}
    end
    @ranks
  end

  # Fetches the next page of results.
  #
  # @return [QihooResult, nil] the following page, or nil when the current
  #   page has no a#snext link
  def next
    link = @body.xpath('//a[@id="snext"]').first
    return nil if link.nil?
    next_href = URI.join(@baseuri, link['href']).to_s
    next_body = HTTParty.get(next_href).body
    QihooResult.new(next_body, next_href, @pagenumber + 1)
  end

  # Whether the page contains results.
  #
  # @return [Boolean] false when the heading under div#main carries the
  #   "URL not found" notice
  def has_result?
    !@body.xpath('//div[@id="main"]/h3').text().include?('没有找到该URL')
  end
end
103
+
104
# Client for Baidu's mobile search at m.baidu.com.
class Mbaidu
  BaseUri = 'http://m.baidu.com/s?'
  # Requests are sent with an iPhone Safari User-Agent string.
  headers = {
    "User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
  }
  Options = {:headers => headers}

  # Basic query, equivalent to typing the keyword into the search box.
  #
  # @param wd [String] search keyword
  # @return [MbaiduResult, false] the parsed result page, or false when the
  #   request failed (the error is reported through warn)
  def query(wd)
    # URI.encode was removed in Ruby 3.0. Escaping only the keyword also keeps
    # '&' and '=' inside it from being read as extra query parameters.
    uri = "#{BaseUri}word=#{URI.encode_www_form_component(wd)}"
    begin
      res = HTTParty.get(uri, Options)
      MbaiduResult.new(res, uri)
    rescue StandardError => e
      # StandardError rather than Exception, so Interrupt/SystemExit still propagate.
      warn "#{uri} fetch error: #{e}"
      false
    end
  end
end
124
# One page of results from Baidu's mobile search.
#
# The constructor is inherited from SearchResult (body, baseuri, pagenumber);
# the copy that used to live here was identical to the parent's.
class MbaiduResult < SearchResult
  # Returns all results of the current page; parsed once, then cached.
  #
  # The position comes from the `order` parameter embedded in each result
  # link, offset by (page number - 1) * 10.
  #
  # @return [Hash{Integer => Hash}] position =>
  #   {'href', 'text', 'is_mobile', 'host'}; 'is_mobile' tells whether the
  #   title link contains an <img> element
  def ranks
    # Already parsed: return the cached result instead of parsing again.
    return @ranks unless @ranks.nil?
    @ranks = Hash.new
    @body.xpath('//div[@class="result"]').each do |result|
      a = result.search("a").first
      next if a.nil?
      href = a['href']
      order = href.to_s[/&order=(\d+)&/, 1]
      # Links without an order parameter carry no position. They used to be
      # stored under the single key nil, overwriting one another.
      next if order.nil?
      id = (@pagenumber - 1) * 10 + order.to_i
      site = result.search('span[@class="site"]').first
      host = site.nil? ? '' : site.text.sub(/\u00A0/, '')
      is_mobile = !a.search("img").empty?
      @ranks[id] = {'href'=>href,'text'=>a.text,'is_mobile'=>is_mobile,'host'=>host}
    end
    @ranks
  end

  # Fetches the next page of results.
  #
  # @return [MbaiduResult, nil] the following page, or nil when the current
  #   page has no "next page" link
  def next
    link = @body.xpath('//a[text()="下一页"]').first
    return nil if link.nil?
    url = URI.join(@baseuri, link['href']).to_s
    body = HTTParty.get(url)
    MbaiduResult.new(body, url, @pagenumber + 1)
  end
end
6
213
  class Baidu
7
214
  BaseUri = 'http://www.baidu.com/s?'
8
215
  PerPage = 100
@@ -15,7 +222,7 @@ class Baidu
15
222
  end
16
223
 
17
224
  def suggestions(wd)
18
- json = @a.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").body.force_encoding('GBK').encode("UTF-8")
225
+ json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").body.force_encoding('GBK').encode("UTF-8")
19
226
  m = /\[([^\]]*)\]/.match json
20
227
  return JSON.parse m[0]
21
228
  end
@@ -74,12 +281,6 @@ class Baidu
74
281
  =end
75
282
  end
76
283
 
77
- =begin
78
- def maxpage
79
- @maxpage ||= (how_many / PerPage.to_f).round
80
- end
81
- =end
82
-
83
284
  #site:xxx.yyy.com
84
285
  def how_many_pages(host)
85
286
  query("site:#{host}").how_many
@@ -94,6 +295,10 @@ class Baidu
94
295
  def how_many_pages_with(host,string)
95
296
  query("site:#{host} inurl:#{string}").how_many
96
297
  end
298
# Whether the URL is indexed: queries for the URL itself and reports
# whether the result page has any result.
#
# @param url [String] URL to look up
def indexed?(url)
  result = query(url)
  result.has_result?
end
97
302
 
98
303
  =begin
99
304
  private
@@ -105,13 +310,13 @@ class Baidu
105
310
  =end
106
311
  end
107
312
 
108
- class BaiduResult
313
+ class BaiduResult < SearchResult
109
314
  def initialize(page)
110
315
  raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
111
316
  @page = page
112
317
  end
113
318
 
114
- def ranks(host=nil)
319
+ def ranks
115
320
  return @ranks unless @ranks.nil?
116
321
  @ranks = Hash.new
117
322
  @page.search("//table[@class=\"result\"]").each do |table|
@@ -129,33 +334,21 @@ class BaiduResult
129
334
  end
130
335
  end
131
336
  #@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
132
- if host.nil?
133
- @ranks
134
- else
135
- host_ranks = Hash.new
136
- @ranks.each do |id,line|
137
- if host.class == Regexp
138
- host_ranks[id] = line if line['host'] =~ host
139
- elsif host.class == String
140
- host_ranks[id] = line if line['host'] == host
141
- end
142
- end
143
- host_ranks
144
- #'not finished'#@ranks.each_with_index.map{|h,i| i if !h.nil? and h==host}.compact
145
- end
337
+ @ranks
146
338
  end
147
339
 
148
340
  #return the top rank number from @ranks with the input host
149
- def rank(host)#on base of ranks
150
- ranks.each do |id,line|
151
- if host.class == Regexp
152
- return id if line['host'] =~ host
153
- elsif host.class == String
154
- return id if line['host'] == host
155
- end
156
- end
157
- return nil
158
- end
341
+ # def rank(host)#on base of ranks
342
+ # ranks.each do |id,line|
343
+ # id = id.to_i
344
+ # if host.class == Regexp
345
+ # return id if line['host'] =~ host
346
+ # elsif host.class == String
347
+ # return id if line['host'] == host
348
+ # end
349
+ # end
350
+ # return nil
351
+ # end
159
352
 
160
353
  def how_many
161
354
  @how_many ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
@@ -168,5 +361,9 @@ class BaiduResult
168
361
  def next
169
362
  @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
170
363
  end
364
+
365
# Whether the page has results: true when it contains no div with class "nors".
def has_result?
  no_result_notices = @page.search('//div[@class="nors"]')
  no_result_notices.empty?
end
171
368
 
172
- end
369
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baidu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.1.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: