baidu 1.1.1 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. data/lib/baidu.rb +232 -35
  2. metadata +1 -1
@@ -1,8 +1,215 @@
1
1
  #coding:UTF-8
2
- require 'rubygems'
3
2
  require 'mechanize'
3
+ require 'nokogiri'
4
4
  require 'json'
5
5
  require 'addressable/uri'
6
+ require 'httparty'
7
+
8
+ class SearchResult
9
+ def initialize(body,baseuri,pagenumber=nil)
10
+ @body = Nokogiri::HTML body
11
+ @baseuri = baseuri
12
+ # @host = URI(baseuri).host
13
+ if pagenumber.nil?
14
+ @pagenumber = 1
15
+ else
16
+ @pagenumber = pagenumber
17
+ end
18
+ end
19
+
20
+ #返回当前页中host满足条件的结果
21
+ def ranks_for(specific_host)
22
+ host_ranks = Hash.new
23
+ ranks.each do |id,line|
24
+ if specific_host.class == Regexp
25
+ host_ranks[id] = line if line['host'] =~ specific_host
26
+ elsif specific_host.class == String
27
+ host_ranks[id] = line if line['host'] == specific_host
28
+ end
29
+ end
30
+ host_ranks
31
+ end
32
+ #return the top rank number from @ranks with the input host
33
+ def rank(host)#on base of ranks
34
+ ranks.each do |id,line|
35
+ id = id.to_i
36
+ if host.class == Regexp
37
+ return id if line['host'] =~ host
38
+ elsif host.class == String
39
+ return id if line['host'] == host
40
+ end
41
+ end
42
+ return nil
43
+ end
44
+ end
45
+ class Qihoo
46
+ Host = 'www.so.com'
47
+ #基本查询, 相当于在搜索框直接数据关键词查询
48
+ def query(wd)
49
+ begin
50
+ #用原始路径请求
51
+ uri = URI.encode(URI.join("http://#{Host}/",'s?q='+wd).to_s)
52
+ body = HTTParty.get(uri)
53
+ #如果请求地址被跳转,重新获取当前页的URI
54
+ uri = URI.join("http://#{Host}/",body.request.path).to_s
55
+ return QihooResult.new(body,uri)
56
+ rescue Exception => e
57
+ warn "#{uri} fetch error: #{e.to_s}"
58
+ return false
59
+ end
60
+ end
61
+ #是否收录
62
+ def indexed?(url)
63
+ URI(url)
64
+ query(url).has_result?
65
+ end
66
+ end
67
+
68
+ class QihooResult < SearchResult
69
+ Host = 'www.so.com'
70
+
71
+ #返回所有当前页的排名结果
72
+ def ranks
73
+ return @ranks unless @ranks.nil?
74
+ @ranks = Hash.new
75
+ id = (@pagenumber - 1) * 10
76
+ @body.xpath('//li[@class="res-list"]').each do |li|
77
+ a = li.search("h3/a").first
78
+ url = li.search("cite")
79
+ next if a['data-pos'].nil?
80
+ id += 1
81
+ text = a.text.strip
82
+ href = a['href']
83
+ url = url.first.text
84
+ host = Addressable::URI.parse(URI.encode("http://#{url}")).host
85
+ @ranks[id] = {'href'=>"http://so.com#{href}",'text'=>text,'host'=>host}
86
+ end
87
+ @ranks
88
+ end
89
+ #下一页
90
+ def next
91
+ next_href = @body.xpath('//a[@id="snext"]').first['href']
92
+ next_href = URI.join(@baseuri,next_href).to_s
93
+ # next_href = URI.join("http://#{@host}",next_href).to_s
94
+ next_body = HTTParty.get(next_href).body
95
+ return QihooResult.new(next_body,next_href,@pagenumber+1)
96
+ #@page = MbaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
97
+ end
98
+ #有结果
99
+ def has_result?
100
+ !@body.xpath('//div[@id="main"]/h3').text().include?'没有找到该URL'
101
+ end
102
+ end
103
+
104
+ class Mbaidu
105
+ BaseUri = 'http://m.baidu.com/s?'
106
+ headers = {
107
+ "User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
108
+ }
109
+ Options = {:headers => headers}
110
+
111
+ #基本查询,相当于从搜索框直接输入关键词查询
112
+ def query(wd)
113
+ queryStr = "word=#{wd}"
114
+ uri = URI.encode((BaseUri + queryStr))
115
+ begin
116
+ res = HTTParty.get(uri,Options)
117
+ MbaiduResult.new(res,uri)
118
+ rescue Exception => e
119
+ warn "#{uri} fetch error: #{e.to_s}"
120
+ return false
121
+ end
122
+ end
123
+ end
124
+ class MbaiduResult < SearchResult
125
+ def initialize(body,baseuri,pagenumber=nil)
126
+ @body = Nokogiri::HTML body
127
+ @baseuri = baseuri
128
+ if pagenumber.nil?
129
+ @pagenumber = 1
130
+ else
131
+ @pagenumber = pagenumber
132
+ end
133
+ end
134
+
135
+ #返回当前页所有查询结果
136
+ def ranks
137
+ #如果已经赋值说明解析过,不需要重新解析,直接返回结果
138
+ return @ranks unless @ranks.nil?
139
+ @ranks = Hash.new
140
+ @body.xpath('//div[@class="result"]').each do |result|
141
+ href,text,host,is_mobile = '','','',false
142
+ a = result.search("a").first
143
+ is_mobile = true unless a.search("img").empty?
144
+ host = result.search('span[@class="site"]').first.text
145
+ href = a['href']
146
+ text = a.text
147
+ id = href.scan(/&order=(\d+)&/)
148
+ if id.empty?
149
+ id = nil
150
+ else
151
+ id = id.first.first.to_i
152
+ id = (@pagenumber-1)*10+id
153
+ end
154
+ =begin
155
+ result.children.each do |elem|
156
+ if elem.name == 'a'
157
+ href = elem['href']
158
+ id = elem.text.match(/^\d+/).to_s.to_i
159
+ text = elem.text.sub(/^\d+/,'')
160
+ text.sub!(/^\u00A0/,'')
161
+ elsif elem['class'] == 'abs'
162
+ elem.children.each do |elem2|
163
+ if elem2['class'] == 'site'
164
+ host = elem2.text
165
+ break
166
+ end
167
+ end
168
+ elsif elem['class'] == 'site'
169
+ host == elem['href']
170
+ end
171
+ end
172
+ =end
173
+
174
+ @ranks[id] = {'href'=>href,'text'=>text,'is_mobile'=>is_mobile,'host'=>host.sub(/\u00A0/,'')}
175
+ end
176
+ @ranks
177
+ end
178
+ =begin
179
+ #返回当前页中,符合host条件的结果
180
+ def ranks_for(specific_host)
181
+ host_ranks = Hash.new
182
+ ranks.each do |id,line|
183
+ if specific_host.class == Regexp
184
+ host_ranks[id] = line if line['host'] =~ specific_host
185
+ elsif specific_host.class == String
186
+ host_ranks[id] = line if line['host'] == specific_host
187
+ end
188
+ end
189
+ host_ranks
190
+ end
191
+ #return the top rank number from @ranks with the input host
192
+ def rank(host)#on base of ranks
193
+ ranks.each do |id,line|
194
+ id = id.to_i
195
+ if host.class == Regexp
196
+ return id if line['host'] =~ host
197
+ elsif host.class == String
198
+ return id if line['host'] == host
199
+ end
200
+ end
201
+ return nil
202
+ end
203
+ =end
204
+ #下一页
205
+ def next
206
+ url = @body.xpath('//a[text()="下一页"]').first['href']
207
+ url = URI.join(@baseuri,url).to_s
208
+ body = HTTParty.get(url)
209
+ return MbaiduResult.new(body,url,@pagenumber+1)
210
+ end
211
+
212
+ end
6
213
  class Baidu
7
214
  BaseUri = 'http://www.baidu.com/s?'
8
215
  PerPage = 100
@@ -15,7 +222,7 @@ class Baidu
15
222
  end
16
223
 
17
224
  def suggestions(wd)
18
- json = @a.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").body.force_encoding('GBK').encode("UTF-8")
225
+ json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").body.force_encoding('GBK').encode("UTF-8")
19
226
  m = /\[([^\]]*)\]/.match json
20
227
  return JSON.parse m[0]
21
228
  end
@@ -74,12 +281,6 @@ class Baidu
74
281
  =end
75
282
  end
76
283
 
77
- =begin
78
- def maxpage
79
- @maxpage ||= (how_many / PerPage.to_f).round
80
- end
81
- =end
82
-
83
284
  #site:xxx.yyy.com
84
285
  def how_many_pages(host)
85
286
  query("site:#{host}").how_many
@@ -94,6 +295,10 @@ class Baidu
94
295
  def how_many_pages_with(host,string)
95
296
  query("site:#{host} inurl:#{string}").how_many
96
297
  end
298
+ #是否收录
299
+ def indexed?(url)
300
+ query(url).has_result?
301
+ end
97
302
 
98
303
  =begin
99
304
  private
@@ -105,13 +310,13 @@ class Baidu
105
310
  =end
106
311
  end
107
312
 
108
- class BaiduResult
313
+ class BaiduResult < SearchResult
109
314
  def initialize(page)
110
315
  raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
111
316
  @page = page
112
317
  end
113
318
 
114
- def ranks(host=nil)
319
+ def ranks
115
320
  return @ranks unless @ranks.nil?
116
321
  @ranks = Hash.new
117
322
  @page.search("//table[@class=\"result\"]").each do |table|
@@ -129,33 +334,21 @@ class BaiduResult
129
334
  end
130
335
  end
131
336
  #@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
132
- if host.nil?
133
- @ranks
134
- else
135
- host_ranks = Hash.new
136
- @ranks.each do |id,line|
137
- if host.class == Regexp
138
- host_ranks[id] = line if line['host'] =~ host
139
- elsif host.class == String
140
- host_ranks[id] = line if line['host'] == host
141
- end
142
- end
143
- host_ranks
144
- #'not finished'#@ranks.each_with_index.map{|h,i| i if !h.nil? and h==host}.compact
145
- end
337
+ @ranks
146
338
  end
147
339
 
148
340
  #return the top rank number from @ranks with the input host
149
- def rank(host)#on base of ranks
150
- ranks.each do |id,line|
151
- if host.class == Regexp
152
- return id if line['host'] =~ host
153
- elsif host.class == String
154
- return id if line['host'] == host
155
- end
156
- end
157
- return nil
158
- end
341
+ # def rank(host)#on base of ranks
342
+ # ranks.each do |id,line|
343
+ # id = id.to_i
344
+ # if host.class == Regexp
345
+ # return id if line['host'] =~ host
346
+ # elsif host.class == String
347
+ # return id if line['host'] == host
348
+ # end
349
+ # end
350
+ # return nil
351
+ # end
159
352
 
160
353
  def how_many
161
354
  @how_many ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
@@ -168,5 +361,9 @@ class BaiduResult
168
361
  def next
169
362
  @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
170
363
  end
364
+
365
+ def has_result?
366
+ @page.search('//div[@class="nors"]').empty?
367
+ end
171
368
 
172
- end
369
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: baidu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.1.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: