baidu 1.2.10 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +82 -0
- data/Rakefile +1 -0
- data/baidu.gemspec +26 -0
- data/lib/baidu.rb +46 -468
- data/lib/baidu/auth.rb +5 -0
- data/lib/baidu/map.rb +66 -0
- data/lib/baidu/rank.rb +5 -0
- data/lib/baidu/sem.rb +14 -0
- data/lib/baidu/sem/account.rb +31 -0
- data/lib/baidu/sem/adgroup.rb +55 -0
- data/lib/baidu/sem/base.rb +96 -0
- data/lib/baidu/sem/bulk.rb +75 -0
- data/lib/baidu/sem/campaign.rb +96 -0
- data/lib/baidu/sem/creative.rb +33 -0
- data/lib/baidu/sem/enum.rb +70 -0
- data/lib/baidu/sem/keyword.rb +35 -0
- data/lib/baidu/sem/kr.rb +9 -0
- data/lib/baidu/sem/new_creative.rb +32 -0
- data/lib/baidu/sem/report.rb +107 -0
- data/lib/baidu/sem/search.rb +79 -0
- data/lib/baidu/version.rb +3 -0
- data/lib/ext.rb +46 -0
- data/spec/map_spec.rb +77 -0
- data/spec/sem_adgroup_spec.rb +119 -0
- data/spec/sem_api_response_spec.rb +97 -0
- data/spec/sem_bulk_spec.rb +89 -0
- data/spec/sem_campaign_spec.rb +94 -0
- data/spec/sem_creative_spec.rb +79 -0
- data/spec/sem_keyword_spec.rb +48 -0
- data/spec/sem_report_spec.rb +52 -0
- data/spec/sem_search_spec.rb +80 -0
- data/spec/spec_helper.rb +356 -0
- metadata +87 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f3ab3b16d4f810561f78ee164b2882eaf8f2bae8
|
4
|
+
data.tar.gz: 706fc2deeb2af8c5606eb685f1c85aef45f85127
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: be21dae1919c0f313870b2b0a508304becc13117072bc0c93d7992f9cb992aaba344b30c9af36eee6fea2ec75d96c79a8a94af72430c36468998363987408811
|
7
|
+
data.tar.gz: a3f515b1a9ed7be68eb66437c1038e78ee068304fe1c13f0492c66f23a5bd3bef159f10eb1d2e3c867d946fe921e59375cd4589b6d56ce38d29cf3757e0635c7
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 刘明
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
# Baidu
|
2
|
+
|
3
|
+
Baidu SEM Services
|
4
|
+
Baidu Ranking Services
|
5
|
+
Baidu Map Services
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
gem 'baidu'
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install baidu
|
20
|
+
|
21
|
+
## Knowledge
|
22
|
+
camel命名法,用于request数据格式
|
23
|
+
snake命名法,用于response数据格式
|
24
|
+
|
25
|
+
## Rspec
|
26
|
+
先修改spec/spec_helper.rb
|
27
|
+
|
28
|
+
```ruby
|
29
|
+
BAIDU_MAP_KEY = ''
|
30
|
+
$username = ''
|
31
|
+
$password = ''
|
32
|
+
$token = ''
|
33
|
+
|
34
|
+
```
|
35
|
+
|
36
|
+
## Usage
|
37
|
+
|
38
|
+
SEM
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
require 'baidu'
|
42
|
+
|
43
|
+
$auth = Baidu::Auth.new
|
44
|
+
$auth.username = 'username'
|
45
|
+
$auth.password = 'password'
|
46
|
+
$auth.token = 'token'
|
47
|
+
|
48
|
+
ss = Baidu::SEM::SearchService.new
|
49
|
+
res = ss.getKeywordBySearch({:searchWord=>'word',:searchType=>0})
|
50
|
+
res = ss.getKeywordBySearch({:searchWord=>'word',:searchType=>0},true) #debug=true
|
51
|
+
```
|
52
|
+
|
53
|
+
MAP
|
54
|
+
|
55
|
+
```ruby
|
56
|
+
#返回码 定义 英文返回描述
|
57
|
+
#0 正常 ok
|
58
|
+
#2 请求参数非法 Parameter Invalid
|
59
|
+
#3 权限校验失败 Verify Failure
|
60
|
+
#4 配额校验失败 Quota Failure
|
61
|
+
#5 ak不存在或者非法 AK Failure
|
62
|
+
#2xx 无权限
|
63
|
+
#3xx 配额错误
|
64
|
+
puts map.get_xy_by_poiname('滨海公园','上海')
|
65
|
+
map = Baidumap.new('key')
|
66
|
+
require 'awesome_print'
|
67
|
+
lat = 40.3377039331399
|
68
|
+
lng = 116.647588831718
|
69
|
+
ap map.bus.around(lat,lng).info
|
70
|
+
puts map.get(39.911031821584,116.44931548023).for(1000).bus
|
71
|
+
puts map.geo('22.53','113.38')
|
72
|
+
Baidumap.get_baike('北京站')
|
73
|
+
Baidumap.get_cityid('北京') #=>131
|
74
|
+
```
|
75
|
+
|
76
|
+
## Contributing
|
77
|
+
|
78
|
+
1. Fork it
|
79
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
80
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
81
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
82
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
data/baidu.gemspec
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'baidu/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |spec|
|
7
|
+
spec.name = "baidu"
|
8
|
+
spec.version = Baidu::VERSION
|
9
|
+
spec.authors = ["seoaqua"]
|
10
|
+
spec.email = ["seoaqua@me.com"]
|
11
|
+
spec.description = %q{Baidu Services Pack,including SEM, Map, Ranking and the others}
|
12
|
+
spec.summary = %q{a gem summary}
|
13
|
+
spec.homepage = "http://github.com/seoaqua/baidu"
|
14
|
+
spec.license = "MIT"
|
15
|
+
|
16
|
+
spec.files = `git ls-files`.split($/)
|
17
|
+
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
18
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
|
+
spec.require_paths = ["lib"]
|
20
|
+
|
21
|
+
spec.add_development_dependency "bundler", "~> 1.3"
|
22
|
+
spec.add_development_dependency "rake"
|
23
|
+
spec.add_dependency "httparty"
|
24
|
+
spec.add_dependency "awesome_print"
|
25
|
+
spec.add_dependency "savon","~> 3.0"
|
26
|
+
end
|
data/lib/baidu.rb
CHANGED
@@ -1,483 +1,61 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
class SearchEngine
|
7
|
-
#是否收录
|
8
|
-
def initialize(pagesize = 100)
|
9
|
-
@pagesize = pagesize#只允许10或100
|
10
|
-
end
|
11
|
-
def indexed?(url)
|
12
|
-
URI(url)
|
13
|
-
result = query(url)
|
14
|
-
return result.has_result?
|
15
|
-
end
|
16
|
-
end
|
17
|
-
class SearchResult
|
18
|
-
def initialize(page,baseuri,pagenumber=1,pagesize=100)
|
19
|
-
@page = Nokogiri::HTML page
|
20
|
-
@baseuri = baseuri
|
21
|
-
# @host = URI(baseuri).host
|
22
|
-
@pagenumber = pagenumber
|
23
|
-
@pagesize = pagesize
|
24
|
-
end
|
25
|
-
def whole
|
26
|
-
{
|
27
|
-
'ads_top'=>ads_top,
|
28
|
-
'ads_right'=>ads_right,
|
29
|
-
'ads_bottom'=>ads_bottom,
|
30
|
-
'ranks'=>ranks
|
31
|
-
}
|
32
|
-
end
|
33
|
-
#返回当前页中host满足条件的结果
|
34
|
-
def ranks_for(specific_host)
|
35
|
-
host_ranks = Hash.new
|
36
|
-
ranks.each do |id,line|
|
37
|
-
if specific_host.class == Regexp
|
38
|
-
host_ranks[id] = line if line['host'] =~ specific_host
|
39
|
-
elsif specific_host.class == String
|
40
|
-
host_ranks[id] = line if line['host'] == specific_host
|
41
|
-
end
|
42
|
-
end
|
43
|
-
host_ranks
|
44
|
-
end
|
45
|
-
#return the top rank number from @ranks with the input host
|
46
|
-
def rank(host)#on base of ranks
|
47
|
-
ranks.each do |id,line|
|
48
|
-
id = id.to_i
|
49
|
-
if host.class == Regexp
|
50
|
-
return id if line['host'] =~ host
|
51
|
-
elsif host.class == String
|
52
|
-
return id if line['host'] == host
|
53
|
-
end
|
54
|
-
end
|
55
|
-
return nil
|
56
|
-
end
|
1
|
+
module Baidu
|
2
|
+
module Rank
|
3
|
+
end
|
4
|
+
module SEM
|
5
|
+
end
|
57
6
|
end
|
58
7
|
|
59
|
-
class
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
uri = URI.join("http://#{Host}/",page.request.path).to_s
|
68
|
-
QihooResult.new(page,uri)
|
69
|
-
end
|
70
|
-
def self.related_keywords(wd)
|
71
|
-
url = "http://rs.so.com/?callback=Search.relate.render&encodein=utf-8&encodeout=utf-8&q="+URI.encode(wd)
|
72
|
-
# uri = URI.join("http://#{Host}/",URI.encode('s?q='+wd)).to_s
|
73
|
-
page = HTTParty.get(url)
|
74
|
-
json_str = page.body.split("(")[1].gsub(/\s\);/) {""}
|
75
|
-
parsed_json = JSON.parse(json_str)
|
76
|
-
# each
|
77
|
-
# parsed_json.map { |q| p q['q']}
|
78
|
-
@related_keywords = parsed_json.map { |q| q['q'] }
|
79
|
-
# @related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
|
80
|
-
end
|
8
|
+
class String
|
9
|
+
def snake_case
|
10
|
+
self.gsub(/::/, '/').
|
11
|
+
gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
|
12
|
+
gsub(/([a-z\d])([A-Z])/,'\1_\2').
|
13
|
+
tr("-", "_").
|
14
|
+
downcase
|
15
|
+
end
|
81
16
|
end
|
82
|
-
|
83
|
-
class
|
84
|
-
|
85
|
-
|
86
|
-
def ranks
|
87
|
-
return @ranks unless @ranks.nil?
|
88
|
-
@ranks = Hash.new
|
89
|
-
# id = (@pagenumber - 1) * 10
|
90
|
-
id = 0
|
91
|
-
@page.search('//li[@class="res-list"]').each do |li|
|
92
|
-
a = li.search("h3/a").first
|
93
|
-
url = li.search("cite")
|
94
|
-
next if a['data-pos'].nil?
|
95
|
-
id += 1
|
96
|
-
text = a.text.strip
|
97
|
-
href = a['href']
|
98
|
-
url = url.first.text
|
99
|
-
host = Addressable::URI.parse(URI.encode("http://#{url}")).host
|
100
|
-
@ranks[id.to_s] = {'href'=>a['href'],'text'=>text,'host'=>host}
|
101
|
-
end
|
102
|
-
@ranks
|
17
|
+
class Savon
|
18
|
+
class Response
|
19
|
+
def header
|
20
|
+
hash[:envelope][:header]
|
103
21
|
end
|
104
|
-
def
|
105
|
-
|
106
|
-
result = []
|
107
|
-
@page.search("//ul[@id='djbox']/li").each do |li|
|
108
|
-
id += 1
|
109
|
-
title = li.search("a").first.text
|
110
|
-
href = li.search("cite").first.text.downcase
|
111
|
-
host = Addressable::URI.parse(URI.encode(href)).host
|
112
|
-
result[id] = {'title'=>title,'href'=>href,'host'=>host}
|
113
|
-
end
|
114
|
-
result
|
22
|
+
def res_header
|
23
|
+
header[:res_header]
|
115
24
|
end
|
116
|
-
def
|
117
|
-
|
25
|
+
def desc
|
26
|
+
res_header[:desc]
|
118
27
|
end
|
119
|
-
def
|
120
|
-
|
121
|
-
result = []
|
122
|
-
@page.search("//ul[@id='rightbox']/li").each do |li|
|
123
|
-
id += 1
|
124
|
-
title = li.search("a").first.text
|
125
|
-
href = li.search("cite").first.text.downcase
|
126
|
-
host = Addressable::URI.parse(URI.encode(href)).host
|
127
|
-
result[id] = {'title'=>title,'href'=>href,'host'=>host}
|
128
|
-
end
|
129
|
-
result
|
28
|
+
def quota
|
29
|
+
res_header[:quota]
|
130
30
|
end
|
131
|
-
def
|
132
|
-
|
31
|
+
def rquota
|
32
|
+
res_header[:rquota]
|
133
33
|
end
|
134
|
-
|
135
|
-
|
136
|
-
next_href = @page.xpath('//a[@id="snext"]')
|
137
|
-
return false if next_href.empty?
|
138
|
-
next_href = next_href.first['href']
|
139
|
-
next_href = URI.join(@baseuri,next_href).to_s
|
140
|
-
# next_href = URI.join("http://#{@host}",next_href).to_s
|
141
|
-
next_page = HTTParty.get(next_href).next
|
142
|
-
return QihooResult.new(next_page,next_href,@pagenumber+1)
|
143
|
-
#@page = MbaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
|
34
|
+
def oprs
|
35
|
+
res_header[:oprs]
|
144
36
|
end
|
145
|
-
|
146
|
-
|
147
|
-
!@page.search('//div[@id="main"]/h3').text().include?'没有找到该URL'
|
37
|
+
def oprtime
|
38
|
+
res_header[:oprtime]
|
148
39
|
end
|
149
|
-
|
150
|
-
|
151
|
-
class Mbaidu < SearchEngine
|
152
|
-
BaseUri = 'http://m.baidu.com/s?'
|
153
|
-
headers = {
|
154
|
-
"User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
|
155
|
-
}
|
156
|
-
Options = {:headers => headers}
|
157
|
-
|
158
|
-
#基本查询,相当于从搜索框直接输入关键词查询
|
159
|
-
def query(wd)
|
160
|
-
queryStr = "word=#{wd}"
|
161
|
-
uri = URI.encode((BaseUri + queryStr))
|
162
|
-
begin
|
163
|
-
res = HTTParty.get(uri,Options)
|
164
|
-
MbaiduResult.new(res,uri)
|
165
|
-
rescue Exception => e
|
166
|
-
warn "#{uri} fetch error: #{e.to_s}"
|
167
|
-
return false
|
168
|
-
end
|
40
|
+
def failures
|
41
|
+
res_header[:failures]
|
169
42
|
end
|
170
|
-
|
171
|
-
|
172
|
-
# def initialize(page,baseuri,pagenumber=nil)
|
173
|
-
# @page= Nokogiri::HTML page
|
174
|
-
# @baseuri = baseuri
|
175
|
-
# if pagenumber.nil?
|
176
|
-
# @pagenumber = 1
|
177
|
-
# else
|
178
|
-
# @pagenumber = pagenumber
|
179
|
-
# end
|
180
|
-
# end
|
181
|
-
|
182
|
-
#返回当前页所有查询结果
|
183
|
-
def ranks
|
184
|
-
#如果已经赋值说明解析过,不需要重新解析,直接返回结果
|
185
|
-
return @ranks unless @ranks.nil?
|
186
|
-
@ranks = Hash.new
|
187
|
-
@page.xpath('//div[@class="result"]').each do |result|
|
188
|
-
href,text,host,is_mobile = '','','',false
|
189
|
-
a = result.search("a").first
|
190
|
-
is_mobile = true unless a.search("img").empty?
|
191
|
-
host = result.search('[@class="site"]').first
|
192
|
-
next if host.nil?
|
193
|
-
host = host.text
|
194
|
-
href = a['href']
|
195
|
-
text = a.text
|
196
|
-
id = href.scan(/&order=(\d+)&/)
|
197
|
-
if id.empty?
|
198
|
-
id = nil
|
199
|
-
else
|
200
|
-
id = id.first.first.to_i
|
201
|
-
# id = (@pagenumber-1)*10+id
|
202
|
-
end
|
203
|
-
=begin
|
204
|
-
result.children.each do |elem|
|
205
|
-
if elem.name == 'a'
|
206
|
-
href = elem['href']
|
207
|
-
id = elem.text.match(/^\d+/).to_s.to_i
|
208
|
-
text = elem.text.sub(/^\d+/,'')
|
209
|
-
text.sub!(/^\u00A0/,'')
|
210
|
-
elsif elem['class'] == 'abs'
|
211
|
-
elem.children.each do |elem2|
|
212
|
-
if elem2['class'] == 'site'
|
213
|
-
host = elem2.text
|
214
|
-
break
|
215
|
-
end
|
216
|
-
end
|
217
|
-
elsif elem['class'] == 'site'
|
218
|
-
host == elem['href']
|
219
|
-
end
|
220
|
-
end
|
221
|
-
=end
|
222
|
-
|
223
|
-
@ranks[id.to_s] = {'href'=>href,'text'=>text,'is_mobile'=>is_mobile,'host'=>host.sub(/\u00A0/,'')}
|
224
|
-
end
|
225
|
-
@ranks
|
226
|
-
end
|
227
|
-
def ads_top
|
228
|
-
id = 0
|
229
|
-
result = []
|
230
|
-
@page.search("div[@class='ec_wise_ad']/div").each do |div|
|
231
|
-
id += 1
|
232
|
-
href = div.search("span[@class='ec_site']").first.text
|
233
|
-
href = "http://#{href}"
|
234
|
-
title = div.search("a/text()").text.strip
|
235
|
-
host = Addressable::URI.parse(URI.encode(href)).host
|
236
|
-
result[id] = {'title'=>title,'href'=>href,'host'=>host}
|
237
|
-
end
|
238
|
-
result
|
239
|
-
end
|
240
|
-
def ads_right
|
241
|
-
[]
|
242
|
-
end
|
243
|
-
def ads_bottom
|
244
|
-
[]
|
245
|
-
end
|
246
|
-
def related_keywords
|
247
|
-
@related_keywords ||= @page.search("div[@class='relativewords_info']/a").map{|a|a.text}
|
248
|
-
end
|
249
|
-
=begin
|
250
|
-
#返回当前页中,符合host条件的结果
|
251
|
-
def ranks_for(specific_host)
|
252
|
-
host_ranks = Hash.new
|
253
|
-
ranks.each do |id,line|
|
254
|
-
if specific_host.class == Regexp
|
255
|
-
host_ranks[id] = line if line['host'] =~ specific_host
|
256
|
-
elsif specific_host.class == String
|
257
|
-
host_ranks[id] = line if line['host'] == specific_host
|
258
|
-
end
|
259
|
-
end
|
260
|
-
host_ranks
|
43
|
+
def code
|
44
|
+
failures[:code] if failures
|
261
45
|
end
|
262
|
-
|
263
|
-
|
264
|
-
ranks.each do |id,line|
|
265
|
-
id = id.to_i
|
266
|
-
if host.class == Regexp
|
267
|
-
return id if line['host'] =~ host
|
268
|
-
elsif host.class == String
|
269
|
-
return id if line['host'] == host
|
270
|
-
end
|
271
|
-
end
|
272
|
-
return nil
|
46
|
+
def message
|
47
|
+
failures[:message] if failures
|
273
48
|
end
|
274
|
-
|
275
|
-
|
276
|
-
def next
|
277
|
-
nextbutton = @page.xpath('//a[text()="下一页"]').first
|
278
|
-
return nil if nextbutton.nil?
|
279
|
-
url = nextbutton['href']
|
280
|
-
url = URI.join(@baseuri,url).to_s
|
281
|
-
page = HTTParty.get(url)
|
282
|
-
return MbaiduResult.new(page,url,@pagenumber+1)
|
49
|
+
def status
|
50
|
+
res_header[:status]
|
283
51
|
end
|
52
|
+
end
|
284
53
|
end
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
def url(id)
|
294
|
-
a = Mechanize.new
|
295
|
-
a.redirect_ok=false
|
296
|
-
return a.head("http://www.baidu.com/link?url=#{id}").header['location']
|
297
|
-
end
|
298
|
-
|
299
|
-
=begin
|
300
|
-
def extend(words,level=3,sleeptime=1)
|
301
|
-
level = level.to_i - 1
|
302
|
-
words = [words] unless words.respond_to? 'each'
|
303
|
-
|
304
|
-
extensions = Array.new
|
305
|
-
words.each do |word|
|
306
|
-
self.query(word)
|
307
|
-
extensions += related_keywords
|
308
|
-
extensions += suggestions(word)
|
309
|
-
sleep sleeptime
|
310
|
-
end
|
311
|
-
extensions.uniq!
|
312
|
-
return extensions if level < 1
|
313
|
-
return extensions + extend(extensions,level)
|
314
|
-
end
|
315
|
-
=end
|
316
|
-
|
317
|
-
def popular?(wd)
|
318
|
-
return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").include?"boxFlash"
|
319
|
-
end
|
320
|
-
|
321
|
-
def query(wd)
|
322
|
-
q = Array.new
|
323
|
-
q << "wd=#{wd}"
|
324
|
-
q << "rn=#{@perpage}"
|
325
|
-
queryStr = q.join("&")
|
326
|
-
#uri = URI.encode((BaseUri + queryStr).encode('GBK'))
|
327
|
-
uri = URI.encode((BaseUri + queryStr))
|
328
|
-
begin
|
329
|
-
# @page = @a.get uri
|
330
|
-
@page = HTTParty.get uri
|
331
|
-
BaiduResult.new(@page,uri,1,@pagesize)
|
332
|
-
rescue Exception => e
|
333
|
-
warn e.to_s
|
334
|
-
return false
|
335
|
-
end
|
336
|
-
=begin
|
337
|
-
query = "#{query}"
|
338
|
-
@uri = BaseUri+URI.encode(query.encode('GBK'))
|
339
|
-
@page = @a.get @uri
|
340
|
-
self.clean
|
341
|
-
@number = self.how_many
|
342
|
-
@maxpage = (@number / @perpage.to_f).round
|
343
|
-
@maxpage =10 if @maxpage>10
|
344
|
-
@currpage =0
|
345
|
-
=end
|
346
|
-
end
|
347
|
-
|
348
|
-
#site:xxx.yyy.com
|
349
|
-
def how_many_pages(host)
|
350
|
-
query("site:#{host}").how_many
|
351
|
-
end
|
352
|
-
|
353
|
-
#domain:xxx.yyy.com/path/file.html
|
354
|
-
def how_many_links(uri)
|
355
|
-
query("domain:\"#{uri}\"").how_many
|
356
|
-
end
|
357
|
-
|
358
|
-
#site:xxx.yyy.com inurl:zzz
|
359
|
-
def how_many_pages_with(host,string)
|
360
|
-
query("site:#{host} inurl:#{string}").how_many
|
361
|
-
end
|
362
|
-
end
|
363
|
-
|
364
|
-
class BaiduResult < SearchResult
|
365
|
-
# def initialize(page,baseuri,pagenumber=1,pagesize=100)
|
366
|
-
# @page = Nokogiri::HTML page
|
367
|
-
# @baseuri = baseuri
|
368
|
-
# @pagenumber = pagenumber
|
369
|
-
# @pagesize = pagesize
|
370
|
-
# # raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
|
371
|
-
# # @page = page
|
372
|
-
# end
|
373
|
-
def ranks
|
374
|
-
return @ranks unless @ranks.nil?
|
375
|
-
@ranks = Hash.new
|
376
|
-
@page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
|
377
|
-
id = table['id']
|
378
|
-
if @pagesize == 10
|
379
|
-
id = table['id'][-1,1]
|
380
|
-
id = '10' if id == '0'
|
381
|
-
end
|
382
|
-
|
383
|
-
@ranks[id] = Hash.new
|
384
|
-
url = table.search("[@class=\"g\"]").first
|
385
|
-
url = url.text unless url.nil?
|
386
|
-
a = table.search("h3").first
|
387
|
-
next if a.nil?
|
388
|
-
@ranks[id]['text'] = a.text
|
389
|
-
@ranks[id]['href'] = url #a.first['href'].sub('http://www.baidu.com/link?url=','').strip
|
390
|
-
unless url.nil?
|
391
|
-
url = url.strip
|
392
|
-
@ranks[id]['host'] = Addressable::URI.parse(URI.encode("http://#{url}")).host
|
393
|
-
else
|
394
|
-
@ranks[id]['host'] = nil
|
395
|
-
end
|
396
|
-
end
|
397
|
-
#@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
|
398
|
-
@ranks
|
399
|
-
end
|
400
|
-
|
401
|
-
def ads_bottom
|
402
|
-
return {} if @page.search("//table[@bgcolor='f5f5f5']").empty?
|
403
|
-
return ads_top
|
404
|
-
# p @page.search("//table[@bgcolor='f5f5f5']").empty?
|
405
|
-
end
|
406
|
-
def ads_top
|
407
|
-
#灰色底推广,上下都有
|
408
|
-
ads = Hash.new
|
409
|
-
@page.search("//table[@bgcolor='#f5f5f5']").each do |table|
|
410
|
-
id = table['id']
|
411
|
-
next if id.nil?
|
412
|
-
id = id[2,3].to_i.to_s
|
413
|
-
ads[id]= parse_ad(table)
|
414
|
-
end
|
415
|
-
#白色底推广,只有上部分
|
416
|
-
if ads.empty?
|
417
|
-
@page.search("//table").each do |table|
|
418
|
-
id = table['id']
|
419
|
-
next if id.nil? or id.to_i<3000
|
420
|
-
id = id[2,3].to_i.to_s
|
421
|
-
ads[id]= parse_ad(table)
|
422
|
-
end
|
423
|
-
end
|
424
|
-
ads
|
425
|
-
end
|
426
|
-
def parse_ad(table)
|
427
|
-
href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
|
428
|
-
title = table.search("a").first.text.strip
|
429
|
-
{'title'=>title,'href' => href,'host'=>href}
|
430
|
-
end
|
431
|
-
def ads_right
|
432
|
-
ads = {}
|
433
|
-
@page.search("//div[@id='ec_im_container']").each do |table|
|
434
|
-
table.search("div[@id]").each do |div|
|
435
|
-
id = div['id'][-1,1].to_i+1
|
436
|
-
title = div.search("a").first
|
437
|
-
next if title.nil?
|
438
|
-
title = title.text
|
439
|
-
url = div.search("font[@color='#008000']").first
|
440
|
-
next if url.nil?
|
441
|
-
url = url.text
|
442
|
-
ads[id.to_s] = {'title'=>title,'href'=>url,'host'=>url}
|
443
|
-
end
|
444
|
-
end
|
445
|
-
ads
|
446
|
-
end
|
447
|
-
|
448
|
-
#return the top rank number from @ranks with the input host
|
449
|
-
# def rank(host)#on base of ranks
|
450
|
-
# ranks.each do |id,line|
|
451
|
-
# id = id.to_i
|
452
|
-
# if host.class == Regexp
|
453
|
-
# return id if line['host'] =~ host
|
454
|
-
# elsif host.class == String
|
455
|
-
# return id if line['host'] == host
|
456
|
-
# end
|
457
|
-
# end
|
458
|
-
# return nil
|
459
|
-
# end
|
460
|
-
|
461
|
-
def how_many
|
462
|
-
@how_many ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
|
463
|
-
end
|
464
|
-
|
465
|
-
def related_keywords
|
466
|
-
@related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
|
467
|
-
end
|
468
|
-
|
469
|
-
def next
|
470
|
-
url = @page.xpath('//a[text()="下一页>"]').first
|
471
|
-
return if url.nil?
|
472
|
-
url = url['href']
|
473
|
-
url = URI.join(@baseuri,url).to_s
|
474
|
-
page = HTTParty.get(url)
|
475
|
-
return BaiduResult.new(page,url,@pagenumber+1,@pagesize)
|
476
|
-
# @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
|
477
|
-
end
|
478
|
-
def has_result?
|
479
|
-
submit = @page.search('//a[text()="提交网址"]').first
|
480
|
-
return false if submit and submit['href'].include?'sitesubmit'
|
481
|
-
return true
|
482
|
-
end
|
483
|
-
end
|
54
|
+
require "baidu/version"
|
55
|
+
require "baidu/map"
|
56
|
+
require "baidu/sem"
|
57
|
+
require "baidu/rank"
|
58
|
+
require "baidu/auth"
|
59
|
+
require "baidu/response"
|
60
|
+
require "ext"
|
61
|
+
require "awesome_print"
|