baidu 1.2.10 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +82 -0
- data/Rakefile +1 -0
- data/baidu.gemspec +26 -0
- data/lib/baidu.rb +46 -468
- data/lib/baidu/auth.rb +5 -0
- data/lib/baidu/map.rb +66 -0
- data/lib/baidu/rank.rb +5 -0
- data/lib/baidu/sem.rb +14 -0
- data/lib/baidu/sem/account.rb +31 -0
- data/lib/baidu/sem/adgroup.rb +55 -0
- data/lib/baidu/sem/base.rb +96 -0
- data/lib/baidu/sem/bulk.rb +75 -0
- data/lib/baidu/sem/campaign.rb +96 -0
- data/lib/baidu/sem/creative.rb +33 -0
- data/lib/baidu/sem/enum.rb +70 -0
- data/lib/baidu/sem/keyword.rb +35 -0
- data/lib/baidu/sem/kr.rb +9 -0
- data/lib/baidu/sem/new_creative.rb +32 -0
- data/lib/baidu/sem/report.rb +107 -0
- data/lib/baidu/sem/search.rb +79 -0
- data/lib/baidu/version.rb +3 -0
- data/lib/ext.rb +46 -0
- data/spec/map_spec.rb +77 -0
- data/spec/sem_adgroup_spec.rb +119 -0
- data/spec/sem_api_response_spec.rb +97 -0
- data/spec/sem_bulk_spec.rb +89 -0
- data/spec/sem_campaign_spec.rb +94 -0
- data/spec/sem_creative_spec.rb +79 -0
- data/spec/sem_keyword_spec.rb +48 -0
- data/spec/sem_report_spec.rb +52 -0
- data/spec/sem_search_spec.rb +80 -0
- data/spec/spec_helper.rb +356 -0
- metadata +87 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f3ab3b16d4f810561f78ee164b2882eaf8f2bae8
+  data.tar.gz: 706fc2deeb2af8c5606eb685f1c85aef45f85127
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: be21dae1919c0f313870b2b0a508304becc13117072bc0c93d7992f9cb992aaba344b30c9af36eee6fea2ec75d96c79a8a94af72430c36468998363987408811
+  data.tar.gz: a3f515b1a9ed7be68eb66437c1038e78ee068304fe1c13f0492c66f23a5bd3bef159f10eb1d2e3c867d946fe921e59375cd4589b6d56ce38d29cf3757e0635c7
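For anyone auditing the release, the digests above can be cross-checked against a locally downloaded copy of the gem. A verification sketch (the filename `baidu-2.0.0.gem` and the choice to check only `data.tar.gz` are illustrative):

```ruby
# Recompute the SHA512 of data.tar.gz inside a downloaded .gem file
# (a .gem is a plain tar archive holding metadata.gz and data.tar.gz)
# and compare the result with the checksums.yaml value shown above.
require 'digest'
require 'rubygems/package'

File.open('baidu-2.0.0.gem', 'rb') do |gem_file|
  Gem::Package::TarReader.new(gem_file).each do |entry|
    next unless entry.full_name == 'data.tar.gz'
    puts Digest::SHA512.hexdigest(entry.read)
  end
end
```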
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
+Copyright (c) 2013 刘明
+
+MIT License
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,82 @@
+# Baidu
+
+Baidu SEM Services
+Baidu Ranking Services
+Baidu Map Services
+
+## Installation
+
+Add this line to your application's Gemfile:
+
+    gem 'baidu'
+
+And then execute:
+
+    $ bundle
+
+Or install it yourself as:
+
+    $ gem install baidu
+
+## Knowledge
+camelCase naming is used for the request data format
+snake_case naming is used for the response data format
+
+## Rspec
+First edit spec/spec_helper.rb:
+
+```ruby
+BAIDU_MAP_KEY = ''
+$username = ''
+$password = ''
+$token = ''
+
+```
+
+## Usage
+
+SEM
+
+```ruby
+require 'baidu'
+
+$auth = Baidu::Auth.new
+$auth.username = 'username'
+$auth.password = 'password'
+$auth.token = 'token'
+
+ss = Baidu::SEM::SearchService.new
+res = ss.getKeywordBySearch({:searchWord=>'word',:searchType=>0})
+res = ss.getKeywordBySearch({:searchWord=>'word',:searchType=>0},true) #debug=true
+```
+
+MAP
+
+```ruby
+# Return code   Meaning                      English description
+# 0             OK                           ok
+# 2             invalid request parameters   Parameter Invalid
+# 3             permission check failed      Verify Failure
+# 4             quota check failed           Quota Failure
+# 5             AK missing or invalid        AK Failure
+# 2xx           no permission
+# 3xx           quota error
+require 'awesome_print'
+map = Baidumap.new('key')
+puts map.get_xy_by_poiname('滨海公园','上海')
+lat = 40.3377039331399
+lng = 116.647588831718
+ap map.bus.around(lat,lng).info
+puts map.get(39.911031821584,116.44931548023).for(1000).bus
+puts map.geo('22.53','113.38')
+Baidumap.get_baike('北京站')
+Baidumap.get_cityid('北京') #=>131
+```
+
+## Contributing
+
+1. Fork it
+2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Commit your changes (`git commit -am 'Add some feature'`)
+4. Push to the branch (`git push origin my-new-feature`)
+5. Create new Pull Request
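The camelCase-request / snake_case-response convention called out under "Knowledge" is backed by the `String#snake_case` helper added in `data/lib/baidu.rb` (shown further down). A minimal illustration, not taken from the package:

```ruby
require 'baidu'

# camelCase names go out in requests; the same names come back snake_cased.
'getKeywordBySearch'.snake_case  #=> "get_keyword_by_search"
'searchWord'.snake_case          #=> "search_word"
```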
data/Rakefile
ADDED
@@ -0,0 +1 @@
+require "bundler/gem_tasks"
data/baidu.gemspec
ADDED
@@ -0,0 +1,26 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'baidu/version'
+
+Gem::Specification.new do |spec|
+  spec.name          = "baidu"
+  spec.version       = Baidu::VERSION
+  spec.authors       = ["seoaqua"]
+  spec.email         = ["seoaqua@me.com"]
+  spec.description   = %q{Baidu Services Pack,including SEM, Map, Ranking and the others}
+  spec.summary       = %q{a gem summary}
+  spec.homepage      = "http://github.com/seoaqua/baidu"
+  spec.license       = "MIT"
+
+  spec.files         = `git ls-files`.split($/)
+  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+  spec.require_paths = ["lib"]
+
+  spec.add_development_dependency "bundler", "~> 1.3"
+  spec.add_development_dependency "rake"
+  spec.add_dependency "httparty"
+  spec.add_dependency "awesome_print"
+  spec.add_dependency "savon","~> 3.0"
+end
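Since `httparty`, `awesome_print`, and `savon` (~> 3.0) are declared as runtime dependencies, a consuming application only needs to list the gem itself. A Gemfile sketch (the source URL is the usual default, not something the gemspec prescribes):

```ruby
# Gemfile sketch for an application depending on baidu 2.0.0;
# httparty, awesome_print, and savon (~> 3.0) are pulled in transitively.
source 'https://rubygems.org'

gem 'baidu', '2.0.0'
```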
data/lib/baidu.rb
CHANGED
@@ -1,483 +1,61 @@
-
-
-
-
-
-class SearchEngine
-  # whether a URL is indexed
-  def initialize(pagesize = 100)
-    @pagesize = pagesize # only 10 or 100 allowed
-  end
-  def indexed?(url)
-    URI(url)
-    result = query(url)
-    return result.has_result?
-  end
-end
-class SearchResult
-  def initialize(page,baseuri,pagenumber=1,pagesize=100)
-    @page = Nokogiri::HTML page
-    @baseuri = baseuri
-    # @host = URI(baseuri).host
-    @pagenumber = pagenumber
-    @pagesize = pagesize
-  end
-  def whole
-    {
-      'ads_top'=>ads_top,
-      'ads_right'=>ads_right,
-      'ads_bottom'=>ads_bottom,
-      'ranks'=>ranks
-    }
-  end
-  # return results on the current page whose host matches the condition
-  def ranks_for(specific_host)
-    host_ranks = Hash.new
-    ranks.each do |id,line|
-      if specific_host.class == Regexp
-        host_ranks[id] = line if line['host'] =~ specific_host
-      elsif specific_host.class == String
-        host_ranks[id] = line if line['host'] == specific_host
-      end
-    end
-    host_ranks
-  end
-  #return the top rank number from @ranks with the input host
-  def rank(host)#on base of ranks
-    ranks.each do |id,line|
-      id = id.to_i
-      if host.class == Regexp
-        return id if line['host'] =~ host
-      elsif host.class == String
-        return id if line['host'] == host
-      end
-    end
-    return nil
-  end
+module Baidu
+  module Rank
+  end
+  module SEM
+  end
 end
 
-class
-
-
-
-
-
-
-
-    uri = URI.join("http://#{Host}/",page.request.path).to_s
-    QihooResult.new(page,uri)
-  end
-  def self.related_keywords(wd)
-    url = "http://rs.so.com/?callback=Search.relate.render&encodein=utf-8&encodeout=utf-8&q="+URI.encode(wd)
-    # uri = URI.join("http://#{Host}/",URI.encode('s?q='+wd)).to_s
-    page = HTTParty.get(url)
-    json_str = page.body.split("(")[1].gsub(/\s\);/) {""}
-    parsed_json = JSON.parse(json_str)
-    # each
-    # parsed_json.map { |q| p q['q']}
-    @related_keywords = parsed_json.map { |q| q['q'] }
-    # @related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
-  end
+class String
+  def snake_case
+    self.gsub(/::/, '/').
+    gsub(/([A-Z]+)([A-Z][a-z])/,'\1_\2').
+    gsub(/([a-z\d])([A-Z])/,'\1_\2').
+    tr("-", "_").
+    downcase
+  end
 end
-
-class
-
-
-  def ranks
-    return @ranks unless @ranks.nil?
-    @ranks = Hash.new
-    # id = (@pagenumber - 1) * 10
-    id = 0
-    @page.search('//li[@class="res-list"]').each do |li|
-      a = li.search("h3/a").first
-      url = li.search("cite")
-      next if a['data-pos'].nil?
-      id += 1
-      text = a.text.strip
-      href = a['href']
-      url = url.first.text
-      host = Addressable::URI.parse(URI.encode("http://#{url}")).host
-      @ranks[id.to_s] = {'href'=>a['href'],'text'=>text,'host'=>host}
-    end
-    @ranks
+class Savon
+  class Response
+    def header
+      hash[:envelope][:header]
     end
-  def
-
-    result = []
-    @page.search("//ul[@id='djbox']/li").each do |li|
-      id += 1
-      title = li.search("a").first.text
-      href = li.search("cite").first.text.downcase
-      host = Addressable::URI.parse(URI.encode(href)).host
-      result[id] = {'title'=>title,'href'=>href,'host'=>host}
-    end
-    result
+    def res_header
+      header[:res_header]
     end
-  def
-
+    def desc
+      res_header[:desc]
     end
-  def
-
-    result = []
-    @page.search("//ul[@id='rightbox']/li").each do |li|
-      id += 1
-      title = li.search("a").first.text
-      href = li.search("cite").first.text.downcase
-      host = Addressable::URI.parse(URI.encode(href)).host
-      result[id] = {'title'=>title,'href'=>href,'host'=>host}
-    end
-    result
+    def quota
+      res_header[:quota]
     end
-  def
-
+    def rquota
+      res_header[:rquota]
     end
-
-
-    next_href = @page.xpath('//a[@id="snext"]')
-    return false if next_href.empty?
-    next_href = next_href.first['href']
-    next_href = URI.join(@baseuri,next_href).to_s
-    # next_href = URI.join("http://#{@host}",next_href).to_s
-    next_page = HTTParty.get(next_href).next
-    return QihooResult.new(next_page,next_href,@pagenumber+1)
-    #@page = MbaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
+    def oprs
+      res_header[:oprs]
     end
-
-
-    !@page.search('//div[@id="main"]/h3').text().include?'没有找到该URL'
+    def oprtime
+      res_header[:oprtime]
     end
-
-
-class Mbaidu < SearchEngine
-  BaseUri = 'http://m.baidu.com/s?'
-  headers = {
-    "User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
-  }
-  Options = {:headers => headers}
-
-  # basic query, equivalent to typing the keyword directly into the search box
-  def query(wd)
-    queryStr = "word=#{wd}"
-    uri = URI.encode((BaseUri + queryStr))
-    begin
-      res = HTTParty.get(uri,Options)
-      MbaiduResult.new(res,uri)
-    rescue Exception => e
-      warn "#{uri} fetch error: #{e.to_s}"
-      return false
-    end
+    def failures
+      res_header[:failures]
     end
-
-
-  # def initialize(page,baseuri,pagenumber=nil)
-  # @page= Nokogiri::HTML page
-  # @baseuri = baseuri
-  # if pagenumber.nil?
-  # @pagenumber = 1
-  # else
-  # @pagenumber = pagenumber
-  # end
-  # end
-
-  # return all results on the current page
-  def ranks
-    # if already set, the page has been parsed; return the cached result
-    return @ranks unless @ranks.nil?
-    @ranks = Hash.new
-    @page.xpath('//div[@class="result"]').each do |result|
-      href,text,host,is_mobile = '','','',false
-      a = result.search("a").first
-      is_mobile = true unless a.search("img").empty?
-      host = result.search('[@class="site"]').first
-      next if host.nil?
-      host = host.text
-      href = a['href']
-      text = a.text
-      id = href.scan(/&order=(\d+)&/)
-      if id.empty?
-        id = nil
-      else
-        id = id.first.first.to_i
-        # id = (@pagenumber-1)*10+id
-      end
-=begin
-      result.children.each do |elem|
-        if elem.name == 'a'
-          href = elem['href']
-          id = elem.text.match(/^\d+/).to_s.to_i
-          text = elem.text.sub(/^\d+/,'')
-          text.sub!(/^\u00A0/,'')
-        elsif elem['class'] == 'abs'
-          elem.children.each do |elem2|
-            if elem2['class'] == 'site'
-              host = elem2.text
-              break
-            end
-          end
-        elsif elem['class'] == 'site'
-          host == elem['href']
-        end
-      end
-=end
-
-      @ranks[id.to_s] = {'href'=>href,'text'=>text,'is_mobile'=>is_mobile,'host'=>host.sub(/\u00A0/,'')}
-    end
-    @ranks
-  end
-  def ads_top
-    id = 0
-    result = []
-    @page.search("div[@class='ec_wise_ad']/div").each do |div|
-      id += 1
-      href = div.search("span[@class='ec_site']").first.text
-      href = "http://#{href}"
-      title = div.search("a/text()").text.strip
-      host = Addressable::URI.parse(URI.encode(href)).host
-      result[id] = {'title'=>title,'href'=>href,'host'=>host}
-    end
-    result
-  end
-  def ads_right
-    []
-  end
-  def ads_bottom
-    []
-  end
-  def related_keywords
-    @related_keywords ||= @page.search("div[@class='relativewords_info']/a").map{|a|a.text}
-  end
-=begin
-  # return results on the current page whose host matches the condition
-  def ranks_for(specific_host)
-    host_ranks = Hash.new
-    ranks.each do |id,line|
-      if specific_host.class == Regexp
-        host_ranks[id] = line if line['host'] =~ specific_host
-      elsif specific_host.class == String
-        host_ranks[id] = line if line['host'] == specific_host
-      end
-    end
-    host_ranks
+    def code
+      failures[:code] if failures
     end
-
-
-    ranks.each do |id,line|
-      id = id.to_i
-      if host.class == Regexp
-        return id if line['host'] =~ host
-      elsif host.class == String
-        return id if line['host'] == host
-      end
-    end
-    return nil
+    def message
+      failures[:message] if failures
     end
-
-
-  def next
-    nextbutton = @page.xpath('//a[text()="下一页"]').first
-    return nil if nextbutton.nil?
-    url = nextbutton['href']
-    url = URI.join(@baseuri,url).to_s
-    page = HTTParty.get(url)
-    return MbaiduResult.new(page,url,@pagenumber+1)
+    def status
+      res_header[:status]
     end
+  end
 end
-
-
-
-
-
-
-
-
-  def url(id)
-    a = Mechanize.new
-    a.redirect_ok=false
-    return a.head("http://www.baidu.com/link?url=#{id}").header['location']
-  end
-
-=begin
-  def extend(words,level=3,sleeptime=1)
-    level = level.to_i - 1
-    words = [words] unless words.respond_to? 'each'
-
-    extensions = Array.new
-    words.each do |word|
-      self.query(word)
-      extensions += related_keywords
-      extensions += suggestions(word)
-      sleep sleeptime
-    end
-    extensions.uniq!
-    return extensions if level < 1
-    return extensions + extend(extensions,level)
-  end
-=end
-
-  def popular?(wd)
-    return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").include?"boxFlash"
-  end
-
-  def query(wd)
-    q = Array.new
-    q << "wd=#{wd}"
-    q << "rn=#{@perpage}"
-    queryStr = q.join("&")
-    #uri = URI.encode((BaseUri + queryStr).encode('GBK'))
-    uri = URI.encode((BaseUri + queryStr))
-    begin
-      # @page = @a.get uri
-      @page = HTTParty.get uri
-      BaiduResult.new(@page,uri,1,@pagesize)
-    rescue Exception => e
-      warn e.to_s
-      return false
-    end
-=begin
-    query = "#{query}"
-    @uri = BaseUri+URI.encode(query.encode('GBK'))
-    @page = @a.get @uri
-    self.clean
-    @number = self.how_many
-    @maxpage = (@number / @perpage.to_f).round
-    @maxpage =10 if @maxpage>10
-    @currpage =0
-=end
-  end
-
-  #site:xxx.yyy.com
-  def how_many_pages(host)
-    query("site:#{host}").how_many
-  end
-
-  #domain:xxx.yyy.com/path/file.html
-  def how_many_links(uri)
-    query("domain:\"#{uri}\"").how_many
-  end
-
-  #site:xxx.yyy.com inurl:zzz
-  def how_many_pages_with(host,string)
-    query("site:#{host} inurl:#{string}").how_many
-  end
-end
-
-class BaiduResult < SearchResult
-  # def initialize(page,baseuri,pagenumber=1,pagesize=100)
-  # @page = Nokogiri::HTML page
-  # @baseuri = baseuri
-  # @pagenumber = pagenumber
-  # @pagesize = pagesize
-  # # raise ArgumentError 'should be Mechanize::Page' unless page.class == Mechanize::Page
-  # # @page = page
-  # end
-  def ranks
-    return @ranks unless @ranks.nil?
-    @ranks = Hash.new
-    @page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
-      id = table['id']
-      if @pagesize == 10
-        id = table['id'][-1,1]
-        id = '10' if id == '0'
-      end
-
-      @ranks[id] = Hash.new
-      url = table.search("[@class=\"g\"]").first
-      url = url.text unless url.nil?
-      a = table.search("h3").first
-      next if a.nil?
-      @ranks[id]['text'] = a.text
-      @ranks[id]['href'] = url #a.first['href'].sub('http://www.baidu.com/link?url=','').strip
-      unless url.nil?
-        url = url.strip
-        @ranks[id]['host'] = Addressable::URI.parse(URI.encode("http://#{url}")).host
-      else
-        @ranks[id]['host'] = nil
-      end
-    end
-    #@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
-    @ranks
-  end
-
-  def ads_bottom
-    return {} if @page.search("//table[@bgcolor='f5f5f5']").empty?
-    return ads_top
-    # p @page.search("//table[@bgcolor='f5f5f5']").empty?
-  end
-  def ads_top
-    # gray-background ads, shown at both top and bottom
-    ads = Hash.new
-    @page.search("//table[@bgcolor='#f5f5f5']").each do |table|
-      id = table['id']
-      next if id.nil?
-      id = id[2,3].to_i.to_s
-      ads[id]= parse_ad(table)
-    end
-    # white-background ads, top only
-    if ads.empty?
-      @page.search("//table").each do |table|
-        id = table['id']
-        next if id.nil? or id.to_i<3000
-        id = id[2,3].to_i.to_s
-        ads[id]= parse_ad(table)
-      end
-    end
-    ads
-  end
-  def parse_ad(table)
-    href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
-    title = table.search("a").first.text.strip
-    {'title'=>title,'href' => href,'host'=>href}
-  end
-  def ads_right
-    ads = {}
-    @page.search("//div[@id='ec_im_container']").each do |table|
-      table.search("div[@id]").each do |div|
-        id = div['id'][-1,1].to_i+1
-        title = div.search("a").first
-        next if title.nil?
-        title = title.text
-        url = div.search("font[@color='#008000']").first
-        next if url.nil?
-        url = url.text
-        ads[id.to_s] = {'title'=>title,'href'=>url,'host'=>url}
-      end
-    end
-    ads
-  end
-
-  #return the top rank number from @ranks with the input host
-  # def rank(host)#on base of ranks
-  # ranks.each do |id,line|
-  # id = id.to_i
-  # if host.class == Regexp
-  # return id if line['host'] =~ host
-  # elsif host.class == String
-  # return id if line['host'] == host
-  # end
-  # end
-  # return nil
-  # end
-
-  def how_many
-    @how_many ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
-  end
-
-  def related_keywords
-    @related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
-  end
-
-  def next
-    url = @page.xpath('//a[text()="下一页>"]').first
-    return if url.nil?
-    url = url['href']
-    url = URI.join(@baseuri,url).to_s
-    page = HTTParty.get(url)
-    return BaiduResult.new(page,url,@pagenumber+1,@pagesize)
-    # @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
-  end
-  def has_result?
-    submit = @page.search('//a[text()="提交网址"]').first
-    return false if submit and submit['href'].include?'sitesubmit'
-    return true
-  end
-end
+require "baidu/version"
+require "baidu/map"
+require "baidu/sem"
+require "baidu/rank"
+require "baidu/auth"
+require "baidu/response"
+require "ext"
+require "awesome_print"
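The `Savon::Response` patch above exposes Baidu SEM's SOAP response header as plain accessors. A hedged usage sketch, assuming (as the README suggests) that the SEM service calls return a Savon response object; the credentials and search word are placeholders:

```ruby
require 'baidu'

$auth = Baidu::Auth.new
$auth.username = 'username'
$auth.password = 'password'
$auth.token    = 'token'

ss  = Baidu::SEM::SearchService.new
res = ss.getKeywordBySearch({:searchWord => 'word', :searchType => 0})

# The accessors patched onto Savon::Response in this release read the
# res_header block of the SOAP envelope:
puts res.status    # ResHeader status field
puts res.quota     # ResHeader quota field
puts res.desc      # ResHeader desc field
puts res.message   # failure message; nil when no failures were reported
```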