query 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +23 -0
- data/Gemfile +4 -0
- data/LICENSE +20 -0
- data/LICENSE.txt +22 -0
- data/README.md +24 -0
- data/Rakefile +1 -0
- data/lib/query/engine/baidu.rb +87 -0
- data/lib/query/engine/baidu_mobile.rb +26 -0
- data/lib/query/engine/base.rb +16 -0
- data/lib/query/engine/qihoo.rb +32 -0
- data/lib/query/engine/qihoo_mobile.rb +2 -0
- data/lib/query/engine.rb +10 -0
- data/lib/query/result/baidu.rb +121 -0
- data/lib/query/result/baidu_mobile.rb +114 -0
- data/lib/query/result/base.rb +50 -0
- data/lib/query/result/qihoo.rb +75 -0
- data/lib/query/result/qihoo_mobile.rb +6 -0
- data/lib/query/result.rb +10 -0
- data/lib/query/version.rb +3 -0
- data/lib/query.rb +9 -0
- data/query.gemspec +27 -0
- data/spec/baidu_mobile_spec.rb +19 -0
- data/spec/baidu_spec.rb +73 -0
- data/spec/qihoo_spec.rb +27 -0
- data/spec/spec_helper.rb +1 -0
- metadata +144 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: e414f7561d351c15835fb94956919de3d9d6ac62
|
4
|
+
data.tar.gz: 1e75cc2eb2d552b779d7cc33865248cb73d00d64
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e07b5ab0f23e5775945fa66cd55ae0e822888da4d5aa65968ee5a0cf91d219a3126b2af7836e23cde660b43fccca4570cc33be142d08b83092388de975a84339
|
7
|
+
data.tar.gz: baa4d81223b5911ca159735bdff34ba6b4646ebeefe83112463c733373697ee0823b3de5e57513e97105c55f663bce58d87e8530f48c5d2a8718e50f80785974
|
data/.gitignore
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
coverage
|
12
|
+
InstalledFiles
|
13
|
+
lib/bundler/man
|
14
|
+
pkg
|
15
|
+
rdoc
|
16
|
+
spec/reports
|
17
|
+
test/tmp
|
18
|
+
test/version_tmp
|
19
|
+
tmp
|
20
|
+
# YARD artifacts
|
21
|
+
.yardoc
|
22
|
+
_yardoc
|
23
|
+
doc/
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2013 seoaqua
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of
|
6
|
+
this software and associated documentation files (the "Software"), to deal in
|
7
|
+
the Software without restriction, including without limitation the rights to
|
8
|
+
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
|
9
|
+
the Software, and to permit persons to whom the Software is furnished to do so,
|
10
|
+
subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
|
17
|
+
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
18
|
+
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
|
19
|
+
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
20
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2013 刘明
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
Query
|
2
|
+
==========
|
3
|
+
|
4
|
+
|
5
|
+
#to get the result list by querying "abc"
|
6
|
+
Query::Engine::Baidu.new.query("abc").ranks().each do |id,value|
|
7
|
+
puts id,value
|
8
|
+
end
|
9
|
+
|
10
|
+
#to get the result list with host "www.abc.com.cn" by querying "abc"
|
11
|
+
Query::Engine::Baidu.new.query("abc").ranks_for("www.abc.com.cn").each do |id,value|
|
12
|
+
puts id,value
|
13
|
+
end
|
14
|
+
|
15
|
+
#to get the result list whose host matches the regex /com.cn/ by querying "abc"
|
16
|
+
Query::Engine::Baidu.new.query("abc").ranks_for(/com.cn/).each do |id,value|
|
17
|
+
puts id,value
|
18
|
+
end
|
19
|
+
|
20
|
+
# to get the top rank of host "www.abc.com.cn" by querying "abc"
|
21
|
+
Query::Engine::Baidu.new.query("abc").rank("www.abc.com.cn")
|
22
|
+
|
23
|
+
TODO:
|
24
|
+
查询结果不多,翻页不存在时的处理,及rspec
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,87 @@
|
|
1
|
+
module Query
  module Engine
    # Client for Baidu desktop web search (www.baidu.com).
    class Baidu < Base
      BaseUri = 'http://www.baidu.com/s?'

      # Fetches the search-box suggestions Baidu offers for a keyword.
      #
      # wd - the keyword String.
      #
      # Returns the Array parsed from the first "[...]" literal found in the
      # JSONP reply, or an empty Array when the reply contains none.
      def self.suggestions(wd)
        require 'json'
        url = "http://suggestion.baidu.com/su?wd=#{URI.encode_www_form_component(wd)}&cb=callback"
        # The reply is treated as GBK and transcoded to UTF-8 before parsing.
        jsonp = HTTParty.get(url).force_encoding('GBK').encode("UTF-8")
        list = /\[([^\]]*)\]/.match(jsonp)
        list ? JSON.parse(list[0]) : []
      end

      # NOTE: result links of the form 'www.baidu.com/link?url=<token>' are
      # redirects; resolving them to the real target URL is not implemented.

      # Tells whether Baidu Index has data for a keyword.
      #
      # wd - the keyword String; it is transcoded to GBK before being
      #      percent-encoded, so it must be representable in GBK.
      #
      # Returns true when the index page contains the "boxFlash" marker.
      def self.popular?(wd)
        word = URI.encode_www_form_component(wd.encode("GBK"))
        HTTParty.get("http://index.baidu.com/main/word.php?word=#{word}").include?("boxFlash")
      end

      # Runs a keyword query, as if typed into the search box.
      #
      # wd - the query String. It is form-encoded, so characters such as
      #      '&', '=' or '#' inside the keyword cannot alter the request.
      #
      # Returns a Query::Result::Baidu for the first result page, or false
      # (after printing a warning) when fetching or parsing fails.
      def query(wd)
        params = { 'wd' => wd }
        params['rn'] = @perpage.to_i if @perpage
        uri = BaseUri + URI.encode_www_form(params)
        begin
          @page = HTTParty.get(uri)
          result = Query::Result::Baidu.new(@page)
          result.baseuri = uri
          result.pagenumber = 1
          result.perpage = @perpage
          result
        rescue StandardError => e
          # StandardError only: interrupts and exit requests must propagate.
          warn e.to_s
          false
        end
      end

      # Queries the pages of a host: site:xxx.yyy.com
      def pages(host)
        query("site:#{host}")
      end

      # Queries the links to a URI: domain:"xxx.yyy.com/path/file.html"
      def links(uri)
        query("domain:\"#{uri}\"")
      end

      # Queries the pages of a host whose URL contains a string:
      # site:xxx.yyy.com inurl:zzz
      def pages_with(host,string)
        query("site:#{host} inurl:#{string}")
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Query
  module Engine
    # Client for Baidu mobile web search (m.baidu.com).
    class BaiduMobile < Base
      BaseUri = 'http://m.baidu.com/s?'
      # Requests are sent with an iPhone Safari User-Agent.
      headers = {
        "User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
      }
      Options = {:headers => headers}

      # Basic query, equivalent to typing the keyword into the search box.
      #
      # wd - the query String. It is form-encoded, so characters such as
      #      '&', '=' or '#' inside the keyword cannot alter the request.
      #
      # Returns a Query::Result::BaiduMobile for the first result page.
      # Network and parse errors are not rescued here.
      def query(wd)
        uri = BaseUri + URI.encode_www_form('word' => wd)
        response = HTTParty.get(uri,Options)
        result = Query::Result::BaiduMobile.new(response)
        result.baseuri = uri
        result
      end
    end
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Query
  module Engine
    # Common behaviour for search-engine clients. Subclasses provide
    # #query(keyword), returning a result object or false on failure.
    class Base
      # Requested number of results per page (used by engines that support it).
      attr_accessor :perpage

      # Tells whether the engine has indexed a URL.
      #
      # url - the URL String; it is parsed with URI() first, so a malformed
      #       URL raises URI::InvalidURIError.
      #
      # Returns the result page's has_result? value, or false when the query
      # itself failed (an engine's #query may return false instead of a
      # result object).
      def indexed?(url)
        URI(url)
        result = query(url)
        return false unless result
        result.has_result?
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Query
  module Engine
    # Client for Qihoo 360 web search (www.so.com).
    class Qihoo < Base
      Host = 'www.so.com'

      # Basic query, equivalent to typing the keyword into the search box.
      #
      # wd - the query String (percent-encoded as a form value).
      #
      # Returns a Query::Result::Qihoo for the first result page.
      def query(wd)
        # First request uses the plain search path.
        uri = "http://#{Host}/s?q=#{URI.encode_www_form_component(wd)}"
        page = HTTParty.get(uri)
        # If the request was redirected, rebuild the URI from the request's
        # path so that relative paging links resolve correctly.
        uri = URI.join("http://#{Host}/",page.request.path).to_s
        result = Query::Result::Qihoo.new(page)
        result.baseuri = uri
        result
      end

      # Fetches the related keywords the engine suggests for a keyword.
      #
      # wd - the keyword String.
      #
      # Returns an Array with the 'q' value of every entry in the JSONP
      # payload, or nil when the reply is not wrapped in "callback(...)".
      def self.related_keywords(wd)
        require 'json'
        url = "http://rs.so.com/?callback=Search.relate.render&encodein=utf-8&encodeout=utf-8&q="+URI.encode_www_form_component(wd)
        body = HTTParty.get(url).body.to_s
        # Take everything between the first '(' and the last ')' so that
        # parentheses or semicolons inside the JSON itself are preserved.
        payload = body[/\((.*)\)/m, 1]
        return nil if payload.nil?
        @related_keywords = JSON.parse(payload.strip).map { |entry| entry['q'] }
      end
    end
  end
end
|
data/lib/query/engine.rb
ADDED
@@ -0,0 +1,121 @@
|
|
1
|
+
module Query
  module Result
    # One parsed result page of Baidu desktop search.
    class Baidu < Base
      # Organic results of the current page.
      #
      # Returns a Hash keyed by the result table's id attribute; each value is
      # {'text' => title, 'href' => displayed URL text (or nil),
      #  'host' => host parsed from that text (or nil)}.
      # Tables without an <h3> title are skipped. The Hash is memoized.
      def ranks
        return @ranks unless @ranks.nil?
        @ranks = Hash.new
        @page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
          title = table.search("h3").first
          next if title.nil?
          cite = table.search("[@class=\"g\"]").first
          url = cite.nil? ? nil : cite.text
          host = nil
          unless url.nil?
            # The displayed URL has no scheme; prefix one and escape it so it parses.
            host = Addressable::URI.parse(URI::DEFAULT_PARSER.escape("http://#{url.strip}")).host
          end
          @ranks[table['id']] = {'text' => title.text, 'href' => url, 'host' => host}
        end
        @ranks
      end

      # Ads shown below the results.
      #
      # Returns {} when no matching table exists, otherwise the same Hash as
      # ads_top.
      # NOTE(review): this XPath tests bgcolor='f5f5f5' while ads_top tests
      # '#f5f5f5' — confirm against the real markup which one is intended.
      def ads_bottom
        return {} if @page.search("//table[@bgcolor='f5f5f5']").empty?
        ads_top
      end

      # Ads shown above the results.
      #
      # Returns a Hash keyed by a number String derived from characters 2..4
      # of the table id; values come from parse_ad.
      def ads_top
        # Ads on a grey background (appear both above and below the results).
        ads = Hash.new
        @page.search("//table[@bgcolor='#f5f5f5']").each do |table|
          id = table['id']
          next if id.nil?
          ads[id[2,3].to_i.to_s] = parse_ad(table)
        end
        # Ads on a white background (only above the results): tables whose
        # numeric id is 3000 or greater.
        if ads.empty?
          @page.search("//table").each do |table|
            id = table['id']
            next if id.nil? || id.to_i < 3000
            ads[id[2,3].to_i.to_s] = parse_ad(table)
          end
        end
        ads
      end

      # Extracts one ad from its table node.
      #
      # Returns {'title' => text of the first link, 'href' => first
      # whitespace-separated token of the green URL text, 'host' => same as
      # 'href'}. Missing pieces yield empty Strings instead of raising.
      def parse_ad(table)
        href = table.search("font[@color='#008000']").text.split(/\s/).first.to_s.strip
        link = table.search("a").first
        title = link.nil? ? '' : link.text.strip
        {'title'=>title,'href' => href,'host'=>href}
      end

      # Ads shown in the right-hand column.
      #
      # Returns a Hash keyed by position String (last character of the div id,
      # plus one); entries lacking a link or a green URL are skipped.
      def ads_right
        ads = {}
        @page.search("//div[@id='ec_im_container']").each do |container|
          container.search("div[@id]").each do |div|
            position = div['id'][-1,1].to_i + 1
            link = div.search("a").first
            next if link.nil?
            url = div.search("font[@color='#008000']").first
            next if url.nil?
            ads[position.to_s] = {'title'=>link.text,'href'=>url.text,'host'=>url.text}
          end
        end
        ads
      end

      # Total number of results reported by the page (digits of the first
      # span.nums element), or nil when that element is absent. Memoized.
      def count
        @count ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
      end

      # Related keywords listed in the div#rs block, as an Array of Strings.
      def related_keywords
        @related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
      end

      # Fetches the next result page.
      #
      # Returns a Query::Result::Baidu, or nil when the page has no
      # "next page" link.
      def next
        link = @page.xpath('//a[text()="下一页>"]').first
        return if link.nil?
        url = URI.join(@baseuri,link['href']).to_s
        result = Query::Result::Baidu.new(HTTParty.get(url))
        result.baseuri = url
        result.pagenumber = @pagenumber+1
        result.perpage = @perpage
        result
      end

      # Returns false when the page offers a "submit URL" link pointing at
      # 'sitesubmit' (shown when nothing is indexed), true otherwise.
      def has_result?
        submit = @page.search('//a[text()="提交网址"]').first
        !(submit && submit['href'].to_s.include?('sitesubmit'))
      end
    end
  end
end
|
@@ -0,0 +1,114 @@
|
|
1
|
+
module Query
  module Result
    # One parsed result page of Baidu mobile search.
    class BaiduMobile < Base
      # All organic results of the current page.
      #
      # Returns a Hash keyed by the "order" number (as a String) taken from
      # each result link; each value is {'href', 'text', 'is_mobile', 'host'}.
      # Results without a link or without a site label are skipped.
      # NOTE(review): links lacking an "&order=N&" parameter all share the
      # key '' and overwrite each other — confirm whether that can occur.
      def ranks
        # Already parsed: return the memoized result.
        return @ranks unless @ranks.nil?
        @ranks = Hash.new
        @page.xpath('//div[@class="result"]').each do |result|
          link = result.search("a").first
          next if link.nil?
          site = result.search('[@class="site"]').first
          next if site.nil?
          href = link['href']
          order = href.to_s[/&order=(\d+)&/, 1]
          id = order.nil? ? nil : order.to_i
          @ranks[id.to_s] = {
            'href'=>href,
            'text'=>link.text,
            # A link containing an image is flagged as mobile.
            'is_mobile'=>!link.search("img").empty?,
            'host'=>site.text.sub(/\u00A0/,'')
          }
        end
        @ranks
      end

      # Ads shown above the results.
      #
      # Returns an Array indexed by 1-based position (index 0 is never set);
      # each entry is {'title', 'href', 'host'}. Ads without a site label are
      # left as nil at their position.
      def ads_top
        position = 0
        ads = []
        @page.search("div[@class='ec_wise_ad']/div").each do |div|
          position += 1
          site = div.search("span[@class='ec_site']").first
          next if site.nil?
          href = "http://#{site.text}"
          title = div.search("a/text()").text.strip
          host = Addressable::URI.parse(URI::DEFAULT_PARSER.escape(href)).host
          ads[position] = {'title'=>title,'href'=>href,'host'=>host}
        end
        ads
      end

      # This parser extracts no right-hand ads for the mobile page.
      def ads_right
        []
      end

      # This parser extracts no bottom ads for the mobile page.
      def ads_bottom
        []
      end

      # Related keywords listed on the page, as an Array of Strings. Memoized.
      def related_keywords
        @related_keywords ||= @page.search("div[@class='relativewords_info']/a").map{|a|a.text}
      end

      # Fetches the next result page.
      #
      # Returns a Query::Result::BaiduMobile, or nil when the page has no
      # "next page" link.
      # NOTE(review): this request does not send the mobile User-Agent that
      # the engine uses for the first page — confirm that is intended.
      def next
        nextbutton = @page.xpath('//a[text()="下一页"]').first
        return nil if nextbutton.nil?
        url = URI.join(@baseuri,URI::DEFAULT_PARSER.escape(nextbutton['href'])).to_s
        result = Query::Result::BaiduMobile.new(HTTParty.get(url))
        result.baseuri = url
        result.pagenumber = @pagenumber+1
        result
      end
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Query
  module Result
    # Common behaviour for parsed result pages. Subclasses provide #ranks
    # (a Hash of id => {'host' => ..., ...}) and the ads_* methods.
    class Base
      attr_accessor :baseuri,:pagenumber,:perpage

      # page - the HTML of a result page (anything Nokogiri::HTML accepts).
      def initialize(page)
        @page = Nokogiri::HTML page
        @pagenumber = 1
      end

      # Everything parsed from the page in one Hash with the keys
      # 'ads_top', 'ads_right', 'ads_bottom' and 'ranks'.
      def whole
        {
          'ads_top'=>ads_top,
          'ads_right'=>ads_right,
          'ads_bottom'=>ads_bottom,
          'ranks'=>ranks
        }
      end

      # Results of the current page whose host satisfies the condition.
      #
      # specific_host - a String (exact match) or a Regexp.
      #
      # Returns a Hash with the matching id => line pairs of #ranks; empty
      # when nothing matches or the argument is neither String nor Regexp.
      def ranks_for(specific_host)
        host_ranks = Hash.new
        ranks.each do |id,line|
          host_ranks[id] = line if host_match?(specific_host, line['host'])
        end
        host_ranks
      end

      # Top rank on the current page for a host.
      #
      # host - a String (exact match) or a Regexp.
      #
      # Returns the id of the first matching entry of #ranks converted with
      # to_i, or nil when nothing matches.
      def rank(host)
        ranks.each do |id,line|
          return id.to_i if host_match?(host, line['host'])
        end
        nil
      end

      private

      # True when candidate (a host String or nil) satisfies matcher.
      # Subclasses of String and Regexp are accepted as matchers.
      def host_match?(matcher, candidate)
        case matcher
        when Regexp then !(candidate =~ matcher).nil?
        when String then candidate == matcher
        else false
        end
      end
    end
  end
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
module Query
  module Result
    # One parsed result page of Qihoo 360 search (www.so.com).
    class Qihoo < Base
      Host = 'www.so.com'

      # All organic results of the current page.
      #
      # Returns a Hash keyed by 1-based position (as a String, counted per
      # page); each value is {'href', 'text', 'host'}. List items without a
      # title link, or whose link has no data-pos attribute, are skipped.
      # 'host' is nil when the item has no <cite> element. Memoized.
      def ranks
        return @ranks unless @ranks.nil?
        @ranks = Hash.new
        position = 0
        @page.search('//li[@class="res-list"]').each do |li|
          link = li.search("h3/a").first
          next if link.nil? || link['data-pos'].nil?
          position += 1
          cite = li.search("cite").first
          host = nil
          unless cite.nil?
            # The displayed URL has no scheme; prefix one and escape it so it parses.
            host = Addressable::URI.parse(URI::DEFAULT_PARSER.escape("http://#{cite.text}")).host
          end
          @ranks[position.to_s] = {'href'=>link['href'],'text'=>link.text.strip,'host'=>host}
        end
        @ranks
      end

      # Ads above the results (ul#djbox); see list_ads for the return shape.
      def ads_top
        list_ads("//ul[@id='djbox']/li")
      end

      # This parser extracts no bottom ads.
      def ads_bottom
        []
      end

      # Ads in the right-hand column (ul#rightbox); see list_ads.
      def ads_right
        list_ads("//ul[@id='rightbox']/li")
      end

      # Related keywords are not parsed from the page; always [].
      def related_keywords
        []
      end

      # Fetches the next result page.
      #
      # Returns a Query::Result::Qihoo, or false when the page has no
      # a#snext link.
      def next
        next_link = @page.xpath('//a[@id="snext"]')
        return false if next_link.empty?
        next_href = URI.join(@baseuri,next_link.first['href']).to_s
        # Parse the response as fetched; it must not be altered before parsing.
        result = Query::Result::Qihoo.new(HTTParty.get(next_href))
        result.baseuri = next_href
        result.pagenumber = @pagenumber+1
        result
      end

      # True unless the page's main heading says the URL was not found.
      def has_result?
        !@page.search('//div[@id="main"]/h3').text().include?('没有找到该URL')
      end

      private

      # Parses the ad list items selected by xpath.
      #
      # Returns an Array indexed by 1-based position (index 0 is never set);
      # each entry is {'title', 'href', 'host'} with 'href' being the
      # lower-cased <cite> text. Items lacking a link or a <cite> are left as
      # nil at their position.
      def list_ads(xpath)
        position = 0
        ads = []
        @page.search(xpath).each do |li|
          position += 1
          link = li.search("a").first
          cite = li.search("cite").first
          next if link.nil? || cite.nil?
          href = cite.text.downcase
          host = Addressable::URI.parse(URI::DEFAULT_PARSER.escape(href)).host
          ads[position] = {'title'=>link.text,'href'=>href,'host'=>host}
        end
        ads
      end
    end
  end
end
|
data/lib/query/result.rb
ADDED
data/lib/query.rb
ADDED
data/query.gemspec
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for "query". The file list comes from `git ls-files`,
# so the gem must be built from inside a git checkout.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'query/version'

Gem::Specification.new do |spec|
  spec.name          = "query"
  spec.version       = Query::VERSION
  spec.authors       = ["seoaqua"]
  spec.email         = ["seoaqua@me.com"]
  spec.description   = %q{This GEM is designed to work for SEOers who need to fetch query and parse results from all kinds of search engines}
  spec.summary       = %q{Now its only support Chinese main search engines}
  spec.homepage      = "https://github.com/seoaqua/query"
  spec.license       = "MIT"

  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  # The files under spec/ are RSpec examples.
  spec.add_development_dependency "rspec"
  spec.add_dependency "nokogiri"
  spec.add_dependency "addressable"
  spec.add_dependency "httparty"
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
#coding:UTF-8
require 'spec_helper'
# Live examples: they query m.baidu.com over the network when the file loads.
describe Query::Engine::BaiduMobile do
  mbaidu = Query::Engine::BaiduMobile.new
  page = mbaidu.query '百度'
  it "应返回#{Query::Result::BaiduMobile}" do
    page.class.should == Query::Result::BaiduMobile
  end
  it "下一页也应是Query::Result::BaiduMobile" do
    page.next.class.should == Query::Result::BaiduMobile
    page.next.next.class.should == Query::Result::BaiduMobile
  end
  it "百度百科域名应该大于1" do
    page.rank('wapbaike.baidu.com').should > 1
  end
  it "百度无线域名应该在10以内" do
    page.rank('m.baidu.com').should < 11
  end
end
|
data/spec/baidu_spec.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
#coding:UTF-8
require 'spec_helper'
# Live examples: they query Baidu over the network when the file loads and
# while the examples run.
describe Query::Engine::Baidu do
  baidu = Query::Engine::Baidu.new
  page = baidu.query '百度'

  it "should return Query::Result::Baidu" do
    page.class.should == Query::Result::Baidu
  end

  it "should report more than 100,000 results" do
    page.count.should > 100000
  end
  it "should rank www.baidu.com first" do
    page.rank('www.baidu.com').should == 1
  end

  it "should return Query::Result::Baidu for the next page" do
    page.next.class.should == Query::Result::Baidu
  end

  it "should return true for a popular keyword" do
    bool = Query::Engine::Baidu.popular?'百度'
    bool.should == true
  end

  it "should return false for an unknown keyword" do
    bool = Query::Engine::Baidu.popular?'lavataliuming'
    bool.should == false
  end

  it "should return over 5 words beginning with the query_word" do
    query_word = '为'
    suggestions = Query::Engine::Baidu.suggestions(query_word)
    suggestions.size.should > 5
    suggestions.each do |suggestion|
      suggestion[0].should == query_word
    end
  end

  it "should return 100,000,000 for pages" do
    result = baidu.pages('baidu.com')
    result.class.should == Query::Result::Baidu
    result.count.should == 100000000
  end

  it "should return 100,000,000 for links" do
    result = baidu.links('baidu.com')
    result.class.should == Query::Result::Baidu
    result.count.should == 100000000
  end
  it "should return 100,000,000 for pages_with" do
    result = baidu.pages_with('baidu.com','baidu.com')
    result.class.should == Query::Result::Baidu
    result.count.should == 100000000
  end
  it "查询已经被收录的页面收录情况时,应返回true" do
    baidu.indexed?('http://www.baidu.com').should == true
  end
  it "查询一个不存在的页面收录情况时,应返回false" do
    baidu.indexed?('http://zxv.not-exists.com').should == false
  end
  page1 = baidu.query('seoaqua.com')
  it "查询结果应该都能拿到title,href,host" do
    page1.ranks.each do |id,rank|
      rank['href'].should_not == nil
      rank['text'].should_not == nil
      rank['host'].should_not == nil
    end
  end
  # TODO: add examples for ad parsing, e.g. ads_page = baidu.query '减肥药'

end
|
data/spec/qihoo_spec.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
#coding:UTF-8
require 'spec_helper'
# Live examples: they query www.so.com over the network when the file loads.
describe Query::Engine::Qihoo do
  qihoo = Query::Engine::Qihoo.new
  page = qihoo.query '奇虎'
  page2 = page.next
  page3 = page2.next
  it "查询关键词'奇虎'后,应返回正确的实例" do
    page.class.should == Query::Result::Qihoo
  end
  it "查询关键词'奇虎'后,下一页也应是Query::Result::Qihoo的实例" do
    page2.class.should == Query::Result::Qihoo
  end
  it "查询关键词'奇虎'后,下一页,再下一页也应是Query::Result::Qihoo的实例" do
    page3.class.should == Query::Result::Qihoo
  end

  it "查询关键词'奇虎'后,奇虎首页域名应该等于1" do
    page.rank('www.qihoo.com').should == 1
  end
  it "查询已经被收录的页面收录情况时,应返回true" do
    qihoo.indexed?('http://www.360.cn').should == true
  end
  it "查询一个不存在的页面收录情况时,应返回false" do
    qihoo.indexed?('http://zxv.not-exists.com').should == false
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'query'
|
metadata
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: query
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- seoaqua
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-10-03 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: addressable
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: httparty
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description: This GEM is designed to work for SEOers who need to fetch query and parse
|
84
|
+
results from all kinds of search engines
|
85
|
+
email:
|
86
|
+
- seoaqua@me.com
|
87
|
+
executables: []
|
88
|
+
extensions: []
|
89
|
+
extra_rdoc_files: []
|
90
|
+
files:
|
91
|
+
- .gitignore
|
92
|
+
- Gemfile
|
93
|
+
- LICENSE
|
94
|
+
- LICENSE.txt
|
95
|
+
- README.md
|
96
|
+
- Rakefile
|
97
|
+
- lib/query.rb
|
98
|
+
- lib/query/engine.rb
|
99
|
+
- lib/query/engine/baidu.rb
|
100
|
+
- lib/query/engine/baidu_mobile.rb
|
101
|
+
- lib/query/engine/base.rb
|
102
|
+
- lib/query/engine/qihoo.rb
|
103
|
+
- lib/query/engine/qihoo_mobile.rb
|
104
|
+
- lib/query/result.rb
|
105
|
+
- lib/query/result/baidu.rb
|
106
|
+
- lib/query/result/baidu_mobile.rb
|
107
|
+
- lib/query/result/base.rb
|
108
|
+
- lib/query/result/qihoo.rb
|
109
|
+
- lib/query/result/qihoo_mobile.rb
|
110
|
+
- lib/query/version.rb
|
111
|
+
- query.gemspec
|
112
|
+
- spec/baidu_mobile_spec.rb
|
113
|
+
- spec/baidu_spec.rb
|
114
|
+
- spec/qihoo_spec.rb
|
115
|
+
- spec/spec_helper.rb
|
116
|
+
homepage: https://github.com/seoaqua/query
|
117
|
+
licenses:
|
118
|
+
- MIT
|
119
|
+
metadata: {}
|
120
|
+
post_install_message:
|
121
|
+
rdoc_options: []
|
122
|
+
require_paths:
|
123
|
+
- lib
|
124
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
125
|
+
requirements:
|
126
|
+
- - '>='
|
127
|
+
- !ruby/object:Gem::Version
|
128
|
+
version: '0'
|
129
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
130
|
+
requirements:
|
131
|
+
- - '>='
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
requirements: []
|
135
|
+
rubyforge_project:
|
136
|
+
rubygems_version: 2.1.5
|
137
|
+
signing_key:
|
138
|
+
specification_version: 4
|
139
|
+
summary: Now its only support Chinese main search engines
|
140
|
+
test_files:
|
141
|
+
- spec/baidu_mobile_spec.rb
|
142
|
+
- spec/baidu_spec.rb
|
143
|
+
- spec/qihoo_spec.rb
|
144
|
+
- spec/spec_helper.rb
|