query 0.0.1 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +4 -1
- data/Gemfile +3 -1
- data/README.md +6 -1
- data/lib/query/engine/baidu.rb +12 -8
- data/lib/query/engine/baidu_mobile.rb +4 -4
- data/lib/query/engine/{qihoo.rb → qihu.rb} +8 -3
- data/lib/query/engine/{qihoo_mobile.rb → qihu_mobile.rb} +0 -0
- data/lib/query/engine/sogou.rb +45 -0
- data/lib/query/engine/sogou_mobile.rb +21 -0
- data/lib/query/engine.rb +11 -4
- data/lib/query/result/baidu.rb +57 -91
- data/lib/query/result/baidu_mobile.rb +49 -93
- data/lib/query/result/qihu.rb +66 -0
- data/lib/query/result/{qihoo_mobile.rb → qihu_mobile.rb} +1 -1
- data/lib/query/result/sogou.rb +103 -0
- data/lib/query/result/sogou_mobile.rb +51 -0
- data/lib/query/result.rb +47 -4
- data/lib/query/version.rb +1 -1
- data/lib/query.rb +6 -8
- data/query.gemspec +2 -3
- data/spec/baidu1_spec.rb +157 -0
- data/spec/baidu2_spec.rb +156 -0
- data/spec/mbaidu1_spec.rb +167 -0
- data/spec/msogou_spec.rb +91 -0
- data/spec/qihu_spec.rb +87 -0
- data/spec/samples/baidu1.html +521 -0
- data/spec/samples/baidu2.html +662 -0
- data/spec/samples/mbaidu1.html +2 -0
- data/spec/samples/mbaidu2.html +2 -0
- data/spec/samples/msogou.html +474 -0
- data/spec/samples/qihu.html +506 -0
- data/spec/samples/sogou.html +629 -0
- data/spec/sogou_mobile_spec.rb +86 -0
- data/spec/sogou_spec.rb +107 -0
- data/spec/spec_helper.rb +12 -1
- metadata +56 -31
- data/lib/query/engine/base.rb +0 -16
- data/lib/query/result/base.rb +0 -50
- data/lib/query/result/qihoo.rb +0 -75
- data/spec/baidu_mobile_spec.rb +0 -19
- data/spec/baidu_spec.rb +0 -73
- data/spec/qihoo_spec.rb +0 -27
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5043b0180a473ab3d213136dfe9ab55ccb4a6d9
|
4
|
+
data.tar.gz: dc0b8b1ee15dc3f4437439712904de92b838520e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: baa8ca09bc7bfd19f1eb3e9dffa24cf0cc28aba704d8671769bc26a79c68f10c01d57df4f76d4e6558ea638fd6c211111ca21d1883675450971256fc8369bc2a
|
7
|
+
data.tar.gz: 804e6685b6d7d49e563318a9150eae1a655c52cb6f6ca9084c7fbb0c908fde92a44025761668abe2b7aa4b21ba31fbb0c9ebcbc24a6d3c61fd2415e5344b9fd4
|
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -3,22 +3,27 @@ Query
|
|
3
3
|
|
4
4
|
|
5
5
|
#to get the result list by querying "abc"
|
6
|
+
|
6
7
|
Query::Engine::Baidu.new.query("abc").ranks().each do |id,value|
|
7
8
|
puts id,value
|
8
9
|
end
|
9
10
|
|
10
11
|
#to get the result list with host "www.abc.com.cn" by querying "abc"
|
12
|
+
|
11
13
|
Query::Engine::Baidu.new.query("abc").ranks("www.abc.com.cn").each do |id,value|
|
12
14
|
puts id,value
|
13
15
|
end
|
14
16
|
|
15
17
|
#to get the result list with hosts that match the regex /com.cn/ by querying "abc"
|
18
|
+
|
16
19
|
Query::Engine::Baidu.new.query("abc").ranks(/com.cn/).each do |id,value|
|
17
20
|
puts id,value
|
18
21
|
end
|
19
22
|
|
20
23
|
# to get the top rank of host "www.abc.com.cn" by querying "abc"
|
24
|
+
|
21
25
|
Query::Engine::Baidu.new.query("abc").rank("www.abc.com.cn")
|
22
26
|
|
23
27
|
TODO:
|
24
|
-
查询结果不多,翻页不存在时的处理,及rspec
|
28
|
+
查询结果不多,翻页不存在时的处理,及rspec
|
29
|
+
增加其他搜索引擎
|
data/lib/query/engine/baidu.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
1
|
module Query
|
2
2
|
module Engine
|
3
|
-
class Baidu
|
3
|
+
class Baidu
|
4
|
+
include Query::Engine
|
4
5
|
BaseUri = 'http://www.baidu.com/s?'
|
6
|
+
Options = {
|
7
|
+
:headers => {"User-Agent" => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11'}
|
8
|
+
}
|
5
9
|
def self.suggestions(wd)
|
6
10
|
require 'json'
|
7
11
|
json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").force_encoding('GBK').encode("UTF-8")
|
@@ -37,25 +41,25 @@ module Query
|
|
37
41
|
return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").include?"boxFlash"
|
38
42
|
end
|
39
43
|
|
40
|
-
def query(wd)
|
44
|
+
def self.query(wd)
|
41
45
|
q = Array.new
|
42
46
|
q << "wd=#{wd}"
|
43
47
|
q << "rn=#{@perpage.to_i}" if @perpage
|
44
48
|
queryStr = q.join("&")
|
45
49
|
#uri = URI.encode((BaseUri + queryStr).encode('GBK'))
|
46
50
|
uri = URI.encode((BaseUri + queryStr))
|
47
|
-
begin
|
51
|
+
# begin
|
48
52
|
# @page = @a.get uri
|
49
|
-
@page = HTTParty.get
|
53
|
+
@page = HTTParty.get(uri,Options)
|
50
54
|
r = Query::Result::Baidu.new(@page)
|
51
55
|
r.baseuri = uri
|
52
56
|
r.pagenumber = 1
|
53
57
|
r.perpage = @perpage
|
54
58
|
r
|
55
|
-
rescue Exception => e
|
56
|
-
|
57
|
-
|
58
|
-
end
|
59
|
+
# rescue Exception => e
|
60
|
+
# warn e.to_s
|
61
|
+
# return false
|
62
|
+
# end
|
59
63
|
=begin
|
60
64
|
query = "#{query}"
|
61
65
|
@uri = BaseUri+URI.encode(query.encode('GBK'))
|
@@ -1,11 +1,11 @@
|
|
1
1
|
module Query
|
2
2
|
module Engine
|
3
|
-
class BaiduMobile
|
3
|
+
class BaiduMobile
|
4
|
+
include Query::Engine
|
4
5
|
BaseUri = 'http://m.baidu.com/s?'
|
5
|
-
|
6
|
-
"User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
|
6
|
+
Options = {
|
7
|
+
:headers => {"User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'}
|
7
8
|
}
|
8
|
-
Options = {:headers => headers}
|
9
9
|
|
10
10
|
#基本查询,相当于从搜索框直接输入关键词查询
|
11
11
|
def query(wd)
|
@@ -1,15 +1,20 @@
|
|
1
1
|
module Query
|
2
2
|
module Engine
|
3
|
-
class
|
3
|
+
class Qihu
|
4
|
+
include Query::Engine
|
4
5
|
Host = 'www.so.com'
|
6
|
+
headers = {
|
7
|
+
"User-Agent" => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11'
|
8
|
+
}
|
9
|
+
Options = {:headers => headers}
|
5
10
|
#基本查询, 相当于在搜索框直接数据关键词查询
|
6
11
|
def query(wd)
|
7
12
|
#用原始路径请求
|
8
13
|
uri = URI.join("http://#{Host}/",URI.encode('s?q='+wd)).to_s
|
9
|
-
page = HTTParty.get(uri)
|
14
|
+
page = HTTParty.get(uri,Options)
|
10
15
|
#如果请求地址被跳转,重新获取当前页的URI,可避免翻页错误
|
11
16
|
uri = URI.join("http://#{Host}/",page.request.path).to_s
|
12
|
-
r = Query::Result::
|
17
|
+
r = Query::Result::Qihu.new(page)
|
13
18
|
r.baseuri = uri
|
14
19
|
r
|
15
20
|
end
|
File without changes
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Query
  module Engine
    # Search-engine front end for Sogou desktop web search.
    #
    # All entry points are class-level methods; each performs an HTTP GET via
    # HTTParty with a desktop Safari User-Agent.
    class Sogou
      include Query::Engine
      BaseUri = 'http://www.sogou.com/web?'
      Options = {
        :headers => {"User-Agent" => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11'}
      }
      class << self
        # Basic query: equivalent to typing +wd+ into the search box.
        #
        # The keyword is escaped as a form component so that characters such
        # as "&", "=", "#" or spaces cannot break or truncate the query string
        # (the former URI.encode left "&" and "=" untouched and no longer
        # exists in Ruby >= 3.0).
        #
        # @param wd [String] the search keyword
        # @return [Query::Result::Sogou] result wrapper for page 1, with
        #   +baseuri+ set to the requested URL
        def query(wd)
          uri = "#{BaseUri}query=#{URI.encode_www_form_component(wd)}"
          page = HTTParty.get(uri, Options)
          r = Query::Result::Sogou.new(page)
          r.baseuri = uri
          # Inside `class << self`, @perpage is an instance variable of the
          # class object itself; it is nil unless something sets it there.
          r.perpage = @perpage
          r.pagenumber = 1
          r
        end

        # Fetches Sogou's search suggestions for +word+.
        #
        # @param word [String] the prefix to get suggestions for
        # @return [Array<String>] every fragment of the response that starts
        #   with +word+ and runs up to the next double quote
        def suggestions(word)
          response = HTTParty.get("http://w.sugg.sogou.com/sugg/ajaj_json.jsp?key=#{URI.encode_www_form_component(word)}")
          # Escape the word so regex metacharacters in it (".", "+", "(" ...)
          # are matched literally instead of altering the pattern.
          response.encode('utf-8').scan(/#{Regexp.escape(word)}[^"]+/)
        end

        # Pages indexed for a host, using the query "site:xxx.yyy.com".
        #
        # @param host [String]
        # @return [Query::Result::Sogou]
        def pages(host)
          query("site:#{host}")
        end

        # Links to a URI, using the query domain:"xxx.yyy.com/path/file.html".
        #
        # @param uri [String]
        # @return [Query::Result::Sogou]
        def links(uri)
          query("domain:\"#{uri}\"")
        end

        #site:xxx.yyy.com inurl:zzz
        # def pages_with(host,string)
        #     query("site:#{host} inurl:#{string}")
        # end
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Query
  module Engine
    # Search-engine front end for Sogou's mobile (WAP) web search.
    #
    # Requests are sent via HTTParty with an iPhone Safari User-Agent.
    class SogouMobile
      include Query::Engine
      BaseUri = 'http://wap.sogou.com/web/searchList.jsp'
      Options = {
        :headers => {"User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'}
      }
      class << self
        # Basic query: equivalent to typing +wd+ into the search box.
        #
        # The keyword is escaped as a form component so that "&", "=", "#"
        # or spaces inside it cannot break the query string (URI.encode did
        # not escape "&"/"=" and was removed in Ruby 3.0).
        #
        # @param wd [String] the search keyword
        # @return [Query::Result::SogouMobile] result wrapper with +baseuri+
        #   set to the requested URL
        def query(wd)
          uri = "#{BaseUri}?keyword=#{URI.encode_www_form_component(wd)}"
          res = HTTParty.get(uri, Options)
          r = Query::Result::SogouMobile.new(res)
          r.baseuri = uri
          r
        end
      end
    end
  end
end
|
data/lib/query/engine.rb
CHANGED
@@ -1,10 +1,17 @@
|
|
1
1
|
module Query
|
2
2
|
module Engine
|
3
|
+
attr_accessor :perpage
|
4
|
+
def self.indexed?(url)
|
5
|
+
URI(url)
|
6
|
+
result = query(url)
|
7
|
+
return result.has_result?
|
8
|
+
end
|
3
9
|
end
|
4
10
|
end
|
5
|
-
|
6
|
-
require 'query/engine/base'
|
11
|
+
require 'httparty'
|
7
12
|
require 'query/engine/baidu'
|
8
13
|
require 'query/engine/baidu_mobile'
|
9
|
-
require 'query/engine/
|
10
|
-
require 'query/engine/
|
14
|
+
require 'query/engine/qihu'
|
15
|
+
require 'query/engine/qihu_mobile'
|
16
|
+
require 'query/engine/sogou'
|
17
|
+
require 'query/engine/sogou_mobile'
|
data/lib/query/result/baidu.rb
CHANGED
@@ -1,94 +1,40 @@
|
|
1
1
|
module Query
|
2
2
|
module Result
|
3
|
-
class Baidu
|
4
|
-
|
3
|
+
class Baidu
|
4
|
+
include Query::Result
|
5
|
+
def seo_ranks
|
5
6
|
return @ranks unless @ranks.nil?
|
6
|
-
@
|
7
|
-
|
8
|
-
id = table['id']
|
9
|
-
# if @perpage == 10
|
10
|
-
# id = table['id'][-1,1]
|
11
|
-
# id = '10' if id == '0'
|
12
|
-
# end
|
13
|
-
|
14
|
-
@ranks[id] = Hash.new
|
15
|
-
url = table.search("[@class=\"g\"]").first
|
16
|
-
url = url.text unless url.nil?
|
17
|
-
a = table.search("h3").first
|
18
|
-
next if a.nil?
|
19
|
-
@ranks[id]['text'] = a.text
|
20
|
-
@ranks[id]['href'] = url #a.first['href'].sub('http://www.baidu.com/link?url=','').strip
|
21
|
-
unless url.nil?
|
22
|
-
url = url.strip
|
23
|
-
@ranks[id]['host'] = Addressable::URI.parse(URI.encode("http://#{url}")).host
|
24
|
-
else
|
25
|
-
@ranks[id]['host'] = nil
|
26
|
-
end
|
7
|
+
@page.search("//*[@class='result']|//*[@class='result-op']|//*[@class='result-op c-container']").map.with_index do |table,index|
|
8
|
+
parse_seo(table).merge({:rank => index + 1})
|
27
9
|
end
|
28
|
-
#@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
|
29
|
-
@ranks
|
30
10
|
end
|
31
11
|
|
32
|
-
def ads_bottom
|
33
|
-
return {} if @page.search("//table[@bgcolor='f5f5f5']").empty?
|
34
|
-
return ads_top
|
35
|
-
# p @page.search("//table[@bgcolor='f5f5f5']").empty?
|
36
|
-
end
|
37
12
|
def ads_top
|
38
|
-
|
39
|
-
|
40
|
-
@page.search("//table[@bgcolor='#f5f5f5']").each do |table|
|
41
|
-
id = table['id']
|
42
|
-
next if id.nil?
|
43
|
-
id = id[2,3].to_i.to_s
|
44
|
-
ads[id]= parse_ad(table)
|
13
|
+
@page.search("//*[@class='result']/preceding-sibling::*[contains(@class,'EC_result')]").map.with_index do |div, index|
|
14
|
+
parse_ad(div).merge(:rank => index + 1)
|
45
15
|
end
|
46
|
-
#白色底推广,只有上部分
|
47
|
-
if ads.empty?
|
48
|
-
@page.search("//table").each do |table|
|
49
|
-
id = table['id']
|
50
|
-
next if id.nil? or id.to_i<3000
|
51
|
-
id = id[2,3].to_i.to_s
|
52
|
-
ads[id]= parse_ad(table)
|
53
|
-
end
|
54
|
-
end
|
55
|
-
ads
|
56
16
|
end
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
17
|
+
|
18
|
+
def ads_bottom
|
19
|
+
@page.search("//*[@class='result']/following-sibling::*[contains(@class,'EC_result')]").map.with_index do |div,index|
|
20
|
+
parse_ad(div).merge(:rank => index + 1)
|
21
|
+
end
|
61
22
|
end
|
23
|
+
|
62
24
|
def ads_right
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
ads[id.to_s] = {'title'=>title,'href'=>url,'host'=>url}
|
74
|
-
end
|
25
|
+
@page.search("//div[@id='ec_im_container']/div[@id]").map.with_index do |div,index|
|
26
|
+
a = div.search('a').first
|
27
|
+
url = div.search("*[@class='EC_url']").first.text
|
28
|
+
url = "http://#{url}"
|
29
|
+
{
|
30
|
+
:rank => index + 1,
|
31
|
+
:text => a.text.strip,
|
32
|
+
:href => a['href'].strip,
|
33
|
+
:host => Addressable::URI.parse(URI.encode(url)).host
|
34
|
+
}
|
75
35
|
end
|
76
|
-
ads
|
77
36
|
end
|
78
37
|
|
79
|
-
#return the top rank number from @ranks with the input host
|
80
|
-
# def rank(host)#on base of ranks
|
81
|
-
# ranks.each do |id,line|
|
82
|
-
# id = id.to_i
|
83
|
-
# if host.class == Regexp
|
84
|
-
# return id if line['host'] =~ host
|
85
|
-
# elsif host.class == String
|
86
|
-
# return id if line['host'] == host
|
87
|
-
# end
|
88
|
-
# end
|
89
|
-
# return nil
|
90
|
-
# end
|
91
|
-
|
92
38
|
def count
|
93
39
|
@count ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
|
94
40
|
end
|
@@ -97,25 +43,45 @@ module Query
|
|
97
43
|
@related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
|
98
44
|
end
|
99
45
|
|
100
|
-
def next
|
101
|
-
url = @page.xpath('//a[text()="下一页>"]').first
|
102
|
-
return if url.nil?
|
103
|
-
url = url['href']
|
104
|
-
url = URI.join(@baseuri,url).to_s
|
105
|
-
page = HTTParty.get(url)
|
106
|
-
r = Query::Result::Baidu.new(page)
|
107
|
-
r.baseuri = url
|
108
|
-
r.pagenumber=@pagenumber+1
|
109
|
-
r.perpage=@perpage
|
110
|
-
r
|
111
|
-
|
112
|
-
# @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
|
113
|
-
end
|
114
46
|
def has_result?
|
115
47
|
submit = @page.search('//a[text()="提交网址"]').first
|
116
48
|
return false if submit and submit['href'].include?'sitesubmit'
|
117
49
|
return true
|
118
50
|
end
|
51
|
+
|
52
|
+
def next_url
|
53
|
+
@page.search("//a[text()='下一页>']").first['href']
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
def parse_ad(div)
|
58
|
+
#@todo should be :
|
59
|
+
#title = div.xpath("*[contains(@class,'ec_title')]",MyFilter.new).first
|
60
|
+
title = div.xpath("//*[contains(@class,'ec_title')]",MyFilter.new).first
|
61
|
+
url = %w( span[@class='ec_url'] a[@class='EC_url'] ).map do |xpath|
|
62
|
+
node = div.search(xpath).first
|
63
|
+
node.text if node
|
64
|
+
end.compact.first
|
65
|
+
url = "http://" + url
|
66
|
+
{
|
67
|
+
:text => title.text,
|
68
|
+
:href => title['href'],
|
69
|
+
:host => Addressable::URI.parse(URI.encode(url)).host
|
70
|
+
}
|
71
|
+
end
|
72
|
+
|
73
|
+
def parse_seo(table)
|
74
|
+
url = %w( span[@class="g"] span[@class="c-showurl"] div[@class="op_zhidao_showurl"]).map do |xpath|
|
75
|
+
span = table.search(xpath).first
|
76
|
+
span.text.sub(/\d{4}-\d{1,2}-\d{1,2}/,'').strip if span
|
77
|
+
end.compact.first
|
78
|
+
host = Addressable::URI.parse(URI.encode("http://#{url}")).host
|
79
|
+
{
|
80
|
+
:text => table.search("h3").first.text.strip,
|
81
|
+
:href => table.search('a').first['href'].strip,
|
82
|
+
:host => host
|
83
|
+
}
|
84
|
+
end
|
119
85
|
end
|
120
86
|
end
|
121
87
|
end
|
@@ -1,113 +1,69 @@
|
|
1
1
|
module Query
|
2
2
|
module Result
|
3
|
-
class BaiduMobile
|
4
|
-
|
5
|
-
def ranks
|
6
|
-
#如果已经赋值说明解析过,不需要重新解析,直接返回结果
|
7
|
-
return @ranks unless @ranks.nil?
|
8
|
-
@ranks = Hash.new
|
9
|
-
@page.xpath('//div[@class="result"]').each do |result|
|
10
|
-
href,text,host,is_mobile = '','','',false
|
11
|
-
a = result.search("a").first
|
12
|
-
is_mobile = true unless a.search("img").empty?
|
13
|
-
host = result.search('[@class="site"]').first
|
14
|
-
next if host.nil?
|
15
|
-
host = host.text
|
16
|
-
href = a['href']
|
17
|
-
text = a.text
|
18
|
-
id = href.scan(/&order=(\d+)&/)
|
19
|
-
if id.empty?
|
20
|
-
id = nil
|
21
|
-
else
|
22
|
-
id = id.first.first.to_i
|
23
|
-
# id = (@pagenumber-1)*10+id
|
24
|
-
end
|
25
|
-
=begin
|
26
|
-
result.children.each do |elem|
|
27
|
-
if elem.name == 'a'
|
28
|
-
href = elem['href']
|
29
|
-
id = elem.text.match(/^\d+/).to_s.to_i
|
30
|
-
text = elem.text.sub(/^\d+/,'')
|
31
|
-
text.sub!(/^\u00A0/,'')
|
32
|
-
elsif elem['class'] == 'abs'
|
33
|
-
elem.children.each do |elem2|
|
34
|
-
if elem2['class'] == 'site'
|
35
|
-
host = elem2.text
|
36
|
-
break
|
37
|
-
end
|
38
|
-
end
|
39
|
-
elsif elem['class'] == 'site'
|
40
|
-
host == elem['href']
|
41
|
-
end
|
42
|
-
end
|
43
|
-
=end
|
3
|
+
class BaiduMobile
|
4
|
+
include Query::Result
|
44
5
|
|
45
|
-
|
6
|
+
def seo_ranks
|
7
|
+
@seo_ranks ||= @page.search("//*[@class='result']|//*[@class='card-result wa-ue-card-result']|//*[@class='result card-result wma-card-box']").map.with_index do |div,index|
|
8
|
+
parse_seo(div).merge({:rank => index + 1})
|
46
9
|
end
|
47
|
-
@ranks
|
48
10
|
end
|
11
|
+
|
49
12
|
def ads_top
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
id += 1
|
54
|
-
href = div.search("span[@class='ec_site']").first.text
|
55
|
-
href = "http://#{href}"
|
56
|
-
title = div.search("a/text()").text.strip
|
57
|
-
host = Addressable::URI.parse(URI.encode(href)).host
|
58
|
-
result[id] = {'title'=>title,'href'=>href,'host'=>host}
|
13
|
+
@ads_top ||= @page.search("//*[@class='result']/preceding-sibling::div[@class='ec_wise_ad']/div").map.with_index do |div,index|
|
14
|
+
puts index
|
15
|
+
parse_ad(div).merge({:rank => index + 1})
|
59
16
|
end
|
60
|
-
result
|
61
17
|
end
|
18
|
+
|
62
19
|
def ads_right
|
63
20
|
[]
|
64
21
|
end
|
22
|
+
|
65
23
|
def ads_bottom
|
66
|
-
[]
|
24
|
+
@ads_bottom ||= @page.search("//*[@class='result']/following-sibling::div[@class='ec_wise_ad']/div/div").map.with_index do |div,index|
|
25
|
+
parse_ad(div).merge({:rank => index + 1})
|
26
|
+
end
|
67
27
|
end
|
28
|
+
|
29
|
+
#酒店预订 酒店英文 酒店团购 酒店管理 酒店招聘 快捷酒店 如家快捷酒店 五星级酒店
|
68
30
|
def related_keywords
|
69
|
-
@related_keywords ||= @page.search("div[@class='
|
31
|
+
@related_keywords ||= @page.search("div[@class='rw-list']/a").map{|a|a.text}
|
70
32
|
end
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
host_ranks = Hash.new
|
75
|
-
ranks.each do |id,line|
|
76
|
-
if specific_host.class == Regexp
|
77
|
-
host_ranks[id] = line if line['host'] =~ specific_host
|
78
|
-
elsif specific_host.class == String
|
79
|
-
host_ranks[id] = line if line['host'] == specific_host
|
80
|
-
end
|
81
|
-
end
|
82
|
-
host_ranks
|
33
|
+
|
34
|
+
def next_url
|
35
|
+
@next_url ||= @page.xpath('//a[contains(text(),"下一页")]').first['href']
|
83
36
|
end
|
84
|
-
|
85
|
-
def
|
86
|
-
|
87
|
-
id = id.to_i
|
88
|
-
if host.class == Regexp
|
89
|
-
return id if line['host'] =~ host
|
90
|
-
elsif host.class == String
|
91
|
-
return id if line['host'] == host
|
92
|
-
end
|
93
|
-
end
|
94
|
-
return nil
|
37
|
+
|
38
|
+
def count
|
39
|
+
|
95
40
|
end
|
96
|
-
|
97
|
-
|
98
|
-
def
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
41
|
+
|
42
|
+
private
|
43
|
+
def parse_ad(div)
|
44
|
+
url = div.search("span[@class='ec_site']").first.text
|
45
|
+
url = "http://#{url}"
|
46
|
+
{
|
47
|
+
:text => div.search('a/text()').text.strip,
|
48
|
+
:href => div.search('a').first['href'],
|
49
|
+
:host => Addressable::URI.parse(URI.encode(url)).host
|
50
|
+
}
|
51
|
+
end
|
52
|
+
|
53
|
+
def parse_seo(div)
|
54
|
+
a = div.search('a').first
|
55
|
+
if div['class'] == 'card-result wa-ue-card-result'
|
56
|
+
host = div.search("*[@class='wa-hotelgeneral-gray wa-hotelgeneral-info-sub-title']").text
|
57
|
+
elsif div['class'] == 'result card-result wma-card-box' and div['srcid'] == 'map'
|
58
|
+
host = 'map.baidu.com'
|
59
|
+
else
|
60
|
+
host = div.search("*[@class='site']").first.text
|
61
|
+
end
|
62
|
+
{
|
63
|
+
:text => a.text,
|
64
|
+
:href => a['href'],
|
65
|
+
:host => host
|
66
|
+
}
|
111
67
|
end
|
112
68
|
end
|
113
69
|
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module Query
  module Result
    # Parser for a Qihu (so.com) result page held in @page.
    #
    # @page is queried with +search+/+xpath+ and XPath expressions, so it is
    # presumably a Nokogiri document set up by Query::Result — confirm there.
    class Qihu
      include Query::Result

      # Organic results, in page order.
      #
      # @return [Array<Hash>] one hash per result heading with :rank
      #   (1-based), :href, :text and :host
      def seo_ranks
        @page.search('//ul[@id="m-result"]/li//h3').map.with_index do |h3,index|
          a = h3.search('a').first
          {
            :rank => index + 1,
            :href => a['href'],
            :text => a.text.strip,
            :host => Addressable::URI.parse(a['href']).host
          }
        end
      end

      # Ads listed in the "djbox" list.
      #
      # @return [Array<Hash>] hashes with :rank, :text, :href and :host
      def ads_top
        @page.search("//ul[@id='djbox']/li").map.with_index do |li,index|
          parse_qihu_ad(li, index)
        end
      end

      # This parser does not extract any bottom ads.
      #
      # @return [Array] always empty
      def ads_bottom
        []
      end

      # Ads listed in the "rightbox" list.
      #
      # @return [Array<Hash>] hashes with :rank, :text, :href and :host
      def ads_right
        @page.search("//ul[@id='rightbox']/li").map.with_index do |li,index|
          parse_qihu_ad(li, index)
        end
      end

      # This parser does not extract related keywords.
      #
      # @return [Array] always empty
      def related_keywords
        []
      end

      # Number shown in the first span with class "nums", with every
      # non-digit character stripped.
      #
      # @return [Integer, nil] nil when the page has no such span (instead of
      #   raising NoMethodError on nil)
      def count
        nums = @page.search('//span[@class="nums"]').first
        nums.text.gsub(/\D/,'').to_i if nums
      end

      # href of the "next page" link (the anchor with id "snext").
      #
      # @return [String, nil] nil when the page has no such link, e.g. when
      #   there is no further page
      def next_url
        link = @page.xpath('//a[@id="snext"]').first
        link['href'] if link
      end

      # Whether the page reports results.
      #
      # @return [Boolean] false when the main heading contains the
      #   "URL not found" notice
      def has_result?
        !@page.search('//div[@id="main"]/h3').text().include?('没有找到该URL')
      end

      private

      # Builds the hash for one ad list item. The landing URL is taken from
      # the "aurl" query parameter of the anchor's "_cs" attribute. The host
      # is parsed directly with Addressable, as seo_ranks does, because
      # URI.encode no longer exists in Ruby >= 3.0.
      def parse_qihu_ad(li, index)
        a = li.search('a').first
        href = CGI.parse(URI(a['_cs']).query)['aurl'].first
        {
          :rank => index + 1,
          :text => a.text,
          :href => href,
          :host => (href && Addressable::URI.parse(href).host)
        }
      end
    end
  end
end
|