query 0.0.1 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +4 -1
- data/Gemfile +3 -1
- data/README.md +6 -1
- data/lib/query/engine/baidu.rb +12 -8
- data/lib/query/engine/baidu_mobile.rb +4 -4
- data/lib/query/engine/{qihoo.rb → qihu.rb} +8 -3
- data/lib/query/engine/{qihoo_mobile.rb → qihu_mobile.rb} +0 -0
- data/lib/query/engine/sogou.rb +45 -0
- data/lib/query/engine/sogou_mobile.rb +21 -0
- data/lib/query/engine.rb +11 -4
- data/lib/query/result/baidu.rb +57 -91
- data/lib/query/result/baidu_mobile.rb +49 -93
- data/lib/query/result/qihu.rb +66 -0
- data/lib/query/result/{qihoo_mobile.rb → qihu_mobile.rb} +1 -1
- data/lib/query/result/sogou.rb +103 -0
- data/lib/query/result/sogou_mobile.rb +51 -0
- data/lib/query/result.rb +47 -4
- data/lib/query/version.rb +1 -1
- data/lib/query.rb +6 -8
- data/query.gemspec +2 -3
- data/spec/baidu1_spec.rb +157 -0
- data/spec/baidu2_spec.rb +156 -0
- data/spec/mbaidu1_spec.rb +167 -0
- data/spec/msogou_spec.rb +91 -0
- data/spec/qihu_spec.rb +87 -0
- data/spec/samples/baidu1.html +521 -0
- data/spec/samples/baidu2.html +662 -0
- data/spec/samples/mbaidu1.html +2 -0
- data/spec/samples/mbaidu2.html +2 -0
- data/spec/samples/msogou.html +474 -0
- data/spec/samples/qihu.html +506 -0
- data/spec/samples/sogou.html +629 -0
- data/spec/sogou_mobile_spec.rb +86 -0
- data/spec/sogou_spec.rb +107 -0
- data/spec/spec_helper.rb +12 -1
- metadata +56 -31
- data/lib/query/engine/base.rb +0 -16
- data/lib/query/result/base.rb +0 -50
- data/lib/query/result/qihoo.rb +0 -75
- data/spec/baidu_mobile_spec.rb +0 -19
- data/spec/baidu_spec.rb +0 -73
- data/spec/qihoo_spec.rb +0 -27
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e5043b0180a473ab3d213136dfe9ab55ccb4a6d9
|
4
|
+
data.tar.gz: dc0b8b1ee15dc3f4437439712904de92b838520e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: baa8ca09bc7bfd19f1eb3e9dffa24cf0cc28aba704d8671769bc26a79c68f10c01d57df4f76d4e6558ea638fd6c211111ca21d1883675450971256fc8369bc2a
|
7
|
+
data.tar.gz: 804e6685b6d7d49e563318a9150eae1a655c52cb6f6ca9084c7fbb0c908fde92a44025761668abe2b7aa4b21ba31fbb0c9ebcbc24a6d3c61fd2415e5344b9fd4
|
data/.gitignore
CHANGED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -3,22 +3,27 @@ Query
|
|
3
3
|
|
4
4
|
|
5
5
|
#to get the result list by querying "abc"
|
6
|
+
|
6
7
|
Query::Engine::Baidu.new.query("abc").ranks().each do |id,value|
|
7
8
|
puts id,value
|
8
9
|
end
|
9
10
|
|
10
11
|
#to get the result list with host "www.abc.com.cn" by querying "abc"
|
12
|
+
|
11
13
|
Query::Engine::Baidu.new.query("abc").ranks("www.abc.com.cn").each do |id,value|
|
12
14
|
puts id,value
|
13
15
|
end
|
14
16
|
|
15
17
|
#to get the result list with host which fit the regex /com.cn/ by querying "abc"
|
18
|
+
|
16
19
|
Query::Engine::Baidu.new.query("abc").ranks(/com.cn/).each do |id,value|
|
17
20
|
puts id,value
|
18
21
|
end
|
19
22
|
|
20
23
|
# to get the top rank of host "www.abc.com.cn" by querying "abc"
|
24
|
+
|
21
25
|
Query::Engine::Baidu.new.query("abc").rank("www.abc.com.cn")
|
22
26
|
|
23
27
|
TODO:
|
24
|
-
查询结果不多,翻页不存在时的处理,及rspec
|
28
|
+
查询结果不多,翻页不存在时的处理,及rspec
|
29
|
+
增加其他搜索引擎
|
data/lib/query/engine/baidu.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
1
|
module Query
|
2
2
|
module Engine
|
3
|
-
class Baidu
|
3
|
+
class Baidu
|
4
|
+
include Query::Engine
|
4
5
|
BaseUri = 'http://www.baidu.com/s?'
|
6
|
+
Options = {
|
7
|
+
:headers => {"User-Agent" => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11'}
|
8
|
+
}
|
5
9
|
def self.suggestions(wd)
|
6
10
|
require 'json'
|
7
11
|
json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").force_encoding('GBK').encode("UTF-8")
|
@@ -37,25 +41,25 @@ module Query
|
|
37
41
|
return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").include?"boxFlash"
|
38
42
|
end
|
39
43
|
|
40
|
-
def query(wd)
|
44
|
+
def self.query(wd)
|
41
45
|
q = Array.new
|
42
46
|
q << "wd=#{wd}"
|
43
47
|
q << "rn=#{@perpage.to_i}" if @perpage
|
44
48
|
queryStr = q.join("&")
|
45
49
|
#uri = URI.encode((BaseUri + queryStr).encode('GBK'))
|
46
50
|
uri = URI.encode((BaseUri + queryStr))
|
47
|
-
begin
|
51
|
+
# begin
|
48
52
|
# @page = @a.get uri
|
49
|
-
@page = HTTParty.get
|
53
|
+
@page = HTTParty.get(uri,Options)
|
50
54
|
r = Query::Result::Baidu.new(@page)
|
51
55
|
r.baseuri = uri
|
52
56
|
r.pagenumber = 1
|
53
57
|
r.perpage = @perpage
|
54
58
|
r
|
55
|
-
rescue Exception => e
|
56
|
-
|
57
|
-
|
58
|
-
end
|
59
|
+
# rescue Exception => e
|
60
|
+
# warn e.to_s
|
61
|
+
# return false
|
62
|
+
# end
|
59
63
|
=begin
|
60
64
|
query = "#{query}"
|
61
65
|
@uri = BaseUri+URI.encode(query.encode('GBK'))
|
@@ -1,11 +1,11 @@
|
|
1
1
|
module Query
|
2
2
|
module Engine
|
3
|
-
class BaiduMobile
|
3
|
+
class BaiduMobile
|
4
|
+
include Query::Engine
|
4
5
|
BaseUri = 'http://m.baidu.com/s?'
|
5
|
-
|
6
|
-
"User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
|
6
|
+
Options = {
|
7
|
+
:headers => {"User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'}
|
7
8
|
}
|
8
|
-
Options = {:headers => headers}
|
9
9
|
|
10
10
|
#基本查询,相当于从搜索框直接输入关键词查询
|
11
11
|
def query(wd)
|
@@ -1,15 +1,20 @@
|
|
1
1
|
module Query
|
2
2
|
module Engine
|
3
|
-
class
|
3
|
+
class Qihu
|
4
|
+
include Query::Engine
|
4
5
|
Host = 'www.so.com'
|
6
|
+
headers = {
|
7
|
+
"User-Agent" => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11'
|
8
|
+
}
|
9
|
+
Options = {:headers => headers}
|
5
10
|
#基本查询, 相当于在搜索框直接数据关键词查询
|
6
11
|
def query(wd)
|
7
12
|
#用原始路径请求
|
8
13
|
uri = URI.join("http://#{Host}/",URI.encode('s?q='+wd)).to_s
|
9
|
-
page = HTTParty.get(uri)
|
14
|
+
page = HTTParty.get(uri,Options)
|
10
15
|
#如果请求地址被跳转,重新获取当前页的URI,可避免翻页错误
|
11
16
|
uri = URI.join("http://#{Host}/",page.request.path).to_s
|
12
|
-
r = Query::Result::
|
17
|
+
r = Query::Result::Qihu.new(page)
|
13
18
|
r.baseuri = uri
|
14
19
|
r
|
15
20
|
end
|
File without changes
|
@@ -0,0 +1,45 @@
|
|
1
|
+
module Query
|
2
|
+
module Engine
|
3
|
+
class Sogou
|
4
|
+
include Query::Engine
|
5
|
+
BaseUri = 'http://www.sogou.com/web?'
|
6
|
+
Options = {
|
7
|
+
:headers => {"User-Agent" => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11'}
|
8
|
+
}
|
9
|
+
class << self
|
10
|
+
def query(wd)
|
11
|
+
q = []
|
12
|
+
q << "query=#{wd}"
|
13
|
+
uri = URI.encode BaseUri+q.join('&')
|
14
|
+
page = HTTParty.get(uri,Options)
|
15
|
+
r = Query::Result::Sogou.new(page)
|
16
|
+
r.baseuri = uri
|
17
|
+
r.perpage = @perpage
|
18
|
+
r.pagenumber = 1
|
19
|
+
r
|
20
|
+
end
|
21
|
+
|
22
|
+
def suggestions(word)
|
23
|
+
suggestions = HTTParty.get "http://w.sugg.sogou.com/sugg/ajaj_json.jsp?key=#{URI.encode(word)}"
|
24
|
+
suggestions = suggestions.encode('utf-8').scan /#{word}[^"]+/
|
25
|
+
suggestions
|
26
|
+
end
|
27
|
+
|
28
|
+
#site:xxx.yyy.com
|
29
|
+
def pages(host)
|
30
|
+
query("site:#{host}")
|
31
|
+
end
|
32
|
+
|
33
|
+
#domain:xxx.yyy.com/path/file.html
|
34
|
+
def links(uri)
|
35
|
+
query("domain:\"#{uri}\"")
|
36
|
+
end
|
37
|
+
|
38
|
+
#site:xxx.yyy.com inurl:zzz
|
39
|
+
# def pages_with(host,string)
|
40
|
+
# query("site:#{host} inurl:#{string}")
|
41
|
+
# end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Query
|
2
|
+
module Engine
|
3
|
+
class SogouMobile
|
4
|
+
include Query::Engine
|
5
|
+
BaseUri = 'http://wap.sogou.com/web/searchList.jsp'
|
6
|
+
Options = {
|
7
|
+
:headers => {"User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'}
|
8
|
+
}
|
9
|
+
class << self
|
10
|
+
def query(wd)
|
11
|
+
queryStr = "keyword=#{wd}"
|
12
|
+
uri = URI.encode(BaseUri + "?" + queryStr)
|
13
|
+
res = HTTParty.get(uri,Options)
|
14
|
+
r = Query::Result::SogouMobile.new(res)
|
15
|
+
r.baseuri = uri
|
16
|
+
r
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
data/lib/query/engine.rb
CHANGED
@@ -1,10 +1,17 @@
|
|
1
1
|
module Query
|
2
2
|
module Engine
|
3
|
+
attr_accessor :perpage
|
4
|
+
def self.indexed?(url)
|
5
|
+
URI(url)
|
6
|
+
result = query(url)
|
7
|
+
return result.has_result?
|
8
|
+
end
|
3
9
|
end
|
4
10
|
end
|
5
|
-
|
6
|
-
require 'query/engine/base'
|
11
|
+
require 'httparty'
|
7
12
|
require 'query/engine/baidu'
|
8
13
|
require 'query/engine/baidu_mobile'
|
9
|
-
require 'query/engine/
|
10
|
-
require 'query/engine/
|
14
|
+
require 'query/engine/qihu'
|
15
|
+
require 'query/engine/qihu_mobile'
|
16
|
+
require 'query/engine/sogou'
|
17
|
+
require 'query/engine/sogou_mobile'
|
data/lib/query/result/baidu.rb
CHANGED
@@ -1,94 +1,40 @@
|
|
1
1
|
module Query
|
2
2
|
module Result
|
3
|
-
class Baidu
|
4
|
-
|
3
|
+
class Baidu
|
4
|
+
include Query::Result
|
5
|
+
def seo_ranks
|
5
6
|
return @ranks unless @ranks.nil?
|
6
|
-
@
|
7
|
-
|
8
|
-
id = table['id']
|
9
|
-
# if @perpage == 10
|
10
|
-
# id = table['id'][-1,1]
|
11
|
-
# id = '10' if id == '0'
|
12
|
-
# end
|
13
|
-
|
14
|
-
@ranks[id] = Hash.new
|
15
|
-
url = table.search("[@class=\"g\"]").first
|
16
|
-
url = url.text unless url.nil?
|
17
|
-
a = table.search("h3").first
|
18
|
-
next if a.nil?
|
19
|
-
@ranks[id]['text'] = a.text
|
20
|
-
@ranks[id]['href'] = url #a.first['href'].sub('http://www.baidu.com/link?url=','').strip
|
21
|
-
unless url.nil?
|
22
|
-
url = url.strip
|
23
|
-
@ranks[id]['host'] = Addressable::URI.parse(URI.encode("http://#{url}")).host
|
24
|
-
else
|
25
|
-
@ranks[id]['host'] = nil
|
26
|
-
end
|
7
|
+
@page.search("//*[@class='result']|//*[@class='result-op']|//*[@class='result-op c-container']").map.with_index do |table,index|
|
8
|
+
parse_seo(table).merge({:rank => index + 1})
|
27
9
|
end
|
28
|
-
#@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
|
29
|
-
@ranks
|
30
10
|
end
|
31
11
|
|
32
|
-
def ads_bottom
|
33
|
-
return {} if @page.search("//table[@bgcolor='f5f5f5']").empty?
|
34
|
-
return ads_top
|
35
|
-
# p @page.search("//table[@bgcolor='f5f5f5']").empty?
|
36
|
-
end
|
37
12
|
def ads_top
|
38
|
-
|
39
|
-
|
40
|
-
@page.search("//table[@bgcolor='#f5f5f5']").each do |table|
|
41
|
-
id = table['id']
|
42
|
-
next if id.nil?
|
43
|
-
id = id[2,3].to_i.to_s
|
44
|
-
ads[id]= parse_ad(table)
|
13
|
+
@page.search("//*[@class='result']/preceding-sibling::*[contains(@class,'EC_result')]").map.with_index do |div, index|
|
14
|
+
parse_ad(div).merge(:rank => index + 1)
|
45
15
|
end
|
46
|
-
#白色底推广,只有上部分
|
47
|
-
if ads.empty?
|
48
|
-
@page.search("//table").each do |table|
|
49
|
-
id = table['id']
|
50
|
-
next if id.nil? or id.to_i<3000
|
51
|
-
id = id[2,3].to_i.to_s
|
52
|
-
ads[id]= parse_ad(table)
|
53
|
-
end
|
54
|
-
end
|
55
|
-
ads
|
56
16
|
end
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
17
|
+
|
18
|
+
def ads_bottom
|
19
|
+
@page.search("//*[@class='result']/following-sibling::*[contains(@class,'EC_result')]").map.with_index do |div,index|
|
20
|
+
parse_ad(div).merge(:rank => index + 1)
|
21
|
+
end
|
61
22
|
end
|
23
|
+
|
62
24
|
def ads_right
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
ads[id.to_s] = {'title'=>title,'href'=>url,'host'=>url}
|
74
|
-
end
|
25
|
+
@page.search("//div[@id='ec_im_container']/div[@id]").map.with_index do |div,index|
|
26
|
+
a = div.search('a').first
|
27
|
+
url = div.search("*[@class='EC_url']").first.text
|
28
|
+
url = "http://#{url}"
|
29
|
+
{
|
30
|
+
:rank => index + 1,
|
31
|
+
:text => a.text.strip,
|
32
|
+
:href => a['href'].strip,
|
33
|
+
:host => Addressable::URI.parse(URI.encode(url)).host
|
34
|
+
}
|
75
35
|
end
|
76
|
-
ads
|
77
36
|
end
|
78
37
|
|
79
|
-
#return the top rank number from @ranks with the input host
|
80
|
-
# def rank(host)#on base of ranks
|
81
|
-
# ranks.each do |id,line|
|
82
|
-
# id = id.to_i
|
83
|
-
# if host.class == Regexp
|
84
|
-
# return id if line['host'] =~ host
|
85
|
-
# elsif host.class == String
|
86
|
-
# return id if line['host'] == host
|
87
|
-
# end
|
88
|
-
# end
|
89
|
-
# return nil
|
90
|
-
# end
|
91
|
-
|
92
38
|
def count
|
93
39
|
@count ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
|
94
40
|
end
|
@@ -97,25 +43,45 @@ module Query
|
|
97
43
|
@related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
|
98
44
|
end
|
99
45
|
|
100
|
-
def next
|
101
|
-
url = @page.xpath('//a[text()="下一页>"]').first
|
102
|
-
return if url.nil?
|
103
|
-
url = url['href']
|
104
|
-
url = URI.join(@baseuri,url).to_s
|
105
|
-
page = HTTParty.get(url)
|
106
|
-
r = Query::Result::Baidu.new(page)
|
107
|
-
r.baseuri = url
|
108
|
-
r.pagenumber=@pagenumber+1
|
109
|
-
r.perpage=@perpage
|
110
|
-
r
|
111
|
-
|
112
|
-
# @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
|
113
|
-
end
|
114
46
|
def has_result?
|
115
47
|
submit = @page.search('//a[text()="提交网址"]').first
|
116
48
|
return false if submit and submit['href'].include?'sitesubmit'
|
117
49
|
return true
|
118
50
|
end
|
51
|
+
|
52
|
+
def next_url
|
53
|
+
@page.search("//a[text()='下一页>']").first['href']
|
54
|
+
end
|
55
|
+
|
56
|
+
private
|
57
|
+
def parse_ad(div)
|
58
|
+
#@todo should be :
|
59
|
+
#title = div.xpath("*[contains(@class,'ec_title')]",MyFilter.new).first
|
60
|
+
title = div.xpath("//*[contains(@class,'ec_title')]",MyFilter.new).first
|
61
|
+
url = %w( span[@class='ec_url'] a[@class='EC_url'] ).map do |xpath|
|
62
|
+
node = div.search(xpath).first
|
63
|
+
node.text if node
|
64
|
+
end.compact.first
|
65
|
+
url = "http://" + url
|
66
|
+
{
|
67
|
+
:text => title.text,
|
68
|
+
:href => title['href'],
|
69
|
+
:host => Addressable::URI.parse(URI.encode(url)).host
|
70
|
+
}
|
71
|
+
end
|
72
|
+
|
73
|
+
def parse_seo(table)
|
74
|
+
url = %w( span[@class="g"] span[@class="c-showurl"] div[@class="op_zhidao_showurl"]).map do |xpath|
|
75
|
+
span = table.search(xpath).first
|
76
|
+
span.text.sub(/\d{4}-\d{1,2}-\d{1,2}/,'').strip if span
|
77
|
+
end.compact.first
|
78
|
+
host = Addressable::URI.parse(URI.encode("http://#{url}")).host
|
79
|
+
{
|
80
|
+
:text => table.search("h3").first.text.strip,
|
81
|
+
:href => table.search('a').first['href'].strip,
|
82
|
+
:host => host
|
83
|
+
}
|
84
|
+
end
|
119
85
|
end
|
120
86
|
end
|
121
87
|
end
|
@@ -1,113 +1,69 @@
|
|
1
1
|
module Query
|
2
2
|
module Result
|
3
|
-
class BaiduMobile
|
4
|
-
|
5
|
-
def ranks
|
6
|
-
#如果已经赋值说明解析过,不需要重新解析,直接返回结果
|
7
|
-
return @ranks unless @ranks.nil?
|
8
|
-
@ranks = Hash.new
|
9
|
-
@page.xpath('//div[@class="result"]').each do |result|
|
10
|
-
href,text,host,is_mobile = '','','',false
|
11
|
-
a = result.search("a").first
|
12
|
-
is_mobile = true unless a.search("img").empty?
|
13
|
-
host = result.search('[@class="site"]').first
|
14
|
-
next if host.nil?
|
15
|
-
host = host.text
|
16
|
-
href = a['href']
|
17
|
-
text = a.text
|
18
|
-
id = href.scan(/&order=(\d+)&/)
|
19
|
-
if id.empty?
|
20
|
-
id = nil
|
21
|
-
else
|
22
|
-
id = id.first.first.to_i
|
23
|
-
# id = (@pagenumber-1)*10+id
|
24
|
-
end
|
25
|
-
=begin
|
26
|
-
result.children.each do |elem|
|
27
|
-
if elem.name == 'a'
|
28
|
-
href = elem['href']
|
29
|
-
id = elem.text.match(/^\d+/).to_s.to_i
|
30
|
-
text = elem.text.sub(/^\d+/,'')
|
31
|
-
text.sub!(/^\u00A0/,'')
|
32
|
-
elsif elem['class'] == 'abs'
|
33
|
-
elem.children.each do |elem2|
|
34
|
-
if elem2['class'] == 'site'
|
35
|
-
host = elem2.text
|
36
|
-
break
|
37
|
-
end
|
38
|
-
end
|
39
|
-
elsif elem['class'] == 'site'
|
40
|
-
host == elem['href']
|
41
|
-
end
|
42
|
-
end
|
43
|
-
=end
|
3
|
+
class BaiduMobile
|
4
|
+
include Query::Result
|
44
5
|
|
45
|
-
|
6
|
+
def seo_ranks
|
7
|
+
@seo_ranks ||= @page.search("//*[@class='result']|//*[@class='card-result wa-ue-card-result']|//*[@class='result card-result wma-card-box']").map.with_index do |div,index|
|
8
|
+
parse_seo(div).merge({:rank => index + 1})
|
46
9
|
end
|
47
|
-
@ranks
|
48
10
|
end
|
11
|
+
|
49
12
|
def ads_top
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
id += 1
|
54
|
-
href = div.search("span[@class='ec_site']").first.text
|
55
|
-
href = "http://#{href}"
|
56
|
-
title = div.search("a/text()").text.strip
|
57
|
-
host = Addressable::URI.parse(URI.encode(href)).host
|
58
|
-
result[id] = {'title'=>title,'href'=>href,'host'=>host}
|
13
|
+
@ads_top ||= @page.search("//*[@class='result']/preceding-sibling::div[@class='ec_wise_ad']/div").map.with_index do |div,index|
|
14
|
+
puts index
|
15
|
+
parse_ad(div).merge({:rank => index + 1})
|
59
16
|
end
|
60
|
-
result
|
61
17
|
end
|
18
|
+
|
62
19
|
def ads_right
|
63
20
|
[]
|
64
21
|
end
|
22
|
+
|
65
23
|
def ads_bottom
|
66
|
-
[]
|
24
|
+
@ads_bottom ||= @page.search("//*[@class='result']/following-sibling::div[@class='ec_wise_ad']/div/div").map.with_index do |div,index|
|
25
|
+
parse_ad(div).merge({:rank => index + 1})
|
26
|
+
end
|
67
27
|
end
|
28
|
+
|
29
|
+
#酒店预订 酒店英文 酒店团购 酒店管理 酒店招聘 快捷酒店 如家快捷酒店 五星级酒店
|
68
30
|
def related_keywords
|
69
|
-
@related_keywords ||= @page.search("div[@class='
|
31
|
+
@related_keywords ||= @page.search("div[@class='rw-list']/a").map{|a|a.text}
|
70
32
|
end
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
host_ranks = Hash.new
|
75
|
-
ranks.each do |id,line|
|
76
|
-
if specific_host.class == Regexp
|
77
|
-
host_ranks[id] = line if line['host'] =~ specific_host
|
78
|
-
elsif specific_host.class == String
|
79
|
-
host_ranks[id] = line if line['host'] == specific_host
|
80
|
-
end
|
81
|
-
end
|
82
|
-
host_ranks
|
33
|
+
|
34
|
+
def next_url
|
35
|
+
@next_url ||= @page.xpath('//a[contains(text(),"下一页")]').first['href']
|
83
36
|
end
|
84
|
-
|
85
|
-
def
|
86
|
-
|
87
|
-
id = id.to_i
|
88
|
-
if host.class == Regexp
|
89
|
-
return id if line['host'] =~ host
|
90
|
-
elsif host.class == String
|
91
|
-
return id if line['host'] == host
|
92
|
-
end
|
93
|
-
end
|
94
|
-
return nil
|
37
|
+
|
38
|
+
def count
|
39
|
+
|
95
40
|
end
|
96
|
-
|
97
|
-
|
98
|
-
def
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
41
|
+
|
42
|
+
private
|
43
|
+
def parse_ad(div)
|
44
|
+
url = div.search("span[@class='ec_site']").first.text
|
45
|
+
url = "http://#{url}"
|
46
|
+
{
|
47
|
+
:text => div.search('a/text()').text.strip,
|
48
|
+
:href => div.search('a').first['href'],
|
49
|
+
:host => Addressable::URI.parse(URI.encode(url)).host
|
50
|
+
}
|
51
|
+
end
|
52
|
+
|
53
|
+
def parse_seo(div)
|
54
|
+
a = div.search('a').first
|
55
|
+
if div['class'] == 'card-result wa-ue-card-result'
|
56
|
+
host = div.search("*[@class='wa-hotelgeneral-gray wa-hotelgeneral-info-sub-title']").text
|
57
|
+
elsif div['class'] == 'result card-result wma-card-box' and div['srcid'] == 'map'
|
58
|
+
host = 'map.baidu.com'
|
59
|
+
else
|
60
|
+
host = div.search("*[@class='site']").first.text
|
61
|
+
end
|
62
|
+
{
|
63
|
+
:text => a.text,
|
64
|
+
:href => a['href'],
|
65
|
+
:host => host
|
66
|
+
}
|
111
67
|
end
|
112
68
|
end
|
113
69
|
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
module Query
|
2
|
+
module Result
|
3
|
+
class Qihu
|
4
|
+
include Query::Result
|
5
|
+
def seo_ranks
|
6
|
+
@page.search('//ul[@id="m-result"]/li//h3').map.with_index do |h3,index|
|
7
|
+
a = h3.search('a').first
|
8
|
+
{
|
9
|
+
:rank => index + 1,
|
10
|
+
:href => a['href'],
|
11
|
+
:text => a.text.strip,
|
12
|
+
:host => Addressable::URI.parse(a['href']).host
|
13
|
+
}
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def ads_top
|
18
|
+
@page.search("//ul[@id='djbox']/li").map.with_index do |li,index|
|
19
|
+
a = li.search("a").first
|
20
|
+
href = CGI.parse(URI(a['_cs']).query)['aurl'].first
|
21
|
+
{
|
22
|
+
:rank => index + 1,
|
23
|
+
:text => a.text,
|
24
|
+
:href => href,
|
25
|
+
:host => Addressable::URI.parse(URI.encode(href)).host
|
26
|
+
}
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def ads_bottom
|
31
|
+
[]
|
32
|
+
end
|
33
|
+
|
34
|
+
def ads_right
|
35
|
+
@page.search("//ul[@id='rightbox']/li").map.with_index do |li,index|
|
36
|
+
a = li.search('a').first
|
37
|
+
href = CGI.parse(URI(a['_cs']).query)['aurl'].first
|
38
|
+
host = Addressable::URI.parse(URI.encode(href)).host
|
39
|
+
{
|
40
|
+
:rank => index + 1,
|
41
|
+
:text => a.text,
|
42
|
+
:href => href,
|
43
|
+
:host => host
|
44
|
+
}
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def related_keywords
|
49
|
+
[]
|
50
|
+
end
|
51
|
+
|
52
|
+
def count
|
53
|
+
@page.search('//span[@class="nums"]').first.text.gsub(/\D/,'').to_i
|
54
|
+
end
|
55
|
+
|
56
|
+
#下一页
|
57
|
+
def next_url
|
58
|
+
next_href = @page.xpath('//a[@id="snext"]').first['href']
|
59
|
+
end
|
60
|
+
#有结果
|
61
|
+
def has_result?
|
62
|
+
!@page.search('//div[@id="main"]/h3').text().include?'没有找到该URL'
|
63
|
+
end
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|