query 0.1.25 → 0.1.28
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +1 -2
- data/LICENSE +13 -12
- data/README.md +14 -11
- data/lib/query.rb +6 -3
- data/lib/query/engine.rb +0 -7
- data/lib/query/engine/baidu.rb +1 -1
- data/lib/query/engine/baidu_mobile.rb +36 -21
- data/lib/query/engine/qihu_mobile.rb +23 -1
- data/lib/query/engine/sm_mobile.rb +32 -0
- data/lib/query/engine/sogou_mobile.rb +23 -19
- data/lib/query/result.rb +7 -18
- data/lib/query/result/baidu.rb +51 -55
- data/lib/query/result/baidu_mobile.rb +114 -56
- data/lib/query/result/qihu_mobile.rb +48 -3
- data/lib/query/result/sm_mobile.rb +95 -0
- data/lib/query/result/sogou.rb +2 -5
- data/lib/query/result/sogou_mobile.rb +79 -39
- data/query.gemspec +7 -8
- data/spec/mbaidu_spec.rb +62 -0
- data/spec/qihu_mobile_spec.rb +33 -0
- data/spec/samples/mbaidu.html +3 -0
- data/spec/sm_mobile_spec.rb +94 -0
- data/spec/sogou_mobile_spec.rb +0 -2
- data/spec/spec_helper.rb +1 -1
- metadata +40 -31
- data/lib/query/version.rb +0 -3
@@ -1,70 +1,128 @@
|
|
1
1
|
module Query
  module Result
    # Parser for a single Baidu mobile (m.baidu.com) search result page.
    #
    # Reads @page (queried with Nokogiri-style #at/#search/#css), @pagenumber
    # (1-based, 10 results per page in the rank arithmetic) and @baseuri
    # (must respond to #query). None of them is assigned here; presumably they
    # are set by the Query::Result mixin or the engine -- confirm at the caller.
    class BaiduMobile
      include Query::Result

      # Prefix used to turn relative result links into absolute ones.
      BAIDU_MOBILE_ROOT = 'http://m.baidu.com'

      # Organic results of this page.
      #
      # @return [Array<Hash>] one hash per "div.result" with :is_vr, :text,
      #   :href, :host and :rank; a row that failed to parse only carries :rank.
      #   Empty when the page has no "results" container.
      def seo_ranks
        @seo_ranks ||= begin
          container = @page.at("//div[@id='results']")
          # A page without the container (captcha, error page) has no results.
          nodes = container ? container.css('div.result') : []
          nodes.map.with_index do |seo_div, index|
            parse_seo(seo_div).merge(:rank => page_rank(index))
          end
        end
      end

      # Ads located before the first organic result.
      #
      # @return [Array<Hash>] hashes with :text, :href, :host and :rank
      def ads_top
        selector = "//*[@class='result']/preceding-sibling::div[not (contains(@class,'result'))]/div/div/a[not (contains(@href,'http://baozhang.baidu.com/guarantee'))]/.."
        @ads_top ||= ranked_ads(selector)
      end

      # Ads located after the organic results.
      #
      # @return [Array<Hash>] hashes with :text, :href, :host and :rank
      def ads_bottom
        selector = "//*[@class='result']/following-sibling::div[not (contains(@class,'result'))]/div/div/a[not (contains(@href,'http://baozhang.baidu.com/guarantee'))]/.."
        @ads_bottom ||= ranked_ads(selector)
      end

      # Related search suggestions, e.g. for a hotel query: hotel booking,
      # hotel in English, hotel group-buying, hotel management, hotel jobs,
      # budget hotels, Home Inn budget hotels, five-star hotels.
      #
      # @return [Array<String>]
      def related_keywords
        @related_keywords ||= @page.search("//div[@id='relativewords']/div[@class='rw-list']/a").map { |a| a.text }
      end

      # @return [String] the page serialized back to HTML
      def html
        @page.to_html
      end

      # This parser extracts no right-hand ads.
      #
      # @return [Array] always empty
      def ads_right
        []
      end

      # Link to the next result page: the first anchor of the page navigation
      # when present, otherwise a URL built from the base query and an offset.
      #
      # @return [String]
      def next_url
        next_link = @page.search("//div[@id='pagenav']/a").first
        return next_link['href'] if next_link

        "/s?#{@baseuri.query}&pn=#{@pagenumber * 10}"
      end

      # Total result count is not extracted by this parser.
      #
      # @return [nil]
      def count
      end

      private

      # Absolute rank of the result at +index+ (0-based) on the current page.
      def page_rank(index)
        (index + 1) + (@pagenumber - 1) * 10
      end

      # Parses every ad node matched by +selector+ and attaches its rank.
      def ranked_ads(selector)
        @page.search(selector).map.with_index do |ad_div, index|
          parse_ad(ad_div).merge(:rank => page_rank(index))
        end
      end

      # Percent-escapes +url+ the way the removed URI.encode did. URI.encode is
      # gone since Ruby 3.0, so the RFC 2396 parser it delegated to is used.
      def escape_url(url)
        parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
        parser.escape(url)
      end

      # Host part of +url+ (may contain non-ASCII characters).
      def host_of(url)
        Addressable::URI.parse(escape_url(url)).host
      end

      # Relative links are prefixed with the Baidu mobile root. Links that
      # already point at m.baidu.com or are absolute http(s) URLs are kept.
      def absolute_href(href)
        return href if href.include?('m.baidu.com') || href =~ %r{\Ahttps?://}

        "#{BAIDU_MOBILE_ROOT}#{href}"
      end

      # Extracts title, link and host of one ad node.
      #
      # @return [Hash] :text, :href, :host -- or {} (with a warning on stderr)
      #   when the node does not have the expected shape
      def parse_ad(ad_div)
        title_link = ad_div.search('a')[0]
        links = ad_div.search('link')
        if links.empty?
          # No <link> element: fall back to a span that shows a ".com" URL.
          site = ad_div.search(".//span[contains(text(),'.com')]")[0]
          url = site.nil? ? BAIDU_MOBILE_ROOT : "http://#{site.text.strip}"
          title = title_link.text
        else
          url = links[0]['href']
          title = title_link.xpath("./text() | ./em").text
        end
        {
          :text => title.gsub(/\n|\s/, ''),
          :href => title_link['href'],
          :host => host_of(url)
        }
      rescue StandardError => e
        warn "Error in parse_ad method : " + e.message
        {}
      end

      # Extracts one organic result. Plain results (class exactly "result")
      # are flagged :is_vr => false; every other card is flagged true.
      #
      # @return [Hash] :is_vr, :text (first 31 characters, whitespace removed),
      #   :href, :host -- or {} (with a warning on stderr) on failure
      def parse_seo(seo_div)
        title_link = seo_div.search('a')[0]
        href = absolute_href(title_link['href'])
        if seo_div['class'] == 'result'
          is_vr = false
          site = seo_div.search(".//*[@class='site']")[0]
          host = site.nil? ? find_host(seo_div) : site.text.split[0]
        elsif seo_div['srcid'] == 'map'
          is_vr = true
          host = 'map.baidu.com'
        elsif seo_div['tpl'] && seo_div['data-log']
          is_vr = true
          # data-log holds a single-quoted JSON-like object; 'mu' is read as the target URL.
          url = JSON.parse(seo_div['data-log'].gsub("'", '"'))['mu']
          host = url.to_s.empty? ? find_host(seo_div) : host_of(url)
        else
          is_vr = true
          host = find_host(seo_div)
        end
        {
          :is_vr => is_vr,
          :text => title_link.text.gsub(/\n|\s/, '')[0..30],
          :href => href,
          :host => host
        }
      rescue StandardError => e
        warn "Error in parse_seo method : " + e.message
        {}
      end

      # First whitespace-separated token of the first non-<style> element whose
      # text contains ".cn" or "com"; 'm.baidu.com' when there is none.
      def find_host(node)
        host = node.search(".//*[name()!='style' and (contains(text(),'.cn') or contains(text(),'com'))]")[0]
        host.nil? ? 'm.baidu.com' : host.text.split[0]
      end

      # Follows HTTP redirects starting at +url+ and returns the final host.
      #
      # @param url [String] absolute URL to request
      # @param limit [Integer] remaining number of hops
      # @return [String] host of the URL that answered 2xx, or "m.baidu.com"
      #   for any other non-redirect response
      # @raise [ArgumentError] when +limit+ reaches 0
      def redirect(url, limit = 10)
        raise ArgumentError, 'Too many HTTP redirects' if limit == 0

        uri = URI(url)
        response = Net::HTTP.get_response(uri)
        case response
        when Net::HTTPSuccess
          uri.host
        when Net::HTTPRedirection
          # Location may be relative; resolve it against the current URL.
          redirect(URI.join(url, response['location']).to_s, limit - 1)
        else
          'm.baidu.com'
        end
      end
    end
  end
end
|
@@ -1,6 +1,51 @@
|
|
1
1
|
module Query
|
2
|
-
|
3
|
-
|
2
|
+
module Result
|
3
|
+
class QihuMobile
|
4
|
+
include Query::Result
|
5
|
+
|
6
|
+
def html
|
7
|
+
@page.to_html
|
8
|
+
end
|
9
|
+
|
10
|
+
def next_url
|
11
|
+
"#{@baseuri.to_s}&pn=#{@pagenumber+1}"
|
12
|
+
end
|
13
|
+
|
14
|
+
def seo_ranks
|
15
|
+
@page.css('div.g-card').map.with_index do |seo_div,index|
|
16
|
+
begin
|
17
|
+
cite = seo_div.at('.//*[@class="res-show-url"]/text()')
|
18
|
+
a = seo_div.at_css('a')
|
19
|
+
if cite
|
20
|
+
cite = cite.to_s.gsub(/ |-/,'')
|
21
|
+
else
|
22
|
+
url = seo_div.at('.//a[contains(@href,"u=")]')
|
23
|
+
if url
|
24
|
+
cite = URI.decode(CGI.parse(URI(URI.encode(url['href'])).query)['u'][0])
|
25
|
+
cite = URI(URI.encode(cite)).host
|
26
|
+
else
|
27
|
+
cite = "m.haosou.com"
|
28
|
+
end
|
29
|
+
end
|
30
|
+
title = seo_div.at_css('h3') || a
|
31
|
+
{
|
32
|
+
:is_vr => seo_div['class']=="g-card r-og-card" ? false : true,
|
33
|
+
:rank => index + 1 + (@pagenumber-1)*10,
|
34
|
+
:href => a['href'],
|
35
|
+
:text => title.text.gsub(/ |\n|\s/,""),
|
36
|
+
:host => cite
|
37
|
+
}
|
38
|
+
rescue Exception => e
|
39
|
+
warn "Error in parse_seo method : " + e.message
|
40
|
+
{}
|
41
|
+
end
|
4
42
|
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def related_keywords
|
46
|
+
@related_keywords ||= @page.search("//div[@class='related-search-b']//a").map{|relwd| relwd.text.gsub(/ |\n|\t/,"")}
|
47
|
+
end
|
48
|
+
|
5
49
|
end
|
6
|
-
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Query
  module Result
    # Parser for a single SM mobile search result page.
    #
    # Reads @page, @pagenumber (1-based) and @baseuri, none of which is
    # assigned here; presumably they are set by the Query::Result mixin or the
    # engine -- confirm at the caller.
    class SMobile
      include Query::Result

      # Organic results: every child of the "results" container whose class is
      # not the ad class "ali_row result card".
      #
      # @return [Array<Hash>] hashes with :is_vr, :text, :href, :host, :rank;
      #   a row that failed to parse only carries :rank
      def seo_ranks
        @seo_ranks ||= @page.search("//div[@id='results']/div[@class!='ali_row result card']").map.with_index do |seo_div, index|
          parse_seo(seo_div).merge(:rank => page_rank(index))
        end
      end

      # Ads placed before the first organic result.
      #
      # @return [Array<Hash>] hashes with :text, :href, :host, :rank
      def ads_top
        selector = "//div[@id='results']/div[@class='result card'][1]/preceding-sibling::div[@class='ali_row result card']"
        @ads_top ||= ranked_ads(selector)
      end

      # Ads placed after the first organic result.
      #
      # @return [Array<Hash>] hashes with :text, :href, :host, :rank
      def ads_bottom
        selector = "//div[@id='results']/div[@class='result card'][1]/following-sibling::div[@class='ali_row result card']"
        @ads_bottom ||= ranked_ads(selector)
      end

      # Related search suggestions.
      #
      # @return [Array<String>]
      def related_keywords
        @related_keywords ||= @page.search("//div[@class='rel-keywords card']/ul/li/a").map { |a| a.text }
      end

      # @return [String] the page serialized back to HTML
      def html
        @page.to_html
      end

      # This parser extracts no right-hand ads.
      #
      # @return [Array] always empty
      def ads_right
        []
      end

      # @return [String] base URI with the next page number appended as "page"
      def next_url
        "#{@baseuri}&page=#{@pagenumber + 1}"
      end

      # Total result count is not extracted by this parser.
      #
      # @return [nil]
      def count
      end

      private

      # Absolute rank of the result at +index+ (0-based) on the current page.
      def page_rank(index)
        (index + 1) + (@pagenumber - 1) * 10
      end

      # Parses every ad node matched by +selector+ and attaches its rank.
      def ranked_ads(selector)
        @page.search(selector).map.with_index do |ad_div, index|
          parse_ad(ad_div).merge(:rank => page_rank(index))
        end
      end

      # Host of +url+ after adding a scheme when none is present and removing
      # all whitespace, including non-breaking spaces (&nbsp;).
      # URI.encode was removed in Ruby 3.0, so the RFC 2396 parser it
      # delegated to is used for escaping.
      def host_of(url)
        url = "http://#{url}" unless url[/https?:/]
        parser = defined?(URI::RFC2396_PARSER) ? URI::RFC2396_PARSER : URI::DEFAULT_PARSER
        URI(parser.escape(url.gsub(/[\s\u00A0]/, ''))).host
      end

      # Extracts title, link and displayed host of one ad node.
      #
      # @return [Hash] :text, :href, :host -- or {} (with a warning on stderr)
      def parse_ad(ad_div)
        title_link = ad_div.at_css('a')
        title = title_link.search('./text()|./em|./span')
        {
          :text => title.text.gsub(/\n|\s/, ''),
          :href => title_link['href'],
          :host => host_of(ad_div.search('.//div[@class="host"]/text()').text)
        }
      rescue StandardError => e
        warn "Error in parse_ad method : " + e.message
        {}
      end

      # Extracts one organic result. Cards with class exactly "result card" are
      # plain results (:is_vr => false) and take their host from the displayed
      # host text when one matches; all other cards use the link target.
      #
      # @return [Hash] :is_vr, :text (first 31 characters, whitespace removed),
      #   :href, :host -- or {} (with a warning on stderr)
      def parse_seo(seo_div)
        title_link = seo_div.at('.//a[contains(@href,"http://")]')
        href = title_link['href']
        if seo_div['class'] == 'result card'
          is_vr = false
          shown = seo_div.search('.//div[@class="host"]/span/text()[matches(.,"\w+.\w+")]', XpathFunctions.new)[0]
          # The match is a text node, not a String; convert before string handling.
          url = (shown || href).to_s
        else
          is_vr = true
          url = href
        end
        {
          :is_vr => is_vr,
          :text => title_link.text.gsub(/\n|\s/, '')[0..30],
          :href => href,
          :host => host_of(url)
        }
      rescue StandardError => e
        warn "Error in parse_seo method : " + e.message
        {}
      end

      # Custom XPath function handler passed to #search; provides matches().
      class XpathFunctions
        # @param node_set [Enumerable] candidate nodes
        # @param regex [String] regular expression source
        # @return [Array] the nodes whose string form matches +regex+
        def matches(node_set, regex)
          pattern = /#{regex}/
          node_set.select { |node| node.to_s[pattern] }
        end
      end
    end
  end
end
|
data/lib/query/result/sogou.rb
CHANGED
@@ -42,11 +42,8 @@ module Query
|
|
42
42
|
end
|
43
43
|
|
44
44
|
# Number of results reported by the page, read from the digits inside the
# <resnum id="scd_num"> element.
#
# @return [Integer, nil] nil when the page has no such element
def count
  counter = @page.search("//resnum[@id='scd_num']").first
  return nil unless counter

  counter.text.gsub(/\D/,'').to_i
end
|
51
48
|
|
52
49
|
def related_keywords
|
@@ -1,51 +1,91 @@
|
|
1
1
|
require 'cgi'
|
2
2
|
module Query
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
def ads_right
|
13
|
-
[]
|
14
|
-
end
|
3
|
+
module Result
|
4
|
+
class SogouMobile
|
5
|
+
include Query::Result
|
6
|
+
def ads_top
|
7
|
+
@page.search("//div[@class='results']/div[@class='ec_ad_results'][1]/div[@class='ad_result']").map.with_index do |ad_div,index|
|
8
|
+
parse_ad(ad_div).merge({:rank => (@pagenumber-1)*10 + index + 1})
|
9
|
+
end
|
10
|
+
end
|
15
11
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
20
|
-
end
|
12
|
+
def ads_right
|
13
|
+
[]
|
14
|
+
end
|
21
15
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
:text => a.search('h3').text,
|
28
|
-
:href => href,
|
29
|
-
:host => URI(href).host
|
30
|
-
}
|
31
|
-
end
|
32
|
-
end
|
16
|
+
def ads_bottom
|
17
|
+
@page.search("//div[@class='results']/div[@class='ec_ad_results'][2]/div[@class='ad_result']").map.with_index do |ad_div,index|
|
18
|
+
parse_ad(ad_div).merge({:rank => (@pagenumber-1)*10 + index + 1})
|
19
|
+
end
|
20
|
+
end
|
33
21
|
|
34
|
-
|
35
|
-
|
36
|
-
|
22
|
+
def seo_ranks
|
23
|
+
@seo_rank ||= @page.search("//div[@class='results']/div[@class='result' or @class='vrResult']").map.with_index do |seo_div,index|
|
24
|
+
parse_seo_ranks(seo_div).merge({:rank => (@pagenumber-1)*10 + index + 1})
|
25
|
+
end
|
26
|
+
end
|
37
27
|
|
28
|
+
def next_url
|
29
|
+
"#{@baseuri.to_s}&p=#{@pagenumber+1}"
|
30
|
+
end
|
31
|
+
|
32
|
+
def related_keywords
|
33
|
+
@related_keywords ||= @page.search("div[@class='hint']/ul/li/a").map{|relwd| relwd.text.gsub(/ |\n|\t/,"")}
|
34
|
+
end
|
35
|
+
|
38
36
|
def count
|
39
37
|
end
|
38
|
+
|
39
|
+
def html
|
40
|
+
@page.to_html
|
41
|
+
end
|
40
42
|
|
41
43
|
private
|
42
44
|
def parse_ad(ad_div)
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
45
|
+
begin
|
46
|
+
site = ad_div.search(".//span[@class='exp_tip']/preceding-sibling::span")[0] || ad_div.search(".//div[@class='bd_citeurl']/text()")[0]
|
47
|
+
{
|
48
|
+
:text => ad_div.search('h3')[0].text.gsub(/ |\n|\t/,""),
|
49
|
+
:href => ad_div.search('a')[0]['href'],
|
50
|
+
:host => site.text.strip.downcase
|
51
|
+
}
|
52
|
+
rescue Exception => e
|
53
|
+
warn "Error in parse_ads method : " + e.message
|
54
|
+
{}
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def parse_seo_ranks(seo_div)
|
59
|
+
begin
|
60
|
+
a = seo_div.search(".//a[contains(@href,'url=')]")[0]
|
61
|
+
cite_url = URI.decode(CGI.parse(URI(URI.encode(a['href'])).query)['url'][0])
|
62
|
+
|
63
|
+
if cite_url==""
|
64
|
+
cite_url = seo_div.search(".//div[@class='citeurl']/text()")[0] || "wap.sogou.com"
|
65
|
+
cite_url = "http://#{cite_url.to_s.gsub(/ |-/,'')}"
|
66
|
+
end
|
67
|
+
|
68
|
+
if seo_div['class']=='result'
|
69
|
+
is_vr, title = false, a.search("./text()|./em|./span")
|
70
|
+
else
|
71
|
+
title = seo_div.search(".//h3")[0] || a
|
72
|
+
is_vr = true
|
73
|
+
title.css('script').remove
|
74
|
+
end
|
75
|
+
url = a['href'][/wap.sogou.com\/web/].nil? ? "http://wap.sogou.com/web/#{a['href']}" : a['href']
|
76
|
+
|
77
|
+
{
|
78
|
+
:text => title.text.gsub(/ |\n|\t/,""),
|
79
|
+
:href => url,
|
80
|
+
:host => URI(URI.encode(cite_url)).host,
|
81
|
+
:is_vr => is_vr
|
82
|
+
}
|
83
|
+
rescue Exception => e
|
84
|
+
warn "Error in parse_seo method : " + e.message
|
85
|
+
{}
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
end
|
90
|
+
end
|
51
91
|
end
|