query 0.1.25 → 0.1.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +1 -2
- data/LICENSE +13 -12
- data/README.md +14 -11
- data/lib/query.rb +6 -3
- data/lib/query/engine.rb +0 -7
- data/lib/query/engine/baidu.rb +1 -1
- data/lib/query/engine/baidu_mobile.rb +36 -21
- data/lib/query/engine/qihu_mobile.rb +23 -1
- data/lib/query/engine/sm_mobile.rb +32 -0
- data/lib/query/engine/sogou_mobile.rb +23 -19
- data/lib/query/result.rb +7 -18
- data/lib/query/result/baidu.rb +51 -55
- data/lib/query/result/baidu_mobile.rb +114 -56
- data/lib/query/result/qihu_mobile.rb +48 -3
- data/lib/query/result/sm_mobile.rb +95 -0
- data/lib/query/result/sogou.rb +2 -5
- data/lib/query/result/sogou_mobile.rb +79 -39
- data/query.gemspec +7 -8
- data/spec/mbaidu_spec.rb +62 -0
- data/spec/qihu_mobile_spec.rb +33 -0
- data/spec/samples/mbaidu.html +3 -0
- data/spec/sm_mobile_spec.rb +94 -0
- data/spec/sogou_mobile_spec.rb +0 -2
- data/spec/spec_helper.rb +1 -1
- metadata +40 -31
- data/lib/query/version.rb +0 -3
@@ -1,70 +1,128 @@
|
|
1
1
|
module Query
|
2
|
-
|
3
|
-
|
4
|
-
|
2
|
+
module Result
|
3
|
+
class BaiduMobile
|
4
|
+
include Query::Result
|
5
5
|
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
6
|
+
def seo_ranks
|
7
|
+
s_res = @page.at("//div[@id='results']")
|
8
|
+
@seo_ranks ||= s_res.css("div.result").map.with_index do |seo_div,index|
|
9
|
+
parse_seo(seo_div).merge({:rank => (index + 1) + (@pagenumber -1) * 10})
|
10
|
+
end
|
11
|
+
end
|
11
12
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
13
|
+
def ads_top
|
14
|
+
selector = "//*[@class='result']/preceding-sibling::div[not (contains(@class,'result'))]/div/div/a[not (contains(@href,'http://baozhang.baidu.com/guarantee'))]/.."
|
15
|
+
@ads_top ||= @page.search(selector).map.with_index do |ad_div,index|
|
16
|
+
parse_ad(ad_div).merge({:rank => (index + 1) + (@pagenumber -1) * 10})
|
17
|
+
end
|
18
|
+
end
|
18
19
|
|
19
|
-
|
20
|
-
|
21
|
-
|
20
|
+
def ads_bottom
|
21
|
+
selector = "//*[@class='result']/following-sibling::div[not (contains(@class,'result'))]/div/div/a[not (contains(@href,'http://baozhang.baidu.com/guarantee'))]/.."
|
22
|
+
@ads_bottom ||= @page.search(selector).map.with_index do |ad_div,index|
|
23
|
+
parse_ad(ad_div).merge({:rank => (index + 1) + (@pagenumber -1) * 10})
|
24
|
+
end
|
25
|
+
end
|
22
26
|
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
end
|
27
|
+
#酒店预订 酒店英文 酒店团购 酒店管理 酒店招聘 快捷酒店 如家快捷酒店 五星级酒店
|
28
|
+
def related_keywords
|
29
|
+
@related_keywords ||= @page.search("//div[@id='relativewords']/div[@class='rw-list']/a").map { |a| a.text }
|
30
|
+
end
|
28
31
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
end
|
32
|
+
def html
|
33
|
+
@page.to_html
|
34
|
+
end
|
33
35
|
|
34
|
-
|
35
|
-
|
36
|
-
|
36
|
+
def ads_right
|
37
|
+
[]
|
38
|
+
end
|
37
39
|
|
38
|
-
|
40
|
+
def next_url
|
41
|
+
next_bn = @page.search("//div[@id='pagenav']/a").first
|
42
|
+
url = next_bn.nil? ? "/s?#{@baseuri.query}&pn=#{@pagenumber*10}" : next_bn['href']
|
43
|
+
url
|
44
|
+
end
|
39
45
|
|
40
|
-
|
46
|
+
def count
|
41
47
|
|
42
|
-
|
43
|
-
def parse_ad(div)
|
44
|
-
url = div.search("span[@class='ec_site']").first.text
|
45
|
-
url = "http://#{url}"
|
46
|
-
{
|
47
|
-
:text => div.search('a/text()').text.strip,
|
48
|
-
:href => div.search('a').first['href'],
|
49
|
-
:host => Addressable::URI.parse(URI.encode(url)).host
|
50
|
-
}
|
51
|
-
end
|
48
|
+
end
|
52
49
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
50
|
+
private
|
51
|
+
def parse_ad(ad_div)
|
52
|
+
begin
|
53
|
+
title_link = ad_div.search('a')[0]
|
54
|
+
url = ad_div.search('link')
|
55
|
+
if url.empty?
|
56
|
+
url = ad_div.search(".//span[contains(text(),'.com')]")[0]
|
57
|
+
url = url.nil? ? "http://m.baidu.com" : "http://#{url.text.strip}"
|
58
|
+
title = title_link.text
|
59
|
+
else
|
60
|
+
url = url[0]['href']
|
61
|
+
title = title_link.xpath("./text() | ./em").text
|
62
|
+
end
|
63
|
+
{
|
64
|
+
:text => title.gsub(/\n|\s/,''),
|
65
|
+
:href => title_link['href'],
|
66
|
+
:host => Addressable::URI.parse(URI.encode(url)).host
|
67
|
+
}
|
68
|
+
rescue Exception => e
|
69
|
+
warn "Error in parse_seo method : " + e.message
|
70
|
+
{}
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def parse_seo(seo_div)
|
75
|
+
begin
|
76
|
+
title_link = seo_div.search('a')[0]
|
77
|
+
href = title_link['href']
|
78
|
+
href = href[/m.baidu.com/] ? href : "http://m.baidu.com#{href}"
|
79
|
+
if seo_div['class']=='result'
|
80
|
+
host, is_vr = seo_div.search(".//*[@class='site']")[0], false
|
81
|
+
host = host.nil? ? find_host(seo_div) : host.text.split[0]
|
82
|
+
elsif seo_div['srcid']=='map'
|
83
|
+
is_vr, host = true, 'map.baidu.com'
|
84
|
+
elsif seo_div['tpl'] and seo_div['data-log']
|
85
|
+
url = JSON.parse(seo_div['data-log'].gsub("'",'"'))['mu']
|
86
|
+
if url==''
|
87
|
+
host = find_host(seo_div)
|
88
|
+
else
|
89
|
+
host = Addressable::URI.parse(URI.encode(url)).host
|
67
90
|
end
|
91
|
+
is_vr = true
|
92
|
+
else
|
93
|
+
is_vr, host = true, find_host(seo_div)
|
94
|
+
end
|
95
|
+
#is_vr = (is_vr.nil? and !host[/baidu|nuomi/]) ? false : true
|
96
|
+
{
|
97
|
+
:is_vr => false || is_vr,
|
98
|
+
:text => title_link.text.gsub(/\n|\s/,'')[0..30],
|
99
|
+
:href => href,
|
100
|
+
:host => host
|
101
|
+
}
|
102
|
+
rescue Exception => e
|
103
|
+
warn "Error in parse_seo method : " + e.message
|
104
|
+
{}
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
def find_host(node)
|
109
|
+
host = node.search(".//*[name()!='style' and (contains(text(),'.cn') or contains(text(),'com'))]")[0]
|
110
|
+
host.nil? ? 'm.baidu.com' : host.text.split[0]
|
111
|
+
end
|
112
|
+
|
113
|
+
def redirect(url,limit = 10)
|
114
|
+
raise ArgumentError, 'Too many HTTP redirects' if limit == 0
|
115
|
+
response = Net::HTTP.get_response(URI(url))
|
116
|
+
case response
|
117
|
+
when Net::HTTPSuccess then
|
118
|
+
return URI(url).host
|
119
|
+
when Net::HTTPRedirection then
|
120
|
+
location = response['location']
|
121
|
+
redirect(location, limit-1)
|
122
|
+
else
|
123
|
+
return "m.baidu.com"
|
68
124
|
end
|
125
|
+
end
|
69
126
|
end
|
70
|
-
end
|
127
|
+
end
|
128
|
+
end
|
@@ -1,6 +1,51 @@
|
|
1
1
|
module Query
|
2
|
-
|
3
|
-
|
2
|
+
module Result
|
3
|
+
class QihuMobile
|
4
|
+
include Query::Result
|
5
|
+
|
6
|
+
def html
|
7
|
+
@page.to_html
|
8
|
+
end
|
9
|
+
|
10
|
+
def next_url
|
11
|
+
"#{@baseuri.to_s}&pn=#{@pagenumber+1}"
|
12
|
+
end
|
13
|
+
|
14
|
+
def seo_ranks
|
15
|
+
@page.css('div.g-card').map.with_index do |seo_div,index|
|
16
|
+
begin
|
17
|
+
cite = seo_div.at('.//*[@class="res-show-url"]/text()')
|
18
|
+
a = seo_div.at_css('a')
|
19
|
+
if cite
|
20
|
+
cite = cite.to_s.gsub(/ |-/,'')
|
21
|
+
else
|
22
|
+
url = seo_div.at('.//a[contains(@href,"u=")]')
|
23
|
+
if url
|
24
|
+
cite = URI.decode(CGI.parse(URI(URI.encode(url['href'])).query)['u'][0])
|
25
|
+
cite = URI(URI.encode(cite)).host
|
26
|
+
else
|
27
|
+
cite = "m.haosou.com"
|
28
|
+
end
|
29
|
+
end
|
30
|
+
title = seo_div.at_css('h3') || a
|
31
|
+
{
|
32
|
+
:is_vr => seo_div['class']=="g-card r-og-card" ? false : true,
|
33
|
+
:rank => index + 1 + (@pagenumber-1)*10,
|
34
|
+
:href => a['href'],
|
35
|
+
:text => title.text.gsub(/ |\n|\s/,""),
|
36
|
+
:host => cite
|
37
|
+
}
|
38
|
+
rescue Exception => e
|
39
|
+
warn "Error in parse_seo method : " + e.message
|
40
|
+
{}
|
41
|
+
end
|
4
42
|
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def related_keywords
|
46
|
+
@related_keywords ||= @page.search("//div[@class='related-search-b']//a").map{|relwd| relwd.text.gsub(/ |\n|\t/,"")}
|
47
|
+
end
|
48
|
+
|
5
49
|
end
|
6
|
-
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
module Query
|
2
|
+
module Result
|
3
|
+
class SMobile
|
4
|
+
include Query::Result
|
5
|
+
|
6
|
+
def seo_ranks
|
7
|
+
@seo_ranks ||= @page.search("//div[@id='results']/div[@class!='ali_row result card']").map.with_index do |seo_div,index|
|
8
|
+
parse_seo(seo_div).merge({:rank => (index + 1) + (@pagenumber -1) * 10})
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
def ads_top
|
13
|
+
selector = "//div[@id='results']/div[@class='result card'][1]/preceding-sibling::div[@class='ali_row result card']"
|
14
|
+
@ads_top ||= @page.search(selector).map.with_index do |ad_div,index|
|
15
|
+
parse_ad(ad_div).merge({:rank => (index + 1) + (@pagenumber -1) * 10})
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
def ads_bottom
|
20
|
+
selector = "//div[@id='results']/div[@class='result card'][1]/following-sibling::div[@class='ali_row result card']"
|
21
|
+
@ads_bottom ||= @page.search(selector).map.with_index do |ad_div,index|
|
22
|
+
parse_ad(ad_div).merge({:rank => (index + 1) + (@pagenumber -1) * 10})
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
#relative words
|
27
|
+
def related_keywords
|
28
|
+
@related_keywords ||= @page.search("//div[@class='rel-keywords card']/ul/li/a").map { |a| a.text }
|
29
|
+
end
|
30
|
+
|
31
|
+
def html
|
32
|
+
@page.to_html
|
33
|
+
end
|
34
|
+
|
35
|
+
def ads_right
|
36
|
+
[]
|
37
|
+
end
|
38
|
+
|
39
|
+
def next_url
|
40
|
+
"#{@baseuri.to_s}&page=#{@pagenumber+1}"
|
41
|
+
end
|
42
|
+
|
43
|
+
def count
|
44
|
+
|
45
|
+
end
|
46
|
+
|
47
|
+
private
|
48
|
+
def parse_ad(ad_div)
|
49
|
+
begin
|
50
|
+
title_link = ad_div.at_css('a')
|
51
|
+
title = title_link.search('./text()|./em|./span')
|
52
|
+
url = ad_div.search('.//div[@class="host"]/text()').text
|
53
|
+
url = "http://#{url}" if !url[/http:/]
|
54
|
+
{
|
55
|
+
:text => title.text.gsub(/\n|\s/,''),
|
56
|
+
:href => title_link['href'],
|
57
|
+
:host => URI(URI.encode(url.gsub(/ |\n|\t|\s/,""))).host
|
58
|
+
}
|
59
|
+
rescue Exception => e
|
60
|
+
warn "Error in parse_ads method : " + e.message
|
61
|
+
{}
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
def parse_seo(seo_div)
|
66
|
+
begin
|
67
|
+
title_link = seo_div.at('.//a[contains(@href,"http://")]')
|
68
|
+
href = title_link['href']
|
69
|
+
if seo_div['class']=="result card"
|
70
|
+
is_vr = false
|
71
|
+
url = seo_div.search('.//div[@class="host"]/span/text()[matches(.,"\w+.\w+")]', XpathFunctions.new)[0] || href
|
72
|
+
else
|
73
|
+
is_vr, url = true, href
|
74
|
+
end
|
75
|
+
url = "http://#{url}" if !url[/http:/]
|
76
|
+
{
|
77
|
+
:is_vr => is_vr,
|
78
|
+
:text => title_link.text.gsub(/\n|\s/,'')[0..30],
|
79
|
+
:href => href,
|
80
|
+
:host => URI(URI.encode(url.gsub(/ |\n|\t|\s/,""))).host # remove   and whitespace
|
81
|
+
}
|
82
|
+
rescue Exception => e
|
83
|
+
warn "Error in parse_seo method : " + e.message
|
84
|
+
{}
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
class XpathFunctions
|
89
|
+
def matches node_set, regex
|
90
|
+
node_set.find_all {|node| node.to_s[/#{regex}/] }
|
91
|
+
end
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
data/lib/query/result/sogou.rb
CHANGED
@@ -42,11 +42,8 @@ module Query
|
|
42
42
|
end
|
43
43
|
|
44
44
|
def count
|
45
|
-
|
46
|
-
|
47
|
-
return counter_block.text.gsub(/\D/,'').to_i
|
48
|
-
end
|
49
|
-
end
|
45
|
+
node = @page.search("//resnum[@id='scd_num']").first
|
46
|
+
node ? node.text.gsub(/\D/,'').to_i : nil
|
50
47
|
end
|
51
48
|
|
52
49
|
def related_keywords
|
@@ -1,51 +1,91 @@
|
|
1
1
|
require 'cgi'
|
2
2
|
module Query
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
def ads_right
|
13
|
-
[]
|
14
|
-
end
|
3
|
+
module Result
|
4
|
+
class SogouMobile
|
5
|
+
include Query::Result
|
6
|
+
def ads_top
|
7
|
+
@page.search("//div[@class='results']/div[@class='ec_ad_results'][1]/div[@class='ad_result']").map.with_index do |ad_div,index|
|
8
|
+
parse_ad(ad_div).merge({:rank => (@pagenumber-1)*10 + index + 1})
|
9
|
+
end
|
10
|
+
end
|
15
11
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
end
|
20
|
-
end
|
12
|
+
def ads_right
|
13
|
+
[]
|
14
|
+
end
|
21
15
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
:text => a.search('h3').text,
|
28
|
-
:href => href,
|
29
|
-
:host => URI(href).host
|
30
|
-
}
|
31
|
-
end
|
32
|
-
end
|
16
|
+
def ads_bottom
|
17
|
+
@page.search("//div[@class='results']/div[@class='ec_ad_results'][2]/div[@class='ad_result']").map.with_index do |ad_div,index|
|
18
|
+
parse_ad(ad_div).merge({:rank => (@pagenumber-1)*10 + index + 1})
|
19
|
+
end
|
20
|
+
end
|
33
21
|
|
34
|
-
|
35
|
-
|
36
|
-
|
22
|
+
def seo_ranks
|
23
|
+
@seo_rank ||= @page.search("//div[@class='results']/div[@class='result' or @class='vrResult']").map.with_index do |seo_div,index|
|
24
|
+
parse_seo_ranks(seo_div).merge({:rank => (@pagenumber-1)*10 + index + 1})
|
25
|
+
end
|
26
|
+
end
|
37
27
|
|
28
|
+
def next_url
|
29
|
+
"#{@baseuri.to_s}&p=#{@pagenumber+1}"
|
30
|
+
end
|
31
|
+
|
32
|
+
def related_keywords
|
33
|
+
@related_keywords ||= @page.search("div[@class='hint']/ul/li/a").map{|relwd| relwd.text.gsub(/ |\n|\t/,"")}
|
34
|
+
end
|
35
|
+
|
38
36
|
def count
|
39
37
|
end
|
38
|
+
|
39
|
+
def html
|
40
|
+
@page.to_html
|
41
|
+
end
|
40
42
|
|
41
43
|
private
|
42
44
|
def parse_ad(ad_div)
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
45
|
+
begin
|
46
|
+
site = ad_div.search(".//span[@class='exp_tip']/preceding-sibling::span")[0] || ad_div.search(".//div[@class='bd_citeurl']/text()")[0]
|
47
|
+
{
|
48
|
+
:text => ad_div.search('h3')[0].text.gsub(/ |\n|\t/,""),
|
49
|
+
:href => ad_div.search('a')[0]['href'],
|
50
|
+
:host => site.text.strip.downcase
|
51
|
+
}
|
52
|
+
rescue Exception => e
|
53
|
+
warn "Error in parse_ads method : " + e.message
|
54
|
+
{}
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def parse_seo_ranks(seo_div)
|
59
|
+
begin
|
60
|
+
a = seo_div.search(".//a[contains(@href,'url=')]")[0]
|
61
|
+
cite_url = URI.decode(CGI.parse(URI(URI.encode(a['href'])).query)['url'][0])
|
62
|
+
|
63
|
+
if cite_url==""
|
64
|
+
cite_url = seo_div.search(".//div[@class='citeurl']/text()")[0] || "wap.sogou.com"
|
65
|
+
cite_url = "http://#{cite_url.to_s.gsub(/ |-/,'')}"
|
66
|
+
end
|
67
|
+
|
68
|
+
if seo_div['class']=='result'
|
69
|
+
is_vr, title = false, a.search("./text()|./em|./span")
|
70
|
+
else
|
71
|
+
title = seo_div.search(".//h3")[0] || a
|
72
|
+
is_vr = true
|
73
|
+
title.css('script').remove
|
74
|
+
end
|
75
|
+
url = a['href'][/wap.sogou.com\/web/].nil? ? "http://wap.sogou.com/web/#{a['href']}" : a['href']
|
76
|
+
|
77
|
+
{
|
78
|
+
:text => title.text.gsub(/ |\n|\t/,""),
|
79
|
+
:href => url,
|
80
|
+
:host => URI(URI.encode(cite_url)).host,
|
81
|
+
:is_vr => is_vr
|
82
|
+
}
|
83
|
+
rescue Exception => e
|
84
|
+
warn "Error in parse_seo method : " + e.message
|
85
|
+
{}
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
end
|
90
|
+
end
|
51
91
|
end
|