query 0.1.25 → 0.1.28

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,70 +1,128 @@
module Query
  module Result
    # Parser for a Baidu mobile search result page.
    #
    # Reads @page, @pagenumber and @baseuri; none of them is assigned in this
    # class (presumably they are set by Query::Result -- confirm there).
    # @page is used through search/at/css/to_html, i.e. a Nokogiri-style
    # document.
    class BaiduMobile
      include Query::Result

      # Organic results: every `div.result` inside `div#results`.
      #
      # @return [Array<Hash>] one hash per result with :rank, :is_vr, :text,
      #   :href and :host; a result that failed to parse carries only :rank.
      #   Empty when the page has no `div#results` container.
      def seo_ranks
        @seo_ranks ||= begin
          container = @page.at("//div[@id='results']")
          rows = container ? container.css("div.result") : []
          rows.map.with_index do |seo_div, index|
            parse_seo(seo_div).merge(:rank => rank_for(index))
          end
        end
      end

      # Ads placed before the organic results.
      #
      # @return [Array<Hash>] hashes with :rank, :text, :href and :host
      def ads_top
        selector = "//*[@class='result']/preceding-sibling::div[not (contains(@class,'result'))]/div/div/a[not (contains(@href,'http://baozhang.baidu.com/guarantee'))]/.."
        @ads_top ||= ads_for(selector)
      end

      # Ads placed after the organic results.
      #
      # @return [Array<Hash>] hashes with :rank, :text, :href and :host
      def ads_bottom
        selector = "//*[@class='result']/following-sibling::div[not (contains(@class,'result'))]/div/div/a[not (contains(@href,'http://baozhang.baidu.com/guarantee'))]/.."
        @ads_bottom ||= ads_for(selector)
      end

      # Texts of the related-search links.
      # Example values: hotel booking, hotel in English, hotel group buying,
      # hotel management, hotel recruitment, budget hotel, Home Inn budget
      # hotel, five-star hotel.
      #
      # @return [Array<String>]
      def related_keywords
        @related_keywords ||= @page.search("//div[@id='relativewords']/div[@class='rw-list']/a").map { |a| a.text }
      end

      # @return [String] the page serialized with to_html
      def html
        @page.to_html
      end

      # This parser reports no right-hand ads.
      #
      # @return [Array] always empty
      def ads_right
        []
      end

      # URL of the next result page: the href of the first link in
      # `div#pagenav`, or a relative URL built from the base query when the
      # page has no such link.
      #
      # @return [String]
      def next_url
        next_bn = @page.search("//div[@id='pagenav']/a").first
        next_bn.nil? ? "/s?#{@baseuri.query}&pn=#{@pagenumber*10}" : next_bn['href']
      end

      # Not implemented for this engine.
      #
      # @return [nil]
      def count
      end

      private

      # Absolute rank of the result at +index+ (0-based) on the current page,
      # counting ten results per page.
      def rank_for(index)
        (index + 1) + (@pagenumber - 1) * 10
      end

      # Parses every ad node matched by +selector+ and attaches its rank.
      def ads_for(selector)
        @page.search(selector).map.with_index do |ad_div, index|
          parse_ad(ad_div).merge(:rank => rank_for(index))
        end
      end

      # Extracts title, link and host from one ad node.
      #
      # @param ad_div [#search] the ad container node
      # @return [Hash] :text, :href and :host, or {} when parsing fails
      def parse_ad(ad_div)
        title_link = ad_div.search('a')[0]
        links = ad_div.search('link')
        if links.empty?
          # No <link> element: fall back to a displayed ".com" span.
          shown = ad_div.search(".//span[contains(text(),'.com')]")[0]
          url = shown.nil? ? "http://m.baidu.com" : "http://#{shown.text.strip}"
          title = title_link.text
        else
          url = links[0]['href']
          title = title_link.xpath("./text() | ./em").text
        end
        {
          :text => title.gsub(/\s/, ''),
          :href => title_link['href'],
          # URI.encode was removed in Ruby 3.0; the default parser's escape
          # is the same routine and exists on old and new Rubies.
          :host => Addressable::URI.parse(URI::DEFAULT_PARSER.escape(url)).host
        }
      rescue StandardError => e
        # StandardError only: Interrupt/SystemExit must not be swallowed.
        warn "Error in parse_ad method : " + e.message
        {}
      end

      # Extracts title, link, host and the :is_vr flag from one organic
      # result node. :is_vr is false only for nodes whose class attribute is
      # exactly "result".
      #
      # @param seo_div [#search, #[]] the result container node
      # @return [Hash] :is_vr, :text, :href and :host, or {} when parsing fails
      def parse_seo(seo_div)
        title_link = seo_div.search('a')[0]
        href = title_link['href']
        # Links that do not mention the Baidu host are site-relative.
        href = "http://m.baidu.com#{href}" unless href.include?('m.baidu.com')
        if seo_div['class'] == 'result'
          is_vr = false
          site = seo_div.search(".//*[@class='site']")[0]
          host = site.nil? ? find_host(seo_div) : site.text.split[0]
        elsif seo_div['srcid'] == 'map'
          is_vr = true
          host = 'map.baidu.com'
        elsif seo_div['tpl'] && seo_div['data-log']
          is_vr = true
          # data-log is a single-quoted JSON-like string; 'mu' holds the URL.
          url = JSON.parse(seo_div['data-log'].gsub("'", '"'))['mu']
          host = if url.to_s.empty?
                   find_host(seo_div)
                 else
                   Addressable::URI.parse(URI::DEFAULT_PARSER.escape(url)).host
                 end
        else
          is_vr = true
          host = find_host(seo_div)
        end
        {
          :is_vr => is_vr,
          :text => title_link.text.gsub(/\s/, '')[0..30],
          :href => href,
          :host => host
        }
      rescue StandardError => e
        warn "Error in parse_seo method : " + e.message
        {}
      end

      # First whitespace-separated token of the first non-style element
      # whose text contains ".cn" or "com"; 'm.baidu.com' when none exists.
      #
      # @param node [#search]
      # @return [String]
      def find_host(node)
        host = node.search(".//*[name()!='style' and (contains(text(),'.cn') or contains(text(),'com'))]")[0]
        host.nil? ? 'm.baidu.com' : host.text.split[0]
      end

      # Follows HTTP redirects starting at +url+ and returns the host of the
      # first URL answered with a success status, or "m.baidu.com" for any
      # other non-redirect response. Performs network requests.
      #
      # @param url [String]
      # @param limit [Integer] remaining redirects allowed
      # @raise [ArgumentError] when the redirect budget is used up
      # @return [String]
      def redirect(url, limit = 10)
        raise ArgumentError, 'Too many HTTP redirects' if limit == 0
        response = Net::HTTP.get_response(URI(url))
        case response
        when Net::HTTPSuccess then URI(url).host
        when Net::HTTPRedirection then redirect(response['location'], limit - 1)
        else "m.baidu.com"
        end
      end
    end
  end
end
@@ -1,6 +1,51 @@
module Query
  module Result
    # Parser for a Qihu (haosou) mobile search result page.
    #
    # Reads @page, @pagenumber and @baseuri; none of them is assigned in this
    # class (presumably they are set by Query::Result -- confirm there).
    class QihuMobile
      include Query::Result

      # @return [String] the page serialized with to_html
      def html
        @page.to_html
      end

      # URL of the next result page: the base URI with a `pn` parameter
      # appended.
      #
      # @return [String]
      def next_url
        "#{@baseuri.to_s}&pn=#{@pagenumber+1}"
      end

      # Organic results: every `div.g-card` on the page.
      #
      # @return [Array<Hash>] hashes with :is_vr, :rank, :href, :text and
      #   :host; a card that failed to parse yields {}
      def seo_ranks
        @seo_ranks ||= @page.css('div.g-card').map.with_index do |seo_div, index|
          parse_seo(seo_div, index)
        end
      end

      # Texts of the related-search links with blanks, newlines and tabs
      # removed.
      #
      # @return [Array<String>]
      def related_keywords
        # NOTE(review): the blank in the original pattern may have been a
        # non-breaking space; both kinds are stripped here.
        @related_keywords ||= @page.search("//div[@class='related-search-b']//a").map { |relwd| relwd.text.gsub(/[ \u00A0\n\t]/, "") }
      end

      private

      # Builds the result hash for one card.
      #
      # @param seo_div [#at, #at_css, #[]] the card node
      # @param index [Integer] 0-based position of the card on the page
      # @return [Hash] the parsed result, or {} when parsing fails
      def parse_seo(seo_div, index)
        a = seo_div.at_css('a')
        title = seo_div.at_css('h3') || a
        {
          # Only cards whose class is exactly "g-card r-og-card" are non-VR.
          :is_vr => seo_div['class'] != "g-card r-og-card",
          :rank => index + 1 + (@pagenumber - 1) * 10,
          :href => a['href'],
          :text => title.text.gsub(/[\s\u00A0]/, ""),
          :host => cite_host(seo_div)
        }
      rescue StandardError => e
        # StandardError only: Interrupt/SystemExit must not be swallowed.
        warn "Error in parse_seo method : " + e.message
        {}
      end

      # Host shown for a card: the displayed URL text when present, else the
      # host of the `u` query parameter of a jump link, else "m.haosou.com".
      #
      # @param seo_div [#at]
      # @return [String, nil]
      def cite_host(seo_div)
        shown = seo_div.at('.//*[@class="res-show-url"]/text()')
        return shown.to_s.gsub(/[ \u00A0-]/, '') if shown

        jump = seo_div.at('.//a[contains(@href,"u=")]')
        return "m.haosou.com" unless jump

        # URI.encode / URI.decode were removed in Ruby 3.0; the default
        # parser's escape / unescape are the same routines.
        query = URI(URI::DEFAULT_PARSER.escape(jump['href'])).query
        target = URI::DEFAULT_PARSER.unescape(CGI.parse(query)['u'][0])
        URI(URI::DEFAULT_PARSER.escape(target)).host
      end
    end
  end
end
@@ -0,0 +1,95 @@
module Query
  module Result
    # Parser for an "S" mobile search result page (results live in
    # `div#results`; ad rows carry the class "ali_row result card").
    #
    # Reads @page, @pagenumber and @baseuri; none of them is assigned in this
    # class (presumably they are set by Query::Result -- confirm there).
    class SMobile
      include Query::Result

      # Organic results: every child div of `div#results` whose class is not
      # the ad-row class.
      #
      # @return [Array<Hash>] hashes with :rank, :is_vr, :text, :href and
      #   :host; a row that failed to parse carries only :rank
      def seo_ranks
        @seo_ranks ||= @page.search("//div[@id='results']/div[@class!='ali_row result card']").map.with_index do |seo_div, index|
          parse_seo(seo_div).merge(:rank => rank_for(index))
        end
      end

      # Ad rows that precede the first organic result.
      #
      # @return [Array<Hash>] hashes with :rank, :text, :href and :host
      def ads_top
        selector = "//div[@id='results']/div[@class='result card'][1]/preceding-sibling::div[@class='ali_row result card']"
        @ads_top ||= ads_for(selector)
      end

      # Ad rows that follow the first organic result.
      #
      # @return [Array<Hash>] hashes with :rank, :text, :href and :host
      def ads_bottom
        selector = "//div[@id='results']/div[@class='result card'][1]/following-sibling::div[@class='ali_row result card']"
        @ads_bottom ||= ads_for(selector)
      end

      # Texts of the related-keyword links.
      #
      # @return [Array<String>]
      def related_keywords
        @related_keywords ||= @page.search("//div[@class='rel-keywords card']/ul/li/a").map { |a| a.text }
      end

      # @return [String] the page serialized with to_html
      def html
        @page.to_html
      end

      # This parser reports no right-hand ads.
      #
      # @return [Array] always empty
      def ads_right
        []
      end

      # URL of the next result page: the base URI with a `page` parameter
      # appended.
      #
      # @return [String]
      def next_url
        "#{@baseuri.to_s}&page=#{@pagenumber+1}"
      end

      # Not implemented for this engine.
      #
      # @return [nil]
      def count
      end

      private

      # Absolute rank of the row at +index+ (0-based) on the current page,
      # counting ten rows per page.
      def rank_for(index)
        (index + 1) + (@pagenumber - 1) * 10
      end

      # Parses every ad node matched by +selector+ and attaches its rank.
      def ads_for(selector)
        @page.search(selector).map.with_index do |ad_div, index|
          parse_ad(ad_div).merge(:rank => rank_for(index))
        end
      end

      # Host part of +url+ after stripping whitespace and non-breaking
      # spaces. URI.encode was removed in Ruby 3.0; the default parser's
      # escape is the same routine.
      def host_of(url)
        URI(URI::DEFAULT_PARSER.escape(url.gsub(/[\s\u00A0]/, ""))).host
      end

      # Extracts title, link and host from one ad row.
      #
      # @param ad_div [#at_css, #search] the ad row node
      # @return [Hash] :text, :href and :host, or {} when parsing fails
      def parse_ad(ad_div)
        title_link = ad_div.at_css('a')
        title = title_link.search('./text()|./em|./span')
        url = ad_div.search('.//div[@class="host"]/text()').text
        url = "http://#{url}" if !url[/http:/]
        {
          :text => title.text.gsub(/\s/, ''),
          :href => title_link['href'],
          :host => host_of(url)
        }
      rescue StandardError => e
        # StandardError only: Interrupt/SystemExit must not be swallowed.
        warn "Error in parse_ads method : " + e.message
        {}
      end

      # Extracts title, link, host and the :is_vr flag from one result row.
      # :is_vr is false only for rows whose class is exactly "result card".
      #
      # @param seo_div [#at, #search, #[]] the result row node
      # @return [Hash] :is_vr, :text, :href and :host, or {} when parsing fails
      def parse_seo(seo_div)
        title_link = seo_div.at('.//a[contains(@href,"http://")]')
        href = title_link['href']
        if seo_div['class'] == "result card"
          is_vr = false
          shown = seo_div.search('.//div[@class="host"]/span/text()[matches(.,"\w+.\w+")]', XpathFunctions.new)[0]
          # Convert to a String first so the "http:" check below inspects the
          # displayed text instead of doing a node attribute lookup.
          url = (shown || href).to_s
        else
          is_vr = true
          url = href
        end
        url = "http://#{url}" if !url[/http:/]
        {
          :is_vr => is_vr,
          :text => title_link.text.gsub(/\s/, '')[0..30],
          :href => href,
          :host => host_of(url)
        }
      rescue StandardError => e
        warn "Error in parse_seo method : " + e.message
        {}
      end

      # Handler object passed to `search` so the XPath expression above can
      # call `matches(...)`.
      class XpathFunctions
        # Keeps the nodes whose string form matches +regex+.
        #
        # @param node_set [Enumerable] candidate nodes
        # @param regex [String] regular expression source
        # @return [Array] the matching nodes
        def matches(node_set, regex)
          pattern = Regexp.new(regex.to_s)
          node_set.select { |node| node.to_s[pattern] }
        end
      end
    end
  end
end
@@ -42,11 +42,8 @@ module Query
42
42
  end
43
43
 
44
44
  def count
45
- ["//div[@class='zhanzhang']//em", "//span[@id='scd_num']"].each do |xpath|
46
- if counter_block = @page.search(xpath).first
47
- return counter_block.text.gsub(/\D/,'').to_i
48
- end
49
- end
45
+ node = @page.search("//resnum[@id='scd_num']").first
46
+ node ? node.text.gsub(/\D/,'').to_i : nil
50
47
  end
51
48
 
52
49
  def related_keywords
@@ -1,51 +1,91 @@
require 'cgi'
module Query
  module Result
    # Parser for a Sogou mobile search result page.
    #
    # Reads @page, @pagenumber and @baseuri; none of them is assigned in this
    # class (presumably they are set by Query::Result -- confirm there).
    class SogouMobile
      include Query::Result

      # Ads in the first `ec_ad_results` container.
      #
      # @return [Array<Hash>] hashes with :rank, :text, :href and :host
      def ads_top
        ads_for("//div[@class='results']/div[@class='ec_ad_results'][1]/div[@class='ad_result']")
      end

      # This parser reports no right-hand ads.
      #
      # @return [Array] always empty
      def ads_right
        []
      end

      # Ads in the second `ec_ad_results` container.
      #
      # @return [Array<Hash>] hashes with :rank, :text, :href and :host
      def ads_bottom
        ads_for("//div[@class='results']/div[@class='ec_ad_results'][2]/div[@class='ad_result']")
      end

      # Organic results: child divs of `div.results` whose class is exactly
      # "result" or "vrResult".
      #
      # @return [Array<Hash>] hashes with :rank, :text, :href, :host and
      #   :is_vr; a result that failed to parse carries only :rank
      def seo_ranks
        @seo_rank ||= @page.search("//div[@class='results']/div[@class='result' or @class='vrResult']").map.with_index do |seo_div, index|
          parse_seo_ranks(seo_div).merge(:rank => rank_for(index))
        end
      end

      # URL of the next result page: the base URI with a `p` parameter
      # appended.
      #
      # @return [String]
      def next_url
        "#{@baseuri.to_s}&p=#{@pagenumber+1}"
      end

      # Texts of the related-search links with blanks, newlines and tabs
      # removed.
      #
      # @return [Array<String>]
      def related_keywords
        @related_keywords ||= @page.search("div[@class='hint']/ul/li/a").map { |relwd| strip_blanks(relwd.text) }
      end

      # Not implemented for this engine.
      #
      # @return [nil]
      def count
      end

      # @return [String] the page serialized with to_html
      def html
        @page.to_html
      end

      private

      # Absolute rank of the item at +index+ (0-based) on the current page,
      # counting ten items per page.
      def rank_for(index)
        (@pagenumber - 1) * 10 + index + 1
      end

      # Parses every ad node matched by +selector+ and attaches its rank.
      def ads_for(selector)
        @page.search(selector).map.with_index do |ad_div, index|
          parse_ad(ad_div).merge(:rank => rank_for(index))
        end
      end

      # Removes blanks, newlines and tabs from +text+.
      # NOTE(review): the blank in the original pattern may have been a
      # non-breaking space; both kinds are stripped here.
      def strip_blanks(text)
        text.gsub(/[ \u00A0\n\t]/, "")
      end

      # Extracts title, link and displayed host from one ad node.
      #
      # @param ad_div [#search] the ad container node
      # @return [Hash] :text, :href and :host, or {} when parsing fails
      def parse_ad(ad_div)
        site = ad_div.search(".//span[@class='exp_tip']/preceding-sibling::span")[0] ||
               ad_div.search(".//div[@class='bd_citeurl']/text()")[0]
        {
          :text => strip_blanks(ad_div.search('h3')[0].text),
          :href => ad_div.search('a')[0]['href'],
          :host => site.text.strip.downcase
        }
      rescue StandardError => e
        # StandardError only: Interrupt/SystemExit must not be swallowed.
        warn "Error in parse_ads method : " + e.message
        {}
      end

      # Extracts title, link, host and the :is_vr flag from one organic
      # result node. :is_vr is false only for nodes whose class is exactly
      # "result".
      #
      # @param seo_div [#search, #[]] the result container node
      # @return [Hash] :text, :href, :host and :is_vr, or {} when parsing fails
      def parse_seo_ranks(seo_div)
        a = seo_div.search(".//a[contains(@href,'url=')]")[0]
        link = a['href']
        # URI.encode / URI.decode were removed in Ruby 3.0; the default
        # parser's escape / unescape are the same routines.
        query = URI(URI::DEFAULT_PARSER.escape(link)).query
        cite_url = URI::DEFAULT_PARSER.unescape(CGI.parse(query)['url'][0])

        if cite_url == ""
          # Empty `url` parameter: fall back to the displayed cite text.
          shown = seo_div.search(".//div[@class='citeurl']/text()")[0] || "wap.sogou.com"
          cite_url = "http://#{shown.to_s.gsub(/[ \u00A0-]/, '')}"
        end

        if seo_div['class'] == 'result'
          is_vr = false
          title = a.search("./text()|./em|./span")
        else
          is_vr = true
          title = seo_div.search(".//h3")[0] || a
          # Drop inline scripts so they do not end up in the title text.
          title.css('script').remove
        end
        url = link.include?('wap.sogou.com/web') ? link : "http://wap.sogou.com/web/#{link}"

        {
          :text => strip_blanks(title.text),
          :href => url,
          :host => URI(URI::DEFAULT_PARSER.escape(cite_url)).host,
          :is_vr => is_vr
        }
      rescue StandardError => e
        warn "Error in parse_seo method : " + e.message
        {}
      end
    end
  end
end