query 0.1.25 → 0.1.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,70 +1,128 @@
1
1
  module Query
2
- module Result
3
- class BaiduMobile
4
- include Query::Result
2
+ module Result
3
+ class BaiduMobile
4
+ include Query::Result
5
5
 
6
- def seo_ranks
7
- @seo_ranks ||= @page.search("//*[@class='result']|//*[@class='card-result wa-ue-card-result']|//*[@class='result card-result wma-card-box']").map.with_index do |div,index|
8
- parse_seo(div).merge({:rank => index + 1})
9
- end
10
- end
6
+ def seo_ranks
7
+ s_res = @page.at("//div[@id='results']")
8
+ @seo_ranks ||= s_res.css("div.result").map.with_index do |seo_div,index|
9
+ parse_seo(seo_div).merge({:rank => (index + 1) + (@pagenumber -1) * 10})
10
+ end
11
+ end
11
12
 
12
- def ads_top
13
- @ads_top ||= @page.search("//*[@class='result']/preceding-sibling::div[@class='ec_wise_ad']/div").map.with_index do |div,index|
14
- puts index
15
- parse_ad(div).merge({:rank => index + 1})
16
- end
17
- end
13
+ def ads_top
14
+ selector = "//*[@class='result']/preceding-sibling::div[not (contains(@class,'result'))]/div/div/a[not (contains(@href,'http://baozhang.baidu.com/guarantee'))]/.."
15
+ @ads_top ||= @page.search(selector).map.with_index do |ad_div,index|
16
+ parse_ad(ad_div).merge({:rank => (index + 1) + (@pagenumber -1) * 10})
17
+ end
18
+ end
18
19
 
19
- def ads_right
20
- []
21
- end
20
+ def ads_bottom
21
+ selector = "//*[@class='result']/following-sibling::div[not (contains(@class,'result'))]/div/div/a[not (contains(@href,'http://baozhang.baidu.com/guarantee'))]/.."
22
+ @ads_bottom ||= @page.search(selector).map.with_index do |ad_div,index|
23
+ parse_ad(ad_div).merge({:rank => (index + 1) + (@pagenumber -1) * 10})
24
+ end
25
+ end
22
26
 
23
- def ads_bottom
24
- @ads_bottom ||= @page.search("//*[@class='result']/following-sibling::div[@class='ec_wise_ad']/div/div").map.with_index do |div,index|
25
- parse_ad(div).merge({:rank => index + 1})
26
- end
27
- end
27
+ #酒店预订 酒店英文 酒店团购 酒店管理 酒店招聘 快捷酒店 如家快捷酒店 五星级酒店
28
+ def related_keywords
29
+ @related_keywords ||= @page.search("//div[@id='relativewords']/div[@class='rw-list']/a").map { |a| a.text }
30
+ end
28
31
 
29
- #酒店预订 酒店英文 酒店团购 酒店管理 酒店招聘 快捷酒店 如家快捷酒店 五星级酒店
30
- def related_keywords
31
- @related_keywords ||= @page.search("div[@class='rw-list']/a").map{|a|a.text}
32
- end
32
+ def html
33
+ @page.to_html
34
+ end
33
35
 
34
- def next_url
35
- @next_url ||= @page.xpath('//a[contains(text(),"下一页")]').first['href']
36
- end
36
+ def ads_right
37
+ []
38
+ end
37
39
 
38
- def count
40
+ def next_url
41
+ next_bn = @page.search("//div[@id='pagenav']/a").first
42
+ url = next_bn.nil? ? "/s?#{@baseuri.query}&pn=#{@pagenumber*10}" : next_bn['href']
43
+ url
44
+ end
39
45
 
40
- end
46
+ def count
41
47
 
42
- private
43
- def parse_ad(div)
44
- url = div.search("span[@class='ec_site']").first.text
45
- url = "http://#{url}"
46
- {
47
- :text => div.search('a/text()').text.strip,
48
- :href => div.search('a').first['href'],
49
- :host => Addressable::URI.parse(URI.encode(url)).host
50
- }
51
- end
48
+ end
52
49
 
53
- def parse_seo(div)
54
- a = div.search('a').first
55
- if div['class'] == 'card-result wa-ue-card-result'
56
- host = div.search("*[@class='wa-hotelgeneral-gray wa-hotelgeneral-info-sub-title']").text
57
- elsif div['class'] == 'result card-result wma-card-box' and div['srcid'] == 'map'
58
- host = 'map.baidu.com'
59
- else
60
- host = div.search("*[@class='site']").first.text
61
- end
62
- {
63
- :text => a.text,
64
- :href => a['href'],
65
- :host => host
66
- }
50
+ private
51
+ def parse_ad(ad_div)
52
+ begin
53
+ title_link = ad_div.search('a')[0]
54
+ url = ad_div.search('link')
55
+ if url.empty?
56
+ url = ad_div.search(".//span[contains(text(),'.com')]")[0]
57
+ url = url.nil? ? "http://m.baidu.com" : "http://#{url.text.strip}"
58
+ title = title_link.text
59
+ else
60
+ url = url[0]['href']
61
+ title = title_link.xpath("./text() | ./em").text
62
+ end
63
+ {
64
+ :text => title.gsub(/\n|\s/,''),
65
+ :href => title_link['href'],
66
+ :host => Addressable::URI.parse(URI.encode(url)).host
67
+ }
68
+ rescue Exception => e
69
+ warn "Error in parse_seo method : " + e.message
70
+ {}
71
+ end
72
+ end
73
+
74
+ def parse_seo(seo_div)
75
+ begin
76
+ title_link = seo_div.search('a')[0]
77
+ href = title_link['href']
78
+ href = href[/m.baidu.com/] ? href : "http://m.baidu.com#{href}"
79
+ if seo_div['class']=='result'
80
+ host, is_vr = seo_div.search(".//*[@class='site']")[0], false
81
+ host = host.nil? ? find_host(seo_div) : host.text.split[0]
82
+ elsif seo_div['srcid']=='map'
83
+ is_vr, host = true, 'map.baidu.com'
84
+ elsif seo_div['tpl'] and seo_div['data-log']
85
+ url = JSON.parse(seo_div['data-log'].gsub("'",'"'))['mu']
86
+ if url==''
87
+ host = find_host(seo_div)
88
+ else
89
+ host = Addressable::URI.parse(URI.encode(url)).host
67
90
  end
91
+ is_vr = true
92
+ else
93
+ is_vr, host = true, find_host(seo_div)
94
+ end
95
+ #is_vr = (is_vr.nil? and !host[/baidu|nuomi/]) ? false : true
96
+ {
97
+ :is_vr => false || is_vr,
98
+ :text => title_link.text.gsub(/\n|\s/,'')[0..30],
99
+ :href => href,
100
+ :host => host
101
+ }
102
+ rescue Exception => e
103
+ warn "Error in parse_seo method : " + e.message
104
+ {}
105
+ end
106
+ end
107
+
108
+ def find_host(node)
109
+ host = node.search(".//*[name()!='style' and (contains(text(),'.cn') or contains(text(),'com'))]")[0]
110
+ host.nil? ? 'm.baidu.com' : host.text.split[0]
111
+ end
112
+
113
+ def redirect(url,limit = 10)
114
+ raise ArgumentError, 'Too many HTTP redirects' if limit == 0
115
+ response = Net::HTTP.get_response(URI(url))
116
+ case response
117
+ when Net::HTTPSuccess then
118
+ return URI(url).host
119
+ when Net::HTTPRedirection then
120
+ location = response['location']
121
+ redirect(location, limit-1)
122
+ else
123
+ return "m.baidu.com"
68
124
  end
125
+ end
69
126
  end
70
- end
127
+ end
128
+ end
@@ -1,6 +1,51 @@
1
1
  module Query
2
- module Result
3
- class QihuMobile
2
+ module Result
3
+ class QihuMobile
4
+ include Query::Result
5
+
6
+ def html
7
+ @page.to_html
8
+ end
9
+
10
+ def next_url
11
+ "#{@baseuri.to_s}&pn=#{@pagenumber+1}"
12
+ end
13
+
14
+ def seo_ranks
15
+ @page.css('div.g-card').map.with_index do |seo_div,index|
16
+ begin
17
+ cite = seo_div.at('.//*[@class="res-show-url"]/text()')
18
+ a = seo_div.at_css('a')
19
+ if cite
20
+ cite = cite.to_s.gsub(/ |-/,'')
21
+ else
22
+ url = seo_div.at('.//a[contains(@href,"u=")]')
23
+ if url
24
+ cite = URI.decode(CGI.parse(URI(URI.encode(url['href'])).query)['u'][0])
25
+ cite = URI(URI.encode(cite)).host
26
+ else
27
+ cite = "m.haosou.com"
28
+ end
29
+ end
30
+ title = seo_div.at_css('h3') || a
31
+ {
32
+ :is_vr => seo_div['class']=="g-card r-og-card" ? false : true,
33
+ :rank => index + 1 + (@pagenumber-1)*10,
34
+ :href => a['href'],
35
+ :text => title.text.gsub(/ |\n|\s/,""),
36
+ :host => cite
37
+ }
38
+ rescue Exception => e
39
+ warn "Error in parse_seo method : " + e.message
40
+ {}
41
+ end
4
42
  end
43
+ end
44
+
45
+ def related_keywords
46
+ @related_keywords ||= @page.search("//div[@class='related-search-b']//a").map{|relwd| relwd.text.gsub(/ |\n|\t/,"")}
47
+ end
48
+
5
49
  end
6
- end
50
+ end
51
+ end
@@ -0,0 +1,95 @@
1
+ module Query
2
+ module Result
3
+ class SMobile
4
+ include Query::Result
5
+
6
+ def seo_ranks
7
+ @seo_ranks ||= @page.search("//div[@id='results']/div[@class!='ali_row result card']").map.with_index do |seo_div,index|
8
+ parse_seo(seo_div).merge({:rank => (index + 1) + (@pagenumber -1) * 10})
9
+ end
10
+ end
11
+
12
+ def ads_top
13
+ selector = "//div[@id='results']/div[@class='result card'][1]/preceding-sibling::div[@class='ali_row result card']"
14
+ @ads_top ||= @page.search(selector).map.with_index do |ad_div,index|
15
+ parse_ad(ad_div).merge({:rank => (index + 1) + (@pagenumber -1) * 10})
16
+ end
17
+ end
18
+
19
+ def ads_bottom
20
+ selector = "//div[@id='results']/div[@class='result card'][1]/following-sibling::div[@class='ali_row result card']"
21
+ @ads_bottom ||= @page.search(selector).map.with_index do |ad_div,index|
22
+ parse_ad(ad_div).merge({:rank => (index + 1) + (@pagenumber -1) * 10})
23
+ end
24
+ end
25
+
26
+ #relative words
27
+ def related_keywords
28
+ @related_keywords ||= @page.search("//div[@class='rel-keywords card']/ul/li/a").map { |a| a.text }
29
+ end
30
+
31
+ def html
32
+ @page.to_html
33
+ end
34
+
35
+ def ads_right
36
+ []
37
+ end
38
+
39
+ def next_url
40
+ "#{@baseuri.to_s}&page=#{@pagenumber+1}"
41
+ end
42
+
43
+ def count
44
+
45
+ end
46
+
47
+ private
48
+ def parse_ad(ad_div)
49
+ begin
50
+ title_link = ad_div.at_css('a')
51
+ title = title_link.search('./text()|./em|./span')
52
+ url = ad_div.search('.//div[@class="host"]/text()').text
53
+ url = "http://#{url}" if !url[/http:/]
54
+ {
55
+ :text => title.text.gsub(/\n|\s/,''),
56
+ :href => title_link['href'],
57
+ :host => URI(URI.encode(url.gsub(/ |\n|\t|\s/,""))).host
58
+ }
59
+ rescue Exception => e
60
+ warn "Error in parse_ads method : " + e.message
61
+ {}
62
+ end
63
+ end
64
+
65
+ def parse_seo(seo_div)
66
+ begin
67
+ title_link = seo_div.at('.//a[contains(@href,"http://")]')
68
+ href = title_link['href']
69
+ if seo_div['class']=="result card"
70
+ is_vr = false
71
+ url = seo_div.search('.//div[@class="host"]/span/text()[matches(.,"\w+.\w+")]', XpathFunctions.new)[0] || href
72
+ else
73
+ is_vr, url = true, href
74
+ end
75
+ url = "http://#{url}" if !url[/http:/]
76
+ {
77
+ :is_vr => is_vr,
78
+ :text => title_link.text.gsub(/\n|\s/,'')[0..30],
79
+ :href => href,
80
+ :host => URI(URI.encode(url.gsub(/ |\n|\t|\s/,""))).host # remove &nbsp and whitespace
81
+ }
82
+ rescue Exception => e
83
+ warn "Error in parse_seo method : " + e.message
84
+ {}
85
+ end
86
+ end
87
+
88
+ class XpathFunctions
89
+ def matches node_set, regex
90
+ node_set.find_all {|node| node.to_s[/#{regex}/] }
91
+ end
92
+ end
93
+ end
94
+ end
95
+ end
@@ -42,11 +42,8 @@ module Query
42
42
  end
43
43
 
44
44
  def count
45
- ["//div[@class='zhanzhang']//em", "//span[@id='scd_num']"].each do |xpath|
46
- if counter_block = @page.search(xpath).first
47
- return counter_block.text.gsub(/\D/,'').to_i
48
- end
49
- end
45
+ node = @page.search("//resnum[@id='scd_num']").first
46
+ node ? node.text.gsub(/\D/,'').to_i : nil
50
47
  end
51
48
 
52
49
  def related_keywords
@@ -1,51 +1,91 @@
1
1
  require 'cgi'
2
2
  module Query
3
- module Result
4
- class SogouMobile
5
- include Query::Result
6
- def ads_top
7
- @page.search("//ul[@class='searchresult']/li[1]/preceding-sibling::div").map.with_index do |ad_div,index|
8
- parse_ad(ad_div).merge({:rank => index + 1})
9
- end
10
- end
11
-
12
- def ads_right
13
- []
14
- end
3
+ module Result
4
+ class SogouMobile
5
+ include Query::Result
6
+ def ads_top
7
+ @page.search("//div[@class='results']/div[@class='ec_ad_results'][1]/div[@class='ad_result']").map.with_index do |ad_div,index|
8
+ parse_ad(ad_div).merge({:rank => (@pagenumber-1)*10 + index + 1})
9
+ end
10
+ end
15
11
 
16
- def ads_bottom
17
- @page.search("//ul[@class='searchresult']/li[last()]/following-sibling::div").map.with_index do |div,index|
18
- parse_ad(div).merge({:rank => index + 1})
19
- end
20
- end
12
+ def ads_right
13
+ []
14
+ end
21
15
 
22
- def seo_ranks
23
- @seo_rank ||= @page.search("//ul[@class='searchresult']/li/a").map.with_index do |a,index|
24
- href = URI.decode(CGI.parse(URI(URI.encode(a['href'])).query)['url'].first)
25
- {
26
- :rank => index + 1,
27
- :text => a.search('h3').text,
28
- :href => href,
29
- :host => URI(href).host
30
- }
31
- end
32
- end
16
+ def ads_bottom
17
+ @page.search("//div[@class='results']/div[@class='ec_ad_results'][2]/div[@class='ad_result']").map.with_index do |ad_div,index|
18
+ parse_ad(ad_div).merge({:rank => (@pagenumber-1)*10 + index + 1})
19
+ end
20
+ end
33
21
 
34
- def next_url
35
- @page.search("//a[text()='下一页']").first['href']
36
- end
22
+ def seo_ranks
23
+ @seo_rank ||= @page.search("//div[@class='results']/div[@class='result' or @class='vrResult']").map.with_index do |seo_div,index|
24
+ parse_seo_ranks(seo_div).merge({:rank => (@pagenumber-1)*10 + index + 1})
25
+ end
26
+ end
37
27
 
28
+ def next_url
29
+ "#{@baseuri.to_s}&p=#{@pagenumber+1}"
30
+ end
31
+
32
+ def related_keywords
33
+ @related_keywords ||= @page.search("div[@class='hint']/ul/li/a").map{|relwd| relwd.text.gsub(/ |\n|\t/,"")}
34
+ end
35
+
38
36
  def count
39
37
  end
38
+
39
+ def html
40
+ @page.to_html
41
+ end
40
42
 
41
43
  private
42
44
  def parse_ad(ad_div)
43
- {
44
- :text => ad_div.search('h3').first.text,
45
- :href => ad_div.search('a').first['href'],
46
- :host => Addressable::URI.parse("http://#{ad_div.search('span[@class="site"]').text}").host
47
- }
48
- end
49
- end
50
- end
45
+ begin
46
+ site = ad_div.search(".//span[@class='exp_tip']/preceding-sibling::span")[0] || ad_div.search(".//div[@class='bd_citeurl']/text()")[0]
47
+ {
48
+ :text => ad_div.search('h3')[0].text.gsub(/ |\n|\t/,""),
49
+ :href => ad_div.search('a')[0]['href'],
50
+ :host => site.text.strip.downcase
51
+ }
52
+ rescue Exception => e
53
+ warn "Error in parse_ads method : " + e.message
54
+ {}
55
+ end
56
+ end
57
+
58
+ def parse_seo_ranks(seo_div)
59
+ begin
60
+ a = seo_div.search(".//a[contains(@href,'url=')]")[0]
61
+ cite_url = URI.decode(CGI.parse(URI(URI.encode(a['href'])).query)['url'][0])
62
+
63
+ if cite_url==""
64
+ cite_url = seo_div.search(".//div[@class='citeurl']/text()")[0] || "wap.sogou.com"
65
+ cite_url = "http://#{cite_url.to_s.gsub(/ |-/,'')}"
66
+ end
67
+
68
+ if seo_div['class']=='result'
69
+ is_vr, title = false, a.search("./text()|./em|./span")
70
+ else
71
+ title = seo_div.search(".//h3")[0] || a
72
+ is_vr = true
73
+ title.css('script').remove
74
+ end
75
+ url = a['href'][/wap.sogou.com\/web/].nil? ? "http://wap.sogou.com/web/#{a['href']}" : a['href']
76
+
77
+ {
78
+ :text => title.text.gsub(/ |\n|\t/,""),
79
+ :href => url,
80
+ :host => URI(URI.encode(cite_url)).host,
81
+ :is_vr => is_vr
82
+ }
83
+ rescue Exception => e
84
+ warn "Error in parse_seo method : " + e.message
85
+ {}
86
+ end
87
+ end
88
+
89
+ end
90
+ end
51
91
  end