query 0.0.1 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/.gitignore +4 -1
  3. data/Gemfile +3 -1
  4. data/README.md +6 -1
  5. data/lib/query/engine/baidu.rb +12 -8
  6. data/lib/query/engine/baidu_mobile.rb +4 -4
  7. data/lib/query/engine/{qihoo.rb → qihu.rb} +8 -3
  8. data/lib/query/engine/{qihoo_mobile.rb → qihu_mobile.rb} +0 -0
  9. data/lib/query/engine/sogou.rb +45 -0
  10. data/lib/query/engine/sogou_mobile.rb +21 -0
  11. data/lib/query/engine.rb +11 -4
  12. data/lib/query/result/baidu.rb +57 -91
  13. data/lib/query/result/baidu_mobile.rb +49 -93
  14. data/lib/query/result/qihu.rb +66 -0
  15. data/lib/query/result/{qihoo_mobile.rb → qihu_mobile.rb} +1 -1
  16. data/lib/query/result/sogou.rb +103 -0
  17. data/lib/query/result/sogou_mobile.rb +51 -0
  18. data/lib/query/result.rb +47 -4
  19. data/lib/query/version.rb +1 -1
  20. data/lib/query.rb +6 -8
  21. data/query.gemspec +2 -3
  22. data/spec/baidu1_spec.rb +157 -0
  23. data/spec/baidu2_spec.rb +156 -0
  24. data/spec/mbaidu1_spec.rb +167 -0
  25. data/spec/msogou_spec.rb +91 -0
  26. data/spec/qihu_spec.rb +87 -0
  27. data/spec/samples/baidu1.html +521 -0
  28. data/spec/samples/baidu2.html +662 -0
  29. data/spec/samples/mbaidu1.html +2 -0
  30. data/spec/samples/mbaidu2.html +2 -0
  31. data/spec/samples/msogou.html +474 -0
  32. data/spec/samples/qihu.html +506 -0
  33. data/spec/samples/sogou.html +629 -0
  34. data/spec/sogou_mobile_spec.rb +86 -0
  35. data/spec/sogou_spec.rb +107 -0
  36. data/spec/spec_helper.rb +12 -1
  37. metadata +56 -31
  38. data/lib/query/engine/base.rb +0 -16
  39. data/lib/query/result/base.rb +0 -50
  40. data/lib/query/result/qihoo.rb +0 -75
  41. data/spec/baidu_mobile_spec.rb +0 -19
  42. data/spec/baidu_spec.rb +0 -73
  43. data/spec/qihoo_spec.rb +0 -27
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e414f7561d351c15835fb94956919de3d9d6ac62
4
- data.tar.gz: 1e75cc2eb2d552b779d7cc33865248cb73d00d64
3
+ metadata.gz: e5043b0180a473ab3d213136dfe9ab55ccb4a6d9
4
+ data.tar.gz: dc0b8b1ee15dc3f4437439712904de92b838520e
5
5
  SHA512:
6
- metadata.gz: e07b5ab0f23e5775945fa66cd55ae0e822888da4d5aa65968ee5a0cf91d219a3126b2af7836e23cde660b43fccca4570cc33be142d08b83092388de975a84339
7
- data.tar.gz: baa4d81223b5911ca159735bdff34ba6b4646ebeefe83112463c733373697ee0823b3de5e57513e97105c55f663bce58d87e8530f48c5d2a8718e50f80785974
6
+ metadata.gz: baa8ca09bc7bfd19f1eb3e9dffa24cf0cc28aba704d8671769bc26a79c68f10c01d57df4f76d4e6558ea638fd6c211111ca21d1883675450971256fc8369bc2a
7
+ data.tar.gz: 804e6685b6d7d49e563318a9150eae1a655c52cb6f6ca9084c7fbb0c908fde92a44025761668abe2b7aa4b21ba31fbb0c9ebcbc24a6d3c61fd2415e5344b9fd4
data/.gitignore CHANGED
@@ -20,4 +20,7 @@ tmp
20
20
  # YARD artifacts
21
21
  .yardoc
22
22
  _yardoc
23
- doc/
23
+ doc/
24
+ lib/query/.DS_Store
25
+
26
+ lib/query/.DS_Store
data/Gemfile CHANGED
@@ -1,4 +1,6 @@
1
- source 'https://rubygems.org'
2
1
 
2
+ source 'http://ruby.taobao.org'
3
+ gem 'addressable'
4
+ gem 'httparty'
3
5
  # Specify your gem's dependencies in query.gemspec
4
6
  gemspec
data/README.md CHANGED
@@ -3,22 +3,27 @@ Query
3
3
 
4
4
 
5
5
  #to get the result list by querying "abc"
6
+
6
7
  Query::Engine::Baidu.new.query("abc").ranks().each do |id,value|
7
8
  puts id,value
8
9
  end
9
10
 
10
11
  #to get the result list with host "www.abc.com.cn" by querying "abc"
12
+
11
13
  Query::Engine::Baidu.new.query("abc").ranks("www.abc.com.cn").each do |id,value|
12
14
  puts id,value
13
15
  end
14
16
 
15
17
  #to get the result list with host which fit the regex /com.cn/ by querying "abc"
18
+
16
19
  Query::Engine::Baidu.new.query("abc").ranks(/com.cn/).each do |id,value|
17
20
  puts id,value
18
21
  end
19
22
 
20
23
  # to get the top rank of host "www.abc.com.cn" by querying "abc"
24
+
21
25
  Query::Engine::Baidu.new.query("abc").rank("www.abc.com.cn")
22
26
 
23
27
  TODO:
24
- 查询结果不多,翻页不存在时的处理,及rspec
28
+ 查询结果不多,翻页不存在时的处理,及rspec
29
+ 增加其他搜索引擎
@@ -1,7 +1,11 @@
1
1
  module Query
2
2
  module Engine
3
- class Baidu < Base
3
+ class Baidu
4
+ include Query::Engine
4
5
  BaseUri = 'http://www.baidu.com/s?'
6
+ Options = {
7
+ :headers => {"User-Agent" => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11'}
8
+ }
5
9
  def self.suggestions(wd)
6
10
  require 'json'
7
11
  json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").force_encoding('GBK').encode("UTF-8")
@@ -37,25 +41,25 @@ module Query
37
41
  return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").include?"boxFlash"
38
42
  end
39
43
 
40
- def query(wd)
44
+ def self.query(wd)
41
45
  q = Array.new
42
46
  q << "wd=#{wd}"
43
47
  q << "rn=#{@perpage.to_i}" if @perpage
44
48
  queryStr = q.join("&")
45
49
  #uri = URI.encode((BaseUri + queryStr).encode('GBK'))
46
50
  uri = URI.encode((BaseUri + queryStr))
47
- begin
51
+ # begin
48
52
  # @page = @a.get uri
49
- @page = HTTParty.get uri
53
+ @page = HTTParty.get(uri,Options)
50
54
  r = Query::Result::Baidu.new(@page)
51
55
  r.baseuri = uri
52
56
  r.pagenumber = 1
53
57
  r.perpage = @perpage
54
58
  r
55
- rescue Exception => e
56
- warn e.to_s
57
- return false
58
- end
59
+ # rescue Exception => e
60
+ # warn e.to_s
61
+ # return false
62
+ # end
59
63
  =begin
60
64
  query = "#{query}"
61
65
  @uri = BaseUri+URI.encode(query.encode('GBK'))
@@ -1,11 +1,11 @@
1
1
  module Query
2
2
  module Engine
3
- class BaiduMobile < Base
3
+ class BaiduMobile
4
+ include Query::Engine
4
5
  BaseUri = 'http://m.baidu.com/s?'
5
- headers = {
6
- "User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
6
+ Options = {
7
+ :headers => {"User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'}
7
8
  }
8
- Options = {:headers => headers}
9
9
 
10
10
  #基本查询,相当于从搜索框直接输入关键词查询
11
11
  def query(wd)
@@ -1,15 +1,20 @@
1
1
  module Query
2
2
  module Engine
3
- class Qihoo < Base
3
+ class Qihu
4
+ include Query::Engine
4
5
  Host = 'www.so.com'
6
+ headers = {
7
+ "User-Agent" => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11'
8
+ }
9
+ Options = {:headers => headers}
5
10
  #基本查询, 相当于在搜索框直接数据关键词查询
6
11
  def query(wd)
7
12
  #用原始路径请求
8
13
  uri = URI.join("http://#{Host}/",URI.encode('s?q='+wd)).to_s
9
- page = HTTParty.get(uri)
14
+ page = HTTParty.get(uri,Options)
10
15
  #如果请求地址被跳转,重新获取当前页的URI,可避免翻页错误
11
16
  uri = URI.join("http://#{Host}/",page.request.path).to_s
12
- r = Query::Result::Qihoo.new(page)
17
+ r = Query::Result::Qihu.new(page)
13
18
  r.baseuri = uri
14
19
  r
15
20
  end
@@ -0,0 +1,45 @@
1
+ module Query
2
+ module Engine
3
+ class Sogou
4
+ include Query::Engine
5
+ BaseUri = 'http://www.sogou.com/web?'
6
+ Options = {
7
+ :headers => {"User-Agent" => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11'}
8
+ }
9
+ class << self
10
+ def query(wd)
11
+ q = []
12
+ q << "query=#{wd}"
13
+ uri = URI.encode BaseUri+q.join('&')
14
+ page = HTTParty.get(uri,Options)
15
+ r = Query::Result::Sogou.new(page)
16
+ r.baseuri = uri
17
+ r.perpage = @perpage
18
+ r.pagenumber = 1
19
+ r
20
+ end
21
+
22
+ def suggestions(word)
23
+ suggestions = HTTParty.get "http://w.sugg.sogou.com/sugg/ajaj_json.jsp?key=#{URI.encode(word)}"
24
+ suggestions = suggestions.encode('utf-8').scan /#{word}[^"]+/
25
+ suggestions
26
+ end
27
+
28
+ #site:xxx.yyy.com
29
+ def pages(host)
30
+ query("site:#{host}")
31
+ end
32
+
33
+ #domain:xxx.yyy.com/path/file.html
34
+ def links(uri)
35
+ query("domain:\"#{uri}\"")
36
+ end
37
+
38
+ #site:xxx.yyy.com inurl:zzz
39
+ # def pages_with(host,string)
40
+ # query("site:#{host} inurl:#{string}")
41
+ # end
42
+ end
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,21 @@
1
+ module Query
2
+ module Engine
3
+ class SogouMobile
4
+ include Query::Engine
5
+ BaseUri = 'http://wap.sogou.com/web/searchList.jsp'
6
+ Options = {
7
+ :headers => {"User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'}
8
+ }
9
+ class << self
10
+ def query(wd)
11
+ queryStr = "keyword=#{wd}"
12
+ uri = URI.encode(BaseUri + "?" + queryStr)
13
+ res = HTTParty.get(uri,Options)
14
+ r = Query::Result::SogouMobile.new(res)
15
+ r.baseuri = uri
16
+ r
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
data/lib/query/engine.rb CHANGED
@@ -1,10 +1,17 @@
1
1
  module Query
2
2
  module Engine
3
+ attr_accessor :perpage
4
+ def self.indexed?(url)
5
+ URI(url)
6
+ result = query(url)
7
+ return result.has_result?
8
+ end
3
9
  end
4
10
  end
5
-
6
- require 'query/engine/base'
11
+ require 'httparty'
7
12
  require 'query/engine/baidu'
8
13
  require 'query/engine/baidu_mobile'
9
- require 'query/engine/qihoo'
10
- require 'query/engine/qihoo_mobile'
14
+ require 'query/engine/qihu'
15
+ require 'query/engine/qihu_mobile'
16
+ require 'query/engine/sogou'
17
+ require 'query/engine/sogou_mobile'
@@ -1,94 +1,40 @@
1
1
  module Query
2
2
  module Result
3
- class Baidu < Base
4
- def ranks
3
+ class Baidu
4
+ include Query::Result
5
+ def seo_ranks
5
6
  return @ranks unless @ranks.nil?
6
- @ranks = Hash.new
7
- @page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
8
- id = table['id']
9
- # if @perpage == 10
10
- # id = table['id'][-1,1]
11
- # id = '10' if id == '0'
12
- # end
13
-
14
- @ranks[id] = Hash.new
15
- url = table.search("[@class=\"g\"]").first
16
- url = url.text unless url.nil?
17
- a = table.search("h3").first
18
- next if a.nil?
19
- @ranks[id]['text'] = a.text
20
- @ranks[id]['href'] = url #a.first['href'].sub('http://www.baidu.com/link?url=','').strip
21
- unless url.nil?
22
- url = url.strip
23
- @ranks[id]['host'] = Addressable::URI.parse(URI.encode("http://#{url}")).host
24
- else
25
- @ranks[id]['host'] = nil
26
- end
7
+ @page.search("//*[@class='result']|//*[@class='result-op']|//*[@class='result-op c-container']").map.with_index do |table,index|
8
+ parse_seo(table).merge({:rank => index + 1})
27
9
  end
28
- #@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
29
- @ranks
30
10
  end
31
11
 
32
- def ads_bottom
33
- return {} if @page.search("//table[@bgcolor='f5f5f5']").empty?
34
- return ads_top
35
- # p @page.search("//table[@bgcolor='f5f5f5']").empty?
36
- end
37
12
  def ads_top
38
- #灰色底推广,上下都有
39
- ads = Hash.new
40
- @page.search("//table[@bgcolor='#f5f5f5']").each do |table|
41
- id = table['id']
42
- next if id.nil?
43
- id = id[2,3].to_i.to_s
44
- ads[id]= parse_ad(table)
13
+ @page.search("//*[@class='result']/preceding-sibling::*[contains(@class,'EC_result')]").map.with_index do |div, index|
14
+ parse_ad(div).merge(:rank => index + 1)
45
15
  end
46
- #白色底推广,只有上部分
47
- if ads.empty?
48
- @page.search("//table").each do |table|
49
- id = table['id']
50
- next if id.nil? or id.to_i<3000
51
- id = id[2,3].to_i.to_s
52
- ads[id]= parse_ad(table)
53
- end
54
- end
55
- ads
56
16
  end
57
- def parse_ad(table)
58
- href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
59
- title = table.search("a").first.text.strip
60
- {'title'=>title,'href' => href,'host'=>href}
17
+
18
+ def ads_bottom
19
+ @page.search("//*[@class='result']/following-sibling::*[contains(@class,'EC_result')]").map.with_index do |div,index|
20
+ parse_ad(div).merge(:rank => index + 1)
21
+ end
61
22
  end
23
+
62
24
  def ads_right
63
- ads = {}
64
- @page.search("//div[@id='ec_im_container']").each do |table|
65
- table.search("div[@id]").each do |div|
66
- id = div['id'][-1,1].to_i+1
67
- title = div.search("a").first
68
- next if title.nil?
69
- title = title.text
70
- url = div.search("font[@color='#008000']").first
71
- next if url.nil?
72
- url = url.text
73
- ads[id.to_s] = {'title'=>title,'href'=>url,'host'=>url}
74
- end
25
+ @page.search("//div[@id='ec_im_container']/div[@id]").map.with_index do |div,index|
26
+ a = div.search('a').first
27
+ url = div.search("*[@class='EC_url']").first.text
28
+ url = "http://#{url}"
29
+ {
30
+ :rank => index + 1,
31
+ :text => a.text.strip,
32
+ :href => a['href'].strip,
33
+ :host => Addressable::URI.parse(URI.encode(url)).host
34
+ }
75
35
  end
76
- ads
77
36
  end
78
37
 
79
- #return the top rank number from @ranks with the input host
80
- # def rank(host)#on base of ranks
81
- # ranks.each do |id,line|
82
- # id = id.to_i
83
- # if host.class == Regexp
84
- # return id if line['host'] =~ host
85
- # elsif host.class == String
86
- # return id if line['host'] == host
87
- # end
88
- # end
89
- # return nil
90
- # end
91
-
92
38
  def count
93
39
  @count ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
94
40
  end
@@ -97,25 +43,45 @@ module Query
97
43
  @related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
98
44
  end
99
45
 
100
- def next
101
- url = @page.xpath('//a[text()="下一页>"]').first
102
- return if url.nil?
103
- url = url['href']
104
- url = URI.join(@baseuri,url).to_s
105
- page = HTTParty.get(url)
106
- r = Query::Result::Baidu.new(page)
107
- r.baseuri = url
108
- r.pagenumber=@pagenumber+1
109
- r.perpage=@perpage
110
- r
111
-
112
- # @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
113
- end
114
46
  def has_result?
115
47
  submit = @page.search('//a[text()="提交网址"]').first
116
48
  return false if submit and submit['href'].include?'sitesubmit'
117
49
  return true
118
50
  end
51
+
52
+ def next_url
53
+ @page.search("//a[text()='下一页>']").first['href']
54
+ end
55
+
56
+ private
57
+ def parse_ad(div)
58
+ #@todo should be :
59
+ #title = div.xpath("*[contains(@class,'ec_title')]",MyFilter.new).first
60
+ title = div.xpath("//*[contains(@class,'ec_title')]",MyFilter.new).first
61
+ url = %w( span[@class='ec_url'] a[@class='EC_url'] ).map do |xpath|
62
+ node = div.search(xpath).first
63
+ node.text if node
64
+ end.compact.first
65
+ url = "http://" + url
66
+ {
67
+ :text => title.text,
68
+ :href => title['href'],
69
+ :host => Addressable::URI.parse(URI.encode(url)).host
70
+ }
71
+ end
72
+
73
+ def parse_seo(table)
74
+ url = %w( span[@class="g"] span[@class="c-showurl"] div[@class="op_zhidao_showurl"]).map do |xpath|
75
+ span = table.search(xpath).first
76
+ span.text.sub(/\d{4}-\d{1,2}-\d{1,2}/,'').strip if span
77
+ end.compact.first
78
+ host = Addressable::URI.parse(URI.encode("http://#{url}")).host
79
+ {
80
+ :text => table.search("h3").first.text.strip,
81
+ :href => table.search('a').first['href'].strip,
82
+ :host => host
83
+ }
84
+ end
119
85
  end
120
86
  end
121
87
  end
@@ -1,113 +1,69 @@
1
1
  module Query
2
2
  module Result
3
- class BaiduMobile < Base
4
- #返回当前页所有查询结果
5
- def ranks
6
- #如果已经赋值说明解析过,不需要重新解析,直接返回结果
7
- return @ranks unless @ranks.nil?
8
- @ranks = Hash.new
9
- @page.xpath('//div[@class="result"]').each do |result|
10
- href,text,host,is_mobile = '','','',false
11
- a = result.search("a").first
12
- is_mobile = true unless a.search("img").empty?
13
- host = result.search('[@class="site"]').first
14
- next if host.nil?
15
- host = host.text
16
- href = a['href']
17
- text = a.text
18
- id = href.scan(/&order=(\d+)&/)
19
- if id.empty?
20
- id = nil
21
- else
22
- id = id.first.first.to_i
23
- # id = (@pagenumber-1)*10+id
24
- end
25
- =begin
26
- result.children.each do |elem|
27
- if elem.name == 'a'
28
- href = elem['href']
29
- id = elem.text.match(/^\d+/).to_s.to_i
30
- text = elem.text.sub(/^\d+/,'')
31
- text.sub!(/^\u00A0/,'')
32
- elsif elem['class'] == 'abs'
33
- elem.children.each do |elem2|
34
- if elem2['class'] == 'site'
35
- host = elem2.text
36
- break
37
- end
38
- end
39
- elsif elem['class'] == 'site'
40
- host == elem['href']
41
- end
42
- end
43
- =end
3
+ class BaiduMobile
4
+ include Query::Result
44
5
 
45
- @ranks[id.to_s] = {'href'=>href,'text'=>text,'is_mobile'=>is_mobile,'host'=>host.sub(/\u00A0/,'')}
6
+ def seo_ranks
7
+ @seo_ranks ||= @page.search("//*[@class='result']|//*[@class='card-result wa-ue-card-result']|//*[@class='result card-result wma-card-box']").map.with_index do |div,index|
8
+ parse_seo(div).merge({:rank => index + 1})
46
9
  end
47
- @ranks
48
10
  end
11
+
49
12
  def ads_top
50
- id = 0
51
- result = []
52
- @page.search("div[@class='ec_wise_ad']/div").each do |div|
53
- id += 1
54
- href = div.search("span[@class='ec_site']").first.text
55
- href = "http://#{href}"
56
- title = div.search("a/text()").text.strip
57
- host = Addressable::URI.parse(URI.encode(href)).host
58
- result[id] = {'title'=>title,'href'=>href,'host'=>host}
13
+ @ads_top ||= @page.search("//*[@class='result']/preceding-sibling::div[@class='ec_wise_ad']/div").map.with_index do |div,index|
14
+ puts index
15
+ parse_ad(div).merge({:rank => index + 1})
59
16
  end
60
- result
61
17
  end
18
+
62
19
  def ads_right
63
20
  []
64
21
  end
22
+
65
23
  def ads_bottom
66
- []
24
+ @ads_bottom ||= @page.search("//*[@class='result']/following-sibling::div[@class='ec_wise_ad']/div/div").map.with_index do |div,index|
25
+ parse_ad(div).merge({:rank => index + 1})
26
+ end
67
27
  end
28
+
29
+ #酒店预订 酒店英文 酒店团购 酒店管理 酒店招聘 快捷酒店 如家快捷酒店 五星级酒店
68
30
  def related_keywords
69
- @related_keywords ||= @page.search("div[@class='relativewords_info']/a").map{|a|a.text}
31
+ @related_keywords ||= @page.search("div[@class='rw-list']/a").map{|a|a.text}
70
32
  end
71
- =begin
72
- #返回当前页中,符合host条件的结果
73
- def ranks_for(specific_host)
74
- host_ranks = Hash.new
75
- ranks.each do |id,line|
76
- if specific_host.class == Regexp
77
- host_ranks[id] = line if line['host'] =~ specific_host
78
- elsif specific_host.class == String
79
- host_ranks[id] = line if line['host'] == specific_host
80
- end
81
- end
82
- host_ranks
33
+
34
+ def next_url
35
+ @next_url ||= @page.xpath('//a[contains(text(),"下一页")]').first['href']
83
36
  end
84
- #return the top rank number from @ranks with the input host
85
- def rank(host)#on base of ranks
86
- ranks.each do |id,line|
87
- id = id.to_i
88
- if host.class == Regexp
89
- return id if line['host'] =~ host
90
- elsif host.class == String
91
- return id if line['host'] == host
92
- end
93
- end
94
- return nil
37
+
38
+ def count
39
+
95
40
  end
96
- =end
97
- #下一页
98
- def next
99
- nextbutton = @page.xpath('//a[text()="下一页"]').first
100
- return nil if nextbutton.nil?
101
- url = URI.encode nextbutton['href']
102
- # puts url
103
- # p @baseuri
104
- # exit
105
- url = URI.join(@baseuri,url).to_s
106
- page = HTTParty.get(url)
107
- r = Query::Result::BaiduMobile.new(page)
108
- r.baseuri=url
109
- r.pagenumber=@pagenumber+1
110
- r
41
+
42
+ private
43
+ def parse_ad(div)
44
+ url = div.search("span[@class='ec_site']").first.text
45
+ url = "http://#{url}"
46
+ {
47
+ :text => div.search('a/text()').text.strip,
48
+ :href => div.search('a').first['href'],
49
+ :host => Addressable::URI.parse(URI.encode(url)).host
50
+ }
51
+ end
52
+
53
+ def parse_seo(div)
54
+ a = div.search('a').first
55
+ if div['class'] == 'card-result wa-ue-card-result'
56
+ host = div.search("*[@class='wa-hotelgeneral-gray wa-hotelgeneral-info-sub-title']").text
57
+ elsif div['class'] == 'result card-result wma-card-box' and div['srcid'] == 'map'
58
+ host = 'map.baidu.com'
59
+ else
60
+ host = div.search("*[@class='site']").first.text
61
+ end
62
+ {
63
+ :text => a.text,
64
+ :href => a['href'],
65
+ :host => host
66
+ }
111
67
  end
112
68
  end
113
69
  end
@@ -0,0 +1,66 @@
1
+ module Query
2
+ module Result
3
+ class Qihu
4
+ include Query::Result
5
+ def seo_ranks
6
+ @page.search('//ul[@id="m-result"]/li//h3').map.with_index do |h3,index|
7
+ a = h3.search('a').first
8
+ {
9
+ :rank => index + 1,
10
+ :href => a['href'],
11
+ :text => a.text.strip,
12
+ :host => Addressable::URI.parse(a['href']).host
13
+ }
14
+ end
15
+ end
16
+
17
+ def ads_top
18
+ @page.search("//ul[@id='djbox']/li").map.with_index do |li,index|
19
+ a = li.search("a").first
20
+ href = CGI.parse(URI(a['_cs']).query)['aurl'].first
21
+ {
22
+ :rank => index + 1,
23
+ :text => a.text,
24
+ :href => href,
25
+ :host => Addressable::URI.parse(URI.encode(href)).host
26
+ }
27
+ end
28
+ end
29
+
30
+ def ads_bottom
31
+ []
32
+ end
33
+
34
+ def ads_right
35
+ @page.search("//ul[@id='rightbox']/li").map.with_index do |li,index|
36
+ a = li.search('a').first
37
+ href = CGI.parse(URI(a['_cs']).query)['aurl'].first
38
+ host = Addressable::URI.parse(URI.encode(href)).host
39
+ {
40
+ :rank => index + 1,
41
+ :text => a.text,
42
+ :href => href,
43
+ :host => host
44
+ }
45
+ end
46
+ end
47
+
48
+ def related_keywords
49
+ []
50
+ end
51
+
52
+ def count
53
+ @page.search('//span[@class="nums"]').first.text.gsub(/\D/,'').to_i
54
+ end
55
+
56
+ #下一页
57
+ def next_url
58
+ next_href = @page.xpath('//a[@id="snext"]').first['href']
59
+ end
60
+ #有结果
61
+ def has_result?
62
+ !@page.search('//div[@id="main"]/h3').text().include?'没有找到该URL'
63
+ end
64
+ end
65
+ end
66
+ end
@@ -1,6 +1,6 @@
1
1
  module Query
2
2
  module Result
3
- class QihooMobile
3
+ class QihuMobile
4
4
  end
5
5
  end
6
6
  end