query 0.0.1 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +4 -1
  3. data/Gemfile +3 -1
  4. data/README.md +6 -1
  5. data/lib/query/engine/baidu.rb +12 -8
  6. data/lib/query/engine/baidu_mobile.rb +4 -4
  7. data/lib/query/engine/{qihoo.rb → qihu.rb} +8 -3
  8. data/lib/query/engine/{qihoo_mobile.rb → qihu_mobile.rb} +0 -0
  9. data/lib/query/engine/sogou.rb +45 -0
  10. data/lib/query/engine/sogou_mobile.rb +21 -0
  11. data/lib/query/engine.rb +11 -4
  12. data/lib/query/result/baidu.rb +57 -91
  13. data/lib/query/result/baidu_mobile.rb +49 -93
  14. data/lib/query/result/qihu.rb +66 -0
  15. data/lib/query/result/{qihoo_mobile.rb → qihu_mobile.rb} +1 -1
  16. data/lib/query/result/sogou.rb +103 -0
  17. data/lib/query/result/sogou_mobile.rb +51 -0
  18. data/lib/query/result.rb +47 -4
  19. data/lib/query/version.rb +1 -1
  20. data/lib/query.rb +6 -8
  21. data/query.gemspec +2 -3
  22. data/spec/baidu1_spec.rb +157 -0
  23. data/spec/baidu2_spec.rb +156 -0
  24. data/spec/mbaidu1_spec.rb +167 -0
  25. data/spec/msogou_spec.rb +91 -0
  26. data/spec/qihu_spec.rb +87 -0
  27. data/spec/samples/baidu1.html +521 -0
  28. data/spec/samples/baidu2.html +662 -0
  29. data/spec/samples/mbaidu1.html +2 -0
  30. data/spec/samples/mbaidu2.html +2 -0
  31. data/spec/samples/msogou.html +474 -0
  32. data/spec/samples/qihu.html +506 -0
  33. data/spec/samples/sogou.html +629 -0
  34. data/spec/sogou_mobile_spec.rb +86 -0
  35. data/spec/sogou_spec.rb +107 -0
  36. data/spec/spec_helper.rb +12 -1
  37. metadata +56 -31
  38. data/lib/query/engine/base.rb +0 -16
  39. data/lib/query/result/base.rb +0 -50
  40. data/lib/query/result/qihoo.rb +0 -75
  41. data/spec/baidu_mobile_spec.rb +0 -19
  42. data/spec/baidu_spec.rb +0 -73
  43. data/spec/qihoo_spec.rb +0 -27
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e414f7561d351c15835fb94956919de3d9d6ac62
4
- data.tar.gz: 1e75cc2eb2d552b779d7cc33865248cb73d00d64
3
+ metadata.gz: e5043b0180a473ab3d213136dfe9ab55ccb4a6d9
4
+ data.tar.gz: dc0b8b1ee15dc3f4437439712904de92b838520e
5
5
  SHA512:
6
- metadata.gz: e07b5ab0f23e5775945fa66cd55ae0e822888da4d5aa65968ee5a0cf91d219a3126b2af7836e23cde660b43fccca4570cc33be142d08b83092388de975a84339
7
- data.tar.gz: baa4d81223b5911ca159735bdff34ba6b4646ebeefe83112463c733373697ee0823b3de5e57513e97105c55f663bce58d87e8530f48c5d2a8718e50f80785974
6
+ metadata.gz: baa8ca09bc7bfd19f1eb3e9dffa24cf0cc28aba704d8671769bc26a79c68f10c01d57df4f76d4e6558ea638fd6c211111ca21d1883675450971256fc8369bc2a
7
+ data.tar.gz: 804e6685b6d7d49e563318a9150eae1a655c52cb6f6ca9084c7fbb0c908fde92a44025761668abe2b7aa4b21ba31fbb0c9ebcbc24a6d3c61fd2415e5344b9fd4
data/.gitignore CHANGED
@@ -20,4 +20,7 @@ tmp
20
20
  # YARD artifacts
21
21
  .yardoc
22
22
  _yardoc
23
- doc/
23
+ doc/
24
+ lib/query/.DS_Store
25
+
26
+ lib/query/.DS_Store
data/Gemfile CHANGED
@@ -1,4 +1,6 @@
1
- source 'https://rubygems.org'
2
1
 
2
+ source 'http://ruby.taobao.org'
3
+ gem 'addressable'
4
+ gem 'httparty'
3
5
  # Specify your gem's dependencies in query.gemspec
4
6
  gemspec
data/README.md CHANGED
@@ -3,22 +3,27 @@ Query
3
3
 
4
4
 
5
5
  #to get the result list by querying "abc"
6
+
6
7
  Query::Engine::Baidu.new.query("abc").ranks().each do |id,value|
7
8
  puts id,value
8
9
  end
9
10
 
10
11
  #to get the result list with host "www.abc.com.cn" by querying "abc"
12
+
11
13
  Query::Engine::Baidu.new.query("abc").ranks("www.abc.com.cn").each do |id,value|
12
14
  puts id,value
13
15
  end
14
16
 
15
17
  #to get the result list with host which fit the regex /com.cn/ by querying "abc"
18
+
16
19
  Query::Engine::Baidu.new.query("abc").ranks(/com.cn/).each do |id,value|
17
20
  puts id,value
18
21
  end
19
22
 
20
23
  # to get the top rank of host "www.abc.com.cn" by querying "abc"
24
+
21
25
  Query::Engine::Baidu.new.query("abc").rank("www.abc.com.cn")
22
26
 
23
27
  TODO:
24
- 查询结果不多,翻页不存在时的处理,及rspec
28
+ 查询结果不多,翻页不存在时的处理,及rspec
29
+ 增加其他搜索引擎
@@ -1,7 +1,11 @@
1
1
  module Query
2
2
  module Engine
3
- class Baidu < Base
3
+ class Baidu
4
+ include Query::Engine
4
5
  BaseUri = 'http://www.baidu.com/s?'
6
+ Options = {
7
+ :headers => {"User-Agent" => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11'}
8
+ }
5
9
  def self.suggestions(wd)
6
10
  require 'json'
7
11
  json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").force_encoding('GBK').encode("UTF-8")
@@ -37,25 +41,25 @@ module Query
37
41
  return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").include?"boxFlash"
38
42
  end
39
43
 
40
- def query(wd)
44
+ def self.query(wd)
41
45
  q = Array.new
42
46
  q << "wd=#{wd}"
43
47
  q << "rn=#{@perpage.to_i}" if @perpage
44
48
  queryStr = q.join("&")
45
49
  #uri = URI.encode((BaseUri + queryStr).encode('GBK'))
46
50
  uri = URI.encode((BaseUri + queryStr))
47
- begin
51
+ # begin
48
52
  # @page = @a.get uri
49
- @page = HTTParty.get uri
53
+ @page = HTTParty.get(uri,Options)
50
54
  r = Query::Result::Baidu.new(@page)
51
55
  r.baseuri = uri
52
56
  r.pagenumber = 1
53
57
  r.perpage = @perpage
54
58
  r
55
- rescue Exception => e
56
- warn e.to_s
57
- return false
58
- end
59
+ # rescue Exception => e
60
+ # warn e.to_s
61
+ # return false
62
+ # end
59
63
  =begin
60
64
  query = "#{query}"
61
65
  @uri = BaseUri+URI.encode(query.encode('GBK'))
@@ -1,11 +1,11 @@
1
1
  module Query
2
2
  module Engine
3
- class BaiduMobile < Base
3
+ class BaiduMobile
4
+ include Query::Engine
4
5
  BaseUri = 'http://m.baidu.com/s?'
5
- headers = {
6
- "User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
6
+ Options = {
7
+ :headers => {"User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'}
7
8
  }
8
- Options = {:headers => headers}
9
9
 
10
10
  #基本查询,相当于从搜索框直接输入关键词查询
11
11
  def query(wd)
@@ -1,15 +1,20 @@
1
1
  module Query
2
2
  module Engine
3
- class Qihoo < Base
3
+ class Qihu
4
+ include Query::Engine
4
5
  Host = 'www.so.com'
6
+ headers = {
7
+ "User-Agent" => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11'
8
+ }
9
+ Options = {:headers => headers}
5
10
  #基本查询, 相当于在搜索框直接数据关键词查询
6
11
  def query(wd)
7
12
  #用原始路径请求
8
13
  uri = URI.join("http://#{Host}/",URI.encode('s?q='+wd)).to_s
9
- page = HTTParty.get(uri)
14
+ page = HTTParty.get(uri,Options)
10
15
  #如果请求地址被跳转,重新获取当前页的URI,可避免翻页错误
11
16
  uri = URI.join("http://#{Host}/",page.request.path).to_s
12
- r = Query::Result::Qihoo.new(page)
17
+ r = Query::Result::Qihu.new(page)
13
18
  r.baseuri = uri
14
19
  r
15
20
  end
@@ -0,0 +1,45 @@
1
+ module Query
2
+ module Engine
3
+ class Sogou
4
+ include Query::Engine
5
+ BaseUri = 'http://www.sogou.com/web?'
6
+ Options = {
7
+ :headers => {"User-Agent" => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.73.11 (KHTML, like Gecko) Version/7.0.1 Safari/537.73.11'}
8
+ }
9
+ class << self
10
+ def query(wd)
11
+ q = []
12
+ q << "query=#{wd}"
13
+ uri = URI.encode BaseUri+q.join('&')
14
+ page = HTTParty.get(uri,Options)
15
+ r = Query::Result::Sogou.new(page)
16
+ r.baseuri = uri
17
+ r.perpage = @perpage
18
+ r.pagenumber = 1
19
+ r
20
+ end
21
+
22
+ def suggestions(word)
23
+ suggestions = HTTParty.get "http://w.sugg.sogou.com/sugg/ajaj_json.jsp?key=#{URI.encode(word)}"
24
+ suggestions = suggestions.encode('utf-8').scan /#{word}[^"]+/
25
+ suggestions
26
+ end
27
+
28
+ #site:xxx.yyy.com
29
+ def pages(host)
30
+ query("site:#{host}")
31
+ end
32
+
33
+ #domain:xxx.yyy.com/path/file.html
34
+ def links(uri)
35
+ query("domain:\"#{uri}\"")
36
+ end
37
+
38
+ #site:xxx.yyy.com inurl:zzz
39
+ # def pages_with(host,string)
40
+ # query("site:#{host} inurl:#{string}")
41
+ # end
42
+ end
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,21 @@
1
+ module Query
2
+ module Engine
3
+ class SogouMobile
4
+ include Query::Engine
5
+ BaseUri = 'http://wap.sogou.com/web/searchList.jsp'
6
+ Options = {
7
+ :headers => {"User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'}
8
+ }
9
+ class << self
10
+ def query(wd)
11
+ queryStr = "keyword=#{wd}"
12
+ uri = URI.encode(BaseUri + "?" + queryStr)
13
+ res = HTTParty.get(uri,Options)
14
+ r = Query::Result::SogouMobile.new(res)
15
+ r.baseuri = uri
16
+ r
17
+ end
18
+ end
19
+ end
20
+ end
21
+ end
data/lib/query/engine.rb CHANGED
@@ -1,10 +1,17 @@
1
1
  module Query
2
2
  module Engine
3
+ attr_accessor :perpage
4
+ def self.indexed?(url)
5
+ URI(url)
6
+ result = query(url)
7
+ return result.has_result?
8
+ end
3
9
  end
4
10
  end
5
-
6
- require 'query/engine/base'
11
+ require 'httparty'
7
12
  require 'query/engine/baidu'
8
13
  require 'query/engine/baidu_mobile'
9
- require 'query/engine/qihoo'
10
- require 'query/engine/qihoo_mobile'
14
+ require 'query/engine/qihu'
15
+ require 'query/engine/qihu_mobile'
16
+ require 'query/engine/sogou'
17
+ require 'query/engine/sogou_mobile'
@@ -1,94 +1,40 @@
1
1
  module Query
2
2
  module Result
3
- class Baidu < Base
4
- def ranks
3
+ class Baidu
4
+ include Query::Result
5
+ def seo_ranks
5
6
  return @ranks unless @ranks.nil?
6
- @ranks = Hash.new
7
- @page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
8
- id = table['id']
9
- # if @perpage == 10
10
- # id = table['id'][-1,1]
11
- # id = '10' if id == '0'
12
- # end
13
-
14
- @ranks[id] = Hash.new
15
- url = table.search("[@class=\"g\"]").first
16
- url = url.text unless url.nil?
17
- a = table.search("h3").first
18
- next if a.nil?
19
- @ranks[id]['text'] = a.text
20
- @ranks[id]['href'] = url #a.first['href'].sub('http://www.baidu.com/link?url=','').strip
21
- unless url.nil?
22
- url = url.strip
23
- @ranks[id]['host'] = Addressable::URI.parse(URI.encode("http://#{url}")).host
24
- else
25
- @ranks[id]['host'] = nil
26
- end
7
+ @page.search("//*[@class='result']|//*[@class='result-op']|//*[@class='result-op c-container']").map.with_index do |table,index|
8
+ parse_seo(table).merge({:rank => index + 1})
27
9
  end
28
- #@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
29
- @ranks
30
10
  end
31
11
 
32
- def ads_bottom
33
- return {} if @page.search("//table[@bgcolor='f5f5f5']").empty?
34
- return ads_top
35
- # p @page.search("//table[@bgcolor='f5f5f5']").empty?
36
- end
37
12
  def ads_top
38
- #灰色底推广,上下都有
39
- ads = Hash.new
40
- @page.search("//table[@bgcolor='#f5f5f5']").each do |table|
41
- id = table['id']
42
- next if id.nil?
43
- id = id[2,3].to_i.to_s
44
- ads[id]= parse_ad(table)
13
+ @page.search("//*[@class='result']/preceding-sibling::*[contains(@class,'EC_result')]").map.with_index do |div, index|
14
+ parse_ad(div).merge(:rank => index + 1)
45
15
  end
46
- #白色底推广,只有上部分
47
- if ads.empty?
48
- @page.search("//table").each do |table|
49
- id = table['id']
50
- next if id.nil? or id.to_i<3000
51
- id = id[2,3].to_i.to_s
52
- ads[id]= parse_ad(table)
53
- end
54
- end
55
- ads
56
16
  end
57
- def parse_ad(table)
58
- href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
59
- title = table.search("a").first.text.strip
60
- {'title'=>title,'href' => href,'host'=>href}
17
+
18
+ def ads_bottom
19
+ @page.search("//*[@class='result']/following-sibling::*[contains(@class,'EC_result')]").map.with_index do |div,index|
20
+ parse_ad(div).merge(:rank => index + 1)
21
+ end
61
22
  end
23
+
62
24
  def ads_right
63
- ads = {}
64
- @page.search("//div[@id='ec_im_container']").each do |table|
65
- table.search("div[@id]").each do |div|
66
- id = div['id'][-1,1].to_i+1
67
- title = div.search("a").first
68
- next if title.nil?
69
- title = title.text
70
- url = div.search("font[@color='#008000']").first
71
- next if url.nil?
72
- url = url.text
73
- ads[id.to_s] = {'title'=>title,'href'=>url,'host'=>url}
74
- end
25
+ @page.search("//div[@id='ec_im_container']/div[@id]").map.with_index do |div,index|
26
+ a = div.search('a').first
27
+ url = div.search("*[@class='EC_url']").first.text
28
+ url = "http://#{url}"
29
+ {
30
+ :rank => index + 1,
31
+ :text => a.text.strip,
32
+ :href => a['href'].strip,
33
+ :host => Addressable::URI.parse(URI.encode(url)).host
34
+ }
75
35
  end
76
- ads
77
36
  end
78
37
 
79
- #return the top rank number from @ranks with the input host
80
- # def rank(host)#on base of ranks
81
- # ranks.each do |id,line|
82
- # id = id.to_i
83
- # if host.class == Regexp
84
- # return id if line['host'] =~ host
85
- # elsif host.class == String
86
- # return id if line['host'] == host
87
- # end
88
- # end
89
- # return nil
90
- # end
91
-
92
38
  def count
93
39
  @count ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
94
40
  end
@@ -97,25 +43,45 @@ module Query
97
43
  @related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
98
44
  end
99
45
 
100
- def next
101
- url = @page.xpath('//a[text()="下一页>"]').first
102
- return if url.nil?
103
- url = url['href']
104
- url = URI.join(@baseuri,url).to_s
105
- page = HTTParty.get(url)
106
- r = Query::Result::Baidu.new(page)
107
- r.baseuri = url
108
- r.pagenumber=@pagenumber+1
109
- r.perpage=@perpage
110
- r
111
-
112
- # @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
113
- end
114
46
  def has_result?
115
47
  submit = @page.search('//a[text()="提交网址"]').first
116
48
  return false if submit and submit['href'].include?'sitesubmit'
117
49
  return true
118
50
  end
51
+
52
+ def next_url
53
+ @page.search("//a[text()='下一页>']").first['href']
54
+ end
55
+
56
+ private
57
+ def parse_ad(div)
58
+ #@todo should be :
59
+ #title = div.xpath("*[contains(@class,'ec_title')]",MyFilter.new).first
60
+ title = div.xpath("//*[contains(@class,'ec_title')]",MyFilter.new).first
61
+ url = %w( span[@class='ec_url'] a[@class='EC_url'] ).map do |xpath|
62
+ node = div.search(xpath).first
63
+ node.text if node
64
+ end.compact.first
65
+ url = "http://" + url
66
+ {
67
+ :text => title.text,
68
+ :href => title['href'],
69
+ :host => Addressable::URI.parse(URI.encode(url)).host
70
+ }
71
+ end
72
+
73
+ def parse_seo(table)
74
+ url = %w( span[@class="g"] span[@class="c-showurl"] div[@class="op_zhidao_showurl"]).map do |xpath|
75
+ span = table.search(xpath).first
76
+ span.text.sub(/\d{4}-\d{1,2}-\d{1,2}/,'').strip if span
77
+ end.compact.first
78
+ host = Addressable::URI.parse(URI.encode("http://#{url}")).host
79
+ {
80
+ :text => table.search("h3").first.text.strip,
81
+ :href => table.search('a').first['href'].strip,
82
+ :host => host
83
+ }
84
+ end
119
85
  end
120
86
  end
121
87
  end
@@ -1,113 +1,69 @@
1
1
  module Query
2
2
  module Result
3
- class BaiduMobile < Base
4
- #返回当前页所有查询结果
5
- def ranks
6
- #如果已经赋值说明解析过,不需要重新解析,直接返回结果
7
- return @ranks unless @ranks.nil?
8
- @ranks = Hash.new
9
- @page.xpath('//div[@class="result"]').each do |result|
10
- href,text,host,is_mobile = '','','',false
11
- a = result.search("a").first
12
- is_mobile = true unless a.search("img").empty?
13
- host = result.search('[@class="site"]').first
14
- next if host.nil?
15
- host = host.text
16
- href = a['href']
17
- text = a.text
18
- id = href.scan(/&order=(\d+)&/)
19
- if id.empty?
20
- id = nil
21
- else
22
- id = id.first.first.to_i
23
- # id = (@pagenumber-1)*10+id
24
- end
25
- =begin
26
- result.children.each do |elem|
27
- if elem.name == 'a'
28
- href = elem['href']
29
- id = elem.text.match(/^\d+/).to_s.to_i
30
- text = elem.text.sub(/^\d+/,'')
31
- text.sub!(/^\u00A0/,'')
32
- elsif elem['class'] == 'abs'
33
- elem.children.each do |elem2|
34
- if elem2['class'] == 'site'
35
- host = elem2.text
36
- break
37
- end
38
- end
39
- elsif elem['class'] == 'site'
40
- host == elem['href']
41
- end
42
- end
43
- =end
3
+ class BaiduMobile
4
+ include Query::Result
44
5
 
45
- @ranks[id.to_s] = {'href'=>href,'text'=>text,'is_mobile'=>is_mobile,'host'=>host.sub(/\u00A0/,'')}
6
+ def seo_ranks
7
+ @seo_ranks ||= @page.search("//*[@class='result']|//*[@class='card-result wa-ue-card-result']|//*[@class='result card-result wma-card-box']").map.with_index do |div,index|
8
+ parse_seo(div).merge({:rank => index + 1})
46
9
  end
47
- @ranks
48
10
  end
11
+
49
12
  def ads_top
50
- id = 0
51
- result = []
52
- @page.search("div[@class='ec_wise_ad']/div").each do |div|
53
- id += 1
54
- href = div.search("span[@class='ec_site']").first.text
55
- href = "http://#{href}"
56
- title = div.search("a/text()").text.strip
57
- host = Addressable::URI.parse(URI.encode(href)).host
58
- result[id] = {'title'=>title,'href'=>href,'host'=>host}
13
+ @ads_top ||= @page.search("//*[@class='result']/preceding-sibling::div[@class='ec_wise_ad']/div").map.with_index do |div,index|
14
+ puts index
15
+ parse_ad(div).merge({:rank => index + 1})
59
16
  end
60
- result
61
17
  end
18
+
62
19
  def ads_right
63
20
  []
64
21
  end
22
+
65
23
  def ads_bottom
66
- []
24
+ @ads_bottom ||= @page.search("//*[@class='result']/following-sibling::div[@class='ec_wise_ad']/div/div").map.with_index do |div,index|
25
+ parse_ad(div).merge({:rank => index + 1})
26
+ end
67
27
  end
28
+
29
+ #酒店预订 酒店英文 酒店团购 酒店管理 酒店招聘 快捷酒店 如家快捷酒店 五星级酒店
68
30
  def related_keywords
69
- @related_keywords ||= @page.search("div[@class='relativewords_info']/a").map{|a|a.text}
31
+ @related_keywords ||= @page.search("div[@class='rw-list']/a").map{|a|a.text}
70
32
  end
71
- =begin
72
- #返回当前页中,符合host条件的结果
73
- def ranks_for(specific_host)
74
- host_ranks = Hash.new
75
- ranks.each do |id,line|
76
- if specific_host.class == Regexp
77
- host_ranks[id] = line if line['host'] =~ specific_host
78
- elsif specific_host.class == String
79
- host_ranks[id] = line if line['host'] == specific_host
80
- end
81
- end
82
- host_ranks
33
+
34
+ def next_url
35
+ @next_url ||= @page.xpath('//a[contains(text(),"下一页")]').first['href']
83
36
  end
84
- #return the top rank number from @ranks with the input host
85
- def rank(host)#on base of ranks
86
- ranks.each do |id,line|
87
- id = id.to_i
88
- if host.class == Regexp
89
- return id if line['host'] =~ host
90
- elsif host.class == String
91
- return id if line['host'] == host
92
- end
93
- end
94
- return nil
37
+
38
+ def count
39
+
95
40
  end
96
- =end
97
- #下一页
98
- def next
99
- nextbutton = @page.xpath('//a[text()="下一页"]').first
100
- return nil if nextbutton.nil?
101
- url = URI.encode nextbutton['href']
102
- # puts url
103
- # p @baseuri
104
- # exit
105
- url = URI.join(@baseuri,url).to_s
106
- page = HTTParty.get(url)
107
- r = Query::Result::BaiduMobile.new(page)
108
- r.baseuri=url
109
- r.pagenumber=@pagenumber+1
110
- r
41
+
42
+ private
43
+ def parse_ad(div)
44
+ url = div.search("span[@class='ec_site']").first.text
45
+ url = "http://#{url}"
46
+ {
47
+ :text => div.search('a/text()').text.strip,
48
+ :href => div.search('a').first['href'],
49
+ :host => Addressable::URI.parse(URI.encode(url)).host
50
+ }
51
+ end
52
+
53
+ def parse_seo(div)
54
+ a = div.search('a').first
55
+ if div['class'] == 'card-result wa-ue-card-result'
56
+ host = div.search("*[@class='wa-hotelgeneral-gray wa-hotelgeneral-info-sub-title']").text
57
+ elsif div['class'] == 'result card-result wma-card-box' and div['srcid'] == 'map'
58
+ host = 'map.baidu.com'
59
+ else
60
+ host = div.search("*[@class='site']").first.text
61
+ end
62
+ {
63
+ :text => a.text,
64
+ :href => a['href'],
65
+ :host => host
66
+ }
111
67
  end
112
68
  end
113
69
  end
@@ -0,0 +1,66 @@
1
+ module Query
2
+ module Result
3
+ class Qihu
4
+ include Query::Result
5
+ def seo_ranks
6
+ @page.search('//ul[@id="m-result"]/li//h3').map.with_index do |h3,index|
7
+ a = h3.search('a').first
8
+ {
9
+ :rank => index + 1,
10
+ :href => a['href'],
11
+ :text => a.text.strip,
12
+ :host => Addressable::URI.parse(a['href']).host
13
+ }
14
+ end
15
+ end
16
+
17
+ def ads_top
18
+ @page.search("//ul[@id='djbox']/li").map.with_index do |li,index|
19
+ a = li.search("a").first
20
+ href = CGI.parse(URI(a['_cs']).query)['aurl'].first
21
+ {
22
+ :rank => index + 1,
23
+ :text => a.text,
24
+ :href => href,
25
+ :host => Addressable::URI.parse(URI.encode(href)).host
26
+ }
27
+ end
28
+ end
29
+
30
+ def ads_bottom
31
+ []
32
+ end
33
+
34
+ def ads_right
35
+ @page.search("//ul[@id='rightbox']/li").map.with_index do |li,index|
36
+ a = li.search('a').first
37
+ href = CGI.parse(URI(a['_cs']).query)['aurl'].first
38
+ host = Addressable::URI.parse(URI.encode(href)).host
39
+ {
40
+ :rank => index + 1,
41
+ :text => a.text,
42
+ :href => href,
43
+ :host => host
44
+ }
45
+ end
46
+ end
47
+
48
+ def related_keywords
49
+ []
50
+ end
51
+
52
+ def count
53
+ @page.search('//span[@class="nums"]').first.text.gsub(/\D/,'').to_i
54
+ end
55
+
56
+ #下一页
57
+ def next_url
58
+ next_href = @page.xpath('//a[@id="snext"]').first['href']
59
+ end
60
+ #有结果
61
+ def has_result?
62
+ !@page.search('//div[@id="main"]/h3').text().include?'没有找到该URL'
63
+ end
64
+ end
65
+ end
66
+ end
@@ -1,6 +1,6 @@
1
1
  module Query
2
2
  module Result
3
- class QihooMobile
3
+ class QihuMobile
4
4
  end
5
5
  end
6
6
  end