query 0.0.1

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: e414f7561d351c15835fb94956919de3d9d6ac62
+   data.tar.gz: 1e75cc2eb2d552b779d7cc33865248cb73d00d64
+ SHA512:
+   metadata.gz: e07b5ab0f23e5775945fa66cd55ae0e822888da4d5aa65968ee5a0cf91d219a3126b2af7836e23cde660b43fccca4570cc33be142d08b83092388de975a84339
+   data.tar.gz: baa4d81223b5911ca159735bdff34ba6b4646ebeefe83112463c733373697ee0823b3de5e57513e97105c55f663bce58d87e8530f48c5d2a8718e50f80785974
data/.gitignore ADDED
@@ -0,0 +1,23 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ coverage
+ InstalledFiles
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+ # YARD artifacts
+ .yardoc
+ _yardoc
+ doc/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in query.gemspec
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,20 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2013 seoaqua
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
+ this software and associated documentation files (the "Software"), to deal in
+ the Software without restriction, including without limitation the rights to
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+ the Software, and to permit persons to whom the Software is furnished to do so,
+ subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2013 刘明
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,24 @@
+ Query
+ ==========
+
+
+     # to get the result list by querying "abc"
+     Query::Engine::Baidu.new.query("abc").ranks().each do |id,value|
+       puts id,value
+     end
+
+     # to get the results restricted to host "www.abc.com.cn" when querying "abc"
+     Query::Engine::Baidu.new.query("abc").ranks_for("www.abc.com.cn").each do |id,value|
+       puts id,value
+     end
+
+     # to get the results whose host matches the regex /com.cn/ when querying "abc"
+     Query::Engine::Baidu.new.query("abc").ranks_for(/com.cn/).each do |id,value|
+       puts id,value
+     end
+
+     # to get the top rank of host "www.abc.com.cn" when querying "abc"
+     Query::Engine::Baidu.new.query("abc").rank("www.abc.com.cn")
+
+ TODO:
+ handle queries with few results or a missing next page, and add RSpec coverage
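Note: a hedged usage sketch based only on the API shown in this gem (query, count, ranks and next); the keyword "abc" is illustrative and live network access is assumed.

    require 'query'

    # first result page for the keyword
    page = Query::Engine::Baidu.new.query("abc")
    puts "approximate result count: #{page.count}"

    # `next` returns the following result page, or nil when there is none
    second = page.next
    second.ranks.each { |id, rank| puts "#{id}: #{rank['host']}" } if second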
data/Rakefile ADDED
@@ -0,0 +1 @@
+ require "bundler/gem_tasks"
data/lib/query/engine/baidu.rb ADDED
@@ -0,0 +1,87 @@
+ module Query
+   module Engine
+     class Baidu < Base
+       BaseUri = 'http://www.baidu.com/s?'
+       def self.suggestions(wd)
+         require 'json'
+         json = HTTParty.get("http://suggestion.baidu.com/su?wd=#{URI.encode(wd)}&cb=callback").force_encoding('GBK').encode("UTF-8")
+         m = /\[([^\]]*)\]/.match json
+         return JSON.parse m[0]
+       end
+       # to find out the real url for something like 'www.baidu.com/link?url=7yoYGJqjJ4zBBpC8yDF8xDhctimd_UkfF8AVaJRPKduy2ypxVG18aRB5L6D558y3MjT_Ko0nqFgkMoS'
+       # def url(id)
+       #   a = Mechanize.new
+       #   a.redirect_ok=false
+       #   return a.head("http://www.baidu.com/link?url=#{id}").header['location']
+       # end
+
+ =begin
+       def extend(words,level=3,sleeptime=1)
+         level = level.to_i - 1
+         words = [words] unless words.respond_to? 'each'
+
+         extensions = Array.new
+         words.each do |word|
+           self.query(word)
+           extensions += related_keywords
+           extensions += suggestions(word)
+           sleep sleeptime
+         end
+         extensions.uniq!
+         return extensions if level < 1
+         return extensions + extend(extensions,level)
+       end
+ =end
+
+       def self.popular?(wd)
+         return HTTParty.get("http://index.baidu.com/main/word.php?word=#{URI.encode(wd.encode("GBK"))}").include?"boxFlash"
+       end
+
+       def query(wd)
+         q = Array.new
+         q << "wd=#{wd}"
+         q << "rn=#{@perpage.to_i}" if @perpage
+         queryStr = q.join("&")
+         #uri = URI.encode((BaseUri + queryStr).encode('GBK'))
+         uri = URI.encode((BaseUri + queryStr))
+         begin
+           # @page = @a.get uri
+           @page = HTTParty.get uri
+           r = Query::Result::Baidu.new(@page)
+           r.baseuri = uri
+           r.pagenumber = 1
+           r.perpage = @perpage
+           r
+         rescue Exception => e
+           warn e.to_s
+           return false
+         end
+ =begin
+         query = "#{query}"
+         @uri = BaseUri+URI.encode(query.encode('GBK'))
+         @page = @a.get @uri
+         self.clean
+         @number = self.how_many
+         @maxpage = (@number / @perpage.to_f).round
+         @maxpage =10 if @maxpage>10
+         @currpage =0
+ =end
+       end
+
+       # site:xxx.yyy.com
+       def pages(host)
+         query("site:#{host}")
+       end
+
+       # domain:xxx.yyy.com/path/file.html
+       def links(uri)
+         query("domain:\"#{uri}\"")
+       end
+
+       # site:xxx.yyy.com inurl:zzz
+       def pages_with(host,string)
+         query("site:#{host} inurl:#{string}")
+       end
+     end
+   end
+ end
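Note: a rough illustration of the helpers defined above (suggestions, popular?, pages, links, pages_with); the keywords and hosts are placeholders and network access is assumed.

    # keyword suggestions scraped from Baidu's suggestion endpoint (an array of strings)
    Query::Engine::Baidu.suggestions('ruby').each { |word| puts word }

    # true when Baidu Index reports data for the keyword (the "boxFlash" check above)
    puts Query::Engine::Baidu.popular?('ruby')

    baidu = Query::Engine::Baidu.new
    # site:/domain:/inurl: style queries all return Query::Result::Baidu objects
    puts baidu.pages('baidu.com').count
    puts baidu.links('baidu.com/index.html').count
    puts baidu.pages_with('baidu.com', 'news').count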
data/lib/query/engine/baidu_mobile.rb ADDED
@@ -0,0 +1,26 @@
+ module Query
+   module Engine
+     class BaiduMobile < Base
+       BaseUri = 'http://m.baidu.com/s?'
+       headers = {
+         "User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
+       }
+       Options = {:headers => headers}
+
+       # basic query, equivalent to typing the keyword into the search box
+       def query(wd)
+         queryStr = "word=#{wd}"
+         uri = URI.encode((BaseUri + queryStr))
+         # begin
+         res = HTTParty.get(uri,Options)
+         r = Query::Result::BaiduMobile.new(res)
+         r.baseuri = uri
+         r
+         # rescue Exception => e
+         #   warn "#{__FILE__} #{__LINE__} #{uri} fetch error: #{e.to_s}"
+         #   return false
+         # end
+       end
+     end
+   end
+ end
data/lib/query/engine/base.rb ADDED
@@ -0,0 +1,16 @@
+ module Query
+   module Engine
+     class Base
+       attr_accessor :perpage
+       # whether a URL is indexed by the engine
+       # def initialize(perpage = 100)
+       #   @perpage = perpage # only 10 or 100 are allowed
+       # end
+       def indexed?(url)
+         URI(url)
+         result = query(url)
+         return result.has_result?
+       end
+     end
+   end
+ end
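Note: indexed? is the one behaviour every engine inherits from this base class; a minimal sketch of how it is called (the URLs are the same ones used in the specs below).

    baidu = Query::Engine::Baidu.new
    baidu.indexed?('http://www.baidu.com')        # => true when Baidu has the URL in its index
    baidu.indexed?('http://zxv.not-exists.com')   # => false otherwise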
data/lib/query/engine/qihoo.rb ADDED
@@ -0,0 +1,32 @@
+ module Query
+   module Engine
+     class Qihoo < Base
+       Host = 'www.so.com'
+       # basic query, equivalent to typing the keyword into the search box
+       def query(wd)
+         # request with the raw path first
+         uri = URI.join("http://#{Host}/",URI.encode('s?q='+wd)).to_s
+         page = HTTParty.get(uri)
+         # if the request was redirected, re-read the current page's URI to avoid pagination errors
+         uri = URI.join("http://#{Host}/",page.request.path).to_s
+         r = Query::Result::Qihoo.new(page)
+         r.baseuri = uri
+         r
+       end
+       def self.related_keywords(wd)
+         url = "http://rs.so.com/?callback=Search.relate.render&encodein=utf-8&encodeout=utf-8&q="+URI.encode(wd)
+         # uri = URI.join("http://#{Host}/",URI.encode('s?q='+wd)).to_s
+         page = HTTParty.get(url)
+         json_str = page.body
+         json_str = json_str.split("(")[1]
+         return nil if json_str.nil?
+         json_str = json_str.delete(');').strip
+         parsed_json = JSON.parse(json_str)
+         # each
+         # parsed_json.map { |q| p q['q']}
+         @related_keywords = parsed_json.map { |q| q['q'] }
+         # @related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
+       end
+     end
+   end
+ end
data/lib/query/engine/qihoo_mobile.rb ADDED
@@ -0,0 +1,2 @@
+ module Query
+ end
data/lib/query/engine.rb ADDED
@@ -0,0 +1,10 @@
+ module Query
+   module Engine
+   end
+ end
+
+ require 'query/engine/base'
+ require 'query/engine/baidu'
+ require 'query/engine/baidu_mobile'
+ require 'query/engine/qihoo'
+ require 'query/engine/qihoo_mobile'
data/lib/query/result/baidu.rb ADDED
@@ -0,0 +1,121 @@
+ module Query
+   module Result
+     class Baidu < Base
+       def ranks
+         return @ranks unless @ranks.nil?
+         @ranks = Hash.new
+         @page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
+           id = table['id']
+           # if @perpage == 10
+           #   id = table['id'][-1,1]
+           #   id = '10' if id == '0'
+           # end
+
+           @ranks[id] = Hash.new
+           url = table.search("[@class=\"g\"]").first
+           url = url.text unless url.nil?
+           a = table.search("h3").first
+           next if a.nil?
+           @ranks[id]['text'] = a.text
+           @ranks[id]['href'] = url #a.first['href'].sub('http://www.baidu.com/link?url=','').strip
+           unless url.nil?
+             url = url.strip
+             @ranks[id]['host'] = Addressable::URI.parse(URI.encode("http://#{url}")).host
+           else
+             @ranks[id]['host'] = nil
+           end
+         end
+         #@page.search("//table[@class=\"result\"]").map{|table|@page.search("//table[@id=\"#{table['id']}\"]//span[@class=\"g\"]").first}.map{|rank|URI(URI.encode('http://'+rank.text.strip)).host unless rank.nil?}
+         @ranks
+       end
+
+       def ads_bottom
+         return {} if @page.search("//table[@bgcolor='f5f5f5']").empty?
+         return ads_top
+         # p @page.search("//table[@bgcolor='f5f5f5']").empty?
+       end
+       def ads_top
+         # ads on a grey background, shown both above and below the results
+         ads = Hash.new
+         @page.search("//table[@bgcolor='#f5f5f5']").each do |table|
+           id = table['id']
+           next if id.nil?
+           id = id[2,3].to_i.to_s
+           ads[id]= parse_ad(table)
+         end
+         # ads on a white background, shown above the results only
+         if ads.empty?
+           @page.search("//table").each do |table|
+             id = table['id']
+             next if id.nil? or id.to_i<3000
+             id = id[2,3].to_i.to_s
+             ads[id]= parse_ad(table)
+           end
+         end
+         ads
+       end
+       def parse_ad(table)
+         href = table.search("font[@color='#008000']").text.split(/\s/).first.strip
+         title = table.search("a").first.text.strip
+         {'title'=>title,'href' => href,'host'=>href}
+       end
+       def ads_right
+         ads = {}
+         @page.search("//div[@id='ec_im_container']").each do |table|
+           table.search("div[@id]").each do |div|
+             id = div['id'][-1,1].to_i+1
+             title = div.search("a").first
+             next if title.nil?
+             title = title.text
+             url = div.search("font[@color='#008000']").first
+             next if url.nil?
+             url = url.text
+             ads[id.to_s] = {'title'=>title,'href'=>url,'host'=>url}
+           end
+         end
+         ads
+       end
+
+       #return the top rank number from @ranks with the input host
+       # def rank(host)#on base of ranks
+       #   ranks.each do |id,line|
+       #     id = id.to_i
+       #     if host.class == Regexp
+       #       return id if line['host'] =~ host
+       #     elsif host.class == String
+       #       return id if line['host'] == host
+       #     end
+       #   end
+       #   return nil
+       # end
+
+       def count
+         @count ||= @page.search("//span[@class='nums']").map{|num|num.content.gsub(/\D/,'').to_i unless num.nil?}.first
+       end
+
+       def related_keywords
+         @related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map{|keyword| keyword.text}
+       end
+
+       def next
+         url = @page.xpath('//a[text()="下一页>"]').first
+         return if url.nil?
+         url = url['href']
+         url = URI.join(@baseuri,url).to_s
+         page = HTTParty.get(url)
+         r = Query::Result::Baidu.new(page)
+         r.baseuri = url
+         r.pagenumber=@pagenumber+1
+         r.perpage=@perpage
+         r
+
+         # @page = BaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
+       end
+       def has_result?
+         submit = @page.search('//a[text()="提交网址"]').first
+         return false if submit and submit['href'].include?'sitesubmit'
+         return true
+       end
+     end
+   end
+ end
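Note: a small sketch of how the ad and related-keyword parsers above would be driven; the keyword is illustrative and the ad blocks only appear for commercial queries.

    page = Query::Engine::Baidu.new.query('abc')
    page.ads_top.each    { |id, ad| puts "top    #{id}: #{ad['title']} #{ad['href']}" }
    page.ads_right.each  { |id, ad| puts "right  #{id}: #{ad['title']} #{ad['href']}" }
    page.ads_bottom.each { |id, ad| puts "bottom #{id}: #{ad['title']} #{ad['href']}" }
    puts page.related_keywords.inspect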
data/lib/query/result/baidu_mobile.rb ADDED
@@ -0,0 +1,114 @@
+ module Query
+   module Result
+     class BaiduMobile < Base
+       # return all results on the current page
+       def ranks
+         # already parsed; return the cached result instead of re-parsing
+         return @ranks unless @ranks.nil?
+         @ranks = Hash.new
+         @page.xpath('//div[@class="result"]').each do |result|
+           href,text,host,is_mobile = '','','',false
+           a = result.search("a").first
+           is_mobile = true unless a.search("img").empty?
+           host = result.search('[@class="site"]').first
+           next if host.nil?
+           host = host.text
+           href = a['href']
+           text = a.text
+           id = href.scan(/&order=(\d+)&/)
+           if id.empty?
+             id = nil
+           else
+             id = id.first.first.to_i
+             # id = (@pagenumber-1)*10+id
+           end
+ =begin
+           result.children.each do |elem|
+             if elem.name == 'a'
+               href = elem['href']
+               id = elem.text.match(/^\d+/).to_s.to_i
+               text = elem.text.sub(/^\d+/,'')
+               text.sub!(/^\u00A0/,'')
+             elsif elem['class'] == 'abs'
+               elem.children.each do |elem2|
+                 if elem2['class'] == 'site'
+                   host = elem2.text
+                   break
+                 end
+               end
+             elsif elem['class'] == 'site'
+               host == elem['href']
+             end
+           end
+ =end
+
+           @ranks[id.to_s] = {'href'=>href,'text'=>text,'is_mobile'=>is_mobile,'host'=>host.sub(/\u00A0/,'')}
+         end
+         @ranks
+       end
+       def ads_top
+         id = 0
+         result = []
+         @page.search("div[@class='ec_wise_ad']/div").each do |div|
+           id += 1
+           href = div.search("span[@class='ec_site']").first.text
+           href = "http://#{href}"
+           title = div.search("a/text()").text.strip
+           host = Addressable::URI.parse(URI.encode(href)).host
+           result[id] = {'title'=>title,'href'=>href,'host'=>host}
+         end
+         result
+       end
+       def ads_right
+         []
+       end
+       def ads_bottom
+         []
+       end
+       def related_keywords
+         @related_keywords ||= @page.search("div[@class='relativewords_info']/a").map{|a|a.text}
+       end
+ =begin
+       # return results on the current page whose host matches the condition
+       def ranks_for(specific_host)
+         host_ranks = Hash.new
+         ranks.each do |id,line|
+           if specific_host.class == Regexp
+             host_ranks[id] = line if line['host'] =~ specific_host
+           elsif specific_host.class == String
+             host_ranks[id] = line if line['host'] == specific_host
+           end
+         end
+         host_ranks
+       end
+       #return the top rank number from @ranks with the input host
+       def rank(host)#on base of ranks
+         ranks.each do |id,line|
+           id = id.to_i
+           if host.class == Regexp
+             return id if line['host'] =~ host
+           elsif host.class == String
+             return id if line['host'] == host
+           end
+         end
+         return nil
+       end
+ =end
+       # next page
+       def next
+         nextbutton = @page.xpath('//a[text()="下一页"]').first
+         return nil if nextbutton.nil?
+         url = URI.encode nextbutton['href']
+         # puts url
+         # p @baseuri
+         # exit
+         url = URI.join(@baseuri,url).to_s
+         page = HTTParty.get(url)
+         r = Query::Result::BaiduMobile.new(page)
+         r.baseuri=url
+         r.pagenumber=@pagenumber+1
+         r
+       end
+     end
+   end
+ end
data/lib/query/result/base.rb ADDED
@@ -0,0 +1,50 @@
+ module Query
+   module Result
+     class Base
+       attr_accessor :baseuri,:pagenumber,:perpage
+       def initialize(page)
+         @page = Nokogiri::HTML page
+         @pagenumber = 1
+       end
+       # def initialize(page,baseuri,pagenumber=1,perpage=100)
+       #   @page = Nokogiri::HTML page
+       #   @baseuri = baseuri
+       #   # @host = URI(baseuri).host
+       #   @pagenumber = pagenumber
+       #   @perpage = perpage
+       # end
+       def whole
+         {
+           'ads_top'=>ads_top,
+           'ads_right'=>ads_right,
+           'ads_bottom'=>ads_bottom,
+           'ranks'=>ranks
+         }
+       end
+       # return results on the current page whose host matches the condition
+       def ranks_for(specific_host)
+         host_ranks = Hash.new
+         ranks.each do |id,line|
+           if specific_host.class == Regexp
+             host_ranks[id] = line if line['host'] =~ specific_host
+           elsif specific_host.class == String
+             host_ranks[id] = line if line['host'] == specific_host
+           end
+         end
+         host_ranks
+       end
+       #return the top rank number from @ranks with the input host
+       def rank(host)#on base of ranks
+         ranks.each do |id,line|
+           id = id.to_i
+           if host.class == Regexp
+             return id if line['host'] =~ host
+           elsif host.class == String
+             return id if line['host'] == host
+           end
+         end
+         return nil
+       end
+     end
+   end
+ end
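Note: rank and ranks_for accept either an exact host string or a Regexp; a small sketch with placeholder hosts, mirroring the README examples.

    result = Query::Engine::Baidu.new.query("abc")

    # exact host match: first matching position (an Integer), or nil when the host is absent
    puts result.rank("www.abc.com.cn")

    # Regexp match: every result whose host matches the pattern
    result.ranks_for(/com\.cn/).each do |id, line|
      puts "#{id} #{line['host']} #{line['text']}"
    end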
data/lib/query/result/qihoo.rb ADDED
@@ -0,0 +1,75 @@
+ module Query
+   module Result
+     class Qihoo < Base
+       # include Query::Result
+       Host = 'www.so.com'
+       # return all ranking results on the current page
+       def ranks
+         return @ranks unless @ranks.nil?
+         @ranks = Hash.new
+         # id = (@pagenumber - 1) * 10
+         id = 0
+         @page.search('//li[@class="res-list"]').each do |li|
+           a = li.search("h3/a").first
+           url = li.search("cite")
+           next if a['data-pos'].nil?
+           id += 1
+           text = a.text.strip
+           href = a['href']
+           url = url.first.text
+           host = Addressable::URI.parse(URI.encode("http://#{url}")).host
+           @ranks[id.to_s] = {'href'=>a['href'],'text'=>text,'host'=>host}
+         end
+         @ranks
+       end
+       def ads_top
+         id = 0
+         result = []
+         @page.search("//ul[@id='djbox']/li").each do |li|
+           id += 1
+           title = li.search("a").first.text
+           href = li.search("cite").first.text.downcase
+           host = Addressable::URI.parse(URI.encode(href)).host
+           result[id] = {'title'=>title,'href'=>href,'host'=>host}
+         end
+         result
+       end
+       def ads_bottom
+         []
+       end
+       def ads_right
+         id = 0
+         result = []
+         @page.search("//ul[@id='rightbox']/li").each do |li|
+           id += 1
+           title = li.search("a").first.text
+           href = li.search("cite").first.text.downcase
+           host = Addressable::URI.parse(URI.encode(href)).host
+           result[id] = {'title'=>title,'href'=>href,'host'=>host}
+         end
+         result
+       end
+       def related_keywords
+         []
+       end
+       # next page
+       def next
+         next_href = @page.xpath('//a[@id="snext"]')
+         return false if next_href.empty?
+         next_href = next_href.first['href']
+         next_href = URI.join(@baseuri,next_href).to_s
+         # next_href = URI.join("http://#{@host}",next_href).to_s
+         next_page = HTTParty.get(next_href).next
+         r =Query::Result::Qihoo.new(next_page)
+         r.baseuri=next_href
+         r.pagenumber=@pagenumber+1
+         r
+         #@page = MbaiduResult.new(Mechanize.new.click(@page.link_with(:text=>/下一页/))) unless @page.link_with(:text=>/下一页/).nil?
+       end
+       # whether the query returned results
+       def has_result?
+         !@page.search('//div[@id="main"]/h3').text().include?'没有找到该URL'
+       end
+     end
+   end
+ end
data/lib/query/result/qihoo_mobile.rb ADDED
@@ -0,0 +1,6 @@
+ module Query
+   module Result
+     class QihooMobile
+     end
+   end
+ end
data/lib/query/result.rb ADDED
@@ -0,0 +1,10 @@
+ module Query
+   module Result
+   end
+ end
+ require 'nokogiri'
+ require 'query/result/base'
+ require 'query/result/baidu'
+ require 'query/result/baidu_mobile'
+ require 'query/result/qihoo'
+ require 'query/result/qihoo_mobile'
data/lib/query/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Query
+   VERSION = "0.0.1"
+ end
data/lib/query.rb ADDED
@@ -0,0 +1,9 @@
+ require "query/version"
+ require "query/engine"
+ require "query/result"
+ require "httparty"
+ require 'addressable/uri'
+ require 'awesome_print'
+ module Query
+   # Your code goes here...
+ end
data/query.gemspec ADDED
@@ -0,0 +1,27 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'query/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "query"
+   spec.version       = Query::VERSION
+   spec.authors       = ["seoaqua"]
+   spec.email         = ["seoaqua@me.com"]
+   spec.description   = %q{This GEM is designed to work for SEOers who need to fetch query and parse results from all kinds of search engines}
+   spec.summary       = %q{Now its only support Chinese main search engines}
+   spec.homepage      = "https://github.com/seoaqua/query"
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 1.3"
+   spec.add_development_dependency "rake"
+   spec.add_dependency "nokogiri"
+   spec.add_dependency "addressable"
+   spec.add_dependency "httparty"
+
+ end
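Note: to pull the gem into another project, a typical Gemfile entry would look like this (a sketch; the version constraint is an assumption).

    source 'https://rubygems.org'

    gem 'query', '~> 0.0.1'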
data/spec/baidu_mobile_spec.rb ADDED
@@ -0,0 +1,19 @@
+ #coding:UTF-8
+ require 'spec_helper'
+ describe Query::Engine::BaiduMobile do
+   mbaidu = Query::Engine::BaiduMobile.new
+   page = mbaidu.query '百度'
+   it "should return a #{Query::Result::BaiduMobile}" do
+     page.class.should == Query::Result::BaiduMobile
+   end
+   it "the next page should also be a Query::Result::BaiduMobile" do
+     page.next.class.should == Query::Result::BaiduMobile
+     page.next.next.class.should == Query::Result::BaiduMobile
+   end
+   it "the wapbaike.baidu.com domain should rank below position 1" do
+     page.rank('wapbaike.baidu.com').should > 1
+   end
+   it "the m.baidu.com domain should rank within the top 10" do
+     page.rank('m.baidu.com').should < 11
+   end
+ end
data/spec/baidu_spec.rb ADDED
@@ -0,0 +1,73 @@
+ #coding:UTF-8
+ require 'spec_helper'
+ describe Query::Engine::Baidu do
+   baidu = Query::Engine::Baidu.new
+   page = baidu.query '百度'
+
+   it "should return Query::Result::Baidu" do
+     page.class.should == Query::Result::Baidu
+   end
+
+   it "should return more than 100,000 results" do
+     page.count.should > 100000
+   end
+   it "should return 1" do
+     page.rank('www.baidu.com').should == 1
+   end
+
+   it "should return Query::Result::Baidu" do
+     page.next.class.should == Query::Result::Baidu
+   end
+
+   it "should return true" do
+     bool = Query::Engine::Baidu.popular?'百度'
+     bool.should == true
+   end
+
+   it "should return false" do
+     bool = Query::Engine::Baidu.popular?'lavataliuming'
+     bool.should == false
+   end
+
+   it "should return over 5 words beginning with the query_word" do
+     query_word = '为'
+     suggestions = Query::Engine::Baidu.suggestions(query_word)
+     suggestions.size.should > 5
+     suggestions.each do |suggestion|
+       suggestion[0].should == query_word
+     end
+   end
+
+   it "should return 100,000,000" do
+     result = baidu.pages('baidu.com')
+     result.class.should == Query::Result::Baidu
+     result.count.should == 100000000
+   end
+
+   it "should return 100,000,000" do
+     result = baidu.links('baidu.com')
+     result.class.should == Query::Result::Baidu
+     result.count.should == 100000000
+   end
+   it "should return 100,000,000" do
+     result = baidu.pages_with('baidu.com','baidu.com')
+     result.class.should == Query::Result::Baidu
+     result.count.should == 100000000
+   end
+   it "checking an already indexed page should return true" do
+     baidu.indexed?('http://www.baidu.com').should == true
+   end
+   it "checking a non-existent page should return false" do
+     baidu.indexed?('http://zxv.not-exists.com').should == false
+   end
+   page1 = baidu.query('seoaqua.com')
+   it "every result should expose text, href and host" do
+     page1.ranks.each do |id,rank|
+       rank['href'].should_not == nil
+       rank['text'].should_not == nil
+       rank['host'].should_not == nil
+     end
+   end
+   # ads_page = baidu.query '减肥药'
+
+ end
data/spec/qihoo_spec.rb ADDED
@@ -0,0 +1,27 @@
+ #coding:UTF-8
+ require 'spec_helper'
+ describe Query::Engine::Qihoo do
+   qihoo = Query::Engine::Qihoo.new
+   page = qihoo.query '奇虎'
+   page2 = page.next
+   page3 = page2.next
+   it "querying '奇虎' should return the correct result instance" do
+     page.class.should == Query::Result::Qihoo
+   end
+   it "the next page should also be a Query::Result::Qihoo instance" do
+     page2.class.should == Query::Result::Qihoo
+   end
+   it "the page after the next page should also be a Query::Result::Qihoo instance" do
+     page3.class.should == Query::Result::Qihoo
+   end
+
+   it "the www.qihoo.com homepage should rank at position 1" do
+     page.rank('www.qihoo.com').should == 1
+   end
+   it "checking an already indexed page should return true" do
+     qihoo.indexed?('http://www.360.cn').should == true
+   end
+   it "checking a non-existent page should return false" do
+     qihoo.indexed?('http://zxv.not-exists.com').should == false
+   end
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1 @@
+ require 'query'
metadata ADDED
@@ -0,0 +1,144 @@
+ --- !ruby/object:Gem::Specification
+ name: query
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - seoaqua
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2013-10-03 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: '1.3'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: addressable
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: httparty
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: This GEM is designed to work for SEOers who need to fetch query and parse
+   results from all kinds of search engines
+ email:
+ - seoaqua@me.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - LICENSE
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - lib/query.rb
+ - lib/query/engine.rb
+ - lib/query/engine/baidu.rb
+ - lib/query/engine/baidu_mobile.rb
+ - lib/query/engine/base.rb
+ - lib/query/engine/qihoo.rb
+ - lib/query/engine/qihoo_mobile.rb
+ - lib/query/result.rb
+ - lib/query/result/baidu.rb
+ - lib/query/result/baidu_mobile.rb
+ - lib/query/result/base.rb
+ - lib/query/result/qihoo.rb
+ - lib/query/result/qihoo_mobile.rb
+ - lib/query/version.rb
+ - query.gemspec
+ - spec/baidu_mobile_spec.rb
+ - spec/baidu_spec.rb
+ - spec/qihoo_spec.rb
+ - spec/spec_helper.rb
+ homepage: https://github.com/seoaqua/query
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.1.5
+ signing_key:
+ specification_version: 4
+ summary: Now its only support Chinese main search engines
+ test_files:
+ - spec/baidu_mobile_spec.rb
+ - spec/baidu_spec.rb
+ - spec/qihoo_spec.rb
+ - spec/spec_helper.rb