query 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: e414f7561d351c15835fb94956919de3d9d6ac62
4
+ data.tar.gz: 1e75cc2eb2d552b779d7cc33865248cb73d00d64
5
+ SHA512:
6
+ metadata.gz: e07b5ab0f23e5775945fa66cd55ae0e822888da4d5aa65968ee5a0cf91d219a3126b2af7836e23cde660b43fccca4570cc33be142d08b83092388de975a84339
7
+ data.tar.gz: baa4d81223b5911ca159735bdff34ba6b4646ebeefe83112463c733373697ee0823b3de5e57513e97105c55f663bce58d87e8530f48c5d2a8718e50f80785974
data/.gitignore ADDED
@@ -0,0 +1,23 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ coverage
12
+ InstalledFiles
13
+ lib/bundler/man
14
+ pkg
15
+ rdoc
16
+ spec/reports
17
+ test/tmp
18
+ test/version_tmp
19
+ tmp
20
+ # YARD artifacts
21
+ .yardoc
22
+ _yardoc
23
+ doc/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in query.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2013 seoaqua
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
6
+ this software and associated documentation files (the "Software"), to deal in
7
+ the Software without restriction, including without limitation the rights to
8
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
9
+ the Software, and to permit persons to whom the Software is furnished to do so,
10
+ subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 刘明
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,24 @@
1
+ Query
2
+ ==========
3
+
4
+
5
+ #to get the result list by querying "abc"
6
+ Query::Engine::Baidu.new.query("abc").ranks().each do |id,value|
7
+ puts id,value
8
+ end
9
+
10
+ #to get the result list with host "www.abc.com.cn" by querying "abc"
11
+ Query::Engine::Baidu.new.query("abc").ranks_for("www.abc.com.cn").each do |id,value|
12
+ puts id,value
13
+ end
14
+
15
+ #to get the result list with host which fit the regex /com.cn/ by querying "abc"
16
+ Query::Engine::Baidu.new.query("abc").ranks_for(/com.cn/).each do |id,value|
17
+ puts id,value
18
+ end
19
+
20
+ # to get the top rank of host "www.abc.com.cn" by querying "abc"
21
+ Query::Engine::Baidu.new.query("abc").rank("www.abc.com.cn")
22
+
23
+ TODO:
24
+ 查询结果不多,翻页不存在时的处理,及rspec
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,87 @@
1
module Query
  module Engine
    # Search engine front-end for Baidu desktop web search.
    #
    # Instance methods issue an HTTP GET through HTTParty and wrap the response
    # in a Query::Result::Baidu.
    class Baidu < Base
      BaseUri = 'http://www.baidu.com/s?'

      # Fetches the search-box suggestions Baidu offers for +wd+.
      #
      # The response body is treated as GBK and transcoded to UTF-8; bytes that
      # cannot be transcoded are replaced instead of raising.
      #
      # @param wd [String] keyword to get suggestions for
      # @return [Array] the parsed suggestion list, or [] when the response
      #   contains no bracketed list
      def self.suggestions(wd)
        require 'json'
        params = URI.encode_www_form('wd' => wd, 'cb' => 'callback')
        raw = HTTParty.get("http://suggestion.baidu.com/su?#{params}").body.to_s.dup
        jsonp = raw.force_encoding('GBK').encode('UTF-8', :invalid => :replace, :undef => :replace)
        # The suggestions are the first [...] list inside the JSONP callback.
        list = /\[([^\]]*)\]/.match(jsonp)
        list ? JSON.parse(list[0]) : []
      end

      # Checks whether the Baidu Index page for +wd+ contains the "boxFlash"
      # marker.
      #
      # @param wd [String] keyword; it is sent GBK-encoded
      # @return [Boolean]
      def self.popular?(wd)
        word = URI.encode_www_form_component(wd.encode('GBK'))
        HTTParty.get("http://index.baidu.com/main/word.php?word=#{word}").body.to_s.include?('boxFlash')
      end

      # Runs a keyword search.
      #
      # The keyword is escaped as a query-string value, so characters such as
      # "&", "=" and "+" inside +wd+ can no longer corrupt the request.
      #
      # @param wd [String] the query as it would be typed into the search box
      # @return [Query::Result::Baidu, false] the first result page, or false
      #   when the request or parsing failed (the error is written via warn)
      def query(wd)
        params = { 'wd' => wd }
        params['rn'] = @perpage.to_i if @perpage
        uri = BaseUri + URI.encode_www_form(params)
        begin
          @page = HTTParty.get(uri)
          result = Query::Result::Baidu.new(@page)
          result.baseuri = uri
          result.pagenumber = 1
          result.perpage = @perpage
          result
        rescue StandardError => e
          # StandardError only: Interrupt and SystemExit must still propagate.
          warn e.to_s
          false
        end
      end

      # site:xxx.yyy.com
      #
      # @param host [String]
      # @return [Query::Result::Baidu, false] see #query
      def pages(host)
        query("site:#{host}")
      end

      # domain:xxx.yyy.com/path/file.html
      #
      # @param uri [String]
      # @return [Query::Result::Baidu, false] see #query
      def links(uri)
        query("domain:\"#{uri}\"")
      end

      # site:xxx.yyy.com inurl:zzz
      #
      # @param host [String]
      # @param string [String] text that must appear in the URL
      # @return [Query::Result::Baidu, false] see #query
      def pages_with(host, string)
        query("site:#{host} inurl:#{string}")
      end
    end
  end
end
@@ -0,0 +1,26 @@
1
module Query
  module Engine
    # Search engine front-end for Baidu mobile search (m.baidu.com).
    class BaiduMobile < Base
      BaseUri = 'http://m.baidu.com/s?'
      headers = {
        "User-Agent" => 'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5'
      }
      # Request options passed to every HTTParty call; they set the
      # User-Agent header above.
      Options = {:headers => headers}

      # Basic query, equivalent to typing the keyword into the search box.
      #
      # The keyword is escaped as a query-string value. Unlike
      # Query::Engine::Baidu#query nothing is rescued here, so request errors
      # propagate to the caller.
      #
      # @param wd [String] keyword to search for
      # @return [Query::Result::BaiduMobile] the first result page
      def query(wd)
        uri = BaseUri + URI.encode_www_form('word' => wd)
        response = HTTParty.get(uri, Options)
        result = Query::Result::BaiduMobile.new(response)
        result.baseuri = uri
        result
      end
    end
  end
end
@@ -0,0 +1,16 @@
1
module Query
  module Engine
    # Common behaviour of all search engines. Subclasses provide #query, which
    # returns a result object responding to #has_result?.
    class Base
      # Results per page requested from the engine (nil leaves it unset).
      attr_accessor :perpage

      # Tells whether the engine has +url+ in its index.
      #
      # @param url [String] URL to look up
      # @return [Boolean] the result page's #has_result?, or false when the
      #   query itself failed and returned a falsy value
      # @raise [URI::InvalidURIError] when +url+ is not a valid URI
      def indexed?(url)
        URI(url) # validation only: raises on a malformed URL
        result = query(url)
        # Some engines return false instead of raising when the request fails.
        return false unless result
        result.has_result?
      end
    end
  end
end
@@ -0,0 +1,32 @@
1
module Query
  module Engine
    # Search engine front-end for Qihoo 360 search (www.so.com).
    class Qihoo < Base
      Host = 'www.so.com'

      # Basic query, equivalent to typing the keyword into the search box.
      #
      # @param wd [String] keyword to search for
      # @return [Query::Result::Qihoo] the first result page
      def query(wd)
        # Request using the original path.
        uri = "http://#{Host}/s?#{URI.encode_www_form('q' => wd)}"
        page = HTTParty.get(uri)
        # If the request was redirected, rebuild the URI from the request path
        # so that paging links are resolved against the current page.
        uri = URI.join("http://#{Host}/", page.request.path).to_s
        result = Query::Result::Qihoo.new(page)
        result.baseuri = uri
        result
      end

      # Fetches the keywords the related-search service returns for +wd+.
      #
      # @param wd [String] keyword
      # @return [Array, nil] the 'q' field of every returned item, or nil when
      #   the response is not a JSONP call
      def self.related_keywords(wd)
        require 'json'
        url = "http://rs.so.com/?callback=Search.relate.render&encodein=utf-8&encodeout=utf-8&q=" + URI.encode_www_form_component(wd)
        payload = jsonp_payload(HTTParty.get(url).body.to_s)
        return nil if payload.nil?
        JSON.parse(payload).map { |item| item['q'] }
      end

      # Extracts the argument of a JSONP call: everything between the first
      # "(" and the final ")". Parentheses and semicolons inside the payload
      # are left untouched.
      #
      # @param jsonp [String] e.g. 'callback([...]);'
      # @return [String, nil] the payload, or nil when there is no such call
      def self.jsonp_payload(jsonp)
        call = /\((.*)\)\s*;?\s*\z/m.match(jsonp)
        call && call[1].strip
      end
      private_class_method :jsonp_payload
    end
  end
end
@@ -0,0 +1,2 @@
1
+ module Query
2
+ end
@@ -0,0 +1,10 @@
1
+ module Query
2
+ module Engine
3
+ end
4
+ end
5
+
6
+ require 'query/engine/base'
7
+ require 'query/engine/baidu'
8
+ require 'query/engine/baidu_mobile'
9
+ require 'query/engine/qihoo'
10
+ require 'query/engine/qihoo_mobile'
@@ -0,0 +1,121 @@
1
module Query
  module Result
    # One parsed page of Baidu desktop search results.
    class Baidu < Base
      # Organic results of the current page, parsed once and memoized.
      #
      # Result tables without an <h3> heading are skipped rather than recorded
      # as empty entries.
      #
      # @return [Hash] table id => {'text' => title, 'href' => displayed URL
      #   text (or nil), 'host' => host parsed from it (or nil)}
      def ranks
        return @ranks unless @ranks.nil?
        @ranks = {}
        @page.search("//table[@class=\"result\"]|//table[@class=\"result-op\"]").each do |table|
          heading = table.search("h3").first
          next if heading.nil?
          shown = table.search("[@class=\"g\"]").first
          shown = shown.text unless shown.nil?
          @ranks[table['id']] = {
            'text' => heading.text,
            'href' => shown,
            'host' => shown.nil? ? nil : host_of("http://#{shown.strip}")
          }
        end
        @ranks
      end

      # Promotions at the bottom of the page.
      #
      # @return [Hash] {} when the page has no grey-background promotion
      #   table, otherwise the same hash as #ads_top
      def ads_bottom
        # Uses the same bgcolor value as #ads_top ('#f5f5f5'); the selector
        # previously lacked the '#', unlike #ads_top.
        # NOTE(review): when grey tables exist this returns every grey ad,
        # exactly as #ads_top does - confirm that is the intended split.
        return {} if @page.search("//table[@bgcolor='#f5f5f5']").empty?
        ads_top
      end

      # Promotions at the top of the page.
      #
      # @return [Hash] position (String) => hash built by #parse_ad
      def ads_top
        ads = {}
        # Grey-background promotions; these appear both at the top and bottom.
        @page.search("//table[@bgcolor='#f5f5f5']").each do |table|
          id = table['id']
          next if id.nil?
          ads[ad_position(id)] = parse_ad(table)
        end
        return ads unless ads.empty?
        # White-background promotions; these appear only at the top. They are
        # recognised by a numeric table id of 3000 or more.
        @page.search("//table").each do |table|
          id = table['id']
          next if id.nil? || id.to_i < 3000
          ads[ad_position(id)] = parse_ad(table)
        end
        ads
      end

      # Builds the ad hash for one promotion table.
      #
      # @param table [Nokogiri::XML::Node]
      # @return [Hash] 'title', 'href' and 'host'; 'host' repeats 'href'.
      #   Missing pieces become empty strings.
      def parse_ad(table)
        href = table.search("font[@color='#008000']").text.split(/\s/).first.to_s.strip
        link = table.search("a").first
        title = link.nil? ? '' : link.text.strip
        {'title'=>title,'href' => href,'host'=>href}
      end

      # Promotions in the right-hand column.
      #
      # @return [Hash] position (String) => {'title', 'href', 'host'}
      def ads_right
        ads = {}
        @page.search("//div[@id='ec_im_container']").each do |container|
          container.search("div[@id]").each do |div|
            # The last character of the div id, plus one, is the position.
            position = div['id'][-1,1].to_i + 1
            title = div.search("a").first
            next if title.nil?
            url = div.search("font[@color='#008000']").first
            next if url.nil?
            ads[position.to_s] = {'title'=>title.text,'href'=>url.text,'host'=>url.text}
          end
        end
        ads
      end

      # Number read from the first span with class "nums" (all non-digits are
      # stripped), memoized.
      #
      # @return [Integer, nil] nil when the page has no such span
      def count
        @count ||= @page.search("//span[@class='nums']").map { |num| num.content.gsub(/\D/,'').to_i unless num.nil? }.first
      end

      # Texts of the links inside the div with id "rs", memoized.
      #
      # @return [Array<String>]
      def related_keywords
        @related_keywords ||= @page.search("//div[@id=\"rs\"]//tr//a").map { |keyword| keyword.text }
      end

      # Fetches the following result page.
      #
      # @return [Query::Result::Baidu, nil] nil when the page has no
      #   "next page" link
      def next
        link = @page.xpath('//a[text()="下一页>"]').first
        return if link.nil?
        url = URI.join(@baseuri, link['href']).to_s
        following = Query::Result::Baidu.new(HTTParty.get(url))
        following.baseuri = url
        following.pagenumber = @pagenumber + 1
        following.perpage = @perpage
        following
      end

      # @return [Boolean] false when the page shows a link with the
      #   "submit URL" text whose href contains 'sitesubmit', true otherwise
      def has_result?
        submit = @page.search('//a[text()="提交网址"]').first
        return false if submit && submit['href'].to_s.include?('sitesubmit')
        true
      end

      private

      # Position of an ad: three characters of the table id starting at
      # index 2, normalised through Integer.
      def ad_position(id)
        id[2,3].to_i.to_s
      end

      # Host part of +url+. URI::DEFAULT_PARSER.escape stands in for the
      # removed URI.encode.
      def host_of(url)
        Addressable::URI.parse(URI::DEFAULT_PARSER.escape(url)).host
      end
    end
  end
end
@@ -0,0 +1,114 @@
1
module Query
  module Result
    # One parsed page of Baidu mobile search results.
    class BaiduMobile < Base
      # All results of the current page, parsed once and memoized.
      #
      # Results without a link or without a "site" element are skipped.
      #
      # @return [Hash] order (String) => {'href', 'text', 'is_mobile', 'host'}.
      #   The order is read from the "&order=N&" parameter of the result link;
      #   a link without it is stored under the key "".
      def ranks
        return @ranks unless @ranks.nil?
        @ranks = {}
        @page.xpath('//div[@class="result"]').each do |result|
          link = result.search("a").first
          next if link.nil?
          site = result.search('[@class="site"]').first
          next if site.nil?
          href = link['href']
          order = href.to_s.scan(/&order=(\d+)&/)
          id = order.empty? ? nil : order.first.first.to_i
          @ranks[id.to_s] = {
            'href' => href,
            'text' => link.text,
            # A link containing an <img> is flagged as a mobile result.
            'is_mobile' => !link.search("img").empty?,
            'host' => site.text.sub(/\u00A0/,'')
          }
        end
        @ranks
      end

      # Promotions at the top of the page.
      #
      # @return [Array] ads stored at their 1-based position, so index 0 (and
      #   the slot of any ad without a site label) is nil
      def ads_top
        position = 0
        result = []
        @page.search("div[@class='ec_wise_ad']/div").each do |div|
          position += 1
          site = div.search("span[@class='ec_site']").first
          next if site.nil?
          href = "http://#{site.text}"
          title = div.search("a/text()").text.strip
          host = Addressable::URI.parse(URI::DEFAULT_PARSER.escape(href)).host
          result[position] = {'title'=>title,'href'=>href,'host'=>host}
        end
        result
      end

      # @return [Array] always empty
      def ads_right
        []
      end

      # @return [Array] always empty
      def ads_bottom
        []
      end

      # Texts of the links inside the "relativewords_info" div, memoized.
      #
      # @return [Array<String>]
      def related_keywords
        @related_keywords ||= @page.search("div[@class='relativewords_info']/a").map { |a| a.text }
      end

      # Fetches the next page.
      #
      # @return [Query::Result::BaiduMobile, nil] nil when the page has no
      #   "next page" link
      def next
        nextbutton = @page.xpath('//a[text()="下一页"]').first
        return nil if nextbutton.nil?
        # URI::DEFAULT_PARSER.escape stands in for the removed URI.encode.
        url = URI::DEFAULT_PARSER.escape(nextbutton['href'])
        url = URI.join(@baseuri, url).to_s
        following = Query::Result::BaiduMobile.new(HTTParty.get(url))
        following.baseuri = url
        following.pagenumber = @pagenumber + 1
        following
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
module Query
  module Result
    # Common behaviour of all parsed result pages.
    #
    # Subclasses provide #ranks (a Hash of id => {'host' => ...}) and the
    # #ads_top, #ads_right and #ads_bottom readers used by #whole.
    class Base
      attr_accessor :baseuri,:pagenumber,:perpage

      # @param page the fetched page; it is handed to Nokogiri::HTML
      def initialize(page)
        @page = Nokogiri::HTML page
        @pagenumber = 1
      end

      # Everything parsed from the page in one hash.
      #
      # @return [Hash] with the keys 'ads_top', 'ads_right', 'ads_bottom' and
      #   'ranks'
      def whole
        {
          'ads_top'=>ads_top,
          'ads_right'=>ads_right,
          'ads_bottom'=>ads_bottom,
          'ranks'=>ranks
        }
      end

      # Results of the current page whose host satisfies +specific_host+.
      #
      # @param specific_host [String, Regexp] exact host, or a pattern the host
      #   must match; any other kind of object selects nothing
      # @return [Hash] the matching subset of #ranks
      def ranks_for(specific_host)
        ranks.select { |_id, line| host_match?(specific_host, line['host']) }
      end

      # Top rank number from #ranks for the given host.
      #
      # @param host [String, Regexp] exact host, or a pattern the host must
      #   match
      # @return [Integer, nil] id (converted with to_i) of the first matching
      #   result, or nil when nothing matches
      def rank(host)
        ranks.each do |id, line|
          return id.to_i if host_match?(host, line['host'])
        end
        nil
      end

      private

      # Shared matching rule of #rank and #ranks_for. `case` dispatches with
      # ===, so subclasses of String and Regexp are accepted as well.
      def host_match?(wanted, host)
        case wanted
        when Regexp then !(wanted =~ host).nil?
        when String then host == wanted
        else false
        end
      end
    end
  end
end
@@ -0,0 +1,75 @@
1
module Query
  module Result
    # One parsed page of Qihoo 360 (www.so.com) search results.
    class Qihoo < Base
      Host = 'www.so.com'

      # All ranked results of the current page, parsed once and memoized.
      #
      # Only list items whose heading link carries a "data-pos" attribute are
      # counted; they are numbered from 1 in page order.
      #
      # @return [Hash] position (String) => {'href', 'text', 'host'}; 'host'
      #   is nil when the item has no <cite> element
      def ranks
        return @ranks unless @ranks.nil?
        @ranks = {}
        position = 0
        @page.search('//li[@class="res-list"]').each do |li|
          link = li.search("h3/a").first
          next if link.nil? || link['data-pos'].nil?
          position += 1
          cite = li.search("cite").first
          host = cite.nil? ? nil : host_of("http://#{cite.text}")
          @ranks[position.to_s] = {'href'=>link['href'],'text'=>link.text.strip,'host'=>host}
        end
        @ranks
      end

      # Promotions listed in the "djbox" list.
      #
      # @return [Array] see #ads_in
      def ads_top
        ads_in("//ul[@id='djbox']/li")
      end

      # @return [Array] always empty
      def ads_bottom
        []
      end

      # Promotions listed in the "rightbox" list.
      #
      # @return [Array] see #ads_in
      def ads_right
        ads_in("//ul[@id='rightbox']/li")
      end

      # @return [Array] always empty
      def related_keywords
        []
      end

      # Fetches the next page.
      #
      # @return [Query::Result::Qihoo, false] false when the page has no
      #   "snext" link
      def next
        link = @page.xpath('//a[@id="snext"]')
        return false if link.empty?
        next_href = URI.join(@baseuri, link.first['href']).to_s
        # Parse the response itself. The former `.next` call was forwarded to
        # String#next on the body, so a mutated copy of the page was parsed.
        following = Query::Result::Qihoo.new(HTTParty.get(next_href))
        following.baseuri = next_href
        following.pagenumber = @pagenumber + 1
        following
      end

      # @return [Boolean] false when the main heading shows the
      #   "URL not found" message, true otherwise
      def has_result?
        !@page.search('//div[@id="main"]/h3').text().include?('没有找到该URL')
      end

      private

      # Parses the ad list items selected by +xpath+.
      #
      # @return [Array] ads stored at their 1-based position, so index 0 (and
      #   the slot of any item lacking a link or a <cite>) is nil
      def ads_in(xpath)
        result = []
        @page.search(xpath).each_with_index do |li, index|
          link = li.search("a").first
          cite = li.search("cite").first
          next if link.nil? || cite.nil?
          href = cite.text.downcase
          # NOTE(review): the cite text is parsed as-is; if it carries no
          # scheme the parsed host is presumably nil - confirm against markup.
          result[index + 1] = {'title'=>link.text,'href'=>href,'host'=>host_of(href)}
        end
        result
      end

      # Host part of +url+. URI::DEFAULT_PARSER.escape stands in for the
      # removed URI.encode.
      def host_of(url)
        Addressable::URI.parse(URI::DEFAULT_PARSER.escape(url)).host
      end
    end
  end
end
@@ -0,0 +1,6 @@
1
module Query
  module Result
    # Placeholder for Qihoo mobile result pages; no parsing is implemented
    # yet. It inherits from Base like the other result classes in this module.
    class QihooMobile < Base
    end
  end
end
@@ -0,0 +1,10 @@
1
+ module Query
2
+ module Result
3
+ end
4
+ end
5
+ require 'nokogiri'
6
+ require 'query/result/base'
7
+ require 'query/result/baidu'
8
+ require 'query/result/baidu_mobile'
9
+ require 'query/result/qihoo'
10
+ require 'query/result/qihoo_mobile'
@@ -0,0 +1,3 @@
1
module Query
  # Gem version, read by query.gemspec. Frozen so it cannot be mutated at
  # runtime.
  VERSION = "0.0.1".freeze
end
data/lib/query.rb ADDED
@@ -0,0 +1,9 @@
1
+ require "query/version"
2
+ require "query/engine"
3
+ require "query/result"
4
+ require "httparty"
5
+ require 'addressable/uri'
6
+ require 'awesome_print'
7
+ module Query
8
+ # Your code goes here...
9
+ end
data/query.gemspec ADDED
@@ -0,0 +1,27 @@
1
# coding: utf-8
# Gem specification for the `query` gem.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'query/version'

Gem::Specification.new do |spec|
  spec.name          = "query"
  spec.version       = Query::VERSION
  spec.authors       = ["seoaqua"]
  spec.email         = ["seoaqua@me.com"]
  spec.description   = %q{This GEM is designed to work for SEOers who need to fetch query and parse results from all kinds of search engines}
  spec.summary       = %q{Now its only support Chinese main search engines}
  spec.homepage      = "https://github.com/seoaqua/query"
  spec.license       = "MIT"

  # Package every file tracked by git; executables and test files are
  # selected from that list by path prefix.
  spec.files         = `git ls-files`.split($/)
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 1.3"
  spec.add_development_dependency "rake"
  # The files under spec/ are written for RSpec.
  spec.add_development_dependency "rspec"
  spec.add_dependency "nokogiri"
  spec.add_dependency "addressable"
  spec.add_dependency "httparty"
  # lib/query.rb requires awesome_print at load time, so it has to be
  # installed together with the gem.
  spec.add_dependency "awesome_print"
end
@@ -0,0 +1,19 @@
1
#coding:UTF-8
require 'spec_helper'
describe Query::Engine::BaiduMobile do
  # The live query runs once for the whole group inside a hook instead of at
  # file-load time.
  before(:all) do
    @page = Query::Engine::BaiduMobile.new.query '百度'
  end

  it "应返回#{Query::Result::BaiduMobile}" do
    @page.class.should == Query::Result::BaiduMobile
  end
  it "下一页也应是Query::Result::BaiduMobile" do
    @page.next.class.should == Query::Result::BaiduMobile
    @page.next.next.class.should == Query::Result::BaiduMobile
  end
  it "百度百科域名应该大于1" do
    @page.rank('wapbaike.baidu.com').should > 1
  end
  it "百度无线域名应该在10以内" do
    @page.rank('m.baidu.com').should < 11
  end
end
@@ -0,0 +1,73 @@
1
#coding:UTF-8
require 'spec_helper'
describe Query::Engine::Baidu do
  # The live queries run once for the whole group inside a hook instead of at
  # file-load time.
  before(:all) do
    @baidu = Query::Engine::Baidu.new
    @page = @baidu.query '百度'
    @page1 = @baidu.query('seoaqua.com')
  end

  it "should return Query::Result::Baidu" do
    @page.class.should == Query::Result::Baidu
  end

  it "should report more than 100,000 results" do
    @page.count.should > 100000
  end
  it "should return 1" do
    @page.rank('www.baidu.com').should == 1
  end

  it "should return Query::Result::Baidu for the next page" do
    @page.next.class.should == Query::Result::Baidu
  end

  it "should return true" do
    bool = Query::Engine::Baidu.popular?'百度'
    bool.should == true
  end

  it "should return false" do
    bool = Query::Engine::Baidu.popular?'lavataliuming'
    bool.should == false
  end

  it "should return over 5 words beginning with the query_word" do
    query_word = '为'
    suggestions = Query::Engine::Baidu.suggestions(query_word)
    suggestions.size.should > 5
    suggestions.each do |suggestion|
      suggestion[0].should == query_word
    end
  end

  it "pages should return 100,000,000" do
    result = @baidu.pages('baidu.com')
    result.class.should == Query::Result::Baidu
    result.count.should == 100000000
  end

  it "links should return 100,000,000" do
    result = @baidu.links('baidu.com')
    result.class.should == Query::Result::Baidu
    result.count.should == 100000000
  end
  it "pages_with should return 100,000,000" do
    result = @baidu.pages_with('baidu.com','baidu.com')
    result.class.should == Query::Result::Baidu
    result.count.should == 100000000
  end
  it "查询已经被收录的页面收录情况时,应返回true" do
    @baidu.indexed?('http://www.baidu.com').should == true
  end
  it "查询一个不存在的页面收录情况时,应返回false" do
    @baidu.indexed?('http://zxv.not-exists.com').should == false
  end
  it "查询结果应该都能拿到title,href,host" do
    @page1.ranks.each do |id,rank|
      rank['href'].should_not == nil
      rank['text'].should_not == nil
      rank['host'].should_not == nil
    end
  end
end
@@ -0,0 +1,27 @@
1
#coding:UTF-8
require 'spec_helper'
describe Query::Engine::Qihoo do
  # The live queries run once for the whole group inside a hook instead of at
  # file-load time.
  before(:all) do
    @qihoo = Query::Engine::Qihoo.new
    @page = @qihoo.query '奇虎'
    @page2 = @page.next
    @page3 = @page2.next
  end

  it "查询关键词'奇虎'后,应返回正确的实例" do
    @page.class.should == Query::Result::Qihoo
  end
  it "查询关键词'奇虎'后,下一页也应是Query::Result::Qihoo的实例" do
    @page2.class.should == Query::Result::Qihoo
  end
  it "查询关键词'奇虎'后,下一页,再下一页也应是Query::Result::Qihoo的实例" do
    @page3.class.should == Query::Result::Qihoo
  end

  it "查询关键词'奇虎'后,奇虎首页域名应该等于1" do
    @page.rank('www.qihoo.com').should == 1
  end
  it "查询已经被收录的页面收录情况时,应返回true" do
    @qihoo.indexed?('http://www.360.cn').should == true
  end
  it "查询一个不存在的页面收录情况时,应返回false" do
    @qihoo.indexed?('http://zxv.not-exists.com').should == false
  end
end
@@ -0,0 +1 @@
1
+ require 'query'
metadata ADDED
@@ -0,0 +1,144 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: query
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - seoaqua
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-10-03 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: addressable
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: httparty
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: This GEM is designed to work for SEOers who need to fetch query and parse
84
+ results from all kinds of search engines
85
+ email:
86
+ - seoaqua@me.com
87
+ executables: []
88
+ extensions: []
89
+ extra_rdoc_files: []
90
+ files:
91
+ - .gitignore
92
+ - Gemfile
93
+ - LICENSE
94
+ - LICENSE.txt
95
+ - README.md
96
+ - Rakefile
97
+ - lib/query.rb
98
+ - lib/query/engine.rb
99
+ - lib/query/engine/baidu.rb
100
+ - lib/query/engine/baidu_mobile.rb
101
+ - lib/query/engine/base.rb
102
+ - lib/query/engine/qihoo.rb
103
+ - lib/query/engine/qihoo_mobile.rb
104
+ - lib/query/result.rb
105
+ - lib/query/result/baidu.rb
106
+ - lib/query/result/baidu_mobile.rb
107
+ - lib/query/result/base.rb
108
+ - lib/query/result/qihoo.rb
109
+ - lib/query/result/qihoo_mobile.rb
110
+ - lib/query/version.rb
111
+ - query.gemspec
112
+ - spec/baidu_mobile_spec.rb
113
+ - spec/baidu_spec.rb
114
+ - spec/qihoo_spec.rb
115
+ - spec/spec_helper.rb
116
+ homepage: https://github.com/seoaqua/query
117
+ licenses:
118
+ - MIT
119
+ metadata: {}
120
+ post_install_message:
121
+ rdoc_options: []
122
+ require_paths:
123
+ - lib
124
+ required_ruby_version: !ruby/object:Gem::Requirement
125
+ requirements:
126
+ - - '>='
127
+ - !ruby/object:Gem::Version
128
+ version: '0'
129
+ required_rubygems_version: !ruby/object:Gem::Requirement
130
+ requirements:
131
+ - - '>='
132
+ - !ruby/object:Gem::Version
133
+ version: '0'
134
+ requirements: []
135
+ rubyforge_project:
136
+ rubygems_version: 2.1.5
137
+ signing_key:
138
+ specification_version: 4
139
+ summary: Now its only support Chinese main search engines
140
+ test_files:
141
+ - spec/baidu_mobile_spec.rb
142
+ - spec/baidu_spec.rb
143
+ - spec/qihoo_spec.rb
144
+ - spec/spec_helper.rb