query 0.0.1 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43):
  1. checksums.yaml +4 -4
  2. data/.gitignore +4 -1
  3. data/Gemfile +3 -1
  4. data/README.md +6 -1
  5. data/lib/query/engine/baidu.rb +12 -8
  6. data/lib/query/engine/baidu_mobile.rb +4 -4
  7. data/lib/query/engine/{qihoo.rb → qihu.rb} +8 -3
  8. data/lib/query/engine/{qihoo_mobile.rb → qihu_mobile.rb} +0 -0
  9. data/lib/query/engine/sogou.rb +45 -0
  10. data/lib/query/engine/sogou_mobile.rb +21 -0
  11. data/lib/query/engine.rb +11 -4
  12. data/lib/query/result/baidu.rb +57 -91
  13. data/lib/query/result/baidu_mobile.rb +49 -93
  14. data/lib/query/result/qihu.rb +66 -0
  15. data/lib/query/result/{qihoo_mobile.rb → qihu_mobile.rb} +1 -1
  16. data/lib/query/result/sogou.rb +103 -0
  17. data/lib/query/result/sogou_mobile.rb +51 -0
  18. data/lib/query/result.rb +47 -4
  19. data/lib/query/version.rb +1 -1
  20. data/lib/query.rb +6 -8
  21. data/query.gemspec +2 -3
  22. data/spec/baidu1_spec.rb +157 -0
  23. data/spec/baidu2_spec.rb +156 -0
  24. data/spec/mbaidu1_spec.rb +167 -0
  25. data/spec/msogou_spec.rb +91 -0
  26. data/spec/qihu_spec.rb +87 -0
  27. data/spec/samples/baidu1.html +521 -0
  28. data/spec/samples/baidu2.html +662 -0
  29. data/spec/samples/mbaidu1.html +2 -0
  30. data/spec/samples/mbaidu2.html +2 -0
  31. data/spec/samples/msogou.html +474 -0
  32. data/spec/samples/qihu.html +506 -0
  33. data/spec/samples/sogou.html +629 -0
  34. data/spec/sogou_mobile_spec.rb +86 -0
  35. data/spec/sogou_spec.rb +107 -0
  36. data/spec/spec_helper.rb +12 -1
  37. metadata +56 -31
  38. data/lib/query/engine/base.rb +0 -16
  39. data/lib/query/result/base.rb +0 -50
  40. data/lib/query/result/qihoo.rb +0 -75
  41. data/spec/baidu_mobile_spec.rb +0 -19
  42. data/spec/baidu_spec.rb +0 -73
  43. data/spec/qihoo_spec.rb +0 -27
@@ -0,0 +1,103 @@
1
+ module Query
2
+ module Result
3
+ class Sogou
4
+ include Query::Result
5
+ def ads_top
6
+ return [] if sponsored_divs.empty?
7
+ sponsored_divs.first.search("li").map.with_index do|li,index|
8
+ {
9
+ :rank => index + 1,
10
+ :text => li.css('h3 a').text,
11
+ :href => li.css('h3 a')[0]['href'],
12
+ :host => Addressable::URI.parse(li.css('cite')[0].text).host
13
+ }
14
+ end
15
+ end
16
+
17
+ def ads_right
18
+ @page.css('div#right div#bdfs0').map.with_index do |div,index|
19
+ {
20
+ :rank => index + 1,
21
+ :text => div.css('h3 a').text,
22
+ :href => div.css('h3 a')[0]['href'],
23
+ :host => Addressable::URI.parse(div.css('div.fb a cite').text).host
24
+ }
25
+ end
26
+ end
27
+
28
+ def ads_bottom
29
+ return [] if sponsored_divs.size < 2
30
+ end
31
+
32
+ def seo_ranks
33
+ # @seo_ranks ||= @page.search("div[@class='result']/div/h3").map do |h3|
34
+ @page.search("//div[@class='results']/div/h3").map.with_index do |h3,index|
35
+ {
36
+ :text => h3.search('a').first.text,
37
+ :href => h3.search('a').first['href'],
38
+ :host => Addressable::URI.parse(h3.search('a').first['href']).host,
39
+ :rank => index + 1
40
+ }
41
+ end
42
+ end
43
+
44
+ def count
45
+ ["//div[@class='zhanzhang']//em", "//span[@id='scd_num']"].each do |xpath|
46
+ if counter_block = @page.search(xpath).first
47
+ return counter_block.text.gsub(/\D/,'').to_i
48
+ end
49
+ end
50
+ end
51
+
52
+ def related_keywords
53
+ @related_keywords ||= @page.search("table[@id='hint_container']/td").map{|td|td.first.text}
54
+ end
55
+
56
+ def next_url
57
+ @page.search("//a[text()='下一页>']").first['href']
58
+ end
59
+
60
+ def has_result?
61
+ @page.search("div[@class='no-result']").empty?
62
+ end
63
+
64
+ # def rank(host)
65
+ # raise "unknown host object type:#{host}" unless host.class == Regexp or host.class == String
66
+
67
+ # result = {}
68
+
69
+ # #顶部广告排名
70
+ # ranking_ads_top = 0
71
+ # ads_top.each do |line|
72
+ # ranking_ads_top += 1
73
+ # if host.class == Regexp and line[:host] =~ host
74
+ # result[:rank_top] = ranking_ads_top
75
+ # break
76
+ # elsif host.class == String and line[:host] == host
77
+ # result[:rank_top] = ranking_ads_top
78
+ # break
79
+ # end
80
+ # end
81
+
82
+ # #右侧广告排名
83
+ # ranking_ads_right = 0
84
+ # ads_right.each do |line|
85
+ # ranking_ads_right += 1
86
+ # if host.class == Regexp and line[:host] =~ host
87
+ # result[:rank_right] = ranking_ads_right
88
+ # break
89
+ # elsif host.class == String and line[:host] == host
90
+ # result[:rank_right] = ranking_ads_right
91
+ # break
92
+ # end
93
+ # end
94
+
95
+ # result
96
+ # end
97
+ private
98
+ def sponsored_divs
99
+ @page.search("div[@class='sponsored']")
100
+ end
101
+ end
102
+ end
103
+ end
@@ -0,0 +1,51 @@
1
+ require 'cgi'
2
+ module Query
3
+ module Result
4
+ class SogouMobile
5
+ include Query::Result
6
+ def ads_top
7
+ @page.search("//ul[@class='searchresult']/li[1]/preceding-sibling::div").map.with_index do |ad_div,index|
8
+ parse_ad(ad_div).merge({:rank => index + 1})
9
+ end
10
+ end
11
+
12
+ def ads_right
13
+ []
14
+ end
15
+
16
+ def ads_bottom
17
+ @page.search("//ul[@class='searchresult']/li[last()]/following-sibling::div").map.with_index do |div,index|
18
+ parse_ad(div).merge({:rank => index + 1})
19
+ end
20
+ end
21
+
22
+ def seo_ranks
23
+ @seo_rank ||= @page.search("//ul[@class='searchresult']/li/a").map.with_index do |a,index|
24
+ href = URI.decode(CGI.parse(URI(URI.encode(a['href'])).query)['url'].first)
25
+ {
26
+ :rank => index + 1,
27
+ :text => a.search('h3').text,
28
+ :href => href,
29
+ :host => URI(href).host
30
+ }
31
+ end
32
+ end
33
+
34
+ def next_url
35
+ @page.search("//a[text()='下一页']").first['href']
36
+ end
37
+
38
+ def count
39
+ end
40
+
41
+ private
42
+ def parse_ad(ad_div)
43
+ {
44
+ :text => ad_div.search('h3').first.text,
45
+ :href => ad_div.search('a').first['href'],
46
+ :host => Addressable::URI.parse("http://#{ad_div.search('span[@class="site"]').text}").host
47
+ }
48
+ end
49
+ end
50
+ end
51
+ end
data/lib/query/result.rb CHANGED
@@ -1,10 +1,53 @@
1
1
  module Query
2
- module Result
2
+ module Result
3
+ attr_accessor :baseuri,:pagenumber,:perpage
4
+ def initialize(page)
5
+ @page = Nokogiri::HTML page
6
+ @pagenumber = 1
3
7
  end
8
+ def raw_ranks
9
+ {
10
+ 'ads_top'=>ads_top,
11
+ 'ads_right'=>ads_right,
12
+ 'ads_bottom'=>ads_bottom,
13
+ 'seo_ranks'=>seo_ranks
14
+ }
15
+ end
16
+
17
+ def rank(host)#on base of ranks
18
+ @rank ||= %w(seo_ranks ads_top ads_right ads_bottom).map do |type_str|
19
+ result = nil
20
+ send(type_str).each_with_index do |line,index|
21
+ if host.class == Regexp
22
+ result = index + 1 and break if line[:host] =~ host
23
+ elsif host.class == String
24
+ result = index + 1 and break if line[:host] == host
25
+ else
26
+ result = false
27
+ end
28
+ end
29
+ result
30
+ end
31
+ end
32
+
33
+ def next
34
+ @next_url = URI.join(@baseuri,next_url).to_s
35
+ next_page = HTTParty.get @next_url
36
+ next_page = self.class.new(next_page)
37
+ next_page.baseuri = @next_url
38
+ next_page.pagenumber = @pagenumber + 1
39
+ next_page.perpage = @perpage
40
+ r = next_page
41
+ r.baseuri = next_url
42
+ r
43
+ end
44
+ end
4
45
  end
5
46
  require 'nokogiri'
6
- require 'query/result/base'
47
+ require "addressable/uri"
7
48
  require 'query/result/baidu'
8
49
  require 'query/result/baidu_mobile'
9
- require 'query/result/qihoo'
10
- require 'query/result/qihoo_mobile'
50
+ require 'query/result/qihu'
51
+ require 'query/result/qihu_mobile'
52
+ require 'query/result/sogou'
53
+ require 'query/result/sogou_mobile'
data/lib/query/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Query
2
- VERSION = "0.0.1"
2
+ VERSION = "0.1.4"
3
3
  end
data/lib/query.rb CHANGED
@@ -1,9 +1,7 @@
1
- require "query/version"
2
- require "query/engine"
3
- require "query/result"
4
- require "httparty"
5
- require 'addressable/uri'
6
- require 'awesome_print'
7
- module Query
8
- # Your code goes here...
1
+ class MyFilter
2
+ def contains set, str
3
+ set.any? { |x| x.to_s.downcase == str.downcase}
4
+ end
9
5
  end
6
+ require 'query/result'
7
+ require 'query/engine'
data/query.gemspec CHANGED
@@ -8,8 +8,8 @@ Gem::Specification.new do |spec|
8
8
  spec.version = Query::VERSION
9
9
  spec.authors = ["seoaqua"]
10
10
  spec.email = ["seoaqua@me.com"]
11
- spec.description = %q{This GEM is designed to work for SEOers who need to fetch query and parse results from all kinds of search engines}
12
- spec.summary = %q{Now its only support Chinese main search engines}
11
+ spec.description = %q{This GEM is designed to work for Chinese SEOers who need to fetch query and parse results from all kinds of search engines}
12
+ spec.summary = %q{I dont have time to write the document yet. Usage is almost within rspec tests. Any questions,pls contact me with QQ628552}
13
13
  spec.homepage = "https://github.com/seoaqua/query"
14
14
  spec.license = "MIT"
15
15
 
@@ -23,5 +23,4 @@ Gem::Specification.new do |spec|
23
23
  spec.add_dependency "nokogiri"
24
24
  spec.add_dependency "addressable"
25
25
  spec.add_dependency "httparty"
26
-
27
26
  end
@@ -0,0 +1,157 @@
1
+ #coding:UTF-8
2
+ require 'spec_helper'
3
+ describe Query::Result::Baidu do
4
+ subject{Query::Result::Baidu.new(File.read($sample_baidu1))}
5
+
6
+ it "can click the next page button" do
7
+ subject.next_url.should == '/s?wd=%E5%90%8C%E7%A8%8B%E7%BD%91%E9%85%92%E5%BA%97%E9%A2%84%E8%AE%A2&pn=10&tn=baiduhome_pg&ie=utf-8&f=3&usm=2&rsv_page=1'
8
+ end
9
+
10
+ it "have 69200000 results" do
11
+ subject.count.should == 69200000
12
+ end
13
+
14
+
15
+ describe '#seo_ranks' do
16
+ it "puts www.17u.cn to be on first" do
17
+ subject.seo_ranks.first[:host].should == 'www.17u.cn'
18
+ end
19
+
20
+ it "should put 同程旅游网客服电话 to be the first title" do
21
+ subject.seo_ranks.first[:text].should == '同程旅游网客服电话'
22
+ end
23
+
24
+ it "should put 'http://www.17u.cn/' to be the second url" do
25
+ subject.seo_ranks[1][:href].should == 'http://www.17u.cn/'
26
+ end
27
+
28
+ it "should have href,text,host elements for each seo result" do
29
+ subject.seo_ranks.each do |seo_rank|
30
+ seo_rank[:href].should_not == nil
31
+ seo_rank[:text].should_not == nil
32
+ seo_rank[:host].should_not == nil
33
+ end
34
+ end
35
+ end
36
+
37
+ describe '#ads_top' do
38
+ it "should have 6 top ads" do
39
+ subject.ads_top.size.should == 3
40
+ end
41
+
42
+ it "should find hotel.elong.com at the first position in the top ads" do
43
+ subject.ads_top[0][:host].should == 'www.17u.cn'
44
+ end
45
+
46
+ it "has an array of hashes with the required keys as the result of ads_top" do
47
+ subject.ads_top.class.should == Array
48
+ subject.ads_top.each do |ad_top|
49
+ ad_top.should have_key(:rank)
50
+ ad_top.should have_key(:host)
51
+ ad_top.should have_key(:href)
52
+ ad_top.should have_key(:text)
53
+ end
54
+ end
55
+ end
56
+
57
+ describe '#ads_right' do
58
+ it "should have 5 right ads" do
59
+ subject.ads_right.size.should == 5
60
+ end
61
+
62
+ it "has an array of hashes with the required keys as the result of ads_right" do
63
+ subject.ads_right.class.should == Array
64
+ subject.ads_right.each do |ad_right|
65
+ ad_right.should have_key(:rank)
66
+ ad_right.should have_key(:host)
67
+ ad_right.should have_key(:href)
68
+ ad_right.should have_key(:text)
69
+ end
70
+ end
71
+ end
72
+
73
+ describe '#ads_bottom' do
74
+ it "should have zero bottom ads" do
75
+ subject.ads_bottom.size.should == 3
76
+ end
77
+
78
+ it "has an array of hashes with the required keys as the result of ads_bottom" do
79
+ subject.ads_bottom.class.should == Array
80
+ subject.ads_bottom.each do |ad_bottom|
81
+ ad_bottom.should have_key(:rank)
82
+ ad_bottom.should have_key(:host)
83
+ ad_bottom.should have_key(:href)
84
+ ad_bottom.should have_key(:text)
85
+ end
86
+ end
87
+ end
88
+ end
89
+ __END__
90
+ describe Query::Engine::Baidu do
91
+ page = Query::Engine::Baidu.query '百度'
92
+
93
+ it "should return Query::Result::Baidu" do
94
+ page.class.should == Query::Result::Baidu
95
+ end
96
+
97
+ it "should return 100,000,000" do
98
+ page.count.should > 100000
99
+ end
100
+ it "should return 1" do
101
+ page.rank('www.baidu.com').should == 1
102
+ end
103
+
104
+ it "should return Query::Result::Baidu" do
105
+ page.next.class.should == Query::Result::Baidu
106
+ end
107
+
108
+ it "should return true" do
109
+ bool = Query::Engine::Baidu.popular?'百度'
110
+ bool.should == true
111
+ end
112
+
113
+ it "should return false" do
114
+ bool = Query::Engine::Baidu.popular?'lavataliuming'
115
+ bool.should == false
116
+ end
117
+
118
+ it "should return over 5 words beginning with the query_word" do
119
+ query_word = '为'
120
+ suggestions = Query::Engine::Baidu.suggestions(query_word)
121
+ suggestions.size.should > 5
122
+ suggestions.each do |suggestion|
123
+ suggestion[0].should == query_word
124
+ end
125
+ end
126
+
127
+ it "should return 100,000,000" do
128
+ result = baidu.pages('baidu.com')
129
+ result.class.should == Query::Result::Baidu
130
+ result.count.should == 100000000
131
+ end
132
+
133
+ it "should return 100,000,000" do
134
+ result = baidu.links('baidu.com')
135
+ result.class.should == Query::Result::Baidu
136
+ result.count.should == 100000000
137
+ end
138
+ it "should return 100,000,000" do
139
+ result = baidu.pages_with('baidu.com','baidu.com')
140
+ result.class.should == Query::Result::Baidu
141
+ result.count.should == 100000000
142
+ end
143
+ it "查询已经被收录的页面收录情况时,应返回true" do
144
+ baidu.indexed?('http://www.baidu.com').should == true
145
+ end
146
+ it "查询一个不存在的页面收录情况时,应返回true" do
147
+ baidu.indexed?('http://zxv.not-exists.com').should == false
148
+ end
149
+ page1 = Query::Engine::Baidu.query('seoaqua.com')
150
+ it "查询结果应该都能拿到title,href,host" do
151
+ page1.seo_ranks.each do |id,rank|
152
+ rank['href'].should_not == nil
153
+ rank['text'].should_not == nil
154
+ rank['host'].should_not == nil
155
+ end
156
+ end
157
+ end
@@ -0,0 +1,156 @@
1
+ #coding:UTF-8
2
+ require 'spec_helper'
3
+ describe Query::Result::Baidu do
4
+ subject{Query::Result::Baidu.new(File.read($sample_baidu2))}
5
+
6
+ it "can click the next page button" do
7
+ subject.next_url.should == '/s?wd=%E9%85%92%E5%BA%97%E9%A2%84%E8%AE%A2%E7%BD%91&pn=10&tn=baiduhome_pg&ie=utf-8&f=3&usm=1&rsv_page=1'
8
+ end
9
+
10
+ it "have 100000000 results" do
11
+ subject.count.should == 100000000
12
+ end
13
+
14
+ describe '#seo_ranks' do
15
+ it "should put hotel.qunar.com to be on first" do
16
+ subject.seo_ranks.first[:host].should == 'hotel.qunar.com'
17
+ end
18
+
19
+ it "should put 北京酒店预订_8371家特惠酒店_百度品质保证_去哪儿网提供 to be the first title" do
20
+ subject.seo_ranks.first[:text].should == '北京酒店预订_8371家特惠酒店_百度品质保证_去哪儿网提供'
21
+ end
22
+
23
+ it "should put 'http://jiudian.qunar.com/' to be the second url" do
24
+ subject.seo_ranks[1][:href].should == 'http://www.baidu.com/link?url=ZpGwUrZ8xUUgBQofg1TiNH1n_Ki3QWE62jvjkGvwwZ70wQPxFJxSD1uunh0uDwLM'
25
+ end
26
+
27
+ it "should have href,text,host elements for each seo result" do
28
+ subject.seo_ranks.each do |seo_rank|
29
+ seo_rank[:href].should_not == nil
30
+ seo_rank[:text].should_not == nil
31
+ seo_rank[:host].should_not == nil
32
+ end
33
+ end
34
+ end
35
+
36
+ describe '#ads_top' do
37
+ it "should have 6 top ads" do
38
+ subject.ads_top.size.should == 4
39
+ end
40
+
41
+ it "should find hotel.elong.com at the first position in the top ads" do
42
+ subject.ads_top[0][:host].should == 'www.agoda.com'
43
+ end
44
+
45
+ it "has an array of hashes with the required keys as the result of ads_top" do
46
+ subject.ads_top.class.should == Array
47
+ subject.ads_top.each do |ad_top|
48
+ ad_top.should have_key(:rank)
49
+ ad_top.should have_key(:host)
50
+ ad_top.should have_key(:href)
51
+ ad_top.should have_key(:text)
52
+ end
53
+ end
54
+ end
55
+
56
+ describe '#ads_right' do
57
+ it "should have 8 right ads" do
58
+ subject.ads_right.size.should == 8
59
+ end
60
+
61
+ it "has an array of hashes with the required keys as the result of ads_right" do
62
+ subject.ads_right.class.should == Array
63
+ subject.ads_right.each do |ad_right|
64
+ ad_right.should have_key(:rank)
65
+ ad_right.should have_key(:host)
66
+ ad_right.should have_key(:href)
67
+ ad_right.should have_key(:text)
68
+ end
69
+ end
70
+ end
71
+
72
+ describe '#ads_bottom' do
73
+ it "should have zero bottom ads" do
74
+ subject.ads_bottom.size.should == 0
75
+ end
76
+
77
+ it "has an array of hashes with the required keys as the result of ads_bottom" do
78
+ subject.ads_bottom.class.should == Array
79
+ subject.ads_bottom.each do |ad_bottom|
80
+ ad_bottom.should have_key(:rank)
81
+ ad_bottom.should have_key(:host)
82
+ ad_bottom.should have_key(:href)
83
+ ad_bottom.should have_key(:text)
84
+ end
85
+ end
86
+ end
87
+ end
88
+ __END__
89
+ describe Query::Engine::Baidu do
90
+ page = Query::Engine::Baidu.query '百度'
91
+
92
+ it "should return Query::Result::Baidu" do
93
+ page.class.should == Query::Result::Baidu
94
+ end
95
+
96
+ it "should return 100,000,000" do
97
+ page.count.should > 100000
98
+ end
99
+ it "should return 1" do
100
+ page.rank('www.baidu.com').should == 1
101
+ end
102
+
103
+ it "should return Query::Result::Baidu" do
104
+ page.next.class.should == Query::Result::Baidu
105
+ end
106
+
107
+ it "should return true" do
108
+ bool = Query::Engine::Baidu.popular?'百度'
109
+ bool.should == true
110
+ end
111
+
112
+ it "should return false" do
113
+ bool = Query::Engine::Baidu.popular?'lavataliuming'
114
+ bool.should == false
115
+ end
116
+
117
+ it "should return over 5 words beginning with the query_word" do
118
+ query_word = '为'
119
+ suggestions = Query::Engine::Baidu.suggestions(query_word)
120
+ suggestions.size.should > 5
121
+ suggestions.each do |suggestion|
122
+ suggestion[0].should == query_word
123
+ end
124
+ end
125
+
126
+ it "should return 100,000,000" do
127
+ result = baidu.pages('baidu.com')
128
+ result.class.should == Query::Result::Baidu
129
+ result.count.should == 100000000
130
+ end
131
+
132
+ it "should return 100,000,000" do
133
+ result = baidu.links('baidu.com')
134
+ result.class.should == Query::Result::Baidu
135
+ result.count.should == 100000000
136
+ end
137
+ it "should return 100,000,000" do
138
+ result = baidu.pages_with('baidu.com','baidu.com')
139
+ result.class.should == Query::Result::Baidu
140
+ result.count.should == 100000000
141
+ end
142
+ it "查询已经被收录的页面收录情况时,应返回true" do
143
+ baidu.indexed?('http://www.baidu.com').should == true
144
+ end
145
+ it "查询一个不存在的页面收录情况时,应返回true" do
146
+ baidu.indexed?('http://zxv.not-exists.com').should == false
147
+ end
148
+ page1 = Query::Engine::Baidu.query('seoaqua.com')
149
+ it "查询结果应该都能拿到title,href,host" do
150
+ page1.seo_ranks.each do |id,rank|
151
+ rank['href'].should_not == nil
152
+ rank['text'].should_not == nil
153
+ rank['host'].should_not == nil
154
+ end
155
+ end
156
+ end