query 0.0.1 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +4 -1
  3. data/Gemfile +3 -1
  4. data/README.md +6 -1
  5. data/lib/query/engine/baidu.rb +12 -8
  6. data/lib/query/engine/baidu_mobile.rb +4 -4
  7. data/lib/query/engine/{qihoo.rb → qihu.rb} +8 -3
  8. data/lib/query/engine/{qihoo_mobile.rb → qihu_mobile.rb} +0 -0
  9. data/lib/query/engine/sogou.rb +45 -0
  10. data/lib/query/engine/sogou_mobile.rb +21 -0
  11. data/lib/query/engine.rb +11 -4
  12. data/lib/query/result/baidu.rb +57 -91
  13. data/lib/query/result/baidu_mobile.rb +49 -93
  14. data/lib/query/result/qihu.rb +66 -0
  15. data/lib/query/result/{qihoo_mobile.rb → qihu_mobile.rb} +1 -1
  16. data/lib/query/result/sogou.rb +103 -0
  17. data/lib/query/result/sogou_mobile.rb +51 -0
  18. data/lib/query/result.rb +47 -4
  19. data/lib/query/version.rb +1 -1
  20. data/lib/query.rb +6 -8
  21. data/query.gemspec +2 -3
  22. data/spec/baidu1_spec.rb +157 -0
  23. data/spec/baidu2_spec.rb +156 -0
  24. data/spec/mbaidu1_spec.rb +167 -0
  25. data/spec/msogou_spec.rb +91 -0
  26. data/spec/qihu_spec.rb +87 -0
  27. data/spec/samples/baidu1.html +521 -0
  28. data/spec/samples/baidu2.html +662 -0
  29. data/spec/samples/mbaidu1.html +2 -0
  30. data/spec/samples/mbaidu2.html +2 -0
  31. data/spec/samples/msogou.html +474 -0
  32. data/spec/samples/qihu.html +506 -0
  33. data/spec/samples/sogou.html +629 -0
  34. data/spec/sogou_mobile_spec.rb +86 -0
  35. data/spec/sogou_spec.rb +107 -0
  36. data/spec/spec_helper.rb +12 -1
  37. metadata +56 -31
  38. data/lib/query/engine/base.rb +0 -16
  39. data/lib/query/result/base.rb +0 -50
  40. data/lib/query/result/qihoo.rb +0 -75
  41. data/spec/baidu_mobile_spec.rb +0 -19
  42. data/spec/baidu_spec.rb +0 -73
  43. data/spec/qihoo_spec.rb +0 -27
@@ -0,0 +1,103 @@
# encoding: utf-8
module Query
  module Result
    # Parser for a desktop Sogou search result page.
    #
    # Reads the parsed document from @page. Every ranking method returns an
    # Array of Hashes with the keys :rank, :text, :href and :host, which is
    # the shape Query::Result#rank and Query::Result#raw_ranks iterate over.
    # Matching a host against these lists is done by Query::Result#rank.
    class Sogou
      include Query::Result

      # Locations where the total-result counter may appear, tried in order.
      COUNT_XPATHS = ["//div[@class='zhanzhang']//em", "//span[@id='scd_num']"].freeze

      # Ads listed in the first "sponsored" block of the page.
      #
      # @return [Array<Hash>] empty when the page has no sponsored block
      def ads_top
        return [] if sponsored_divs.empty?
        parse_sponsored(sponsored_divs.first)
      end

      # Ads found under the right-hand column.
      #
      # @return [Array<Hash>]
      def ads_right
        @page.css('div#right div#bdfs0').map.with_index do |div, index|
          links = div.css('h3 a')
          {
            :rank => index + 1,
            :text => links.text,
            :href => href_of(links),
            :host => Addressable::URI.parse(div.css('div.fb a cite').text).host
          }
        end
      end

      # Ads listed in the last "sponsored" block when the page has at least
      # two of them. Previously this fell through and returned nil, which
      # made Query::Result#rank fail on nil.
      # NOTE(review): assumes the bottom block uses the same li/h3/cite markup
      # as the top block — confirm against a sample page.
      #
      # @return [Array<Hash>] always an Array, possibly empty
      def ads_bottom
        return [] if sponsored_divs.size < 2
        parse_sponsored(sponsored_divs.last)
      end

      # Organic results, in page order.
      #
      # @return [Array<Hash>]
      def seo_ranks
        @page.search("//div[@class='results']/div/h3").map.with_index do |h3, index|
          link = h3.search('a').first
          href = link && link['href']
          {
            :text => link ? link.text : h3.text,
            :href => href,
            :host => href ? Addressable::URI.parse(href).host : nil,
            :rank => index + 1
          }
        end
      end

      # Total number of results reported by the page.
      #
      # @return [Integer, nil] nil when no counter element is present
      #   (the loop used to leak its receiver Array in that case)
      def count
        COUNT_XPATHS.each do |xpath|
          counter_block = @page.search(xpath).first
          return counter_block.text.gsub(/\D/, '').to_i if counter_block
        end
        nil
      end

      # Related search suggestions shown on the page.
      # Uses the cell's own text: Node#first enumerates attributes, so the
      # former td.first.text could never return the cell text.
      #
      # @return [Array<String>]
      def related_keywords
        @related_keywords ||= @page.search("table[@id='hint_container']/td").map { |td| td.text }
      end

      # Href of the "next page" link.
      #
      # @return [String, nil] nil when the page has no such link
      def next_url
        link = @page.search("//a[text()='下一页>']").first
        link && link['href']
      end

      # @return [Boolean] true unless the page contains a "no-result" block
      def has_result?
        @page.search("div[@class='no-result']").empty?
      end

      private

      # All "sponsored" containers on the page, in document order.
      def sponsored_divs
        @page.search("div[@class='sponsored']")
      end

      # Turns every <li> of one sponsored container into a ranked ad Hash.
      def parse_sponsored(container)
        container.search("li").map.with_index do |li, index|
          links = li.css('h3 a')
          cite = li.css('cite').first
          {
            :rank => index + 1,
            :text => links.text,
            :href => href_of(links),
            :host => cite ? Addressable::URI.parse(cite.text).host : nil
          }
        end
      end

      # Href of the first node in +links+, or nil when the set is empty.
      def href_of(links)
        link = links.first
        link && link['href']
      end
    end
  end
end
@@ -0,0 +1,51 @@
1
+ require 'cgi'
2
+ module Query
3
+ module Result
4
+ class SogouMobile
5
+ include Query::Result
6
+ def ads_top
7
+ @page.search("//ul[@class='searchresult']/li[1]/preceding-sibling::div").map.with_index do |ad_div,index|
8
+ parse_ad(ad_div).merge({:rank => index + 1})
9
+ end
10
+ end
11
+
12
+ def ads_right
13
+ []
14
+ end
15
+
16
+ def ads_bottom
17
+ @page.search("//ul[@class='searchresult']/li[last()]/following-sibling::div").map.with_index do |div,index|
18
+ parse_ad(div).merge({:rank => index + 1})
19
+ end
20
+ end
21
+
22
+ def seo_ranks
23
+ @seo_rank ||= @page.search("//ul[@class='searchresult']/li/a").map.with_index do |a,index|
24
+ href = URI.decode(CGI.parse(URI(URI.encode(a['href'])).query)['url'].first)
25
+ {
26
+ :rank => index + 1,
27
+ :text => a.search('h3').text,
28
+ :href => href,
29
+ :host => URI(href).host
30
+ }
31
+ end
32
+ end
33
+
34
+ def next_url
35
+ @page.search("//a[text()='下一页']").first['href']
36
+ end
37
+
38
+ def count
39
+ end
40
+
41
+ private
42
+ def parse_ad(ad_div)
43
+ {
44
+ :text => ad_div.search('h3').first.text,
45
+ :href => ad_div.search('a').first['href'],
46
+ :host => Addressable::URI.parse("http://#{ad_div.search('span[@class="site"]').text}").host
47
+ }
48
+ end
49
+ end
50
+ end
51
+ end
data/lib/query/result.rb CHANGED
@@ -1,10 +1,53 @@
module Query
  # Behaviour shared by the engine-specific result parsers.
  #
  # An including class must provide #ads_top, #ads_right, #ads_bottom and
  # #seo_ranks (each returning an Array of Hashes with a :host entry) and
  # #next_url (the href of the next-page link, or nil).
  module Result
    # Lists consulted by #rank, in the order of the returned positions.
    RANK_SOURCES = %w(seo_ranks ads_top ads_right ads_bottom).freeze

    attr_accessor :baseuri, :pagenumber, :perpage

    # @param page [String] HTML of the result page; parsed with Nokogiri::HTML
    def initialize(page)
      @page = Nokogiri::HTML page
      @pagenumber = 1
    end

    # All four result lists keyed by their method name.
    #
    # @return [Hash{String => Array<Hash>}]
    def raw_ranks
      {
        'ads_top' => ads_top,
        'ads_right' => ads_right,
        'ads_bottom' => ads_bottom,
        'seo_ranks' => seo_ranks
      }
    end

    # Positions of +host+ in the seo_ranks, ads_top, ads_right and ads_bottom
    # lists, in that order.
    #
    # Results are cached per host. The former single @rank memo returned the
    # first host's answer for every later call.
    #
    # @param host [String, Regexp] exact host name, or a pattern matched
    #   against each line's :host
    # @return [Array] four entries: the 1-based position of the first match,
    #   nil when there is none, or false when +host+ is neither a String nor
    #   a Regexp and the list is not empty
    def rank(host)
      @rank_cache ||= {}
      @rank_cache[host] ||= RANK_SOURCES.map { |source| position_in(send(source), host) }
    end

    # Fetches and parses the following result page.
    #
    # @return [Object, nil] a new instance of the receiver's class whose
    #   baseuri is the absolute URL that was fetched, or nil when #next_url
    #   yields no link
    def next
      relative = next_url
      return nil if relative.nil?

      @next_url = URI.join(@baseuri, relative).to_s
      following = self.class.new(HTTParty.get(@next_url))
      # Keep the absolute URL: it is the base the next URI.join resolves
      # against. It used to be overwritten with the relative next_url.
      following.baseuri = @next_url
      following.pagenumber = @pagenumber + 1
      following.perpage = @perpage
      following
    end

    private

    # 1-based position of the first line whose :host matches +host+.
    # A nil list is treated as empty.
    def position_in(lines, host)
      lines = Array(lines)
      case host
      when Regexp
        found = lines.index { |line| line[:host] =~ host }
        found && found + 1
      when String
        found = lines.index { |line| line[:host] == host }
        found && found + 1
      else
        # Kept from the original loop: false is only reported when there was
        # at least one line to look at.
        lines.empty? ? nil : false
      end
    end
  end
end
5
46
  require 'nokogiri'
6
- require 'query/result/base'
47
+ require "addressable/uri"
7
48
  require 'query/result/baidu'
8
49
  require 'query/result/baidu_mobile'
9
- require 'query/result/qihoo'
10
- require 'query/result/qihoo_mobile'
50
+ require 'query/result/qihu'
51
+ require 'query/result/qihu_mobile'
52
+ require 'query/result/sogou'
53
+ require 'query/result/sogou_mobile'
data/lib/query/version.rb CHANGED
@@ -1,3 +1,3 @@
module Query
  # Version of the gem; frozen so the shared String cannot be mutated.
  VERSION = "0.1.4".freeze
end
data/lib/query.rb CHANGED
@@ -1,9 +1,7 @@
1
- require "query/version"
2
- require "query/engine"
3
- require "query/result"
4
- require "httparty"
5
- require 'addressable/uri'
6
- require 'awesome_print'
7
- module Query
8
- # Your code goes here...
# Case-insensitive membership test over arbitrary collections.
class MyFilter
  # Tells whether +set+ holds an entry equal to +str+, ignoring case.
  # Both sides are compared through #to_s, so Symbols and Strings can be
  # mixed on either side.
  #
  # @param set [Enumerable] entries to look through
  # @param str [#to_s] value to look for
  # @return [Boolean]
  def contains(set, str)
    # Normalise the needle once instead of on every iteration.
    needle = str.to_s.downcase
    set.any? { |item| item.to_s.downcase == needle }
  end
end
6
+ require 'query/result'
7
+ require 'query/engine'
data/query.gemspec CHANGED
@@ -8,8 +8,8 @@ Gem::Specification.new do |spec|
8
8
  spec.version = Query::VERSION
9
9
  spec.authors = ["seoaqua"]
10
10
  spec.email = ["seoaqua@me.com"]
11
- spec.description = %q{This GEM is designed to work for SEOers who need to fetch query and parse results from all kinds of search engines}
12
- spec.summary = %q{Now its only support Chinese main search engines}
11
+ spec.description = %q{This GEM is designed to work for Chinese SEOers who need to fetch query and parse results from all kinds of search engines}
12
+ spec.summary = %q{I dont have time to write the document yet. Usage is almost within rspec tests. Any questions,pls contact me with QQ628552}
13
13
  spec.homepage = "https://github.com/seoaqua/query"
14
14
  spec.license = "MIT"
15
15
 
@@ -23,5 +23,4 @@ Gem::Specification.new do |spec|
23
23
  spec.add_dependency "nokogiri"
24
24
  spec.add_dependency "addressable"
25
25
  spec.add_dependency "httparty"
26
-
27
26
  end
@@ -0,0 +1,157 @@
#coding:UTF-8
require 'spec_helper'

# Examples for Query::Result::Baidu run against a saved result page.
# $sample_baidu1 is not defined in this file — presumably spec_helper sets it
# to the sample's path (TODO confirm).
describe Query::Result::Baidu do
  subject{Query::Result::Baidu.new(File.read($sample_baidu1))}

  it "can click the next page button" do
    subject.next_url.should == '/s?wd=%E5%90%8C%E7%A8%8B%E7%BD%91%E9%85%92%E5%BA%97%E9%A2%84%E8%AE%A2&pn=10&tn=baiduhome_pg&ie=utf-8&f=3&usm=2&rsv_page=1'
  end

  it "has 69200000 results" do
    subject.count.should == 69200000
  end

  describe '#seo_ranks' do
    it "puts www.17u.cn to be on first" do
      subject.seo_ranks.first[:host].should == 'www.17u.cn'
    end

    it "should put 同程旅游网客服电话 to be the first title" do
      subject.seo_ranks.first[:text].should == '同程旅游网客服电话'
    end

    it "should put 'http://www.17u.cn/' to be the second url" do
      subject.seo_ranks[1][:href].should == 'http://www.17u.cn/'
    end

    it "should have href,text,host elements for each seo result" do
      subject.seo_ranks.each do |seo_rank|
        [:href, :text, :host].each { |key| seo_rank[key].should_not be_nil }
      end
    end
  end

  describe '#ads_top' do
    # The descriptions below used to name a count and a host that the
    # assertions did not check; they now state what is asserted.
    it "should have 3 top ads" do
      subject.ads_top.size.should == 3
    end

    it "should find www.17u.cn at the first position in the top ads" do
      subject.ads_top[0][:host].should == 'www.17u.cn'
    end

    it "has an array of hashes with the required keys as the result of ads_top" do
      subject.ads_top.class.should == Array
      subject.ads_top.each do |ad_top|
        [:rank, :host, :href, :text].each { |key| ad_top.should have_key(key) }
      end
    end
  end

  describe '#ads_right' do
    it "should have 5 right ads" do
      subject.ads_right.size.should == 5
    end

    it "has an array of hashes with the required keys as the result of ads_right" do
      subject.ads_right.class.should == Array
      subject.ads_right.each do |ad_right|
        [:rank, :host, :href, :text].each { |key| ad_right.should have_key(key) }
      end
    end
  end

  describe '#ads_bottom' do
    it "should have 3 bottom ads" do
      subject.ads_bottom.size.should == 3
    end

    it "has an array of hashes with the required keys as the result of ads_bottom" do
      subject.ads_bottom.class.should == Array
      subject.ads_bottom.each do |ad_bottom|
        [:rank, :host, :href, :text].each { |key| ad_bottom.should have_key(key) }
      end
    end
  end
end
89
+ __END__
90
+ describe Query::Engine::Baidu do
91
+ page = Query::Engine::Baidu.query '百度'
92
+
93
+ it "should return Query::Result::Baidu" do
94
+ page.class.should == Query::Result::Baidu
95
+ end
96
+
97
+ it "should return 100,000,000" do
98
+ page.count.should > 100000
99
+ end
100
+ it "should return 1" do
101
+ page.rank('www.baidu.com').should == 1
102
+ end
103
+
104
+ it "should return Query::Result::Baidu" do
105
+ page.next.class.should == Query::Result::Baidu
106
+ end
107
+
108
+ it "should return true" do
109
+ bool = Query::Engine::Baidu.popular?'百度'
110
+ bool.should == true
111
+ end
112
+
113
+ it "should return false" do
114
+ bool = Query::Engine::Baidu.popular?'lavataliuming'
115
+ bool.should == false
116
+ end
117
+
118
+ it "should return over 5 words beginning with the query_word" do
119
+ query_word = '为'
120
+ suggestions = Query::Engine::Baidu.suggestions(query_word)
121
+ suggestions.size.should > 5
122
+ suggestions.each do |suggestion|
123
+ suggestion[0].should == query_word
124
+ end
125
+ end
126
+
127
+ it "should return 100,000,000" do
128
+ result = baidu.pages('baidu.com')
129
+ result.class.should == Query::Result::Baidu
130
+ result.count.should == 100000000
131
+ end
132
+
133
+ it "should return 100,000,000" do
134
+ result = baidu.links('baidu.com')
135
+ result.class.should == Query::Result::Baidu
136
+ result.count.should == 100000000
137
+ end
138
+ it "should return 100,000,000" do
139
+ result = baidu.pages_with('baidu.com','baidu.com')
140
+ result.class.should == Query::Result::Baidu
141
+ result.count.should == 100000000
142
+ end
143
+ it "查询已经被收录的页面收录情况时,应返回true" do
144
+ baidu.indexed?('http://www.baidu.com').should == true
145
+ end
146
+ it "查询一个不存在的页面收录情况时,应返回true" do
147
+ baidu.indexed?('http://zxv.not-exists.com').should == false
148
+ end
149
+ page1 = Query::Engine::Baidu.query('seoaqua.com')
150
+ it "查询结果应该都能拿到title,href,host" do
151
+ page1.seo_ranks.each do |id,rank|
152
+ rank['href'].should_not == nil
153
+ rank['text'].should_not == nil
154
+ rank['host'].should_not == nil
155
+ end
156
+ end
157
+ end
@@ -0,0 +1,156 @@
#coding:UTF-8
require 'spec_helper'

# Examples for Query::Result::Baidu run against a second saved result page.
# $sample_baidu2 is not defined in this file — presumably spec_helper sets it
# to the sample's path (TODO confirm).
describe Query::Result::Baidu do
  subject{Query::Result::Baidu.new(File.read($sample_baidu2))}

  it "can click the next page button" do
    subject.next_url.should == '/s?wd=%E9%85%92%E5%BA%97%E9%A2%84%E8%AE%A2%E7%BD%91&pn=10&tn=baiduhome_pg&ie=utf-8&f=3&usm=1&rsv_page=1'
  end

  it "has 100000000 results" do
    subject.count.should == 100000000
  end

  describe '#seo_ranks' do
    it "should put hotel.qunar.com to be on first" do
      subject.seo_ranks.first[:host].should == 'hotel.qunar.com'
    end

    it "should put 北京酒店预订_8371家特惠酒店_百度品质保证_去哪儿网提供 to be the first title" do
      subject.seo_ranks.first[:text].should == '北京酒店预订_8371家特惠酒店_百度品质保证_去哪儿网提供'
    end

    # The asserted href is Baidu's redirect link, not the destination URL
    # the description used to name.
    it "should put the Baidu redirect link to be the second url" do
      subject.seo_ranks[1][:href].should == 'http://www.baidu.com/link?url=ZpGwUrZ8xUUgBQofg1TiNH1n_Ki3QWE62jvjkGvwwZ70wQPxFJxSD1uunh0uDwLM'
    end

    it "should have href,text,host elements for each seo result" do
      subject.seo_ranks.each do |seo_rank|
        [:href, :text, :host].each { |key| seo_rank[key].should_not be_nil }
      end
    end
  end

  describe '#ads_top' do
    # The descriptions below used to name a count and a host that the
    # assertions did not check; they now state what is asserted.
    it "should have 4 top ads" do
      subject.ads_top.size.should == 4
    end

    it "should find www.agoda.com at the first position in the top ads" do
      subject.ads_top[0][:host].should == 'www.agoda.com'
    end

    it "has an array of hashes with the required keys as the result of ads_top" do
      subject.ads_top.class.should == Array
      subject.ads_top.each do |ad_top|
        [:rank, :host, :href, :text].each { |key| ad_top.should have_key(key) }
      end
    end
  end

  describe '#ads_right' do
    it "should have 8 right ads" do
      subject.ads_right.size.should == 8
    end

    it "has an array of hashes with the required keys as the result of ads_right" do
      subject.ads_right.class.should == Array
      subject.ads_right.each do |ad_right|
        [:rank, :host, :href, :text].each { |key| ad_right.should have_key(key) }
      end
    end
  end

  describe '#ads_bottom' do
    it "should have zero bottom ads" do
      subject.ads_bottom.size.should == 0
    end

    it "has an array of hashes with the required keys as the result of ads_bottom" do
      subject.ads_bottom.class.should == Array
      subject.ads_bottom.each do |ad_bottom|
        [:rank, :host, :href, :text].each { |key| ad_bottom.should have_key(key) }
      end
    end
  end
end
88
+ __END__
89
+ describe Query::Engine::Baidu do
90
+ page = Query::Engine::Baidu.query '百度'
91
+
92
+ it "should return Query::Result::Baidu" do
93
+ page.class.should == Query::Result::Baidu
94
+ end
95
+
96
+ it "should return 100,000,000" do
97
+ page.count.should > 100000
98
+ end
99
+ it "should return 1" do
100
+ page.rank('www.baidu.com').should == 1
101
+ end
102
+
103
+ it "should return Query::Result::Baidu" do
104
+ page.next.class.should == Query::Result::Baidu
105
+ end
106
+
107
+ it "should return true" do
108
+ bool = Query::Engine::Baidu.popular?'百度'
109
+ bool.should == true
110
+ end
111
+
112
+ it "should return false" do
113
+ bool = Query::Engine::Baidu.popular?'lavataliuming'
114
+ bool.should == false
115
+ end
116
+
117
+ it "should return over 5 words beginning with the query_word" do
118
+ query_word = '为'
119
+ suggestions = Query::Engine::Baidu.suggestions(query_word)
120
+ suggestions.size.should > 5
121
+ suggestions.each do |suggestion|
122
+ suggestion[0].should == query_word
123
+ end
124
+ end
125
+
126
+ it "should return 100,000,000" do
127
+ result = baidu.pages('baidu.com')
128
+ result.class.should == Query::Result::Baidu
129
+ result.count.should == 100000000
130
+ end
131
+
132
+ it "should return 100,000,000" do
133
+ result = baidu.links('baidu.com')
134
+ result.class.should == Query::Result::Baidu
135
+ result.count.should == 100000000
136
+ end
137
+ it "should return 100,000,000" do
138
+ result = baidu.pages_with('baidu.com','baidu.com')
139
+ result.class.should == Query::Result::Baidu
140
+ result.count.should == 100000000
141
+ end
142
+ it "查询已经被收录的页面收录情况时,应返回true" do
143
+ baidu.indexed?('http://www.baidu.com').should == true
144
+ end
145
+ it "查询一个不存在的页面收录情况时,应返回true" do
146
+ baidu.indexed?('http://zxv.not-exists.com').should == false
147
+ end
148
+ page1 = Query::Engine::Baidu.query('seoaqua.com')
149
+ it "查询结果应该都能拿到title,href,host" do
150
+ page1.seo_ranks.each do |id,rank|
151
+ rank['href'].should_not == nil
152
+ rank['text'].should_not == nil
153
+ rank['host'].should_not == nil
154
+ end
155
+ end
156
+ end