ro_crawler 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,197 @@
# Specs for RoCrawler::Base: browser lifecycle, crawling, and the
# spider/scraping helpers. These examples drive live sites, so they need
# network access plus working phantomjs / chromedriver installs.
require 'spec_helper'

describe RoCrawler::Base do
  before do
    @b = RoCrawler::Base.new
  end

  after do
    # Each example gets a fresh browser; close it so driver processes
    # do not accumulate across the suite.
    @b.browser_close
  end

  it 'local?' do
    expect(@b.local?).to be_true
  end

  describe 'crawler' do
    it 'phantomjs' do
      expect do
        @b.crawler driver: :phantomjs
      end.not_to raise_error
    end

    it 'chrome' do
      # No driver argument exercises the default (chrome) path.
      expect do
        @b.crawler
      end.not_to raise_error
    end
  end

  it 'spider yjs' do
    url = 'http://s.yingjiesheng.com/result.jsp?keyword=%E5%89%8D%E7%AB%AF%E5%AE%9E%E4%B9%A0&city=0&jobtype=0&do=1&stype=0'
    expect do
      ::RoCrawler::Base.new.spider(url, 'h3.title>a', '.job, .j_i')
    end.not_to raise_error
  end

  describe 'spider v2ex' do
    before do
      # Unqualified Base resolves because spec_helper does `include RoCrawler`.
      @v2ex_spider = Base.new
    end

    #it 'case1' do
    #  url = 'http://www.v2ex.com/go/jobs?p=1'
    #  expect do
    #    ::RoCrawler::Base.new.spider(url, 'span.item_title>a', '.topic_content')
    #  end.not_to raise_error
    #end

    it 'case2' do
      # Seed the spider's internal state directly so get_contents can run
      # without first crawling a listing page.
      @v2ex_spider.instance_variable_set(:@link_titles, [
        ["/t/79990#reply63", "[北京/杭州] 阿里巴巴2014校园招聘优秀人才内部同事推荐计划"]
      ])
      @v2ex_spider.instance_variable_set(:@home_url, "http://www.v2ex.com")
      @v2ex_spider.instance_variable_set(:@intr_selector, ".cell>.topic_content")
      @v2ex_spider.get_contents
    end
  end

  it 'spider ruby-china' do
    url = 'http://ruby-china.org/topics/node25'
    expect do
      ::RoCrawler::Base.new.spider(url, '.title>a', '.entry_content')
    end.not_to raise_error
  end

  it 'spider ruby-china intr not nil' do
    expect(
      @b.get_tags_attrs_from('http://ruby-china.org/topics/13700', '.body.entry-content', 'inner_html')
    ).not_to be_empty
  end

  it 'handler' do
    # Define a throwaway #handler on the class so #handle's block
    # forwarding can be exercised against a known result set.
    ::RoCrawler::Base.class_eval do
      def handler(&blk)
        results = [
          [1, 2, 3],
          [4, 5, 6],
          [7, 8, 9]
        ]

        handle results, &blk
      end
    end

    ::RoCrawler::Base.new.handler do |a, b, c|
      expect(a.to_s).to match(/^\d$/)
      expect(b.to_s).to match(/^\d$/)
      expect(c.to_s).to match(/^\d$/)
    end
  end


  it 'get tags from url' do
    expect(
      @b.get_tags_from("http://baidu.com", 'body').inner_html
    ).not_to be_nil
  end

  describe 'get attrs in tags' do
    it 'when get two attribute' do
      tags = @b.get_tags_from("http://baidu.com", '#m p#nv a')
      @b.get_attrs_in(tags, 'href', 'text').each do |attrs|
        expect(attrs[0][/http/]).to be_true
        expect(attrs[1][/.*/]).to be_true
      end
    end

    it 'when get a attribute' do
      tag = @b.get_tags_from("http://baidu.com", 'a')
      @b.get_attrs_in(tag, 'text').each do |attr|
        expect(attr).to be_a String
      end
    end
  end

  describe 'get_tags_attrs_from' do
    it 'case1' do
      @b.get_tags_attrs_from("http://baidu.com", '#m p#nv a', 'href', 'text').each do |attrs|
        expect(attrs[0][/http/]).to be_true
        expect(attrs[1][/.*/]).to be_true
      end
    end

    it 'case2' do
      @b.get_tags_attrs_from("http://www.v2ex.com/t/80954#reply6", '.topic_content', 'text')
    end
  end

  it 'handle result' do
    results = [
      [1, 2, 3],
      [4, 5, 6],
      [7, 8, 9]
    ]

    @b.handle results do |a, b, c|
      expect(a.to_s).to match(/^\d$/)
      expect(b.to_s).to match(/^\d$/)
      expect(c.to_s).to match(/^\d$/)
    end
  end

  it 'get home url' do
    url = 'http://www.v2ex.com/go/jobs?p=1'
    expect(@b.get_home_url(url)).to be == 'http://www.v2ex.com'
    url = 'http://ruby-china.org/topics/node25'
    expect(@b.get_home_url(url)).to be == 'http://ruby-china.org'
  end

  it 'handle accident error' do
    # test refresh if timeout
    ::RoCrawler::Base.class_eval do
      def new_browser
        @b = crawler.goto 'http://baidu.com'
      end
    end

    b = @b.new_browser

    expect do
      @b.handle_accident_error do |browser|
        10.times { browser.refresh }
      end
    end.not_to raise_error

    b.close
  end

  describe 'get html from url' do
    it 'refresh when timeout' do
      # Stub get_html_from to force a Net::ReadTimeout on the first try and
      # verify the rescue path still fetches and parses the page.
      ::RoCrawler::Base.class_eval do
        def get_html_from(url)
          html = ""
          @b ||= crawler

          get_html = lambda do
            if url[/http/]
              Nokogiri::HTML.parse @b.goto(url).html
            else
              Nokogiri::HTML.parse File.read(url)
            end
          end

          begin
            raise Net::ReadTimeout.new
          rescue => e
            html = get_html.call if e.is_a?(Net::ReadTimeout)
          end

          html
        end
      end

      expect(@b.get_html_from('http://baidu.com')).not_to be_nil
    end
  end
end
@@ -0,0 +1,11 @@
# Specs for RoCrawler::Misc#urlify, which normalizes a bare host or a local
# filesystem path into a fully-qualified URL (http:// or file:// scheme).
require 'spec_helper'

include RoCrawler::Misc
describe RoCrawler::Misc do
  it 'urlify' do
    baidu_url = Regexp.new "^http://baidu.com$"
    local_url = Regexp.new "^file:///home/zxr/testfile$"
    expect('baidu.com'.urlify).to match baidu_url
    # local: true marks the receiver as a filesystem path -> file:// URL.
    expect('/home/zxr/testfile'.urlify(local: true)).to match local_url
  end
end
@@ -0,0 +1,9 @@
# Smoke test for the Watir::Browser extensions: Browser#goto should return
# the browser itself so calls can be chained. Needs chromedriver + network.
require 'spec_helper'

include RoCrawler::Watir
describe ::Watir::Browser do
  it 'new' do
    b = ::Watir::Browser.new :chrome
    expect(b.goto('http://google.com')).to be_a(::Watir::Browser)
    # Close the browser so the chromedriver process does not leak.
    b.close
  end
end
@@ -0,0 +1,9 @@
# Placeholder spec for the Watir::ElementCollection extensions; the whole
# file is commented out until the 'collect' behaviour is implemented.
#require 'spec_helper'
#
#include WatirDSL
#
#describe ::Watir::ElementCollection do
#  it 'collect' do
#
#  end
#end
@@ -0,0 +1,20 @@
# Shared RSpec bootstrap: puts the gem root on the load path, boots the
# library, and starts a headless X display so Watir can drive a real
# browser on machines without a screen (e.g. CI).
$LOAD_PATH.unshift File.expand_path('../..', __FILE__)
require 'ro_support'
require 'lib/ro_crawler'
require 'headless'
include RoCrawler
RSpec.configure do |config|
  config.before(:all) do
    # NOTE(review): this starts a new Headless per example group and never
    # calls #destroy — consider memoizing one instance. Behaviour kept as-is.
    Headless.new.start
  end

  config.treat_symbols_as_metadata_keys_with_true_values = true
  config.run_all_when_everything_filtered = true
  config.filter_run :focus

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  # --seed 1234
  config.order = 'random'
end
metadata ADDED
@@ -0,0 +1,139 @@
# Gemspec metadata for ro_crawler 0.0.4 (serialized by RubyGems 2.0.7).
# Runtime deps: rails ~> 4.0.0, watir-rails, nokogiri, headless, grit.
--- !ruby/object:Gem::Specification
name: ro_crawler
version: !ruby/object:Gem::Version
  version: 0.0.4
platform: ruby
authors:
- ro
autorequire:
bindir: bin
cert_chain: []
date: 2013-09-26 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: rails
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: 4.0.0
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ~>
      - !ruby/object:Gem::Version
        version: 4.0.0
- !ruby/object:Gem::Dependency
  name: watir-rails
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: headless
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: grit
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - '>='
      - !ruby/object:Gem::Version
        version: '0'
description: ''
email:
- rolobogu@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- lib/ro_crawler.rb
- lib/ro_crawler/watir.rb
- lib/ro_crawler/version.rb
- lib/ro_crawler/misc.rb
- lib/ro_crawler/base.rb
- lib/ro_crawler/watir/element.rb
- lib/ro_crawler/watir/element_collection.rb
- lib/ro_crawler/watir/browser.rb
- lib/ro_crawler/methods.rb
- MIT-LICENSE
- Rakefile
- README.rdoc
- spec/fixtures/intr.html
- spec/fixtures/list.html
- spec/spec_helper.rb
- spec/ro_crawler/watir/browser_spec.rb
- spec/ro_crawler/watir/element_collection_spec.rb
- spec/ro_crawler/misc_spec.rb
- spec/ro_crawler/base_spec.rb
homepage: ''
licenses: []
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - '>='
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.0.7
signing_key:
specification_version: 4
summary: ''
test_files:
- spec/fixtures/intr.html
- spec/fixtures/list.html
- spec/spec_helper.rb
- spec/ro_crawler/watir/browser_spec.rb
- spec/ro_crawler/watir/element_collection_spec.rb
- spec/ro_crawler/misc_spec.rb
- spec/ro_crawler/base_spec.rb