ro_crawler 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,197 @@
1
+ require 'spec_helper'
2
+
3
+ describe RoCrawler::Base do
4
+ before do
5
+ @b = RoCrawler::Base.new
6
+ end
7
+
8
+ after do
9
+ @b.browser_close
10
+ end
11
+
12
+ it 'local?' do
13
+ expect(@b.local?).to be_true
14
+ end
15
+
16
+ describe 'crawler' do
17
+ it 'phantomjs' do
18
+ expect do
19
+ @b.crawler driver: :phantomjs
20
+ end.not_to raise_error
21
+ end
22
+
23
+ it 'chrome' do
24
+ expect do
25
+ @b.crawler
26
+ end.not_to raise_error
27
+ end
28
+ end
29
+
30
+ it 'spider yjs' do
31
+ url = 'http://s.yingjiesheng.com/result.jsp?keyword=%E5%89%8D%E7%AB%AF%E5%AE%9E%E4%B9%A0&city=0&jobtype=0&do=1&stype=0'
32
+ expect do
33
+ ::RoCrawler::Base.new.spider(url, 'h3.title>a', '.job, .j_i')
34
+ end.not_to raise_error
35
+ end
36
+
37
+ describe 'spider v2ex' do
38
+ before do
39
+ @v2ex_spider = Base.new
40
+ end
41
+
42
+ #it 'case1' do
43
+ # url = 'http://www.v2ex.com/go/jobs?p=1'
44
+ # expect do
45
+ # ::RoCrawler::Base.new.spider(url, 'span.item_title>a', '.topic_content')
46
+ # end.not_to raise_error
47
+ #end
48
+
49
+ it 'case2' do
50
+ @v2ex_spider.instance_variable_set(:@link_titles, [
51
+ ["/t/79990#reply63", "[北京/杭州] 阿里巴巴2014校园招聘优秀人才内部同事推荐计划"]
52
+ ])
53
+ @v2ex_spider.instance_variable_set(:@home_url, "http://www.v2ex.com")
54
+ @v2ex_spider.instance_variable_set(:@intr_selector, ".cell>.topic_content")
55
+ @v2ex_spider.get_contents
56
+ end
57
+ end
58
+
59
+ it 'spider ruby-china' do
60
+ url = 'http://ruby-china.org/topics/node25'
61
+ expect do
62
+ ::RoCrawler::Base.new.spider(url, '.title>a', '.entry_content')
63
+ end.not_to raise_error
64
+ end
65
+
66
+ it 'spider ruby-china intr not nil' do
67
+ expect(
68
+ @b.get_tags_attrs_from('http://ruby-china.org/topics/13700', '.body.entry-content', 'inner_html')
69
+ ).not_to be_empty
70
+ end
71
+
72
+ it 'handler' do
73
+ ::RoCrawler::Base.class_eval do
74
+ def handler(&blk)
75
+ results = [
76
+ [1, 2, 3],
77
+ [4, 5, 6],
78
+ [7, 8, 9]
79
+ ]
80
+
81
+ handle results, &blk
82
+ end
83
+ end
84
+
85
+ ::RoCrawler::Base.new.handler do |a, b, c|
86
+ expect(a.to_s).to match /^\d$/
87
+ expect(b.to_s).to match /^\d$/
88
+ expect(c.to_s).to match /^\d$/
89
+ end
90
+ end
91
+
92
+
93
+ it 'get tags from url' do
94
+ expect(
95
+ @b.get_tags_from("http://baidu.com", 'body').inner_html
96
+ ).not_to be_nil
97
+ end
98
+
99
+ describe 'get attrs in tags' do
100
+ it 'when get two attribute' do
101
+ tags = @b.get_tags_from("http://baidu.com", '#m p#nv a')
102
+ @b.get_attrs_in(tags, 'href', 'text').each do |attrs|
103
+ expect(attrs[0][/http/]).to be_true
104
+ expect(attrs[1][/.*/]).to be_true
105
+ end
106
+ end
107
+
108
+ it 'when get a attribute' do
109
+ tag = @b.get_tags_from("http://baidu.com", 'a')
110
+ @b.get_attrs_in(tag, 'text').each do |attr|
111
+ expect(attr).to be_a String
112
+ end
113
+ end
114
+ end
115
+
116
+ describe 'get_tags_attrs_from' do
117
+ it 'case1' do
118
+ @b.get_tags_attrs_from("http://baidu.com", '#m p#nv a', 'href', 'text').each do |attrs|
119
+ expect(attrs[0][/http/]).to be_true
120
+ expect(attrs[1][/.*/]).to be_true
121
+ end
122
+ end
123
+
124
+ it 'case2' do
125
+ @b.get_tags_attrs_from("http://www.v2ex.com/t/80954#reply6", '.topic_content', 'text')
126
+ end
127
+ end
128
+
129
+ it 'handle result' do
130
+ results = [
131
+ [1, 2, 3],
132
+ [4, 5, 6],
133
+ [7, 8, 9]
134
+ ]
135
+
136
+ @b.handle results do |a, b, c|
137
+ expect(a.to_s).to match /^\d$/
138
+ expect(b.to_s).to match /^\d$/
139
+ expect(c.to_s).to match /^\d$/
140
+ end
141
+ end
142
+
143
+ it 'get home url' do
144
+ url = 'http://www.v2ex.com/go/jobs?p=1'
145
+ expect(@b.get_home_url(url)).to be == 'http://www.v2ex.com'
146
+ url = 'http://ruby-china.org/topics/node25'
147
+ expect(@b.get_home_url(url)).to be == 'http://ruby-china.org'
148
+ end
149
+
150
+ it 'handle accident error' do
151
+ # test refresh if timeout
152
+ ::RoCrawler::Base.class_eval do
153
+ def new_browser
154
+ @b = crawler.goto 'http://baidu.com'
155
+ end
156
+ end
157
+
158
+ b = @b.new_browser
159
+
160
+ expect do
161
+ @b.handle_accident_error do |browser|
162
+ 10.times { browser.refresh }
163
+ end
164
+ end.not_to raise_error
165
+
166
+ b.close
167
+ end
168
+
169
+ describe 'get html from url' do
170
+ it 'refresh when timeout' do
171
+ ::RoCrawler::Base.class_eval do
172
+ def get_html_from(url)
173
+ html = ""
174
+ @b ||= crawler
175
+
176
+ get_html = lambda do
177
+ if url[/http/]
178
+ Nokogiri::HTML.parse @b.goto(url).html
179
+ else
180
+ Nokogiri::HTML.parse File.read(url)
181
+ end
182
+ end
183
+
184
+ begin
185
+ raise Net::ReadTimeout.new
186
+ rescue => e
187
+ html = get_html.call if e.is_a?(Net::ReadTimeout)
188
+ end
189
+
190
+ html
191
+ end
192
+ end
193
+
194
+ expect(@b.get_html_from('http://baidu.com')).not_to be_nil
195
+ end
196
+ end
197
+ end
@@ -0,0 +1,11 @@
1
+ require 'spec_helper'
2
+
3
+ include RoCrawler::Misc
4
+ describe RoCrawler::Misc do
5
+ it 'urlify' do
6
+ baidu_url = Regexp.new "^http://baidu.com$"
7
+ local_url = Regexp.new "^file:///home/zxr/testfile$"
8
+ expect('baidu.com'.urlify).to match baidu_url
9
+ expect('/home/zxr/testfile'.urlify local:true).to match local_url
10
+ end
11
+ end
@@ -0,0 +1,9 @@
1
+ require 'spec_helper'
2
+
3
+ include RoCrawler::Watir
4
+ describe ::Watir::Browser do
5
+ it 'new' do
6
+ b = ::Watir::Browser.new :chrome
7
+ expect(b.goto 'http://google.com').to be_a(::Watir::Browser)
8
+ end
9
+ end
@@ -0,0 +1,9 @@
1
+ #require 'spec_helper'
2
+ #
3
+ #include WatirDSL
4
+ #
5
+ #describe ::Watir::ElementCollection do
6
+ # it 'collect' do
7
+ #
8
+ # end
9
+ #end
@@ -0,0 +1,20 @@
1
+ $LOAD_PATH.unshift File.expand_path('../..', __FILE__)
2
+ require 'ro_support'
3
+ require 'lib/ro_crawler'
4
+ require 'headless'
5
+ include RoCrawler
6
+ RSpec.configure do |config|
7
+ config.before(:all) do
8
+ Headless.new.start
9
+ end
10
+
11
+ config.treat_symbols_as_metadata_keys_with_true_values = true
12
+ config.run_all_when_everything_filtered = true
13
+ config.filter_run :focus
14
+
15
+ # Run specs in random order to surface order dependencies. If you find an
16
+ # order dependency and want to debug it, you can fix the order by providing
17
+ # the seed, which is printed after each run.
18
+ # --seed 1234
19
+ config.order = 'random'
20
+ end
metadata ADDED
@@ -0,0 +1,139 @@
1
+ --- !ruby/object:Gem::Specification # serialized gem specification for this release
2
+ name: ro_crawler
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.4
5
+ platform: ruby
6
+ authors:
7
+ - ro
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-09-26 00:00:00.000000000 Z
12
+ dependencies: # five entries, every one declared with type: :runtime
13
+ - !ruby/object:Gem::Dependency
14
+ name: rails # the only dependency with a version bound (~> 4.0.0)
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 4.0.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 4.0.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: watir-rails # any version accepted (>= 0)
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: nokogiri # any version accepted (>= 0)
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: headless # any version accepted (>= 0)
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: grit # any version accepted (>= 0)
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ description: '' # empty in this release
84
+ email:
85
+ - rolobogu@gmail.com
86
+ executables: []
87
+ extensions: []
88
+ extra_rdoc_files: []
89
+ files: # library code, then license/Rakefile/README, then the spec suite and its fixtures
90
+ - lib/ro_crawler.rb
91
+ - lib/ro_crawler/watir.rb
92
+ - lib/ro_crawler/version.rb
93
+ - lib/ro_crawler/misc.rb
94
+ - lib/ro_crawler/base.rb
95
+ - lib/ro_crawler/watir/element.rb
96
+ - lib/ro_crawler/watir/element_collection.rb
97
+ - lib/ro_crawler/watir/browser.rb
98
+ - lib/ro_crawler/methods.rb
99
+ - MIT-LICENSE
100
+ - Rakefile
101
+ - README.rdoc
102
+ - spec/fixtures/intr.html
103
+ - spec/fixtures/list.html
104
+ - spec/spec_helper.rb
105
+ - spec/ro_crawler/watir/browser_spec.rb
106
+ - spec/ro_crawler/watir/element_collection_spec.rb
107
+ - spec/ro_crawler/misc_spec.rb
108
+ - spec/ro_crawler/base_spec.rb
109
+ homepage: '' # empty in this release
110
+ licenses: [] # none declared, although MIT-LICENSE is listed under files
111
+ metadata: {}
112
+ post_install_message:
113
+ rdoc_options: []
114
+ require_paths:
115
+ - lib
116
+ required_ruby_version: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - '>='
119
+ - !ruby/object:Gem::Version
120
+ version: '0'
121
+ required_rubygems_version: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ requirements: []
127
+ rubyforge_project:
128
+ rubygems_version: 2.0.7
129
+ signing_key:
130
+ specification_version: 4
131
+ summary: '' # empty in this release
132
+ test_files: # the same spec/ paths that close the files list
133
+ - spec/fixtures/intr.html
134
+ - spec/fixtures/list.html
135
+ - spec/spec_helper.rb
136
+ - spec/ro_crawler/watir/browser_spec.rb
137
+ - spec/ro_crawler/watir/element_collection_spec.rb
138
+ - spec/ro_crawler/misc_spec.rb
139
+ - spec/ro_crawler/base_spec.rb