ro_crawler 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/MIT-LICENSE +20 -0
- data/README.rdoc +3 -0
- data/Rakefile +37 -0
- data/lib/ro_crawler/base.rb +149 -0
- data/lib/ro_crawler/methods.rb +18 -0
- data/lib/ro_crawler/misc.rb +14 -0
- data/lib/ro_crawler/version.rb +3 -0
- data/lib/ro_crawler/watir/browser.rb +47 -0
- data/lib/ro_crawler/watir/element.rb +10 -0
- data/lib/ro_crawler/watir/element_collection.rb +18 -0
- data/lib/ro_crawler/watir.rb +8 -0
- data/lib/ro_crawler.rb +21 -0
- data/spec/fixtures/intr.html +217 -0
- data/spec/fixtures/list.html +619 -0
- data/spec/ro_crawler/base_spec.rb +197 -0
- data/spec/ro_crawler/misc_spec.rb +11 -0
- data/spec/ro_crawler/watir/browser_spec.rb +9 -0
- data/spec/ro_crawler/watir/element_collection_spec.rb +9 -0
- data/spec/spec_helper.rb +20 -0
- metadata +139 -0
@@ -0,0 +1,197 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe RoCrawler::Base do
|
4
|
+
before do
|
5
|
+
@b = RoCrawler::Base.new
|
6
|
+
end
|
7
|
+
|
8
|
+
after do
|
9
|
+
@b.browser_close
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'local?' do
|
13
|
+
expect(@b.local?).to be_true
|
14
|
+
end
|
15
|
+
|
16
|
+
describe 'crawler' do
|
17
|
+
it 'phantomjs' do
|
18
|
+
expect do
|
19
|
+
@b.crawler driver: :phantomjs
|
20
|
+
end.not_to raise_error
|
21
|
+
end
|
22
|
+
|
23
|
+
it 'chrome' do
|
24
|
+
expect do
|
25
|
+
@b.crawler
|
26
|
+
end.not_to raise_error
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
it 'spider yjs' do
|
31
|
+
url = 'http://s.yingjiesheng.com/result.jsp?keyword=%E5%89%8D%E7%AB%AF%E5%AE%9E%E4%B9%A0&city=0&jobtype=0&do=1&stype=0'
|
32
|
+
expect do
|
33
|
+
::RoCrawler::Base.new.spider(url, 'h3.title>a', '.job, .j_i')
|
34
|
+
end.not_to raise_error
|
35
|
+
end
|
36
|
+
|
37
|
+
describe 'spider v2ex' do
|
38
|
+
before do
|
39
|
+
@v2ex_spider = Base.new
|
40
|
+
end
|
41
|
+
|
42
|
+
#it 'case1' do
|
43
|
+
# url = 'http://www.v2ex.com/go/jobs?p=1'
|
44
|
+
# expect do
|
45
|
+
# ::RoCrawler::Base.new.spider(url, 'span.item_title>a', '.topic_content')
|
46
|
+
# end.not_to raise_error
|
47
|
+
#end
|
48
|
+
|
49
|
+
it 'case2' do
|
50
|
+
@v2ex_spider.instance_variable_set(:@link_titles, [
|
51
|
+
["/t/79990#reply63", "[北京/杭州] 阿里巴巴2014校园招聘优秀人才内部同事推荐计划"]
|
52
|
+
])
|
53
|
+
@v2ex_spider.instance_variable_set(:@home_url, "http://www.v2ex.com")
|
54
|
+
@v2ex_spider.instance_variable_set(:@intr_selector, ".cell>.topic_content")
|
55
|
+
@v2ex_spider.get_contents
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
it 'spider ruby-china' do
|
60
|
+
url = 'http://ruby-china.org/topics/node25'
|
61
|
+
expect do
|
62
|
+
::RoCrawler::Base.new.spider(url, '.title>a', '.entry_content')
|
63
|
+
end.not_to raise_error
|
64
|
+
end
|
65
|
+
|
66
|
+
it 'spider ruby-china intr not nil' do
|
67
|
+
expect(
|
68
|
+
@b.get_tags_attrs_from('http://ruby-china.org/topics/13700', '.body.entry-content', 'inner_html')
|
69
|
+
).not_to be_empty
|
70
|
+
end
|
71
|
+
|
72
|
+
it 'handler' do
|
73
|
+
::RoCrawler::Base.class_eval do
|
74
|
+
def handler(&blk)
|
75
|
+
results = [
|
76
|
+
[1, 2, 3],
|
77
|
+
[4, 5, 6],
|
78
|
+
[7, 8, 9]
|
79
|
+
]
|
80
|
+
|
81
|
+
handle results, &blk
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
::RoCrawler::Base.new.handler do |a, b, c|
|
86
|
+
expect(a.to_s).to match /^\d$/
|
87
|
+
expect(b.to_s).to match /^\d$/
|
88
|
+
expect(c.to_s).to match /^\d$/
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
|
93
|
+
it 'get tags from url' do
|
94
|
+
expect(
|
95
|
+
@b.get_tags_from("http://baidu.com", 'body').inner_html
|
96
|
+
).not_to be_nil
|
97
|
+
end
|
98
|
+
|
99
|
+
describe 'get attrs in tags' do
|
100
|
+
it 'when get two attribute' do
|
101
|
+
tags = @b.get_tags_from("http://baidu.com", '#m p#nv a')
|
102
|
+
@b.get_attrs_in(tags, 'href', 'text').each do |attrs|
|
103
|
+
expect(attrs[0][/http/]).to be_true
|
104
|
+
expect(attrs[1][/.*/]).to be_true
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
it 'when get a attribute' do
|
109
|
+
tag = @b.get_tags_from("http://baidu.com", 'a')
|
110
|
+
@b.get_attrs_in(tag, 'text').each do |attr|
|
111
|
+
expect(attr).to be_a String
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
describe 'get_tags_attrs_from' do
|
117
|
+
it 'case1' do
|
118
|
+
@b.get_tags_attrs_from("http://baidu.com", '#m p#nv a', 'href', 'text').each do |attrs|
|
119
|
+
expect(attrs[0][/http/]).to be_true
|
120
|
+
expect(attrs[1][/.*/]).to be_true
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
it 'case2' do
|
125
|
+
@b.get_tags_attrs_from("http://www.v2ex.com/t/80954#reply6", '.topic_content', 'text')
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
it 'handle result' do
|
130
|
+
results = [
|
131
|
+
[1, 2, 3],
|
132
|
+
[4, 5, 6],
|
133
|
+
[7, 8, 9]
|
134
|
+
]
|
135
|
+
|
136
|
+
@b.handle results do |a, b, c|
|
137
|
+
expect(a.to_s).to match /^\d$/
|
138
|
+
expect(b.to_s).to match /^\d$/
|
139
|
+
expect(c.to_s).to match /^\d$/
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
it 'get home url' do
|
144
|
+
url = 'http://www.v2ex.com/go/jobs?p=1'
|
145
|
+
expect(@b.get_home_url(url)).to be == 'http://www.v2ex.com'
|
146
|
+
url = 'http://ruby-china.org/topics/node25'
|
147
|
+
expect(@b.get_home_url(url)).to be == 'http://ruby-china.org'
|
148
|
+
end
|
149
|
+
|
150
|
+
it 'handle accident error' do
|
151
|
+
# test refresh if timeout
|
152
|
+
::RoCrawler::Base.class_eval do
|
153
|
+
def new_browser
|
154
|
+
@b = crawler.goto 'http://baidu.com'
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
b = @b.new_browser
|
159
|
+
|
160
|
+
expect do
|
161
|
+
@b.handle_accident_error do |browser|
|
162
|
+
10.times { browser.refresh }
|
163
|
+
end
|
164
|
+
end.not_to raise_error
|
165
|
+
|
166
|
+
b.close
|
167
|
+
end
|
168
|
+
|
169
|
+
describe 'get html from url' do
|
170
|
+
it 'refresh when timeout' do
|
171
|
+
::RoCrawler::Base.class_eval do
|
172
|
+
def get_html_from(url)
|
173
|
+
html = ""
|
174
|
+
@b ||= crawler
|
175
|
+
|
176
|
+
get_html = lambda do
|
177
|
+
if url[/http/]
|
178
|
+
Nokogiri::HTML.parse @b.goto(url).html
|
179
|
+
else
|
180
|
+
Nokogiri::HTML.parse File.read(url)
|
181
|
+
end
|
182
|
+
end
|
183
|
+
|
184
|
+
begin
|
185
|
+
raise Net::ReadTimeout.new
|
186
|
+
rescue => e
|
187
|
+
html = get_html.call if e.is_a?(Net::ReadTimeout)
|
188
|
+
end
|
189
|
+
|
190
|
+
html
|
191
|
+
end
|
192
|
+
end
|
193
|
+
|
194
|
+
expect(@b.get_html_from('http://baidu.com')).not_to be_nil
|
195
|
+
end
|
196
|
+
end
|
197
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
include RoCrawler::Misc
|
4
|
+
describe RoCrawler::Misc do
|
5
|
+
it 'urlify' do
|
6
|
+
baidu_url = Regexp.new "^http://baidu.com$"
|
7
|
+
local_url = Regexp.new "^file:///home/zxr/testfile$"
|
8
|
+
expect('baidu.com'.urlify).to match baidu_url
|
9
|
+
expect('/home/zxr/testfile'.urlify local:true).to match local_url
|
10
|
+
end
|
11
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
$LOAD_PATH.unshift File.expand_path('../..', __FILE__)
|
2
|
+
require 'ro_support'
|
3
|
+
require 'lib/ro_crawler'
|
4
|
+
require 'headless'
|
5
|
+
include RoCrawler
|
6
|
+
RSpec.configure do |config|
|
7
|
+
config.before(:all) do
|
8
|
+
Headless.new.start
|
9
|
+
end
|
10
|
+
|
11
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
12
|
+
config.run_all_when_everything_filtered = true
|
13
|
+
config.filter_run :focus
|
14
|
+
|
15
|
+
# Run specs in random order to surface order dependencies. If you find an
|
16
|
+
# order dependency and want to debug it, you can fix the order by providing
|
17
|
+
# the seed, which is printed after each run.
|
18
|
+
# --seed 1234
|
19
|
+
config.order = 'random'
|
20
|
+
end
|
metadata
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ro_crawler
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.4
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- ro
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-09-26 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rails
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 4.0.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 4.0.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: watir-rails
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: nokogiri
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: headless
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: grit
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :runtime
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - '>='
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description: ''
|
84
|
+
email:
|
85
|
+
- rolobogu@gmail.com
|
86
|
+
executables: []
|
87
|
+
extensions: []
|
88
|
+
extra_rdoc_files: []
|
89
|
+
files:
|
90
|
+
- lib/ro_crawler.rb
|
91
|
+
- lib/ro_crawler/watir.rb
|
92
|
+
- lib/ro_crawler/version.rb
|
93
|
+
- lib/ro_crawler/misc.rb
|
94
|
+
- lib/ro_crawler/base.rb
|
95
|
+
- lib/ro_crawler/watir/element.rb
|
96
|
+
- lib/ro_crawler/watir/element_collection.rb
|
97
|
+
- lib/ro_crawler/watir/browser.rb
|
98
|
+
- lib/ro_crawler/methods.rb
|
99
|
+
- MIT-LICENSE
|
100
|
+
- Rakefile
|
101
|
+
- README.rdoc
|
102
|
+
- spec/fixtures/intr.html
|
103
|
+
- spec/fixtures/list.html
|
104
|
+
- spec/spec_helper.rb
|
105
|
+
- spec/ro_crawler/watir/browser_spec.rb
|
106
|
+
- spec/ro_crawler/watir/element_collection_spec.rb
|
107
|
+
- spec/ro_crawler/misc_spec.rb
|
108
|
+
- spec/ro_crawler/base_spec.rb
|
109
|
+
homepage: ''
|
110
|
+
licenses: []
|
111
|
+
metadata: {}
|
112
|
+
post_install_message:
|
113
|
+
rdoc_options: []
|
114
|
+
require_paths:
|
115
|
+
- lib
|
116
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
117
|
+
requirements:
|
118
|
+
- - '>='
|
119
|
+
- !ruby/object:Gem::Version
|
120
|
+
version: '0'
|
121
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
122
|
+
requirements:
|
123
|
+
- - '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
126
|
+
requirements: []
|
127
|
+
rubyforge_project:
|
128
|
+
rubygems_version: 2.0.7
|
129
|
+
signing_key:
|
130
|
+
specification_version: 4
|
131
|
+
summary: ''
|
132
|
+
test_files:
|
133
|
+
- spec/fixtures/intr.html
|
134
|
+
- spec/fixtures/list.html
|
135
|
+
- spec/spec_helper.rb
|
136
|
+
- spec/ro_crawler/watir/browser_spec.rb
|
137
|
+
- spec/ro_crawler/watir/element_collection_spec.rb
|
138
|
+
- spec/ro_crawler/misc_spec.rb
|
139
|
+
- spec/ro_crawler/base_spec.rb
|