micro_spider 0.1.23 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +5 -12
- data/lib/micro_spider.rb +49 -33
- data/lib/spider_core.rb +1 -0
- data/lib/spider_core/behavior.rb +8 -6
- data/lib/spider_core/excretion.rb +13 -0
- data/lib/spider_core/field_dsl.rb +10 -30
- data/lib/spider_core/follow_dsl.rb +8 -6
- data/lib/spider_core/pagination_dsl.rb +2 -3
- data/lib/spider_core/version.rb +1 -1
- data/test/micro_spider_test.rb +60 -43
- data/test/test_helper.rb +1 -1
- metadata +133 -62
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cedb84b01c8f87814486e68113ff9d69808749a0
|
4
|
+
data.tar.gz: 82fe718a38b980c0879b8b4eaec807b13b36e4fe
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 28035e8b24d4cb6ac59fcf97fbf34f661c610e054de37c28b2ae7de4e79f002cebfddfb0fdc064e6f8c3475b68683fd1b07055add0d76c76e3ea5a4700f31697
|
7
|
+
data.tar.gz: d6924f477e659bf14954eb4893d0c9247b892920cd1583a37024eceae2a105130b2c3c98013f7069ede665df27e92f0458c71bdae33d5e182a4cb80b4b5acaf9
|
data/README.md
CHANGED
@@ -11,22 +11,15 @@ spider = MicroSpider.new
|
|
11
11
|
spider.learn do
|
12
12
|
site 'http://www.bbc.com'
|
13
13
|
entrance '/news'
|
14
|
-
|
15
|
-
field :top_story, '#top-story h2 a'
|
16
|
-
|
17
|
-
follow '.story' do
|
18
|
-
|
19
|
-
field :title, 'h1.story-header'
|
20
|
-
field :body, '.story-body'
|
21
|
-
|
22
|
-
fields :related_stories, '.related-links-list a'
|
23
|
-
|
24
|
-
end
|
25
|
-
|
14
|
+
fields :top_stories, 'a.title-link'
|
26
15
|
end
|
27
16
|
|
28
17
|
spider.crawl
|
29
18
|
|
19
|
+
spider.get('top_stories')
|
20
|
+
# or
|
21
|
+
spider.excretion['/news']['top_stories']
|
22
|
+
|
30
23
|
```
|
31
24
|
|
32
25
|
|
data/lib/micro_spider.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'hashie'
|
1
2
|
require 'capybara'
|
2
3
|
require 'capybara/dsl'
|
3
4
|
require 'capybara/mechanize'
|
@@ -32,14 +33,16 @@ class MicroSpider
|
|
32
33
|
include SpiderCore::PaginationDSL
|
33
34
|
|
34
35
|
attr_reader :excretion, :paths, :delay, :current_location, :visited_paths, :broken_paths
|
35
|
-
attr_accessor :logger, :actions, :recipe, :skip_set_entrance, :timeout
|
36
|
+
attr_accessor :logger, :actions, :recipe, :skip_set_entrance, :timeout, :selector
|
36
37
|
|
37
|
-
def initialize(excretion = nil)
|
38
|
+
def initialize(excretion = nil, selector: :css)
|
39
|
+
@selector = selector
|
38
40
|
@paths = []
|
39
41
|
@actions = []
|
40
42
|
@setted_variables = {}
|
41
43
|
@timeout = 120
|
42
|
-
@
|
44
|
+
@status = 'pending'
|
45
|
+
@excretion = excretion || SpiderCore::Excretion.new
|
43
46
|
@logger = Logger.new(STDOUT)
|
44
47
|
@visited_paths = Set.new
|
45
48
|
@broken_paths = []
|
@@ -67,7 +70,7 @@ class MicroSpider
|
|
67
70
|
sleep_or_not
|
68
71
|
logger.info "Begin to visit #{path}."
|
69
72
|
super(path)
|
70
|
-
@current_location =
|
73
|
+
@current_location = SpiderCore::Excretion['_path' => path]
|
71
74
|
logger.info "Current location is #{path}."
|
72
75
|
end
|
73
76
|
|
@@ -84,16 +87,15 @@ class MicroSpider
|
|
84
87
|
# spider.set :table, '.tb a', selector: :css do |e|
|
85
88
|
# e['src']
|
86
89
|
# end
|
87
|
-
def set(name, value
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
end
|
90
|
+
def set(name, value)
|
91
|
+
@setted_variables[name.to_s] = value
|
92
|
+
end
|
93
|
+
|
94
|
+
def set_on(name, pattern, &block)
|
95
|
+
actions << lambda {
|
96
|
+
element = scan_first(pattern)
|
97
|
+
@setted_variables[name.to_s] = block_given? ? yield(element) : handle_element(element)
|
98
|
+
}
|
97
99
|
end
|
98
100
|
|
99
101
|
# Click the locator. This will trigger visit action and change current location.
|
@@ -107,8 +109,9 @@ class MicroSpider
|
|
107
109
|
spider = self.spawn
|
108
110
|
spider.entrance(path)
|
109
111
|
spider.learn(&block)
|
110
|
-
|
111
|
-
|
112
|
+
put(
|
113
|
+
"click::#{path}", spider.crawl
|
114
|
+
)
|
112
115
|
else
|
113
116
|
visit(path)
|
114
117
|
end
|
@@ -153,7 +156,7 @@ class MicroSpider
|
|
153
156
|
|
154
157
|
def site(url)
|
155
158
|
return if @site
|
156
|
-
Capybara.app_host = @
|
159
|
+
Capybara.app_host = @site = url
|
157
160
|
end
|
158
161
|
|
159
162
|
# This will be the first path for spider to visit.
|
@@ -171,21 +174,25 @@ class MicroSpider
|
|
171
174
|
@paths += path_or_paths
|
172
175
|
end
|
173
176
|
|
177
|
+
def with(pattern, path:, &block)
|
178
|
+
visit(path)
|
179
|
+
scan_all(pattern).map{ |element| yield(element) }
|
180
|
+
end
|
181
|
+
|
174
182
|
# Sometimes the entrances are on the page.
|
175
183
|
# @param path [String] path to visit
|
176
184
|
# @param pattern [String, Regexp] links pattern
|
177
185
|
#
|
178
186
|
# @example
|
179
187
|
# spider = MicroSpider.new
|
180
|
-
# spider.
|
188
|
+
# spider.entrance_on('.links a')
|
189
|
+
# spider.entrance_on('.links a', path: '/a')
|
181
190
|
#
|
182
|
-
def
|
191
|
+
def entrance_on(pattern, path: '/', attr: :href)
|
183
192
|
return if @skip_set_entrance
|
184
|
-
|
193
|
+
|
185
194
|
visit(path)
|
186
|
-
entrances = scan_all(
|
187
|
-
block_given? ? yield(element) : element[:href]
|
188
|
-
end
|
195
|
+
entrances = scan_all(pattern).map{ |element| element[attr] }
|
189
196
|
@paths += entrances.to_a
|
190
197
|
end
|
191
198
|
|
@@ -209,12 +216,14 @@ class MicroSpider
|
|
209
216
|
|
210
217
|
begin
|
211
218
|
visit(path)
|
219
|
+
@status = 'inprogress'
|
212
220
|
rescue Timeout::Error => err
|
213
221
|
@broken_paths << path
|
214
222
|
logger.fatal("Timeout!!! execution expired when visit `#{path}`")
|
215
223
|
logger.fatal(err)
|
216
224
|
rescue SystemExit, Interrupt
|
217
225
|
logger.fatal("SystemExit && Interrupt")
|
226
|
+
@status = 'exit'
|
218
227
|
exit!
|
219
228
|
rescue Exception => err
|
220
229
|
@broken_paths << path
|
@@ -224,8 +233,8 @@ class MicroSpider
|
|
224
233
|
else
|
225
234
|
@visited_paths << path
|
226
235
|
execute_actions
|
227
|
-
yield(@current_location) if block_given?
|
228
|
-
excretion
|
236
|
+
#yield(@current_location) if block_given?
|
237
|
+
@excretion = @excretion.put(path, @current_location)
|
229
238
|
ensure
|
230
239
|
@actions = []
|
231
240
|
@skip_set_entrance = true
|
@@ -238,7 +247,8 @@ class MicroSpider
|
|
238
247
|
def reset
|
239
248
|
return unless completed?
|
240
249
|
@paths = visited_paths.to_a
|
241
|
-
@
|
250
|
+
@status = 'pending'
|
251
|
+
@excretion = nil
|
242
252
|
@visited_paths = Set.new
|
243
253
|
@current_location = nil
|
244
254
|
end
|
@@ -257,7 +267,7 @@ class MicroSpider
|
|
257
267
|
# spider.save
|
258
268
|
#
|
259
269
|
def create_action(name, &block)
|
260
|
-
action = proc { actions << lambda { block.call(
|
270
|
+
action = proc { actions << lambda { block.call(@excretion) } }
|
261
271
|
metaclass.send :define_method, name, &action
|
262
272
|
end
|
263
273
|
|
@@ -269,10 +279,12 @@ class MicroSpider
|
|
269
279
|
logger.fatal('Timeout!!! execution expired when execute action')
|
270
280
|
logger.fatal(err.message)
|
271
281
|
logger.fatal(err.backtrace.inspect)
|
282
|
+
@visited_paths.pop
|
272
283
|
break
|
273
284
|
rescue SpiderCore::ClickPathNotFound => err
|
274
285
|
logger.fatal(err.message)
|
275
286
|
logger.fatal(err.backtrace.inspect)
|
287
|
+
@visited_paths.pop
|
276
288
|
break
|
277
289
|
end
|
278
290
|
}
|
@@ -292,22 +304,26 @@ class MicroSpider
|
|
292
304
|
spider = self.class.new
|
293
305
|
spider.logger = logger
|
294
306
|
spider.timeout = timeout
|
307
|
+
spider.site(@site)
|
295
308
|
spider.learn(&block) if block_given?
|
296
309
|
spider
|
297
310
|
end
|
298
311
|
|
299
|
-
def results
|
300
|
-
excretion[:results]
|
301
|
-
end
|
302
|
-
|
303
312
|
def completed?
|
304
|
-
|
313
|
+
@status == 'completed'
|
305
314
|
end
|
306
315
|
|
307
316
|
def metaclass
|
308
317
|
class << self; self; end
|
309
318
|
end
|
310
319
|
|
320
|
+
def get(field)
|
321
|
+
@_deep_fetch ||= excretion.extend Hashie::Extensions::DeepFind
|
322
|
+
result = @_deep_fetch.deep_find_all(field.to_s)
|
323
|
+
return if result.nil?
|
324
|
+
result.length == 1 ? result.pop : result
|
325
|
+
end
|
326
|
+
|
311
327
|
# The default page is Capybara.current_session.
|
312
328
|
# Share one page may cause difficult issue, so here i separate it.
|
313
329
|
def page
|
@@ -335,7 +351,7 @@ class MicroSpider
|
|
335
351
|
end
|
336
352
|
|
337
353
|
def complete
|
338
|
-
|
354
|
+
@status = 'completed'
|
339
355
|
suicide
|
340
356
|
end
|
341
357
|
|
data/lib/spider_core.rb
CHANGED
data/lib/spider_core/behavior.rb
CHANGED
@@ -1,14 +1,12 @@
|
|
1
|
-
require 'enumerable/lazy' if RUBY_VERSION < '2.0'
|
2
|
-
|
3
1
|
module SpiderCore
|
4
2
|
module Behavior
|
5
3
|
|
6
4
|
protected
|
7
5
|
|
8
|
-
def scan_all(
|
6
|
+
def scan_all(pattern, opts = {})
|
9
7
|
pattern = handle_pattern(pattern)
|
10
8
|
if pattern.is_a?(String)
|
11
|
-
elements = all(
|
9
|
+
elements = all(selector, pattern).lazy
|
12
10
|
if opts[:limit] && opts[:limit].to_i > 0
|
13
11
|
elements = elements.take(opts[:limit].to_i)
|
14
12
|
end
|
@@ -18,10 +16,10 @@ module SpiderCore
|
|
18
16
|
end
|
19
17
|
end
|
20
18
|
|
21
|
-
def scan_first(
|
19
|
+
def scan_first(pattern)
|
22
20
|
pattern = handle_pattern(pattern)
|
23
21
|
if pattern.is_a?(String)
|
24
|
-
first(
|
22
|
+
first(selector, pattern)
|
25
23
|
elsif pattern.is_a?(Regexp)
|
26
24
|
html[pattern, 1]
|
27
25
|
end
|
@@ -61,5 +59,9 @@ module SpiderCore
|
|
61
59
|
pattern
|
62
60
|
end
|
63
61
|
|
62
|
+
def put(display, value)
|
63
|
+
@current_location = @current_location.put(display, value)
|
64
|
+
end
|
65
|
+
|
64
66
|
end
|
65
67
|
end
|
@@ -5,40 +5,22 @@ module SpiderCore
|
|
5
5
|
#
|
6
6
|
# @param display [String] display name
|
7
7
|
def field(display, pattern, opts = {}, &block)
|
8
|
-
kind = opts[:kind] || :css
|
9
8
|
actions << lambda {
|
10
|
-
action_for(:field, {display: display, pattern: pattern
|
9
|
+
action_for(:field, {display: display, pattern: pattern}, opts, &block)
|
11
10
|
}
|
12
11
|
end
|
13
12
|
|
14
|
-
def css_field(display, pattern, opts = {}, &block)
|
15
|
-
field(display, pattern, opts.merge(kind: :css), &block)
|
16
|
-
end
|
17
|
-
|
18
|
-
def xpath_field(display, pattern, opts = {}, &block)
|
19
|
-
field(display, pattern, opts.merge(kind: :xpath), &block)
|
20
|
-
end
|
21
|
-
|
22
13
|
def fields(display, pattern, opts = {}, &block)
|
23
|
-
kind = opts[:kind] || :css
|
24
14
|
actions << lambda {
|
25
|
-
action_for(:fields, {display: display, pattern: pattern
|
15
|
+
action_for(:fields, {display: display, pattern: pattern}, opts, &block)
|
26
16
|
}
|
27
17
|
end
|
28
18
|
|
29
|
-
def css_fields(display, pattern, opts = {}, &block)
|
30
|
-
fields(display, pattern, opts.merge(kind: :css), &block)
|
31
|
-
end
|
32
|
-
|
33
|
-
def xpath_fields(display, pattern, opts = {}, &block)
|
34
|
-
fields(display, pattern, opts.merge(kind: :xpath), &block)
|
35
|
-
end
|
36
|
-
|
37
19
|
def foreach(pattern, opts = {}, &block)
|
38
20
|
return unless block_given?
|
39
|
-
|
21
|
+
|
40
22
|
actions << lambda {
|
41
|
-
scan_all(
|
23
|
+
scan_all(pattern, opts).each do |element|
|
42
24
|
yield(element)
|
43
25
|
end
|
44
26
|
}
|
@@ -52,24 +34,22 @@ module SpiderCore
|
|
52
34
|
|
53
35
|
elements = case action
|
54
36
|
when :field
|
55
|
-
scan_first
|
37
|
+
scan_first action_opts[:pattern]
|
56
38
|
when :fields
|
57
|
-
scan_all
|
39
|
+
scan_all action_opts[:pattern], opts
|
58
40
|
else
|
59
41
|
raise 'Unknow action.'
|
60
42
|
end
|
61
43
|
|
62
|
-
|
44
|
+
put(
|
45
|
+
action_opts[:display].to_s,
|
46
|
+
handle_elements(elements, &block)
|
47
|
+
)
|
63
48
|
rescue Exception => err
|
64
49
|
logger.fatal("Caught exception when get `#{action_opts[:pattern]}`.")
|
65
50
|
logger.fatal(err)
|
66
51
|
end
|
67
52
|
end
|
68
53
|
|
69
|
-
def make_field_result(display, field)
|
70
|
-
current_location[:field] ||= []
|
71
|
-
current_location[:field] << {display => field}
|
72
|
-
end
|
73
|
-
|
74
54
|
end
|
75
55
|
end
|
@@ -3,19 +3,21 @@ module SpiderCore
|
|
3
3
|
|
4
4
|
attr_accessor :skip_followers
|
5
5
|
|
6
|
-
def follow(pattern,
|
6
|
+
def follow(pattern, attr: :href, **opts, &block)
|
7
7
|
return unless block_given?
|
8
|
-
|
8
|
+
|
9
9
|
actions << lambda {
|
10
10
|
spider = self.spawn
|
11
11
|
spider.learn(&block)
|
12
|
-
scan_all(
|
12
|
+
scan_all(pattern, opts).each do |element|
|
13
13
|
next if skip_followers && skip_followers.include?(element[:href])
|
14
|
+
|
14
15
|
spider.skip_set_entrance = false
|
15
|
-
spider.entrance(element[
|
16
|
+
spider.entrance(element[attr])
|
16
17
|
end
|
17
|
-
|
18
|
-
|
18
|
+
put(
|
19
|
+
"follow::#{pattern}", spider.crawl
|
20
|
+
)
|
19
21
|
}
|
20
22
|
end
|
21
23
|
|
@@ -3,10 +3,9 @@ module SpiderCore
|
|
3
3
|
|
4
4
|
attr_accessor :next_page, :skip_pages
|
5
5
|
|
6
|
-
def keep_eyes_on_next_page(pattern,
|
7
|
-
kind = opts[:kind] || :css
|
6
|
+
def keep_eyes_on_next_page(pattern, attr: :href, &block)
|
8
7
|
actions << lambda {
|
9
|
-
element = first(
|
8
|
+
element = first(pattern)
|
10
9
|
path = block_given? ? yield(element) : element && element[:href]
|
11
10
|
@paths.unshift(path) if path
|
12
11
|
}
|
data/lib/spider_core/version.rb
CHANGED
data/test/micro_spider_test.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
require 'test_helper'
|
2
2
|
|
3
|
-
class MicroSpiderTest <
|
3
|
+
class MicroSpiderTest < Minitest::Unit::TestCase
|
4
4
|
|
5
5
|
def setup
|
6
|
-
|
7
|
-
|
6
|
+
@spider = MicroSpider.new
|
7
|
+
@spider.logger.level = Logger::WARN
|
8
8
|
end
|
9
9
|
|
10
10
|
def test_spider_can_visit_path_with_some_delays
|
@@ -16,22 +16,36 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
|
|
16
16
|
assert (Time.now - now) > 5
|
17
17
|
end
|
18
18
|
|
19
|
+
def test_spider_can_get_field
|
20
|
+
@spider.learn do
|
21
|
+
entrance '/'
|
22
|
+
entrance '/a'
|
23
|
+
field :name, '#name'
|
24
|
+
end
|
25
|
+
excretion = @spider.crawl
|
26
|
+
assert_equal 'Home', excretion['/']['name']
|
27
|
+
assert_equal 'This is a', excretion['/a']['name']
|
28
|
+
assert_includes @spider.get('name'), 'Home'
|
29
|
+
assert_includes @spider.get('name'), 'This is a'
|
30
|
+
assert_equal nil, @spider.get('name1')
|
31
|
+
end
|
32
|
+
|
19
33
|
def test_spider_can_follow_lots_of_links
|
20
34
|
@spider.entrance('/')
|
21
35
|
@spider.follow('.links a') do
|
22
36
|
field :name, '#name'
|
23
37
|
end
|
24
38
|
excretion = @spider.crawl
|
25
|
-
excretion[
|
26
|
-
case
|
39
|
+
excretion['/']["follow::.links a"].each do |path, value|
|
40
|
+
case path
|
27
41
|
when '/a'
|
28
|
-
assert_equal 'This is a',
|
42
|
+
assert_equal 'This is a', value.get('name')
|
29
43
|
when '/b'
|
30
|
-
assert_equal 'This is b',
|
44
|
+
assert_equal 'This is b', value.get('name')
|
31
45
|
when '/c'
|
32
|
-
assert_equal 'This is c',
|
46
|
+
assert_equal 'This is c', value.get('name')
|
33
47
|
when '/d'
|
34
|
-
assert_equal 'This is d',
|
48
|
+
assert_equal 'This is d', value.get('name')
|
35
49
|
end
|
36
50
|
end
|
37
51
|
end
|
@@ -44,18 +58,17 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
|
|
44
58
|
end
|
45
59
|
end
|
46
60
|
excretion = @spider.crawl
|
47
|
-
excretion[
|
48
|
-
|
49
|
-
|
50
|
-
case ff[:entrance]
|
61
|
+
excretion['/']["follow::.links a"].each do |key, value|
|
62
|
+
value["follow::.links a"].each do |k, v|
|
63
|
+
case k
|
51
64
|
when '/a'
|
52
|
-
assert_equal 'This is a',
|
65
|
+
assert_equal 'This is a', v.get('name')
|
53
66
|
when '/b'
|
54
|
-
assert_equal 'This is b',
|
67
|
+
assert_equal 'This is b', v.get('name')
|
55
68
|
when '/c'
|
56
|
-
assert_equal 'This is c',
|
69
|
+
assert_equal 'This is c', v.get('name')
|
57
70
|
when '/d'
|
58
|
-
assert_equal 'This is d',
|
71
|
+
assert_equal 'This is d', v.get('name')
|
59
72
|
end
|
60
73
|
end
|
61
74
|
end
|
@@ -68,12 +81,12 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
|
|
68
81
|
field(:current_page, '#current_page')
|
69
82
|
end
|
70
83
|
excretion = @spider.crawl
|
71
|
-
excretion
|
72
|
-
|
73
|
-
assert_equal "Current Page #{$1}",
|
84
|
+
excretion.each do |k,v|
|
85
|
+
k =~ /\/page\/(\d)/
|
86
|
+
assert_equal "Current Page #{$1}", v.get('current_page')
|
74
87
|
end
|
75
88
|
end
|
76
|
-
|
89
|
+
|
77
90
|
def test_spider_can_follow_and_keep_eyes_on_next_page
|
78
91
|
@spider.entrance('/page/1')
|
79
92
|
@spider.follow('a.next_page') do
|
@@ -81,9 +94,9 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
|
|
81
94
|
field :current_page, '#current_page'
|
82
95
|
end
|
83
96
|
excretion = @spider.crawl
|
84
|
-
excretion[
|
85
|
-
|
86
|
-
assert_equal "Current Page #{$1}",
|
97
|
+
excretion['/page/1']['follow::a.next_page'].each do |k, v|
|
98
|
+
k =~ /\/page\/(\d)/
|
99
|
+
assert_equal "Current Page #{$1}", v.get('current_page')
|
87
100
|
end
|
88
101
|
end
|
89
102
|
|
@@ -91,33 +104,37 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
|
|
91
104
|
end
|
92
105
|
|
93
106
|
def test_spider_can_create_custom_action
|
107
|
+
@saved = false
|
94
108
|
@spider.create_action(:save) do |result|
|
95
|
-
|
96
|
-
end
|
97
|
-
@spider.learn do
|
98
|
-
entrance '/'
|
99
|
-
field :name, '#name'
|
100
|
-
save
|
101
|
-
end
|
102
|
-
excretion = @spider.crawl
|
103
|
-
assert_equal 'saved', excretion[:results].first[:save]
|
104
|
-
end
|
105
|
-
|
106
|
-
def test_spider_can_create_custom_action_reached_by_spawn
|
107
|
-
@spider.create_action(:save) do |result|
|
108
|
-
result[:save] = 'saved'
|
109
|
+
@saved = true
|
109
110
|
end
|
110
111
|
@spider.learn do
|
111
112
|
entrance '/'
|
112
113
|
field :name, '#name'
|
113
114
|
save
|
114
|
-
follow '.links a' do
|
115
|
-
field :name, '#name'
|
116
|
-
save
|
117
|
-
end
|
118
115
|
end
|
119
116
|
excretion = @spider.crawl
|
120
|
-
assert_equal
|
117
|
+
assert_equal true, @saved
|
118
|
+
assert_equal 'Home', excretion['/']['name']
|
119
|
+
assert_equal 'Home', @spider.get('name')
|
121
120
|
end
|
122
121
|
|
122
|
+
#def test_spider_can_create_custom_action_reached_by_spawn
|
123
|
+
#@saved = false
|
124
|
+
#@spider.create_action(:save) do |result|
|
125
|
+
#@saved = true
|
126
|
+
#end
|
127
|
+
#@spider.learn do
|
128
|
+
#entrance '/'
|
129
|
+
#field :name, '#name'
|
130
|
+
#save
|
131
|
+
#follow '.links a' do
|
132
|
+
#field :name, '#name'
|
133
|
+
#save
|
134
|
+
#end
|
135
|
+
#end
|
136
|
+
#excretion = @spider.crawl
|
137
|
+
#require 'pry'; binding.pry
|
138
|
+
#assert_equal true, @saved
|
139
|
+
#end
|
123
140
|
end
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,144 +1,215 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: micro_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.2.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- zires
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2016-06-30 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: capybara
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- -
|
17
|
+
- - "~>"
|
20
18
|
- !ruby/object:Gem::Version
|
21
|
-
version: '
|
19
|
+
version: '2.7'
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '2.7'
|
22
23
|
type: :runtime
|
23
24
|
prerelease: false
|
24
25
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
26
|
requirements:
|
27
|
-
- -
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '2.7'
|
30
|
+
- - ">="
|
28
31
|
- !ruby/object:Gem::Version
|
29
|
-
version: '
|
32
|
+
version: '2.7'
|
30
33
|
- !ruby/object:Gem::Dependency
|
31
34
|
name: capybara-mechanize
|
32
35
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
36
|
requirements:
|
35
|
-
- -
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: 1.5.0
|
40
|
+
- - ">="
|
36
41
|
- !ruby/object:Gem::Version
|
37
|
-
version:
|
42
|
+
version: 1.5.0
|
38
43
|
type: :runtime
|
39
44
|
prerelease: false
|
40
45
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
46
|
requirements:
|
43
|
-
- -
|
47
|
+
- - "~>"
|
44
48
|
- !ruby/object:Gem::Version
|
45
|
-
version:
|
49
|
+
version: 1.5.0
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 1.5.0
|
46
53
|
- !ruby/object:Gem::Dependency
|
47
|
-
name:
|
54
|
+
name: hamster
|
48
55
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
56
|
requirements:
|
51
|
-
- -
|
57
|
+
- - "~>"
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: 3.0.0
|
60
|
+
- - ">="
|
52
61
|
- !ruby/object:Gem::Version
|
53
|
-
version:
|
62
|
+
version: 3.0.0
|
54
63
|
type: :runtime
|
55
64
|
prerelease: false
|
56
65
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
66
|
requirements:
|
59
|
-
- -
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 3.0.0
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: 3.0.0
|
73
|
+
- !ruby/object:Gem::Dependency
|
74
|
+
name: hashie
|
75
|
+
requirement: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - "~>"
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 3.4.4
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 3.4.0
|
83
|
+
type: :runtime
|
84
|
+
prerelease: false
|
85
|
+
version_requirements: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 3.4.4
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: 3.4.0
|
93
|
+
- !ruby/object:Gem::Dependency
|
94
|
+
name: minitest
|
95
|
+
requirement: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - "~>"
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 4.7.5
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: 4.7.5
|
103
|
+
type: :development
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 4.7.5
|
110
|
+
- - ">="
|
60
111
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
112
|
+
version: 4.7.5
|
62
113
|
- !ruby/object:Gem::Dependency
|
63
114
|
name: pry
|
64
115
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
116
|
requirements:
|
67
|
-
- -
|
117
|
+
- - "~>"
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: 0.10.3
|
120
|
+
- - ">="
|
68
121
|
- !ruby/object:Gem::Version
|
69
|
-
version:
|
122
|
+
version: 0.10.3
|
70
123
|
type: :development
|
71
124
|
prerelease: false
|
72
125
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
126
|
requirements:
|
75
|
-
- -
|
127
|
+
- - "~>"
|
76
128
|
- !ruby/object:Gem::Version
|
77
|
-
version:
|
129
|
+
version: 0.10.3
|
130
|
+
- - ">="
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
version: 0.10.3
|
78
133
|
- !ruby/object:Gem::Dependency
|
79
134
|
name: yard
|
80
135
|
requirement: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
136
|
requirements:
|
83
|
-
- -
|
137
|
+
- - "~>"
|
138
|
+
- !ruby/object:Gem::Version
|
139
|
+
version: 0.8.7.6
|
140
|
+
- - ">="
|
84
141
|
- !ruby/object:Gem::Version
|
85
|
-
version:
|
142
|
+
version: 0.8.7
|
86
143
|
type: :development
|
87
144
|
prerelease: false
|
88
145
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
146
|
requirements:
|
91
|
-
- -
|
147
|
+
- - "~>"
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: 0.8.7.6
|
150
|
+
- - ">="
|
92
151
|
- !ruby/object:Gem::Version
|
93
|
-
version:
|
152
|
+
version: 0.8.7
|
94
153
|
- !ruby/object:Gem::Dependency
|
95
154
|
name: rake
|
96
155
|
requirement: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
156
|
requirements:
|
99
|
-
- -
|
157
|
+
- - "~>"
|
100
158
|
- !ruby/object:Gem::Version
|
101
|
-
version:
|
159
|
+
version: 11.2.2
|
160
|
+
- - ">="
|
161
|
+
- !ruby/object:Gem::Version
|
162
|
+
version: 11.2.0
|
102
163
|
type: :development
|
103
164
|
prerelease: false
|
104
165
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
166
|
requirements:
|
107
|
-
- -
|
167
|
+
- - "~>"
|
168
|
+
- !ruby/object:Gem::Version
|
169
|
+
version: 11.2.2
|
170
|
+
- - ">="
|
108
171
|
- !ruby/object:Gem::Version
|
109
|
-
version:
|
172
|
+
version: 11.2.0
|
110
173
|
- !ruby/object:Gem::Dependency
|
111
174
|
name: turn
|
112
175
|
requirement: !ruby/object:Gem::Requirement
|
113
|
-
none: false
|
114
176
|
requirements:
|
115
|
-
- -
|
177
|
+
- - "~>"
|
178
|
+
- !ruby/object:Gem::Version
|
179
|
+
version: 0.9.7
|
180
|
+
- - ">="
|
116
181
|
- !ruby/object:Gem::Version
|
117
|
-
version:
|
182
|
+
version: 0.9.7
|
118
183
|
type: :development
|
119
184
|
prerelease: false
|
120
185
|
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
186
|
requirements:
|
123
|
-
- -
|
187
|
+
- - "~>"
|
124
188
|
- !ruby/object:Gem::Version
|
125
|
-
version:
|
189
|
+
version: 0.9.7
|
190
|
+
- - ">="
|
191
|
+
- !ruby/object:Gem::Version
|
192
|
+
version: 0.9.7
|
126
193
|
- !ruby/object:Gem::Dependency
|
127
194
|
name: sinatra
|
128
195
|
requirement: !ruby/object:Gem::Requirement
|
129
|
-
none: false
|
130
196
|
requirements:
|
131
|
-
- -
|
197
|
+
- - "~>"
|
198
|
+
- !ruby/object:Gem::Version
|
199
|
+
version: 1.4.7
|
200
|
+
- - ">="
|
132
201
|
- !ruby/object:Gem::Version
|
133
|
-
version:
|
202
|
+
version: 1.4.7
|
134
203
|
type: :development
|
135
204
|
prerelease: false
|
136
205
|
version_requirements: !ruby/object:Gem::Requirement
|
137
|
-
none: false
|
138
206
|
requirements:
|
139
|
-
- -
|
207
|
+
- - "~>"
|
208
|
+
- !ruby/object:Gem::Version
|
209
|
+
version: 1.4.7
|
210
|
+
- - ">="
|
140
211
|
- !ruby/object:Gem::Version
|
141
|
-
version:
|
212
|
+
version: 1.4.7
|
142
213
|
description: A DSL to write web spider. Depend on capybara and capybara-webkit.
|
143
214
|
email:
|
144
215
|
- zshuaibin@gmail.com
|
@@ -146,43 +217,43 @@ executables: []
|
|
146
217
|
extensions: []
|
147
218
|
extra_rdoc_files: []
|
148
219
|
files:
|
220
|
+
- MIT-LICENSE
|
221
|
+
- README.md
|
222
|
+
- Rakefile
|
149
223
|
- lib/micro_spider.rb
|
224
|
+
- lib/spider_core.rb
|
150
225
|
- lib/spider_core/behavior.rb
|
151
226
|
- lib/spider_core/exceptions.rb
|
227
|
+
- lib/spider_core/excretion.rb
|
152
228
|
- lib/spider_core/field_dsl.rb
|
153
229
|
- lib/spider_core/follow_dsl.rb
|
154
230
|
- lib/spider_core/pagination_dsl.rb
|
155
231
|
- lib/spider_core/version.rb
|
156
|
-
- lib/spider_core.rb
|
157
|
-
- MIT-LICENSE
|
158
|
-
- Rakefile
|
159
|
-
- README.md
|
160
232
|
- test/micro_spider_test.rb
|
161
233
|
- test/test_helper.rb
|
162
234
|
homepage: https://github.com/zires/micro-spider
|
163
235
|
licenses:
|
164
236
|
- MIT
|
237
|
+
metadata: {}
|
165
238
|
post_install_message:
|
166
239
|
rdoc_options: []
|
167
240
|
require_paths:
|
168
241
|
- lib
|
169
242
|
required_ruby_version: !ruby/object:Gem::Requirement
|
170
|
-
none: false
|
171
243
|
requirements:
|
172
|
-
- -
|
244
|
+
- - ">="
|
173
245
|
- !ruby/object:Gem::Version
|
174
246
|
version: '0'
|
175
247
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
176
|
-
none: false
|
177
248
|
requirements:
|
178
|
-
- -
|
249
|
+
- - ">="
|
179
250
|
- !ruby/object:Gem::Version
|
180
251
|
version: '0'
|
181
252
|
requirements: []
|
182
253
|
rubyforge_project:
|
183
|
-
rubygems_version:
|
254
|
+
rubygems_version: 2.4.5
|
184
255
|
signing_key:
|
185
|
-
specification_version:
|
256
|
+
specification_version: 4
|
186
257
|
summary: A DSL to write web spider.
|
187
258
|
test_files:
|
188
259
|
- test/micro_spider_test.rb
|