micro_spider 0.1.23 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +5 -12
- data/lib/micro_spider.rb +49 -33
- data/lib/spider_core.rb +1 -0
- data/lib/spider_core/behavior.rb +8 -6
- data/lib/spider_core/excretion.rb +13 -0
- data/lib/spider_core/field_dsl.rb +10 -30
- data/lib/spider_core/follow_dsl.rb +8 -6
- data/lib/spider_core/pagination_dsl.rb +2 -3
- data/lib/spider_core/version.rb +1 -1
- data/test/micro_spider_test.rb +60 -43
- data/test/test_helper.rb +1 -1
- metadata +133 -62
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: cedb84b01c8f87814486e68113ff9d69808749a0
|
4
|
+
data.tar.gz: 82fe718a38b980c0879b8b4eaec807b13b36e4fe
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 28035e8b24d4cb6ac59fcf97fbf34f661c610e054de37c28b2ae7de4e79f002cebfddfb0fdc064e6f8c3475b68683fd1b07055add0d76c76e3ea5a4700f31697
|
7
|
+
data.tar.gz: d6924f477e659bf14954eb4893d0c9247b892920cd1583a37024eceae2a105130b2c3c98013f7069ede665df27e92f0458c71bdae33d5e182a4cb80b4b5acaf9
|
data/README.md
CHANGED
@@ -11,22 +11,15 @@ spider = MicroSpider.new
|
|
11
11
|
spider.learn do
|
12
12
|
site 'http://www.bbc.com'
|
13
13
|
entrance '/news'
|
14
|
-
|
15
|
-
field :top_story, '#top-story h2 a'
|
16
|
-
|
17
|
-
follow '.story' do
|
18
|
-
|
19
|
-
field :title, 'h1.story-header'
|
20
|
-
field :body, '.story-body'
|
21
|
-
|
22
|
-
fields :related_stories, '.related-links-list a'
|
23
|
-
|
24
|
-
end
|
25
|
-
|
14
|
+
fields :top_stories, 'a.title-link'
|
26
15
|
end
|
27
16
|
|
28
17
|
spider.crawl
|
29
18
|
|
19
|
+
spider.get('top_stories')
|
20
|
+
# or
|
21
|
+
spider.excretion['/news']['top_stories']
|
22
|
+
|
30
23
|
```
|
31
24
|
|
32
25
|
|
data/lib/micro_spider.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require 'hashie'
|
1
2
|
require 'capybara'
|
2
3
|
require 'capybara/dsl'
|
3
4
|
require 'capybara/mechanize'
|
@@ -32,14 +33,16 @@ class MicroSpider
|
|
32
33
|
include SpiderCore::PaginationDSL
|
33
34
|
|
34
35
|
attr_reader :excretion, :paths, :delay, :current_location, :visited_paths, :broken_paths
|
35
|
-
attr_accessor :logger, :actions, :recipe, :skip_set_entrance, :timeout
|
36
|
+
attr_accessor :logger, :actions, :recipe, :skip_set_entrance, :timeout, :selector
|
36
37
|
|
37
|
-
def initialize(excretion = nil)
|
38
|
+
def initialize(excretion = nil, selector: :css)
|
39
|
+
@selector = selector
|
38
40
|
@paths = []
|
39
41
|
@actions = []
|
40
42
|
@setted_variables = {}
|
41
43
|
@timeout = 120
|
42
|
-
@
|
44
|
+
@status = 'pending'
|
45
|
+
@excretion = excretion || SpiderCore::Excretion.new
|
43
46
|
@logger = Logger.new(STDOUT)
|
44
47
|
@visited_paths = Set.new
|
45
48
|
@broken_paths = []
|
@@ -67,7 +70,7 @@ class MicroSpider
|
|
67
70
|
sleep_or_not
|
68
71
|
logger.info "Begin to visit #{path}."
|
69
72
|
super(path)
|
70
|
-
@current_location =
|
73
|
+
@current_location = SpiderCore::Excretion['_path' => path]
|
71
74
|
logger.info "Current location is #{path}."
|
72
75
|
end
|
73
76
|
|
@@ -84,16 +87,15 @@ class MicroSpider
|
|
84
87
|
# spider.set :table, '.tb a', selector: :css do |e|
|
85
88
|
# e['src']
|
86
89
|
# end
|
87
|
-
def set(name, value
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
end
|
90
|
+
def set(name, value)
|
91
|
+
@setted_variables[name.to_s] = value
|
92
|
+
end
|
93
|
+
|
94
|
+
def set_on(name, pattern, &block)
|
95
|
+
actions << lambda {
|
96
|
+
element = scan_first(pattern)
|
97
|
+
@setted_variables[name.to_s] = block_given? ? yield(element) : handle_element(element)
|
98
|
+
}
|
97
99
|
end
|
98
100
|
|
99
101
|
# Click the locator. This will trigger visit action and change current location.
|
@@ -107,8 +109,9 @@ class MicroSpider
|
|
107
109
|
spider = self.spawn
|
108
110
|
spider.entrance(path)
|
109
111
|
spider.learn(&block)
|
110
|
-
|
111
|
-
|
112
|
+
put(
|
113
|
+
"click::#{path}", spider.crawl
|
114
|
+
)
|
112
115
|
else
|
113
116
|
visit(path)
|
114
117
|
end
|
@@ -153,7 +156,7 @@ class MicroSpider
|
|
153
156
|
|
154
157
|
def site(url)
|
155
158
|
return if @site
|
156
|
-
Capybara.app_host = @
|
159
|
+
Capybara.app_host = @site = url
|
157
160
|
end
|
158
161
|
|
159
162
|
# This will be the first path for spider to visit.
|
@@ -171,21 +174,25 @@ class MicroSpider
|
|
171
174
|
@paths += path_or_paths
|
172
175
|
end
|
173
176
|
|
177
|
+
def with(pattern, path:, &block)
|
178
|
+
visit(path)
|
179
|
+
scan_all(pattern).map{ |element| yield(element) }
|
180
|
+
end
|
181
|
+
|
174
182
|
# Sometimes the entrances are on the page.
|
175
183
|
# @param path [String] path to visit
|
176
184
|
# @param pattern [String, Regexp] links pattern
|
177
185
|
#
|
178
186
|
# @example
|
179
187
|
# spider = MicroSpider.new
|
180
|
-
# spider.
|
188
|
+
# spider.entrance_on('.links a')
|
189
|
+
# spider.entrance_on('.links a', path: '/a')
|
181
190
|
#
|
182
|
-
def
|
191
|
+
def entrance_on(pattern, path: '/', attr: :href)
|
183
192
|
return if @skip_set_entrance
|
184
|
-
|
193
|
+
|
185
194
|
visit(path)
|
186
|
-
entrances = scan_all(
|
187
|
-
block_given? ? yield(element) : element[:href]
|
188
|
-
end
|
195
|
+
entrances = scan_all(pattern).map{ |element| element[attr] }
|
189
196
|
@paths += entrances.to_a
|
190
197
|
end
|
191
198
|
|
@@ -209,12 +216,14 @@ class MicroSpider
|
|
209
216
|
|
210
217
|
begin
|
211
218
|
visit(path)
|
219
|
+
@status = 'inprogress'
|
212
220
|
rescue Timeout::Error => err
|
213
221
|
@broken_paths << path
|
214
222
|
logger.fatal("Timeout!!! execution expired when visit `#{path}`")
|
215
223
|
logger.fatal(err)
|
216
224
|
rescue SystemExit, Interrupt
|
217
225
|
logger.fatal("SystemExit && Interrupt")
|
226
|
+
@status = 'exit'
|
218
227
|
exit!
|
219
228
|
rescue Exception => err
|
220
229
|
@broken_paths << path
|
@@ -224,8 +233,8 @@ class MicroSpider
|
|
224
233
|
else
|
225
234
|
@visited_paths << path
|
226
235
|
execute_actions
|
227
|
-
yield(@current_location) if block_given?
|
228
|
-
excretion
|
236
|
+
#yield(@current_location) if block_given?
|
237
|
+
@excretion = @excretion.put(path, @current_location)
|
229
238
|
ensure
|
230
239
|
@actions = []
|
231
240
|
@skip_set_entrance = true
|
@@ -238,7 +247,8 @@ class MicroSpider
|
|
238
247
|
def reset
|
239
248
|
return unless completed?
|
240
249
|
@paths = visited_paths.to_a
|
241
|
-
@
|
250
|
+
@status = 'pending'
|
251
|
+
@excretion = nil
|
242
252
|
@visited_paths = Set.new
|
243
253
|
@current_location = nil
|
244
254
|
end
|
@@ -257,7 +267,7 @@ class MicroSpider
|
|
257
267
|
# spider.save
|
258
268
|
#
|
259
269
|
def create_action(name, &block)
|
260
|
-
action = proc { actions << lambda { block.call(
|
270
|
+
action = proc { actions << lambda { block.call(@excretion) } }
|
261
271
|
metaclass.send :define_method, name, &action
|
262
272
|
end
|
263
273
|
|
@@ -269,10 +279,12 @@ class MicroSpider
|
|
269
279
|
logger.fatal('Timeout!!! execution expired when execute action')
|
270
280
|
logger.fatal(err.message)
|
271
281
|
logger.fatal(err.backtrace.inspect)
|
282
|
+
@visited_paths.pop
|
272
283
|
break
|
273
284
|
rescue SpiderCore::ClickPathNotFound => err
|
274
285
|
logger.fatal(err.message)
|
275
286
|
logger.fatal(err.backtrace.inspect)
|
287
|
+
@visited_paths.pop
|
276
288
|
break
|
277
289
|
end
|
278
290
|
}
|
@@ -292,22 +304,26 @@ class MicroSpider
|
|
292
304
|
spider = self.class.new
|
293
305
|
spider.logger = logger
|
294
306
|
spider.timeout = timeout
|
307
|
+
spider.site(@site)
|
295
308
|
spider.learn(&block) if block_given?
|
296
309
|
spider
|
297
310
|
end
|
298
311
|
|
299
|
-
def results
|
300
|
-
excretion[:results]
|
301
|
-
end
|
302
|
-
|
303
312
|
def completed?
|
304
|
-
|
313
|
+
@status == 'completed'
|
305
314
|
end
|
306
315
|
|
307
316
|
def metaclass
|
308
317
|
class << self; self; end
|
309
318
|
end
|
310
319
|
|
320
|
+
def get(field)
|
321
|
+
@_deep_fetch ||= excretion.extend Hashie::Extensions::DeepFind
|
322
|
+
result = @_deep_fetch.deep_find_all(field.to_s)
|
323
|
+
return if result.nil?
|
324
|
+
result.length == 1 ? result.pop : result
|
325
|
+
end
|
326
|
+
|
311
327
|
# The default page is Capybara.current_session.
|
312
328
|
# Share one page may cause difficult issue, so here i separate it.
|
313
329
|
def page
|
@@ -335,7 +351,7 @@ class MicroSpider
|
|
335
351
|
end
|
336
352
|
|
337
353
|
def complete
|
338
|
-
|
354
|
+
@status = 'completed'
|
339
355
|
suicide
|
340
356
|
end
|
341
357
|
|
data/lib/spider_core.rb
CHANGED
data/lib/spider_core/behavior.rb
CHANGED
@@ -1,14 +1,12 @@
|
|
1
|
-
require 'enumerable/lazy' if RUBY_VERSION < '2.0'
|
2
|
-
|
3
1
|
module SpiderCore
|
4
2
|
module Behavior
|
5
3
|
|
6
4
|
protected
|
7
5
|
|
8
|
-
def scan_all(
|
6
|
+
def scan_all(pattern, opts = {})
|
9
7
|
pattern = handle_pattern(pattern)
|
10
8
|
if pattern.is_a?(String)
|
11
|
-
elements = all(
|
9
|
+
elements = all(selector, pattern).lazy
|
12
10
|
if opts[:limit] && opts[:limit].to_i > 0
|
13
11
|
elements = elements.take(opts[:limit].to_i)
|
14
12
|
end
|
@@ -18,10 +16,10 @@ module SpiderCore
|
|
18
16
|
end
|
19
17
|
end
|
20
18
|
|
21
|
-
def scan_first(
|
19
|
+
def scan_first(pattern)
|
22
20
|
pattern = handle_pattern(pattern)
|
23
21
|
if pattern.is_a?(String)
|
24
|
-
first(
|
22
|
+
first(selector, pattern)
|
25
23
|
elsif pattern.is_a?(Regexp)
|
26
24
|
html[pattern, 1]
|
27
25
|
end
|
@@ -61,5 +59,9 @@ module SpiderCore
|
|
61
59
|
pattern
|
62
60
|
end
|
63
61
|
|
62
|
+
def put(display, value)
|
63
|
+
@current_location = @current_location.put(display, value)
|
64
|
+
end
|
65
|
+
|
64
66
|
end
|
65
67
|
end
|
@@ -5,40 +5,22 @@ module SpiderCore
|
|
5
5
|
#
|
6
6
|
# @param display [String] display name
|
7
7
|
def field(display, pattern, opts = {}, &block)
|
8
|
-
kind = opts[:kind] || :css
|
9
8
|
actions << lambda {
|
10
|
-
action_for(:field, {display: display, pattern: pattern
|
9
|
+
action_for(:field, {display: display, pattern: pattern}, opts, &block)
|
11
10
|
}
|
12
11
|
end
|
13
12
|
|
14
|
-
def css_field(display, pattern, opts = {}, &block)
|
15
|
-
field(display, pattern, opts.merge(kind: :css), &block)
|
16
|
-
end
|
17
|
-
|
18
|
-
def xpath_field(display, pattern, opts = {}, &block)
|
19
|
-
field(display, pattern, opts.merge(kind: :xpath), &block)
|
20
|
-
end
|
21
|
-
|
22
13
|
def fields(display, pattern, opts = {}, &block)
|
23
|
-
kind = opts[:kind] || :css
|
24
14
|
actions << lambda {
|
25
|
-
action_for(:fields, {display: display, pattern: pattern
|
15
|
+
action_for(:fields, {display: display, pattern: pattern}, opts, &block)
|
26
16
|
}
|
27
17
|
end
|
28
18
|
|
29
|
-
def css_fields(display, pattern, opts = {}, &block)
|
30
|
-
fields(display, pattern, opts.merge(kind: :css), &block)
|
31
|
-
end
|
32
|
-
|
33
|
-
def xpath_fields(display, pattern, opts = {}, &block)
|
34
|
-
fields(display, pattern, opts.merge(kind: :xpath), &block)
|
35
|
-
end
|
36
|
-
|
37
19
|
def foreach(pattern, opts = {}, &block)
|
38
20
|
return unless block_given?
|
39
|
-
|
21
|
+
|
40
22
|
actions << lambda {
|
41
|
-
scan_all(
|
23
|
+
scan_all(pattern, opts).each do |element|
|
42
24
|
yield(element)
|
43
25
|
end
|
44
26
|
}
|
@@ -52,24 +34,22 @@ module SpiderCore
|
|
52
34
|
|
53
35
|
elements = case action
|
54
36
|
when :field
|
55
|
-
scan_first
|
37
|
+
scan_first action_opts[:pattern]
|
56
38
|
when :fields
|
57
|
-
scan_all
|
39
|
+
scan_all action_opts[:pattern], opts
|
58
40
|
else
|
59
41
|
raise 'Unknow action.'
|
60
42
|
end
|
61
43
|
|
62
|
-
|
44
|
+
put(
|
45
|
+
action_opts[:display].to_s,
|
46
|
+
handle_elements(elements, &block)
|
47
|
+
)
|
63
48
|
rescue Exception => err
|
64
49
|
logger.fatal("Caught exception when get `#{action_opts[:pattern]}`.")
|
65
50
|
logger.fatal(err)
|
66
51
|
end
|
67
52
|
end
|
68
53
|
|
69
|
-
def make_field_result(display, field)
|
70
|
-
current_location[:field] ||= []
|
71
|
-
current_location[:field] << {display => field}
|
72
|
-
end
|
73
|
-
|
74
54
|
end
|
75
55
|
end
|
@@ -3,19 +3,21 @@ module SpiderCore
|
|
3
3
|
|
4
4
|
attr_accessor :skip_followers
|
5
5
|
|
6
|
-
def follow(pattern,
|
6
|
+
def follow(pattern, attr: :href, **opts, &block)
|
7
7
|
return unless block_given?
|
8
|
-
|
8
|
+
|
9
9
|
actions << lambda {
|
10
10
|
spider = self.spawn
|
11
11
|
spider.learn(&block)
|
12
|
-
scan_all(
|
12
|
+
scan_all(pattern, opts).each do |element|
|
13
13
|
next if skip_followers && skip_followers.include?(element[:href])
|
14
|
+
|
14
15
|
spider.skip_set_entrance = false
|
15
|
-
spider.entrance(element[
|
16
|
+
spider.entrance(element[attr])
|
16
17
|
end
|
17
|
-
|
18
|
-
|
18
|
+
put(
|
19
|
+
"follow::#{pattern}", spider.crawl
|
20
|
+
)
|
19
21
|
}
|
20
22
|
end
|
21
23
|
|
@@ -3,10 +3,9 @@ module SpiderCore
|
|
3
3
|
|
4
4
|
attr_accessor :next_page, :skip_pages
|
5
5
|
|
6
|
-
def keep_eyes_on_next_page(pattern,
|
7
|
-
kind = opts[:kind] || :css
|
6
|
+
def keep_eyes_on_next_page(pattern, attr: :href, &block)
|
8
7
|
actions << lambda {
|
9
|
-
element = first(
|
8
|
+
element = first(pattern)
|
10
9
|
path = block_given? ? yield(element) : element && element[:href]
|
11
10
|
@paths.unshift(path) if path
|
12
11
|
}
|
data/lib/spider_core/version.rb
CHANGED
data/test/micro_spider_test.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
require 'test_helper'
|
2
2
|
|
3
|
-
class MicroSpiderTest <
|
3
|
+
class MicroSpiderTest < Minitest::Unit::TestCase
|
4
4
|
|
5
5
|
def setup
|
6
|
-
|
7
|
-
|
6
|
+
@spider = MicroSpider.new
|
7
|
+
@spider.logger.level = Logger::WARN
|
8
8
|
end
|
9
9
|
|
10
10
|
def test_spider_can_visit_path_with_some_delays
|
@@ -16,22 +16,36 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
|
|
16
16
|
assert (Time.now - now) > 5
|
17
17
|
end
|
18
18
|
|
19
|
+
def test_spider_can_get_field
|
20
|
+
@spider.learn do
|
21
|
+
entrance '/'
|
22
|
+
entrance '/a'
|
23
|
+
field :name, '#name'
|
24
|
+
end
|
25
|
+
excretion = @spider.crawl
|
26
|
+
assert_equal 'Home', excretion['/']['name']
|
27
|
+
assert_equal 'This is a', excretion['/a']['name']
|
28
|
+
assert_includes @spider.get('name'), 'Home'
|
29
|
+
assert_includes @spider.get('name'), 'This is a'
|
30
|
+
assert_equal nil, @spider.get('name1')
|
31
|
+
end
|
32
|
+
|
19
33
|
def test_spider_can_follow_lots_of_links
|
20
34
|
@spider.entrance('/')
|
21
35
|
@spider.follow('.links a') do
|
22
36
|
field :name, '#name'
|
23
37
|
end
|
24
38
|
excretion = @spider.crawl
|
25
|
-
excretion[
|
26
|
-
case
|
39
|
+
excretion['/']["follow::.links a"].each do |path, value|
|
40
|
+
case path
|
27
41
|
when '/a'
|
28
|
-
assert_equal 'This is a',
|
42
|
+
assert_equal 'This is a', value.get('name')
|
29
43
|
when '/b'
|
30
|
-
assert_equal 'This is b',
|
44
|
+
assert_equal 'This is b', value.get('name')
|
31
45
|
when '/c'
|
32
|
-
assert_equal 'This is c',
|
46
|
+
assert_equal 'This is c', value.get('name')
|
33
47
|
when '/d'
|
34
|
-
assert_equal 'This is d',
|
48
|
+
assert_equal 'This is d', value.get('name')
|
35
49
|
end
|
36
50
|
end
|
37
51
|
end
|
@@ -44,18 +58,17 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
|
|
44
58
|
end
|
45
59
|
end
|
46
60
|
excretion = @spider.crawl
|
47
|
-
excretion[
|
48
|
-
|
49
|
-
|
50
|
-
case ff[:entrance]
|
61
|
+
excretion['/']["follow::.links a"].each do |key, value|
|
62
|
+
value["follow::.links a"].each do |k, v|
|
63
|
+
case k
|
51
64
|
when '/a'
|
52
|
-
assert_equal 'This is a',
|
65
|
+
assert_equal 'This is a', v.get('name')
|
53
66
|
when '/b'
|
54
|
-
assert_equal 'This is b',
|
67
|
+
assert_equal 'This is b', v.get('name')
|
55
68
|
when '/c'
|
56
|
-
assert_equal 'This is c',
|
69
|
+
assert_equal 'This is c', v.get('name')
|
57
70
|
when '/d'
|
58
|
-
assert_equal 'This is d',
|
71
|
+
assert_equal 'This is d', v.get('name')
|
59
72
|
end
|
60
73
|
end
|
61
74
|
end
|
@@ -68,12 +81,12 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
|
|
68
81
|
field(:current_page, '#current_page')
|
69
82
|
end
|
70
83
|
excretion = @spider.crawl
|
71
|
-
excretion
|
72
|
-
|
73
|
-
assert_equal "Current Page #{$1}",
|
84
|
+
excretion.each do |k,v|
|
85
|
+
k =~ /\/page\/(\d)/
|
86
|
+
assert_equal "Current Page #{$1}", v.get('current_page')
|
74
87
|
end
|
75
88
|
end
|
76
|
-
|
89
|
+
|
77
90
|
def test_spider_can_follow_and_keep_eyes_on_next_page
|
78
91
|
@spider.entrance('/page/1')
|
79
92
|
@spider.follow('a.next_page') do
|
@@ -81,9 +94,9 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
|
|
81
94
|
field :current_page, '#current_page'
|
82
95
|
end
|
83
96
|
excretion = @spider.crawl
|
84
|
-
excretion[
|
85
|
-
|
86
|
-
assert_equal "Current Page #{$1}",
|
97
|
+
excretion['/page/1']['follow::a.next_page'].each do |k, v|
|
98
|
+
k =~ /\/page\/(\d)/
|
99
|
+
assert_equal "Current Page #{$1}", v.get('current_page')
|
87
100
|
end
|
88
101
|
end
|
89
102
|
|
@@ -91,33 +104,37 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
|
|
91
104
|
end
|
92
105
|
|
93
106
|
def test_spider_can_create_custom_action
|
107
|
+
@saved = false
|
94
108
|
@spider.create_action(:save) do |result|
|
95
|
-
|
96
|
-
end
|
97
|
-
@spider.learn do
|
98
|
-
entrance '/'
|
99
|
-
field :name, '#name'
|
100
|
-
save
|
101
|
-
end
|
102
|
-
excretion = @spider.crawl
|
103
|
-
assert_equal 'saved', excretion[:results].first[:save]
|
104
|
-
end
|
105
|
-
|
106
|
-
def test_spider_can_create_custom_action_reached_by_spawn
|
107
|
-
@spider.create_action(:save) do |result|
|
108
|
-
result[:save] = 'saved'
|
109
|
+
@saved = true
|
109
110
|
end
|
110
111
|
@spider.learn do
|
111
112
|
entrance '/'
|
112
113
|
field :name, '#name'
|
113
114
|
save
|
114
|
-
follow '.links a' do
|
115
|
-
field :name, '#name'
|
116
|
-
save
|
117
|
-
end
|
118
115
|
end
|
119
116
|
excretion = @spider.crawl
|
120
|
-
assert_equal
|
117
|
+
assert_equal true, @saved
|
118
|
+
assert_equal 'Home', excretion['/']['name']
|
119
|
+
assert_equal 'Home', @spider.get('name')
|
121
120
|
end
|
122
121
|
|
122
|
+
#def test_spider_can_create_custom_action_reached_by_spawn
|
123
|
+
#@saved = false
|
124
|
+
#@spider.create_action(:save) do |result|
|
125
|
+
#@saved = true
|
126
|
+
#end
|
127
|
+
#@spider.learn do
|
128
|
+
#entrance '/'
|
129
|
+
#field :name, '#name'
|
130
|
+
#save
|
131
|
+
#follow '.links a' do
|
132
|
+
#field :name, '#name'
|
133
|
+
#save
|
134
|
+
#end
|
135
|
+
#end
|
136
|
+
#excretion = @spider.crawl
|
137
|
+
#require 'pry'; binding.pry
|
138
|
+
#assert_equal true, @saved
|
139
|
+
#end
|
123
140
|
end
|
data/test/test_helper.rb
CHANGED
metadata
CHANGED
@@ -1,144 +1,215 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: micro_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.2.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- zires
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2016-06-30 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: capybara
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- -
|
17
|
+
- - "~>"
|
20
18
|
- !ruby/object:Gem::Version
|
21
|
-
version: '
|
19
|
+
version: '2.7'
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: '2.7'
|
22
23
|
type: :runtime
|
23
24
|
prerelease: false
|
24
25
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
26
|
requirements:
|
27
|
-
- -
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '2.7'
|
30
|
+
- - ">="
|
28
31
|
- !ruby/object:Gem::Version
|
29
|
-
version: '
|
32
|
+
version: '2.7'
|
30
33
|
- !ruby/object:Gem::Dependency
|
31
34
|
name: capybara-mechanize
|
32
35
|
requirement: !ruby/object:Gem::Requirement
|
33
|
-
none: false
|
34
36
|
requirements:
|
35
|
-
- -
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: 1.5.0
|
40
|
+
- - ">="
|
36
41
|
- !ruby/object:Gem::Version
|
37
|
-
version:
|
42
|
+
version: 1.5.0
|
38
43
|
type: :runtime
|
39
44
|
prerelease: false
|
40
45
|
version_requirements: !ruby/object:Gem::Requirement
|
41
|
-
none: false
|
42
46
|
requirements:
|
43
|
-
- -
|
47
|
+
- - "~>"
|
44
48
|
- !ruby/object:Gem::Version
|
45
|
-
version:
|
49
|
+
version: 1.5.0
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 1.5.0
|
46
53
|
- !ruby/object:Gem::Dependency
|
47
|
-
name:
|
54
|
+
name: hamster
|
48
55
|
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
56
|
requirements:
|
51
|
-
- -
|
57
|
+
- - "~>"
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: 3.0.0
|
60
|
+
- - ">="
|
52
61
|
- !ruby/object:Gem::Version
|
53
|
-
version:
|
62
|
+
version: 3.0.0
|
54
63
|
type: :runtime
|
55
64
|
prerelease: false
|
56
65
|
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
66
|
requirements:
|
59
|
-
- -
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 3.0.0
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: 3.0.0
|
73
|
+
- !ruby/object:Gem::Dependency
|
74
|
+
name: hashie
|
75
|
+
requirement: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - "~>"
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 3.4.4
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 3.4.0
|
83
|
+
type: :runtime
|
84
|
+
prerelease: false
|
85
|
+
version_requirements: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 3.4.4
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: 3.4.0
|
93
|
+
- !ruby/object:Gem::Dependency
|
94
|
+
name: minitest
|
95
|
+
requirement: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - "~>"
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 4.7.5
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: 4.7.5
|
103
|
+
type: :development
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 4.7.5
|
110
|
+
- - ">="
|
60
111
|
- !ruby/object:Gem::Version
|
61
|
-
version:
|
112
|
+
version: 4.7.5
|
62
113
|
- !ruby/object:Gem::Dependency
|
63
114
|
name: pry
|
64
115
|
requirement: !ruby/object:Gem::Requirement
|
65
|
-
none: false
|
66
116
|
requirements:
|
67
|
-
- -
|
117
|
+
- - "~>"
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: 0.10.3
|
120
|
+
- - ">="
|
68
121
|
- !ruby/object:Gem::Version
|
69
|
-
version:
|
122
|
+
version: 0.10.3
|
70
123
|
type: :development
|
71
124
|
prerelease: false
|
72
125
|
version_requirements: !ruby/object:Gem::Requirement
|
73
|
-
none: false
|
74
126
|
requirements:
|
75
|
-
- -
|
127
|
+
- - "~>"
|
76
128
|
- !ruby/object:Gem::Version
|
77
|
-
version:
|
129
|
+
version: 0.10.3
|
130
|
+
- - ">="
|
131
|
+
- !ruby/object:Gem::Version
|
132
|
+
version: 0.10.3
|
78
133
|
- !ruby/object:Gem::Dependency
|
79
134
|
name: yard
|
80
135
|
requirement: !ruby/object:Gem::Requirement
|
81
|
-
none: false
|
82
136
|
requirements:
|
83
|
-
- -
|
137
|
+
- - "~>"
|
138
|
+
- !ruby/object:Gem::Version
|
139
|
+
version: 0.8.7.6
|
140
|
+
- - ">="
|
84
141
|
- !ruby/object:Gem::Version
|
85
|
-
version:
|
142
|
+
version: 0.8.7
|
86
143
|
type: :development
|
87
144
|
prerelease: false
|
88
145
|
version_requirements: !ruby/object:Gem::Requirement
|
89
|
-
none: false
|
90
146
|
requirements:
|
91
|
-
- -
|
147
|
+
- - "~>"
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: 0.8.7.6
|
150
|
+
- - ">="
|
92
151
|
- !ruby/object:Gem::Version
|
93
|
-
version:
|
152
|
+
version: 0.8.7
|
94
153
|
- !ruby/object:Gem::Dependency
|
95
154
|
name: rake
|
96
155
|
requirement: !ruby/object:Gem::Requirement
|
97
|
-
none: false
|
98
156
|
requirements:
|
99
|
-
- -
|
157
|
+
- - "~>"
|
100
158
|
- !ruby/object:Gem::Version
|
101
|
-
version:
|
159
|
+
version: 11.2.2
|
160
|
+
- - ">="
|
161
|
+
- !ruby/object:Gem::Version
|
162
|
+
version: 11.2.0
|
102
163
|
type: :development
|
103
164
|
prerelease: false
|
104
165
|
version_requirements: !ruby/object:Gem::Requirement
|
105
|
-
none: false
|
106
166
|
requirements:
|
107
|
-
- -
|
167
|
+
- - "~>"
|
168
|
+
- !ruby/object:Gem::Version
|
169
|
+
version: 11.2.2
|
170
|
+
- - ">="
|
108
171
|
- !ruby/object:Gem::Version
|
109
|
-
version:
|
172
|
+
version: 11.2.0
|
110
173
|
- !ruby/object:Gem::Dependency
|
111
174
|
name: turn
|
112
175
|
requirement: !ruby/object:Gem::Requirement
|
113
|
-
none: false
|
114
176
|
requirements:
|
115
|
-
- -
|
177
|
+
- - "~>"
|
178
|
+
- !ruby/object:Gem::Version
|
179
|
+
version: 0.9.7
|
180
|
+
- - ">="
|
116
181
|
- !ruby/object:Gem::Version
|
117
|
-
version:
|
182
|
+
version: 0.9.7
|
118
183
|
type: :development
|
119
184
|
prerelease: false
|
120
185
|
version_requirements: !ruby/object:Gem::Requirement
|
121
|
-
none: false
|
122
186
|
requirements:
|
123
|
-
- -
|
187
|
+
- - "~>"
|
124
188
|
- !ruby/object:Gem::Version
|
125
|
-
version:
|
189
|
+
version: 0.9.7
|
190
|
+
- - ">="
|
191
|
+
- !ruby/object:Gem::Version
|
192
|
+
version: 0.9.7
|
126
193
|
- !ruby/object:Gem::Dependency
|
127
194
|
name: sinatra
|
128
195
|
requirement: !ruby/object:Gem::Requirement
|
129
|
-
none: false
|
130
196
|
requirements:
|
131
|
-
- -
|
197
|
+
- - "~>"
|
198
|
+
- !ruby/object:Gem::Version
|
199
|
+
version: 1.4.7
|
200
|
+
- - ">="
|
132
201
|
- !ruby/object:Gem::Version
|
133
|
-
version:
|
202
|
+
version: 1.4.7
|
134
203
|
type: :development
|
135
204
|
prerelease: false
|
136
205
|
version_requirements: !ruby/object:Gem::Requirement
|
137
|
-
none: false
|
138
206
|
requirements:
|
139
|
-
- -
|
207
|
+
- - "~>"
|
208
|
+
- !ruby/object:Gem::Version
|
209
|
+
version: 1.4.7
|
210
|
+
- - ">="
|
140
211
|
- !ruby/object:Gem::Version
|
141
|
-
version:
|
212
|
+
version: 1.4.7
|
142
213
|
description: A DSL to write web spider. Depend on capybara and capybara-webkit.
|
143
214
|
email:
|
144
215
|
- zshuaibin@gmail.com
|
@@ -146,43 +217,43 @@ executables: []
|
|
146
217
|
extensions: []
|
147
218
|
extra_rdoc_files: []
|
148
219
|
files:
|
220
|
+
- MIT-LICENSE
|
221
|
+
- README.md
|
222
|
+
- Rakefile
|
149
223
|
- lib/micro_spider.rb
|
224
|
+
- lib/spider_core.rb
|
150
225
|
- lib/spider_core/behavior.rb
|
151
226
|
- lib/spider_core/exceptions.rb
|
227
|
+
- lib/spider_core/excretion.rb
|
152
228
|
- lib/spider_core/field_dsl.rb
|
153
229
|
- lib/spider_core/follow_dsl.rb
|
154
230
|
- lib/spider_core/pagination_dsl.rb
|
155
231
|
- lib/spider_core/version.rb
|
156
|
-
- lib/spider_core.rb
|
157
|
-
- MIT-LICENSE
|
158
|
-
- Rakefile
|
159
|
-
- README.md
|
160
232
|
- test/micro_spider_test.rb
|
161
233
|
- test/test_helper.rb
|
162
234
|
homepage: https://github.com/zires/micro-spider
|
163
235
|
licenses:
|
164
236
|
- MIT
|
237
|
+
metadata: {}
|
165
238
|
post_install_message:
|
166
239
|
rdoc_options: []
|
167
240
|
require_paths:
|
168
241
|
- lib
|
169
242
|
required_ruby_version: !ruby/object:Gem::Requirement
|
170
|
-
none: false
|
171
243
|
requirements:
|
172
|
-
- -
|
244
|
+
- - ">="
|
173
245
|
- !ruby/object:Gem::Version
|
174
246
|
version: '0'
|
175
247
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
176
|
-
none: false
|
177
248
|
requirements:
|
178
|
-
- -
|
249
|
+
- - ">="
|
179
250
|
- !ruby/object:Gem::Version
|
180
251
|
version: '0'
|
181
252
|
requirements: []
|
182
253
|
rubyforge_project:
|
183
|
-
rubygems_version:
|
254
|
+
rubygems_version: 2.4.5
|
184
255
|
signing_key:
|
185
|
-
specification_version:
|
256
|
+
specification_version: 4
|
186
257
|
summary: A DSL to write web spider.
|
187
258
|
test_files:
|
188
259
|
- test/micro_spider_test.rb
|