micro_spider 0.1.17 → 0.1.18
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +32 -3
- data/lib/micro_spider.rb +25 -6
- data/lib/spider_core/pagination_dsl.rb +4 -3
- data/lib/spider_core/version.rb +1 -1
- data/test/micro_spider_test.rb +16 -0
- metadata +2 -2
data/README.md
CHANGED
@@ -1,4 +1,33 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# micro-spider
|
2
|
+
|
3
|
+
A DSL for writing web spiders. Depends on capybara and capybara-webkit.
|
4
|
+
|
5
|
+
# Example
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
require 'micro_spider'
|
9
|
+
spider = MicroSpider.new
|
10
|
+
|
11
|
+
spider.learn do
|
12
|
+
site 'http://www.bbc.com'
|
13
|
+
entrance '/news'
|
14
|
+
|
15
|
+
field :top_story, '#top-story h2 a'
|
16
|
+
|
17
|
+
follow '.story' do
|
18
|
+
|
19
|
+
field :title, 'h1.story-header'
|
20
|
+
field :body, '.story-body'
|
21
|
+
|
22
|
+
fields :related_stories, '.related-links-list a'
|
23
|
+
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
spider.crawl
|
29
|
+
|
30
|
+
```
|
31
|
+
|
32
|
+
|
3
33
|
|
4
|
-
web spider
|
data/lib/micro_spider.rb
CHANGED
@@ -10,6 +10,7 @@ end
|
|
10
10
|
|
11
11
|
require 'logger'
|
12
12
|
require 'set'
|
13
|
+
require 'timeout'
|
13
14
|
require 'spider_core'
|
14
15
|
|
15
16
|
class MicroSpider
|
@@ -21,11 +22,12 @@ class MicroSpider
|
|
21
22
|
include SpiderCore::PaginationDSL
|
22
23
|
|
23
24
|
attr_reader :excretion, :paths, :delay, :current_location, :visited_paths, :broken_paths
|
24
|
-
attr_accessor :logger, :actions, :recipe, :skip_set_entrance
|
25
|
+
attr_accessor :logger, :actions, :recipe, :skip_set_entrance, :timeout
|
25
26
|
|
26
27
|
def initialize(excretion = nil)
|
27
28
|
@paths = []
|
28
29
|
@actions = []
|
30
|
+
@timeout = 120
|
29
31
|
@excretion = excretion || { status: 'inprogress', results: [] }
|
30
32
|
@logger = Logger.new(STDOUT)
|
31
33
|
@visited_paths = Set.new
|
@@ -60,13 +62,21 @@ class MicroSpider
|
|
60
62
|
# Click the locator. This will trigger visit action and change current location.
|
61
63
|
# @param locator [String] the text or id of the link.
|
62
64
|
#
|
63
|
-
def click(locator, opts = {})
|
65
|
+
def click(locator, opts = {}, &block)
|
64
66
|
actions << lambda {
|
65
67
|
path = find_link(locator, opts)[:href]
|
66
|
-
|
68
|
+
if block_given?
|
69
|
+
spider = self.spawn
|
70
|
+
spider.entrance(path)
|
71
|
+
spider.learn(&block)
|
72
|
+
current_location[:click] ||= []
|
73
|
+
current_location[:click] << spider.crawl[:results]
|
74
|
+
else
|
75
|
+
visit(path)
|
76
|
+
end
|
67
77
|
}
|
68
78
|
end
|
69
|
-
|
79
|
+
|
70
80
|
# Teach the spider behaviors and it will repeat to the end.
|
71
81
|
# @param recipe [String, Proc] the recipe to be learned.
|
72
82
|
#
|
@@ -205,14 +215,23 @@ class MicroSpider
|
|
205
215
|
end
|
206
216
|
|
207
217
|
def execute_actions
|
208
|
-
actions.delete_if { |action|
|
218
|
+
actions.delete_if { |action|
|
219
|
+
begin
|
220
|
+
Timeout::timeout(@timeout) { action.call }
|
221
|
+
rescue Timeout::Error => err
|
222
|
+
logger.fatal('Timeout!!! execution expired when execute action')
|
223
|
+
logger.fatal(err.message)
|
224
|
+
logger.fatal(err.backtrace.inspect)
|
225
|
+
break
|
226
|
+
end
|
227
|
+
}
|
209
228
|
end
|
210
229
|
|
211
230
|
def spawn
|
212
231
|
spider = self.clone
|
213
232
|
spider.instance_variable_set(:@paths, [])
|
214
233
|
spider.instance_variable_set(:@actions, [])
|
215
|
-
spider.instance_variable_set(:@visited_paths,
|
234
|
+
spider.instance_variable_set(:@visited_paths, Set.new)
|
216
235
|
spider.instance_variable_set(:@broken_paths, Set.new)
|
217
236
|
spider.instance_variable_set(:@excretion, { status: 'inprogress', results: [] })
|
218
237
|
spider.skip_set_entrance = false
|
data/lib/spider_core/pagination_dsl.rb
CHANGED
@@ -3,11 +3,12 @@ module SpiderCore
|
|
3
3
|
|
4
4
|
attr_accessor :next_page, :skip_pages
|
5
5
|
|
6
|
-
def keep_eyes_on_next_page(pattern, opts = {})
|
6
|
+
def keep_eyes_on_next_page(pattern, opts = {}, &block)
|
7
7
|
kind = opts[:kind] || :css
|
8
8
|
actions << lambda {
|
9
|
-
|
10
|
-
|
9
|
+
element = first(kind, pattern)
|
10
|
+
path = block_given? ? yield(element) : element && element[:href]
|
11
|
+
@paths.unshift(path) if path
|
11
12
|
}
|
12
13
|
end
|
13
14
|
|
data/lib/spider_core/version.rb
CHANGED
data/test/micro_spider_test.rb
CHANGED
@@ -73,6 +73,22 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
|
|
73
73
|
assert_equal "Current Page #{$1}", f[:field].first[:current_page]
|
74
74
|
end
|
75
75
|
end
|
76
|
+
|
77
|
+
def test_spider_can_follow_and_keep_eyes_on_next_page
|
78
|
+
@spider.entrance('/page/1')
|
79
|
+
@spider.follow('a.next_page') do
|
80
|
+
keep_eyes_on_next_page('.pages a.next_page')
|
81
|
+
field :current_page, '#current_page'
|
82
|
+
end
|
83
|
+
excretion = @spider.crawl
|
84
|
+
excretion[:results].first[:follow].first.each do |f|
|
85
|
+
f[:entrance] =~ /\/page\/(\d)/
|
86
|
+
assert_equal "Current Page #{$1}", f[:field].first[:current_page]
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
def test_spider_can_nest_follow_lots_of_links_and_keep_eyes_on_next_page
|
91
|
+
end
|
76
92
|
|
77
93
|
def test_spider_can_create_custom_action
|
78
94
|
@spider.create_action(:save) do |result|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: micro_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.17
|
4
|
+
version: 0.1.18
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07
|
12
|
+
date: 2013-08-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: capybara
|