micro_spider 0.1.17 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +32 -3
- data/lib/micro_spider.rb +25 -6
- data/lib/spider_core/pagination_dsl.rb +4 -3
- data/lib/spider_core/version.rb +1 -1
- data/test/micro_spider_test.rb +16 -0
- metadata +2 -2
data/README.md
CHANGED
@@ -1,4 +1,33 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
# micro-spider
|
2
|
+
|
3
|
+
A DSL to write web spider. Depend on capybara and capybara-webkit.
|
4
|
+
|
5
|
+
# Example
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
require 'micro_spider'
|
9
|
+
spider = MicroSpider.new
|
10
|
+
|
11
|
+
spider.learn do
|
12
|
+
site 'http://www.bbc.com'
|
13
|
+
entrance '/news'
|
14
|
+
|
15
|
+
field :top_story, '#top-story h2 a'
|
16
|
+
|
17
|
+
follow '.story' do
|
18
|
+
|
19
|
+
field :title, 'h1.story-header'
|
20
|
+
field :body, '.story-body'
|
21
|
+
|
22
|
+
fields :related_stories, '.related-links-list a'
|
23
|
+
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
27
|
+
|
28
|
+
spider.crawl
|
29
|
+
|
30
|
+
```
|
31
|
+
|
32
|
+
|
3
33
|
|
4
|
-
web spider
|
data/lib/micro_spider.rb
CHANGED
@@ -10,6 +10,7 @@ end
|
|
10
10
|
|
11
11
|
require 'logger'
|
12
12
|
require 'set'
|
13
|
+
require 'timeout'
|
13
14
|
require 'spider_core'
|
14
15
|
|
15
16
|
class MicroSpider
|
@@ -21,11 +22,12 @@ class MicroSpider
|
|
21
22
|
include SpiderCore::PaginationDSL
|
22
23
|
|
23
24
|
attr_reader :excretion, :paths, :delay, :current_location, :visited_paths, :broken_paths
|
24
|
-
attr_accessor :logger, :actions, :recipe, :skip_set_entrance
|
25
|
+
attr_accessor :logger, :actions, :recipe, :skip_set_entrance, :timeout
|
25
26
|
|
26
27
|
def initialize(excretion = nil)
|
27
28
|
@paths = []
|
28
29
|
@actions = []
|
30
|
+
@timeout = 120
|
29
31
|
@excretion = excretion || { status: 'inprogress', results: [] }
|
30
32
|
@logger = Logger.new(STDOUT)
|
31
33
|
@visited_paths = Set.new
|
@@ -60,13 +62,21 @@ class MicroSpider
|
|
60
62
|
# Click the locator. This will trigger visit action and change current location.
|
61
63
|
# @params locator [String] the text or id of the link.
|
62
64
|
#
|
63
|
-
def click(locator, opts = {})
|
65
|
+
def click(locator, opts = {}, &block)
|
64
66
|
actions << lambda {
|
65
67
|
path = find_link(locator, opts)[:href]
|
66
|
-
|
68
|
+
if block_given?
|
69
|
+
spider = self.spawn
|
70
|
+
spider.entrance(path)
|
71
|
+
spider.learn(&block)
|
72
|
+
current_location[:click] ||= []
|
73
|
+
current_location[:click] << spider.crawl[:results]
|
74
|
+
else
|
75
|
+
visit(path)
|
76
|
+
end
|
67
77
|
}
|
68
78
|
end
|
69
|
-
|
79
|
+
|
70
80
|
# Teach the spider behaviors and it will repeat to the end.
|
71
81
|
# @param recipe [String, Proc] the recipe be learned.
|
72
82
|
#
|
@@ -205,14 +215,23 @@ class MicroSpider
|
|
205
215
|
end
|
206
216
|
|
207
217
|
def execute_actions
|
208
|
-
actions.delete_if { |action|
|
218
|
+
actions.delete_if { |action|
|
219
|
+
begin
|
220
|
+
Timeout::timeout(@timeout) { action.call }
|
221
|
+
rescue Timeout::Error => err
|
222
|
+
logger.fatal('Timeout!!! execution expired when execute action')
|
223
|
+
logger.fatal(err.message)
|
224
|
+
logger.fatal(err.backtrace.inspect)
|
225
|
+
break
|
226
|
+
end
|
227
|
+
}
|
209
228
|
end
|
210
229
|
|
211
230
|
def spawn
|
212
231
|
spider = self.clone
|
213
232
|
spider.instance_variable_set(:@paths, [])
|
214
233
|
spider.instance_variable_set(:@actions, [])
|
215
|
-
spider.instance_variable_set(:@visited_paths,
|
234
|
+
spider.instance_variable_set(:@visited_paths, Set.new)
|
216
235
|
spider.instance_variable_set(:@broken_paths, Set.new)
|
217
236
|
spider.instance_variable_set(:@excretion, { status: 'inprogress', results: [] })
|
218
237
|
spider.skip_set_entrance = false
|
data/lib/spider_core/pagination_dsl.rb
CHANGED
@@ -3,11 +3,12 @@ module SpiderCore
|
|
3
3
|
|
4
4
|
attr_accessor :next_page, :skip_pages
|
5
5
|
|
6
|
-
def keep_eyes_on_next_page(pattern, opts = {})
|
6
|
+
def keep_eyes_on_next_page(pattern, opts = {}, &block)
|
7
7
|
kind = opts[:kind] || :css
|
8
8
|
actions << lambda {
|
9
|
-
|
10
|
-
|
9
|
+
element = first(kind, pattern)
|
10
|
+
path = block_given? ? yield(element) : element && element[:href]
|
11
|
+
@paths.unshift(path) if path
|
11
12
|
}
|
12
13
|
end
|
13
14
|
|
data/lib/spider_core/version.rb
CHANGED
data/test/micro_spider_test.rb
CHANGED
@@ -73,6 +73,22 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
|
|
73
73
|
assert_equal "Current Page #{$1}", f[:field].first[:current_page]
|
74
74
|
end
|
75
75
|
end
|
76
|
+
|
77
|
+
def test_spider_can_follow_and_keep_eyes_on_next_page
|
78
|
+
@spider.entrance('/page/1')
|
79
|
+
@spider.follow('a.next_page') do
|
80
|
+
keep_eyes_on_next_page('.pages a.next_page')
|
81
|
+
field :current_page, '#current_page'
|
82
|
+
end
|
83
|
+
excretion = @spider.crawl
|
84
|
+
excretion[:results].first[:follow].first.each do |f|
|
85
|
+
f[:entrance] =~ /\/page\/(\d)/
|
86
|
+
assert_equal "Current Page #{$1}", f[:field].first[:current_page]
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
def test_spider_can_nest_follow_lots_of_links_and_keep_eyes_on_next_page
|
91
|
+
end
|
76
92
|
|
77
93
|
def test_spider_can_create_custom_action
|
78
94
|
@spider.create_action(:save) do |result|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: micro_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.17
|
4
|
+
version: 0.1.18
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-07
|
12
|
+
date: 2013-08-07 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: capybara
|