micro_spider 0.1.17 → 0.1.18

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -1,4 +1,33 @@
1
- tiny-spider
2
- ===========
1
+ # micro-spider
2
+
3
+ A DSL for writing web spiders. Depends on capybara and capybara-webkit.
4
+
5
+ # Example
6
+
7
+ ```ruby
8
+ require 'micro_spider'
9
+ spider = MicroSpider.new
10
+
11
+ spider.learn do
12
+ site 'http://www.bbc.com'
13
+ entrance '/news'
14
+
15
+ field :top_story, '#top-story h2 a'
16
+
17
+ follow '.story' do
18
+
19
+ field :title, 'h1.story-header'
20
+ field :body, '.story-body'
21
+
22
+ fields :related_stories, '.related-links-list a'
23
+
24
+ end
25
+
26
+ end
27
+
28
+ spider.crawl
29
+
30
+ ```
31
+
32
+
3
33
 
4
- web spider
data/lib/micro_spider.rb CHANGED
@@ -10,6 +10,7 @@ end
10
10
 
11
11
  require 'logger'
12
12
  require 'set'
13
+ require 'timeout'
13
14
  require 'spider_core'
14
15
 
15
16
  class MicroSpider
@@ -21,11 +22,12 @@ class MicroSpider
21
22
  include SpiderCore::PaginationDSL
22
23
 
23
24
  attr_reader :excretion, :paths, :delay, :current_location, :visited_paths, :broken_paths
24
- attr_accessor :logger, :actions, :recipe, :skip_set_entrance
25
+ attr_accessor :logger, :actions, :recipe, :skip_set_entrance, :timeout
25
26
 
26
27
  def initialize(excretion = nil)
27
28
  @paths = []
28
29
  @actions = []
30
+ @timeout = 120
29
31
  @excretion = excretion || { status: 'inprogress', results: [] }
30
32
  @logger = Logger.new(STDOUT)
31
33
  @visited_paths = Set.new
@@ -60,13 +62,21 @@ class MicroSpider
60
62
  # Click the locator. This will trigger a visit action and change the current location.
61
63
  # @param locator [String] the text or id of the link.
62
64
  #
63
- def click(locator, opts = {})
65
+ def click(locator, opts = {}, &block)
64
66
  actions << lambda {
65
67
  path = find_link(locator, opts)[:href]
66
- visit(path)
68
+ if block_given?
69
+ spider = self.spawn
70
+ spider.entrance(path)
71
+ spider.learn(&block)
72
+ current_location[:click] ||= []
73
+ current_location[:click] << spider.crawl[:results]
74
+ else
75
+ visit(path)
76
+ end
67
77
  }
68
78
  end
69
-
79
+
70
80
  # Teach the spider behaviors and it will repeat to the end.
71
81
  # @param recipe [String, Proc] the recipe to be learned.
72
82
  #
@@ -205,14 +215,23 @@ class MicroSpider
205
215
  end
206
216
 
207
217
  def execute_actions
208
- actions.delete_if { |action| action.call }
218
+ actions.delete_if { |action|
219
+ begin
220
+ Timeout::timeout(@timeout) { action.call }
221
+ rescue Timeout::Error => err
222
+ logger.fatal('Timeout!!! execution expired when execute action')
223
+ logger.fatal(err.message)
224
+ logger.fatal(err.backtrace.inspect)
225
+ break
226
+ end
227
+ }
209
228
  end
210
229
 
211
230
  def spawn
212
231
  spider = self.clone
213
232
  spider.instance_variable_set(:@paths, [])
214
233
  spider.instance_variable_set(:@actions, [])
215
- spider.instance_variable_set(:@visited_paths, [])
234
+ spider.instance_variable_set(:@visited_paths, Set.new)
216
235
  spider.instance_variable_set(:@broken_paths, Set.new)
217
236
  spider.instance_variable_set(:@excretion, { status: 'inprogress', results: [] })
218
237
  spider.skip_set_entrance = false
@@ -3,11 +3,12 @@ module SpiderCore
3
3
 
4
4
  attr_accessor :next_page, :skip_pages
5
5
 
6
- def keep_eyes_on_next_page(pattern, opts = {})
6
+ def keep_eyes_on_next_page(pattern, opts = {}, &block)
7
7
  kind = opts[:kind] || :css
8
8
  actions << lambda {
9
- @next_page = first(kind, pattern)[:href] rescue nil
10
- @paths.unshift(@next_page) if @next_page
9
+ element = first(kind, pattern)
10
+ path = block_given? ? yield(element) : element && element[:href]
11
+ @paths.unshift(path) if path
11
12
  }
12
13
  end
13
14
 
@@ -1,3 +1,3 @@
1
1
  module SpiderCore
2
- VERSION = "0.1.17"
2
+ VERSION = "0.1.18"
3
3
  end
@@ -73,6 +73,22 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
73
73
  assert_equal "Current Page #{$1}", f[:field].first[:current_page]
74
74
  end
75
75
  end
76
+
77
+ def test_spider_can_follow_and_keep_eyes_on_next_page
78
+ @spider.entrance('/page/1')
79
+ @spider.follow('a.next_page') do
80
+ keep_eyes_on_next_page('.pages a.next_page')
81
+ field :current_page, '#current_page'
82
+ end
83
+ excretion = @spider.crawl
84
+ excretion[:results].first[:follow].first.each do |f|
85
+ f[:entrance] =~ /\/page\/(\d)/
86
+ assert_equal "Current Page #{$1}", f[:field].first[:current_page]
87
+ end
88
+ end
89
+
90
+ def test_spider_can_nest_follow_lots_of_links_and_keep_eyes_on_next_page
91
+ end
76
92
 
77
93
  def test_spider_can_create_custom_action
78
94
  @spider.create_action(:save) do |result|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: micro_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.17
4
+ version: 0.1.18
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-25 00:00:00.000000000 Z
12
+ date: 2013-08-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: capybara