micro_spider 0.1.17 → 0.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -1,4 +1,33 @@
1
- tiny-spider
2
- ===========
1
+ # micro-spider
2
+
3
+ A DSL for writing web spiders. Depends on capybara and capybara-webkit.
4
+
5
+ # Example
6
+
7
+ ```ruby
8
+ require 'micro_spider'
9
+ spider = MicroSpider.new
10
+
11
+ spider.learn do
12
+ site 'http://www.bbc.com'
13
+ entrance '/news'
14
+
15
+ field :top_story, '#top-story h2 a'
16
+
17
+ follow '.story' do
18
+
19
+ field :title, 'h1.story-header'
20
+ field :body, '.story-body'
21
+
22
+ fields :related_stories, '.related-links-list a'
23
+
24
+ end
25
+
26
+ end
27
+
28
+ spider.crawl
29
+
30
+ ```
31
+
32
+
3
33
 
4
- web spider
data/lib/micro_spider.rb CHANGED
@@ -10,6 +10,7 @@ end
10
10
 
11
11
  require 'logger'
12
12
  require 'set'
13
+ require 'timeout'
13
14
  require 'spider_core'
14
15
 
15
16
  class MicroSpider
@@ -21,11 +22,12 @@ class MicroSpider
21
22
  include SpiderCore::PaginationDSL
22
23
 
23
24
  attr_reader :excretion, :paths, :delay, :current_location, :visited_paths, :broken_paths
24
- attr_accessor :logger, :actions, :recipe, :skip_set_entrance
25
+ attr_accessor :logger, :actions, :recipe, :skip_set_entrance, :timeout
25
26
 
26
27
  def initialize(excretion = nil)
27
28
  @paths = []
28
29
  @actions = []
30
+ @timeout = 120
29
31
  @excretion = excretion || { status: 'inprogress', results: [] }
30
32
  @logger = Logger.new(STDOUT)
31
33
  @visited_paths = Set.new
@@ -60,13 +62,21 @@ class MicroSpider
60
62
  # Click the locator. This will trigger visit action and change current location.
61
63
  # @param locator [String] the text or id of the link.
62
64
  #
63
- def click(locator, opts = {})
65
+ def click(locator, opts = {}, &block)
64
66
  actions << lambda {
65
67
  path = find_link(locator, opts)[:href]
66
- visit(path)
68
+ if block_given?
69
+ spider = self.spawn
70
+ spider.entrance(path)
71
+ spider.learn(&block)
72
+ current_location[:click] ||= []
73
+ current_location[:click] << spider.crawl[:results]
74
+ else
75
+ visit(path)
76
+ end
67
77
  }
68
78
  end
69
-
79
+
70
80
  # Teach the spider behaviors and it will repeat to the end.
71
81
  # @param recipe [String, Proc] the recipe to be learned.
72
82
  #
@@ -205,14 +215,23 @@ class MicroSpider
205
215
  end
206
216
 
207
217
  def execute_actions
208
- actions.delete_if { |action| action.call }
218
+ actions.delete_if { |action|
219
+ begin
220
+ Timeout::timeout(@timeout) { action.call }
221
+ rescue Timeout::Error => err
222
+ logger.fatal('Timeout!!! execution expired when execute action')
223
+ logger.fatal(err.message)
224
+ logger.fatal(err.backtrace.inspect)
225
+ break
226
+ end
227
+ }
209
228
  end
210
229
 
211
230
  def spawn
212
231
  spider = self.clone
213
232
  spider.instance_variable_set(:@paths, [])
214
233
  spider.instance_variable_set(:@actions, [])
215
- spider.instance_variable_set(:@visited_paths, [])
234
+ spider.instance_variable_set(:@visited_paths, Set.new)
216
235
  spider.instance_variable_set(:@broken_paths, Set.new)
217
236
  spider.instance_variable_set(:@excretion, { status: 'inprogress', results: [] })
218
237
  spider.skip_set_entrance = false
@@ -3,11 +3,12 @@ module SpiderCore
3
3
 
4
4
  attr_accessor :next_page, :skip_pages
5
5
 
6
- def keep_eyes_on_next_page(pattern, opts = {})
6
+ def keep_eyes_on_next_page(pattern, opts = {}, &block)
7
7
  kind = opts[:kind] || :css
8
8
  actions << lambda {
9
- @next_page = first(kind, pattern)[:href] rescue nil
10
- @paths.unshift(@next_page) if @next_page
9
+ element = first(kind, pattern)
10
+ path = block_given? ? yield(element) : element && element[:href]
11
+ @paths.unshift(path) if path
11
12
  }
12
13
  end
13
14
 
@@ -1,3 +1,3 @@
1
1
  module SpiderCore
2
- VERSION = "0.1.17"
2
+ VERSION = "0.1.18"
3
3
  end
@@ -73,6 +73,22 @@ class MicroSpiderTest < MiniTest::Unit::TestCase
73
73
  assert_equal "Current Page #{$1}", f[:field].first[:current_page]
74
74
  end
75
75
  end
76
+
77
+ def test_spider_can_follow_and_keep_eyes_on_next_page
78
+ @spider.entrance('/page/1')
79
+ @spider.follow('a.next_page') do
80
+ keep_eyes_on_next_page('.pages a.next_page')
81
+ field :current_page, '#current_page'
82
+ end
83
+ excretion = @spider.crawl
84
+ excretion[:results].first[:follow].first.each do |f|
85
+ f[:entrance] =~ /\/page\/(\d)/
86
+ assert_equal "Current Page #{$1}", f[:field].first[:current_page]
87
+ end
88
+ end
89
+
90
+ def test_spider_can_nest_follow_lots_of_links_and_keep_eyes_on_next_page
91
+ end
76
92
 
77
93
  def test_spider_can_create_custom_action
78
94
  @spider.create_action(:save) do |result|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: micro_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.17
4
+ version: 0.1.18
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-07-25 00:00:00.000000000 Z
12
+ date: 2013-08-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: capybara