crabfarm 0.2.5 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/lib/crabfarm.rb +17 -18
  3. data/lib/crabfarm/adapters/browser/abstract_webdriver.rb +60 -0
  4. data/lib/crabfarm/adapters/browser/chrome.rb +24 -0
  5. data/lib/crabfarm/adapters/browser/firefox.rb +26 -0
  6. data/lib/crabfarm/adapters/browser/noop.rb +25 -0
  7. data/lib/crabfarm/adapters/browser/phantom_js.rb +59 -0
  8. data/lib/crabfarm/adapters/browser/remote_webdriver.rb +31 -0
  9. data/lib/crabfarm/adapters/driver_wrapper/capybara.rb +11 -0
  10. data/lib/crabfarm/adapters/driver_wrapper/surfer.rb +13 -0
  11. data/lib/crabfarm/adapters/{browser → driver_wrapper}/watir.rb +7 -3
  12. data/lib/crabfarm/adapters/parser/nokogiri.rb +17 -15
  13. data/lib/crabfarm/adapters/parser/pdf_reader.rb +14 -12
  14. data/lib/crabfarm/assertion/fields.rb +85 -0
  15. data/lib/crabfarm/base_navigator.rb +78 -0
  16. data/lib/crabfarm/base_reducer.rb +68 -0
  17. data/lib/crabfarm/base_struct.rb +17 -0
  18. data/lib/crabfarm/cli.rb +18 -8
  19. data/lib/crabfarm/configuration.rb +24 -51
  20. data/lib/crabfarm/context.rb +19 -43
  21. data/lib/crabfarm/crabtrap_context.rb +4 -11
  22. data/lib/crabfarm/driver_pool.rb +32 -0
  23. data/lib/crabfarm/dsl/surfer/surf_context.rb +5 -25
  24. data/lib/crabfarm/engines/async_state_manager.rb +1 -1
  25. data/lib/crabfarm/engines/sync_state_manager.rb +1 -1
  26. data/lib/crabfarm/forked_navigator.rb +31 -0
  27. data/lib/crabfarm/modes/console.rb +4 -4
  28. data/lib/crabfarm/modes/generator.rb +24 -11
  29. data/lib/crabfarm/rspec.rb +26 -24
  30. data/lib/crabfarm/strategies.rb +15 -9
  31. data/lib/crabfarm/templates/Crabfile.erb +21 -26
  32. data/lib/crabfarm/templates/Gemfile.erb +6 -0
  33. data/lib/crabfarm/templates/navigator.rb.erb +20 -0
  34. data/lib/crabfarm/templates/{state_spec.rb.erb → navigator_spec.rb.erb} +1 -1
  35. data/lib/crabfarm/templates/{parser.rb.erb → reducer.rb.erb} +4 -4
  36. data/lib/crabfarm/templates/{parser_spec.rb.erb → reducer_spec.rb.erb} +1 -1
  37. data/lib/crabfarm/templates/struct.rb.erb +12 -0
  38. data/lib/crabfarm/transition_service.rb +20 -7
  39. data/lib/crabfarm/version.rb +1 -1
  40. metadata +50 -48
  41. data/lib/crabfarm/adapters/browser/capybara.rb +0 -7
  42. data/lib/crabfarm/adapters/browser/surfer.rb +0 -9
  43. data/lib/crabfarm/adapters/output/hash.rb +0 -11
  44. data/lib/crabfarm/adapters/output/jbuilder.rb +0 -11
  45. data/lib/crabfarm/adapters/output/ostruct.rb +0 -14
  46. data/lib/crabfarm/base_parser.rb +0 -59
  47. data/lib/crabfarm/base_state.rb +0 -112
  48. data/lib/crabfarm/default_driver_factory.rb +0 -86
  49. data/lib/crabfarm/driver_bucket.rb +0 -42
  50. data/lib/crabfarm/driver_bucket_pool.rb +0 -26
  51. data/lib/crabfarm/forked_state.rb +0 -38
  52. data/lib/crabfarm/mocks/noop_driver.rb +0 -6
  53. data/lib/crabfarm/phantom_driver_factory.rb +0 -33
  54. data/lib/crabfarm/templates/state.rb.erb +0 -8
@@ -0,0 +1,32 @@
1
+ module Crabfarm
2
+ class DriverPool
3
+
4
+ def initialize(_factory)
5
+ @factory = _factory
6
+ @drivers = Hash.new
7
+ end
8
+
9
+ def driver(_session_id=nil)
10
+ _session_id ||= :default_driver
11
+ driver = @drivers[_session_id.to_sym]
12
+ driver = @drivers[_session_id.to_sym] = @factory.build_driver(_session_id) if driver.nil?
13
+ driver
14
+ end
15
+
16
+ def reset(_session_id=nil)
17
+ if _session_id.nil?
18
+ @drivers.values.each { |d| @factory.release_driver d }
19
+ @drivers = Hash.new
20
+ else
21
+ _session_id = _session_id.to_sym
22
+ driver = @drivers.delete _session_id
23
+ @factory.release_driver driver unless driver.nil?
24
+ end
25
+ end
26
+
27
+ def release
28
+ reset
29
+ end
30
+
31
+ end
32
+ end
@@ -3,12 +3,13 @@ module Crabfarm
3
3
  module Surfer
4
4
  class SurfContext < SearchContext
5
5
 
6
- def_delegators :@bucket, :setup
6
+ attr_reader :driver
7
+
7
8
  def_delegators 'driver.navigate', :back, :forward, :refresh
8
9
 
9
- def initialize(_bucket)
10
+ def initialize(_driver)
10
11
  super nil, self
11
- @bucket = _bucket
12
+ @driver = _driver
12
13
  end
13
14
 
14
15
  def root
@@ -23,14 +24,6 @@ module Crabfarm
23
24
  driver.page_source
24
25
  end
25
26
 
26
- def driver
27
- @bucket.original
28
- end
29
-
30
- def quit
31
- @bucket.reset
32
- end
33
-
34
27
  def current_uri
35
28
  URI.parse driver.current_url
36
29
  end
@@ -41,20 +34,7 @@ module Crabfarm
41
34
 
42
35
  def goto(_url, _params=nil)
43
36
  _url += "?#{_params.to_query}" if _params
44
- retries = 0
45
-
46
- loop do
47
- begin
48
- @bucket.reset if retries > 0
49
- driver.get(_url)
50
- break
51
- rescue Timeout::Error #, Selenium::WebDriver::Error::UnknownError
52
- # TODO: log this
53
- raise if retries >= max_retries
54
- retries += 1
55
- sleep 1.0
56
- end
57
- end
37
+ driver.get(_url)
58
38
  end
59
39
  end
60
40
  end
@@ -108,7 +108,7 @@ module Crabfarm
108
108
  logger.info "Transitioning state: #{@next_state_name}"
109
109
  @elapsed = Benchmark.measure do
110
110
  ActiveSupport::Dependencies.clear
111
- @doc = TransitionService.apply_state(@context, @next_state_name, @next_state_params).output_as_json
111
+ @doc = TransitionService.transition(@context, @next_state_name, @next_state_params).document
112
112
  end.real
113
113
 
114
114
  logger.info "Transitioned in #{@elapsed.real}"
@@ -28,7 +28,7 @@ module Crabfarm
28
28
  output = { name: _name, params: _params }
29
29
 
30
30
  output[:elapsed] = Benchmark.measure do
31
- output[:doc] = TransitionService.apply_state(@context, _name, _params).output_as_json
31
+ output[:doc] = TransitionService.transition(@context, _name, _params).document
32
32
  end
33
33
 
34
34
  OpenStruct.new output
@@ -0,0 +1,31 @@
1
+ module Crabfarm
2
+ class ForkedNavigator < Delegator
3
+
4
+ def initialize(_context, _parent, _browser_name, _mutex)
5
+ @context = _context
6
+ @parent = _parent
7
+ @browser_name = _browser_name
8
+ @mutex = _mutex
9
+
10
+ super @parent
11
+ end
12
+
13
+ def browser
14
+ @browser ||= @context.pool.driver(@browser_name)
15
+ end
16
+
17
+ def synchronize
18
+ @mutex.synchronize {
19
+ yield
20
+ }
21
+ end
22
+
23
+ def __getobj__
24
+ @parent
25
+ end
26
+
27
+ def __setobj__(obj)
28
+ @parent = obj
29
+ end
30
+ end
31
+ end
@@ -20,14 +20,14 @@ module Crabfarm
20
20
  super
21
21
  end
22
22
 
23
- def transition(_name=nil, _params={})
23
+ def navigate(_name=nil, _params={})
24
24
  if _name.nil?
25
- puts "Must provide a state name".color(:red)
25
+ puts "Must provide a navigator name".color(:red)
26
26
  return
27
27
  end
28
28
 
29
29
  begin
30
- puts "Transitioning to #{_name.to_s.camelize} state"
30
+ puts "Navigating to #{_name.to_s.camelize} state"
31
31
  output = super
32
32
 
33
33
  puts "State changed, generated document:"
@@ -44,7 +44,7 @@ module Crabfarm
44
44
  puts "Ejem..."
45
45
  end
46
46
 
47
- alias :t :transition
47
+ alias :n :navigate
48
48
  alias :r :reset
49
49
  end
50
50
 
@@ -23,8 +23,9 @@ module Crabfarm
23
23
  path(_name, '.crabfarm').render('dot_crabfarm', binding)
24
24
  path(_name, 'boot.rb').render('boot.rb', binding)
25
25
  path(_name, 'bin', 'crabfarm').render('crabfarm_bin', binding, 0755)
26
- path(_name, 'app', 'parsers', '.gitkeep').render('dot_gitkeep')
27
- path(_name, 'app', 'states', '.gitkeep').render('dot_gitkeep')
26
+ path(_name, 'app', 'navigators', '.gitkeep').render('dot_gitkeep')
27
+ path(_name, 'app', 'reducers', '.gitkeep').render('dot_gitkeep')
28
+ path(_name, 'app', 'structs', '.gitkeep').render('dot_gitkeep')
28
29
  path(_name, 'app', 'helpers', '.gitkeep').render('dot_gitkeep')
29
30
  path(_name, 'spec', 'spec_helper.rb').render('spec_helper.rb', binding)
30
31
  path(_name, 'spec', 'snapshots', '.gitkeep').render('dot_gitkeep')
@@ -34,26 +35,38 @@ module Crabfarm
34
35
  end
35
36
  end
36
37
 
37
- def generate_state(_target, _class_name)
38
+ def generate_navigator(_target, _class_name, _skip_reducer=false)
38
39
  validate_class_name _class_name
39
40
 
40
41
  route = Utils::Naming.route_from_constant _class_name
41
42
  with_base_path _target do
42
- binding = { state_class: _class_name }
43
- path(*(['app', 'states'] + route[0...-1] + [route.last + '.rb'])).render('state.rb', binding)
44
- path(*(['spec', 'states'] + route[0...-1] + [route.last + '_spec.rb'])).render('state_spec.rb', binding)
43
+ binding = { navigator_class: _class_name }
44
+ path(*(['app', 'navigators'] + route[0...-1] + [route.last + '.rb'])).render('navigator.rb', binding)
45
+ path(*(['spec', 'navigators'] + route[0...-1] + [route.last + '_spec.rb'])).render('navigator_spec.rb', binding)
46
+ end
47
+
48
+ generate_reducer(_target, _class_name) unless _skip_reducer
49
+ end
50
+
51
+ def generate_reducer(_target, _class_name)
52
+ validate_class_name _class_name
53
+
54
+ _class_name = _class_name + 'Reducer'
55
+ route = Utils::Naming.route_from_constant _class_name
56
+ with_base_path _target do
57
+ binding = { reducer_class: _class_name }
58
+ path(*(['app', 'reducers'] + route[0...-1] + [route.last + '.rb'])).render('reducer.rb', binding)
59
+ path(*(['spec', 'reducers'] + route[0...-1] + [route.last + '_spec.rb'])).render('reducer_spec.rb', binding)
45
60
  end
46
61
  end
47
62
 
48
- def generate_parser(_target, _class_name)
63
+ def generate_struct(_target, _class_name)
49
64
  validate_class_name _class_name
50
65
 
51
- _class_name = _class_name + 'Parser'
52
66
  route = Utils::Naming.route_from_constant _class_name
53
67
  with_base_path _target do
54
- binding = { parser_class: _class_name }
55
- path(*(['app', 'parsers'] + route[0...-1] + [route.last + '.rb'])).render('parser.rb', binding)
56
- path(*(['spec', 'parsers'] + route[0...-1] + [route.last + '_spec.rb'])).render('parser_spec.rb', binding)
68
+ binding = { struct_class: _class_name }
69
+ path(*(['app', 'structs'] + route[0...-1] + [route.last + '.rb'])).render('struct.rb', binding)
57
70
  end
58
71
  end
59
72
 
@@ -3,9 +3,9 @@ module Crabfarm
3
3
 
4
4
  class Error < Crabfarm::Error; end
5
5
 
6
- def parse(_snapshot=nil, _options={})
6
+ def reduce(_snapshot=nil, _options={})
7
7
 
8
- raise Error.new "Crawl is only available in parser specs" unless described_class < Crabfarm::BaseParser
8
+ raise Error.new "'reduce' is only available in reducer specs" unless described_class < Crabfarm::BaseReducer
9
9
 
10
10
  if _snapshot.is_a? Hash
11
11
  raise ArgumentException.new 'Invalid arguments' unless _options.nil?
@@ -17,42 +17,44 @@ module Crabfarm
17
17
  raise Error.new "Snapshot does not exist #{_snapshot}" unless File.exist? snapshot_path
18
18
 
19
19
  data = File.read snapshot_path
20
- parser = described_class.new data, _options
21
- parser.parse
22
- parser
20
+ reducer = described_class.new data, _options
21
+ reducer.run
22
+ reducer
23
23
  end
24
24
 
25
- def crawl(_state=nil, _params={})
25
+ def navigate(_name=nil, _params={})
26
26
 
27
- raise Error.new "Crawl is only available in state specs" if @context.nil?
27
+ raise Error.new "'navigate' is only available in navigator specs" if @context.nil?
28
28
 
29
- if _state.is_a? Hash
30
- _params = _state
31
- _state = nil
29
+ if _name.is_a? Hash
30
+ _params = _name
31
+ _name = nil
32
32
  end
33
33
 
34
- if _state.nil?
35
- return nil unless described_class < BaseState # TODO: maybe raise an error here.
36
- @state = @last_state = TransitionService.apply_state @context, described_class, _params
34
+ if _name.nil?
35
+ return nil unless described_class < BaseNavigator # TODO: maybe raise an error here.
36
+ @state = @last_state = TransitionService.transition @context, described_class, _params
37
37
  else
38
- @last_state = TransitionService.apply_state @context, _state, _params
38
+ @last_state = TransitionService.transition @context, _name, _params
39
39
  end
40
40
  end
41
41
 
42
42
  def state
43
- @state ||= crawl
43
+ @state ||= navigate
44
44
  end
45
45
 
46
+ alias :navigator :state
47
+
46
48
  def last_state
47
49
  @last_state
48
50
  end
49
51
 
50
- def parser
51
- @parser ||= parse
52
+ def reducer
53
+ @reducer ||= reduce
52
54
  end
53
55
 
54
- def driver(_session_id=nil)
55
- @context.pool.driver(_session_id)
56
+ def browser(_session_id=nil)
57
+ @context.pool.browser(_session_id)
56
58
  end
57
59
 
58
60
  end
@@ -62,13 +64,13 @@ RSpec.configure do |config|
62
64
  config.include Crabfarm::RSpec
63
65
 
64
66
  config.around(:example) do |example|
65
- if described_class < Crabfarm::BaseParser
66
- if example.metadata[:parsing] || example[:parsing_with_params]
67
- @parser = parse example.metadata[:parsing], example.metadata[:parsing_with_params] || {}
67
+ if described_class < Crabfarm::BaseReducer
68
+ if example.metadata[:reducing] || example[:reducing_with_params]
69
+ @reducer = reduce example.metadata[:reducing], example.metadata[:reducing_with_params] || {}
68
70
  end
69
71
  example.run
70
- elsif described_class < Crabfarm::BaseState
71
- Crabfarm::ContextFactory.with_context example.metadata[:crawling] do |ctx|
72
+ elsif described_class < Crabfarm::BaseNavigator
73
+ Crabfarm::ContextFactory.with_context example.metadata[:navigating] do |ctx|
72
74
  @context = ctx
73
75
  example.run
74
76
  end
@@ -2,28 +2,34 @@ module Crabfarm
2
2
  module Strategies
3
3
 
4
4
  class Loader
5
- def initialize(_name, _klass, _pkg, _deps)
5
+ def initialize(_name, _klass, _options={})
6
6
  @name = _name
7
7
  @klass = _klass
8
- @pkg = _pkg
9
- @deps = _deps
8
+
9
+ @pkg = if _options.key? :require
10
+ _options[:require]
11
+ elsif @klass.is_a? String
12
+ Utils::Naming.route_from_constant(@klass).join('/')
13
+ else nil end
14
+
15
+ @requirements = Array(_options[:dependencies]) if _options.key? :dependencies
10
16
  end
11
17
 
12
18
  def load
13
- load_dependencies
19
+ load_requirements unless @requirements.nil?
14
20
  require @pkg if @pkg
15
21
  if @klass.is_a? String then Object.const_get @klass else @klass end
16
22
  end
17
23
 
18
24
  private
19
25
 
20
- def load_dependencies
21
- @deps.each do |dep|
26
+ def load_requirements
27
+ @requirements.each do |dep|
22
28
  begin
23
29
  require dep
24
30
  # TODO: check dependency version!
25
31
  rescue LoadError
26
- raise ConfigurationError.new "Missing #{@name} dependency, please add `gem \"#{dep}\"` to the crawler's Gemfile"
32
+ raise ConfigurationError.new "Could not find #{@name} dependency, maybe you forgot to add `gem \"#{dep}\"` to the crawler's Gemfile?"
27
33
  end
28
34
  end
29
35
  end
@@ -31,9 +37,9 @@ module Crabfarm
31
37
 
32
38
  @@register = {}
33
39
 
34
- def self.register(_cat, _name, _klass, _pkg=nil, _deps=[])
40
+ def self.register(_cat, _name, _klass, _options={})
35
41
  full_name = _cat.to_s + ':' + _name.to_s
36
- @@register[full_name] = Loader.new(full_name, _klass, _pkg, _deps)
42
+ @@register[full_name] = Loader.new(full_name, _klass, _options)
37
43
  end
38
44
 
39
45
  def self.load(_cat, _name)
@@ -1,44 +1,39 @@
1
+ # The selected browser engine for navigators to be used throughout the crawler
2
+ # Other options (may require installation of additional libraries): :phantomjs, :remote, :chrome, :firefox
3
+ set_browser :phantomjs
1
4
 
2
- # The default crawling dsl to use in states and parsers, can be overriden in each component using the `browser_dsl :dsl` modifier
3
- # Available options are :surfer, :watir and :capybara. Both watir and capybara require an additional gem to be added to Gemfile
4
- set_browser_dsl :surfer
5
-
6
- # The default parser engine for parsers that do not specify one.
5
+ # The default parser engine for reducers that do not specify one.
7
6
  # Available options are :nokogiri and :pdf_parser. :pdf_parser requires an additional gem to be added to Gemfile
8
- # set_parser_engine :nokogiri
9
-
10
- # Change the defaut output builder used in a state to generate the output document.
11
- # Available options are :hash, :ostruct, :jbuilder. :jbuilder requires an additional gem to be added to Gemfile
12
- # set_output_builder :hash
7
+ set_parser :nokogiri
13
8
 
14
9
  # The path where every crawler log is stored.
15
10
  set_log_path 'logs'
16
11
 
12
+ # Set crawler proxy, this setting is overridden when running the crawler in crabfarm.io
13
+ # set_proxy 'the.proxy.address'
17
14
 
18
- # General driver configuration
15
+ # General webdriver configuration
19
16
  ########################################
20
17
 
21
- # The selected selenium driver to be using throughout the crawler
22
- # Other options (may require instalation of additional libraries): :phantomjs, :remote, :chrome, :firefox
23
- set_driver :phantomjs
18
+ # The following parameters only apply if using a webdriver based driver
19
+
20
+ # Selects the webdriver wrapper library to be used, options are :surfer, :watir and :capybara.
21
+ # Both watir and capybara require an additional gem to be added to Gemfile
22
+ set_webdriver_dsl :surfer
24
23
 
25
24
  # Set the selected webdriver capabilities (check the driver documentation for more details)
26
- # set_driver_capabilities
25
+ # set_webdriver_capabilities
27
26
 
28
27
  # Set the browser window width
29
- # set_driver_window_width 1280
28
+ # set_webdriver_window_width 1280
30
29
 
31
30
  # Set the browser window height
32
- # set_driver_window_height 800
33
-
34
- # Set the driver proxy address
35
- # set_proxy
36
-
31
+ # set_webdriver_window_height 800
37
32
 
38
33
  # Phantom launcher configuration
39
34
  ########################################
40
35
 
41
- # The following parameters only apply if using the :phantomjs driver
36
+ # The following parameters only apply if using the :phantomjs webdriver
42
37
 
43
38
  # Make phantom load images when requesting documents
44
39
  # set_phantom_load_images false
@@ -53,16 +48,16 @@ set_driver :phantomjs
53
48
  # Remote driver options
54
49
  ########################################
55
50
 
56
- # The following parameters only apply if using the :remote driver
51
+ # The following parameters only apply if using the :remote webdriver
57
52
 
58
53
  # Remote driver host
59
- # set_driver_host www.myseleniumgrid.com
54
+ # set_webdriver_host www.myseleniumgrid.com
60
55
 
61
56
  # Remote driver port
62
- # set_driver_port 8080
57
+ # set_webdriver_port 8080
63
58
 
64
59
  # Remote driver response timeout, in seconds
65
- # set_driver_remote_timeout 120
60
+ # set_webdriver_remote_timeout 120
66
61
 
67
62
 
68
63
  # Recording configuration