crabfarm 0.2.5 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/lib/crabfarm.rb +17 -18
  3. data/lib/crabfarm/adapters/browser/abstract_webdriver.rb +60 -0
  4. data/lib/crabfarm/adapters/browser/chrome.rb +24 -0
  5. data/lib/crabfarm/adapters/browser/firefox.rb +26 -0
  6. data/lib/crabfarm/adapters/browser/noop.rb +25 -0
  7. data/lib/crabfarm/adapters/browser/phantom_js.rb +59 -0
  8. data/lib/crabfarm/adapters/browser/remote_webdriver.rb +31 -0
  9. data/lib/crabfarm/adapters/driver_wrapper/capybara.rb +11 -0
  10. data/lib/crabfarm/adapters/driver_wrapper/surfer.rb +13 -0
  11. data/lib/crabfarm/adapters/{browser → driver_wrapper}/watir.rb +7 -3
  12. data/lib/crabfarm/adapters/parser/nokogiri.rb +17 -15
  13. data/lib/crabfarm/adapters/parser/pdf_reader.rb +14 -12
  14. data/lib/crabfarm/assertion/fields.rb +85 -0
  15. data/lib/crabfarm/base_navigator.rb +78 -0
  16. data/lib/crabfarm/base_reducer.rb +68 -0
  17. data/lib/crabfarm/base_struct.rb +17 -0
  18. data/lib/crabfarm/cli.rb +18 -8
  19. data/lib/crabfarm/configuration.rb +24 -51
  20. data/lib/crabfarm/context.rb +19 -43
  21. data/lib/crabfarm/crabtrap_context.rb +4 -11
  22. data/lib/crabfarm/driver_pool.rb +32 -0
  23. data/lib/crabfarm/dsl/surfer/surf_context.rb +5 -25
  24. data/lib/crabfarm/engines/async_state_manager.rb +1 -1
  25. data/lib/crabfarm/engines/sync_state_manager.rb +1 -1
  26. data/lib/crabfarm/forked_navigator.rb +31 -0
  27. data/lib/crabfarm/modes/console.rb +4 -4
  28. data/lib/crabfarm/modes/generator.rb +24 -11
  29. data/lib/crabfarm/rspec.rb +26 -24
  30. data/lib/crabfarm/strategies.rb +15 -9
  31. data/lib/crabfarm/templates/Crabfile.erb +21 -26
  32. data/lib/crabfarm/templates/Gemfile.erb +6 -0
  33. data/lib/crabfarm/templates/navigator.rb.erb +20 -0
  34. data/lib/crabfarm/templates/{state_spec.rb.erb → navigator_spec.rb.erb} +1 -1
  35. data/lib/crabfarm/templates/{parser.rb.erb → reducer.rb.erb} +4 -4
  36. data/lib/crabfarm/templates/{parser_spec.rb.erb → reducer_spec.rb.erb} +1 -1
  37. data/lib/crabfarm/templates/struct.rb.erb +12 -0
  38. data/lib/crabfarm/transition_service.rb +20 -7
  39. data/lib/crabfarm/version.rb +1 -1
  40. metadata +50 -48
  41. data/lib/crabfarm/adapters/browser/capybara.rb +0 -7
  42. data/lib/crabfarm/adapters/browser/surfer.rb +0 -9
  43. data/lib/crabfarm/adapters/output/hash.rb +0 -11
  44. data/lib/crabfarm/adapters/output/jbuilder.rb +0 -11
  45. data/lib/crabfarm/adapters/output/ostruct.rb +0 -14
  46. data/lib/crabfarm/base_parser.rb +0 -59
  47. data/lib/crabfarm/base_state.rb +0 -112
  48. data/lib/crabfarm/default_driver_factory.rb +0 -86
  49. data/lib/crabfarm/driver_bucket.rb +0 -42
  50. data/lib/crabfarm/driver_bucket_pool.rb +0 -26
  51. data/lib/crabfarm/forked_state.rb +0 -38
  52. data/lib/crabfarm/mocks/noop_driver.rb +0 -6
  53. data/lib/crabfarm/phantom_driver_factory.rb +0 -33
  54. data/lib/crabfarm/templates/state.rb.erb +0 -8
@@ -0,0 +1,32 @@
1
+ module Crabfarm
2
+ class DriverPool
3
+
4
+ def initialize(_factory)
5
+ @factory = _factory
6
+ @drivers = Hash.new
7
+ end
8
+
9
+ def driver(_session_id=nil)
10
+ _session_id ||= :default_driver
11
+ driver = @drivers[_session_id.to_sym]
12
+ driver = @drivers[_session_id.to_sym] = @factory.build_driver(_session_id) if driver.nil?
13
+ driver
14
+ end
15
+
16
+ def reset(_session_id=nil)
17
+ if _session_id.nil?
18
+ @drivers.values.each { |d| @factory.release_driver d }
19
+ @drivers = Hash.new
20
+ else
21
+ _session_id = _session_id.to_sym
22
+ driver = @drivers.delete _session_id
23
+ @factory.release_driver driver unless driver.nil?
24
+ end
25
+ end
26
+
27
+ def release
28
+ reset
29
+ end
30
+
31
+ end
32
+ end
@@ -3,12 +3,13 @@ module Crabfarm
3
3
  module Surfer
4
4
  class SurfContext < SearchContext
5
5
 
6
- def_delegators :@bucket, :setup
6
+ attr_reader :driver
7
+
7
8
  def_delegators 'driver.navigate', :back, :forward, :refresh
8
9
 
9
- def initialize(_bucket)
10
+ def initialize(_driver)
10
11
  super nil, self
11
- @bucket = _bucket
12
+ @driver = _driver
12
13
  end
13
14
 
14
15
  def root
@@ -23,14 +24,6 @@ module Crabfarm
23
24
  driver.page_source
24
25
  end
25
26
 
26
- def driver
27
- @bucket.original
28
- end
29
-
30
- def quit
31
- @bucket.reset
32
- end
33
-
34
27
  def current_uri
35
28
  URI.parse driver.current_url
36
29
  end
@@ -41,20 +34,7 @@ module Crabfarm
41
34
 
42
35
  def goto(_url, _params=nil)
43
36
  _url += "?#{_params.to_query}" if _params
44
- retries = 0
45
-
46
- loop do
47
- begin
48
- @bucket.reset if retries > 0
49
- driver.get(_url)
50
- break
51
- rescue Timeout::Error #, Selenium::WebDriver::Error::UnknownError
52
- # TODO: log this
53
- raise if retries >= max_retries
54
- retries += 1
55
- sleep 1.0
56
- end
57
- end
37
+ driver.get(_url)
58
38
  end
59
39
  end
60
40
  end
@@ -108,7 +108,7 @@ module Crabfarm
108
108
  logger.info "Transitioning state: #{@next_state_name}"
109
109
  @elapsed = Benchmark.measure do
110
110
  ActiveSupport::Dependencies.clear
111
- @doc = TransitionService.apply_state(@context, @next_state_name, @next_state_params).output_as_json
111
+ @doc = TransitionService.transition(@context, @next_state_name, @next_state_params).document
112
112
  end.real
113
113
 
114
114
  logger.info "Transitioned in #{@elapsed.real}"
@@ -28,7 +28,7 @@ module Crabfarm
28
28
  output = { name: _name, params: _params }
29
29
 
30
30
  output[:elapsed] = Benchmark.measure do
31
- output[:doc] = TransitionService.apply_state(@context, _name, _params).output_as_json
31
+ output[:doc] = TransitionService.transition(@context, _name, _params).document
32
32
  end
33
33
 
34
34
  OpenStruct.new output
@@ -0,0 +1,31 @@
1
+ module Crabfarm
2
+ class ForkedNavigator < Delegator
3
+
4
+ def initialize(_context, _parent, _browser_name, _mutex)
5
+ @context = _context
6
+ @parent = _parent
7
+ @browser_name = _browser_name
8
+ @mutex = _mutex
9
+
10
+ super @parent
11
+ end
12
+
13
+ def browser
14
+ @browser ||= @context.pool.driver(@browser_name)
15
+ end
16
+
17
+ def synchronize
18
+ @mutex.synchronize {
19
+ yield
20
+ }
21
+ end
22
+
23
+ def __getobj__
24
+ @parent
25
+ end
26
+
27
+ def __setobj__(obj)
28
+ @parent = obj
29
+ end
30
+ end
31
+ end
@@ -20,14 +20,14 @@ module Crabfarm
20
20
  super
21
21
  end
22
22
 
23
- def transition(_name=nil, _params={})
23
+ def navigate(_name=nil, _params={})
24
24
  if _name.nil?
25
- puts "Must provide a state name".color(:red)
25
+ puts "Must provide a navigator name".color(:red)
26
26
  return
27
27
  end
28
28
 
29
29
  begin
30
- puts "Transitioning to #{_name.to_s.camelize} state"
30
+ puts "Navigating to #{_name.to_s.camelize} state"
31
31
  output = super
32
32
 
33
33
  puts "State changed, generated document:"
@@ -44,7 +44,7 @@ module Crabfarm
44
44
  puts "Ejem..."
45
45
  end
46
46
 
47
- alias :t :transition
47
+ alias :n :navigate
48
48
  alias :r :reset
49
49
  end
50
50
 
@@ -23,8 +23,9 @@ module Crabfarm
23
23
  path(_name, '.crabfarm').render('dot_crabfarm', binding)
24
24
  path(_name, 'boot.rb').render('boot.rb', binding)
25
25
  path(_name, 'bin', 'crabfarm').render('crabfarm_bin', binding, 0755)
26
- path(_name, 'app', 'parsers', '.gitkeep').render('dot_gitkeep')
27
- path(_name, 'app', 'states', '.gitkeep').render('dot_gitkeep')
26
+ path(_name, 'app', 'navigators', '.gitkeep').render('dot_gitkeep')
27
+ path(_name, 'app', 'reducers', '.gitkeep').render('dot_gitkeep')
28
+ path(_name, 'app', 'structs', '.gitkeep').render('dot_gitkeep')
28
29
  path(_name, 'app', 'helpers', '.gitkeep').render('dot_gitkeep')
29
30
  path(_name, 'spec', 'spec_helper.rb').render('spec_helper.rb', binding)
30
31
  path(_name, 'spec', 'snapshots', '.gitkeep').render('dot_gitkeep')
@@ -34,26 +35,38 @@ module Crabfarm
34
35
  end
35
36
  end
36
37
 
37
- def generate_state(_target, _class_name)
38
+ def generate_navigator(_target, _class_name, _skip_reducer=false)
38
39
  validate_class_name _class_name
39
40
 
40
41
  route = Utils::Naming.route_from_constant _class_name
41
42
  with_base_path _target do
42
- binding = { state_class: _class_name }
43
- path(*(['app', 'states'] + route[0...-1] + [route.last + '.rb'])).render('state.rb', binding)
44
- path(*(['spec', 'states'] + route[0...-1] + [route.last + '_spec.rb'])).render('state_spec.rb', binding)
43
+ binding = { navigator_class: _class_name }
44
+ path(*(['app', 'navigators'] + route[0...-1] + [route.last + '.rb'])).render('navigator.rb', binding)
45
+ path(*(['spec', 'navigators'] + route[0...-1] + [route.last + '_spec.rb'])).render('navigator_spec.rb', binding)
46
+ end
47
+
48
+ generate_reducer(_target, _class_name) unless _skip_reducer
49
+ end
50
+
51
+ def generate_reducer(_target, _class_name)
52
+ validate_class_name _class_name
53
+
54
+ _class_name = _class_name + 'Reducer'
55
+ route = Utils::Naming.route_from_constant _class_name
56
+ with_base_path _target do
57
+ binding = { reducer_class: _class_name }
58
+ path(*(['app', 'reducers'] + route[0...-1] + [route.last + '.rb'])).render('reducer.rb', binding)
59
+ path(*(['spec', 'reducers'] + route[0...-1] + [route.last + '_spec.rb'])).render('reducer_spec.rb', binding)
45
60
  end
46
61
  end
47
62
 
48
- def generate_parser(_target, _class_name)
63
+ def generate_struct(_target, _class_name)
49
64
  validate_class_name _class_name
50
65
 
51
- _class_name = _class_name + 'Parser'
52
66
  route = Utils::Naming.route_from_constant _class_name
53
67
  with_base_path _target do
54
- binding = { parser_class: _class_name }
55
- path(*(['app', 'parsers'] + route[0...-1] + [route.last + '.rb'])).render('parser.rb', binding)
56
- path(*(['spec', 'parsers'] + route[0...-1] + [route.last + '_spec.rb'])).render('parser_spec.rb', binding)
68
+ binding = { struct_class: _class_name }
69
+ path(*(['app', 'structs'] + route[0...-1] + [route.last + '.rb'])).render('struct.rb', binding)
57
70
  end
58
71
  end
59
72
 
@@ -3,9 +3,9 @@ module Crabfarm
3
3
 
4
4
  class Error < Crabfarm::Error; end
5
5
 
6
- def parse(_snapshot=nil, _options={})
6
+ def reduce(_snapshot=nil, _options={})
7
7
 
8
- raise Error.new "Crawl is only available in parser specs" unless described_class < Crabfarm::BaseParser
8
+ raise Error.new "'reduce' is only available in reducer specs" unless described_class < Crabfarm::BaseReducer
9
9
 
10
10
  if _snapshot.is_a? Hash
11
11
  raise ArgumentException.new 'Invalid arguments' unless _options.nil?
@@ -17,42 +17,44 @@ module Crabfarm
17
17
  raise Error.new "Snapshot does not exist #{_snapshot}" unless File.exist? snapshot_path
18
18
 
19
19
  data = File.read snapshot_path
20
- parser = described_class.new data, _options
21
- parser.parse
22
- parser
20
+ reducer = described_class.new data, _options
21
+ reducer.run
22
+ reducer
23
23
  end
24
24
 
25
- def crawl(_state=nil, _params={})
25
+ def navigate(_name=nil, _params={})
26
26
 
27
- raise Error.new "Crawl is only available in state specs" if @context.nil?
27
+ raise Error.new "'navigate' is only available in navigator specs" if @context.nil?
28
28
 
29
- if _state.is_a? Hash
30
- _params = _state
31
- _state = nil
29
+ if _name.is_a? Hash
30
+ _params = _name
31
+ _name = nil
32
32
  end
33
33
 
34
- if _state.nil?
35
- return nil unless described_class < BaseState # TODO: maybe raise an error here.
36
- @state = @last_state = TransitionService.apply_state @context, described_class, _params
34
+ if _name.nil?
35
+ return nil unless described_class < BaseNavigator # TODO: maybe raise an error here.
36
+ @state = @last_state = TransitionService.transition @context, described_class, _params
37
37
  else
38
- @last_state = TransitionService.apply_state @context, _state, _params
38
+ @last_state = TransitionService.transition @context, _name, _params
39
39
  end
40
40
  end
41
41
 
42
42
  def state
43
- @state ||= crawl
43
+ @state ||= navigate
44
44
  end
45
45
 
46
+ alias :navigator :state
47
+
46
48
  def last_state
47
49
  @last_state
48
50
  end
49
51
 
50
- def parser
51
- @parser ||= parse
52
+ def reducer
53
+ @reducer ||= reduce
52
54
  end
53
55
 
54
- def driver(_session_id=nil)
55
- @context.pool.driver(_session_id)
56
+ def browser(_session_id=nil)
57
+ @context.pool.browser(_session_id)
56
58
  end
57
59
 
58
60
  end
@@ -62,13 +64,13 @@ RSpec.configure do |config|
62
64
  config.include Crabfarm::RSpec
63
65
 
64
66
  config.around(:example) do |example|
65
- if described_class < Crabfarm::BaseParser
66
- if example.metadata[:parsing] || example[:parsing_with_params]
67
- @parser = parse example.metadata[:parsing], example.metadata[:parsing_with_params] || {}
67
+ if described_class < Crabfarm::BaseReducer
68
+ if example.metadata[:reducing] || example[:reducing_with_params]
69
+ @reducer = reduce example.metadata[:reducing], example.metadata[:reducing_with_params] || {}
68
70
  end
69
71
  example.run
70
- elsif described_class < Crabfarm::BaseState
71
- Crabfarm::ContextFactory.with_context example.metadata[:crawling] do |ctx|
72
+ elsif described_class < Crabfarm::BaseNavigator
73
+ Crabfarm::ContextFactory.with_context example.metadata[:navigating] do |ctx|
72
74
  @context = ctx
73
75
  example.run
74
76
  end
@@ -2,28 +2,34 @@ module Crabfarm
2
2
  module Strategies
3
3
 
4
4
  class Loader
5
- def initialize(_name, _klass, _pkg, _deps)
5
+ def initialize(_name, _klass, _options={})
6
6
  @name = _name
7
7
  @klass = _klass
8
- @pkg = _pkg
9
- @deps = _deps
8
+
9
+ @pkg = if _options.key? :require
10
+ _options[:require]
11
+ elsif @klass.is_a? String
12
+ Utils::Naming.route_from_constant(@klass).join('/')
13
+ else nil end
14
+
15
+ @requirements = Array(_options[:dependencies]) if _options.key? :dependencies
10
16
  end
11
17
 
12
18
  def load
13
- load_dependencies
19
+ load_requirements unless @requirements.nil?
14
20
  require @pkg if @pkg
15
21
  if @klass.is_a? String then Object.const_get @klass else @klass end
16
22
  end
17
23
 
18
24
  private
19
25
 
20
- def load_dependencies
21
- @deps.each do |dep|
26
+ def load_requirements
27
+ @requirements.each do |dep|
22
28
  begin
23
29
  require dep
24
30
  # TODO: check dependency version!
25
31
  rescue LoadError
26
- raise ConfigurationError.new "Missing #{@name} dependency, please add `gem \"#{dep}\"` to the crawler's Gemfile"
32
+ raise ConfigurationError.new "Could not find #{@name} dependency, maybe you forgot to add `gem \"#{dep}\"` to the crawler's Gemfile?"
27
33
  end
28
34
  end
29
35
  end
@@ -31,9 +37,9 @@ module Crabfarm
31
37
 
32
38
  @@register = {}
33
39
 
34
- def self.register(_cat, _name, _klass, _pkg=nil, _deps=[])
40
+ def self.register(_cat, _name, _klass, _options={})
35
41
  full_name = _cat.to_s + ':' + _name.to_s
36
- @@register[full_name] = Loader.new(full_name, _klass, _pkg, _deps)
42
+ @@register[full_name] = Loader.new(full_name, _klass, _options)
37
43
  end
38
44
 
39
45
  def self.load(_cat, _name)
@@ -1,44 +1,39 @@
1
+ # The selected browser engine for navigators to be used throughout the crawler
2
+ # Other options (may require installation of additional libraries): :phantomjs, :remote, :chrome, :firefox
3
+ set_browser :phantomjs
1
4
 
2
- # The default crawling dsl to use in states and parsers, can be overriden in each component using the `browser_dsl :dsl` modifier
3
- # Available options are :surfer, :watir and :capybara. Both watir and capybara require an additional gem to be added to Gemfile
4
- set_browser_dsl :surfer
5
-
6
- # The default parser engine for parsers that do not specify one.
5
+ # The default parser engine for reducers that do not specify one.
7
6
  # Available options are :nokogiri and :pdf_parser. :pdf_parser requires an additional gem to be added to Gemfile
8
- # set_parser_engine :nokogiri
9
-
10
- # Change the defaut output builder used in a state to generate the output document.
11
- # Available options are :hash, :ostruct, :jbuilder. :jbuilder requires an additional gem to be added to Gemfile
12
- # set_output_builder :hash
7
+ set_parser :nokogiri
13
8
 
14
9
  # The path where every crawler log is stored.
15
10
  set_log_path 'logs'
16
11
 
12
+ # Set crawler proxy; this setting is overridden when running the crawler in crabfarm.io
13
+ # set_proxy 'the.proxy.address'
17
14
 
18
- # General driver configuration
15
+ # General webdriver configuration
19
16
  ########################################
20
17
 
21
- # The selected selenium driver to be using throughout the crawler
22
- # Other options (may require instalation of additional libraries): :phantomjs, :remote, :chrome, :firefox
23
- set_driver :phantomjs
18
+ # The following parameters only apply if using a webdriver based driver
19
+
20
+ # Selects the webdriver wrapper library to be used, options are :surfer, :watir and :capybara.
21
+ # Both watir and capybara require an additional gem to be added to Gemfile
22
+ set_webdriver_dsl :surfer
24
23
 
25
24
  # Set the selected webdriver capabilities (check the driver documentation for more details)
26
- # set_driver_capabilities
25
+ # set_webdriver_capabilities
27
26
 
28
27
  # Set the browser window width
29
- # set_driver_window_width 1280
28
+ # set_webdriver_window_width 1280
30
29
 
31
30
  # Set the browser window height
32
- # set_driver_window_height 800
33
-
34
- # Set the driver proxy address
35
- # set_proxy
36
-
31
+ # set_webdriver_window_height 800
37
32
 
38
33
  # Phantom launcher configuration
39
34
  ########################################
40
35
 
41
- # The following parameters only apply if using the :phantomjs driver
36
+ # The following parameters only apply if using the :phantomjs webdriver
42
37
 
43
38
  # Make phantom load images when requesting documents
44
39
  # set_phantom_load_images false
@@ -53,16 +48,16 @@ set_driver :phantomjs
53
48
  # Remote driver options
54
49
  ########################################
55
50
 
56
- # The following parameters only apply if using the :remote driver
51
+ # The following parameters only apply if using the :remote webdriver
57
52
 
58
53
  # Remote driver host
59
- # set_driver_host www.myseleniumgrid.com
54
+ # set_webdriver_host www.myseleniumgrid.com
60
55
 
61
56
  # Remote driver port
62
- # set_driver_port 8080
57
+ # set_webdriver_port 8080
63
58
 
64
59
  # Remote driver response timeout, in seconds
65
- # set_driver_remote_timeout 120
60
+ # set_webdriver_remote_timeout 120
66
61
 
67
62
 
68
63
  # Recording configuration