crabfarm 0.2.5 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/crabfarm.rb +17 -18
- data/lib/crabfarm/adapters/browser/abstract_webdriver.rb +60 -0
- data/lib/crabfarm/adapters/browser/chrome.rb +24 -0
- data/lib/crabfarm/adapters/browser/firefox.rb +26 -0
- data/lib/crabfarm/adapters/browser/noop.rb +25 -0
- data/lib/crabfarm/adapters/browser/phantom_js.rb +59 -0
- data/lib/crabfarm/adapters/browser/remote_webdriver.rb +31 -0
- data/lib/crabfarm/adapters/driver_wrapper/capybara.rb +11 -0
- data/lib/crabfarm/adapters/driver_wrapper/surfer.rb +13 -0
- data/lib/crabfarm/adapters/{browser → driver_wrapper}/watir.rb +7 -3
- data/lib/crabfarm/adapters/parser/nokogiri.rb +17 -15
- data/lib/crabfarm/adapters/parser/pdf_reader.rb +14 -12
- data/lib/crabfarm/assertion/fields.rb +85 -0
- data/lib/crabfarm/base_navigator.rb +78 -0
- data/lib/crabfarm/base_reducer.rb +68 -0
- data/lib/crabfarm/base_struct.rb +17 -0
- data/lib/crabfarm/cli.rb +18 -8
- data/lib/crabfarm/configuration.rb +24 -51
- data/lib/crabfarm/context.rb +19 -43
- data/lib/crabfarm/crabtrap_context.rb +4 -11
- data/lib/crabfarm/driver_pool.rb +32 -0
- data/lib/crabfarm/dsl/surfer/surf_context.rb +5 -25
- data/lib/crabfarm/engines/async_state_manager.rb +1 -1
- data/lib/crabfarm/engines/sync_state_manager.rb +1 -1
- data/lib/crabfarm/forked_navigator.rb +31 -0
- data/lib/crabfarm/modes/console.rb +4 -4
- data/lib/crabfarm/modes/generator.rb +24 -11
- data/lib/crabfarm/rspec.rb +26 -24
- data/lib/crabfarm/strategies.rb +15 -9
- data/lib/crabfarm/templates/Crabfile.erb +21 -26
- data/lib/crabfarm/templates/Gemfile.erb +6 -0
- data/lib/crabfarm/templates/navigator.rb.erb +20 -0
- data/lib/crabfarm/templates/{state_spec.rb.erb → navigator_spec.rb.erb} +1 -1
- data/lib/crabfarm/templates/{parser.rb.erb → reducer.rb.erb} +4 -4
- data/lib/crabfarm/templates/{parser_spec.rb.erb → reducer_spec.rb.erb} +1 -1
- data/lib/crabfarm/templates/struct.rb.erb +12 -0
- data/lib/crabfarm/transition_service.rb +20 -7
- data/lib/crabfarm/version.rb +1 -1
- metadata +50 -48
- data/lib/crabfarm/adapters/browser/capybara.rb +0 -7
- data/lib/crabfarm/adapters/browser/surfer.rb +0 -9
- data/lib/crabfarm/adapters/output/hash.rb +0 -11
- data/lib/crabfarm/adapters/output/jbuilder.rb +0 -11
- data/lib/crabfarm/adapters/output/ostruct.rb +0 -14
- data/lib/crabfarm/base_parser.rb +0 -59
- data/lib/crabfarm/base_state.rb +0 -112
- data/lib/crabfarm/default_driver_factory.rb +0 -86
- data/lib/crabfarm/driver_bucket.rb +0 -42
- data/lib/crabfarm/driver_bucket_pool.rb +0 -26
- data/lib/crabfarm/forked_state.rb +0 -38
- data/lib/crabfarm/mocks/noop_driver.rb +0 -6
- data/lib/crabfarm/phantom_driver_factory.rb +0 -33
- data/lib/crabfarm/templates/state.rb.erb +0 -8
@@ -0,0 +1,32 @@
|
|
1
|
+
module Crabfarm
|
2
|
+
class DriverPool
|
3
|
+
|
4
|
+
def initialize(_factory)
|
5
|
+
@factory = _factory
|
6
|
+
@drivers = Hash.new
|
7
|
+
end
|
8
|
+
|
9
|
+
def driver(_session_id=nil)
|
10
|
+
_session_id ||= :default_driver
|
11
|
+
driver = @drivers[_session_id.to_sym]
|
12
|
+
driver = @drivers[_session_id.to_sym] = @factory.build_driver(_session_id) if driver.nil?
|
13
|
+
driver
|
14
|
+
end
|
15
|
+
|
16
|
+
def reset(_session_id=nil)
|
17
|
+
if _session_id.nil?
|
18
|
+
@drivers.values.each { |d| @factory.release_driver d }
|
19
|
+
@drivers = Hash.new
|
20
|
+
else
|
21
|
+
_session_id = _session_id.to_sym
|
22
|
+
driver = @drivers.delete _session_id
|
23
|
+
@factory.release_driver driver unless driver.nil?
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
def release
|
28
|
+
reset
|
29
|
+
end
|
30
|
+
|
31
|
+
end
|
32
|
+
end
|
@@ -3,12 +3,13 @@ module Crabfarm
|
|
3
3
|
module Surfer
|
4
4
|
class SurfContext < SearchContext
|
5
5
|
|
6
|
-
|
6
|
+
attr_reader :driver
|
7
|
+
|
7
8
|
def_delegators 'driver.navigate', :back, :forward, :refresh
|
8
9
|
|
9
|
-
def initialize(
|
10
|
+
def initialize(_driver)
|
10
11
|
super nil, self
|
11
|
-
@
|
12
|
+
@driver = _driver
|
12
13
|
end
|
13
14
|
|
14
15
|
def root
|
@@ -23,14 +24,6 @@ module Crabfarm
|
|
23
24
|
driver.page_source
|
24
25
|
end
|
25
26
|
|
26
|
-
def driver
|
27
|
-
@bucket.original
|
28
|
-
end
|
29
|
-
|
30
|
-
def quit
|
31
|
-
@bucket.reset
|
32
|
-
end
|
33
|
-
|
34
27
|
def current_uri
|
35
28
|
URI.parse driver.current_url
|
36
29
|
end
|
@@ -41,20 +34,7 @@ module Crabfarm
|
|
41
34
|
|
42
35
|
def goto(_url, _params=nil)
|
43
36
|
_url += "?#{_params.to_query}" if _params
|
44
|
-
|
45
|
-
|
46
|
-
loop do
|
47
|
-
begin
|
48
|
-
@bucket.reset if retries > 0
|
49
|
-
driver.get(_url)
|
50
|
-
break
|
51
|
-
rescue Timeout::Error #, Selenium::WebDriver::Error::UnknownError
|
52
|
-
# TODO: log this
|
53
|
-
raise if retries >= max_retries
|
54
|
-
retries += 1
|
55
|
-
sleep 1.0
|
56
|
-
end
|
57
|
-
end
|
37
|
+
driver.get(_url)
|
58
38
|
end
|
59
39
|
end
|
60
40
|
end
|
@@ -108,7 +108,7 @@ module Crabfarm
|
|
108
108
|
logger.info "Transitioning state: #{@next_state_name}"
|
109
109
|
@elapsed = Benchmark.measure do
|
110
110
|
ActiveSupport::Dependencies.clear
|
111
|
-
@doc = TransitionService.
|
111
|
+
@doc = TransitionService.transition(@context, @next_state_name, @next_state_params).document
|
112
112
|
end.real
|
113
113
|
|
114
114
|
logger.info "Transitioned in #{@elapsed.real}"
|
@@ -28,7 +28,7 @@ module Crabfarm
|
|
28
28
|
output = { name: _name, params: _params }
|
29
29
|
|
30
30
|
output[:elapsed] = Benchmark.measure do
|
31
|
-
output[:doc] = TransitionService.
|
31
|
+
output[:doc] = TransitionService.transition(@context, _name, _params).document
|
32
32
|
end
|
33
33
|
|
34
34
|
OpenStruct.new output
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Crabfarm
|
2
|
+
class ForkedNavigator < Delegator
|
3
|
+
|
4
|
+
def initialize(_context, _parent, _browser_name, _mutex)
|
5
|
+
@context = _context
|
6
|
+
@parent = _parent
|
7
|
+
@browser_name = _browser_name
|
8
|
+
@mutex = _mutex
|
9
|
+
|
10
|
+
super @parent
|
11
|
+
end
|
12
|
+
|
13
|
+
def browser
|
14
|
+
@browser ||= @context.pool.driver(@browser_name)
|
15
|
+
end
|
16
|
+
|
17
|
+
def synchronize
|
18
|
+
@mutex.synchronize {
|
19
|
+
yield
|
20
|
+
}
|
21
|
+
end
|
22
|
+
|
23
|
+
def __getobj__
|
24
|
+
@parent
|
25
|
+
end
|
26
|
+
|
27
|
+
def __setobj__(obj)
|
28
|
+
@parent = obj
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
@@ -20,14 +20,14 @@ module Crabfarm
|
|
20
20
|
super
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
23
|
+
def navigate(_name=nil, _params={})
|
24
24
|
if _name.nil?
|
25
|
-
puts "Must provide a
|
25
|
+
puts "Must provide a navigator name".color(:red)
|
26
26
|
return
|
27
27
|
end
|
28
28
|
|
29
29
|
begin
|
30
|
-
puts "
|
30
|
+
puts "Navigating to #{_name.to_s.camelize} state"
|
31
31
|
output = super
|
32
32
|
|
33
33
|
puts "State changed, generated document:"
|
@@ -44,7 +44,7 @@ module Crabfarm
|
|
44
44
|
puts "Ejem..."
|
45
45
|
end
|
46
46
|
|
47
|
-
alias :
|
47
|
+
alias :n :navigate
|
48
48
|
alias :r :reset
|
49
49
|
end
|
50
50
|
|
@@ -23,8 +23,9 @@ module Crabfarm
|
|
23
23
|
path(_name, '.crabfarm').render('dot_crabfarm', binding)
|
24
24
|
path(_name, 'boot.rb').render('boot.rb', binding)
|
25
25
|
path(_name, 'bin', 'crabfarm').render('crabfarm_bin', binding, 0755)
|
26
|
-
path(_name, 'app', '
|
27
|
-
path(_name, 'app', '
|
26
|
+
path(_name, 'app', 'navigators', '.gitkeep').render('dot_gitkeep')
|
27
|
+
path(_name, 'app', 'reducers', '.gitkeep').render('dot_gitkeep')
|
28
|
+
path(_name, 'app', 'structs', '.gitkeep').render('dot_gitkeep')
|
28
29
|
path(_name, 'app', 'helpers', '.gitkeep').render('dot_gitkeep')
|
29
30
|
path(_name, 'spec', 'spec_helper.rb').render('spec_helper.rb', binding)
|
30
31
|
path(_name, 'spec', 'snapshots', '.gitkeep').render('dot_gitkeep')
|
@@ -34,26 +35,38 @@ module Crabfarm
|
|
34
35
|
end
|
35
36
|
end
|
36
37
|
|
37
|
-
def
|
38
|
+
def generate_navigator(_target, _class_name, _skip_reducer=false)
|
38
39
|
validate_class_name _class_name
|
39
40
|
|
40
41
|
route = Utils::Naming.route_from_constant _class_name
|
41
42
|
with_base_path _target do
|
42
|
-
binding = {
|
43
|
-
path(*(['app', '
|
44
|
-
path(*(['spec', '
|
43
|
+
binding = { navigator_class: _class_name }
|
44
|
+
path(*(['app', 'navigators'] + route[0...-1] + [route.last + '.rb'])).render('navigator.rb', binding)
|
45
|
+
path(*(['spec', 'navigators'] + route[0...-1] + [route.last + '_spec.rb'])).render('navigator_spec.rb', binding)
|
46
|
+
end
|
47
|
+
|
48
|
+
generate_reducer(_target, _class_name) unless _skip_reducer
|
49
|
+
end
|
50
|
+
|
51
|
+
def generate_reducer(_target, _class_name)
|
52
|
+
validate_class_name _class_name
|
53
|
+
|
54
|
+
_class_name = _class_name + 'Reducer'
|
55
|
+
route = Utils::Naming.route_from_constant _class_name
|
56
|
+
with_base_path _target do
|
57
|
+
binding = { reducer_class: _class_name }
|
58
|
+
path(*(['app', 'reducers'] + route[0...-1] + [route.last + '.rb'])).render('reducer.rb', binding)
|
59
|
+
path(*(['spec', 'reducers'] + route[0...-1] + [route.last + '_spec.rb'])).render('reducer_spec.rb', binding)
|
45
60
|
end
|
46
61
|
end
|
47
62
|
|
48
|
-
def
|
63
|
+
def generate_struct(_target, _class_name)
|
49
64
|
validate_class_name _class_name
|
50
65
|
|
51
|
-
_class_name = _class_name + 'Parser'
|
52
66
|
route = Utils::Naming.route_from_constant _class_name
|
53
67
|
with_base_path _target do
|
54
|
-
binding = {
|
55
|
-
path(*(['app', '
|
56
|
-
path(*(['spec', 'parsers'] + route[0...-1] + [route.last + '_spec.rb'])).render('parser_spec.rb', binding)
|
68
|
+
binding = { struct_class: _class_name }
|
69
|
+
path(*(['app', 'structs'] + route[0...-1] + [route.last + '.rb'])).render('struct.rb', binding)
|
57
70
|
end
|
58
71
|
end
|
59
72
|
|
data/lib/crabfarm/rspec.rb
CHANGED
@@ -3,9 +3,9 @@ module Crabfarm
|
|
3
3
|
|
4
4
|
class Error < Crabfarm::Error; end
|
5
5
|
|
6
|
-
def
|
6
|
+
def reduce(_snapshot=nil, _options={})
|
7
7
|
|
8
|
-
raise Error.new "
|
8
|
+
raise Error.new "'reduce' is only available in reducer specs" unless described_class < Crabfarm::BaseReducer
|
9
9
|
|
10
10
|
if _snapshot.is_a? Hash
|
11
11
|
raise ArgumentException.new 'Invalid arguments' unless _options.nil?
|
@@ -17,42 +17,44 @@ module Crabfarm
|
|
17
17
|
raise Error.new "Snapshot does not exist #{_snapshot}" unless File.exist? snapshot_path
|
18
18
|
|
19
19
|
data = File.read snapshot_path
|
20
|
-
|
21
|
-
|
22
|
-
|
20
|
+
reducer = described_class.new data, _options
|
21
|
+
reducer.run
|
22
|
+
reducer
|
23
23
|
end
|
24
24
|
|
25
|
-
def
|
25
|
+
def navigate(_name=nil, _params={})
|
26
26
|
|
27
|
-
raise Error.new "
|
27
|
+
raise Error.new "'navigate' is only available in navigator specs" if @context.nil?
|
28
28
|
|
29
|
-
if
|
30
|
-
_params =
|
31
|
-
|
29
|
+
if _name.is_a? Hash
|
30
|
+
_params = _name
|
31
|
+
_name = nil
|
32
32
|
end
|
33
33
|
|
34
|
-
if
|
35
|
-
return nil unless described_class <
|
36
|
-
@state = @last_state = TransitionService.
|
34
|
+
if _name.nil?
|
35
|
+
return nil unless described_class < BaseNavigator # TODO: maybe raise an error here.
|
36
|
+
@state = @last_state = TransitionService.transition @context, described_class, _params
|
37
37
|
else
|
38
|
-
@last_state = TransitionService.
|
38
|
+
@last_state = TransitionService.transition @context, _name, _params
|
39
39
|
end
|
40
40
|
end
|
41
41
|
|
42
42
|
def state
|
43
|
-
@state ||=
|
43
|
+
@state ||= navigate
|
44
44
|
end
|
45
45
|
|
46
|
+
alias :navigator :state
|
47
|
+
|
46
48
|
def last_state
|
47
49
|
@last_state
|
48
50
|
end
|
49
51
|
|
50
|
-
def
|
51
|
-
@
|
52
|
+
def reducer
|
53
|
+
@reducer ||= reduce
|
52
54
|
end
|
53
55
|
|
54
|
-
def
|
55
|
-
@context.pool.
|
56
|
+
def browser(_session_id=nil)
|
57
|
+
@context.pool.browser(_session_id)
|
56
58
|
end
|
57
59
|
|
58
60
|
end
|
@@ -62,13 +64,13 @@ RSpec.configure do |config|
|
|
62
64
|
config.include Crabfarm::RSpec
|
63
65
|
|
64
66
|
config.around(:example) do |example|
|
65
|
-
if described_class < Crabfarm::
|
66
|
-
if example.metadata[:
|
67
|
-
@
|
67
|
+
if described_class < Crabfarm::BaseReducer
|
68
|
+
if example.metadata[:reducing] || example[:reducing_with_params]
|
69
|
+
@reducer = reduce example.metadata[:reducing], example.metadata[:reducing_with_params] || {}
|
68
70
|
end
|
69
71
|
example.run
|
70
|
-
elsif described_class < Crabfarm::
|
71
|
-
Crabfarm::ContextFactory.with_context example.metadata[:
|
72
|
+
elsif described_class < Crabfarm::BaseNavigator
|
73
|
+
Crabfarm::ContextFactory.with_context example.metadata[:navigating] do |ctx|
|
72
74
|
@context = ctx
|
73
75
|
example.run
|
74
76
|
end
|
data/lib/crabfarm/strategies.rb
CHANGED
@@ -2,28 +2,34 @@ module Crabfarm
|
|
2
2
|
module Strategies
|
3
3
|
|
4
4
|
class Loader
|
5
|
-
def initialize(_name, _klass,
|
5
|
+
def initialize(_name, _klass, _options={})
|
6
6
|
@name = _name
|
7
7
|
@klass = _klass
|
8
|
-
|
9
|
-
@
|
8
|
+
|
9
|
+
@pkg = if _options.key? :require
|
10
|
+
_options[:require]
|
11
|
+
elsif @klass.is_a? String
|
12
|
+
Utils::Naming.route_from_constant(@klass).join('/')
|
13
|
+
else nil end
|
14
|
+
|
15
|
+
@requirements = Array(_options[:dependencies]) if _options.key? :dependencies
|
10
16
|
end
|
11
17
|
|
12
18
|
def load
|
13
|
-
|
19
|
+
load_requirements unless @requirements.nil?
|
14
20
|
require @pkg if @pkg
|
15
21
|
if @klass.is_a? String then Object.const_get @klass else @klass end
|
16
22
|
end
|
17
23
|
|
18
24
|
private
|
19
25
|
|
20
|
-
def
|
21
|
-
@
|
26
|
+
def load_requirements
|
27
|
+
@requirements.each do |dep|
|
22
28
|
begin
|
23
29
|
require dep
|
24
30
|
# TODO: check dependency version!
|
25
31
|
rescue LoadError
|
26
|
-
raise ConfigurationError.new "
|
32
|
+
raise ConfigurationError.new "Could not find #{@name} dependency, maybe you forgot to add `gem \"#{dep}\"` to the crawler's Gemfile?"
|
27
33
|
end
|
28
34
|
end
|
29
35
|
end
|
@@ -31,9 +37,9 @@ module Crabfarm
|
|
31
37
|
|
32
38
|
@@register = {}
|
33
39
|
|
34
|
-
def self.register(_cat, _name, _klass,
|
40
|
+
def self.register(_cat, _name, _klass, _options={})
|
35
41
|
full_name = _cat.to_s + ':' + _name.to_s
|
36
|
-
@@register[full_name] = Loader.new(full_name, _klass,
|
42
|
+
@@register[full_name] = Loader.new(full_name, _klass, _options)
|
37
43
|
end
|
38
44
|
|
39
45
|
def self.load(_cat, _name)
|
@@ -1,44 +1,39 @@
|
|
1
|
+
# The selected browser engine for navigators to be used throughout the crawler
|
2
|
+
# Other options (may require installation of additional libraries): :phantomjs, :remote, :chrome, :firefox
|
3
|
+
set_browser :phantomjs
|
1
4
|
|
2
|
-
# The default
|
3
|
-
# Available options are :surfer, :watir and :capybara. Both watir and capybara require an additional gem to be added to Gemfile
|
4
|
-
set_browser_dsl :surfer
|
5
|
-
|
6
|
-
# The default parser engine for parsers that do not specify one.
|
5
|
+
# The default parser engine for reducers that do not specify one.
|
7
6
|
# Available options are :nokogiri and :pdf_parser. :pdf_parser requires an additional gem to be added to Gemfile
|
8
|
-
|
9
|
-
|
10
|
-
# Change the defaut output builder used in a state to generate the output document.
|
11
|
-
# Available options are :hash, :ostruct, :jbuilder. :jbuilder requires an additional gem to be added to Gemfile
|
12
|
-
# set_output_builder :hash
|
7
|
+
set_parser :nokogiri
|
13
8
|
|
14
9
|
# The path where every crawler log is stored.
|
15
10
|
set_log_path 'logs'
|
16
11
|
|
12
|
+
# Set the crawler proxy; this setting is overridden when running the crawler on crabfarm.io
|
13
|
+
# set_proxy 'the.proxy.address'
|
17
14
|
|
18
|
-
# General
|
15
|
+
# General webdriver configuration
|
19
16
|
########################################
|
20
17
|
|
21
|
-
# The
|
22
|
-
|
23
|
-
|
18
|
+
# The following parameters only apply if using a webdriver based driver
|
19
|
+
|
20
|
+
# Selects the webdriver wrapper library to be used, options are :surfer, :watir and :capybara.
|
21
|
+
# Both watir and capybara require an additional gem to be added to Gemfile
|
22
|
+
set_webdriver_dsl :surfer
|
24
23
|
|
25
24
|
# Set the selected webdriver capabilities (check the driver documentation for more details)
|
26
|
-
#
|
25
|
+
# set_webdriver_capabilities
|
27
26
|
|
28
27
|
# Set the browser window width
|
29
|
-
#
|
28
|
+
# set_webdriver_window_width 1280
|
30
29
|
|
31
30
|
# Set the browser window height
|
32
|
-
#
|
33
|
-
|
34
|
-
# Set the driver proxy address
|
35
|
-
# set_proxy
|
36
|
-
|
31
|
+
# set_webdriver_window_height 800
|
37
32
|
|
38
33
|
# Phantom launcher configuration
|
39
34
|
########################################
|
40
35
|
|
41
|
-
# The following parameters only apply if using the :phantomjs
|
36
|
+
# The following parameters only apply if using the :phantomjs webdriver
|
42
37
|
|
43
38
|
# Make phantom load images when requesting documents
|
44
39
|
# set_phantom_load_images false
|
@@ -53,16 +48,16 @@ set_driver :phantomjs
|
|
53
48
|
# Remote driver options
|
54
49
|
########################################
|
55
50
|
|
56
|
-
# The following parameters only apply if using the :remote
|
51
|
+
# The following parameters only apply if using the :remote webdriver
|
57
52
|
|
58
53
|
# Remote driver host
|
59
|
-
#
|
54
|
+
# set_webdriver_host 'www.myseleniumgrid.com'
|
60
55
|
|
61
56
|
# Remote driver port
|
62
|
-
#
|
57
|
+
# set_webdriver_port 8080
|
63
58
|
|
64
59
|
# Remote driver response timeout, in seconds
|
65
|
-
#
|
60
|
+
# set_webdriver_remote_timeout 120
|
66
61
|
|
67
62
|
|
68
63
|
# Recording configuration
|