crabfarm 0.2.5 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/crabfarm.rb +17 -18
- data/lib/crabfarm/adapters/browser/abstract_webdriver.rb +60 -0
- data/lib/crabfarm/adapters/browser/chrome.rb +24 -0
- data/lib/crabfarm/adapters/browser/firefox.rb +26 -0
- data/lib/crabfarm/adapters/browser/noop.rb +25 -0
- data/lib/crabfarm/adapters/browser/phantom_js.rb +59 -0
- data/lib/crabfarm/adapters/browser/remote_webdriver.rb +31 -0
- data/lib/crabfarm/adapters/driver_wrapper/capybara.rb +11 -0
- data/lib/crabfarm/adapters/driver_wrapper/surfer.rb +13 -0
- data/lib/crabfarm/adapters/{browser → driver_wrapper}/watir.rb +7 -3
- data/lib/crabfarm/adapters/parser/nokogiri.rb +17 -15
- data/lib/crabfarm/adapters/parser/pdf_reader.rb +14 -12
- data/lib/crabfarm/assertion/fields.rb +85 -0
- data/lib/crabfarm/base_navigator.rb +78 -0
- data/lib/crabfarm/base_reducer.rb +68 -0
- data/lib/crabfarm/base_struct.rb +17 -0
- data/lib/crabfarm/cli.rb +18 -8
- data/lib/crabfarm/configuration.rb +24 -51
- data/lib/crabfarm/context.rb +19 -43
- data/lib/crabfarm/crabtrap_context.rb +4 -11
- data/lib/crabfarm/driver_pool.rb +32 -0
- data/lib/crabfarm/dsl/surfer/surf_context.rb +5 -25
- data/lib/crabfarm/engines/async_state_manager.rb +1 -1
- data/lib/crabfarm/engines/sync_state_manager.rb +1 -1
- data/lib/crabfarm/forked_navigator.rb +31 -0
- data/lib/crabfarm/modes/console.rb +4 -4
- data/lib/crabfarm/modes/generator.rb +24 -11
- data/lib/crabfarm/rspec.rb +26 -24
- data/lib/crabfarm/strategies.rb +15 -9
- data/lib/crabfarm/templates/Crabfile.erb +21 -26
- data/lib/crabfarm/templates/Gemfile.erb +6 -0
- data/lib/crabfarm/templates/navigator.rb.erb +20 -0
- data/lib/crabfarm/templates/{state_spec.rb.erb → navigator_spec.rb.erb} +1 -1
- data/lib/crabfarm/templates/{parser.rb.erb → reducer.rb.erb} +4 -4
- data/lib/crabfarm/templates/{parser_spec.rb.erb → reducer_spec.rb.erb} +1 -1
- data/lib/crabfarm/templates/struct.rb.erb +12 -0
- data/lib/crabfarm/transition_service.rb +20 -7
- data/lib/crabfarm/version.rb +1 -1
- metadata +50 -48
- data/lib/crabfarm/adapters/browser/capybara.rb +0 -7
- data/lib/crabfarm/adapters/browser/surfer.rb +0 -9
- data/lib/crabfarm/adapters/output/hash.rb +0 -11
- data/lib/crabfarm/adapters/output/jbuilder.rb +0 -11
- data/lib/crabfarm/adapters/output/ostruct.rb +0 -14
- data/lib/crabfarm/base_parser.rb +0 -59
- data/lib/crabfarm/base_state.rb +0 -112
- data/lib/crabfarm/default_driver_factory.rb +0 -86
- data/lib/crabfarm/driver_bucket.rb +0 -42
- data/lib/crabfarm/driver_bucket_pool.rb +0 -26
- data/lib/crabfarm/forked_state.rb +0 -38
- data/lib/crabfarm/mocks/noop_driver.rb +0 -6
- data/lib/crabfarm/phantom_driver_factory.rb +0 -33
- data/lib/crabfarm/templates/state.rb.erb +0 -8
@@ -0,0 +1,32 @@
|
|
1
|
+
module Crabfarm
  # Keeps one driver per session id, building each through the given factory
  # the first time it is requested and handing it back to the factory when
  # the session is reset.
  class DriverPool

    # _factory - object responding to `build_driver(session_id)` and
    #            `release_driver(driver)`.
    def initialize(_factory)
      @factory = _factory
      @drivers = {}
    end

    # Returns the driver bound to _session_id, building it on first use.
    #
    # _session_id - anything responding to `to_sym`; when nil the
    #               :default_driver session is used.
    def driver(_session_id=nil)
      _session_id ||= :default_driver
      key = _session_id.to_sym

      cached = @drivers[key]
      # The factory receives the id as given by the caller (not symbolized).
      cached = @drivers[key] = @factory.build_driver(_session_id) if cached.nil?
      cached
    end

    # Releases the driver bound to _session_id, or every pooled driver when
    # no session id is given. Unknown session ids are ignored.
    def reset(_session_id=nil)
      if _session_id.nil?
        # Detach the cache before releasing: if the factory raises half way
        # through, the pool must not keep handing out released drivers.
        released, @drivers = @drivers, {}
        released.each_value { |d| @factory.release_driver d }
        @drivers # same return value as before: the fresh, empty cache
      else
        stale = @drivers.delete _session_id.to_sym
        @factory.release_driver stale unless stale.nil?
      end
    end

    # Releases every pooled driver (same as calling `reset` with no argument).
    def release
      reset
    end

  end
end
|
@@ -3,12 +3,13 @@ module Crabfarm
|
|
3
3
|
module Surfer
|
4
4
|
class SurfContext < SearchContext
|
5
5
|
|
6
|
-
|
6
|
+
attr_reader :driver
|
7
|
+
|
7
8
|
def_delegators 'driver.navigate', :back, :forward, :refresh
|
8
9
|
|
9
|
-
def initialize(
|
10
|
+
def initialize(_driver)
|
10
11
|
super nil, self
|
11
|
-
@
|
12
|
+
@driver = _driver
|
12
13
|
end
|
13
14
|
|
14
15
|
def root
|
@@ -23,14 +24,6 @@ module Crabfarm
|
|
23
24
|
driver.page_source
|
24
25
|
end
|
25
26
|
|
26
|
-
def driver
|
27
|
-
@bucket.original
|
28
|
-
end
|
29
|
-
|
30
|
-
def quit
|
31
|
-
@bucket.reset
|
32
|
-
end
|
33
|
-
|
34
27
|
def current_uri
|
35
28
|
URI.parse driver.current_url
|
36
29
|
end
|
@@ -41,20 +34,7 @@ module Crabfarm
|
|
41
34
|
|
42
35
|
def goto(_url, _params=nil)
|
43
36
|
_url += "?#{_params.to_query}" if _params
|
44
|
-
|
45
|
-
|
46
|
-
loop do
|
47
|
-
begin
|
48
|
-
@bucket.reset if retries > 0
|
49
|
-
driver.get(_url)
|
50
|
-
break
|
51
|
-
rescue Timeout::Error #, Selenium::WebDriver::Error::UnknownError
|
52
|
-
# TODO: log this
|
53
|
-
raise if retries >= max_retries
|
54
|
-
retries += 1
|
55
|
-
sleep 1.0
|
56
|
-
end
|
57
|
-
end
|
37
|
+
driver.get(_url)
|
58
38
|
end
|
59
39
|
end
|
60
40
|
end
|
@@ -108,7 +108,7 @@ module Crabfarm
|
|
108
108
|
logger.info "Transitioning state: #{@next_state_name}"
|
109
109
|
@elapsed = Benchmark.measure do
|
110
110
|
ActiveSupport::Dependencies.clear
|
111
|
-
@doc = TransitionService.
|
111
|
+
@doc = TransitionService.transition(@context, @next_state_name, @next_state_params).document
|
112
112
|
end.real
|
113
113
|
|
114
114
|
logger.info "Transitioned in #{@elapsed.real}"
|
@@ -28,7 +28,7 @@ module Crabfarm
|
|
28
28
|
output = { name: _name, params: _params }
|
29
29
|
|
30
30
|
output[:elapsed] = Benchmark.measure do
|
31
|
-
output[:doc] = TransitionService.
|
31
|
+
output[:doc] = TransitionService.transition(@context, _name, _params).document
|
32
32
|
end
|
33
33
|
|
34
34
|
OpenStruct.new output
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Crabfarm
  # Delegator around a parent navigator: every call is forwarded to the
  # parent except `browser` and `synchronize`, which are answered here using
  # the browser name and mutex given at construction.
  class ForkedNavigator < Delegator

    # _context      - object whose `pool` responds to `driver(name)`.
    # _parent       - object that receives all delegated calls.
    # _browser_name - session name passed to the pool when `browser` is used.
    # _mutex        - lock used by `synchronize`.
    def initialize(_context, _parent, _browser_name, _mutex)
      @context = _context
      @parent = _parent
      @browser_name = _browser_name
      @mutex = _mutex

      # Delegator#initialize stores the target through __setobj__.
      super @parent
    end

    # Returns the driver the context pool gives for this fork's browser name;
    # the lookup happens once and the result is memoized.
    def browser
      @browser ||= @context.pool.driver(@browser_name)
    end

    # Runs the given block while holding the mutex and returns its result.
    def synchronize
      @mutex.synchronize { yield }
    end

    # Delegator hook: object that receives the delegated calls.
    def __getobj__
      @parent
    end

    # Delegator hook: replaces the object that receives the delegated calls.
    def __setobj__(obj)
      @parent = obj
    end
  end
end
|
@@ -20,14 +20,14 @@ module Crabfarm
|
|
20
20
|
super
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
23
|
+
def navigate(_name=nil, _params={})
|
24
24
|
if _name.nil?
|
25
|
-
puts "Must provide a
|
25
|
+
puts "Must provide a navigator name".color(:red)
|
26
26
|
return
|
27
27
|
end
|
28
28
|
|
29
29
|
begin
|
30
|
-
puts "
|
30
|
+
puts "Navigating to #{_name.to_s.camelize} state"
|
31
31
|
output = super
|
32
32
|
|
33
33
|
puts "State changed, generated document:"
|
@@ -44,7 +44,7 @@ module Crabfarm
|
|
44
44
|
puts "Ejem..."
|
45
45
|
end
|
46
46
|
|
47
|
-
alias :
|
47
|
+
alias :n :navigate
|
48
48
|
alias :r :reset
|
49
49
|
end
|
50
50
|
|
@@ -23,8 +23,9 @@ module Crabfarm
|
|
23
23
|
path(_name, '.crabfarm').render('dot_crabfarm', binding)
|
24
24
|
path(_name, 'boot.rb').render('boot.rb', binding)
|
25
25
|
path(_name, 'bin', 'crabfarm').render('crabfarm_bin', binding, 0755)
|
26
|
-
path(_name, 'app', '
|
27
|
-
path(_name, 'app', '
|
26
|
+
path(_name, 'app', 'navigators', '.gitkeep').render('dot_gitkeep')
|
27
|
+
path(_name, 'app', 'reducers', '.gitkeep').render('dot_gitkeep')
|
28
|
+
path(_name, 'app', 'structs', '.gitkeep').render('dot_gitkeep')
|
28
29
|
path(_name, 'app', 'helpers', '.gitkeep').render('dot_gitkeep')
|
29
30
|
path(_name, 'spec', 'spec_helper.rb').render('spec_helper.rb', binding)
|
30
31
|
path(_name, 'spec', 'snapshots', '.gitkeep').render('dot_gitkeep')
|
@@ -34,26 +35,38 @@ module Crabfarm
|
|
34
35
|
end
|
35
36
|
end
|
36
37
|
|
37
|
-
def
|
38
|
+
def generate_navigator(_target, _class_name, _skip_reducer=false)
|
38
39
|
validate_class_name _class_name
|
39
40
|
|
40
41
|
route = Utils::Naming.route_from_constant _class_name
|
41
42
|
with_base_path _target do
|
42
|
-
binding = {
|
43
|
-
path(*(['app', '
|
44
|
-
path(*(['spec', '
|
43
|
+
binding = { navigator_class: _class_name }
|
44
|
+
path(*(['app', 'navigators'] + route[0...-1] + [route.last + '.rb'])).render('navigator.rb', binding)
|
45
|
+
path(*(['spec', 'navigators'] + route[0...-1] + [route.last + '_spec.rb'])).render('navigator_spec.rb', binding)
|
46
|
+
end
|
47
|
+
|
48
|
+
generate_reducer(_target, _class_name) unless _skip_reducer
|
49
|
+
end
|
50
|
+
|
51
|
+
def generate_reducer(_target, _class_name)
|
52
|
+
validate_class_name _class_name
|
53
|
+
|
54
|
+
_class_name = _class_name + 'Reducer'
|
55
|
+
route = Utils::Naming.route_from_constant _class_name
|
56
|
+
with_base_path _target do
|
57
|
+
binding = { reducer_class: _class_name }
|
58
|
+
path(*(['app', 'reducers'] + route[0...-1] + [route.last + '.rb'])).render('reducer.rb', binding)
|
59
|
+
path(*(['spec', 'reducers'] + route[0...-1] + [route.last + '_spec.rb'])).render('reducer_spec.rb', binding)
|
45
60
|
end
|
46
61
|
end
|
47
62
|
|
48
|
-
def
|
63
|
+
def generate_struct(_target, _class_name)
|
49
64
|
validate_class_name _class_name
|
50
65
|
|
51
|
-
_class_name = _class_name + 'Parser'
|
52
66
|
route = Utils::Naming.route_from_constant _class_name
|
53
67
|
with_base_path _target do
|
54
|
-
binding = {
|
55
|
-
path(*(['app', '
|
56
|
-
path(*(['spec', 'parsers'] + route[0...-1] + [route.last + '_spec.rb'])).render('parser_spec.rb', binding)
|
68
|
+
binding = { struct_class: _class_name }
|
69
|
+
path(*(['app', 'structs'] + route[0...-1] + [route.last + '.rb'])).render('struct.rb', binding)
|
57
70
|
end
|
58
71
|
end
|
59
72
|
|
data/lib/crabfarm/rspec.rb
CHANGED
@@ -3,9 +3,9 @@ module Crabfarm
|
|
3
3
|
|
4
4
|
class Error < Crabfarm::Error; end
|
5
5
|
|
6
|
-
def
|
6
|
+
def reduce(_snapshot=nil, _options={})
|
7
7
|
|
8
|
-
raise Error.new "
|
8
|
+
raise Error.new "'reduce' is only available in reducer specs" unless described_class < Crabfarm::BaseReducer
|
9
9
|
|
10
10
|
if _snapshot.is_a? Hash
|
11
11
|
raise ArgumentException.new 'Invalid arguments' unless _options.nil?
|
@@ -17,42 +17,44 @@ module Crabfarm
|
|
17
17
|
raise Error.new "Snapshot does not exist #{_snapshot}" unless File.exist? snapshot_path
|
18
18
|
|
19
19
|
data = File.read snapshot_path
|
20
|
-
|
21
|
-
|
22
|
-
|
20
|
+
reducer = described_class.new data, _options
|
21
|
+
reducer.run
|
22
|
+
reducer
|
23
23
|
end
|
24
24
|
|
25
|
-
def
|
25
|
+
def navigate(_name=nil, _params={})
|
26
26
|
|
27
|
-
raise Error.new "
|
27
|
+
raise Error.new "'navigate' is only available in navigator specs" if @context.nil?
|
28
28
|
|
29
|
-
if
|
30
|
-
_params =
|
31
|
-
|
29
|
+
if _name.is_a? Hash
|
30
|
+
_params = _name
|
31
|
+
_name = nil
|
32
32
|
end
|
33
33
|
|
34
|
-
if
|
35
|
-
return nil unless described_class <
|
36
|
-
@state = @last_state = TransitionService.
|
34
|
+
if _name.nil?
|
35
|
+
return nil unless described_class < BaseNavigator # TODO: maybe raise an error here.
|
36
|
+
@state = @last_state = TransitionService.transition @context, described_class, _params
|
37
37
|
else
|
38
|
-
@last_state = TransitionService.
|
38
|
+
@last_state = TransitionService.transition @context, _name, _params
|
39
39
|
end
|
40
40
|
end
|
41
41
|
|
42
42
|
def state
|
43
|
-
@state ||=
|
43
|
+
@state ||= navigate
|
44
44
|
end
|
45
45
|
|
46
|
+
alias :navigator :state
|
47
|
+
|
46
48
|
def last_state
|
47
49
|
@last_state
|
48
50
|
end
|
49
51
|
|
50
|
-
def
|
51
|
-
@
|
52
|
+
def reducer
|
53
|
+
@reducer ||= reduce
|
52
54
|
end
|
53
55
|
|
54
|
-
def
|
55
|
-
@context.pool.
|
56
|
+
def browser(_session_id=nil)
|
57
|
+
@context.pool.browser(_session_id)
|
56
58
|
end
|
57
59
|
|
58
60
|
end
|
@@ -62,13 +64,13 @@ RSpec.configure do |config|
|
|
62
64
|
config.include Crabfarm::RSpec
|
63
65
|
|
64
66
|
config.around(:example) do |example|
|
65
|
-
if described_class < Crabfarm::
|
66
|
-
if example.metadata[:
|
67
|
-
@
|
67
|
+
if described_class < Crabfarm::BaseReducer
|
68
|
+
if example.metadata[:reducing] || example[:reducing_with_params]
|
69
|
+
@reducer = reduce example.metadata[:reducing], example.metadata[:reducing_with_params] || {}
|
68
70
|
end
|
69
71
|
example.run
|
70
|
-
elsif described_class < Crabfarm::
|
71
|
-
Crabfarm::ContextFactory.with_context example.metadata[:
|
72
|
+
elsif described_class < Crabfarm::BaseNavigator
|
73
|
+
Crabfarm::ContextFactory.with_context example.metadata[:navigating] do |ctx|
|
72
74
|
@context = ctx
|
73
75
|
example.run
|
74
76
|
end
|
data/lib/crabfarm/strategies.rb
CHANGED
@@ -2,28 +2,34 @@ module Crabfarm
|
|
2
2
|
module Strategies
|
3
3
|
|
4
4
|
class Loader
|
5
|
-
def initialize(_name, _klass,
|
5
|
+
def initialize(_name, _klass, _options={})
|
6
6
|
@name = _name
|
7
7
|
@klass = _klass
|
8
|
-
|
9
|
-
@
|
8
|
+
|
9
|
+
@pkg = if _options.key? :require
|
10
|
+
_options[:require]
|
11
|
+
elsif @klass.is_a? String
|
12
|
+
Utils::Naming.route_from_constant(@klass).join('/')
|
13
|
+
else nil end
|
14
|
+
|
15
|
+
@requirements = Array(_options[:dependencies]) if _options.key? :dependencies
|
10
16
|
end
|
11
17
|
|
12
18
|
def load
|
13
|
-
|
19
|
+
load_requirements unless @requirements.nil?
|
14
20
|
require @pkg if @pkg
|
15
21
|
if @klass.is_a? String then Object.const_get @klass else @klass end
|
16
22
|
end
|
17
23
|
|
18
24
|
private
|
19
25
|
|
20
|
-
def
|
21
|
-
@
|
26
|
+
def load_requirements
|
27
|
+
@requirements.each do |dep|
|
22
28
|
begin
|
23
29
|
require dep
|
24
30
|
# TODO: check dependency version!
|
25
31
|
rescue LoadError
|
26
|
-
raise ConfigurationError.new "
|
32
|
+
raise ConfigurationError.new "Could not find #{@name} dependency, maybe you forgot to add `gem \"#{dep}\"` to the crawler's Gemfile?"
|
27
33
|
end
|
28
34
|
end
|
29
35
|
end
|
@@ -31,9 +37,9 @@ module Crabfarm
|
|
31
37
|
|
32
38
|
@@register = {}
|
33
39
|
|
34
|
-
def self.register(_cat, _name, _klass,
|
40
|
+
def self.register(_cat, _name, _klass, _options={})
|
35
41
|
full_name = _cat.to_s + ':' + _name.to_s
|
36
|
-
@@register[full_name] = Loader.new(full_name, _klass,
|
42
|
+
@@register[full_name] = Loader.new(full_name, _klass, _options)
|
37
43
|
end
|
38
44
|
|
39
45
|
def self.load(_cat, _name)
|
@@ -1,44 +1,39 @@
|
|
1
|
+
# The selected browser engine for navigators to be used throughout the crawler
|
2
|
+
# Other options (may require installation of additional libraries): :phantomjs, :remote, :chrome, :firefox
|
3
|
+
set_browser :phantomjs
|
1
4
|
|
2
|
-
# The default
|
3
|
-
# Available options are :surfer, :watir and :capybara. Both watir and capybara require an additional gem to be added to Gemfile
|
4
|
-
set_browser_dsl :surfer
|
5
|
-
|
6
|
-
# The default parser engine for parsers that do not specify one.
|
5
|
+
# The default parser engine for reducers that do not specify one.
|
7
6
|
# Available options are :nokogiri and :pdf_parser. :pdf_parser requires an additional gem to be added to Gemfile
|
8
|
-
|
9
|
-
|
10
|
-
# Change the defaut output builder used in a state to generate the output document.
|
11
|
-
# Available options are :hash, :ostruct, :jbuilder. :jbuilder requires an additional gem to be added to Gemfile
|
12
|
-
# set_output_builder :hash
|
7
|
+
set_parser :nokogiri
|
13
8
|
|
14
9
|
# The path where every crawler log is stored.
|
15
10
|
set_log_path 'logs'
|
16
11
|
|
12
|
+
# Set crawler proxy, this setting is overridden when running the crawler in crabfarm.io
|
13
|
+
# set_proxy 'the.proxy.address'
|
17
14
|
|
18
|
-
# General
|
15
|
+
# General webdriver configuration
|
19
16
|
########################################
|
20
17
|
|
21
|
-
# The
|
22
|
-
|
23
|
-
|
18
|
+
# The following parameters only apply if using a webdriver based driver
|
19
|
+
|
20
|
+
# Selects the webdriver wrapper library to be used, options are :surfer, :watir and :capybara.
|
21
|
+
# Both watir and capybara require an additional gem to be added to Gemfile
|
22
|
+
set_webdriver_dsl :surfer
|
24
23
|
|
25
24
|
# Set the selected webdriver capabilities (check the driver documentation for more details)
|
26
|
-
#
|
25
|
+
# set_webdriver_capabilities
|
27
26
|
|
28
27
|
# Set the browser window width
|
29
|
-
#
|
28
|
+
# set_webdriver_window_width 1280
|
30
29
|
|
31
30
|
# Set the browser window height
|
32
|
-
#
|
33
|
-
|
34
|
-
# Set the driver proxy address
|
35
|
-
# set_proxy
|
36
|
-
|
31
|
+
# set_webdriver_window_height 800
|
37
32
|
|
38
33
|
# Phantom launcher configuration
|
39
34
|
########################################
|
40
35
|
|
41
|
-
# The following parameters only apply if using the :phantomjs
|
36
|
+
# The following parameters only apply if using the :phantomjs webdriver
|
42
37
|
|
43
38
|
# Make phantom load images when requesting documents
|
44
39
|
# set_phantom_load_images false
|
@@ -53,16 +48,16 @@ set_driver :phantomjs
|
|
53
48
|
# Remote driver options
|
54
49
|
########################################
|
55
50
|
|
56
|
-
# The following parameters only apply if using the :remote
|
51
|
+
# The following parameters only apply if using the :remote webdriver
|
57
52
|
|
58
53
|
# Remote driver host
|
59
|
-
#
|
54
|
+
# set_webdriver_host www.myseleniumgrid.com
|
60
55
|
|
61
56
|
# Remote driver port
|
62
|
-
#
|
57
|
+
# set_webdriver_port 8080
|
63
58
|
|
64
59
|
# Remote driver response timeout, in seconds
|
65
|
-
#
|
60
|
+
# set_webdriver_remote_timeout 120
|
66
61
|
|
67
62
|
|
68
63
|
# Recording configuration
|