crabfarm 0.2.5 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/lib/crabfarm.rb +17 -18
  3. data/lib/crabfarm/adapters/browser/abstract_webdriver.rb +60 -0
  4. data/lib/crabfarm/adapters/browser/chrome.rb +24 -0
  5. data/lib/crabfarm/adapters/browser/firefox.rb +26 -0
  6. data/lib/crabfarm/adapters/browser/noop.rb +25 -0
  7. data/lib/crabfarm/adapters/browser/phantom_js.rb +59 -0
  8. data/lib/crabfarm/adapters/browser/remote_webdriver.rb +31 -0
  9. data/lib/crabfarm/adapters/driver_wrapper/capybara.rb +11 -0
  10. data/lib/crabfarm/adapters/driver_wrapper/surfer.rb +13 -0
  11. data/lib/crabfarm/adapters/{browser → driver_wrapper}/watir.rb +7 -3
  12. data/lib/crabfarm/adapters/parser/nokogiri.rb +17 -15
  13. data/lib/crabfarm/adapters/parser/pdf_reader.rb +14 -12
  14. data/lib/crabfarm/assertion/fields.rb +85 -0
  15. data/lib/crabfarm/base_navigator.rb +78 -0
  16. data/lib/crabfarm/base_reducer.rb +68 -0
  17. data/lib/crabfarm/base_struct.rb +17 -0
  18. data/lib/crabfarm/cli.rb +18 -8
  19. data/lib/crabfarm/configuration.rb +24 -51
  20. data/lib/crabfarm/context.rb +19 -43
  21. data/lib/crabfarm/crabtrap_context.rb +4 -11
  22. data/lib/crabfarm/driver_pool.rb +32 -0
  23. data/lib/crabfarm/dsl/surfer/surf_context.rb +5 -25
  24. data/lib/crabfarm/engines/async_state_manager.rb +1 -1
  25. data/lib/crabfarm/engines/sync_state_manager.rb +1 -1
  26. data/lib/crabfarm/forked_navigator.rb +31 -0
  27. data/lib/crabfarm/modes/console.rb +4 -4
  28. data/lib/crabfarm/modes/generator.rb +24 -11
  29. data/lib/crabfarm/rspec.rb +26 -24
  30. data/lib/crabfarm/strategies.rb +15 -9
  31. data/lib/crabfarm/templates/Crabfile.erb +21 -26
  32. data/lib/crabfarm/templates/Gemfile.erb +6 -0
  33. data/lib/crabfarm/templates/navigator.rb.erb +20 -0
  34. data/lib/crabfarm/templates/{state_spec.rb.erb → navigator_spec.rb.erb} +1 -1
  35. data/lib/crabfarm/templates/{parser.rb.erb → reducer.rb.erb} +4 -4
  36. data/lib/crabfarm/templates/{parser_spec.rb.erb → reducer_spec.rb.erb} +1 -1
  37. data/lib/crabfarm/templates/struct.rb.erb +12 -0
  38. data/lib/crabfarm/transition_service.rb +20 -7
  39. data/lib/crabfarm/version.rb +1 -1
  40. metadata +50 -48
  41. data/lib/crabfarm/adapters/browser/capybara.rb +0 -7
  42. data/lib/crabfarm/adapters/browser/surfer.rb +0 -9
  43. data/lib/crabfarm/adapters/output/hash.rb +0 -11
  44. data/lib/crabfarm/adapters/output/jbuilder.rb +0 -11
  45. data/lib/crabfarm/adapters/output/ostruct.rb +0 -14
  46. data/lib/crabfarm/base_parser.rb +0 -59
  47. data/lib/crabfarm/base_state.rb +0 -112
  48. data/lib/crabfarm/default_driver_factory.rb +0 -86
  49. data/lib/crabfarm/driver_bucket.rb +0 -42
  50. data/lib/crabfarm/driver_bucket_pool.rb +0 -26
  51. data/lib/crabfarm/forked_state.rb +0 -38
  52. data/lib/crabfarm/mocks/noop_driver.rb +0 -6
  53. data/lib/crabfarm/phantom_driver_factory.rb +0 -33
  54. data/lib/crabfarm/templates/state.rb.erb +0 -8
@@ -0,0 +1,78 @@
1
+ require 'thwait'
2
+ require 'crabfarm/forked_navigator'
3
+ require "crabfarm/assertion/context"
4
+
5
module Crabfarm
  # Base class for navigators.
  #
  # A navigator is built from a context and a params hash. Subclasses are
  # expected to override #run; from there they can reach pooled browser
  # drivers (#browser), the context http client (#http / #download), the
  # context store (#get / #fetch) and hand a target over to a reducer
  # (#reduce).
  class BaseNavigator
    include Assertion::Context
    extend Forwardable

    attr_reader :params

    def_delegators '@context', :http
    def_delegators '@context.store', :get, :fetch

    # _context - object exposing at least `pool`, `http` and `store`.
    # _params  - hash of navigation parameters, exposed through #params.
    def initialize(_context, _params)
      @context = _context
      @params = _params
    end

    # Returns the driver registered in the context pool under _name
    # (nil is forwarded to the pool as is).
    def browser(_name=nil)
      @context.pool.driver(_name)
    end

    # Requests _url through the context http client and returns the
    # response body.
    def download(_url)
      @context.http.get(_url).body
    end

    # Entry point of the navigator, must be overridden by subclasses.
    #
    # Raises NotImplementedError when not overridden.
    def run
      raise NotImplementedError.new
    end

    # Builds and runs a reducer over _target.
    #
    # _target  - object to be reduced, defaults to the default browser.
    #            If a Hash is given here it is used as _options and the
    #            default browser is used as target.
    # _options - hash merged over #params before being handed to the
    #            reducer. The :using key selects the reducer (class,
    #            String or Symbol) and is not forwarded; when missing,
    #            the name of this navigator's class is used.
    #
    # Returns the reducer instance after it has been run.
    def reduce(_target=nil, _options={})
      if _target.is_a? Hash
        _options = _target
        _target = browser
      elsif _target.nil?
        _target = browser
      end

      # Work on a copy so removing :using does not alter the caller's hash.
      _options = _options.dup
      reduce_using(_options.delete(:using) || self.class.name, _target, _options)
    end

    alias :reduce_with_defaults :reduce

    # Runs _block once per value of _enumerator, each one in its own
    # thread and inside a ForkedNavigator bound to a session named
    # "th_session_N" (N starts at 1). Blocks until every thread is done.
    #
    # The same Mutex instance is handed to every fork.
    # NOTE(review): presumably used by ForkedNavigator to serialize access
    # to shared state; confirm against forked_navigator.rb.
    def fork_each(_enumerator, &_block)
      session_id = 0
      mutex = Mutex.new
      ths = _enumerator.map do |value|
        session_id += 1
        start_forked_navigation("th_session_#{session_id}", value, _block, mutex)
      end
      ThreadsWait.all_waits(*ths)
    end

  private

    # Instantiates _reducer_class with _target and #params merged with
    # _options, runs it and returns it.
    #
    # A String or Symbol is first resolved to a constant named after
    # `Utils::Naming.decode_crabfarm_uri(name) + 'Reducer'`.
    # NOTE(review): `constantize` is not core Ruby, presumably provided by
    # ActiveSupport; confirm it is loaded.
    def reduce_using(_reducer_class, _target, _options={})
      if _reducer_class.is_a?(String) || _reducer_class.is_a?(Symbol)
        _reducer_class = (Utils::Naming.decode_crabfarm_uri(_reducer_class.to_s) + 'Reducer').constantize
      end

      reducer = _reducer_class.new _target, @params.merge(_options)
      reducer.run
      reducer
    end

    # Spawns the thread backing a single fork. The pool session _name is
    # always reset when the block finishes, even if it raises.
    def start_forked_navigation(_name, _value, _block, _mutex)
      Thread.new {
        fork = ForkedNavigator.new @context, self, _name, _mutex
        begin
          fork.instance_exec _value, &_block
        ensure
          @context.pool.reset _name
        end
      }
    end
  end
end
@@ -0,0 +1,68 @@
1
+ require "crabfarm/assertion/fields"
2
+
3
module Crabfarm
  # Base class for reducers.
  #
  # A reducer preprocesses and parses a target using the configured parser
  # strategy and then delegates every unknown method to the parsed
  # document. Subclasses are expected to override #run.
  class BaseReducer < Delegator
    include Assertion::Fields

    attr_reader :params, :document

    # Selects the parser strategy used by this reducer class.
    def self.use_parser(_parser_name)
      @parser_name = _parser_name
      # Drop any previously loaded strategy so the new name takes effect
      # even if .parser was already called.
      @parser = nil
    end

    # Returns the parser strategy for this class (loaded once and cached),
    # falling back to `Crabfarm.config.parser` when .use_parser was not
    # called.
    def self.parser
      @parser ||= Strategies.load(:parser, @parser_name || Crabfarm.config.parser)
    end

    # Returns the snapshot file path for _name inside
    # `GlobalState.snapshots_path`, using the parser format as extension.
    # _name defaults to the underscored class name.
    def self.snapshot_path(_name=nil)
      _name = self.to_s.underscore if _name.nil?
      File.join(GlobalState.snapshots_path, _name + '.' + parser.format)
    end

    def parser
      self.class.parser
    end

    # _target - object handed to the parser's `preprocess_parsing_target`.
    # _params - hash exposed through #params.
    def initialize(_target, _params)
      reset_fields

      @parsed_data = parser.preprocess_parsing_target _target
      @document = parser.parse @parsed_data
      @params = _params

      super @document
    end

    # Entry point of the reducer, must be overridden by subclasses.
    #
    # Raises NotImplementedError when not overridden.
    def run
      raise NotImplementedError.new
    end

    # Writes the preprocessed (not yet parsed) data to the snapshot file
    # for _name, creating the containing directory when needed.
    #
    # Returns the path of the written file.
    def take_snapshot(_name=nil)
      file_path = self.class.snapshot_path _name

      FileUtils.mkpath File.dirname(file_path)

      File.write file_path, @parsed_data
      file_path
    end

    # Same as #take_snapshot, but always raises ArgumentError afterwards,
    # reporting where the snapshot was stored.
    def take_snapshot_and_fail(_name=nil)
      file_path = take_snapshot _name
      raise ArgumentError.new "New snapshot for #{self.class.to_s} generated in '#{file_path}'"
    end

    # The json representation of a reducer is its field hash.
    def as_json(_options=nil)
      field_hash
    end

    # Delegator contract: the delegation target is the parsed document.
    def __getobj__
      @document
    end

    def __setobj__(obj)
      @document = obj
    end

  end
end
@@ -0,0 +1,17 @@
1
+ require "crabfarm/assertion/fields"
2
+
3
module Crabfarm
  # Plain value holder built on top of Assertion::Fields.
  class BaseStruct
    include Assertion::Fields

    # Resets the fields and then assigns every key/value pair in _values
    # through the matching "<key>=" writer.
    def initialize(_values={})
      reset_fields

      _values.each do |name, value|
        send "#{name}=", value
      end
    end

    # The json representation of a struct is its field hash.
    def as_json(_options=nil)
      field_hash
    end

  end
end
data/lib/crabfarm/cli.rb CHANGED
@@ -85,23 +85,33 @@ module Crabfarm
85
85
  end
86
86
  end
87
87
 
88
- c.desc "Generates a new crabfarm parser and parser spec"
89
- c.command :parser do |parser|
90
- parser.action do |global_options,options,args|
88
+ c.desc "Generates a new crabfarm navigator and navigator spec"
89
+ c.command :navigator do |sub|
90
+ sub.action do |global_options,options,args|
91
91
  next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?
92
92
 
93
93
  require "crabfarm/modes/generator"
94
- Crabfarm::Modes::Generator.generate_parser(GlobalState.app_path, args[0])
94
+ Crabfarm::Modes::Generator.generate_navigator(GlobalState.app_path, args[0])
95
95
  end
96
96
  end
97
97
 
98
- c.desc "Generates a new crabfarm state and parser spec"
99
- c.command :state do |parser|
100
- parser.action do |global_options,options,args|
98
+ c.desc "Generates a new crabfarm reducer and reducer spec"
99
+ c.command :reducer do |sub|
100
+ sub.action do |global_options,options,args|
101
101
  next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?
102
102
 
103
103
  require "crabfarm/modes/generator"
104
- Crabfarm::Modes::Generator.generate_state(GlobalState.app_path, args[0])
104
+ Crabfarm::Modes::Generator.generate_reducer(GlobalState.app_path, args[0])
105
+ end
106
+ end
107
+
108
c.desc "Generates a new crabfarm struct"
# Registered as :struct so it does not collide with the reducer generator
# command, which already owns the :reducer name.
c.command :struct do |sub|
  sub.action do |global_options,options,args|
    next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?

    # Loaded lazily, only when the generator is actually invoked.
    require "crabfarm/modes/generator"
    Crabfarm::Modes::Generator.generate_struct(GlobalState.app_path, args[0])
  end
end
107
117
  end
@@ -5,21 +5,20 @@ module Crabfarm
5
5
  class Option < Struct.new(:name, :type, :text); end
6
6
 
7
7
  OPTIONS = [
8
- [:browser_dsl, :string, 'Default browser dsl used by states'],
9
- [:parser_engine, :string, 'Default parser engine used by parsers'],
10
- [:output_builder, :string, 'Default json output builder used by states'],
11
- [:driver_factory, :mixed, 'Driver factory, disabled if phantom_mode is used'],
8
+ # Global options
9
+ [:browser, ['chrome', 'firefox', 'phantomjs', 'remote'], 'Browser engine to be used by navigators, common options: phantomjs, chrome, firefox, remote.'],
10
+ [:parser, :string, 'Default parser engine used by reducers'],
12
11
  [:log_path, :string, 'Path where logs should be stored'],
13
12
  [:proxy, :string, 'If given, a proxy is used to connect to the internet if driver supports it'],
14
13
 
15
- # Default driver configuration parameters
16
- [:driver, ['chrome', 'firefox', 'phantomjs', 'remote'], 'Webdriver to be user, common options: chrome, firefox, phantomjs, remote.'],
17
- [:driver_host, :string, 'Remote host, only available in driver: remote'],
18
- [:driver_port, :integer, 'Remote port, only available in driver: remote'],
19
- [:driver_capabilities, :mixed, 'Driver capabilities, depends on selected driver.'],
20
- [:driver_remote_timeout, :float, 'Request timeout in seconds, only available for remote or phatomjs driver.'],
21
- [:driver_window_width, :integer, 'Initial browser window width.'],
22
- [:driver_window_height, :integer, 'Initial browser window height.'],
14
+ # Webdriver configuration parameters
15
+ [:webdriver_host, :string, 'Remote host, only available in driver: remote'],
16
+ [:webdriver_port, :integer, 'Remote port, only available in driver: remote'],
17
+ [:webdriver_capabilities, :mixed, 'Driver capabilities, depends on selected driver.'],
18
+ [:webdriver_remote_timeout, :float, 'Request timeout in seconds, only available for remote or phatomjs driver.'],
19
+ [:webdriver_window_width, :integer, 'Initial browser window width.'],
20
+ [:webdriver_window_height, :integer, 'Initial browser window height.'],
21
+ [:webdriver_dsl, :string, 'Webdriver wrapper to use, built in options are watir and surfer'],
23
22
 
24
23
  # Phantom launcher configuration
25
24
  [:phantom_load_images, :boolean, 'Phantomjs image loading, only for phantomjs driver.'],
@@ -50,19 +49,18 @@ module Crabfarm
50
49
 
51
50
  def reset
52
51
  @values = {
53
- browser_dsl: :surfer,
54
- parser_engine: :nokogiri,
55
- output_builder: :hash,
52
+ browser: 'phantomjs',
53
+ parser: :nokogiri,
56
54
  driver_factory: nil,
57
55
  log_path: nil,
58
56
  proxy: nil,
59
- driver: 'phantomjs',
60
- driver_capabilities: Selenium::WebDriver::Remote::Capabilities.firefox,
61
- driver_host: 'localhost',
62
- driver_port: '8080',
63
- driver_remote_timeout: 120,
64
- driver_window_width: 1280,
65
- driver_window_height: 800,
57
+ webdriver_capabilities: nil,
58
+ webdriver_host: 'localhost',
59
+ webdriver_port: '8080',
60
+ webdriver_remote_timeout: 120,
61
+ webdriver_window_width: 1280,
62
+ webdriver_window_height: 800,
63
+ webdriver_dsl: :surfer,
66
64
  phantom_load_images: false,
67
65
  phantom_ssl: 'any',
68
66
  phantom_bin_path: 'phantomjs',
@@ -75,38 +73,13 @@ module Crabfarm
75
73
  @values.merge! _options
76
74
  end
77
75
 
78
- def driver_remote_host
79
- if driver_host then nil
80
- elsif driver_port then "http://#{driver_host}"
81
- else "http://#{driver_host}:#{driver_port}"
76
# Builds the remote webdriver url from the webdriver_host and
# webdriver_port options.
#
# Returns nil when no host is set, "http://host" when only the host is set
# and "http://host:port" when both are set.
def webdriver_remote_host
  if webdriver_host.nil? then nil
  elsif webdriver_port.nil? then "http://#{webdriver_host}"
  else "http://#{webdriver_host}:#{webdriver_port}"
  end
end
84
82
 
85
- def driver_config
86
- {
87
- name: driver,
88
- proxy: proxy,
89
- capabilities: driver_capabilities,
90
- remote_host: driver_remote_host,
91
- remote_timeout: driver_remote_timeout,
92
- window_width: driver_window_width,
93
- window_height: driver_window_height
94
- }
95
- end
96
-
97
- def phantom_mode_enabled?
98
- driver.to_s == 'phantomjs'
99
- end
100
-
101
- def phantom_config
102
- {
103
- load_images: phantom_load_images,
104
- proxy: proxy,
105
- ssl: phantom_ssl,
106
- bin_path: phantom_bin_path
107
- }
108
- end
109
-
110
83
  def crabtrap_config
111
84
  {
112
85
  bin_path: crabtrap_bin_path,
@@ -32,7 +32,7 @@ module Crabfarm
32
32
  private
33
33
 
34
34
  def load_services
35
- init_phantom_if_required
35
+ init_driver_factory
36
36
  init_driver_pool
37
37
  init_http_client
38
38
  end
@@ -43,71 +43,47 @@ module Crabfarm
43
43
  end
44
44
 
45
45
  def unload_services
46
- release_driver_pool
47
46
  release_http_client
48
- release_phantom
49
- end
50
-
51
- def init_driver_pool
52
- @pool = DriverBucketPool.new build_driver_factory if @pool.nil?
47
+ release_driver_pool
48
+ release_driver_factory
53
49
  end
54
50
 
55
- def release_driver_pool
56
- @pool.release unless @pool.nil?
57
- @pool = nil
51
+ def init_driver_factory
52
+ if @factory.nil?
53
+ @factory = Strategies.load(:browser, config.browser).new proxy
54
+ @factory.prepare_driver_services
55
+ end
58
56
  end
59
57
 
60
- def init_phantom_if_required
61
- if config.phantom_mode_enabled? and @phantom.nil?
62
- @phantom = load_and_start_phantom
63
- end
58
# Lets the current driver factory (if any) clean up its driver services and
# then forgets it, so a later init_driver_factory builds a fresh one.
def release_driver_factory
  @factory.cleanup_driver_services unless @factory.nil?
  @factory = nil
end
65
62
 
66
- def load_and_start_phantom
67
- phantom_port = Utils::PortDiscovery.find_available_port
68
- new_phantom = PhantomRunner.new phantom_config.merge(port: phantom_port)
69
- new_phantom.start
70
- return new_phantom
63
+ def init_driver_pool
64
+ @pool = DriverPool.new @factory if @pool.nil?
71
65
  end
72
66
 
73
- def release_phantom
74
- @phantom.stop unless @phantom.nil?
75
- @phantom = nil
67
+ def release_driver_pool
68
+ @pool.release unless @pool.nil?
69
+ @pool = nil
76
70
  end
77
71
 
78
72
  def init_http_client
79
- @http = build_http_client if @http.nil?
73
+ @http = HttpClient.new proxy if @http.nil?
80
74
  end
81
75
 
82
76
  def release_http_client
83
77
  @http = nil
84
78
  end
85
79
 
86
- def build_driver_factory
87
- if @phantom
88
- PhantomDriverFactory.new @phantom, driver_config
89
- else
90
- return config.driver_factory if config.driver_factory
91
- DefaultDriverFactory.new driver_config
92
- end
93
- end
94
-
95
- def build_http_client
96
- HttpClient.new config.proxy
80
+ def proxy
81
+ Crabfarm.config.proxy
97
82
  end
98
83
 
99
84
  def config
100
85
  Crabfarm.config
101
86
  end
102
87
 
103
- def driver_config
104
- config.driver_config
105
- end
106
-
107
- def phantom_config
108
- config.phantom_config
109
- end
110
-
111
88
  end
112
-
113
89
  end
@@ -57,10 +57,6 @@ module Crabfarm
57
57
  @port = nil
58
58
  end
59
59
 
60
- def build_http_client
61
- HttpClient.new proxy_address
62
- end
63
-
64
60
  def start_daemon
65
61
  if @runner.nil?
66
62
  options = {
@@ -69,7 +65,7 @@ module Crabfarm
69
65
  port: @port
70
66
  }
71
67
 
72
- @runner = CrabtrapRunner.new Crabfarm.config.crabtrap_config.merge(options)
68
+ @runner = CrabtrapRunner.new config.crabtrap_config.merge(options)
73
69
  @runner.start
74
70
  end
75
71
  end
@@ -81,12 +77,9 @@ module Crabfarm
81
77
  else nil end
82
78
  end
83
79
 
84
- def driver_config
85
- super.merge(proxy: proxy_address)
86
- end
87
-
88
- def phantom_config
89
- super.merge(proxy: proxy_address)
80
+ def proxy
81
+ # just step over configuration proxy
82
+ proxy_address
90
83
  end
91
84
 
92
85
  def proxy_address