crabfarm 0.2.5 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/lib/crabfarm.rb +17 -18
  3. data/lib/crabfarm/adapters/browser/abstract_webdriver.rb +60 -0
  4. data/lib/crabfarm/adapters/browser/chrome.rb +24 -0
  5. data/lib/crabfarm/adapters/browser/firefox.rb +26 -0
  6. data/lib/crabfarm/adapters/browser/noop.rb +25 -0
  7. data/lib/crabfarm/adapters/browser/phantom_js.rb +59 -0
  8. data/lib/crabfarm/adapters/browser/remote_webdriver.rb +31 -0
  9. data/lib/crabfarm/adapters/driver_wrapper/capybara.rb +11 -0
  10. data/lib/crabfarm/adapters/driver_wrapper/surfer.rb +13 -0
  11. data/lib/crabfarm/adapters/{browser → driver_wrapper}/watir.rb +7 -3
  12. data/lib/crabfarm/adapters/parser/nokogiri.rb +17 -15
  13. data/lib/crabfarm/adapters/parser/pdf_reader.rb +14 -12
  14. data/lib/crabfarm/assertion/fields.rb +85 -0
  15. data/lib/crabfarm/base_navigator.rb +78 -0
  16. data/lib/crabfarm/base_reducer.rb +68 -0
  17. data/lib/crabfarm/base_struct.rb +17 -0
  18. data/lib/crabfarm/cli.rb +18 -8
  19. data/lib/crabfarm/configuration.rb +24 -51
  20. data/lib/crabfarm/context.rb +19 -43
  21. data/lib/crabfarm/crabtrap_context.rb +4 -11
  22. data/lib/crabfarm/driver_pool.rb +32 -0
  23. data/lib/crabfarm/dsl/surfer/surf_context.rb +5 -25
  24. data/lib/crabfarm/engines/async_state_manager.rb +1 -1
  25. data/lib/crabfarm/engines/sync_state_manager.rb +1 -1
  26. data/lib/crabfarm/forked_navigator.rb +31 -0
  27. data/lib/crabfarm/modes/console.rb +4 -4
  28. data/lib/crabfarm/modes/generator.rb +24 -11
  29. data/lib/crabfarm/rspec.rb +26 -24
  30. data/lib/crabfarm/strategies.rb +15 -9
  31. data/lib/crabfarm/templates/Crabfile.erb +21 -26
  32. data/lib/crabfarm/templates/Gemfile.erb +6 -0
  33. data/lib/crabfarm/templates/navigator.rb.erb +20 -0
  34. data/lib/crabfarm/templates/{state_spec.rb.erb → navigator_spec.rb.erb} +1 -1
  35. data/lib/crabfarm/templates/{parser.rb.erb → reducer.rb.erb} +4 -4
  36. data/lib/crabfarm/templates/{parser_spec.rb.erb → reducer_spec.rb.erb} +1 -1
  37. data/lib/crabfarm/templates/struct.rb.erb +12 -0
  38. data/lib/crabfarm/transition_service.rb +20 -7
  39. data/lib/crabfarm/version.rb +1 -1
  40. metadata +50 -48
  41. data/lib/crabfarm/adapters/browser/capybara.rb +0 -7
  42. data/lib/crabfarm/adapters/browser/surfer.rb +0 -9
  43. data/lib/crabfarm/adapters/output/hash.rb +0 -11
  44. data/lib/crabfarm/adapters/output/jbuilder.rb +0 -11
  45. data/lib/crabfarm/adapters/output/ostruct.rb +0 -14
  46. data/lib/crabfarm/base_parser.rb +0 -59
  47. data/lib/crabfarm/base_state.rb +0 -112
  48. data/lib/crabfarm/default_driver_factory.rb +0 -86
  49. data/lib/crabfarm/driver_bucket.rb +0 -42
  50. data/lib/crabfarm/driver_bucket_pool.rb +0 -26
  51. data/lib/crabfarm/forked_state.rb +0 -38
  52. data/lib/crabfarm/mocks/noop_driver.rb +0 -6
  53. data/lib/crabfarm/phantom_driver_factory.rb +0 -33
  54. data/lib/crabfarm/templates/state.rb.erb +0 -8
@@ -0,0 +1,78 @@
1
+ require 'thwait'
2
+ require 'crabfarm/forked_navigator'
3
+ require "crabfarm/assertion/context"
4
+
5
module Crabfarm
  # Base class for navigators.
  #
  # A navigator holds a crawling context plus the request parameters, exposes
  # the context's browser/http/store helpers and hands targets over to
  # reducers. Subclasses must override #run.
  class BaseNavigator
    include Assertion::Context
    extend Forwardable

    attr_reader :params

    def_delegators '@context', :http
    def_delegators '@context.store', :get, :fetch

    # _context - object providing +pool+, +http+ and +store+.
    # _params  - parameters, exposed through #params and merged into the
    #            options given to every reducer.
    def initialize(_context, _params)
      @context = _context
      @params = _params
    end

    # Returns the driver registered under _name in the context's pool
    # (nil is forwarded as-is to the pool).
    def browser(_name=nil)
      @context.pool.driver(_name)
    end

    # Performs a GET on _url through the context's http client and returns
    # the response body.
    def download(_url)
      @context.http.get(_url).body
    end

    # Entry point, must be implemented by subclasses.
    def run
      raise NotImplementedError
    end

    # Builds and runs a reducer over _target.
    #
    # _target  - object to reduce; defaults to the default browser. When a
    #            Hash is given in this position it is taken as _options.
    # _options - extra reducer params. The :using key selects the reducer
    #            (class, String or Symbol) and defaults to this navigator's
    #            class name.
    #
    # Returns the reducer instance after it has been run.
    def reduce(_target=nil, _options={})
      if _target.is_a? Hash
        _options = _target
        _target = nil
      end
      _target = browser if _target.nil?

      # Work on a copy: extracting :using must not alter the caller's hash.
      options = _options.dup
      reducer_class = options.delete(:using) || self.class.name

      reduce_using(reducer_class, _target, options)
    end

    alias :reduce_with_defaults :reduce

    # Evaluates _block once per value yielded by _enumerator, every
    # evaluation in its own thread and in the context of a ForkedNavigator
    # bound to a session named "th_session_<n>" (n starts at 1). Blocks until
    # every thread has finished.
    def fork_each(_enumerator, &_block)
      session_id = 0
      mutex = Mutex.new # single mutex shared by all forks
      threads = _enumerator.map do |value|
        session_id += 1
        start_forked_navigation("th_session_#{session_id}", value, _block, mutex)
      end
      ThreadsWait.all_waits(*threads)
    end

    private

    # Instantiates _reducer_class with _target and the navigator params
    # merged with _options, runs it and returns it. String/Symbol names are
    # decoded and resolved to the matching "...Reducer" constant.
    def reduce_using(_reducer_class, _target, _options={})
      case _reducer_class
      when String, Symbol
        class_name = Utils::Naming.decode_crabfarm_uri(_reducer_class.to_s) + 'Reducer'
        _reducer_class = class_name.constantize
      end

      reducer = _reducer_class.new _target, @params.merge(_options)
      reducer.run
      reducer
    end

    # Spawns the thread backing one fork_each iteration. The pool entry named
    # _name is always reset when the block finishes, even if it raises.
    def start_forked_navigation(_name, _value, _block, _mutex)
      Thread.new do
        forked = ForkedNavigator.new @context, self, _name, _mutex
        begin
          forked.instance_exec _value, &_block
        ensure
          @context.pool.reset _name
        end
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
+ require "crabfarm/assertion/fields"
2
+
3
module Crabfarm
  # Base class for reducers.
  #
  # A reducer parses a target with the configured parser strategy and then
  # delegates every unknown message to the parsed document. Subclasses must
  # override #run.
  class BaseReducer < Delegator
    include Assertion::Fields

    attr_reader :params, :document

    # Selects the parser strategy used by this reducer class.
    # Any parser already memoized by .parser is dropped so the new name is
    # honoured on the next access.
    def self.use_parser(_parser_name)
      @parser = nil
      @parser_name = _parser_name
    end

    # Returns the (memoized) parser strategy: the one chosen with .use_parser
    # or, when none was chosen, the one named in Crabfarm.config.parser.
    def self.parser
      @parser ||= Strategies.load(:parser, @parser_name || Crabfarm.config.parser)
    end

    # Returns the path of the snapshot called _name (defaults to the
    # underscored class name) inside GlobalState.snapshots_path, using the
    # parser's format as file extension.
    def self.snapshot_path(_name=nil)
      _name = self.to_s.underscore if _name.nil?
      File.join(GlobalState.snapshots_path, "#{_name}.#{parser.format}")
    end

    # Instance-level shortcut to the class parser.
    def parser
      self.class.parser
    end

    # _target - object handed to the parser's preprocessing step.
    # _params - parameters, exposed through #params.
    def initialize(_target, _params)
      reset_fields

      @parsed_data = parser.preprocess_parsing_target _target
      @document = parser.parse @parsed_data
      @params = _params

      super @document
    end

    # Entry point, must be implemented by subclasses.
    def run
      raise NotImplementedError
    end

    # Writes the preprocessed target data to the snapshot file for _name,
    # creating the containing directory when needed. Returns the file path.
    def take_snapshot(_name=nil)
      file_path = self.class.snapshot_path _name

      # File.dirname copes with root-level and bare file names alike.
      FileUtils.mkpath File.dirname(file_path)

      File.write file_path, @parsed_data
      file_path
    end

    # Same as #take_snapshot but always raises ArgumentError afterwards,
    # reporting where the snapshot was stored.
    def take_snapshot_and_fail(_name=nil)
      file_path = take_snapshot _name
      raise ArgumentError, "New snapshot for #{self.class} generated in '#{file_path}'"
    end

    # JSON representation: the hash of declared fields.
    def as_json(_options=nil)
      field_hash
    end

    # Delegator hook: object receiving the delegated calls.
    def __getobj__
      @document
    end

    # Delegator hook: replaces the delegation target.
    def __setobj__(obj)
      @document = obj
    end

  end
end
@@ -0,0 +1,17 @@
1
+ require "crabfarm/assertion/fields"
2
+
3
module Crabfarm
  # Plain value holder built on top of Assertion::Fields.
  class BaseStruct
    include Assertion::Fields

    # Resets the fields and then assigns every key/value pair of _values
    # through the matching "<key>=" writer.
    def initialize(_values={})
      reset_fields

      _values.each do |name, value|
        send("#{name}=", value)
      end
    end

    # JSON representation: the hash of declared fields.
    def as_json(_options=nil)
      field_hash
    end

  end
end
data/lib/crabfarm/cli.rb CHANGED
@@ -85,23 +85,33 @@ module Crabfarm
85
85
  end
86
86
  end
87
87
 
88
- c.desc "Generates a new crabfarm parser and parser spec"
89
- c.command :parser do |parser|
90
- parser.action do |global_options,options,args|
88
+ c.desc "Generates a new crabfarm navigator and navigator spec"
89
+ c.command :navigator do |sub|
90
+ sub.action do |global_options,options,args|
91
91
  next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?
92
92
 
93
93
  require "crabfarm/modes/generator"
94
- Crabfarm::Modes::Generator.generate_parser(GlobalState.app_path, args[0])
94
+ Crabfarm::Modes::Generator.generate_navigator(GlobalState.app_path, args[0])
95
95
  end
96
96
  end
97
97
 
98
- c.desc "Generates a new crabfarm state and parser spec"
99
- c.command :state do |parser|
100
- parser.action do |global_options,options,args|
98
+ c.desc "Generates a new crabfarm reducer and reducer spec"
99
+ c.command :reducer do |sub|
100
+ sub.action do |global_options,options,args|
101
101
  next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?
102
102
 
103
103
  require "crabfarm/modes/generator"
104
- Crabfarm::Modes::Generator.generate_state(GlobalState.app_path, args[0])
104
+ Crabfarm::Modes::Generator.generate_reducer(GlobalState.app_path, args[0])
105
+ end
106
+ end
107
+
108
# Struct generator. Registered under its own :struct name so it does not
# clash with the :reducer generator command.
c.desc "Generates a new crabfarm struct"
c.command :struct do |sub|
  sub.action do |global_options,options,args|
    next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?

    # Loaded lazily, only when the command is actually executed.
    require "crabfarm/modes/generator"
    Crabfarm::Modes::Generator.generate_struct(GlobalState.app_path, args[0])
  end
end
107
117
  end
@@ -5,21 +5,20 @@ module Crabfarm
5
5
  class Option < Struct.new(:name, :type, :text); end
6
6
 
7
7
  OPTIONS = [
8
- [:browser_dsl, :string, 'Default browser dsl used by states'],
9
- [:parser_engine, :string, 'Default parser engine used by parsers'],
10
- [:output_builder, :string, 'Default json output builder used by states'],
11
- [:driver_factory, :mixed, 'Driver factory, disabled if phantom_mode is used'],
8
+ # Global options
9
+ [:browser, ['chrome', 'firefox', 'phantomjs', 'remote'], 'Browser engine to be used by navigators, common options: phantomjs, chrome, firefox, remote.'],
10
+ [:parser, :string, 'Default parser engine used by reducers'],
12
11
  [:log_path, :string, 'Path where logs should be stored'],
13
12
  [:proxy, :string, 'If given, a proxy is used to connect to the internet if driver supports it'],
14
13
 
15
- # Default driver configuration parameters
16
- [:driver, ['chrome', 'firefox', 'phantomjs', 'remote'], 'Webdriver to be user, common options: chrome, firefox, phantomjs, remote.'],
17
- [:driver_host, :string, 'Remote host, only available in driver: remote'],
18
- [:driver_port, :integer, 'Remote port, only available in driver: remote'],
19
- [:driver_capabilities, :mixed, 'Driver capabilities, depends on selected driver.'],
20
- [:driver_remote_timeout, :float, 'Request timeout in seconds, only available for remote or phatomjs driver.'],
21
- [:driver_window_width, :integer, 'Initial browser window width.'],
22
- [:driver_window_height, :integer, 'Initial browser window height.'],
14
+ # Webdriver configuration parameters
15
+ [:webdriver_host, :string, 'Remote host, only available in driver: remote'],
16
+ [:webdriver_port, :integer, 'Remote port, only available in driver: remote'],
17
+ [:webdriver_capabilities, :mixed, 'Driver capabilities, depends on selected driver.'],
18
+ [:webdriver_remote_timeout, :float, 'Request timeout in seconds, only available for remote or phatomjs driver.'],
19
+ [:webdriver_window_width, :integer, 'Initial browser window width.'],
20
+ [:webdriver_window_height, :integer, 'Initial browser window height.'],
21
+ [:webdriver_dsl, :string, 'Webdriver wrapper to use, built in options are watir and surfer'],
23
22
 
24
23
  # Phantom launcher configuration
25
24
  [:phantom_load_images, :boolean, 'Phantomjs image loading, only for phantomjs driver.'],
@@ -50,19 +49,18 @@ module Crabfarm
50
49
 
51
50
  def reset
52
51
  @values = {
53
- browser_dsl: :surfer,
54
- parser_engine: :nokogiri,
55
- output_builder: :hash,
52
+ browser: 'phantomjs',
53
+ parser: :nokogiri,
56
54
  driver_factory: nil,
57
55
  log_path: nil,
58
56
  proxy: nil,
59
- driver: 'phantomjs',
60
- driver_capabilities: Selenium::WebDriver::Remote::Capabilities.firefox,
61
- driver_host: 'localhost',
62
- driver_port: '8080',
63
- driver_remote_timeout: 120,
64
- driver_window_width: 1280,
65
- driver_window_height: 800,
57
+ webdriver_capabilities: nil,
58
+ webdriver_host: 'localhost',
59
+ webdriver_port: '8080',
60
+ webdriver_remote_timeout: 120,
61
+ webdriver_window_width: 1280,
62
+ webdriver_window_height: 800,
63
+ webdriver_dsl: :surfer,
66
64
  phantom_load_images: false,
67
65
  phantom_ssl: 'any',
68
66
  phantom_bin_path: 'phantomjs',
@@ -75,38 +73,13 @@ module Crabfarm
75
73
  @values.merge! _options
76
74
  end
77
75
 
78
- def driver_remote_host
79
- if driver_host then nil
80
- elsif driver_port then "http://#{driver_host}"
81
- else "http://#{driver_host}:#{driver_port}"
76
# Builds the remote webdriver base URL from the webdriver_host and
# webdriver_port options.
#
# Returns nil when no host is configured, "http://<host>" when only the host
# is configured and "http://<host>:<port>" when both are.
def webdriver_remote_host
  return nil if webdriver_host.nil?
  return "http://#{webdriver_host}" if webdriver_port.nil?

  "http://#{webdriver_host}:#{webdriver_port}"
end
84
82
 
85
- def driver_config
86
- {
87
- name: driver,
88
- proxy: proxy,
89
- capabilities: driver_capabilities,
90
- remote_host: driver_remote_host,
91
- remote_timeout: driver_remote_timeout,
92
- window_width: driver_window_width,
93
- window_height: driver_window_height
94
- }
95
- end
96
-
97
- def phantom_mode_enabled?
98
- driver.to_s == 'phantomjs'
99
- end
100
-
101
- def phantom_config
102
- {
103
- load_images: phantom_load_images,
104
- proxy: proxy,
105
- ssl: phantom_ssl,
106
- bin_path: phantom_bin_path
107
- }
108
- end
109
-
110
83
  def crabtrap_config
111
84
  {
112
85
  bin_path: crabtrap_bin_path,
@@ -32,7 +32,7 @@ module Crabfarm
32
32
  private
33
33
 
34
34
  def load_services
35
- init_phantom_if_required
35
+ init_driver_factory
36
36
  init_driver_pool
37
37
  init_http_client
38
38
  end
@@ -43,71 +43,47 @@ module Crabfarm
43
43
  end
44
44
 
45
45
  def unload_services
46
- release_driver_pool
47
46
  release_http_client
48
- release_phantom
49
- end
50
-
51
- def init_driver_pool
52
- @pool = DriverBucketPool.new build_driver_factory if @pool.nil?
47
+ release_driver_pool
48
+ release_driver_factory
53
49
  end
54
50
 
55
- def release_driver_pool
56
- @pool.release unless @pool.nil?
57
- @pool = nil
51
# Lazily builds the browser driver factory: loads the :browser strategy named
# in the configuration, instantiates it with the current proxy and asks it to
# prepare its driver services. Does nothing when a factory already exists.
def init_driver_factory
  return unless @factory.nil?

  browser_strategy = Strategies.load(:browser, config.browser)
  @factory = browser_strategy.new(proxy)
  @factory.prepare_driver_services
end
59
57
 
60
- def init_phantom_if_required
61
- if config.phantom_mode_enabled? and @phantom.nil?
62
- @phantom = load_and_start_phantom
63
- end
58
# Asks the driver factory (if any) to clean up its driver services and then
# forgets it, so a later init_driver_factory builds a fresh one.
def release_driver_factory
  @factory.cleanup_driver_services unless @factory.nil?
  @factory = nil
end
65
62
 
66
- def load_and_start_phantom
67
- phantom_port = Utils::PortDiscovery.find_available_port
68
- new_phantom = PhantomRunner.new phantom_config.merge(port: phantom_port)
69
- new_phantom.start
70
- return new_phantom
63
# Creates the driver pool on top of the current driver factory, unless a pool
# has already been created.
def init_driver_pool
  return unless @pool.nil?

  @pool = DriverPool.new(@factory)
end
72
66
 
73
- def release_phantom
74
- @phantom.stop unless @phantom.nil?
75
- @phantom = nil
67
# Releases the driver pool (when there is one) and forgets it.
def release_driver_pool
  unless @pool.nil?
    @pool.release
  end

  @pool = nil
end
77
71
 
78
72
  def init_http_client
79
- @http = build_http_client if @http.nil?
73
+ @http = HttpClient.new proxy if @http.nil?
80
74
  end
81
75
 
82
76
  def release_http_client
83
77
  @http = nil
84
78
  end
85
79
 
86
- def build_driver_factory
87
- if @phantom
88
- PhantomDriverFactory.new @phantom, driver_config
89
- else
90
- return config.driver_factory if config.driver_factory
91
- DefaultDriverFactory.new driver_config
92
- end
93
- end
94
-
95
- def build_http_client
96
- HttpClient.new config.proxy
80
+ def proxy
81
+ Crabfarm.config.proxy
97
82
  end
98
83
 
99
84
  def config
100
85
  Crabfarm.config
101
86
  end
102
87
 
103
- def driver_config
104
- config.driver_config
105
- end
106
-
107
- def phantom_config
108
- config.phantom_config
109
- end
110
-
111
88
  end
112
-
113
89
  end
@@ -57,10 +57,6 @@ module Crabfarm
57
57
  @port = nil
58
58
  end
59
59
 
60
- def build_http_client
61
- HttpClient.new proxy_address
62
- end
63
-
64
60
  def start_daemon
65
61
  if @runner.nil?
66
62
  options = {
@@ -69,7 +65,7 @@ module Crabfarm
69
65
  port: @port
70
66
  }
71
67
 
72
- @runner = CrabtrapRunner.new Crabfarm.config.crabtrap_config.merge(options)
68
+ @runner = CrabtrapRunner.new config.crabtrap_config.merge(options)
73
69
  @runner.start
74
70
  end
75
71
  end
@@ -81,12 +77,9 @@ module Crabfarm
81
77
  else nil end
82
78
  end
83
79
 
84
- def driver_config
85
- super.merge(proxy: proxy_address)
86
- end
87
-
88
- def phantom_config
89
- super.merge(proxy: proxy_address)
80
+ def proxy
81
+ # just step over configuration proxy
82
+ proxy_address
90
83
  end
91
84
 
92
85
  def proxy_address