crabfarm 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 49343d990ddd94c9640f37040a6fe5fc9a97ab49
4
- data.tar.gz: f75e89f9dd4b5154f01df31837b98981fed38a3c
3
+ metadata.gz: c10577d223677baa807f27aa55fc8f2ccf19f189
4
+ data.tar.gz: 7d07015295fe0325a85d73396821872e0e3542d3
5
5
  SHA512:
6
- metadata.gz: f841a5d9a50476e38b7a1252b3de75fc132ffcbc4ecb7bd25f49cbfad976eab75f825f47c5ca1afa9c41c53f4713baeb2da8578ed233b2be40897b25f859348f
7
- data.tar.gz: c183db189f4968561409c0eea25a54a7ad4fb015b7d8cd3abcceefe3edc7790392626b5944b4bfa5d32b06bf074f357b00867762fb8ee3e1a97889cb0618e778
6
+ metadata.gz: 45015ef058a82381fc1db17791bb99c362c37c8bf0cf24e29d0862c76c44f006c47e60f6d2689b545be7ec564ce05857565ca7bc00d43f592cba0f58a58f658e
7
+ data.tar.gz: 83dc7148c8ed7b793290b0c197ad6ad10b1853d196d5c650a471cd4a26531d424177a303d82a25a4780a806d0b075683234318bef9f10f0ed1283f03f807d4df
data/lib/crabfarm.rb CHANGED
@@ -7,7 +7,7 @@ require "selenium-webdriver"
7
7
  require "crabfarm/version"
8
8
  require "crabfarm/errors"
9
9
  require "crabfarm/configuration"
10
- require "crabfarm/transition_service"
10
+ require "crabfarm/global_state"
11
11
  require "crabfarm/driver_bucket"
12
12
  require "crabfarm/driver_bucket_pool"
13
13
  require "crabfarm/http_client"
@@ -16,10 +16,14 @@ require "crabfarm/phantom_driver_factory"
16
16
  require "crabfarm/phantom_runner"
17
17
  require "crabfarm/state_store"
18
18
  require "crabfarm/context"
19
+ require "crabfarm/context_factory"
20
+ require "crabfarm/transition_service"
19
21
  require "crabfarm/base_state"
20
22
  require "crabfarm/base_parser"
21
23
  require "crabfarm/strategies"
22
24
 
25
+ require "crabfarm/utils/port_discovery"
26
+
23
27
  module Crabfarm
24
28
 
25
29
  @@config = Configuration.new
data/lib/crabfarm/cli.rb CHANGED
@@ -10,28 +10,20 @@ module Crabfarm
10
10
  desc "Starts the crawler in console mode"
11
11
  command [:console, :c] do |c|
12
12
 
13
- c.desc "Capture to crabtrap file"
14
- c.flag :capture
15
-
16
- c.desc "Replay from crabtrap file"
17
- c.flag :replay
13
+ c.desc "Use a recorded memento as data source, requires crabtrap"
14
+ c.flag [:m, :memento]
18
15
 
19
16
  Support::GLI.generate_options c
20
17
 
21
18
  c.action do |global_options,options,args|
22
- next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
19
+ next puts "This command can only be run inside a crabfarm application" unless GlobalState.inside_crawler_app?
23
20
 
24
21
  Crabfarm.config.set Support::GLI.parse_options options
25
22
 
26
- next puts "Cannot use --replay with --capture" if options[:capture] and options[:replay]
27
-
28
- require 'crabfarm/crabtrap_context'
29
- context = Crabfarm::CrabtrapContext.new
30
- context.capture options[:capture] if options[:capture]
31
- context.replay options[:replay] if options[:replay]
32
-
33
- require "crabfarm/modes/console"
34
- Crabfarm::Modes::Console.start context
23
+ ContextFactory.with_context options[:memento] do |context|
24
+ require "crabfarm/modes/console"
25
+ Crabfarm::Modes::Console.process_input context
26
+ end
35
27
  end
36
28
  end
37
29
 
@@ -46,6 +38,9 @@ module Crabfarm
46
38
  c.desc "Set the server min and max threads, defaults to 0:16"
47
39
  c.flag [:t,:threads]
48
40
 
41
+ c.desc "Use a recorded memento as data source, requires crabtrap"
42
+ c.flag [:m, :memento]
43
+
49
44
  c.desc "Start the server in verbose mode"
50
45
  c.switch :verbose, :default_value => false
51
46
 
@@ -55,18 +50,22 @@ module Crabfarm
55
50
  Support::GLI.generate_options c
56
51
 
57
52
  c.action do |global_options,options,args|
58
- next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
53
+ next puts "This command can only be run inside a crabfarm application" unless GlobalState.inside_crawler_app?
54
+
55
+ Crabfarm.config.set Support::GLI.parse_options options
56
+
57
+ ActiveSupport::Dependencies.mechanism = :require unless options[:reload]
59
58
 
60
- require "crabfarm/modes/server"
61
59
  server_options = {}
62
60
  server_options[:Host] = options[:host] unless options[:host].nil?
63
61
  server_options[:Port] = options[:port] || 3100
64
62
  server_options[:Threads] = options[:threads] unless options[:threads].nil?
65
63
  server_options[:Verbose] = options[:verbose]
66
64
 
67
- ActiveSupport::Dependencies.mechanism = :require unless options[:reload]
68
- Crabfarm.config.set Support::GLI.parse_options options
69
- Crabfarm::Modes::Server.start server_options
65
+ ContextFactory.with_context options[:memento] do |context|
66
+ require "crabfarm/modes/server"
67
+ Crabfarm::Modes::Server.serve context, server_options
68
+ end
70
69
  end
71
70
  end
72
71
 
@@ -79,6 +78,8 @@ module Crabfarm
79
78
  app.flag [:r, :remote]
80
79
 
81
80
  app.action do |global_options,options,args|
81
+ next puts "This command cannot be run inside a crabfarm application" if GlobalState.inside_crawler_app?
82
+
82
83
  require "crabfarm/modes/generator"
83
84
  Crabfarm::Modes::Generator.generate_app(Dir.pwd, args[0], options[:remote])
84
85
  end
@@ -87,27 +88,34 @@ module Crabfarm
87
88
  c.desc "Generates a new crabfarm parser and parser spec"
88
89
  c.command :parser do |parser|
89
90
  parser.action do |global_options,options,args|
91
+ next puts "This command can only be run inside a crabfarm application" unless GlobalState.inside_crawler_app?
92
+
90
93
  require "crabfarm/modes/generator"
91
- Crabfarm::Modes::Generator.generate_parser(args[0])
94
+ Crabfarm::Modes::Generator.generate_parser(GlobalState.app_path, args[0])
92
95
  end
93
96
  end
94
97
 
95
98
  c.desc "Generates a new crabfarm state and parser spec"
96
99
  c.command :state do |parser|
97
100
  parser.action do |global_options,options,args|
101
+ next puts "This command can only be run inside a crabfarm application" unless GlobalState.inside_crawler_app?
102
+
98
103
  require "crabfarm/modes/generator"
99
- Crabfarm::Modes::Generator.generate_state(args[0])
104
+ Crabfarm::Modes::Generator.generate_state(GlobalState.app_path, args[0])
100
105
  end
101
106
  end
102
107
  end
103
108
 
104
109
  desc "Perform an HTTP recording for use in tests"
105
110
  command [:record, :r] do |c|
111
+ c.desc "Run recorder in playback mode"
112
+ c.switch [:p, :playback], :default_value => false
113
+
106
114
  c.action do |global_options, options, args|
107
- next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
115
+ next puts "This command can only be run inside a crabfarm application" unless GlobalState.inside_crawler_app?
108
116
 
109
117
  require "crabfarm/modes/recorder"
110
- Crabfarm::Modes::Recorder.start args[0]
118
+ Crabfarm::Modes::Recorder.start GlobalState.memento_path(args[0]), options[:playback]
111
119
  end
112
120
  end
113
121
 
@@ -120,12 +128,27 @@ module Crabfarm
120
128
  c.switch :unsafe, :default_value => false
121
129
 
122
130
  c.action do |global_options,options,args|
123
- next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
131
+ next puts "This command can only be run inside a crabfarm application" unless GlobalState.inside_crawler_app?
124
132
 
125
133
  options[:remote] = args[0]
126
134
 
127
135
  require "crabfarm/modes/publisher"
128
- Crabfarm::Modes::Publisher.publish CF_PATH, options
136
+ Crabfarm::Modes::Publisher.publish GlobalState.app_path, options
137
+ end
138
+ end
139
+
140
+ on_error do |exc|
141
+ case exc
142
+ when BinaryMissingError
143
+ if exc.binary == 'phantomjs'
144
+ puts "Could not find the phantomjs binary at '#{exc.path}', try installing it using 'npm install phantomjs -g' or set the propper path in your project's Crabfile"
145
+ false
146
+ elsif exc.binary == 'crabtrap'
147
+ puts "Could not find the crabtrap binary at '#{exc.path}', try installing it using 'npm install crabtrap -g' or set the propper path in your project's Crabfile"
148
+ false
149
+ else true end
150
+ else
151
+ true
129
152
  end
130
153
  end
131
154
 
@@ -25,12 +25,12 @@ module Crabfarm
25
25
  [:phantom_load_images, :boolean, 'Phantomjs image loading, only for phantomjs driver.'],
26
26
  [:phantom_ssl, ['sslv3', 'sslv2', 'tlsv1', 'any'], 'Phantomjs ssl mode: sslv3, sslv2, tlsv1 or any, only for phantomjs driver.'],
27
27
  [:phantom_bin_path, :string, 'Phantomjs binary path, only for phantomjs driver.'],
28
- [:phantom_lock_file, :string, 'Phantomjs lock file path, only for phantomjs driver.'],
29
28
 
30
29
  # Crabtrap launcher configuration
31
30
  [:crabtrap_bin_path, :string, 'Crabtrap binary path.'],
32
- [:crabtrap_port, :integer, 'Crabtrap port, defaults to 4000.'],
33
- [:crabtrap_mode, ['capture', 'replay'], 'Crabtrap operation mode.']
31
+
32
+ # Recorder configuration
33
+ [:recorder_driver, :string, 'Recorder driver name, defaults to \'firefox\'']
34
34
  ]
35
35
  .map { |o| Option.new *o }
36
36
 
@@ -56,7 +56,6 @@ module Crabfarm
56
56
  driver_factory: nil,
57
57
  log_path: nil,
58
58
  proxy: nil,
59
-
60
59
  driver: 'phantomjs',
61
60
  driver_capabilities: Selenium::WebDriver::Remote::Capabilities.firefox,
62
61
  driver_host: 'localhost',
@@ -64,14 +63,11 @@ module Crabfarm
64
63
  driver_remote_timeout: 120,
65
64
  driver_window_width: 1280,
66
65
  driver_window_height: 800,
67
-
68
66
  phantom_load_images: false,
69
67
  phantom_ssl: 'any',
70
68
  phantom_bin_path: 'phantomjs',
71
- phantom_lock_file: nil,
72
-
73
69
  crabtrap_bin_path: 'crabtrap',
74
- crabtrap_port: 4000
70
+ recorder_driver: :firefox
75
71
  }
76
72
  end
77
73
 
@@ -108,7 +104,6 @@ module Crabfarm
108
104
  proxy: proxy,
109
105
  ssl: phantom_ssl,
110
106
  bin_path: phantom_bin_path,
111
- lock_file: phantom_lock_file,
112
107
  log_file: log_path ? File.join(log_path, 'phantom.log') : nil
113
108
  }
114
109
  end
@@ -116,7 +111,6 @@ module Crabfarm
116
111
  def crabtrap_config
117
112
  {
118
113
  bin_path: crabtrap_bin_path,
119
- port: crabtrap_port,
120
114
  proxy: proxy
121
115
  }
122
116
  end
@@ -11,26 +11,45 @@ module Crabfarm
11
11
  @loaded = false
12
12
  end
13
13
 
14
- def load
14
+ def loaded?
15
+ @loaded
16
+ end
17
+
18
+ def prepare
19
+ unless @loaded
20
+ load_services
21
+ @loaded = true
22
+ end
23
+ end
24
+
25
+ def reset
26
+ reset_services if @loaded
27
+ end
28
+
29
+ def release
30
+ unload_services
31
+ @loaded = false
32
+ end
33
+
34
+ private
35
+
36
+ def load_services
15
37
  init_phantom_if_required
16
38
  init_driver_pool
17
39
  init_http_client
18
- @loaded = true
19
40
  end
20
41
 
21
- def reset
42
+ def reset_services
22
43
  @store.reset
23
- @pool.reset unless @pool.nil?
44
+ @pool.reset
24
45
  end
25
46
 
26
- def release
47
+ def unload_services
27
48
  release_driver_pool
49
+ release_http_client
28
50
  release_phantom
29
- @loaded = false
30
51
  end
31
52
 
32
- private
33
-
34
53
  def init_driver_pool
35
54
  @pool = DriverBucketPool.new build_driver_factory if @pool.nil?
36
55
  end
@@ -47,7 +66,8 @@ module Crabfarm
47
66
  end
48
67
 
49
68
  def load_and_start_phantom
50
- new_phantom = PhantomRunner.new phantom_config
69
+ phantom_port = Utils::PortDiscovery.find_available_port
70
+ new_phantom = PhantomRunner.new phantom_config.merge(port: phantom_port)
51
71
  new_phantom.start
52
72
  return new_phantom
53
73
  end
@@ -0,0 +1,31 @@
1
module Crabfarm
  # Builds crawling contexts and manages their prepare/release lifecycle.
  #
  # All methods are also callable on the module itself (see `extend self`).
  module ContextFactory

    # Builds a context, prepares it and yields it to the given block.
    # The context is released when the block finishes, including when
    # `prepare` or the block raises.
    #
    # _memento - optional memento name; when nil a plain context is used.
    #
    # Returns whatever the block returns.
    def with_context(_memento=nil)
      ctx = build_context(_memento)
      begin
        ctx.prepare
        yield ctx
      ensure
        ctx.release
      end
    end

    # Returns a plain Crabfarm::Context when no memento is given, otherwise a
    # crabtrap backed context that replays the named memento.
    def build_context(_memento=nil)
      if _memento.nil?
        Crabfarm::Context.new
      else
        load_crabtrap_context _memento
      end
    end

    # Builds a CrabtrapContext in :replay mode for the named memento.
    #
    # Raises ResourceNotFoundError if the memento file resolved through
    # GlobalState.memento_path does not exist.
    def load_crabtrap_context(_memento)
      # Loaded lazily so crabtrap support is only required when a memento is used.
      require 'crabfarm/crabtrap_context'
      m_path = GlobalState.memento_path _memento
      # The message must interpolate the _memento argument (there is no _name
      # local here); File.exist? is used because File.exists? is no longer
      # available in current rubies.
      raise ResourceNotFoundError.new "Could not find memento '#{_memento}'" unless File.exist? m_path
      Crabfarm::CrabtrapContext.new :replay, m_path
    end

    extend self
  end
end
@@ -4,54 +4,94 @@ require 'crabfarm/crabtrap_runner'
4
4
  module Crabfarm
5
5
  class CrabtrapContext < Context
6
6
 
7
- def load
8
- pass_through if @runner.nil?
9
- super
7
+ attr_accessor :mode
8
+
9
+ def initialize(_mode=:pass, _path=nil)
10
+ @mode = _mode
11
+ @path = _path
10
12
  end
11
13
 
12
14
  def pass_through
13
- restart_with_options(mode: :pass) if @runner.nil? or @runner.mode != :pass
15
+ if not loaded? or @mode != :pass
16
+ @mode = :pass
17
+ @path = nil
18
+ restart
19
+ end
14
20
  end
15
21
 
16
22
  def capture(_path)
17
- restart_with_options(mode: :capture, bucket_path: _path)
23
+ @mode = :capture
24
+ @path = _path
25
+ restart
18
26
  end
19
27
 
20
28
  def replay(_path)
21
- restart_with_options(mode: :replay, bucket_path: _path)
29
+ @mode = :replay
30
+ @path = _path
31
+ restart
22
32
  end
23
33
 
24
- def release
25
- super
26
- stop_daemon
34
+ def restart
35
+ if not loaded?
36
+ prepare
37
+ else
38
+ stop_daemon
39
+ start_daemon
40
+ end
27
41
  end
28
42
 
29
43
  private
30
44
 
45
+ def load_services
46
+ @port = Utils::PortDiscovery.find_available_port
47
+ start_daemon
48
+ super
49
+ end
50
+
51
+ def reset_services
52
+ restart
53
+ end
54
+
55
+ def unload_services
56
+ super
57
+ stop_daemon
58
+ @port = nil
59
+ end
60
+
31
61
  def build_http_client
32
62
  HttpClient.new proxy_address
33
63
  end
34
64
 
35
- def restart_with_options(_options)
36
- stop_daemon
37
- @runner = CrabtrapRunner.new Crabfarm.config.crabtrap_config.merge(_options)
38
- @runner.start
65
+ def start_daemon
66
+ if @runner.nil?
67
+ options = {
68
+ mode: @mode,
69
+ bucket_path: @path,
70
+ port: @port
71
+ }
72
+
73
+ @runner = CrabtrapRunner.new Crabfarm.config.crabtrap_config.merge(options)
74
+ @runner.start
75
+ end
39
76
  end
40
77
 
41
78
  def stop_daemon
42
- @runner.stop unless @runner.nil?
79
+ unless @runner.nil?
80
+ @runner.stop
81
+ @runner = nil
82
+ else nil end
43
83
  end
44
84
 
45
85
  def driver_config
46
- if @runner.is_running? then super.merge(proxy: proxy_address) else super end
86
+ super.merge(proxy: proxy_address)
47
87
  end
48
88
 
49
89
  def phantom_config
50
- if @runner.is_running? then super.merge(proxy: proxy_address) else super end
90
+ super.merge(proxy: proxy_address)
51
91
  end
52
92
 
53
93
  def proxy_address
54
- "127.0.0.1:#{@runner.port}"
94
+ "127.0.0.1:#{@port}"
55
95
  end
56
96
 
57
97
  end
@@ -1,6 +1,10 @@
1
+ require 'timeout'
2
+
1
3
  module Crabfarm
2
4
  class CrabtrapRunner
3
5
 
6
+ CRABTRAP_START_TM = 5 # seconds
7
+
4
8
  def initialize(_config={})
5
9
  @config = _config;
6
10
  @pid = nil
@@ -11,7 +15,7 @@ module Crabfarm
11
15
  end
12
16
 
13
17
  def port
14
- @config[:port] # TODO: maybe select port dynamically...
18
+ @config[:port]
15
19
  end
16
20
 
17
21
  def mode
@@ -19,25 +23,39 @@ module Crabfarm
19
23
  end
20
24
 
21
25
  def start
22
- begin
23
- @pid = Process.spawn({}, crabtrap_cmd)
24
- wait_for_server
25
- rescue
26
- puts "Could not find crabtrap at #{@config[:bin_path]}, memento replaying is disabled!"
27
- @pid = nil
28
- end
26
+ logger.info "Starting crabtrap in port #{port}"
27
+ @pid = spawn_crabtrap
28
+ logger.info "Crabtrap started (PID: #{@pid})"
29
29
  end
30
30
 
31
31
  def stop
32
32
  unless @pid.nil?
33
+ logger.info "Stopping crabtrap (PID: #{@pid})"
33
34
  Process.kill("INT", @pid)
34
35
  Process.wait @pid
36
+ logger.info "Crabtrap stopped"
35
37
  @pid = nil
36
38
  end
37
39
  end
38
40
 
39
41
  private
40
42
 
43
+ def spawn_crabtrap
44
+ pid = nil
45
+ begin
46
+ pid = Process.spawn({}, crabtrap_cmd)
47
+ Timeout::timeout(CRABTRAP_START_TM) { wait_for_server }
48
+ return pid
49
+ rescue Errno::ENOENT
50
+ raise BinaryMissingError.new 'crabtrap', @config[:bin_path]
51
+ rescue Timeout::Error
52
+ Process.kill "INT", pid
53
+ Process.wait pid
54
+ raise
55
+ end
56
+ pid
57
+ end
58
+
41
59
  def crabtrap_cmd
42
60
  cmd = [@config[:bin_path]]
43
61
  cmd << mode.to_s
@@ -57,5 +75,9 @@ module Crabfarm
57
75
  end
58
76
  end
59
77
 
78
+ def logger
79
+ Crabfarm.logger
80
+ end
81
+
60
82
  end
61
83
  end
@@ -3,11 +3,12 @@ require 'ostruct'
3
3
 
4
4
  module Crabfarm
5
5
  module Engines
6
- class SafeStateLoop
6
+ class AsyncStateManager
7
7
 
8
8
  class LoopAbortedException < StandardError; end
9
9
 
10
- def initialize
10
+ def initialize(_context)
11
+ @context = _context
11
12
  @working = false
12
13
  @fatal = nil
13
14
  @lock = Mutex.new
@@ -37,7 +38,7 @@ module Crabfarm
37
38
  start
38
39
  end
39
40
 
40
- def change_state(_name, _params={}, _wait=nil)
41
+ def transition(_name, _params={}, _wait=nil)
41
42
  @lock.synchronize {
42
43
  if @fatal
43
44
  raise CrawlerError.new @fatal
@@ -100,25 +101,24 @@ module Crabfarm
100
101
  end
101
102
 
102
103
  def crawl_loop
103
- context = Crabfarm::Context.new
104
-
105
104
  begin
106
105
  loop do
107
106
  if @working
108
- @elapsed = Benchmark.measure do
109
- begin
107
+ begin
108
+ logger.info "Transitioning state: #{@next_state_name}"
109
+ @elapsed = Benchmark.measure do
110
110
  ActiveSupport::Dependencies.clear
111
- logger.info "StateLoop: loading state: #{@next_state_name}"
112
- @doc = TransitionService.apply_state(context, @next_state_name, @next_state_params).output_as_json
113
- logger.info "StateLoop: state loaded successfully: #{@next_state_name}"
114
- @error = nil
115
- rescue Exception => e
116
- logger.error "StateLoop: error while loading state: #{@next_state_name}"
117
- logger.error e
118
- @doc = nil
119
- @error = e
120
- end
121
- end.real
111
+ @doc = TransitionService.apply_state(@context, @next_state_name, @next_state_params).output_as_json
112
+ end.real
113
+
114
+ logger.info "Transitioned in #{@elapsed.real}"
115
+ @error = nil
116
+ rescue Exception => e
117
+ logger.error "Error during transition:"
118
+ logger.error e
119
+ @doc = nil
120
+ @error = e
121
+ end
122
122
 
123
123
  @lock.synchronize {
124
124
  @state_name = @next_state_name
@@ -128,17 +128,15 @@ module Crabfarm
128
128
  else sleep 0.2 end
129
129
  end
130
130
  rescue LoopAbortedException
131
- logger.info "StateLoop: stopping"
131
+ logger.info "Manager stopping"
132
132
 
133
133
  rescue Exception => e
134
- logger.fatal "StateLoop: unhandled exception!"
134
+ logger.fatal "Unhandled exception:"
135
135
  logger.fatal e
136
136
 
137
137
  @lock.synchronize {
138
138
  @fatal = e
139
139
  }
140
- ensure
141
- context.release
142
140
  end
143
141
  end
144
142
 
@@ -0,0 +1,40 @@
1
+ require 'benchmark'
2
+ require 'ostruct'
3
+
4
module Crabfarm
  module Engines
    # Applies state transitions against a single context, one at a time.
    # Every public operation runs while holding the manager's mutex.
    class SyncStateManager

      # _context - the context every transition is applied to.
      def initialize(_context)
        @lock = Mutex.new
        @context = _context
      end

      # Clears ActiveSupport's loaded dependencies and then resets the context.
      def reload!
        @lock.synchronize do
          ActiveSupport::Dependencies.clear
          @context.reset
        end
      end

      # Resets the context.
      def reset
        @lock.synchronize { @context.reset }
      end

      # Applies the named state through TransitionService and measures it.
      #
      # Returns an OpenStruct with :name, :params, :doc (the state's
      # output_as_json) and :elapsed (the Benchmark.measure result).
      def transition(_name, _params={})
        @lock.synchronize do
          document = nil
          timing = Benchmark.measure do
            document = TransitionService.apply_state(@context, _name, _params).output_as_json
          end

          OpenStruct.new(name: _name, params: _params, doc: document, elapsed: timing)
        end
      end

    end
  end
end
@@ -4,18 +4,25 @@ module Crabfarm
4
4
 
5
5
  class ConfigurationError < Error; end
6
6
 
7
- class AssertionError < Error; end
7
+ class BinaryMissingError < ConfigurationError
8
8
 
9
- class EntityNotFoundError < Error
10
- attr_accessor :role, :name
9
+ attr_accessor :binary
10
+ attr_accessor :path
11
11
 
12
- def initialize(_role, _name)
13
- super("The required #{_role} was not found (#{_name})")
14
- @role = _role
15
- @name = _name
12
+ def initialize(_binary, _path)
13
+ @binary = _binary
14
+ @path = _path
15
+ super "Could not find a suitable version of #{@binary}"
16
16
  end
17
+
17
18
  end
18
19
 
20
+ class AssertionError < Error; end
21
+
22
+ class ArgumentError < Error; end
23
+
24
+ class ResourceNotFoundError < Crabfarm::Error; end
25
+
19
26
  class ApiError < Error
20
27
  def code; 500 end
21
28
  def to_json; {} end
@@ -0,0 +1,22 @@
1
module Crabfarm
  # Helpers for locating the current crawler application, based on the
  # CF_PATH constant (defined outside of this module).
  #
  # All methods are also callable on the module itself (see `extend self`).
  module GlobalState

    # Returns true when CF_PATH is defined, false otherwise.
    def inside_crawler_app?
      # defined? yields a description string or nil; coerce to a strict boolean.
      !!defined?(CF_PATH)
    end

    # Returns the value of CF_PATH; raises NameError when it is not defined.
    def app_path
      CF_PATH
    end

    # Returns the path of the named memento inside the application's
    # spec/mementos directory, using the .json.gz extension.
    #
    # Returns nil when _name is nil, so callers that validate the resulting
    # target can report a missing name instead of crashing on nil here.
    def memento_path(_name)
      return nil if _name.nil?
      # Interpolation (rather than String#+) also accepts Symbol names.
      File.join(app_path, 'spec/mementos', "#{_name}.json.gz")
    end

    # Returns the path of the given file inside the application's
    # spec/snapshots directory.
    def snapshot_path(_file)
      File.join(app_path, 'spec/snapshots', _file)
    end

    extend self
  end
end
@@ -1,28 +1,23 @@
1
- require 'benchmark'
2
1
  require 'readline'
3
2
  require 'rainbow'
4
3
  require 'rainbow/ext/string'
5
4
  require 'json'
5
+ require 'crabfarm/engines/sync_state_loop'
6
6
 
7
7
  module Crabfarm
8
8
  module Modes
9
9
  class Console
10
10
 
11
- class ConsoleDsl
12
-
13
- def initialize(_context)
14
- @context = _context
15
- end
11
+ class ConsoleDsl < Engines::SyncStateManager
16
12
 
17
13
  def reload!
18
14
  puts "Reloading crawler source".color(:green)
19
- ActiveSupport::Dependencies.clear
20
- @context.reset
15
+ super
21
16
  end
22
17
 
23
18
  def reset
24
19
  puts "Resetting crawling context".color(:green)
25
- @context.reset
20
+ super
26
21
  end
27
22
 
28
23
  def transition(_name=nil, _params={})
@@ -32,17 +27,14 @@ module Crabfarm
32
27
  end
33
28
 
34
29
  begin
35
- elapsed = Benchmark.measure do
36
- puts "Transitioning to #{_name.to_s.camelize} state"
37
- doc = TransitionService.apply_state(@context, _name, _params).output_as_json
30
+ puts "Transitioning to #{_name.to_s.camelize} state"
31
+ output = super
38
32
 
39
- puts "State changed, generated document:"
40
- puts JSON.pretty_generate(doc).color(:green).gsub(/(^|\\n)/, ' ')
41
- end
42
- puts "Completed in #{elapsed.real} s"
43
- rescue EntityNotFoundError => e
44
- puts "#{e.to_s}".color(:red)
45
- rescue => e
33
+ puts "State changed, generated document:"
34
+ puts JSON.pretty_generate(output.doc).color(:green).gsub(/(^|\\n)/, ' ')
35
+ puts "Completed in #{output.elapsed.real} s"
36
+
37
+ rescue Exception => e
46
38
  puts "#{e.to_s}".color(:red)
47
39
  puts e.backtrace
48
40
  end
@@ -56,7 +48,7 @@ module Crabfarm
56
48
  alias :r :reset
57
49
  end
58
50
 
59
- def self.start(_context)
51
+ def self.process_input(_context)
60
52
  dsl = ConsoleDsl.new _context
61
53
 
62
54
  loop do
@@ -71,8 +63,7 @@ module Crabfarm
71
63
  end
72
64
  end
73
65
 
74
- puts "Releasing crawling context".color(:green)
75
- _context.release
66
+ puts "Exiting".color(:green)
76
67
  end
77
68
 
78
69
  end
@@ -9,7 +9,7 @@ module Crabfarm
9
9
  module Generator
10
10
 
11
11
  def generate_app(_target, _name, _default_remote=nil)
12
- with_external_path _target do
12
+ with_base_path _target do
13
13
  binding = {
14
14
  name: _name,
15
15
  remote: _default_remote,
@@ -35,38 +35,29 @@ module Crabfarm
35
35
  end
36
36
  end
37
37
 
38
- def generate_state(_name)
38
+ def generate_state(_target, _name)
39
39
  class_name = _name.camelize
40
- with_crawler_path do
40
+ with_base_path _target do
41
41
  binding = { state_class: class_name.camelize }
42
42
  path('app', 'states', class_name.underscore + '.rb').render('state.rb', binding)
43
43
  path('spec', 'states', class_name.underscore + '_spec.rb').render('state_spec.rb', binding)
44
44
  end
45
45
  end
46
46
 
47
- def generate_parser(_name)
47
+ def generate_parser(_target, _name)
48
48
  class_name = _name.camelize + 'Parser'
49
- with_crawler_path do
49
+ with_base_path _target do
50
50
  binding = { parser_class: class_name }
51
51
  path('app', 'parsers', class_name.underscore + '.rb').render('parser.rb', binding)
52
52
  path('spec', 'parsers', class_name.underscore + '_spec.rb').render('parser_spec.rb', binding)
53
53
  end
54
54
  end
55
55
 
56
- def with_external_path(_target)
56
+ def with_base_path(_target)
57
57
  @base_path = _target
58
58
  yield
59
59
  end
60
60
 
61
- def with_crawler_path
62
- if defined? CF_PATH
63
- @base_path = CF_PATH
64
- yield
65
- else
66
- puts "This command can only be run inside a crabfarm application"
67
- end
68
- end
69
-
70
61
  def path(*_args)
71
62
  @path = _args
72
63
  self
@@ -14,7 +14,7 @@ module Crabfarm
14
14
  module Publisher
15
15
  extend self
16
16
 
17
- DEFAULT_HOST = 'http://www.crabfarm.io'
17
+ DEFAULT_HOST = 'http://api.crabfarm.io'
18
18
 
19
19
  def publish(_path, _options={})
20
20
 
@@ -4,37 +4,42 @@ require 'crabfarm/crabtrap_runner'
4
4
 
5
5
  module Crabfarm
6
6
  module Modes
7
- class Recorder
7
+ module Recorder
8
8
 
9
- def self.start(_target)
10
- return puts "Must provide a recording name" unless _target.is_a? String
9
+ def self.start(_target, _replay=false)
10
+ return puts "Must provide a recording target" unless _target.is_a? String
11
+ return puts "Memento file does not exist: #{_target}" if _replay and not File.exist? _target
11
12
 
12
13
  crabtrap_config = Crabfarm.config.crabtrap_config
13
- crabtrap_config[:mode] = :capture
14
- crabtrap_config[:bucket_path] = File.join(CF_PATH, 'spec/mementos', _target + '.json.gz')
14
+ crabtrap_config[:mode] = _replay ? :replay : :capture
15
+ crabtrap_config[:port] = Utils::PortDiscovery.find_available_port
16
+ crabtrap_config[:bucket_path] = _target
15
17
 
16
18
  crabtrap = CrabtrapRunner.new crabtrap_config
17
19
  crabtrap.start
18
20
 
19
- driver_config = Crabfarm.config.driver_config
20
- driver_config[:name] = :firefox
21
- driver_config[:proxy] = "127.0.0.1:#{crabtrap.port}"
22
-
23
- driver = DefaultDriverFactory.new(driver_config).build_driver nil
24
-
25
21
  begin
26
- puts "Press Ctrl-C to stop capturing."
27
- loop do
28
- driver.current_url
29
- sleep 1.0
22
+ driver_config = Crabfarm.config.driver_config
23
+ driver_config[:name] = Crabfarm.config.recorder_driver
24
+ driver_config[:proxy] = "127.0.0.1:#{crabtrap.port}"
25
+
26
+ driver = DefaultDriverFactory.new(driver_config).build_driver nil
27
+
28
+ begin
29
+ puts "Press Ctrl-C to stop capturing."
30
+ loop do
31
+ driver.current_url
32
+ sleep 1.0
33
+ end
34
+ rescue Selenium::WebDriver::Error::WebDriverError, SystemExit, Interrupt
35
+ # noop
30
36
  end
31
- rescue Selenium::WebDriver::Error::WebDriverError, SystemExit, Interrupt
32
- # noop
33
- end
34
37
 
35
- puts "Releasing crawling context".color(:green)
36
- driver.quit rescue nil
37
- crabtrap.stop
38
+ puts "Releasing crawling context".color(:green)
39
+ driver.quit rescue nil
40
+ ensure
41
+ crabtrap.stop
42
+ end
38
43
  end
39
44
 
40
45
  end
@@ -1,6 +1,6 @@
1
1
  require 'grape'
2
2
  require 'crabfarm/support/custom_puma'
3
- require 'crabfarm/engines/safe_state_loop'
3
+ require 'crabfarm/engines/async_state_manager'
4
4
 
5
5
  module Crabfarm
6
6
  module Modes
@@ -54,7 +54,7 @@ module Crabfarm
54
54
  optional :wait, type: Float
55
55
  end
56
56
  put :state do
57
- print_state evaluator.change_state params[:name], params[:params], wait
57
+ print_state evaluator.transition params[:name], params[:params], wait
58
58
  end
59
59
  end
60
60
 
@@ -62,8 +62,8 @@ module Crabfarm
62
62
  @@evaluator
63
63
  end
64
64
 
65
- def self.start(_options)
66
- @@evaluator = Engines::SafeStateLoop.new
65
+ def self.serve(_context, _options)
66
+ @@evaluator = Engines::AsyncStateManager.new _context
67
67
  @@evaluator.start
68
68
  begin
69
69
  Support::CustomPuma.run API, _options
@@ -5,16 +5,17 @@ module Crabfarm
5
5
 
6
6
  PHANTOM_START_TM = 5 # seconds
7
7
 
8
- attr_reader :port
9
-
10
8
  def initialize(_config={})
11
9
  @config = _config;
12
10
  @pid = nil
13
11
  end
14
12
 
13
+ def port
14
+ @config[:port]
15
+ end
16
+
15
17
  def start
16
- find_available_port
17
- Crabfarm.logger.info "Starting phantomjs in port #{@port}"
18
+ Crabfarm.logger.info "Starting phantomjs in port #{port}"
18
19
  @pid = spawn_phantomjs
19
20
  Crabfarm.logger.info "Phantomjs started (PID: #{@pid})"
20
21
  end
@@ -32,22 +33,25 @@ module Crabfarm
32
33
  private
33
34
 
34
35
  def spawn_phantomjs
35
- pid = Process.spawn({}, phantomjs_cmd)
36
+ pid = nil
36
37
  begin
38
+ pid = Process.spawn({}, phantomjs_cmd)
37
39
  Timeout::timeout(PHANTOM_START_TM) { wait_for_server }
40
+ return pid
41
+ rescue Errno::ENOENT
42
+ raise BinaryMissingError.new 'phantomjs', @config[:bin_path]
38
43
  rescue Timeout::Error
39
44
  Process.kill "INT", pid
40
45
  Process.wait pid
41
46
  raise
42
47
  end
43
- return pid
44
48
  end
45
49
 
46
50
  def phantomjs_cmd
47
51
  cmd = [@config[:bin_path]]
48
52
  cmd << '--load-images=false' unless @config[:load_images]
49
53
  cmd << "--proxy=#{@config[:proxy]}" unless @config[:proxy].nil?
50
- cmd << "--webdriver=#{@port}"
54
+ cmd << "--webdriver=#{port}"
51
55
  cmd << "--ssl-protocol=#{@config[:ssl]}" unless @config[:ssl].nil?
52
56
  cmd << "--ignore-ssl-errors=true"
53
57
  cmd << "--webdriver-loglevel=WARN"
@@ -55,37 +59,15 @@ module Crabfarm
55
59
  cmd.join(' ')
56
60
  end
57
61
 
58
- def find_available_port
59
- with_lock do
60
- socket = Socket.new(:INET, :STREAM, 0)
61
- socket.bind(Addrinfo.tcp("127.0.0.1", 0))
62
- @port = socket.local_address.ip_port
63
- socket.close
64
- end
65
- end
66
-
67
62
  def wait_for_server
68
63
  loop do
69
64
  begin
70
- Net::HTTP.get_response(URI.parse("http://127.0.0.1:#{@port}/status"))
65
+ Net::HTTP.get_response(URI.parse("http://127.0.0.1:#{port}/status"))
71
66
  break
72
67
  rescue
73
68
  end
74
69
  end
75
70
  end
76
71
 
77
- def with_lock
78
- return yield if @config[:lock_file].nil?
79
-
80
- File.open(@config[:lock_file], 'a+') do |file|
81
- begin
82
- file.flock File::LOCK_EX
83
- return yield
84
- ensure
85
- file.flock File::LOCK_UN
86
- end
87
- end
88
- end
89
-
90
72
  end
91
73
  end
@@ -1,26 +1,22 @@
1
- require 'crabfarm/crabtrap_context'
2
- require 'net/http'
3
-
4
- CF_TEST_CONTEXT = Crabfarm::CrabtrapContext::new
5
- CF_TEST_CONTEXT.load
6
-
7
1
  module Crabfarm
8
2
  module RSpec
9
3
 
10
- def parse(_snap_or_url, _options={})
11
- fixture = Pathname.new(File.join(ENV['SNAPSHOT_DIR'], _snap_or_url))
12
- html = if fixture.exist?
13
- File.read fixture.realpath
14
- else
15
- Net::HTTP.get(URI.parse _snap_or_url)
16
- end
4
+ class Error < Crabfarm::Error; end
17
5
 
6
+ def parse(_snapshot, _options={})
7
+ snapshot_path = GlobalState.snapshot_path _snapshot
8
+ raise Error.new "Snapshot does not exist #{_snapshot}" unless File.exist? snapshot_path
9
+
10
+ html = File.read snapshot_path
18
11
  parser = described_class.new html, _options
19
12
  parser.parse
20
13
  parser
21
14
  end
22
15
 
23
16
  def crawl(_state=nil, _params={})
17
+
18
+ raise Error.new "Crawl is only available in state specs" if @context.nil?
19
+
24
20
  if _state.is_a? Hash
25
21
  _params = _state
26
22
  _state = nil
@@ -28,9 +24,9 @@ module Crabfarm
28
24
 
29
25
  if _state.nil?
30
26
  return nil unless described_class < BaseState # TODO: maybe raise an error here.
31
- @state = @last_state = TransitionService.apply_state CF_TEST_CONTEXT, described_class, _params
27
+ @state = @last_state = TransitionService.apply_state @context, described_class, _params
32
28
  else
33
- @last_state = TransitionService.apply_state CF_TEST_CONTEXT, _state, _params
29
+ @last_state = TransitionService.apply_state @context, _state, _params
34
30
  end
35
31
  end
36
32
 
@@ -46,26 +42,30 @@ module Crabfarm
46
42
  @parser
47
43
  end
48
44
 
45
+ def driver(_session_id=nil)
46
+ @context.pool.driver(_session_id)
47
+ end
48
+
49
49
  end
50
50
  end
51
51
 
52
52
  RSpec.configure do |config|
53
53
  config.include Crabfarm::RSpec
54
54
 
55
- config.before(:example) do |example|
56
-
57
- if example.metadata[:parsing]
58
- @parser = parse example.metadata[:parsing], example.metadata[:using] || {}
59
- end
60
-
61
- if example.metadata[:crawling]
62
- CF_TEST_CONTEXT.replay File.join(CF_PATH, 'spec/mementos', example.metadata[:crawling] + '.json.gz')
55
+ config.around(:example) do |example|
56
+ if described_class < Crabfarm::BaseParser
57
+ if example.metadata[:parsing]
58
+ @parser = parse example.metadata[:parsing], example.metadata[:using] || {}
59
+ end
60
+ example.run
61
+ elsif described_class < Crabfarm::BaseState
62
+ Crabfarm::ContextFactory.with_context example.metadata[:crawling] do |ctx|
63
+ @context = ctx
64
+ example.run
65
+ end
63
66
  else
64
- CF_TEST_CONTEXT.pass_through
67
+ example.run
65
68
  end
66
69
  end
67
70
 
68
- config.after(:suite) do
69
- CF_TEST_CONTEXT.release
70
- end
71
71
  end
@@ -65,10 +65,13 @@ set_driver :phantomjs
65
65
  # set_driver_remote_timeout 120
66
66
 
67
67
 
68
- # Crabtrap launcher configuration
68
+ # Recording configuration
69
69
  ########################################
70
70
 
71
- # The following parameters only apply when running the crabtrap proxy on specs
71
+ # The following parameters only apply when recording or replaying mementos
72
72
 
73
73
  # Set the crabtrap executable location, by default crabfarm expects crabtrap to be included in enviroment
74
74
  # set_crabtrap_bin_path 'crabtrap'
75
+
76
+ # Change the browser used for the recording command, available options are 'firefox' or 'chrome'
77
+ # set_recorder_driver :firefox
@@ -1,8 +1,6 @@
1
1
  require File.expand_path("../../boot", __FILE__)
2
2
  Bundler.require :test
3
3
 
4
- ENV['SNAPSHOT_DIR'] ||= File.expand_path("../snapshots", __FILE__)
5
-
6
4
  require "crabfarm/rspec"
7
5
 
8
6
  RSpec.configure do |config|
@@ -6,7 +6,7 @@ module Crabfarm
6
6
  load_by_name _name
7
7
  else _name end
8
8
 
9
- _context.load
9
+ _context.prepare
10
10
  state = state_class.new _context, _params
11
11
  state.crawl
12
12
  state
@@ -0,0 +1,17 @@
1
+ module Crabfarm
2
+ module Utils
3
+ module PortDiscovery
4
+
5
+ def self.find_available_port
6
+ begin
7
+ socket = Socket.new(:INET, :STREAM, 0)
8
+ socket.bind(Addrinfo.tcp("127.0.0.1", 0))
9
+ return socket.local_address.ip_port
10
+ ensure
11
+ socket.close rescue nil
12
+ end
13
+ end
14
+
15
+ end
16
+ end
17
+ end
@@ -1,3 +1,3 @@
1
1
  module Crabfarm
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crabfarm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ignacio Baixas
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-24 00:00:00.000000000 Z
11
+ date: 2015-03-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: selenium-webdriver
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - ~>
18
18
  - !ruby/object:Gem::Version
19
- version: '2.33'
19
+ version: '2.45'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ~>
25
25
  - !ruby/object:Gem::Version
26
- version: '2.33'
26
+ version: '2.45'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -377,6 +377,7 @@ files:
377
377
  - lib/crabfarm/cli.rb
378
378
  - lib/crabfarm/configuration.rb
379
379
  - lib/crabfarm/context.rb
380
+ - lib/crabfarm/context_factory.rb
380
381
  - lib/crabfarm/crabtrap_context.rb
381
382
  - lib/crabfarm/crabtrap_runner.rb
382
383
  - lib/crabfarm/default_driver_factory.rb
@@ -385,10 +386,11 @@ files:
385
386
  - lib/crabfarm/dsl/surfer/search_context.rb
386
387
  - lib/crabfarm/dsl/surfer/surf_context.rb
387
388
  - lib/crabfarm/dsl/surfer.rb
388
- - lib/crabfarm/engines/safe_state_loop.rb
389
+ - lib/crabfarm/engines/async_state_manager.rb
390
+ - lib/crabfarm/engines/sync_state_manager.rb
389
391
  - lib/crabfarm/errors.rb
390
- - lib/crabfarm/event_store.rb
391
392
  - lib/crabfarm/forked_state.rb
393
+ - lib/crabfarm/global_state.rb
392
394
  - lib/crabfarm/http_client.rb
393
395
  - lib/crabfarm/mocks/noop_driver.rb
394
396
  - lib/crabfarm/modes/console.rb
@@ -417,6 +419,7 @@ files:
417
419
  - lib/crabfarm/templates/state.rb.erb
418
420
  - lib/crabfarm/templates/state_spec.rb.erb
419
421
  - lib/crabfarm/transition_service.rb
422
+ - lib/crabfarm/utils/port_discovery.rb
420
423
  - lib/crabfarm/version.rb
421
424
  - lib/crabfarm.rb
422
425
  - bin/crabfarm
@@ -1,20 +0,0 @@
1
- module Crabfarm
2
- class EventStore
3
-
4
- def initialize
5
- @events = []
6
- @mutex = Mutex.new
7
- end
8
-
9
- def event(_category, _message)
10
- @mutex.synchronize do
11
- @events << {
12
- created_at: Time.current,
13
- category: _category,
14
- msg: _message
15
- }
16
- end
17
- end
18
-
19
- end
20
- end