crabfarm 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 49343d990ddd94c9640f37040a6fe5fc9a97ab49
4
- data.tar.gz: f75e89f9dd4b5154f01df31837b98981fed38a3c
3
+ metadata.gz: c10577d223677baa807f27aa55fc8f2ccf19f189
4
+ data.tar.gz: 7d07015295fe0325a85d73396821872e0e3542d3
5
5
  SHA512:
6
- metadata.gz: f841a5d9a50476e38b7a1252b3de75fc132ffcbc4ecb7bd25f49cbfad976eab75f825f47c5ca1afa9c41c53f4713baeb2da8578ed233b2be40897b25f859348f
7
- data.tar.gz: c183db189f4968561409c0eea25a54a7ad4fb015b7d8cd3abcceefe3edc7790392626b5944b4bfa5d32b06bf074f357b00867762fb8ee3e1a97889cb0618e778
6
+ metadata.gz: 45015ef058a82381fc1db17791bb99c362c37c8bf0cf24e29d0862c76c44f006c47e60f6d2689b545be7ec564ce05857565ca7bc00d43f592cba0f58a58f658e
7
+ data.tar.gz: 83dc7148c8ed7b793290b0c197ad6ad10b1853d196d5c650a471cd4a26531d424177a303d82a25a4780a806d0b075683234318bef9f10f0ed1283f03f807d4df
data/lib/crabfarm.rb CHANGED
@@ -7,7 +7,7 @@ require "selenium-webdriver"
7
7
  require "crabfarm/version"
8
8
  require "crabfarm/errors"
9
9
  require "crabfarm/configuration"
10
- require "crabfarm/transition_service"
10
+ require "crabfarm/global_state"
11
11
  require "crabfarm/driver_bucket"
12
12
  require "crabfarm/driver_bucket_pool"
13
13
  require "crabfarm/http_client"
@@ -16,10 +16,14 @@ require "crabfarm/phantom_driver_factory"
16
16
  require "crabfarm/phantom_runner"
17
17
  require "crabfarm/state_store"
18
18
  require "crabfarm/context"
19
+ require "crabfarm/context_factory"
20
+ require "crabfarm/transition_service"
19
21
  require "crabfarm/base_state"
20
22
  require "crabfarm/base_parser"
21
23
  require "crabfarm/strategies"
22
24
 
25
+ require "crabfarm/utils/port_discovery"
26
+
23
27
  module Crabfarm
24
28
 
25
29
  @@config = Configuration.new
data/lib/crabfarm/cli.rb CHANGED
@@ -10,28 +10,20 @@ module Crabfarm
10
10
  desc "Starts the crawler in console mode"
11
11
  command [:console, :c] do |c|
12
12
 
13
- c.desc "Capture to crabtrap file"
14
- c.flag :capture
15
-
16
- c.desc "Replay from crabtrap file"
17
- c.flag :replay
13
+ c.desc "Use a recorded memento as data source, requires crabtrap"
14
+ c.flag [:m, :memento]
18
15
 
19
16
  Support::GLI.generate_options c
20
17
 
21
18
  c.action do |global_options,options,args|
22
- next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
19
+ next puts "This command can only be run inside a crabfarm application" unless GlobalState.inside_crawler_app?
23
20
 
24
21
  Crabfarm.config.set Support::GLI.parse_options options
25
22
 
26
- next puts "Cannot use --replay with --capture" if options[:capture] and options[:replay]
27
-
28
- require 'crabfarm/crabtrap_context'
29
- context = Crabfarm::CrabtrapContext.new
30
- context.capture options[:capture] if options[:capture]
31
- context.replay options[:replay] if options[:replay]
32
-
33
- require "crabfarm/modes/console"
34
- Crabfarm::Modes::Console.start context
23
+ ContextFactory.with_context options[:memento] do |context|
24
+ require "crabfarm/modes/console"
25
+ Crabfarm::Modes::Console.process_input context
26
+ end
35
27
  end
36
28
  end
37
29
 
@@ -46,6 +38,9 @@ module Crabfarm
46
38
  c.desc "Set the server min and max threads, defaults to 0:16"
47
39
  c.flag [:t,:threads]
48
40
 
41
+ c.desc "Use a recorded memento as data source, requires crabtrap"
42
+ c.flag [:m, :memento]
43
+
49
44
  c.desc "Start the server in verbose mode"
50
45
  c.switch :verbose, :default_value => false
51
46
 
@@ -55,18 +50,22 @@ module Crabfarm
55
50
  Support::GLI.generate_options c
56
51
 
57
52
  c.action do |global_options,options,args|
58
- next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
53
+ next puts "This command can only be run inside a crabfarm application" unless GlobalState.inside_crawler_app?
54
+
55
+ Crabfarm.config.set Support::GLI.parse_options options
56
+
57
+ ActiveSupport::Dependencies.mechanism = :require unless options[:reload]
59
58
 
60
- require "crabfarm/modes/server"
61
59
  server_options = {}
62
60
  server_options[:Host] = options[:host] unless options[:host].nil?
63
61
  server_options[:Port] = options[:port] || 3100
64
62
  server_options[:Threads] = options[:threads] unless options[:threads].nil?
65
63
  server_options[:Verbose] = options[:verbose]
66
64
 
67
- ActiveSupport::Dependencies.mechanism = :require unless options[:reload]
68
- Crabfarm.config.set Support::GLI.parse_options options
69
- Crabfarm::Modes::Server.start server_options
65
+ ContextFactory.with_context options[:memento] do |context|
66
+ require "crabfarm/modes/server"
67
+ Crabfarm::Modes::Server.serve context, server_options
68
+ end
70
69
  end
71
70
  end
72
71
 
@@ -79,6 +78,8 @@ module Crabfarm
79
78
  app.flag [:r, :remote]
80
79
 
81
80
  app.action do |global_options,options,args|
81
+ next puts "This command cannot be run inside a crabfarm application" if GlobalState.inside_crawler_app?
82
+
82
83
  require "crabfarm/modes/generator"
83
84
  Crabfarm::Modes::Generator.generate_app(Dir.pwd, args[0], options[:remote])
84
85
  end
@@ -87,27 +88,34 @@ module Crabfarm
87
88
  c.desc "Generates a new crabfarm parser and parser spec"
88
89
  c.command :parser do |parser|
89
90
  parser.action do |global_options,options,args|
91
+ next puts "This command can only be run inside a crabfarm application" unless GlobalState.inside_crawler_app?
92
+
90
93
  require "crabfarm/modes/generator"
91
- Crabfarm::Modes::Generator.generate_parser(args[0])
94
+ Crabfarm::Modes::Generator.generate_parser(GlobalState.app_path, args[0])
92
95
  end
93
96
  end
94
97
 
95
98
  c.desc "Generates a new crabfarm state and parser spec"
96
99
  c.command :state do |parser|
97
100
  parser.action do |global_options,options,args|
101
+ next puts "This command can only be run inside a crabfarm application" unless GlobalState.inside_crawler_app?
102
+
98
103
  require "crabfarm/modes/generator"
99
- Crabfarm::Modes::Generator.generate_state(args[0])
104
+ Crabfarm::Modes::Generator.generate_state(GlobalState.app_path, args[0])
100
105
  end
101
106
  end
102
107
  end
103
108
 
104
109
  desc "Perform an HTTP recording for use in tests"
105
110
  command [:record, :r] do |c|
111
+ c.desc "Run recorder in playback mode"
112
+ c.switch [:p, :playback], :default_value => false
113
+
106
114
  c.action do |global_options, options, args|
107
- next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
115
+ next puts "This command can only be run inside a crabfarm application" unless GlobalState.inside_crawler_app?
108
116
 
109
117
  require "crabfarm/modes/recorder"
110
- Crabfarm::Modes::Recorder.start args[0]
118
+ Crabfarm::Modes::Recorder.start GlobalState.memento_path(args[0]), options[:playback]
111
119
  end
112
120
  end
113
121
 
@@ -120,12 +128,27 @@ module Crabfarm
120
128
  c.switch :unsafe, :default_value => false
121
129
 
122
130
  c.action do |global_options,options,args|
123
- next puts "This command can only be run inside a crabfarm application" unless defined? CF_PATH
131
+ next puts "This command can only be run inside a crabfarm application" unless GlobalState.inside_crawler_app?
124
132
 
125
133
  options[:remote] = args[0]
126
134
 
127
135
  require "crabfarm/modes/publisher"
128
- Crabfarm::Modes::Publisher.publish CF_PATH, options
136
+ Crabfarm::Modes::Publisher.publish GlobalState.app_path, options
137
+ end
138
+ end
139
+
140
+ on_error do |exc|
141
+ case exc
142
+ when BinaryMissingError
143
+ if exc.binary == 'phantomjs'
144
+ puts "Could not find the phantomjs binary at '#{exc.path}', try installing it using 'npm install phantomjs -g' or set the propper path in your project's Crabfile"
145
+ false
146
+ elsif exc.binary == 'crabtrap'
147
+ puts "Could not find the crabtrap binary at '#{exc.path}', try installing it using 'npm install crabtrap -g' or set the propper path in your project's Crabfile"
148
+ false
149
+ else true end
150
+ else
151
+ true
129
152
  end
130
153
  end
131
154
 
@@ -25,12 +25,12 @@ module Crabfarm
25
25
  [:phantom_load_images, :boolean, 'Phantomjs image loading, only for phantomjs driver.'],
26
26
  [:phantom_ssl, ['sslv3', 'sslv2', 'tlsv1', 'any'], 'Phantomjs ssl mode: sslv3, sslv2, tlsv1 or any, only for phantomjs driver.'],
27
27
  [:phantom_bin_path, :string, 'Phantomjs binary path, only for phantomjs driver.'],
28
- [:phantom_lock_file, :string, 'Phantomjs lock file path, only for phantomjs driver.'],
29
28
 
30
29
  # Crabtrap launcher configuration
31
30
  [:crabtrap_bin_path, :string, 'Crabtrap binary path.'],
32
- [:crabtrap_port, :integer, 'Crabtrap port, defaults to 4000.'],
33
- [:crabtrap_mode, ['capture', 'replay'], 'Crabtrap operation mode.']
31
+
32
+ # Recorder configuration
33
+ [:recorder_driver, :string, 'Recorder driver name, defaults to \'firefox\'']
34
34
  ]
35
35
  .map { |o| Option.new *o }
36
36
 
@@ -56,7 +56,6 @@ module Crabfarm
56
56
  driver_factory: nil,
57
57
  log_path: nil,
58
58
  proxy: nil,
59
-
60
59
  driver: 'phantomjs',
61
60
  driver_capabilities: Selenium::WebDriver::Remote::Capabilities.firefox,
62
61
  driver_host: 'localhost',
@@ -64,14 +63,11 @@ module Crabfarm
64
63
  driver_remote_timeout: 120,
65
64
  driver_window_width: 1280,
66
65
  driver_window_height: 800,
67
-
68
66
  phantom_load_images: false,
69
67
  phantom_ssl: 'any',
70
68
  phantom_bin_path: 'phantomjs',
71
- phantom_lock_file: nil,
72
-
73
69
  crabtrap_bin_path: 'crabtrap',
74
- crabtrap_port: 4000
70
+ recorder_driver: :firefox
75
71
  }
76
72
  end
77
73
 
@@ -108,7 +104,6 @@ module Crabfarm
108
104
  proxy: proxy,
109
105
  ssl: phantom_ssl,
110
106
  bin_path: phantom_bin_path,
111
- lock_file: phantom_lock_file,
112
107
  log_file: log_path ? File.join(log_path, 'phantom.log') : nil
113
108
  }
114
109
  end
@@ -116,7 +111,6 @@ module Crabfarm
116
111
  def crabtrap_config
117
112
  {
118
113
  bin_path: crabtrap_bin_path,
119
- port: crabtrap_port,
120
114
  proxy: proxy
121
115
  }
122
116
  end
@@ -11,26 +11,45 @@ module Crabfarm
11
11
  @loaded = false
12
12
  end
13
13
 
14
- def load
14
+ def loaded?
15
+ @loaded
16
+ end
17
+
18
+ def prepare
19
+ unless @loaded
20
+ load_services
21
+ @loaded = true
22
+ end
23
+ end
24
+
25
+ def reset
26
+ reset_services if @loaded
27
+ end
28
+
29
+ def release
30
+ unload_services
31
+ @loaded = false
32
+ end
33
+
34
+ private
35
+
36
+ def load_services
15
37
  init_phantom_if_required
16
38
  init_driver_pool
17
39
  init_http_client
18
- @loaded = true
19
40
  end
20
41
 
21
- def reset
42
+ def reset_services
22
43
  @store.reset
23
- @pool.reset unless @pool.nil?
44
+ @pool.reset
24
45
  end
25
46
 
26
- def release
47
+ def unload_services
27
48
  release_driver_pool
49
+ release_http_client
28
50
  release_phantom
29
- @loaded = false
30
51
  end
31
52
 
32
- private
33
-
34
53
  def init_driver_pool
35
54
  @pool = DriverBucketPool.new build_driver_factory if @pool.nil?
36
55
  end
@@ -47,7 +66,8 @@ module Crabfarm
47
66
  end
48
67
 
49
68
  def load_and_start_phantom
50
- new_phantom = PhantomRunner.new phantom_config
69
+ phantom_port = Utils::PortDiscovery.find_available_port
70
+ new_phantom = PhantomRunner.new phantom_config.merge(port: phantom_port)
51
71
  new_phantom.start
52
72
  return new_phantom
53
73
  end
@@ -0,0 +1,31 @@
1
+ module Crabfarm
2
+ module ContextFactory
3
+
4
+ def with_context(_memento=nil)
5
+ ctx = build_context(_memento)
6
+ begin
7
+ ctx.prepare
8
+ yield ctx
9
+ ensure
10
+ ctx.release
11
+ end
12
+ end
13
+
14
+ def build_context(_memento=nil)
15
+ if _memento.nil?
16
+ Crabfarm::Context.new
17
+ else
18
+ load_crabtrap_context _memento
19
+ end
20
+ end
21
+
22
+ def load_crabtrap_context(_memento)
23
+ require 'crabfarm/crabtrap_context'
24
+ m_path = GlobalState.memento_path _memento
25
+ raise ResourceNotFoundError.new "Could not find memento '#{_name}'" unless File.exists? m_path
26
+ Crabfarm::CrabtrapContext.new :replay, m_path
27
+ end
28
+
29
+ extend self
30
+ end
31
+ end
@@ -4,54 +4,94 @@ require 'crabfarm/crabtrap_runner'
4
4
  module Crabfarm
5
5
  class CrabtrapContext < Context
6
6
 
7
- def load
8
- pass_through if @runner.nil?
9
- super
7
+ attr_accessor :mode
8
+
9
+ def initialize(_mode=:pass, _path=nil)
10
+ @mode = _mode
11
+ @path = _path
10
12
  end
11
13
 
12
14
  def pass_through
13
- restart_with_options(mode: :pass) if @runner.nil? or @runner.mode != :pass
15
+ if not loaded? or @mode != :pass
16
+ @mode = :pass
17
+ @path = nil
18
+ restart
19
+ end
14
20
  end
15
21
 
16
22
  def capture(_path)
17
- restart_with_options(mode: :capture, bucket_path: _path)
23
+ @mode = :capture
24
+ @path = _path
25
+ restart
18
26
  end
19
27
 
20
28
  def replay(_path)
21
- restart_with_options(mode: :replay, bucket_path: _path)
29
+ @mode = :replay
30
+ @path = _path
31
+ restart
22
32
  end
23
33
 
24
- def release
25
- super
26
- stop_daemon
34
+ def restart
35
+ if not loaded?
36
+ prepare
37
+ else
38
+ stop_daemon
39
+ start_daemon
40
+ end
27
41
  end
28
42
 
29
43
  private
30
44
 
45
+ def load_services
46
+ @port = Utils::PortDiscovery.find_available_port
47
+ start_daemon
48
+ super
49
+ end
50
+
51
+ def reset_services
52
+ restart
53
+ end
54
+
55
+ def unload_services
56
+ super
57
+ stop_daemon
58
+ @port = nil
59
+ end
60
+
31
61
  def build_http_client
32
62
  HttpClient.new proxy_address
33
63
  end
34
64
 
35
- def restart_with_options(_options)
36
- stop_daemon
37
- @runner = CrabtrapRunner.new Crabfarm.config.crabtrap_config.merge(_options)
38
- @runner.start
65
+ def start_daemon
66
+ if @runner.nil?
67
+ options = {
68
+ mode: @mode,
69
+ bucket_path: @path,
70
+ port: @port
71
+ }
72
+
73
+ @runner = CrabtrapRunner.new Crabfarm.config.crabtrap_config.merge(options)
74
+ @runner.start
75
+ end
39
76
  end
40
77
 
41
78
  def stop_daemon
42
- @runner.stop unless @runner.nil?
79
+ unless @runner.nil?
80
+ @runner.stop
81
+ @runner = nil
82
+ else nil end
43
83
  end
44
84
 
45
85
  def driver_config
46
- if @runner.is_running? then super.merge(proxy: proxy_address) else super end
86
+ super.merge(proxy: proxy_address)
47
87
  end
48
88
 
49
89
  def phantom_config
50
- if @runner.is_running? then super.merge(proxy: proxy_address) else super end
90
+ super.merge(proxy: proxy_address)
51
91
  end
52
92
 
53
93
  def proxy_address
54
- "127.0.0.1:#{@runner.port}"
94
+ "127.0.0.1:#{@port}"
55
95
  end
56
96
 
57
97
  end
@@ -1,6 +1,10 @@
1
+ require 'timeout'
2
+
1
3
  module Crabfarm
2
4
  class CrabtrapRunner
3
5
 
6
+ CRABTRAP_START_TM = 5 # seconds
7
+
4
8
  def initialize(_config={})
5
9
  @config = _config;
6
10
  @pid = nil
@@ -11,7 +15,7 @@ module Crabfarm
11
15
  end
12
16
 
13
17
  def port
14
- @config[:port] # TODO: maybe select port dynamically...
18
+ @config[:port]
15
19
  end
16
20
 
17
21
  def mode
@@ -19,25 +23,39 @@ module Crabfarm
19
23
  end
20
24
 
21
25
  def start
22
- begin
23
- @pid = Process.spawn({}, crabtrap_cmd)
24
- wait_for_server
25
- rescue
26
- puts "Could not find crabtrap at #{@config[:bin_path]}, memento replaying is disabled!"
27
- @pid = nil
28
- end
26
+ logger.info "Starting crabtrap in port #{port}"
27
+ @pid = spawn_crabtrap
28
+ logger.info "Crabtrap started (PID: #{@pid})"
29
29
  end
30
30
 
31
31
  def stop
32
32
  unless @pid.nil?
33
+ logger.info "Stopping crabtrap (PID: #{@pid})"
33
34
  Process.kill("INT", @pid)
34
35
  Process.wait @pid
36
+ logger.info "Crabtrap stopped"
35
37
  @pid = nil
36
38
  end
37
39
  end
38
40
 
39
41
  private
40
42
 
43
+ def spawn_crabtrap
44
+ pid = nil
45
+ begin
46
+ pid = Process.spawn({}, crabtrap_cmd)
47
+ Timeout::timeout(CRABTRAP_START_TM) { wait_for_server }
48
+ return pid
49
+ rescue Errno::ENOENT
50
+ raise BinaryMissingError.new 'crabtrap', @config[:bin_path]
51
+ rescue Timeout::Error
52
+ Process.kill "INT", pid
53
+ Process.wait pid
54
+ raise
55
+ end
56
+ pid
57
+ end
58
+
41
59
  def crabtrap_cmd
42
60
  cmd = [@config[:bin_path]]
43
61
  cmd << mode.to_s
@@ -57,5 +75,9 @@ module Crabfarm
57
75
  end
58
76
  end
59
77
 
78
+ def logger
79
+ Crabfarm.logger
80
+ end
81
+
60
82
  end
61
83
  end
@@ -3,11 +3,12 @@ require 'ostruct'
3
3
 
4
4
  module Crabfarm
5
5
  module Engines
6
- class SafeStateLoop
6
+ class AsyncStateManager
7
7
 
8
8
  class LoopAbortedException < StandardError; end
9
9
 
10
- def initialize
10
+ def initialize(_context)
11
+ @context = _context
11
12
  @working = false
12
13
  @fatal = nil
13
14
  @lock = Mutex.new
@@ -37,7 +38,7 @@ module Crabfarm
37
38
  start
38
39
  end
39
40
 
40
- def change_state(_name, _params={}, _wait=nil)
41
+ def transition(_name, _params={}, _wait=nil)
41
42
  @lock.synchronize {
42
43
  if @fatal
43
44
  raise CrawlerError.new @fatal
@@ -100,25 +101,24 @@ module Crabfarm
100
101
  end
101
102
 
102
103
  def crawl_loop
103
- context = Crabfarm::Context.new
104
-
105
104
  begin
106
105
  loop do
107
106
  if @working
108
- @elapsed = Benchmark.measure do
109
- begin
107
+ begin
108
+ logger.info "Transitioning state: #{@next_state_name}"
109
+ @elapsed = Benchmark.measure do
110
110
  ActiveSupport::Dependencies.clear
111
- logger.info "StateLoop: loading state: #{@next_state_name}"
112
- @doc = TransitionService.apply_state(context, @next_state_name, @next_state_params).output_as_json
113
- logger.info "StateLoop: state loaded successfully: #{@next_state_name}"
114
- @error = nil
115
- rescue Exception => e
116
- logger.error "StateLoop: error while loading state: #{@next_state_name}"
117
- logger.error e
118
- @doc = nil
119
- @error = e
120
- end
121
- end.real
111
+ @doc = TransitionService.apply_state(@context, @next_state_name, @next_state_params).output_as_json
112
+ end.real
113
+
114
+ logger.info "Transitioned in #{@elapsed.real}"
115
+ @error = nil
116
+ rescue Exception => e
117
+ logger.error "Error during transition:"
118
+ logger.error e
119
+ @doc = nil
120
+ @error = e
121
+ end
122
122
 
123
123
  @lock.synchronize {
124
124
  @state_name = @next_state_name
@@ -128,17 +128,15 @@ module Crabfarm
128
128
  else sleep 0.2 end
129
129
  end
130
130
  rescue LoopAbortedException
131
- logger.info "StateLoop: stopping"
131
+ logger.info "Manager stopping"
132
132
 
133
133
  rescue Exception => e
134
- logger.fatal "StateLoop: unhandled exception!"
134
+ logger.fatal "Unhandled exception:"
135
135
  logger.fatal e
136
136
 
137
137
  @lock.synchronize {
138
138
  @fatal = e
139
139
  }
140
- ensure
141
- context.release
142
140
  end
143
141
  end
144
142
 
@@ -0,0 +1,40 @@
1
+ require 'benchmark'
2
+ require 'ostruct'
3
+
4
+ module Crabfarm
5
+ module Engines
6
+ class SyncStateManager
7
+
8
+ def initialize(_context)
9
+ @context = _context
10
+ @lock = Mutex.new
11
+ end
12
+
13
+ def reload!
14
+ @lock.synchronize {
15
+ ActiveSupport::Dependencies.clear
16
+ @context.reset
17
+ }
18
+ end
19
+
20
+ def reset
21
+ @lock.synchronize {
22
+ @context.reset
23
+ }
24
+ end
25
+
26
+ def transition(_name, _params={})
27
+ @lock.synchronize {
28
+ output = { name: _name, params: _params }
29
+
30
+ output[:elapsed] = Benchmark.measure do
31
+ output[:doc] = TransitionService.apply_state(@context, _name, _params).output_as_json
32
+ end
33
+
34
+ OpenStruct.new output
35
+ }
36
+ end
37
+
38
+ end
39
+ end
40
+ end
@@ -4,18 +4,25 @@ module Crabfarm
4
4
 
5
5
  class ConfigurationError < Error; end
6
6
 
7
- class AssertionError < Error; end
7
+ class BinaryMissingError < ConfigurationError
8
8
 
9
- class EntityNotFoundError < Error
10
- attr_accessor :role, :name
9
+ attr_accessor :binary
10
+ attr_accessor :path
11
11
 
12
- def initialize(_role, _name)
13
- super("The required #{_role} was not found (#{_name})")
14
- @role = _role
15
- @name = _name
12
+ def initialize(_binary, _path)
13
+ @binary = _binary
14
+ @path = _path
15
+ super "Could not find a suitable version of #{@binary}"
16
16
  end
17
+
17
18
  end
18
19
 
20
+ class AssertionError < Error; end
21
+
22
+ class ArgumentError < Error; end
23
+
24
+ class ResourceNotFoundError < Crabfarm::Error; end
25
+
19
26
  class ApiError < Error
20
27
  def code; 500 end
21
28
  def to_json; {} end
@@ -0,0 +1,22 @@
1
+ module Crabfarm
2
+ module GlobalState
3
+
4
+ def inside_crawler_app?
5
+ defined? CF_PATH
6
+ end
7
+
8
+ def app_path
9
+ CF_PATH
10
+ end
11
+
12
+ def memento_path(_name)
13
+ File.join(app_path, 'spec/mementos', _name + '.json.gz')
14
+ end
15
+
16
+ def snapshot_path(_file)
17
+ File.join(app_path, 'spec/snapshots', _file)
18
+ end
19
+
20
+ extend self
21
+ end
22
+ end
@@ -1,28 +1,23 @@
1
- require 'benchmark'
2
1
  require 'readline'
3
2
  require 'rainbow'
4
3
  require 'rainbow/ext/string'
5
4
  require 'json'
5
+ require 'crabfarm/engines/sync_state_loop'
6
6
 
7
7
  module Crabfarm
8
8
  module Modes
9
9
  class Console
10
10
 
11
- class ConsoleDsl
12
-
13
- def initialize(_context)
14
- @context = _context
15
- end
11
+ class ConsoleDsl < Engines::SyncStateManager
16
12
 
17
13
  def reload!
18
14
  puts "Reloading crawler source".color(:green)
19
- ActiveSupport::Dependencies.clear
20
- @context.reset
15
+ super
21
16
  end
22
17
 
23
18
  def reset
24
19
  puts "Resetting crawling context".color(:green)
25
- @context.reset
20
+ super
26
21
  end
27
22
 
28
23
  def transition(_name=nil, _params={})
@@ -32,17 +27,14 @@ module Crabfarm
32
27
  end
33
28
 
34
29
  begin
35
- elapsed = Benchmark.measure do
36
- puts "Transitioning to #{_name.to_s.camelize} state"
37
- doc = TransitionService.apply_state(@context, _name, _params).output_as_json
30
+ puts "Transitioning to #{_name.to_s.camelize} state"
31
+ output = super
38
32
 
39
- puts "State changed, generated document:"
40
- puts JSON.pretty_generate(doc).color(:green).gsub(/(^|\\n)/, ' ')
41
- end
42
- puts "Completed in #{elapsed.real} s"
43
- rescue EntityNotFoundError => e
44
- puts "#{e.to_s}".color(:red)
45
- rescue => e
33
+ puts "State changed, generated document:"
34
+ puts JSON.pretty_generate(output.doc).color(:green).gsub(/(^|\\n)/, ' ')
35
+ puts "Completed in #{output.elapsed.real} s"
36
+
37
+ rescue Exception => e
46
38
  puts "#{e.to_s}".color(:red)
47
39
  puts e.backtrace
48
40
  end
@@ -56,7 +48,7 @@ module Crabfarm
56
48
  alias :r :reset
57
49
  end
58
50
 
59
- def self.start(_context)
51
+ def self.process_input(_context)
60
52
  dsl = ConsoleDsl.new _context
61
53
 
62
54
  loop do
@@ -71,8 +63,7 @@ module Crabfarm
71
63
  end
72
64
  end
73
65
 
74
- puts "Releasing crawling context".color(:green)
75
- _context.release
66
+ puts "Exiting".color(:green)
76
67
  end
77
68
 
78
69
  end
@@ -9,7 +9,7 @@ module Crabfarm
9
9
  module Generator
10
10
 
11
11
  def generate_app(_target, _name, _default_remote=nil)
12
- with_external_path _target do
12
+ with_base_path _target do
13
13
  binding = {
14
14
  name: _name,
15
15
  remote: _default_remote,
@@ -35,38 +35,29 @@ module Crabfarm
35
35
  end
36
36
  end
37
37
 
38
- def generate_state(_name)
38
+ def generate_state(_target, _name)
39
39
  class_name = _name.camelize
40
- with_crawler_path do
40
+ with_base_path _target do
41
41
  binding = { state_class: class_name.camelize }
42
42
  path('app', 'states', class_name.underscore + '.rb').render('state.rb', binding)
43
43
  path('spec', 'states', class_name.underscore + '_spec.rb').render('state_spec.rb', binding)
44
44
  end
45
45
  end
46
46
 
47
- def generate_parser(_name)
47
+ def generate_parser(_target, _name)
48
48
  class_name = _name.camelize + 'Parser'
49
- with_crawler_path do
49
+ with_base_path _target do
50
50
  binding = { parser_class: class_name }
51
51
  path('app', 'parsers', class_name.underscore + '.rb').render('parser.rb', binding)
52
52
  path('spec', 'parsers', class_name.underscore + '_spec.rb').render('parser_spec.rb', binding)
53
53
  end
54
54
  end
55
55
 
56
- def with_external_path(_target)
56
+ def with_base_path(_target)
57
57
  @base_path = _target
58
58
  yield
59
59
  end
60
60
 
61
- def with_crawler_path
62
- if defined? CF_PATH
63
- @base_path = CF_PATH
64
- yield
65
- else
66
- puts "This command can only be run inside a crabfarm application"
67
- end
68
- end
69
-
70
61
  def path(*_args)
71
62
  @path = _args
72
63
  self
@@ -14,7 +14,7 @@ module Crabfarm
14
14
  module Publisher
15
15
  extend self
16
16
 
17
- DEFAULT_HOST = 'http://www.crabfarm.io'
17
+ DEFAULT_HOST = 'http://api.crabfarm.io'
18
18
 
19
19
  def publish(_path, _options={})
20
20
 
@@ -4,37 +4,42 @@ require 'crabfarm/crabtrap_runner'
4
4
 
5
5
  module Crabfarm
6
6
  module Modes
7
- class Recorder
7
+ module Recorder
8
8
 
9
- def self.start(_target)
10
- return puts "Must provide a recording name" unless _target.is_a? String
9
+ def self.start(_target, _replay=false)
10
+ return puts "Must provide a recording target" unless _target.is_a? String
11
+ return puts "Memento file does not exist: #{_target}" if _replay and not File.exist? _target
11
12
 
12
13
  crabtrap_config = Crabfarm.config.crabtrap_config
13
- crabtrap_config[:mode] = :capture
14
- crabtrap_config[:bucket_path] = File.join(CF_PATH, 'spec/mementos', _target + '.json.gz')
14
+ crabtrap_config[:mode] = _replay ? :replay : :capture
15
+ crabtrap_config[:port] = Utils::PortDiscovery.find_available_port
16
+ crabtrap_config[:bucket_path] = _target
15
17
 
16
18
  crabtrap = CrabtrapRunner.new crabtrap_config
17
19
  crabtrap.start
18
20
 
19
- driver_config = Crabfarm.config.driver_config
20
- driver_config[:name] = :firefox
21
- driver_config[:proxy] = "127.0.0.1:#{crabtrap.port}"
22
-
23
- driver = DefaultDriverFactory.new(driver_config).build_driver nil
24
-
25
21
  begin
26
- puts "Press Ctrl-C to stop capturing."
27
- loop do
28
- driver.current_url
29
- sleep 1.0
22
+ driver_config = Crabfarm.config.driver_config
23
+ driver_config[:name] = Crabfarm.config.recorder_driver
24
+ driver_config[:proxy] = "127.0.0.1:#{crabtrap.port}"
25
+
26
+ driver = DefaultDriverFactory.new(driver_config).build_driver nil
27
+
28
+ begin
29
+ puts "Press Ctrl-C to stop capturing."
30
+ loop do
31
+ driver.current_url
32
+ sleep 1.0
33
+ end
34
+ rescue Selenium::WebDriver::Error::WebDriverError, SystemExit, Interrupt
35
+ # noop
30
36
  end
31
- rescue Selenium::WebDriver::Error::WebDriverError, SystemExit, Interrupt
32
- # noop
33
- end
34
37
 
35
- puts "Releasing crawling context".color(:green)
36
- driver.quit rescue nil
37
- crabtrap.stop
38
+ puts "Releasing crawling context".color(:green)
39
+ driver.quit rescue nil
40
+ ensure
41
+ crabtrap.stop
42
+ end
38
43
  end
39
44
 
40
45
  end
@@ -1,6 +1,6 @@
1
1
  require 'grape'
2
2
  require 'crabfarm/support/custom_puma'
3
- require 'crabfarm/engines/safe_state_loop'
3
+ require 'crabfarm/engines/async_state_manager'
4
4
 
5
5
  module Crabfarm
6
6
  module Modes
@@ -54,7 +54,7 @@ module Crabfarm
54
54
  optional :wait, type: Float
55
55
  end
56
56
  put :state do
57
- print_state evaluator.change_state params[:name], params[:params], wait
57
+ print_state evaluator.transition params[:name], params[:params], wait
58
58
  end
59
59
  end
60
60
 
@@ -62,8 +62,8 @@ module Crabfarm
62
62
  @@evaluator
63
63
  end
64
64
 
65
- def self.start(_options)
66
- @@evaluator = Engines::SafeStateLoop.new
65
+ def self.serve(_context, _options)
66
+ @@evaluator = Engines::AsyncStateManager.new _context
67
67
  @@evaluator.start
68
68
  begin
69
69
  Support::CustomPuma.run API, _options
@@ -5,16 +5,17 @@ module Crabfarm
5
5
 
6
6
  PHANTOM_START_TM = 5 # seconds
7
7
 
8
- attr_reader :port
9
-
10
8
  def initialize(_config={})
11
9
  @config = _config;
12
10
  @pid = nil
13
11
  end
14
12
 
13
+ def port
14
+ @config[:port]
15
+ end
16
+
15
17
  def start
16
- find_available_port
17
- Crabfarm.logger.info "Starting phantomjs in port #{@port}"
18
+ Crabfarm.logger.info "Starting phantomjs in port #{port}"
18
19
  @pid = spawn_phantomjs
19
20
  Crabfarm.logger.info "Phantomjs started (PID: #{@pid})"
20
21
  end
@@ -32,22 +33,25 @@ module Crabfarm
32
33
  private
33
34
 
34
35
  def spawn_phantomjs
35
- pid = Process.spawn({}, phantomjs_cmd)
36
+ pid = nil
36
37
  begin
38
+ pid = Process.spawn({}, phantomjs_cmd)
37
39
  Timeout::timeout(PHANTOM_START_TM) { wait_for_server }
40
+ return pid
41
+ rescue Errno::ENOENT
42
+ raise BinaryMissingError.new 'phantomjs', @config[:bin_path]
38
43
  rescue Timeout::Error
39
44
  Process.kill "INT", pid
40
45
  Process.wait pid
41
46
  raise
42
47
  end
43
- return pid
44
48
  end
45
49
 
46
50
  def phantomjs_cmd
47
51
  cmd = [@config[:bin_path]]
48
52
  cmd << '--load-images=false' unless @config[:load_images]
49
53
  cmd << "--proxy=#{@config[:proxy]}" unless @config[:proxy].nil?
50
- cmd << "--webdriver=#{@port}"
54
+ cmd << "--webdriver=#{port}"
51
55
  cmd << "--ssl-protocol=#{@config[:ssl]}" unless @config[:ssl].nil?
52
56
  cmd << "--ignore-ssl-errors=true"
53
57
  cmd << "--webdriver-loglevel=WARN"
@@ -55,37 +59,15 @@ module Crabfarm
55
59
  cmd.join(' ')
56
60
  end
57
61
 
58
- def find_available_port
59
- with_lock do
60
- socket = Socket.new(:INET, :STREAM, 0)
61
- socket.bind(Addrinfo.tcp("127.0.0.1", 0))
62
- @port = socket.local_address.ip_port
63
- socket.close
64
- end
65
- end
66
-
67
62
  def wait_for_server
68
63
  loop do
69
64
  begin
70
- Net::HTTP.get_response(URI.parse("http://127.0.0.1:#{@port}/status"))
65
+ Net::HTTP.get_response(URI.parse("http://127.0.0.1:#{port}/status"))
71
66
  break
72
67
  rescue
73
68
  end
74
69
  end
75
70
  end
76
71
 
77
- def with_lock
78
- return yield if @config[:lock_file].nil?
79
-
80
- File.open(@config[:lock_file], 'a+') do |file|
81
- begin
82
- file.flock File::LOCK_EX
83
- return yield
84
- ensure
85
- file.flock File::LOCK_UN
86
- end
87
- end
88
- end
89
-
90
72
  end
91
73
  end
@@ -1,26 +1,22 @@
1
- require 'crabfarm/crabtrap_context'
2
- require 'net/http'
3
-
4
- CF_TEST_CONTEXT = Crabfarm::CrabtrapContext::new
5
- CF_TEST_CONTEXT.load
6
-
7
1
  module Crabfarm
8
2
  module RSpec
9
3
 
10
- def parse(_snap_or_url, _options={})
11
- fixture = Pathname.new(File.join(ENV['SNAPSHOT_DIR'], _snap_or_url))
12
- html = if fixture.exist?
13
- File.read fixture.realpath
14
- else
15
- Net::HTTP.get(URI.parse _snap_or_url)
16
- end
4
+ class Error < Crabfarm::Error; end
17
5
 
6
+ def parse(_snapshot, _options={})
7
+ snapshot_path = GlobalState.snapshot_path _snapshot
8
+ raise Error.new "Snapshot does not exist #{_snapshot}" unless File.exist? snapshot_path
9
+
10
+ html = File.read snapshot_path
18
11
  parser = described_class.new html, _options
19
12
  parser.parse
20
13
  parser
21
14
  end
22
15
 
23
16
  def crawl(_state=nil, _params={})
17
+
18
+ raise Error.new "Crawl is only available in state specs" if @context.nil?
19
+
24
20
  if _state.is_a? Hash
25
21
  _params = _state
26
22
  _state = nil
@@ -28,9 +24,9 @@ module Crabfarm
28
24
 
29
25
  if _state.nil?
30
26
  return nil unless described_class < BaseState # TODO: maybe raise an error here.
31
- @state = @last_state = TransitionService.apply_state CF_TEST_CONTEXT, described_class, _params
27
+ @state = @last_state = TransitionService.apply_state @context, described_class, _params
32
28
  else
33
- @last_state = TransitionService.apply_state CF_TEST_CONTEXT, _state, _params
29
+ @last_state = TransitionService.apply_state @context, _state, _params
34
30
  end
35
31
  end
36
32
 
@@ -46,26 +42,30 @@ module Crabfarm
46
42
  @parser
47
43
  end
48
44
 
45
+ def driver(_session_id=nil)
46
+ @context.pool.driver(_session_id)
47
+ end
48
+
49
49
  end
50
50
  end
51
51
 
52
52
  RSpec.configure do |config|
53
53
  config.include Crabfarm::RSpec
54
54
 
55
- config.before(:example) do |example|
56
-
57
- if example.metadata[:parsing]
58
- @parser = parse example.metadata[:parsing], example.metadata[:using] || {}
59
- end
60
-
61
- if example.metadata[:crawling]
62
- CF_TEST_CONTEXT.replay File.join(CF_PATH, 'spec/mementos', example.metadata[:crawling] + '.json.gz')
55
+ config.around(:example) do |example|
56
+ if described_class < Crabfarm::BaseParser
57
+ if example.metadata[:parsing]
58
+ @parser = parse example.metadata[:parsing], example.metadata[:using] || {}
59
+ end
60
+ example.run
61
+ elsif described_class < Crabfarm::BaseState
62
+ Crabfarm::ContextFactory.with_context example.metadata[:crawling] do |ctx|
63
+ @context = ctx
64
+ example.run
65
+ end
63
66
  else
64
- CF_TEST_CONTEXT.pass_through
67
+ example.run
65
68
  end
66
69
  end
67
70
 
68
- config.after(:suite) do
69
- CF_TEST_CONTEXT.release
70
- end
71
71
  end
@@ -65,10 +65,13 @@ set_driver :phantomjs
65
65
  # set_driver_remote_timeout 120
66
66
 
67
67
 
68
- # Crabtrap launcher configuration
68
+ # Recording configuration
69
69
  ########################################
70
70
 
71
- # The following parameters only apply when running the crabtrap proxy on specs
71
+ # The following parameters only apply when recording or replaying mementos
72
72
 
73
73
  # Set the crabtrap executable location, by default crabfarm expects crabtrap to be included in the environment
74
74
  # set_crabtrap_bin_path 'crabtrap'
75
+
76
+ # Change the browser used for the recording command, available options are 'firefox' or 'chrome'
77
+ # set_recorder_driver :firefox
@@ -1,8 +1,6 @@
1
1
  require File.expand_path("../../boot", __FILE__)
2
2
  Bundler.require :test
3
3
 
4
- ENV['SNAPSHOT_DIR'] ||= File.expand_path("../snapshots", __FILE__)
5
-
6
4
  require "crabfarm/rspec"
7
5
 
8
6
  RSpec.configure do |config|
@@ -6,7 +6,7 @@ module Crabfarm
6
6
  load_by_name _name
7
7
  else _name end
8
8
 
9
- _context.load
9
+ _context.prepare
10
10
  state = state_class.new _context, _params
11
11
  state.crawl
12
12
  state
@@ -0,0 +1,17 @@
1
+ module Crabfarm
2
+ module Utils
3
+ module PortDiscovery
4
+
5
+ def self.find_available_port
6
+ begin
7
+ socket = Socket.new(:INET, :STREAM, 0)
8
+ socket.bind(Addrinfo.tcp("127.0.0.1", 0))
9
+ return socket.local_address.ip_port
10
+ ensure
11
+ socket.close rescue nil
12
+ end
13
+ end
14
+
15
+ end
16
+ end
17
+ end
@@ -1,3 +1,3 @@
1
1
  module Crabfarm
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: crabfarm
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ignacio Baixas
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-03-24 00:00:00.000000000 Z
11
+ date: 2015-03-25 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: selenium-webdriver
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - ~>
18
18
  - !ruby/object:Gem::Version
19
- version: '2.33'
19
+ version: '2.45'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ~>
25
25
  - !ruby/object:Gem::Version
26
- version: '2.33'
26
+ version: '2.45'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: nokogiri
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -377,6 +377,7 @@ files:
377
377
  - lib/crabfarm/cli.rb
378
378
  - lib/crabfarm/configuration.rb
379
379
  - lib/crabfarm/context.rb
380
+ - lib/crabfarm/context_factory.rb
380
381
  - lib/crabfarm/crabtrap_context.rb
381
382
  - lib/crabfarm/crabtrap_runner.rb
382
383
  - lib/crabfarm/default_driver_factory.rb
@@ -385,10 +386,11 @@ files:
385
386
  - lib/crabfarm/dsl/surfer/search_context.rb
386
387
  - lib/crabfarm/dsl/surfer/surf_context.rb
387
388
  - lib/crabfarm/dsl/surfer.rb
388
- - lib/crabfarm/engines/safe_state_loop.rb
389
+ - lib/crabfarm/engines/async_state_manager.rb
390
+ - lib/crabfarm/engines/sync_state_manager.rb
389
391
  - lib/crabfarm/errors.rb
390
- - lib/crabfarm/event_store.rb
391
392
  - lib/crabfarm/forked_state.rb
393
+ - lib/crabfarm/global_state.rb
392
394
  - lib/crabfarm/http_client.rb
393
395
  - lib/crabfarm/mocks/noop_driver.rb
394
396
  - lib/crabfarm/modes/console.rb
@@ -417,6 +419,7 @@ files:
417
419
  - lib/crabfarm/templates/state.rb.erb
418
420
  - lib/crabfarm/templates/state_spec.rb.erb
419
421
  - lib/crabfarm/transition_service.rb
422
+ - lib/crabfarm/utils/port_discovery.rb
420
423
  - lib/crabfarm/version.rb
421
424
  - lib/crabfarm.rb
422
425
  - bin/crabfarm
@@ -1,20 +0,0 @@
1
- module Crabfarm
2
- class EventStore
3
-
4
- def initialize
5
- @events = []
6
- @mutex = Mutex.new
7
- end
8
-
9
- def event(_category, _message)
10
- @mutex.synchronize do
11
- @events << {
12
- created_at: Time.current,
13
- category: _category,
14
- msg: _message
15
- }
16
- end
17
- end
18
-
19
- end
20
- end