crabfarm 0.2.5 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/crabfarm.rb +17 -18
- data/lib/crabfarm/adapters/browser/abstract_webdriver.rb +60 -0
- data/lib/crabfarm/adapters/browser/chrome.rb +24 -0
- data/lib/crabfarm/adapters/browser/firefox.rb +26 -0
- data/lib/crabfarm/adapters/browser/noop.rb +25 -0
- data/lib/crabfarm/adapters/browser/phantom_js.rb +59 -0
- data/lib/crabfarm/adapters/browser/remote_webdriver.rb +31 -0
- data/lib/crabfarm/adapters/driver_wrapper/capybara.rb +11 -0
- data/lib/crabfarm/adapters/driver_wrapper/surfer.rb +13 -0
- data/lib/crabfarm/adapters/{browser → driver_wrapper}/watir.rb +7 -3
- data/lib/crabfarm/adapters/parser/nokogiri.rb +17 -15
- data/lib/crabfarm/adapters/parser/pdf_reader.rb +14 -12
- data/lib/crabfarm/assertion/fields.rb +85 -0
- data/lib/crabfarm/base_navigator.rb +78 -0
- data/lib/crabfarm/base_reducer.rb +68 -0
- data/lib/crabfarm/base_struct.rb +17 -0
- data/lib/crabfarm/cli.rb +18 -8
- data/lib/crabfarm/configuration.rb +24 -51
- data/lib/crabfarm/context.rb +19 -43
- data/lib/crabfarm/crabtrap_context.rb +4 -11
- data/lib/crabfarm/driver_pool.rb +32 -0
- data/lib/crabfarm/dsl/surfer/surf_context.rb +5 -25
- data/lib/crabfarm/engines/async_state_manager.rb +1 -1
- data/lib/crabfarm/engines/sync_state_manager.rb +1 -1
- data/lib/crabfarm/forked_navigator.rb +31 -0
- data/lib/crabfarm/modes/console.rb +4 -4
- data/lib/crabfarm/modes/generator.rb +24 -11
- data/lib/crabfarm/rspec.rb +26 -24
- data/lib/crabfarm/strategies.rb +15 -9
- data/lib/crabfarm/templates/Crabfile.erb +21 -26
- data/lib/crabfarm/templates/Gemfile.erb +6 -0
- data/lib/crabfarm/templates/navigator.rb.erb +20 -0
- data/lib/crabfarm/templates/{state_spec.rb.erb → navigator_spec.rb.erb} +1 -1
- data/lib/crabfarm/templates/{parser.rb.erb → reducer.rb.erb} +4 -4
- data/lib/crabfarm/templates/{parser_spec.rb.erb → reducer_spec.rb.erb} +1 -1
- data/lib/crabfarm/templates/struct.rb.erb +12 -0
- data/lib/crabfarm/transition_service.rb +20 -7
- data/lib/crabfarm/version.rb +1 -1
- metadata +50 -48
- data/lib/crabfarm/adapters/browser/capybara.rb +0 -7
- data/lib/crabfarm/adapters/browser/surfer.rb +0 -9
- data/lib/crabfarm/adapters/output/hash.rb +0 -11
- data/lib/crabfarm/adapters/output/jbuilder.rb +0 -11
- data/lib/crabfarm/adapters/output/ostruct.rb +0 -14
- data/lib/crabfarm/base_parser.rb +0 -59
- data/lib/crabfarm/base_state.rb +0 -112
- data/lib/crabfarm/default_driver_factory.rb +0 -86
- data/lib/crabfarm/driver_bucket.rb +0 -42
- data/lib/crabfarm/driver_bucket_pool.rb +0 -26
- data/lib/crabfarm/forked_state.rb +0 -38
- data/lib/crabfarm/mocks/noop_driver.rb +0 -6
- data/lib/crabfarm/phantom_driver_factory.rb +0 -33
- data/lib/crabfarm/templates/state.rb.erb +0 -8
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'thwait'
|
2
|
+
require 'crabfarm/forked_navigator'
|
3
|
+
require "crabfarm/assertion/context"
|
4
|
+
|
5
|
+
module Crabfarm
|
6
|
+
class BaseNavigator
|
7
|
+
include Assertion::Context
|
8
|
+
extend Forwardable
|
9
|
+
|
10
|
+
attr_reader :params
|
11
|
+
|
12
|
+
def_delegators '@context', :http
|
13
|
+
def_delegators '@context.store', :get, :fetch
|
14
|
+
|
15
|
+
def initialize(_context, _params)
|
16
|
+
@context = _context
|
17
|
+
@params = _params
|
18
|
+
end
|
19
|
+
|
20
|
+
def browser(_name=nil)
|
21
|
+
@context.pool.driver(_name)
|
22
|
+
end
|
23
|
+
|
24
|
+
def download(_url)
|
25
|
+
@context.http.get(_url).body
|
26
|
+
end
|
27
|
+
|
28
|
+
def run
|
29
|
+
raise NotImplementedError.new
|
30
|
+
end
|
31
|
+
|
32
|
+
def reduce(_target=nil, _options={})
|
33
|
+
if _target.is_a? Hash
|
34
|
+
_options = _target
|
35
|
+
_target = browser
|
36
|
+
elsif _target.nil?
|
37
|
+
_target = browser
|
38
|
+
end
|
39
|
+
|
40
|
+
reduce_using(_options.delete(:using) || self.class.name, _target, _options)
|
41
|
+
end
|
42
|
+
|
43
|
+
alias :reduce_with_defaults :reduce
|
44
|
+
|
45
|
+
def fork_each(_enumerator, &_block)
|
46
|
+
session_id = 0
|
47
|
+
mutex = Mutex.new
|
48
|
+
ths = _enumerator.map do |value|
|
49
|
+
session_id += 1
|
50
|
+
start_forked_navigation("th_session_#{session_id}", value, _block, mutex)
|
51
|
+
end
|
52
|
+
ThreadsWait.all_waits(*ths)
|
53
|
+
end
|
54
|
+
|
55
|
+
private
|
56
|
+
|
57
|
+
def reduce_using(_reducer_class, _target, _options={})
|
58
|
+
if _reducer_class.is_a? String or _reducer_class.is_a? Symbol
|
59
|
+
_reducer_class = (Utils::Naming.decode_crabfarm_uri(_reducer_class.to_s) + 'Reducer').constantize
|
60
|
+
end
|
61
|
+
|
62
|
+
reducer = _reducer_class.new _target, @params.merge(_options)
|
63
|
+
reducer.run
|
64
|
+
reducer
|
65
|
+
end
|
66
|
+
|
67
|
+
def start_forked_navigation(_name, _value, _block, _mutex)
|
68
|
+
Thread.new {
|
69
|
+
fork = ForkedNavigator.new @context, self, _name, _mutex
|
70
|
+
begin
|
71
|
+
fork.instance_exec _value, &_block
|
72
|
+
ensure
|
73
|
+
@context.pool.reset _name
|
74
|
+
end
|
75
|
+
}
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require "crabfarm/assertion/fields"
|
2
|
+
|
3
|
+
module Crabfarm
|
4
|
+
class BaseReducer < Delegator
|
5
|
+
include Assertion::Fields
|
6
|
+
|
7
|
+
attr_reader :params, :document
|
8
|
+
|
9
|
+
def self.use_parser(_parser_name)
|
10
|
+
@parser_name = _parser_name
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.parser
|
14
|
+
@parser ||= Strategies.load(:parser, @parser_name || Crabfarm.config.parser)
|
15
|
+
end
|
16
|
+
|
17
|
+
def self.snapshot_path(_name=nil)
|
18
|
+
_name = self.to_s.underscore if _name.nil?
|
19
|
+
File.join(GlobalState.snapshots_path, _name + '.' + parser.format)
|
20
|
+
end
|
21
|
+
|
22
|
+
def parser
|
23
|
+
self.class.parser
|
24
|
+
end
|
25
|
+
|
26
|
+
def initialize(_target, _params)
|
27
|
+
reset_fields
|
28
|
+
|
29
|
+
@parsed_data = parser.preprocess_parsing_target _target
|
30
|
+
@document = parser.parse @parsed_data
|
31
|
+
@params = _params
|
32
|
+
|
33
|
+
super @document
|
34
|
+
end
|
35
|
+
|
36
|
+
def run
|
37
|
+
raise NotImplementedError.new
|
38
|
+
end
|
39
|
+
|
40
|
+
def take_snapshot(_name=nil)
|
41
|
+
file_path = self.class.snapshot_path _name
|
42
|
+
|
43
|
+
dir_path = file_path.split(File::SEPARATOR)[0...-1]
|
44
|
+
FileUtils.mkpath dir_path.join(File::SEPARATOR) if dir_path.length > 0
|
45
|
+
|
46
|
+
File.write file_path, @parsed_data
|
47
|
+
file_path
|
48
|
+
end
|
49
|
+
|
50
|
+
def take_snapshot_and_fail(_name=nil)
|
51
|
+
file_path = take_snapshot _name
|
52
|
+
raise ArgumentError.new "New snapshot for #{self.class.to_s} generated in '#{file_path}'"
|
53
|
+
end
|
54
|
+
|
55
|
+
def as_json(_options=nil)
|
56
|
+
field_hash
|
57
|
+
end
|
58
|
+
|
59
|
+
def __getobj__
|
60
|
+
@document
|
61
|
+
end
|
62
|
+
|
63
|
+
def __setobj__(obj)
|
64
|
+
@document = obj
|
65
|
+
end
|
66
|
+
|
67
|
+
end
|
68
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require "crabfarm/assertion/fields"
|
2
|
+
|
3
|
+
module Crabfarm
|
4
|
+
class BaseStruct
|
5
|
+
include Assertion::Fields
|
6
|
+
|
7
|
+
def initialize(_values={})
|
8
|
+
reset_fields
|
9
|
+
_values.each { |k,v| send("#{k}=", v) }
|
10
|
+
end
|
11
|
+
|
12
|
+
def as_json(_options=nil)
|
13
|
+
field_hash
|
14
|
+
end
|
15
|
+
|
16
|
+
end
|
17
|
+
end
|
data/lib/crabfarm/cli.rb
CHANGED
@@ -85,23 +85,33 @@ module Crabfarm
|
|
85
85
|
end
|
86
86
|
end
|
87
87
|
|
88
|
-
c.desc "Generates a new crabfarm
|
89
|
-
c.command :
|
90
|
-
|
88
|
+
c.desc "Generates a new crabfarm navigator and navigator spec"
|
89
|
+
c.command :navigator do |sub|
|
90
|
+
sub.action do |global_options,options,args|
|
91
91
|
next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?
|
92
92
|
|
93
93
|
require "crabfarm/modes/generator"
|
94
|
-
Crabfarm::Modes::Generator.
|
94
|
+
Crabfarm::Modes::Generator.generate_navigator(GlobalState.app_path, args[0])
|
95
95
|
end
|
96
96
|
end
|
97
97
|
|
98
|
-
c.desc "Generates a new crabfarm
|
99
|
-
c.command :
|
100
|
-
|
98
|
+
c.desc "Generates a new crabfarm reducer and reducer spec"
|
99
|
+
c.command :reducer do |sub|
|
100
|
+
sub.action do |global_options,options,args|
|
101
101
|
next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?
|
102
102
|
|
103
103
|
require "crabfarm/modes/generator"
|
104
|
-
Crabfarm::Modes::Generator.
|
104
|
+
Crabfarm::Modes::Generator.generate_reducer(GlobalState.app_path, args[0])
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
c.desc "Generates a new crabfarm struct"
|
109
|
+
c.command :reducer do |sub|
|
110
|
+
sub.action do |global_options,options,args|
|
111
|
+
next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?
|
112
|
+
|
113
|
+
require "crabfarm/modes/generator"
|
114
|
+
Crabfarm::Modes::Generator.generate_struct(GlobalState.app_path, args[0])
|
105
115
|
end
|
106
116
|
end
|
107
117
|
end
|
@@ -5,21 +5,20 @@ module Crabfarm
|
|
5
5
|
class Option < Struct.new(:name, :type, :text); end
|
6
6
|
|
7
7
|
OPTIONS = [
|
8
|
-
|
9
|
-
[:
|
10
|
-
[:
|
11
|
-
[:driver_factory, :mixed, 'Driver factory, disabled if phantom_mode is used'],
|
8
|
+
# Global options
|
9
|
+
[:browser, ['chrome', 'firefox', 'phantomjs', 'remote'], 'Browser engine to be used by navigators, common options: phantomjs, chrome, firefox, remote.'],
|
10
|
+
[:parser, :string, 'Default parser engine used by reducers'],
|
12
11
|
[:log_path, :string, 'Path where logs should be stored'],
|
13
12
|
[:proxy, :string, 'If given, a proxy is used to connect to the internet if driver supports it'],
|
14
13
|
|
15
|
-
#
|
16
|
-
[:
|
17
|
-
[:
|
18
|
-
[:
|
19
|
-
[:
|
20
|
-
[:
|
21
|
-
[:
|
22
|
-
[:
|
14
|
+
# Webdriver configuration parameters
|
15
|
+
[:webdriver_host, :string, 'Remote host, only available in driver: remote'],
|
16
|
+
[:webdriver_port, :integer, 'Remote port, only available in driver: remote'],
|
17
|
+
[:webdriver_capabilities, :mixed, 'Driver capabilities, depends on selected driver.'],
|
18
|
+
[:webdriver_remote_timeout, :float, 'Request timeout in seconds, only available for remote or phatomjs driver.'],
|
19
|
+
[:webdriver_window_width, :integer, 'Initial browser window width.'],
|
20
|
+
[:webdriver_window_height, :integer, 'Initial browser window height.'],
|
21
|
+
[:webdriver_dsl, :string, 'Webdriver wrapper to use, built in options are watir and surfer'],
|
23
22
|
|
24
23
|
# Phantom launcher configuration
|
25
24
|
[:phantom_load_images, :boolean, 'Phantomjs image loading, only for phantomjs driver.'],
|
@@ -50,19 +49,18 @@ module Crabfarm
|
|
50
49
|
|
51
50
|
def reset
|
52
51
|
@values = {
|
53
|
-
|
54
|
-
|
55
|
-
output_builder: :hash,
|
52
|
+
browser: 'phantomjs',
|
53
|
+
parser: :nokogiri,
|
56
54
|
driver_factory: nil,
|
57
55
|
log_path: nil,
|
58
56
|
proxy: nil,
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
57
|
+
webdriver_capabilities: nil,
|
58
|
+
webdriver_host: 'localhost',
|
59
|
+
webdriver_port: '8080',
|
60
|
+
webdriver_remote_timeout: 120,
|
61
|
+
webdriver_window_width: 1280,
|
62
|
+
webdriver_window_height: 800,
|
63
|
+
webdriver_dsl: :surfer,
|
66
64
|
phantom_load_images: false,
|
67
65
|
phantom_ssl: 'any',
|
68
66
|
phantom_bin_path: 'phantomjs',
|
@@ -75,38 +73,13 @@ module Crabfarm
|
|
75
73
|
@values.merge! _options
|
76
74
|
end
|
77
75
|
|
78
|
-
def
|
79
|
-
if
|
80
|
-
elsif
|
81
|
-
else "http://#{
|
76
|
+
def webdriver_remote_host
|
77
|
+
if webdriver_host then nil
|
78
|
+
elsif webdriver_port then "http://#{webdriver_host}"
|
79
|
+
else "http://#{webdriver_host}:#{webdriver_port}"
|
82
80
|
end
|
83
81
|
end
|
84
82
|
|
85
|
-
def driver_config
|
86
|
-
{
|
87
|
-
name: driver,
|
88
|
-
proxy: proxy,
|
89
|
-
capabilities: driver_capabilities,
|
90
|
-
remote_host: driver_remote_host,
|
91
|
-
remote_timeout: driver_remote_timeout,
|
92
|
-
window_width: driver_window_width,
|
93
|
-
window_height: driver_window_height
|
94
|
-
}
|
95
|
-
end
|
96
|
-
|
97
|
-
def phantom_mode_enabled?
|
98
|
-
driver.to_s == 'phantomjs'
|
99
|
-
end
|
100
|
-
|
101
|
-
def phantom_config
|
102
|
-
{
|
103
|
-
load_images: phantom_load_images,
|
104
|
-
proxy: proxy,
|
105
|
-
ssl: phantom_ssl,
|
106
|
-
bin_path: phantom_bin_path
|
107
|
-
}
|
108
|
-
end
|
109
|
-
|
110
83
|
def crabtrap_config
|
111
84
|
{
|
112
85
|
bin_path: crabtrap_bin_path,
|
data/lib/crabfarm/context.rb
CHANGED
@@ -32,7 +32,7 @@ module Crabfarm
|
|
32
32
|
private
|
33
33
|
|
34
34
|
def load_services
|
35
|
-
|
35
|
+
init_driver_factory
|
36
36
|
init_driver_pool
|
37
37
|
init_http_client
|
38
38
|
end
|
@@ -43,71 +43,47 @@ module Crabfarm
|
|
43
43
|
end
|
44
44
|
|
45
45
|
def unload_services
|
46
|
-
release_driver_pool
|
47
46
|
release_http_client
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
def init_driver_pool
|
52
|
-
@pool = DriverBucketPool.new build_driver_factory if @pool.nil?
|
47
|
+
release_driver_pool
|
48
|
+
release_driver_factory
|
53
49
|
end
|
54
50
|
|
55
|
-
def
|
56
|
-
|
57
|
-
|
51
|
+
def init_driver_factory
|
52
|
+
if @factory.nil?
|
53
|
+
@factory = Strategies.load(:browser, config.browser).new proxy
|
54
|
+
@factory.prepare_driver_services
|
55
|
+
end
|
58
56
|
end
|
59
57
|
|
60
|
-
def
|
61
|
-
|
62
|
-
|
63
|
-
end
|
58
|
+
def release_driver_factory
|
59
|
+
@factory.cleanup_driver_services unless @factory.nil?
|
60
|
+
@factory.nil?
|
64
61
|
end
|
65
62
|
|
66
|
-
def
|
67
|
-
|
68
|
-
new_phantom = PhantomRunner.new phantom_config.merge(port: phantom_port)
|
69
|
-
new_phantom.start
|
70
|
-
return new_phantom
|
63
|
+
def init_driver_pool
|
64
|
+
@pool = DriverPool.new @factory if @pool.nil?
|
71
65
|
end
|
72
66
|
|
73
|
-
def
|
74
|
-
@
|
75
|
-
@
|
67
|
+
def release_driver_pool
|
68
|
+
@pool.release unless @pool.nil?
|
69
|
+
@pool = nil
|
76
70
|
end
|
77
71
|
|
78
72
|
def init_http_client
|
79
|
-
@http =
|
73
|
+
@http = HttpClient.new proxy if @http.nil?
|
80
74
|
end
|
81
75
|
|
82
76
|
def release_http_client
|
83
77
|
@http = nil
|
84
78
|
end
|
85
79
|
|
86
|
-
def
|
87
|
-
|
88
|
-
PhantomDriverFactory.new @phantom, driver_config
|
89
|
-
else
|
90
|
-
return config.driver_factory if config.driver_factory
|
91
|
-
DefaultDriverFactory.new driver_config
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
|
-
def build_http_client
|
96
|
-
HttpClient.new config.proxy
|
80
|
+
def proxy
|
81
|
+
Crabfarm.config.proxy
|
97
82
|
end
|
98
83
|
|
99
84
|
def config
|
100
85
|
Crabfarm.config
|
101
86
|
end
|
102
87
|
|
103
|
-
def driver_config
|
104
|
-
config.driver_config
|
105
|
-
end
|
106
|
-
|
107
|
-
def phantom_config
|
108
|
-
config.phantom_config
|
109
|
-
end
|
110
|
-
|
111
88
|
end
|
112
|
-
|
113
89
|
end
|
@@ -57,10 +57,6 @@ module Crabfarm
|
|
57
57
|
@port = nil
|
58
58
|
end
|
59
59
|
|
60
|
-
def build_http_client
|
61
|
-
HttpClient.new proxy_address
|
62
|
-
end
|
63
|
-
|
64
60
|
def start_daemon
|
65
61
|
if @runner.nil?
|
66
62
|
options = {
|
@@ -69,7 +65,7 @@ module Crabfarm
|
|
69
65
|
port: @port
|
70
66
|
}
|
71
67
|
|
72
|
-
@runner = CrabtrapRunner.new
|
68
|
+
@runner = CrabtrapRunner.new config.crabtrap_config.merge(options)
|
73
69
|
@runner.start
|
74
70
|
end
|
75
71
|
end
|
@@ -81,12 +77,9 @@ module Crabfarm
|
|
81
77
|
else nil end
|
82
78
|
end
|
83
79
|
|
84
|
-
def
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
def phantom_config
|
89
|
-
super.merge(proxy: proxy_address)
|
80
|
+
def proxy
|
81
|
+
# just step over configuration proxy
|
82
|
+
proxy_address
|
90
83
|
end
|
91
84
|
|
92
85
|
def proxy_address
|