crabfarm 0.2.5 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/crabfarm.rb +17 -18
- data/lib/crabfarm/adapters/browser/abstract_webdriver.rb +60 -0
- data/lib/crabfarm/adapters/browser/chrome.rb +24 -0
- data/lib/crabfarm/adapters/browser/firefox.rb +26 -0
- data/lib/crabfarm/adapters/browser/noop.rb +25 -0
- data/lib/crabfarm/adapters/browser/phantom_js.rb +59 -0
- data/lib/crabfarm/adapters/browser/remote_webdriver.rb +31 -0
- data/lib/crabfarm/adapters/driver_wrapper/capybara.rb +11 -0
- data/lib/crabfarm/adapters/driver_wrapper/surfer.rb +13 -0
- data/lib/crabfarm/adapters/{browser → driver_wrapper}/watir.rb +7 -3
- data/lib/crabfarm/adapters/parser/nokogiri.rb +17 -15
- data/lib/crabfarm/adapters/parser/pdf_reader.rb +14 -12
- data/lib/crabfarm/assertion/fields.rb +85 -0
- data/lib/crabfarm/base_navigator.rb +78 -0
- data/lib/crabfarm/base_reducer.rb +68 -0
- data/lib/crabfarm/base_struct.rb +17 -0
- data/lib/crabfarm/cli.rb +18 -8
- data/lib/crabfarm/configuration.rb +24 -51
- data/lib/crabfarm/context.rb +19 -43
- data/lib/crabfarm/crabtrap_context.rb +4 -11
- data/lib/crabfarm/driver_pool.rb +32 -0
- data/lib/crabfarm/dsl/surfer/surf_context.rb +5 -25
- data/lib/crabfarm/engines/async_state_manager.rb +1 -1
- data/lib/crabfarm/engines/sync_state_manager.rb +1 -1
- data/lib/crabfarm/forked_navigator.rb +31 -0
- data/lib/crabfarm/modes/console.rb +4 -4
- data/lib/crabfarm/modes/generator.rb +24 -11
- data/lib/crabfarm/rspec.rb +26 -24
- data/lib/crabfarm/strategies.rb +15 -9
- data/lib/crabfarm/templates/Crabfile.erb +21 -26
- data/lib/crabfarm/templates/Gemfile.erb +6 -0
- data/lib/crabfarm/templates/navigator.rb.erb +20 -0
- data/lib/crabfarm/templates/{state_spec.rb.erb → navigator_spec.rb.erb} +1 -1
- data/lib/crabfarm/templates/{parser.rb.erb → reducer.rb.erb} +4 -4
- data/lib/crabfarm/templates/{parser_spec.rb.erb → reducer_spec.rb.erb} +1 -1
- data/lib/crabfarm/templates/struct.rb.erb +12 -0
- data/lib/crabfarm/transition_service.rb +20 -7
- data/lib/crabfarm/version.rb +1 -1
- metadata +50 -48
- data/lib/crabfarm/adapters/browser/capybara.rb +0 -7
- data/lib/crabfarm/adapters/browser/surfer.rb +0 -9
- data/lib/crabfarm/adapters/output/hash.rb +0 -11
- data/lib/crabfarm/adapters/output/jbuilder.rb +0 -11
- data/lib/crabfarm/adapters/output/ostruct.rb +0 -14
- data/lib/crabfarm/base_parser.rb +0 -59
- data/lib/crabfarm/base_state.rb +0 -112
- data/lib/crabfarm/default_driver_factory.rb +0 -86
- data/lib/crabfarm/driver_bucket.rb +0 -42
- data/lib/crabfarm/driver_bucket_pool.rb +0 -26
- data/lib/crabfarm/forked_state.rb +0 -38
- data/lib/crabfarm/mocks/noop_driver.rb +0 -6
- data/lib/crabfarm/phantom_driver_factory.rb +0 -33
- data/lib/crabfarm/templates/state.rb.erb +0 -8
@@ -0,0 +1,78 @@
|
|
1
|
+
require 'thwait'
|
2
|
+
require 'crabfarm/forked_navigator'
|
3
|
+
require "crabfarm/assertion/context"
|
4
|
+
|
5
|
+
module Crabfarm
  # Base class for navigators.
  #
  # A navigator is built with a context and a params hash. It reaches its
  # collaborators through the context: `http` is delegated to the context and
  # `get`/`fetch` to the context's store. Subclasses are expected to override
  # #run; the base implementation raises NotImplementedError.
  class BaseNavigator
    include Assertion::Context
    extend Forwardable

    attr_reader :params

    def_delegators '@context', :http
    def_delegators '@context.store', :get, :fetch

    # _context - object exposing `pool`, `http` and `store`.
    # _params  - parameters hash, later merged with per-call reducer options.
    def initialize(_context, _params)
      @context = _context
      @params = _params
    end

    # Returns the driver registered under _name in the context's driver pool
    # (nil selects whatever the pool treats as its default).
    def browser(_name=nil)
      @context.pool.driver(_name)
    end

    # Fetches _url through the context's http client and returns the body.
    def download(_url)
      @context.http.get(_url).body
    end

    # Entry point of the navigator; must be implemented by subclasses.
    def run
      raise NotImplementedError.new
    end

    # Builds and runs a reducer over _target.
    #
    # Accepted call shapes:
    #   reduce                    # target = browser, no options
    #   reduce(using: :foo)       # target = browser, options given as hash
    #   reduce(target, options)   # explicit target
    #
    # The :using option selects the reducer; when absent the navigator's own
    # class name is used. Returns the reducer instance after it has run.
    def reduce(_target=nil, _options={})
      if _target.is_a? Hash
        _options = _target
        _target = browser
      elsif _target.nil?
        _target = browser
      end

      # Work on a copy: removing :using must not alter the hash the caller
      # handed in (it may be reused for several reduce calls).
      options = _options.dup
      reducer_ref = options.delete(:using) || self.class.name
      reduce_using(reducer_ref, _target, options)
    end

    alias :reduce_with_defaults :reduce

    # Runs _block once per element of _enumerator, each in its own thread.
    #
    # Every thread gets a ForkedNavigator bound to a distinct session name
    # ("th_session_1", "th_session_2", ...) and a mutex shared by all forks.
    # Blocks until ThreadsWait reports every thread as finished.
    def fork_each(_enumerator, &_block)
      session_id = 0
      mutex = Mutex.new
      threads = _enumerator.map do |value|
        session_id += 1
        start_forked_navigation("th_session_#{session_id}", value, _block, mutex)
      end
      ThreadsWait.all_waits(*threads)
    end

    private

    # Resolves _reducer_class (a class, or a String/Symbol name that is decoded
    # with Utils::Naming.decode_crabfarm_uri and suffixed with 'Reducer'),
    # instantiates it with _target and params merged with _options, runs it
    # and returns the instance.
    def reduce_using(_reducer_class, _target, _options={})
      if _reducer_class.is_a?(String) || _reducer_class.is_a?(Symbol)
        _reducer_class = (Utils::Naming.decode_crabfarm_uri(_reducer_class.to_s) + 'Reducer').constantize
      end

      reducer = _reducer_class.new _target, @params.merge(_options)
      reducer.run
      reducer
    end

    # Starts a thread that evaluates _block (with _value as argument) in the
    # scope of a new ForkedNavigator. The pool entry named _name is always
    # reset afterwards, even if the block raises.
    def start_forked_navigation(_name, _value, _block, _mutex)
      Thread.new {
        forked = ForkedNavigator.new @context, self, _name, _mutex
        begin
          forked.instance_exec _value, &_block
        ensure
          @context.pool.reset _name
        end
      }
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require "crabfarm/assertion/fields"
|
2
|
+
|
3
|
+
module Crabfarm
  # Base class for reducers.
  #
  # A reducer parses a target with the configured parser strategy and then
  # delegates unknown calls to the parsed document (see __getobj__), so
  # subclasses can query the document directly from #run. Subclasses are
  # expected to override #run; the base implementation raises
  # NotImplementedError.
  class BaseReducer < Delegator
    include Assertion::Fields

    attr_reader :params, :document

    # Selects the parser strategy used by this reducer class.
    # Returns _parser_name.
    def self.use_parser(_parser_name)
      # Drop any parser already memoized by .parser, otherwise a selection made
      # after the first lookup would silently be ignored.
      @parser = nil
      @parser_name = _parser_name
    end

    # Returns the (memoized) parser strategy: the one chosen with use_parser,
    # or Crabfarm.config.parser when none was chosen.
    def self.parser
      @parser ||= Strategies.load(:parser, @parser_name || Crabfarm.config.parser)
    end

    # Returns the snapshot file path for _name (defaults to the underscored
    # class name) inside GlobalState.snapshots_path, using the parser's format
    # as file extension.
    def self.snapshot_path(_name=nil)
      _name = self.to_s.underscore if _name.nil?
      File.join(GlobalState.snapshots_path, "#{_name}.#{parser.format}")
    end

    # Instance-level shortcut for the class' parser.
    def parser
      self.class.parser
    end

    # _target - object to be parsed; first handed to the parser's
    #           preprocess_parsing_target, whose result is then parsed.
    # _params - parameters hash made available through #params.
    def initialize(_target, _params)
      reset_fields

      @parsed_data = parser.preprocess_parsing_target _target
      @document = parser.parse @parsed_data
      @params = _params

      super @document
    end

    # Entry point of the reducer; must be implemented by subclasses.
    def run
      raise NotImplementedError.new
    end

    # Writes the preprocessed target data to the snapshot file for _name,
    # creating the containing directory when needed. Returns the file path.
    def take_snapshot(_name=nil)
      file_path = self.class.snapshot_path _name

      # File.dirname copes with root-level and separator-less paths, where a
      # manual split/join would produce an empty directory name.
      FileUtils.mkpath File.dirname(file_path)

      File.write file_path, @parsed_data
      file_path
    end

    # Takes a snapshot and then raises ArgumentError naming the generated file.
    def take_snapshot_and_fail(_name=nil)
      file_path = take_snapshot _name
      raise ArgumentError, "New snapshot for #{self.class.to_s} generated in '#{file_path}'"
    end

    # JSON representation: the hash of declared fields.
    def as_json(_options=nil)
      field_hash
    end

    # Delegator hook: the object that receives delegated calls.
    def __getobj__
      @document
    end

    # Delegator hook: replaces the delegation target.
    def __setobj__(obj)
      @document = obj
    end

  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
require "crabfarm/assertion/fields"
|
2
|
+
|
3
|
+
module Crabfarm
  # Plain value holder built on top of Assertion::Fields.
  class BaseStruct
    include Assertion::Fields

    # Resets the fields and then assigns every entry of _values through the
    # matching "<key>=" writer.
    def initialize(_values={})
      reset_fields

      _values.each do |field_name, field_value|
        send("#{field_name}=", field_value)
      end
    end

    # JSON representation: the hash of declared fields.
    def as_json(_options=nil)
      field_hash
    end

  end
end
|
data/lib/crabfarm/cli.rb
CHANGED
@@ -85,23 +85,33 @@ module Crabfarm
|
|
85
85
|
end
|
86
86
|
end
|
87
87
|
|
88
|
-
c.desc "Generates a new crabfarm
|
89
|
-
c.command :
|
90
|
-
|
88
|
+
c.desc "Generates a new crabfarm navigator and navigator spec"
|
89
|
+
c.command :navigator do |sub|
|
90
|
+
sub.action do |global_options,options,args|
|
91
91
|
next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?
|
92
92
|
|
93
93
|
require "crabfarm/modes/generator"
|
94
|
-
Crabfarm::Modes::Generator.
|
94
|
+
Crabfarm::Modes::Generator.generate_navigator(GlobalState.app_path, args[0])
|
95
95
|
end
|
96
96
|
end
|
97
97
|
|
98
|
-
c.desc "Generates a new crabfarm
|
99
|
-
c.command :
|
100
|
-
|
98
|
+
c.desc "Generates a new crabfarm reducer and reducer spec"
|
99
|
+
c.command :reducer do |sub|
|
100
|
+
sub.action do |global_options,options,args|
|
101
101
|
next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?
|
102
102
|
|
103
103
|
require "crabfarm/modes/generator"
|
104
|
-
Crabfarm::Modes::Generator.
|
104
|
+
Crabfarm::Modes::Generator.generate_reducer(GlobalState.app_path, args[0])
|
105
|
+
end
|
106
|
+
end
|
107
|
+
|
108
|
+
# Struct generator. It must be registered under its own name (:struct):
# reusing :reducer would clash with the reducer generator declared above.
c.desc "Generates a new crabfarm struct"
c.command :struct do |sub|
  sub.action do |global_options,options,args|
    next puts "This command can only be ran inside a crabfarm application" unless GlobalState.inside_crawler_app?

    # Loaded lazily so the generator is only required when actually used.
    require "crabfarm/modes/generator"
    Crabfarm::Modes::Generator.generate_struct(GlobalState.app_path, args[0])
  end
end
|
107
117
|
end
|
@@ -5,21 +5,20 @@ module Crabfarm
|
|
5
5
|
class Option < Struct.new(:name, :type, :text); end
|
6
6
|
|
7
7
|
OPTIONS = [
|
8
|
-
|
9
|
-
[:
|
10
|
-
[:
|
11
|
-
[:driver_factory, :mixed, 'Driver factory, disabled if phantom_mode is used'],
|
8
|
+
# Global options
|
9
|
+
[:browser, ['chrome', 'firefox', 'phantomjs', 'remote'], 'Browser engine to be used by navigators, common options: phantomjs, chrome, firefox, remote.'],
|
10
|
+
[:parser, :string, 'Default parser engine used by reducers'],
|
12
11
|
[:log_path, :string, 'Path where logs should be stored'],
|
13
12
|
[:proxy, :string, 'If given, a proxy is used to connect to the internet if driver supports it'],
|
14
13
|
|
15
|
-
#
|
16
|
-
[:
|
17
|
-
[:
|
18
|
-
[:
|
19
|
-
[:
|
20
|
-
[:
|
21
|
-
[:
|
22
|
-
[:
|
14
|
+
# Webdriver configuration parameters
|
15
|
+
[:webdriver_host, :string, 'Remote host, only available in driver: remote'],
|
16
|
+
[:webdriver_port, :integer, 'Remote port, only available in driver: remote'],
|
17
|
+
[:webdriver_capabilities, :mixed, 'Driver capabilities, depends on selected driver.'],
|
18
|
+
[:webdriver_remote_timeout, :float, 'Request timeout in seconds, only available for remote or phatomjs driver.'],
|
19
|
+
[:webdriver_window_width, :integer, 'Initial browser window width.'],
|
20
|
+
[:webdriver_window_height, :integer, 'Initial browser window height.'],
|
21
|
+
[:webdriver_dsl, :string, 'Webdriver wrapper to use, built in options are watir and surfer'],
|
23
22
|
|
24
23
|
# Phantom launcher configuration
|
25
24
|
[:phantom_load_images, :boolean, 'Phantomjs image loading, only for phantomjs driver.'],
|
@@ -50,19 +49,18 @@ module Crabfarm
|
|
50
49
|
|
51
50
|
def reset
|
52
51
|
@values = {
|
53
|
-
|
54
|
-
|
55
|
-
output_builder: :hash,
|
52
|
+
browser: 'phantomjs',
|
53
|
+
parser: :nokogiri,
|
56
54
|
driver_factory: nil,
|
57
55
|
log_path: nil,
|
58
56
|
proxy: nil,
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
57
|
+
webdriver_capabilities: nil,
|
58
|
+
webdriver_host: 'localhost',
|
59
|
+
webdriver_port: '8080',
|
60
|
+
webdriver_remote_timeout: 120,
|
61
|
+
webdriver_window_width: 1280,
|
62
|
+
webdriver_window_height: 800,
|
63
|
+
webdriver_dsl: :surfer,
|
66
64
|
phantom_load_images: false,
|
67
65
|
phantom_ssl: 'any',
|
68
66
|
phantom_bin_path: 'phantomjs',
|
@@ -75,38 +73,13 @@ module Crabfarm
|
|
75
73
|
@values.merge! _options
|
76
74
|
end
|
77
75
|
|
78
|
-
def
|
79
|
-
if
|
80
|
-
elsif
|
81
|
-
else "http://#{
|
76
|
+
# Builds the remote webdriver URL from the webdriver_host / webdriver_port
# settings.
#
# Returns nil when no host is configured, "http://<host>" when only the host
# is set, and "http://<host>:<port>" when both are set.
def webdriver_remote_host
  if webdriver_host.nil? then nil
  elsif webdriver_port.nil? then "http://#{webdriver_host}"
  else "http://#{webdriver_host}:#{webdriver_port}"
  end
end
|
84
82
|
|
85
|
-
def driver_config
|
86
|
-
{
|
87
|
-
name: driver,
|
88
|
-
proxy: proxy,
|
89
|
-
capabilities: driver_capabilities,
|
90
|
-
remote_host: driver_remote_host,
|
91
|
-
remote_timeout: driver_remote_timeout,
|
92
|
-
window_width: driver_window_width,
|
93
|
-
window_height: driver_window_height
|
94
|
-
}
|
95
|
-
end
|
96
|
-
|
97
|
-
def phantom_mode_enabled?
|
98
|
-
driver.to_s == 'phantomjs'
|
99
|
-
end
|
100
|
-
|
101
|
-
def phantom_config
|
102
|
-
{
|
103
|
-
load_images: phantom_load_images,
|
104
|
-
proxy: proxy,
|
105
|
-
ssl: phantom_ssl,
|
106
|
-
bin_path: phantom_bin_path
|
107
|
-
}
|
108
|
-
end
|
109
|
-
|
110
83
|
def crabtrap_config
|
111
84
|
{
|
112
85
|
bin_path: crabtrap_bin_path,
|
data/lib/crabfarm/context.rb
CHANGED
@@ -32,7 +32,7 @@ module Crabfarm
|
|
32
32
|
private
|
33
33
|
|
34
34
|
def load_services
|
35
|
-
|
35
|
+
init_driver_factory
|
36
36
|
init_driver_pool
|
37
37
|
init_http_client
|
38
38
|
end
|
@@ -43,71 +43,47 @@ module Crabfarm
|
|
43
43
|
end
|
44
44
|
|
45
45
|
def unload_services
|
46
|
-
release_driver_pool
|
47
46
|
release_http_client
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
def init_driver_pool
|
52
|
-
@pool = DriverBucketPool.new build_driver_factory if @pool.nil?
|
47
|
+
release_driver_pool
|
48
|
+
release_driver_factory
|
53
49
|
end
|
54
50
|
|
55
|
-
def
|
56
|
-
|
57
|
-
|
51
|
+
# Lazily builds the browser factory: loads the :browser strategy named by
# config.browser, instantiates it with the proxy and asks it to prepare its
# driver services. Does nothing when a factory is already present.
def init_driver_factory
  return unless @factory.nil?

  browser_strategy = Strategies.load(:browser, config.browser)
  @factory = browser_strategy.new(proxy)
  @factory.prepare_driver_services
end
|
59
57
|
|
60
|
-
def
|
61
|
-
|
62
|
-
|
63
|
-
end
|
58
|
+
# Cleans up the driver services held by the current factory (if any) and
# forgets the factory, so that init_driver_factory builds a fresh one the
# next time services are loaded.
def release_driver_factory
  @factory.cleanup_driver_services unless @factory.nil?
  # Must be an assignment: a bare `@factory.nil?` would leave the cleaned-up
  # factory in place and it would be reused on the next load.
  @factory = nil
end
|
65
62
|
|
66
|
-
def
|
67
|
-
|
68
|
-
new_phantom = PhantomRunner.new phantom_config.merge(port: phantom_port)
|
69
|
-
new_phantom.start
|
70
|
-
return new_phantom
|
63
|
+
# Lazily creates the driver pool on top of the current factory; keeps the
# existing pool when there is one.
def init_driver_pool
  return unless @pool.nil?

  @pool = DriverPool.new(@factory)
end
|
72
66
|
|
73
|
-
def
|
74
|
-
@
|
75
|
-
@
|
67
|
+
# Releases the current driver pool (if any) and forgets it.
def release_driver_pool
  current_pool = @pool
  current_pool.release unless current_pool.nil?
  @pool = nil
end
|
77
71
|
|
78
72
|
def init_http_client
|
79
|
-
@http =
|
73
|
+
@http = HttpClient.new proxy if @http.nil?
|
80
74
|
end
|
81
75
|
|
82
76
|
def release_http_client
|
83
77
|
@http = nil
|
84
78
|
end
|
85
79
|
|
86
|
-
def
|
87
|
-
|
88
|
-
PhantomDriverFactory.new @phantom, driver_config
|
89
|
-
else
|
90
|
-
return config.driver_factory if config.driver_factory
|
91
|
-
DefaultDriverFactory.new driver_config
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
|
-
def build_http_client
|
96
|
-
HttpClient.new config.proxy
|
80
|
+
def proxy
|
81
|
+
Crabfarm.config.proxy
|
97
82
|
end
|
98
83
|
|
99
84
|
def config
|
100
85
|
Crabfarm.config
|
101
86
|
end
|
102
87
|
|
103
|
-
def driver_config
|
104
|
-
config.driver_config
|
105
|
-
end
|
106
|
-
|
107
|
-
def phantom_config
|
108
|
-
config.phantom_config
|
109
|
-
end
|
110
|
-
|
111
88
|
end
|
112
|
-
|
113
89
|
end
|
@@ -57,10 +57,6 @@ module Crabfarm
|
|
57
57
|
@port = nil
|
58
58
|
end
|
59
59
|
|
60
|
-
def build_http_client
|
61
|
-
HttpClient.new proxy_address
|
62
|
-
end
|
63
|
-
|
64
60
|
def start_daemon
|
65
61
|
if @runner.nil?
|
66
62
|
options = {
|
@@ -69,7 +65,7 @@ module Crabfarm
|
|
69
65
|
port: @port
|
70
66
|
}
|
71
67
|
|
72
|
-
@runner = CrabtrapRunner.new
|
68
|
+
@runner = CrabtrapRunner.new config.crabtrap_config.merge(options)
|
73
69
|
@runner.start
|
74
70
|
end
|
75
71
|
end
|
@@ -81,12 +77,9 @@ module Crabfarm
|
|
81
77
|
else nil end
|
82
78
|
end
|
83
79
|
|
84
|
-
def
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
def phantom_config
|
89
|
-
super.merge(proxy: proxy_address)
|
80
|
+
def proxy
|
81
|
+
# just step over configuration proxy
|
82
|
+
proxy_address
|
90
83
|
end
|
91
84
|
|
92
85
|
def proxy_address
|