crabfarm 0.0.16 → 0.0.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/crabfarm/adapters/browser/capybara.rb +1 -1
- data/lib/crabfarm/adapters/browser/watir.rb +6 -9
- data/lib/crabfarm/adapters/output/jbuilder.rb +0 -2
- data/lib/crabfarm/adapters/parser/nokogiri.rb +7 -3
- data/lib/crabfarm/adapters/parser/pdf_reader.rb +9 -0
- data/lib/crabfarm/base_parser.rb +11 -11
- data/lib/crabfarm/base_state.rb +33 -14
- data/lib/crabfarm/configuration.rb +2 -2
- data/lib/crabfarm/context.rb +14 -8
- data/lib/crabfarm/crabtrap_context.rb +4 -0
- data/lib/crabfarm/crabtrap_runner.rb +0 -2
- data/lib/crabfarm/dsl/surfer/search_context.rb +0 -4
- data/lib/crabfarm/engines/safe_state_loop.rb +1 -1
- data/lib/crabfarm/forked_state.rb +15 -4
- data/lib/crabfarm/http_client.rb +97 -0
- data/lib/crabfarm/modes/console.rb +1 -1
- data/lib/crabfarm/modes/generator.rb +8 -6
- data/lib/crabfarm/phantom_runner.rb +0 -1
- data/lib/crabfarm/rspec.rb +5 -4
- data/lib/crabfarm/strategies.rb +21 -4
- data/lib/crabfarm/templates/Crabfile.erb +6 -2
- data/lib/crabfarm/transition_service.rb +23 -0
- data/lib/crabfarm/version.rb +1 -1
- data/lib/crabfarm.rb +9 -7
- metadata +61 -60
- data/lib/crabfarm/loader_service.rb +0 -29
- data/lib/crabfarm/parser_service.rb +0 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 62fc780d5c9db277ef5147d564c74cd684dbd39b
|
4
|
+
data.tar.gz: 9028aeb0ce71914ab549644948c0ceb83e1efdda
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f003f40fbb5727dee2f831fede26ff6bfd559028c5d339f56aa63aabe3cc0fe7f03519d73df03ebee5121a8478dc0a082f8124272b194dc2a5023402c5147cbd
|
7
|
+
data.tar.gz: a19ccb92b10a0b1d316992cbde562987ac1a3928b4ae32811e56e570f93073f839b4422ae2a0e06bdcbebff3557bf5e0f1ea197bdbff442052c7b878273ca297
|
@@ -1,21 +1,18 @@
|
|
1
|
-
require 'watir-webdriver'
|
2
|
-
|
3
1
|
class Watir::Browser
|
4
|
-
def
|
5
|
-
|
2
|
+
def to_html
|
3
|
+
html
|
6
4
|
end
|
7
5
|
end
|
8
6
|
|
9
7
|
class Watir::Element
|
10
|
-
def
|
11
|
-
|
8
|
+
def to_html
|
9
|
+
html
|
12
10
|
end
|
13
11
|
end
|
14
12
|
|
15
13
|
class Watir::ElementCollection
|
16
|
-
def
|
17
|
-
|
18
|
-
Crabfarm::ParserService.parse _parser_class, full_html, _options
|
14
|
+
def to_html
|
15
|
+
self.map(&:html).join
|
19
16
|
end
|
20
17
|
end
|
21
18
|
|
@@ -1,9 +1,13 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
|
3
3
|
module Crabfarm
|
4
|
-
class
|
5
|
-
def self.parse(
|
6
|
-
|
4
|
+
class NokogiriAdapter
|
5
|
+
def self.parse(_element)
|
6
|
+
if _element.respond_to? :to_html
|
7
|
+
Nokogiri::HTML _element.to_html
|
8
|
+
else
|
9
|
+
Nokogiri::HTML _element
|
10
|
+
end
|
7
11
|
end
|
8
12
|
end
|
9
13
|
end
|
data/lib/crabfarm/base_parser.rb
CHANGED
@@ -1,18 +1,18 @@
|
|
1
1
|
module Crabfarm
|
2
2
|
class BaseParser < Delegator
|
3
3
|
|
4
|
-
attr_reader :params, :
|
4
|
+
attr_reader :params, :document
|
5
5
|
|
6
|
-
def self.
|
7
|
-
@
|
6
|
+
def self.engine(_engine)
|
7
|
+
@engine = _engine
|
8
8
|
end
|
9
9
|
|
10
|
-
def initialize(
|
11
|
-
|
12
|
-
@
|
10
|
+
def initialize(_target, _params)
|
11
|
+
engine_class = Strategies.load(:parser_engine, class_engine || Crabfarm.config.parser_engine)
|
12
|
+
@document = engine_class.parse _target
|
13
13
|
@params = _params
|
14
14
|
|
15
|
-
super @
|
15
|
+
super @document
|
16
16
|
end
|
17
17
|
|
18
18
|
def parse
|
@@ -20,17 +20,17 @@ module Crabfarm
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def __getobj__
|
23
|
-
@
|
23
|
+
@document
|
24
24
|
end
|
25
25
|
|
26
26
|
def __setobj__(obj)
|
27
|
-
@
|
27
|
+
@document = obj
|
28
28
|
end
|
29
29
|
|
30
30
|
private
|
31
31
|
|
32
|
-
def
|
33
|
-
self.class.instance_variable_get :@
|
32
|
+
def class_engine
|
33
|
+
self.class.instance_variable_get :@engine
|
34
34
|
end
|
35
35
|
end
|
36
36
|
end
|
data/lib/crabfarm/base_state.rb
CHANGED
@@ -5,10 +5,13 @@ module Crabfarm
|
|
5
5
|
class BaseState
|
6
6
|
extend Forwardable
|
7
7
|
|
8
|
+
PARSE_METHOD_RX = /^parse_(.*)$/
|
9
|
+
|
8
10
|
attr_reader :params, :output
|
9
11
|
|
10
|
-
def_delegators
|
11
|
-
def_delegators
|
12
|
+
def_delegators '@context', :http
|
13
|
+
def_delegators '@context.pool', :driver
|
14
|
+
def_delegators '@context.store', :get, :fetch
|
12
15
|
|
13
16
|
def self.browser_dsl(_dsl)
|
14
17
|
@class_browser_dsl = _dsl
|
@@ -18,11 +21,9 @@ module Crabfarm
|
|
18
21
|
@class_output_builder = _builder
|
19
22
|
end
|
20
23
|
|
21
|
-
def initialize(
|
22
|
-
@
|
23
|
-
@store = _store
|
24
|
+
def initialize(_context, _params)
|
25
|
+
@context = _context
|
24
26
|
@params = _params
|
25
|
-
@events = []
|
26
27
|
|
27
28
|
@dsl = Strategies.load(:browser_dsl, class_browser_dsl || Crabfarm.config.browser_dsl)
|
28
29
|
@builder = Strategies.load(:output_builder, class_output_builder || Crabfarm.config.output_builder)
|
@@ -32,6 +33,10 @@ module Crabfarm
|
|
32
33
|
@dsl.wrap driver(_name)
|
33
34
|
end
|
34
35
|
|
36
|
+
def download(_url)
|
37
|
+
@context.http.get(_url).body
|
38
|
+
end
|
39
|
+
|
35
40
|
def output
|
36
41
|
@output ||= @builder.prepare
|
37
42
|
end
|
@@ -44,16 +49,16 @@ module Crabfarm
|
|
44
49
|
raise NotImplementedError.new
|
45
50
|
end
|
46
51
|
|
47
|
-
def
|
48
|
-
|
49
|
-
end
|
52
|
+
def parse(_target=nil, _options={})
|
53
|
+
parser_class = _options.delete :using
|
50
54
|
|
51
|
-
|
52
|
-
|
53
|
-
|
55
|
+
if parser_class.nil?
|
56
|
+
parser_class = (self.class.name + 'Parser').constantize
|
57
|
+
end
|
54
58
|
|
55
|
-
|
56
|
-
|
59
|
+
parser = parser_class.new _target, @params.merge(_options)
|
60
|
+
parser.parse
|
61
|
+
return parser
|
57
62
|
end
|
58
63
|
|
59
64
|
def fork_each(_enumerator, &_block)
|
@@ -66,6 +71,20 @@ module Crabfarm
|
|
66
71
|
ThreadsWait.all_waits(*ths)
|
67
72
|
end
|
68
73
|
|
74
|
+
def method_missing(_method, *_args, &_block)
|
75
|
+
m = PARSE_METHOD_RX.match(_method)
|
76
|
+
if m
|
77
|
+
options = _args[1] || {}
|
78
|
+
options[:using] = (m[1].camelize + 'Parser').constantize
|
79
|
+
parse _args[0], options
|
80
|
+
else super end
|
81
|
+
end
|
82
|
+
|
83
|
+
def respond_to?(_method, _include_all=false)
|
84
|
+
return true if PARSE_METHOD_RX === _method
|
85
|
+
super
|
86
|
+
end
|
87
|
+
|
69
88
|
private
|
70
89
|
|
71
90
|
def class_browser_dsl
|
@@ -6,7 +6,7 @@ module Crabfarm
|
|
6
6
|
|
7
7
|
OPTIONS = [
|
8
8
|
[:browser_dsl, :string, 'Default browser dsl used by states'],
|
9
|
-
[:
|
9
|
+
[:parser_engine, :string, 'Default parser engine used by parsers'],
|
10
10
|
[:output_builder, :string, 'Default json output builder used by states'],
|
11
11
|
[:driver_factory, :mixed, 'Driver factory, disabled if phantom_mode is used'],
|
12
12
|
[:log_path, :string, 'Path where logs should be stored'],
|
@@ -51,7 +51,7 @@ module Crabfarm
|
|
51
51
|
def reset
|
52
52
|
@values = {
|
53
53
|
browser_dsl: :surfer,
|
54
|
-
|
54
|
+
parser_engine: :nokogiri,
|
55
55
|
output_builder: :hash,
|
56
56
|
driver_factory: nil,
|
57
57
|
log_path: nil,
|
data/lib/crabfarm/context.rb
CHANGED
@@ -4,7 +4,7 @@ module Crabfarm
|
|
4
4
|
class Context
|
5
5
|
extend Forwardable
|
6
6
|
|
7
|
-
|
7
|
+
attr_accessor :pool, :store, :http
|
8
8
|
|
9
9
|
def initialize
|
10
10
|
@store = StateStore.new
|
@@ -14,16 +14,10 @@ module Crabfarm
|
|
14
14
|
def load
|
15
15
|
init_phantom_if_required
|
16
16
|
init_driver_pool
|
17
|
+
init_http_client
|
17
18
|
@loaded = true
|
18
19
|
end
|
19
20
|
|
20
|
-
def run_state(_name, _params={})
|
21
|
-
load
|
22
|
-
state = LoaderService.load_state(_name).new @pool, @store, _params
|
23
|
-
state.crawl
|
24
|
-
state
|
25
|
-
end
|
26
|
-
|
27
21
|
def reset
|
28
22
|
@store.reset
|
29
23
|
@pool.reset unless @pool.nil?
|
@@ -63,6 +57,14 @@ module Crabfarm
|
|
63
57
|
@phantom = nil
|
64
58
|
end
|
65
59
|
|
60
|
+
def init_http_client
|
61
|
+
@http = build_http_client if @http.nil?
|
62
|
+
end
|
63
|
+
|
64
|
+
def release_http_client
|
65
|
+
@http = nil
|
66
|
+
end
|
67
|
+
|
66
68
|
def build_driver_factory
|
67
69
|
if @phantom
|
68
70
|
PhantomDriverFactory.new @phantom, driver_config
|
@@ -72,6 +74,10 @@ module Crabfarm
|
|
72
74
|
end
|
73
75
|
end
|
74
76
|
|
77
|
+
def build_http_client
|
78
|
+
HttpClient.new config.proxy
|
79
|
+
end
|
80
|
+
|
75
81
|
def config
|
76
82
|
Crabfarm.config
|
77
83
|
end
|
@@ -109,7 +109,7 @@ module Crabfarm
|
|
109
109
|
begin
|
110
110
|
ActiveSupport::Dependencies.clear
|
111
111
|
logger.info "StateLoop: loading state: #{@next_state_name}"
|
112
|
-
@doc =
|
112
|
+
@doc = TransitionService.apply_state(context, @next_state_name, @next_state_params).output_as_json
|
113
113
|
logger.info "StateLoop: state loaded successfully: #{@next_state_name}"
|
114
114
|
@error = nil
|
115
115
|
rescue Exception => e
|
@@ -1,13 +1,12 @@
|
|
1
1
|
module Crabfarm
|
2
|
-
class ForkedState
|
3
|
-
extend Forwardable
|
4
|
-
|
5
|
-
def_delegators :@state, :params, :get, :fetch, :event, :alert, :info
|
2
|
+
class ForkedState < Delegator
|
6
3
|
|
7
4
|
def initialize(_state, _name, _mutex)
|
8
5
|
@state = _state
|
9
6
|
@name = _name
|
10
7
|
@mutex = _mutex
|
8
|
+
|
9
|
+
super @state
|
11
10
|
end
|
12
11
|
|
13
12
|
def driver
|
@@ -18,10 +17,22 @@ module Crabfarm
|
|
18
17
|
@browser ||= @state.browser(@name)
|
19
18
|
end
|
20
19
|
|
20
|
+
def output
|
21
|
+
raise ScriptError.new 'Use lock_output to access output in forked states'
|
22
|
+
end
|
23
|
+
|
21
24
|
def lock_output
|
22
25
|
@mutex.synchronize {
|
23
26
|
yield @state.output
|
24
27
|
}
|
25
28
|
end
|
29
|
+
|
30
|
+
def __getobj__
|
31
|
+
@state
|
32
|
+
end
|
33
|
+
|
34
|
+
def __setobj__(obj)
|
35
|
+
@state = obj
|
36
|
+
end
|
26
37
|
end
|
27
38
|
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require "uri"
|
2
|
+
|
3
|
+
module Crabfarm
  # Small HTTP client built on top of Net::HTTP.
  #
  # Supports GET/POST/PUT/DELETE, an optional "host:port" proxy and automatic
  # redirect following. Any final response that is neither a success nor a
  # redirection is reported by raising HttpRequestError.
  class HttpClient

    # Raised when a request ends in a non-success, non-redirect response.
    # Exposes the failed response and delegates `code` and `body` to it.
    class HttpRequestError < StandardError
      extend Forwardable

      def_delegators :@response, :code, :body

      attr_reader :response

      # _response - the Net::HTTP response object that triggered the error;
      #             its `message` becomes the exception message.
      def initialize(_response)
        @response = _response
        super _response.message
      end
    end

    # Raised when the redirect budget of a request is exhausted.
    class MaximumRedirectsError < StandardError
      def initialize
        super 'Redirection loop detected!'
      end
    end

    attr_reader :proxy_addr, :proxy_port

    # _proxy - optional proxy given as a "host:port" string. When nil, no
    #          proxy address/port is handed to Net::HTTP.
    def initialize(_proxy=nil)
      if _proxy.nil?
        @proxy_addr = nil
        @proxy_port = nil
      else
        # The port is kept as the string found after the colon.
        @proxy_addr, @proxy_port = _proxy.split ':'
      end
    end

    # Performs a GET request.
    #
    # _url     - target url (String).
    # _query   - optional parameters; they are form-encoded and appended to
    #            whatever query string _url already carries.
    # _headers - optional request headers.
    #
    # Returns the successful response, raises HttpRequestError otherwise.
    def get(_url, _query={}, _headers={})
      uri = URI _url
      extra = URI.encode_www_form(_query || {})
      unless extra.empty?
        uri.query = [uri.query, extra].reject { |q| q.nil? || q.empty? }.join('&')
      end
      perform_request Net::HTTP::Get, uri, _headers
    end

    # Performs a POST request. _data may be a Hash (sent form-encoded) or an
    # already prepared body.
    def post(_url, _data, _headers={})
      perform_request Net::HTTP::Post, URI(_url), _headers do |req|
        req.body = prepare_data(_data)
      end
    end

    # Performs a PUT request. _data is handled the same way as in #post.
    def put(_url, _data, _headers={})
      perform_request Net::HTTP::Put, URI(_url), _headers do |req|
        req.body = prepare_data(_data)
      end
    end

    # Performs a DELETE request.
    #
    # _headers is optional so existing single-argument calls keep working.
    def delete(_url, _headers={})
      perform_request Net::HTTP::Delete, URI(_url), _headers
    end

    private

    # Builds and sends a request of type _req_type to _uri, following
    # redirections until _limit reaches zero.
    #
    # An optional block receives the request object before it is sent so the
    # caller can set its body.
    #
    # NOTE(review): the block is not passed along when a redirect is followed,
    # so a redirected POST/PUT is re-issued without a body.
    def perform_request(_req_type, _uri, _headers, _limit=10)

      raise MaximumRedirectsError.new if _limit == 0

      request = _req_type.new request_target(_uri)
      _headers.keys.each { |k| request[k] = _headers[k] }
      yield request if block_given?

      response = build_client(_uri).request request

      case response
      when Net::HTTPSuccess then
        response
      when Net::HTTPRedirection then
        location = response['location']
        # Redirection-class responses without a Location header (304 for
        # example) cannot be followed, report them like any other failure.
        return handle_error_response(response) if location.nil?
        # Resolve against the current uri so relative locations work too.
        perform_request(_req_type, _uri.merge(location), _headers, _limit - 1)
      else
        handle_error_response response
      end
    end

    # Returns the path (defaulting to '/') plus the query string, if any, to
    # be used as the request target.
    def request_target(_uri)
      path = _uri.path.to_s.empty? ? '/' : _uri.path
      query = _uri.query
      query.nil? || query.empty? ? path : "#{path}?#{query}"
    end

    # Builds the Net::HTTP connection object for the given uri, going through
    # the configured proxy when there is one.
    def build_client(uri)
      client = Net::HTTP.new uri.host, uri.port || 80, proxy_addr, proxy_port
      client.use_ssl = true if uri.scheme == 'https'
      # NOTE(review): certificate verification is switched off for every
      # connection, which leaves https requests open to interception. Kept
      # as-is to preserve current behavior; confirm this is intentional.
      client.verify_mode = OpenSSL::SSL::VERIFY_NONE
      client
    end

    def handle_error_response(_response)
      raise HttpRequestError.new _response
    end

    # Hashes are serialized as a url-encoded form body, anything else is sent
    # untouched.
    def prepare_data(_data)
      if _data.is_a? Hash
        URI.encode_www_form _data
      else _data end
    end
  end
end
|
@@ -34,7 +34,7 @@ module Crabfarm
|
|
34
34
|
begin
|
35
35
|
elapsed = Benchmark.measure do
|
36
36
|
puts "Transitioning to #{_name.to_s.camelize} state"
|
37
|
-
doc = @context
|
37
|
+
doc = TransitionService.apply_state(@context, _name, _params).output_as_json
|
38
38
|
|
39
39
|
puts "State changed, generated document:"
|
40
40
|
puts JSON.pretty_generate(doc).color(:green).gsub(/(^|\\n)/, ' ')
|
@@ -35,18 +35,20 @@ module Crabfarm
|
|
35
35
|
end
|
36
36
|
|
37
37
|
def generate_state(_name)
|
38
|
+
class_name = _name.camelize
|
38
39
|
with_crawler_path do
|
39
|
-
binding = { state_class:
|
40
|
-
path('app', 'states',
|
41
|
-
path('spec', 'states',
|
40
|
+
binding = { state_class: class_name.camelize }
|
41
|
+
path('app', 'states', class_name.underscore + '.rb').render('state.rb', binding)
|
42
|
+
path('spec', 'states', class_name.underscore + '_spec.rb').render('state_spec.rb', binding)
|
42
43
|
end
|
43
44
|
end
|
44
45
|
|
45
46
|
def generate_parser(_name)
|
47
|
+
class_name = _name.camelize + 'Parser'
|
46
48
|
with_crawler_path do
|
47
|
-
binding = { parser_class:
|
48
|
-
path('app', 'parsers',
|
49
|
-
path('spec', 'parsers',
|
49
|
+
binding = { parser_class: class_name }
|
50
|
+
path('app', 'parsers', class_name.underscore + '.rb').render('parser.rb', binding)
|
51
|
+
path('spec', 'parsers', class_name.underscore + '_spec.rb').render('parser_spec.rb', binding)
|
50
52
|
end
|
51
53
|
end
|
52
54
|
|
data/lib/crabfarm/rspec.rb
CHANGED
@@ -3,7 +3,6 @@ require 'net/http'
|
|
3
3
|
|
4
4
|
CF_TEST_CONTEXT = Crabfarm::CrabtrapContext::new
|
5
5
|
CF_TEST_CONTEXT.load
|
6
|
-
CF_TEST_BUCKET = CF_TEST_CONTEXT.driver
|
7
6
|
|
8
7
|
module Crabfarm
|
9
8
|
module RSpec
|
@@ -16,7 +15,9 @@ module Crabfarm
|
|
16
15
|
Net::HTTP.get(URI.parse _snap_or_url)
|
17
16
|
end
|
18
17
|
|
19
|
-
|
18
|
+
parser = described_class.new html, _options
|
19
|
+
parser.parse
|
20
|
+
parser
|
20
21
|
end
|
21
22
|
|
22
23
|
def crawl(_state=nil, _params={})
|
@@ -27,9 +28,9 @@ module Crabfarm
|
|
27
28
|
|
28
29
|
if _state.nil?
|
29
30
|
return nil unless described_class < BaseState # TODO: maybe raise an error here.
|
30
|
-
@state = @last_state = CF_TEST_CONTEXT
|
31
|
+
@state = @last_state = TransitionService.apply_state CF_TEST_CONTEXT, described_class, _params
|
31
32
|
else
|
32
|
-
@last_state = CF_TEST_CONTEXT
|
33
|
+
@last_state = TransitionService.apply_state CF_TEST_CONTEXT, _state, _params
|
33
34
|
end
|
34
35
|
end
|
35
36
|
|
data/lib/crabfarm/strategies.rb
CHANGED
@@ -2,25 +2,42 @@ module Crabfarm
|
|
2
2
|
module Strategies
|
3
3
|
|
4
4
|
class Loader
|
5
|
-
def initialize(_klass, _pkg
|
5
|
+
def initialize(_name, _klass, _pkg, _deps)
|
6
|
+
@name = _name
|
6
7
|
@klass = _klass
|
7
8
|
@pkg = _pkg
|
9
|
+
@deps = _deps
|
8
10
|
end
|
9
11
|
|
10
12
|
def load
|
13
|
+
load_dependencies
|
11
14
|
require @pkg if @pkg
|
12
15
|
if @klass.is_a? String then Object.const_get @klass else @klass end
|
13
16
|
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def load_dependencies
|
21
|
+
@deps.each do |dep|
|
22
|
+
begin
|
23
|
+
require dep
|
24
|
+
# TODO: check dependency version!
|
25
|
+
rescue LoadError
|
26
|
+
raise ConfigurationError.new "Missing #{@name} dependency, please add `gem \"#{dep}\"` to the crawler's Gemfile"
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
14
30
|
end
|
15
31
|
|
16
32
|
@@register = {}
|
17
33
|
|
18
|
-
def self.register(_cat, _name, _klass, _pkg=nil)
|
19
|
-
|
34
|
+
def self.register(_cat, _name, _klass, _pkg=nil, _deps=[])
|
35
|
+
full_name = _cat.to_s + ':' + _name.to_s
|
36
|
+
@@register[full_name] = Loader.new(full_name, _klass, _pkg, _deps)
|
20
37
|
end
|
21
38
|
|
22
39
|
def self.load(_cat, _name)
|
23
|
-
full_name = _cat.to_s + _name.to_s
|
40
|
+
full_name = _cat.to_s + ':' + _name.to_s
|
24
41
|
raise ConfigurationError.new "Invalid #{_cat} name #{_name}" unless @@register.has_key? full_name
|
25
42
|
@@register[full_name].load
|
26
43
|
end
|
@@ -1,10 +1,14 @@
|
|
1
1
|
|
2
2
|
# The default crawling dsl to use in states and parsers, can be overridden in each component using the `browser_dsl :dsl` modifier
|
3
|
-
# Available options are :surfer, :watir and :capybara
|
3
|
+
# Available options are :surfer, :watir and :capybara. Both watir and capybara require an additional gem to be added to Gemfile
|
4
4
|
set_browser_dsl :surfer
|
5
5
|
|
6
|
+
# The default parser engine for parsers that do not specify one.
|
7
|
+
# Available options are :nokogiri and :pdf_reader. :pdf_reader requires an additional gem to be added to Gemfile
|
8
|
+
# set_parser_engine :nokogiri
|
9
|
+
|
6
10
|
# Change the default output builder used in a state to generate the output document.
|
7
|
-
# Available options are :hash, :ostruct, :jbuilder
|
11
|
+
# Available options are :hash, :ostruct, :jbuilder. :jbuilder requires an additional gem to be added to Gemfile
|
8
12
|
# set_output_builder :hash
|
9
13
|
|
10
14
|
# The path where every crawler log is stored.
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Crabfarm
  # Instantiates a state for a given context and runs its crawl.
  class TransitionService

    # Resolves the state class, loads the context, builds the state and
    # crawls it.
    #
    # _context - object responding to `load`; handed to the state constructor.
    # _name    - a String/Symbol naming the state class, or the class itself.
    # _params  - parameters handed to the state constructor.
    #
    # Returns the state instance after `crawl` has been called on it.
    def self.apply_state(_context, _name, _params={})
      state_class = case _name
      when String, Symbol then load_by_name(_name)
      else _name
      end

      _context.load
      state_class.new(_context, _params).tap(&:crawl)
    end

    # Note: a bare `private` does not apply to methods defined with
    # `def self.`, so load_by_name below remains publicly callable.
    private

    # Turns a state name into its class: every run of characters other than
    # letters, digits and ':' becomes '_', then the result is camelized and
    # constantized.
    def self.load_by_name(_name)
      _name.to_s.gsub(/[^A-Z0-9:]+/i, '_').camelize.constantize
    end

  end
end
|
data/lib/crabfarm/version.rb
CHANGED
data/lib/crabfarm.rb
CHANGED
@@ -1,15 +1,16 @@
|
|
1
|
-
require
|
1
|
+
require "logger"
|
2
2
|
require "forwardable"
|
3
|
+
require "net/http"
|
3
4
|
require "active_support/inflector"
|
4
5
|
require "selenium-webdriver"
|
5
6
|
|
6
7
|
require "crabfarm/version"
|
7
8
|
require "crabfarm/errors"
|
8
9
|
require "crabfarm/configuration"
|
9
|
-
require "crabfarm/
|
10
|
-
require "crabfarm/parser_service"
|
10
|
+
require "crabfarm/transition_service"
|
11
11
|
require "crabfarm/driver_bucket"
|
12
12
|
require "crabfarm/driver_bucket_pool"
|
13
|
+
require "crabfarm/http_client"
|
13
14
|
require "crabfarm/default_driver_factory"
|
14
15
|
require "crabfarm/phantom_driver_factory"
|
15
16
|
require "crabfarm/phantom_runner"
|
@@ -43,15 +44,16 @@ module Crabfarm
|
|
43
44
|
module Strategies
|
44
45
|
# bundled browser dsl adapters
|
45
46
|
register :browser_dsl, :surfer, 'Crabfarm::SurferBrowserDsl', 'crabfarm/adapters/browser/surfer'
|
46
|
-
register :browser_dsl, :watir, 'Crabfarm::WatirBrowserDsl', 'crabfarm/adapters/browser/watir'
|
47
|
-
register :browser_dsl, :capybara, 'Crabfarm::CapybaraBrowserDsl', 'crabfarm/adapters/browser/capybara'
|
47
|
+
register :browser_dsl, :watir, 'Crabfarm::WatirBrowserDsl', 'crabfarm/adapters/browser/watir', ['watir-webdriver']
|
48
|
+
register :browser_dsl, :capybara, 'Crabfarm::CapybaraBrowserDsl', 'crabfarm/adapters/browser/capybara', ['capybara']
|
48
49
|
|
49
50
|
# bundled parsers dsl adapters
|
50
|
-
register :
|
51
|
+
register :parser_engine, :nokogiri, 'Crabfarm::NokogiriAdapter', 'crabfarm/adapters/parser/nokogiri'
|
52
|
+
register :parser_engine, :pdf_reader, 'Crabfarm::PdfReaderAdapter', 'crabfarm/adapters/parser/pdf_reader', ['pdf-reader']
|
51
53
|
|
52
54
|
# bundled state output builders
|
53
55
|
register :output_builder, :hash, 'Crabfarm::HashOutputBuilder', 'crabfarm/adapters/output/hash'
|
54
56
|
register :output_builder, :ostruct, 'Crabfarm::OStructOutputBuilder', 'crabfarm/adapters/output/ostruct'
|
55
|
-
register :output_builder, :jbuilder, 'Crabfarm::JbuilderOutputBuilder', 'crabfarm/adapters/output/jbuilder'
|
57
|
+
register :output_builder, :jbuilder, 'Crabfarm::JbuilderOutputBuilder', 'crabfarm/adapters/output/jbuilder', ['jbuilder']
|
56
58
|
end
|
57
59
|
end
|
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crabfarm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.17
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ignacio Baixas
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: jbuilder
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ~>
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: 2.2.0
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ~>
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: 2.2.0
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: selenium-webdriver
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -38,34 +24,6 @@ dependencies:
|
|
38
24
|
- - ~>
|
39
25
|
- !ruby/object:Gem::Version
|
40
26
|
version: '2.33'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: capybara
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - '>='
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :runtime
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - '>='
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: watir-webdriver
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - '>='
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
type: :runtime
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - '>='
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
69
27
|
- !ruby/object:Gem::Dependency
|
70
28
|
name: nokogiri
|
71
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -170,20 +128,6 @@ dependencies:
|
|
170
128
|
- - '>='
|
171
129
|
- !ruby/object:Gem::Version
|
172
130
|
version: '0'
|
173
|
-
- !ruby/object:Gem::Dependency
|
174
|
-
name: multipart-post
|
175
|
-
requirement: !ruby/object:Gem::Requirement
|
176
|
-
requirements:
|
177
|
-
- - '>='
|
178
|
-
- !ruby/object:Gem::Version
|
179
|
-
version: '0'
|
180
|
-
type: :runtime
|
181
|
-
prerelease: false
|
182
|
-
version_requirements: !ruby/object:Gem::Requirement
|
183
|
-
requirements:
|
184
|
-
- - '>='
|
185
|
-
- !ruby/object:Gem::Version
|
186
|
-
version: '0'
|
187
131
|
- !ruby/object:Gem::Dependency
|
188
132
|
name: bundler
|
189
133
|
requirement: !ruby/object:Gem::Requirement
|
@@ -324,6 +268,62 @@ dependencies:
|
|
324
268
|
- - '>='
|
325
269
|
- !ruby/object:Gem::Version
|
326
270
|
version: '0'
|
271
|
+
- !ruby/object:Gem::Dependency
|
272
|
+
name: pdf-reader
|
273
|
+
requirement: !ruby/object:Gem::Requirement
|
274
|
+
requirements:
|
275
|
+
- - ~>
|
276
|
+
- !ruby/object:Gem::Version
|
277
|
+
version: 1.3.3
|
278
|
+
type: :development
|
279
|
+
prerelease: false
|
280
|
+
version_requirements: !ruby/object:Gem::Requirement
|
281
|
+
requirements:
|
282
|
+
- - ~>
|
283
|
+
- !ruby/object:Gem::Version
|
284
|
+
version: 1.3.3
|
285
|
+
- !ruby/object:Gem::Dependency
|
286
|
+
name: capybara
|
287
|
+
requirement: !ruby/object:Gem::Requirement
|
288
|
+
requirements:
|
289
|
+
- - '>='
|
290
|
+
- !ruby/object:Gem::Version
|
291
|
+
version: '0'
|
292
|
+
type: :development
|
293
|
+
prerelease: false
|
294
|
+
version_requirements: !ruby/object:Gem::Requirement
|
295
|
+
requirements:
|
296
|
+
- - '>='
|
297
|
+
- !ruby/object:Gem::Version
|
298
|
+
version: '0'
|
299
|
+
- !ruby/object:Gem::Dependency
|
300
|
+
name: watir-webdriver
|
301
|
+
requirement: !ruby/object:Gem::Requirement
|
302
|
+
requirements:
|
303
|
+
- - '>='
|
304
|
+
- !ruby/object:Gem::Version
|
305
|
+
version: '0'
|
306
|
+
type: :development
|
307
|
+
prerelease: false
|
308
|
+
version_requirements: !ruby/object:Gem::Requirement
|
309
|
+
requirements:
|
310
|
+
- - '>='
|
311
|
+
- !ruby/object:Gem::Version
|
312
|
+
version: '0'
|
313
|
+
- !ruby/object:Gem::Dependency
|
314
|
+
name: jbuilder
|
315
|
+
requirement: !ruby/object:Gem::Requirement
|
316
|
+
requirements:
|
317
|
+
- - ~>
|
318
|
+
- !ruby/object:Gem::Version
|
319
|
+
version: 2.2.0
|
320
|
+
type: :development
|
321
|
+
prerelease: false
|
322
|
+
version_requirements: !ruby/object:Gem::Requirement
|
323
|
+
requirements:
|
324
|
+
- - ~>
|
325
|
+
- !ruby/object:Gem::Version
|
326
|
+
version: 2.2.0
|
327
327
|
description:
|
328
328
|
email:
|
329
329
|
- ignacio@platan.us
|
@@ -339,6 +339,7 @@ files:
|
|
339
339
|
- lib/crabfarm/adapters/output/jbuilder.rb
|
340
340
|
- lib/crabfarm/adapters/output/ostruct.rb
|
341
341
|
- lib/crabfarm/adapters/parser/nokogiri.rb
|
342
|
+
- lib/crabfarm/adapters/parser/pdf_reader.rb
|
342
343
|
- lib/crabfarm/base_parser.rb
|
343
344
|
- lib/crabfarm/base_state.rb
|
344
345
|
- lib/crabfarm/cli.rb
|
@@ -355,14 +356,13 @@ files:
|
|
355
356
|
- lib/crabfarm/engines/safe_state_loop.rb
|
356
357
|
- lib/crabfarm/errors.rb
|
357
358
|
- lib/crabfarm/forked_state.rb
|
358
|
-
- lib/crabfarm/
|
359
|
+
- lib/crabfarm/http_client.rb
|
359
360
|
- lib/crabfarm/mocks/noop_driver.rb
|
360
361
|
- lib/crabfarm/modes/console.rb
|
361
362
|
- lib/crabfarm/modes/generator.rb
|
362
363
|
- lib/crabfarm/modes/publisher.rb
|
363
364
|
- lib/crabfarm/modes/recorder.rb
|
364
365
|
- lib/crabfarm/modes/server.rb
|
365
|
-
- lib/crabfarm/parser_service.rb
|
366
366
|
- lib/crabfarm/phantom_driver_factory.rb
|
367
367
|
- lib/crabfarm/phantom_runner.rb
|
368
368
|
- lib/crabfarm/rspec.rb
|
@@ -383,6 +383,7 @@ files:
|
|
383
383
|
- lib/crabfarm/templates/spec_helper.rb.erb
|
384
384
|
- lib/crabfarm/templates/state.rb.erb
|
385
385
|
- lib/crabfarm/templates/state_spec.rb.erb
|
386
|
+
- lib/crabfarm/transition_service.rb
|
386
387
|
- lib/crabfarm/version.rb
|
387
388
|
- lib/crabfarm.rb
|
388
389
|
- bin/crabfarm
|
@@ -1,29 +0,0 @@
|
|
1
|
-
module Crabfarm
|
2
|
-
class LoaderService
|
3
|
-
|
4
|
-
def self.load_state(_name)
|
5
|
-
load_entity _name, 'state', BaseState
|
6
|
-
end
|
7
|
-
|
8
|
-
def self.load_parser(_name)
|
9
|
-
load_entity _name, 'parser', BaseParser
|
10
|
-
end
|
11
|
-
|
12
|
-
private
|
13
|
-
|
14
|
-
def self.load_entity(_name, _role, _type)
|
15
|
-
|
16
|
-
if _name.is_a? String or _name.is_a? Symbol
|
17
|
-
name = _name.to_s.gsub(/[^A-Z0-9:]+/i, '_').camelize
|
18
|
-
mod = name.constantize rescue nil
|
19
|
-
else
|
20
|
-
mod = _name
|
21
|
-
end
|
22
|
-
|
23
|
-
raise EntityNotFoundError.new _role, name if mod.nil?
|
24
|
-
raise EntityNotFoundError.new _role, name unless mod < _type
|
25
|
-
mod
|
26
|
-
end
|
27
|
-
|
28
|
-
end
|
29
|
-
end
|
@@ -1,12 +0,0 @@
|
|
1
|
-
module Crabfarm
|
2
|
-
class ParserService
|
3
|
-
|
4
|
-
def self.parse(_parser_class, _html, _options={})
|
5
|
-
_parser_class = LoaderService.load_parser(_parser_class) if _parser_class.is_a? String or _parser_class.is_a? Symbol
|
6
|
-
parser = _parser_class.new _html, _options
|
7
|
-
parser.parse
|
8
|
-
parser
|
9
|
-
end
|
10
|
-
|
11
|
-
end
|
12
|
-
end
|