crabfarm 0.0.16 → 0.0.17
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/crabfarm/adapters/browser/capybara.rb +1 -1
- data/lib/crabfarm/adapters/browser/watir.rb +6 -9
- data/lib/crabfarm/adapters/output/jbuilder.rb +0 -2
- data/lib/crabfarm/adapters/parser/nokogiri.rb +7 -3
- data/lib/crabfarm/adapters/parser/pdf_reader.rb +9 -0
- data/lib/crabfarm/base_parser.rb +11 -11
- data/lib/crabfarm/base_state.rb +33 -14
- data/lib/crabfarm/configuration.rb +2 -2
- data/lib/crabfarm/context.rb +14 -8
- data/lib/crabfarm/crabtrap_context.rb +4 -0
- data/lib/crabfarm/crabtrap_runner.rb +0 -2
- data/lib/crabfarm/dsl/surfer/search_context.rb +0 -4
- data/lib/crabfarm/engines/safe_state_loop.rb +1 -1
- data/lib/crabfarm/forked_state.rb +15 -4
- data/lib/crabfarm/http_client.rb +97 -0
- data/lib/crabfarm/modes/console.rb +1 -1
- data/lib/crabfarm/modes/generator.rb +8 -6
- data/lib/crabfarm/phantom_runner.rb +0 -1
- data/lib/crabfarm/rspec.rb +5 -4
- data/lib/crabfarm/strategies.rb +21 -4
- data/lib/crabfarm/templates/Crabfile.erb +6 -2
- data/lib/crabfarm/transition_service.rb +23 -0
- data/lib/crabfarm/version.rb +1 -1
- data/lib/crabfarm.rb +9 -7
- metadata +61 -60
- data/lib/crabfarm/loader_service.rb +0 -29
- data/lib/crabfarm/parser_service.rb +0 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 62fc780d5c9db277ef5147d564c74cd684dbd39b
|
4
|
+
data.tar.gz: 9028aeb0ce71914ab549644948c0ceb83e1efdda
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f003f40fbb5727dee2f831fede26ff6bfd559028c5d339f56aa63aabe3cc0fe7f03519d73df03ebee5121a8478dc0a082f8124272b194dc2a5023402c5147cbd
|
7
|
+
data.tar.gz: a19ccb92b10a0b1d316992cbde562987ac1a3928b4ae32811e56e570f93073f839b4422ae2a0e06bdcbebff3557bf5e0f1ea197bdbff442052c7b878273ca297
|
@@ -1,21 +1,18 @@
|
|
1
|
-
require 'watir-webdriver'
|
2
|
-
|
3
1
|
class Watir::Browser
|
4
|
-
def
|
5
|
-
|
2
|
+
def to_html
|
3
|
+
html
|
6
4
|
end
|
7
5
|
end
|
8
6
|
|
9
7
|
class Watir::Element
|
10
|
-
def
|
11
|
-
|
8
|
+
def to_html
|
9
|
+
html
|
12
10
|
end
|
13
11
|
end
|
14
12
|
|
15
13
|
class Watir::ElementCollection
|
16
|
-
def
|
17
|
-
|
18
|
-
Crabfarm::ParserService.parse _parser_class, full_html, _options
|
14
|
+
def to_html
|
15
|
+
self.map(&:html).join
|
19
16
|
end
|
20
17
|
end
|
21
18
|
|
@@ -1,9 +1,13 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
|
3
3
|
module Crabfarm
|
4
|
-
class
|
5
|
-
def self.parse(
|
6
|
-
|
4
|
+
class NokogiriAdapter
|
5
|
+
def self.parse(_element)
|
6
|
+
if _element.respond_to? :to_html
|
7
|
+
Nokogiri::HTML _element.to_html
|
8
|
+
else
|
9
|
+
Nokogiri::HTML _element
|
10
|
+
end
|
7
11
|
end
|
8
12
|
end
|
9
13
|
end
|
data/lib/crabfarm/base_parser.rb
CHANGED
@@ -1,18 +1,18 @@
|
|
1
1
|
module Crabfarm
|
2
2
|
class BaseParser < Delegator
|
3
3
|
|
4
|
-
attr_reader :params, :
|
4
|
+
attr_reader :params, :document
|
5
5
|
|
6
|
-
def self.
|
7
|
-
@
|
6
|
+
def self.engine(_engine)
|
7
|
+
@engine = _engine
|
8
8
|
end
|
9
9
|
|
10
|
-
def initialize(
|
11
|
-
|
12
|
-
@
|
10
|
+
def initialize(_target, _params)
|
11
|
+
engine_class = Strategies.load(:parser_engine, class_engine || Crabfarm.config.parser_engine)
|
12
|
+
@document = engine_class.parse _target
|
13
13
|
@params = _params
|
14
14
|
|
15
|
-
super @
|
15
|
+
super @document
|
16
16
|
end
|
17
17
|
|
18
18
|
def parse
|
@@ -20,17 +20,17 @@ module Crabfarm
|
|
20
20
|
end
|
21
21
|
|
22
22
|
def __getobj__
|
23
|
-
@
|
23
|
+
@document
|
24
24
|
end
|
25
25
|
|
26
26
|
def __setobj__(obj)
|
27
|
-
@
|
27
|
+
@document = obj
|
28
28
|
end
|
29
29
|
|
30
30
|
private
|
31
31
|
|
32
|
-
def
|
33
|
-
self.class.instance_variable_get :@
|
32
|
+
def class_engine
|
33
|
+
self.class.instance_variable_get :@engine
|
34
34
|
end
|
35
35
|
end
|
36
36
|
end
|
data/lib/crabfarm/base_state.rb
CHANGED
@@ -5,10 +5,13 @@ module Crabfarm
|
|
5
5
|
class BaseState
|
6
6
|
extend Forwardable
|
7
7
|
|
8
|
+
PARSE_METHOD_RX = /^parse_(.*)$/
|
9
|
+
|
8
10
|
attr_reader :params, :output
|
9
11
|
|
10
|
-
def_delegators
|
11
|
-
def_delegators
|
12
|
+
def_delegators '@context', :http
|
13
|
+
def_delegators '@context.pool', :driver
|
14
|
+
def_delegators '@context.store', :get, :fetch
|
12
15
|
|
13
16
|
def self.browser_dsl(_dsl)
|
14
17
|
@class_browser_dsl = _dsl
|
@@ -18,11 +21,9 @@ module Crabfarm
|
|
18
21
|
@class_output_builder = _builder
|
19
22
|
end
|
20
23
|
|
21
|
-
def initialize(
|
22
|
-
@
|
23
|
-
@store = _store
|
24
|
+
def initialize(_context, _params)
|
25
|
+
@context = _context
|
24
26
|
@params = _params
|
25
|
-
@events = []
|
26
27
|
|
27
28
|
@dsl = Strategies.load(:browser_dsl, class_browser_dsl || Crabfarm.config.browser_dsl)
|
28
29
|
@builder = Strategies.load(:output_builder, class_output_builder || Crabfarm.config.output_builder)
|
@@ -32,6 +33,10 @@ module Crabfarm
|
|
32
33
|
@dsl.wrap driver(_name)
|
33
34
|
end
|
34
35
|
|
36
|
+
def download(_url)
|
37
|
+
@context.http.get(_url).body
|
38
|
+
end
|
39
|
+
|
35
40
|
def output
|
36
41
|
@output ||= @builder.prepare
|
37
42
|
end
|
@@ -44,16 +49,16 @@ module Crabfarm
|
|
44
49
|
raise NotImplementedError.new
|
45
50
|
end
|
46
51
|
|
47
|
-
def
|
48
|
-
|
49
|
-
end
|
52
|
+
def parse(_target=nil, _options={})
|
53
|
+
parser_class = _options.delete :using
|
50
54
|
|
51
|
-
|
52
|
-
|
53
|
-
|
55
|
+
if parser_class.nil?
|
56
|
+
parser_class = (self.class.name + 'Parser').constantize
|
57
|
+
end
|
54
58
|
|
55
|
-
|
56
|
-
|
59
|
+
parser = parser_class.new _target, @params.merge(_options)
|
60
|
+
parser.parse
|
61
|
+
return parser
|
57
62
|
end
|
58
63
|
|
59
64
|
def fork_each(_enumerator, &_block)
|
@@ -66,6 +71,20 @@ module Crabfarm
|
|
66
71
|
ThreadsWait.all_waits(*ths)
|
67
72
|
end
|
68
73
|
|
74
|
+
def method_missing(_method, *_args, &_block)
|
75
|
+
m = PARSE_METHOD_RX.match(_method)
|
76
|
+
if m
|
77
|
+
options = _args[1] || {}
|
78
|
+
options[:using] = (m[1].camelize + 'Parser').constantize
|
79
|
+
parse _args[0], options
|
80
|
+
else super end
|
81
|
+
end
|
82
|
+
|
83
|
+
def respond_to?(_method, _include_all=false)
|
84
|
+
return true if PARSE_METHOD_RX === _method
|
85
|
+
super
|
86
|
+
end
|
87
|
+
|
69
88
|
private
|
70
89
|
|
71
90
|
def class_browser_dsl
|
@@ -6,7 +6,7 @@ module Crabfarm
|
|
6
6
|
|
7
7
|
OPTIONS = [
|
8
8
|
[:browser_dsl, :string, 'Default browser dsl used by states'],
|
9
|
-
[:
|
9
|
+
[:parser_engine, :string, 'Default parser engine used by parsers'],
|
10
10
|
[:output_builder, :string, 'Default json output builder used by states'],
|
11
11
|
[:driver_factory, :mixed, 'Driver factory, disabled if phantom_mode is used'],
|
12
12
|
[:log_path, :string, 'Path where logs should be stored'],
|
@@ -51,7 +51,7 @@ module Crabfarm
|
|
51
51
|
def reset
|
52
52
|
@values = {
|
53
53
|
browser_dsl: :surfer,
|
54
|
-
|
54
|
+
parser_engine: :nokogiri,
|
55
55
|
output_builder: :hash,
|
56
56
|
driver_factory: nil,
|
57
57
|
log_path: nil,
|
data/lib/crabfarm/context.rb
CHANGED
@@ -4,7 +4,7 @@ module Crabfarm
|
|
4
4
|
class Context
|
5
5
|
extend Forwardable
|
6
6
|
|
7
|
-
|
7
|
+
attr_accessor :pool, :store, :http
|
8
8
|
|
9
9
|
def initialize
|
10
10
|
@store = StateStore.new
|
@@ -14,16 +14,10 @@ module Crabfarm
|
|
14
14
|
def load
|
15
15
|
init_phantom_if_required
|
16
16
|
init_driver_pool
|
17
|
+
init_http_client
|
17
18
|
@loaded = true
|
18
19
|
end
|
19
20
|
|
20
|
-
def run_state(_name, _params={})
|
21
|
-
load
|
22
|
-
state = LoaderService.load_state(_name).new @pool, @store, _params
|
23
|
-
state.crawl
|
24
|
-
state
|
25
|
-
end
|
26
|
-
|
27
21
|
def reset
|
28
22
|
@store.reset
|
29
23
|
@pool.reset unless @pool.nil?
|
@@ -63,6 +57,14 @@ module Crabfarm
|
|
63
57
|
@phantom = nil
|
64
58
|
end
|
65
59
|
|
60
|
+
def init_http_client
|
61
|
+
@http = build_http_client if @http.nil?
|
62
|
+
end
|
63
|
+
|
64
|
+
def release_http_client
|
65
|
+
@http = nil
|
66
|
+
end
|
67
|
+
|
66
68
|
def build_driver_factory
|
67
69
|
if @phantom
|
68
70
|
PhantomDriverFactory.new @phantom, driver_config
|
@@ -72,6 +74,10 @@ module Crabfarm
|
|
72
74
|
end
|
73
75
|
end
|
74
76
|
|
77
|
+
def build_http_client
|
78
|
+
HttpClient.new config.proxy
|
79
|
+
end
|
80
|
+
|
75
81
|
def config
|
76
82
|
Crabfarm.config
|
77
83
|
end
|
@@ -109,7 +109,7 @@ module Crabfarm
|
|
109
109
|
begin
|
110
110
|
ActiveSupport::Dependencies.clear
|
111
111
|
logger.info "StateLoop: loading state: #{@next_state_name}"
|
112
|
-
@doc =
|
112
|
+
@doc = TransitionService.apply_state(context, @next_state_name, @next_state_params).output_as_json
|
113
113
|
logger.info "StateLoop: state loaded successfully: #{@next_state_name}"
|
114
114
|
@error = nil
|
115
115
|
rescue Exception => e
|
@@ -1,13 +1,12 @@
|
|
1
1
|
module Crabfarm
|
2
|
-
class ForkedState
|
3
|
-
extend Forwardable
|
4
|
-
|
5
|
-
def_delegators :@state, :params, :get, :fetch, :event, :alert, :info
|
2
|
+
class ForkedState < Delegator
|
6
3
|
|
7
4
|
def initialize(_state, _name, _mutex)
|
8
5
|
@state = _state
|
9
6
|
@name = _name
|
10
7
|
@mutex = _mutex
|
8
|
+
|
9
|
+
super @state
|
11
10
|
end
|
12
11
|
|
13
12
|
def driver
|
@@ -18,10 +17,22 @@ module Crabfarm
|
|
18
17
|
@browser ||= @state.browser(@name)
|
19
18
|
end
|
20
19
|
|
20
|
+
def output
|
21
|
+
raise ScriptError.new 'Use lock_output to access output in forked states'
|
22
|
+
end
|
23
|
+
|
21
24
|
def lock_output
|
22
25
|
@mutex.synchronize {
|
23
26
|
yield @state.output
|
24
27
|
}
|
25
28
|
end
|
29
|
+
|
30
|
+
def __getobj__
|
31
|
+
@state
|
32
|
+
end
|
33
|
+
|
34
|
+
def __setobj__(obj)
|
35
|
+
@state = obj
|
36
|
+
end
|
26
37
|
end
|
27
38
|
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
require "uri"
|
2
|
+
|
3
|
+
module Crabfarm
  # Minimal HTTP client built on Net::HTTP, with optional proxy support and
  # automatic following of redirects (up to a fixed number of hops).
  class HttpClient

    # Raised when the server answers with a response that is neither a
    # success nor a followable redirect. Wraps the response and exposes its
    # `code` and `body` through delegation.
    class HttpRequestError < StandardError
      extend Forwardable

      def_delegators :@response, :code, :body

      attr_reader :response

      # _response - the Net::HTTPResponse that caused the error; its message
      #             becomes the exception message.
      def initialize(_response)
        @response = _response
        super _response.message
      end
    end

    # Raised when the redirect limit of a request is exhausted.
    class MaximumRedirectsError < StandardError
      def initialize
        super 'Redirection loop detected!'
      end
    end

    attr_reader :proxy_addr, :proxy_port

    # _proxy - optional proxy given as an "address:port" string, nil disables
    #          the proxy.
    def initialize(_proxy=nil)
      if _proxy.nil?
        @proxy_addr = nil
        @proxy_port = nil
      else
        @proxy_addr, @proxy_port = _proxy.split ':'
      end
    end

    # Performs a GET request.
    #
    # _url     - target url, may already contain a query string.
    # _query   - extra query parameters (Hash or preencoded String) appended
    #            to the url's query string.
    # _headers - Hash of request headers.
    #
    # Returns the successful Net::HTTPResponse.
    def get(_url, _query={}, _headers={})
      uri = URI _url

      # Previously _query was accepted but silently ignored.
      extra = _query.is_a?(Hash) ? URI.encode_www_form(_query) : _query.to_s
      unless extra.empty?
        parts = [uri.query, extra].reject { |part| part.nil? || part.empty? }
        uri.query = parts.join '&'
      end

      perform_request Net::HTTP::Get, uri, _headers
    end

    # Performs a POST request, _data can be a Hash (sent form encoded) or an
    # already serialized body.
    def post(_url, _data, _headers={})
      perform_request Net::HTTP::Post, URI(_url), _headers do |req|
        req.body = prepare_data(_data)
      end
    end

    # Performs a PUT request, _data is handled the same way as in `post`.
    def put(_url, _data, _headers={})
      perform_request Net::HTTP::Put, URI(_url), _headers do |req|
        req.body = prepare_data(_data)
      end
    end

    # Performs a DELETE request.
    #
    # _headers was added as an optional argument: the original body referenced
    # an undefined `_headers` local and raised NameError on every call.
    def delete(_url, _headers={})
      perform_request Net::HTTP::Delete, URI(_url), _headers
    end

    private

    # Builds and sends a request, following redirects until _limit reaches 0.
    #
    # The optional block receives the request before it is sent (used to set
    # the body); it is forwarded on redirects so the redirected request carries
    # the same body as the original one.
    #
    # Raises MaximumRedirectsError when the redirect limit is exhausted and
    # HttpRequestError for any non successful response.
    def perform_request(_req_type, _uri, _headers, _limit=10, &_block)
      raise MaximumRedirectsError.new if _limit == 0

      request = _req_type.new request_path(_uri)
      _headers.each { |name, value| request[name] = value }
      _block.call request unless _block.nil?

      response = build_client(_uri).request request

      case response
      when Net::HTTPSuccess
        response
      when Net::HTTPRedirection
        location = response['location']
        # Redirection class responses without Location cannot be followed.
        return handle_error_response(response) if location.nil?

        # Resolve against the current uri so relative locations keep the host.
        perform_request(_req_type, _uri.merge(location), _headers, _limit - 1, &_block)
      else
        handle_error_response response
      end
    end

    # Returns the request target for _uri: its path ('/' when empty) followed
    # by the query string when there is one.
    def request_path(_uri)
      path = _uri.path.to_s
      path = '/' if path.empty?
      _uri.query.nil? ? path : "#{path}?#{_uri.query}"
    end

    # Builds the Net::HTTP connection for the given uri, going through the
    # configured proxy if any.
    def build_client(uri)
      client = Net::HTTP.new uri.host, uri.port || 80, proxy_addr, proxy_port
      client.use_ssl = true if uri.scheme == 'https'
      # NOTE(review): certificate verification is disabled for every
      # connection, kept as is since callers may depend on it -- confirm this
      # is intended before using the client against untrusted networks.
      client.verify_mode = OpenSSL::SSL::VERIFY_NONE
      client
    end

    def handle_error_response(_response)
      raise HttpRequestError.new _response
    end

    # Serializes a request body: hashes are form encoded (keys and values are
    # escaped), anything else is passed through untouched.
    def prepare_data(_data)
      if _data.is_a? Hash
        URI.encode_www_form _data
      else _data end
    end
  end
end
|
@@ -34,7 +34,7 @@ module Crabfarm
|
|
34
34
|
begin
|
35
35
|
elapsed = Benchmark.measure do
|
36
36
|
puts "Transitioning to #{_name.to_s.camelize} state"
|
37
|
-
doc = @context
|
37
|
+
doc = TransitionService.apply_state(@context, _name, _params).output_as_json
|
38
38
|
|
39
39
|
puts "State changed, generated document:"
|
40
40
|
puts JSON.pretty_generate(doc).color(:green).gsub(/(^|\\n)/, ' ')
|
@@ -35,18 +35,20 @@ module Crabfarm
|
|
35
35
|
end
|
36
36
|
|
37
37
|
def generate_state(_name)
|
38
|
+
class_name = _name.camelize
|
38
39
|
with_crawler_path do
|
39
|
-
binding = { state_class:
|
40
|
-
path('app', 'states',
|
41
|
-
path('spec', 'states',
|
40
|
+
binding = { state_class: class_name.camelize }
|
41
|
+
path('app', 'states', class_name.underscore + '.rb').render('state.rb', binding)
|
42
|
+
path('spec', 'states', class_name.underscore + '_spec.rb').render('state_spec.rb', binding)
|
42
43
|
end
|
43
44
|
end
|
44
45
|
|
45
46
|
def generate_parser(_name)
|
47
|
+
class_name = _name.camelize + 'Parser'
|
46
48
|
with_crawler_path do
|
47
|
-
binding = { parser_class:
|
48
|
-
path('app', 'parsers',
|
49
|
-
path('spec', 'parsers',
|
49
|
+
binding = { parser_class: class_name }
|
50
|
+
path('app', 'parsers', class_name.underscore + '.rb').render('parser.rb', binding)
|
51
|
+
path('spec', 'parsers', class_name.underscore + '_spec.rb').render('parser_spec.rb', binding)
|
50
52
|
end
|
51
53
|
end
|
52
54
|
|
data/lib/crabfarm/rspec.rb
CHANGED
@@ -3,7 +3,6 @@ require 'net/http'
|
|
3
3
|
|
4
4
|
CF_TEST_CONTEXT = Crabfarm::CrabtrapContext::new
|
5
5
|
CF_TEST_CONTEXT.load
|
6
|
-
CF_TEST_BUCKET = CF_TEST_CONTEXT.driver
|
7
6
|
|
8
7
|
module Crabfarm
|
9
8
|
module RSpec
|
@@ -16,7 +15,9 @@ module Crabfarm
|
|
16
15
|
Net::HTTP.get(URI.parse _snap_or_url)
|
17
16
|
end
|
18
17
|
|
19
|
-
|
18
|
+
parser = described_class.new html, _options
|
19
|
+
parser.parse
|
20
|
+
parser
|
20
21
|
end
|
21
22
|
|
22
23
|
def crawl(_state=nil, _params={})
|
@@ -27,9 +28,9 @@ module Crabfarm
|
|
27
28
|
|
28
29
|
if _state.nil?
|
29
30
|
return nil unless described_class < BaseState # TODO: maybe raise an error here.
|
30
|
-
@state = @last_state = CF_TEST_CONTEXT
|
31
|
+
@state = @last_state = TransitionService.apply_state CF_TEST_CONTEXT, described_class, _params
|
31
32
|
else
|
32
|
-
@last_state = CF_TEST_CONTEXT
|
33
|
+
@last_state = TransitionService.apply_state CF_TEST_CONTEXT, _state, _params
|
33
34
|
end
|
34
35
|
end
|
35
36
|
|
data/lib/crabfarm/strategies.rb
CHANGED
@@ -2,25 +2,42 @@ module Crabfarm
|
|
2
2
|
module Strategies
|
3
3
|
|
4
4
|
class Loader
|
5
|
-
def initialize(_klass, _pkg
|
5
|
+
def initialize(_name, _klass, _pkg, _deps)
|
6
|
+
@name = _name
|
6
7
|
@klass = _klass
|
7
8
|
@pkg = _pkg
|
9
|
+
@deps = _deps
|
8
10
|
end
|
9
11
|
|
10
12
|
def load
|
13
|
+
load_dependencies
|
11
14
|
require @pkg if @pkg
|
12
15
|
if @klass.is_a? String then Object.const_get @klass else @klass end
|
13
16
|
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
def load_dependencies
|
21
|
+
@deps.each do |dep|
|
22
|
+
begin
|
23
|
+
require dep
|
24
|
+
# TODO: check dependency version!
|
25
|
+
rescue LoadError
|
26
|
+
raise ConfigurationError.new "Missing #{@name} dependency, please add `gem \"#{dep}\"` to the crawler's Gemfile"
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
14
30
|
end
|
15
31
|
|
16
32
|
@@register = {}
|
17
33
|
|
18
|
-
def self.register(_cat, _name, _klass, _pkg=nil)
|
19
|
-
|
34
|
+
def self.register(_cat, _name, _klass, _pkg=nil, _deps=[])
|
35
|
+
full_name = _cat.to_s + ':' + _name.to_s
|
36
|
+
@@register[full_name] = Loader.new(full_name, _klass, _pkg, _deps)
|
20
37
|
end
|
21
38
|
|
22
39
|
def self.load(_cat, _name)
|
23
|
-
full_name = _cat.to_s + _name.to_s
|
40
|
+
full_name = _cat.to_s + ':' + _name.to_s
|
24
41
|
raise ConfigurationError.new "Invalid #{_cat} name #{_name}" unless @@register.has_key? full_name
|
25
42
|
@@register[full_name].load
|
26
43
|
end
|
@@ -1,10 +1,14 @@
|
|
1
1
|
|
2
2
|
# The default crawling dsl to use in states and parsers, can be overridden in each component using the `browser_dsl :dsl` modifier
|
3
|
-
# Available options are :surfer, :watir and :capybara
|
3
|
+
# Available options are :surfer, :watir and :capybara. Both watir and capybara require an additional gem to be added to Gemfile
|
4
4
|
set_browser_dsl :surfer
|
5
5
|
|
6
|
+
# The default parser engine for parsers that do not specify one.
|
7
|
+
# Available options are :nokogiri and :pdf_reader. :pdf_reader requires an additional gem to be added to Gemfile
|
8
|
+
# set_parser_engine :nokogiri
|
9
|
+
|
6
10
|
# Change the default output builder used in a state to generate the output document.
|
7
|
-
# Available options are :hash, :ostruct, :jbuilder
|
11
|
+
# Available options are :hash, :ostruct, :jbuilder. :jbuilder requires an additional gem to be added to Gemfile
|
8
12
|
# set_output_builder :hash
|
9
13
|
|
10
14
|
# The path where every crawler log is stored.
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Crabfarm
  # Instantiates and runs crawler states against a given context.
  class TransitionService

    # Loads the context, builds the requested state and crawls it.
    #
    # _context - object responding to `load`, handed to the state constructor.
    # _name    - state class, or a String/Symbol naming it (resolved through
    #            `load_by_name`).
    # _params  - parameters handed to the state constructor.
    #
    # Returns the state instance after `crawl` has been called on it.
    def self.apply_state(_context, _name, _params={})
      state_class = if _name.is_a?(String) || _name.is_a?(Symbol)
        load_by_name _name
      else _name end

      _context.load
      state = state_class.new _context, _params
      state.crawl
      state
    end

    # Resolves a state name to its class: every run of characters other than
    # letters, digits and ':' is replaced by '_' before the name is camelized
    # and constantized.
    def self.load_by_name(_name)
      name = _name.to_s.gsub(/[^A-Z0-9:]+/i, '_').camelize
      name.constantize
    end

    # A bare `private` has no effect on methods defined with `def self.`, so
    # the helper is hidden explicitly.
    private_class_method :load_by_name

  end
end
|
data/lib/crabfarm/version.rb
CHANGED
data/lib/crabfarm.rb
CHANGED
@@ -1,15 +1,16 @@
|
|
1
|
-
require
|
1
|
+
require "logger"
|
2
2
|
require "forwardable"
|
3
|
+
require "net/http"
|
3
4
|
require "active_support/inflector"
|
4
5
|
require "selenium-webdriver"
|
5
6
|
|
6
7
|
require "crabfarm/version"
|
7
8
|
require "crabfarm/errors"
|
8
9
|
require "crabfarm/configuration"
|
9
|
-
require "crabfarm/
|
10
|
-
require "crabfarm/parser_service"
|
10
|
+
require "crabfarm/transition_service"
|
11
11
|
require "crabfarm/driver_bucket"
|
12
12
|
require "crabfarm/driver_bucket_pool"
|
13
|
+
require "crabfarm/http_client"
|
13
14
|
require "crabfarm/default_driver_factory"
|
14
15
|
require "crabfarm/phantom_driver_factory"
|
15
16
|
require "crabfarm/phantom_runner"
|
@@ -43,15 +44,16 @@ module Crabfarm
|
|
43
44
|
module Strategies
|
44
45
|
# bundled browser dsl adapters
|
45
46
|
register :browser_dsl, :surfer, 'Crabfarm::SurferBrowserDsl', 'crabfarm/adapters/browser/surfer'
|
46
|
-
register :browser_dsl, :watir, 'Crabfarm::WatirBrowserDsl', 'crabfarm/adapters/browser/watir'
|
47
|
-
register :browser_dsl, :capybara, 'Crabfarm::CapybaraBrowserDsl', 'crabfarm/adapters/browser/capybara'
|
47
|
+
register :browser_dsl, :watir, 'Crabfarm::WatirBrowserDsl', 'crabfarm/adapters/browser/watir', ['watir-webdriver']
|
48
|
+
register :browser_dsl, :capybara, 'Crabfarm::CapybaraBrowserDsl', 'crabfarm/adapters/browser/capybara', ['capybara']
|
48
49
|
|
49
50
|
# bundled parsers dsl adapters
|
50
|
-
register :
|
51
|
+
register :parser_engine, :nokogiri, 'Crabfarm::NokogiriAdapter', 'crabfarm/adapters/parser/nokogiri'
|
52
|
+
register :parser_engine, :pdf_reader, 'Crabfarm::PdfReaderAdapter', 'crabfarm/adapters/parser/pdf_reader', ['pdf-reader']
|
51
53
|
|
52
54
|
# bundled state output builders
|
53
55
|
register :output_builder, :hash, 'Crabfarm::HashOutputBuilder', 'crabfarm/adapters/output/hash'
|
54
56
|
register :output_builder, :ostruct, 'Crabfarm::OStructOutputBuilder', 'crabfarm/adapters/output/ostruct'
|
55
|
-
register :output_builder, :jbuilder, 'Crabfarm::JbuilderOutputBuilder', 'crabfarm/adapters/output/jbuilder'
|
57
|
+
register :output_builder, :jbuilder, 'Crabfarm::JbuilderOutputBuilder', 'crabfarm/adapters/output/jbuilder', ['jbuilder']
|
56
58
|
end
|
57
59
|
end
|
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crabfarm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.17
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ignacio Baixas
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-03-
|
11
|
+
date: 2015-03-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: jbuilder
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ~>
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: 2.2.0
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ~>
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: 2.2.0
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: selenium-webdriver
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -38,34 +24,6 @@ dependencies:
|
|
38
24
|
- - ~>
|
39
25
|
- !ruby/object:Gem::Version
|
40
26
|
version: '2.33'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: capybara
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - '>='
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :runtime
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - '>='
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: watir-webdriver
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - '>='
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
type: :runtime
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - '>='
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
69
27
|
- !ruby/object:Gem::Dependency
|
70
28
|
name: nokogiri
|
71
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -170,20 +128,6 @@ dependencies:
|
|
170
128
|
- - '>='
|
171
129
|
- !ruby/object:Gem::Version
|
172
130
|
version: '0'
|
173
|
-
- !ruby/object:Gem::Dependency
|
174
|
-
name: multipart-post
|
175
|
-
requirement: !ruby/object:Gem::Requirement
|
176
|
-
requirements:
|
177
|
-
- - '>='
|
178
|
-
- !ruby/object:Gem::Version
|
179
|
-
version: '0'
|
180
|
-
type: :runtime
|
181
|
-
prerelease: false
|
182
|
-
version_requirements: !ruby/object:Gem::Requirement
|
183
|
-
requirements:
|
184
|
-
- - '>='
|
185
|
-
- !ruby/object:Gem::Version
|
186
|
-
version: '0'
|
187
131
|
- !ruby/object:Gem::Dependency
|
188
132
|
name: bundler
|
189
133
|
requirement: !ruby/object:Gem::Requirement
|
@@ -324,6 +268,62 @@ dependencies:
|
|
324
268
|
- - '>='
|
325
269
|
- !ruby/object:Gem::Version
|
326
270
|
version: '0'
|
271
|
+
- !ruby/object:Gem::Dependency
|
272
|
+
name: pdf-reader
|
273
|
+
requirement: !ruby/object:Gem::Requirement
|
274
|
+
requirements:
|
275
|
+
- - ~>
|
276
|
+
- !ruby/object:Gem::Version
|
277
|
+
version: 1.3.3
|
278
|
+
type: :development
|
279
|
+
prerelease: false
|
280
|
+
version_requirements: !ruby/object:Gem::Requirement
|
281
|
+
requirements:
|
282
|
+
- - ~>
|
283
|
+
- !ruby/object:Gem::Version
|
284
|
+
version: 1.3.3
|
285
|
+
- !ruby/object:Gem::Dependency
|
286
|
+
name: capybara
|
287
|
+
requirement: !ruby/object:Gem::Requirement
|
288
|
+
requirements:
|
289
|
+
- - '>='
|
290
|
+
- !ruby/object:Gem::Version
|
291
|
+
version: '0'
|
292
|
+
type: :development
|
293
|
+
prerelease: false
|
294
|
+
version_requirements: !ruby/object:Gem::Requirement
|
295
|
+
requirements:
|
296
|
+
- - '>='
|
297
|
+
- !ruby/object:Gem::Version
|
298
|
+
version: '0'
|
299
|
+
- !ruby/object:Gem::Dependency
|
300
|
+
name: watir-webdriver
|
301
|
+
requirement: !ruby/object:Gem::Requirement
|
302
|
+
requirements:
|
303
|
+
- - '>='
|
304
|
+
- !ruby/object:Gem::Version
|
305
|
+
version: '0'
|
306
|
+
type: :development
|
307
|
+
prerelease: false
|
308
|
+
version_requirements: !ruby/object:Gem::Requirement
|
309
|
+
requirements:
|
310
|
+
- - '>='
|
311
|
+
- !ruby/object:Gem::Version
|
312
|
+
version: '0'
|
313
|
+
- !ruby/object:Gem::Dependency
|
314
|
+
name: jbuilder
|
315
|
+
requirement: !ruby/object:Gem::Requirement
|
316
|
+
requirements:
|
317
|
+
- - ~>
|
318
|
+
- !ruby/object:Gem::Version
|
319
|
+
version: 2.2.0
|
320
|
+
type: :development
|
321
|
+
prerelease: false
|
322
|
+
version_requirements: !ruby/object:Gem::Requirement
|
323
|
+
requirements:
|
324
|
+
- - ~>
|
325
|
+
- !ruby/object:Gem::Version
|
326
|
+
version: 2.2.0
|
327
327
|
description:
|
328
328
|
email:
|
329
329
|
- ignacio@platan.us
|
@@ -339,6 +339,7 @@ files:
|
|
339
339
|
- lib/crabfarm/adapters/output/jbuilder.rb
|
340
340
|
- lib/crabfarm/adapters/output/ostruct.rb
|
341
341
|
- lib/crabfarm/adapters/parser/nokogiri.rb
|
342
|
+
- lib/crabfarm/adapters/parser/pdf_reader.rb
|
342
343
|
- lib/crabfarm/base_parser.rb
|
343
344
|
- lib/crabfarm/base_state.rb
|
344
345
|
- lib/crabfarm/cli.rb
|
@@ -355,14 +356,13 @@ files:
|
|
355
356
|
- lib/crabfarm/engines/safe_state_loop.rb
|
356
357
|
- lib/crabfarm/errors.rb
|
357
358
|
- lib/crabfarm/forked_state.rb
|
358
|
-
- lib/crabfarm/
|
359
|
+
- lib/crabfarm/http_client.rb
|
359
360
|
- lib/crabfarm/mocks/noop_driver.rb
|
360
361
|
- lib/crabfarm/modes/console.rb
|
361
362
|
- lib/crabfarm/modes/generator.rb
|
362
363
|
- lib/crabfarm/modes/publisher.rb
|
363
364
|
- lib/crabfarm/modes/recorder.rb
|
364
365
|
- lib/crabfarm/modes/server.rb
|
365
|
-
- lib/crabfarm/parser_service.rb
|
366
366
|
- lib/crabfarm/phantom_driver_factory.rb
|
367
367
|
- lib/crabfarm/phantom_runner.rb
|
368
368
|
- lib/crabfarm/rspec.rb
|
@@ -383,6 +383,7 @@ files:
|
|
383
383
|
- lib/crabfarm/templates/spec_helper.rb.erb
|
384
384
|
- lib/crabfarm/templates/state.rb.erb
|
385
385
|
- lib/crabfarm/templates/state_spec.rb.erb
|
386
|
+
- lib/crabfarm/transition_service.rb
|
386
387
|
- lib/crabfarm/version.rb
|
387
388
|
- lib/crabfarm.rb
|
388
389
|
- bin/crabfarm
|
@@ -1,29 +0,0 @@
|
|
1
|
-
module Crabfarm
|
2
|
-
class LoaderService
|
3
|
-
|
4
|
-
def self.load_state(_name)
|
5
|
-
load_entity _name, 'state', BaseState
|
6
|
-
end
|
7
|
-
|
8
|
-
def self.load_parser(_name)
|
9
|
-
load_entity _name, 'parser', BaseParser
|
10
|
-
end
|
11
|
-
|
12
|
-
private
|
13
|
-
|
14
|
-
def self.load_entity(_name, _role, _type)
|
15
|
-
|
16
|
-
if _name.is_a? String or _name.is_a? Symbol
|
17
|
-
name = _name.to_s.gsub(/[^A-Z0-9:]+/i, '_').camelize
|
18
|
-
mod = name.constantize rescue nil
|
19
|
-
else
|
20
|
-
mod = _name
|
21
|
-
end
|
22
|
-
|
23
|
-
raise EntityNotFoundError.new _role, name if mod.nil?
|
24
|
-
raise EntityNotFoundError.new _role, name unless mod < _type
|
25
|
-
mod
|
26
|
-
end
|
27
|
-
|
28
|
-
end
|
29
|
-
end
|
@@ -1,12 +0,0 @@
|
|
1
|
-
module Crabfarm
|
2
|
-
class ParserService
|
3
|
-
|
4
|
-
def self.parse(_parser_class, _html, _options={})
|
5
|
-
_parser_class = LoaderService.load_parser(_parser_class) if _parser_class.is_a? String or _parser_class.is_a? Symbol
|
6
|
-
parser = _parser_class.new _html, _options
|
7
|
-
parser.parse
|
8
|
-
parser
|
9
|
-
end
|
10
|
-
|
11
|
-
end
|
12
|
-
end
|