crabfarm 0.5.3 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/crabfarm/adapters/driver_wrapper/pincers.rb +17 -0
- data/lib/crabfarm/adapters/parser/pincers.rb +23 -0
- data/lib/crabfarm/base_navigator.rb +6 -0
- data/lib/crabfarm/configuration.rb +4 -4
- data/lib/crabfarm/modes/publisher.rb +1 -1
- data/lib/crabfarm/templates/Crabfile.erb +4 -4
- data/lib/crabfarm/templates/Gemfile.erb +3 -5
- data/lib/crabfarm/version.rb +1 -1
- data/lib/crabfarm.rb +2 -1
- metadata +19 -6
- data/lib/crabfarm/adapters/driver_wrapper/surfer.rb +0 -13
- data/lib/crabfarm/dsl/surfer/search_context.rb +0 -152
- data/lib/crabfarm/dsl/surfer/surf_context.rb +0 -42
- data/lib/crabfarm/dsl/surfer.rb +0 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 425cdb9cbc7e43b16e7ab9a7d8ef5f187f95d07d
|
4
|
+
data.tar.gz: a8f7360685f9febdb8e9076586ab1e856f6c0323
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 203b29582a08881d693923620e7cb601433be93edf3551f8daebc414fff2aca12e3bfcc34b3e81b0d045921cfbb5ad74a47e7d3858925c31bf300e995107b33d
|
7
|
+
data.tar.gz: 8b2215150d4976599385b2da16b1b5cb89f169ef6b0ba41b3e719cd56fc8d68b2c061d015cece7523ad72ec95378c9c49ad0faa292259a5f7645c17cc4af273d
|
@@ -0,0 +1,17 @@
|
|
1
|
+
class Pincers::Core::SearchContext
|
2
|
+
def webdriver_elements
|
3
|
+
elements
|
4
|
+
end
|
5
|
+
end
|
6
|
+
|
7
|
+
module Crabfarm
|
8
|
+
module Adapters
|
9
|
+
module DriverWrapper
|
10
|
+
class Pincers
|
11
|
+
def self.wrap(_driver)
|
12
|
+
::Pincers.for_webdriver _driver
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Crabfarm
|
2
|
+
module Adapters
|
3
|
+
module Parser
|
4
|
+
class Pincers
|
5
|
+
def self.format
|
6
|
+
'html'
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.parse(_raw)
|
10
|
+
::Pincers.for_nokogiri ::Nokogiri::HTML _raw
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.preprocess_parsing_target(_target)
|
14
|
+
if _target.respond_to? :to_html
|
15
|
+
_target.to_html
|
16
|
+
else
|
17
|
+
_target
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -21,6 +21,12 @@ module Crabfarm
|
|
21
21
|
@params = _params
|
22
22
|
end
|
23
23
|
|
24
|
+
def navigate(_name, _params={})
|
25
|
+
TransitionService.transition(@context, _name, params.merge(_params)).navigator
|
26
|
+
end
|
27
|
+
|
28
|
+
alias :nav :navigate
|
29
|
+
|
24
30
|
def browser(_name=nil)
|
25
31
|
@context.pool.driver(_name)
|
26
32
|
end
|
@@ -15,10 +15,10 @@ module Crabfarm
|
|
15
15
|
[:webdriver_host, :string, 'Remote host, only available in driver: remote'],
|
16
16
|
[:webdriver_port, :integer, 'Remote port, only available in driver: remote'],
|
17
17
|
[:webdriver_capabilities, :mixed, 'Driver capabilities, depends on selected driver.'],
|
18
|
-
[:webdriver_remote_timeout, :float, 'Request timeout in seconds, only available for remote or
|
18
|
+
[:webdriver_remote_timeout, :float, 'Request timeout in seconds, only available for remote or phantomjs driver.'],
|
19
19
|
[:webdriver_window_width, :integer, 'Initial browser window width.'],
|
20
20
|
[:webdriver_window_height, :integer, 'Initial browser window height.'],
|
21
|
-
[:webdriver_dsl, :string, 'Webdriver wrapper to use, built in options are
|
21
|
+
[:webdriver_dsl, :string, 'Webdriver wrapper to use, built in options are pincers and watir'],
|
22
22
|
|
23
23
|
# Phantom launcher configuration
|
24
24
|
[:phantom_load_images, :boolean, 'Phantomjs image loading, only for phantomjs driver.'],
|
@@ -53,7 +53,7 @@ module Crabfarm
|
|
53
53
|
def reset
|
54
54
|
@values = {
|
55
55
|
browser: 'phantomjs',
|
56
|
-
parser: :
|
56
|
+
parser: :pincers,
|
57
57
|
driver_factory: nil,
|
58
58
|
log_path: nil,
|
59
59
|
proxy: nil,
|
@@ -63,7 +63,7 @@ module Crabfarm
|
|
63
63
|
webdriver_remote_timeout: 120,
|
64
64
|
webdriver_window_width: 1280,
|
65
65
|
webdriver_window_height: 800,
|
66
|
-
webdriver_dsl: :
|
66
|
+
webdriver_dsl: :pincers,
|
67
67
|
phantom_load_images: false,
|
68
68
|
phantom_ssl: 'any',
|
69
69
|
phantom_bin_path: 'phantomjs',
|
@@ -3,8 +3,8 @@
|
|
3
3
|
set_browser :phantomjs
|
4
4
|
|
5
5
|
# The default parser engine for reducers that do not specify one.
|
6
|
-
# Available options are :nokogiri and :pdf_parser. :pdf_parser requires an additional gem to be added to Gemfile
|
7
|
-
set_parser :
|
6
|
+
# Available options are :pincers, :nokogiri and :pdf_parser. :pdf_parser requires an additional gem to be added to Gemfile
|
7
|
+
set_parser :pincers
|
8
8
|
|
9
9
|
# The path where every crawler log is stored.
|
10
10
|
set_log_path 'logs'
|
@@ -17,9 +17,9 @@ set_log_path 'logs'
|
|
17
17
|
|
18
18
|
# The following parameters only apply if using a webdriver based driver
|
19
19
|
|
20
|
-
# Selects the webdriver wrapper library to be used, options are :
|
20
|
+
# Selects the webdriver wrapper library to be used, options are :pincers, :watir and :capybara.
|
21
21
|
# Both watir and capybara require an additional gem to be added to Gemfile
|
22
|
-
set_webdriver_dsl :
|
22
|
+
set_webdriver_dsl :pincers
|
23
23
|
|
24
24
|
# Set the selected webdriver capabilities (check the driver documentation for more details)
|
25
25
|
# set_webdriver_capabilities
|
@@ -1,16 +1,14 @@
|
|
1
1
|
source 'https://rubygems.org'
|
2
2
|
|
3
3
|
gem "crabfarm", '<%= version %>'
|
4
|
+
gem "pincers", '~> 0.2.0'
|
4
5
|
|
5
|
-
# Comment this
|
6
|
+
# Comment this if not using a nokogiri based parser or browser
|
6
7
|
gem 'nokogiri', "~> 1.6.6"
|
7
8
|
|
8
|
-
# Comment this
|
9
|
+
# Comment this if not using a selenium webdriver based driver
|
9
10
|
gem "selenium-webdriver", "~> 2.45"
|
10
11
|
|
11
|
-
# Comment this if using other webdriver dsl
|
12
|
-
gem "watir-webdriver"
|
13
|
-
|
14
12
|
group :test do
|
15
13
|
gem "rspec", "~> 3.2.0"
|
16
14
|
gem "rspec-nc"
|
data/lib/crabfarm/version.rb
CHANGED
data/lib/crabfarm.rb
CHANGED
@@ -101,11 +101,12 @@ module Crabfarm
|
|
101
101
|
register :browser, :noop, 'Crabfarm::Adapters::Browser::Noop'
|
102
102
|
|
103
103
|
# bundled webdriver dsl adapters
|
104
|
-
register :webdriver_dsl, :
|
104
|
+
register :webdriver_dsl, :pincers, 'Crabfarm::Adapters::DriverWrapper::Pincers', dependencies: ['pincers']
|
105
105
|
register :webdriver_dsl, :watir, 'Crabfarm::Adapters::DriverWrapper::Watir', dependencies: ['watir-webdriver']
|
106
106
|
register :webdriver_dsl, :capybara, 'Crabfarm::Adapters::DriverWrapper::Capybara', dependencies: ['capybara']
|
107
107
|
|
108
108
|
# bundled parsers dsl adapters
|
109
|
+
register :parser, :pincers, 'Crabfarm::Adapters::Parser::Pincers', dependencies: ['pincers', 'nokogiri']
|
109
110
|
register :parser, :nokogiri, 'Crabfarm::Adapters::Parser::Nokogiri', dependencies: ['nokogiri']
|
110
111
|
register :parser, :pdf_reader, 'Crabfarm::Adapters::Parser::PdfReader', dependencies: ['pdf-reader']
|
111
112
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crabfarm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.3
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ignacio Baixas
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-08-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -184,6 +184,20 @@ dependencies:
|
|
184
184
|
- - ~>
|
185
185
|
- !ruby/object:Gem::Version
|
186
186
|
version: 1.6.6
|
187
|
+
- !ruby/object:Gem::Dependency
|
188
|
+
name: pincers
|
189
|
+
requirement: !ruby/object:Gem::Requirement
|
190
|
+
requirements:
|
191
|
+
- - ~>
|
192
|
+
- !ruby/object:Gem::Version
|
193
|
+
version: 0.2.0
|
194
|
+
type: :development
|
195
|
+
prerelease: false
|
196
|
+
version_requirements: !ruby/object:Gem::Requirement
|
197
|
+
requirements:
|
198
|
+
- - ~>
|
199
|
+
- !ruby/object:Gem::Version
|
200
|
+
version: 0.2.0
|
187
201
|
- !ruby/object:Gem::Dependency
|
188
202
|
name: bundler
|
189
203
|
requirement: !ruby/object:Gem::Requirement
|
@@ -409,10 +423,11 @@ files:
|
|
409
423
|
- lib/crabfarm/adapters/browser/phantom_js.rb
|
410
424
|
- lib/crabfarm/adapters/browser/remote_webdriver.rb
|
411
425
|
- lib/crabfarm/adapters/driver_wrapper/capybara.rb
|
412
|
-
- lib/crabfarm/adapters/driver_wrapper/surfer.rb
|
426
|
+
- lib/crabfarm/adapters/driver_wrapper/pincers.rb
|
413
427
|
- lib/crabfarm/adapters/driver_wrapper/watir.rb
|
414
428
|
- lib/crabfarm/adapters/parser/nokogiri.rb
|
415
429
|
- lib/crabfarm/adapters/parser/pdf_reader.rb
|
430
|
+
- lib/crabfarm/adapters/parser/pincers.rb
|
416
431
|
- lib/crabfarm/assertion/context.rb
|
417
432
|
- lib/crabfarm/assertion/fields.rb
|
418
433
|
- lib/crabfarm/assertion/parsers.rb
|
@@ -428,9 +443,6 @@ files:
|
|
428
443
|
- lib/crabfarm/crabtrap_context.rb
|
429
444
|
- lib/crabfarm/crabtrap_runner.rb
|
430
445
|
- lib/crabfarm/driver_pool.rb
|
431
|
-
- lib/crabfarm/dsl/surfer/search_context.rb
|
432
|
-
- lib/crabfarm/dsl/surfer/surf_context.rb
|
433
|
-
- lib/crabfarm/dsl/surfer.rb
|
434
446
|
- lib/crabfarm/engines/async_state_manager.rb
|
435
447
|
- lib/crabfarm/engines/sync_state_manager.rb
|
436
448
|
- lib/crabfarm/errors.rb
|
@@ -528,3 +540,4 @@ signing_key:
|
|
528
540
|
specification_version: 4
|
529
541
|
summary: Crabfarm crawler creation framework
|
530
542
|
test_files: []
|
543
|
+
has_rdoc:
|
@@ -1,152 +0,0 @@
|
|
1
|
-
module Crabfarm
|
2
|
-
module Dsl
|
3
|
-
module Surfer
|
4
|
-
class SearchContext
|
5
|
-
include Enumerable
|
6
|
-
extend Forwardable
|
7
|
-
|
8
|
-
TIMEOUT = 10.0 # Default timeout for waiting operations
|
9
|
-
|
10
|
-
attr_accessor :elements, :parent
|
11
|
-
|
12
|
-
def_delegators :elements, :length, :count, :empty?
|
13
|
-
|
14
|
-
def initialize(_elements, _parent)
|
15
|
-
@elements = _elements
|
16
|
-
@parent = _parent
|
17
|
-
end
|
18
|
-
|
19
|
-
def webdriver_elements
|
20
|
-
@elements
|
21
|
-
end
|
22
|
-
|
23
|
-
def root
|
24
|
-
@parent.root
|
25
|
-
end
|
26
|
-
|
27
|
-
def each
|
28
|
-
elements.each { |el| yield child_context [el] }
|
29
|
-
end
|
30
|
-
|
31
|
-
def [](*args)
|
32
|
-
if args[0].is_a? String or args[0].is_a? Symbol
|
33
|
-
attribute args[0]
|
34
|
-
else
|
35
|
-
child_context Array(elements.send(:[],*args))
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def first
|
40
|
-
if elements.first.nil? then nil else child_context [elements.first] end
|
41
|
-
end
|
42
|
-
|
43
|
-
def last
|
44
|
-
if elements.last.nil? then nil else child_context [elements.last] end
|
45
|
-
end
|
46
|
-
|
47
|
-
def element!
|
48
|
-
raise EmptySetError.new("This set is empty", self) if empty?
|
49
|
-
elements.first
|
50
|
-
end
|
51
|
-
|
52
|
-
def classes
|
53
|
-
wrap_errors { (element!['class'] || '').split(' ') }
|
54
|
-
end
|
55
|
-
|
56
|
-
def search(_selector=nil, _options={})
|
57
|
-
_options[:css] = _selector if _selector
|
58
|
-
|
59
|
-
wait_mode = _options.delete :wait
|
60
|
-
if wait_mode
|
61
|
-
|
62
|
-
# retrieve timeout
|
63
|
-
timeout = _options.delete :timeout
|
64
|
-
timeout = TIMEOUT if timeout.nil?
|
65
|
-
|
66
|
-
# use a selenium timeout
|
67
|
-
wrap_errors do
|
68
|
-
wait = Selenium::WebDriver::Wait.new(timeout: timeout)
|
69
|
-
wait.until do
|
70
|
-
new_elements = search_elements _options
|
71
|
-
|
72
|
-
# test wait condition
|
73
|
-
ok = case wait_mode
|
74
|
-
when :present then (new_elements.length > 0)
|
75
|
-
when :visible then (new_elements.length > 0 and new_elements.first.displayed?)
|
76
|
-
when :enabled then (new_elements.length > 0 and new_elements.first.displayed? and new_elements.first.enabled?)
|
77
|
-
when :not_present then (new_elements.length == 0)
|
78
|
-
when :not_visible then (not new_elements.any? { |e| e.displayed? })
|
79
|
-
else
|
80
|
-
raise SetupError.new "Invalid wait mode '#{wait_mode}'"
|
81
|
-
end
|
82
|
-
|
83
|
-
child_context new_elements if ok
|
84
|
-
end
|
85
|
-
end
|
86
|
-
else
|
87
|
-
child_context search_elements(_options)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
def fill(_value)
|
92
|
-
wrap_errors do
|
93
|
-
element!.clear
|
94
|
-
element!.send_keys _value
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
def to_html
|
99
|
-
elements.map { |e| e['outerHTML'] }.join
|
100
|
-
end
|
101
|
-
|
102
|
-
# Any methods missing are forwarded to the main element (first).
|
103
|
-
def method_missing(_method, *_args, &_block)
|
104
|
-
wrap_errors do
|
105
|
-
m = /^(.*)_all$/.match _method.to_s
|
106
|
-
if m then
|
107
|
-
return [] if empty?
|
108
|
-
elements.map { |e| e.send(m[1], *_args, &_block) }
|
109
|
-
else
|
110
|
-
element!.send(_method, *_args, &_block)
|
111
|
-
end
|
112
|
-
end
|
113
|
-
end
|
114
|
-
|
115
|
-
def respond_to?(_method, _include_all=false)
|
116
|
-
return true if super
|
117
|
-
m = /^.*_all$/.match _method.to_s
|
118
|
-
if m then
|
119
|
-
return true if empty?
|
120
|
-
elements.first.respond_to? m[1], _include_all
|
121
|
-
else
|
122
|
-
return true if empty?
|
123
|
-
elements.first.respond_to? _method, _include_all
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
127
|
-
private
|
128
|
-
|
129
|
-
def child_context(_elements)
|
130
|
-
SearchContext.new _elements, self
|
131
|
-
end
|
132
|
-
|
133
|
-
def wrap_errors
|
134
|
-
begin
|
135
|
-
yield
|
136
|
-
rescue Selenium::WebDriver::Error::WebDriverError => e
|
137
|
-
raise WebdriverError.new e, self
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
def search_elements(_options)
|
142
|
-
wrap_errors do
|
143
|
-
elements.inject([]) do |r, element|
|
144
|
-
r + element.find_elements(_options)
|
145
|
-
end
|
146
|
-
end
|
147
|
-
end
|
148
|
-
|
149
|
-
end
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
@@ -1,42 +0,0 @@
|
|
1
|
-
module Crabfarm
|
2
|
-
module Dsl
|
3
|
-
module Surfer
|
4
|
-
class SurfContext < SearchContext
|
5
|
-
|
6
|
-
attr_reader :driver
|
7
|
-
|
8
|
-
def_delegators 'driver.navigate', :back, :forward, :refresh
|
9
|
-
|
10
|
-
def initialize(_driver)
|
11
|
-
super nil, self
|
12
|
-
@driver = _driver
|
13
|
-
end
|
14
|
-
|
15
|
-
def root
|
16
|
-
self
|
17
|
-
end
|
18
|
-
|
19
|
-
def elements
|
20
|
-
[driver]
|
21
|
-
end
|
22
|
-
|
23
|
-
def to_html
|
24
|
-
driver.page_source
|
25
|
-
end
|
26
|
-
|
27
|
-
def current_uri
|
28
|
-
URI.parse driver.current_url
|
29
|
-
end
|
30
|
-
|
31
|
-
def cookies
|
32
|
-
driver.manage.all_cookies
|
33
|
-
end
|
34
|
-
|
35
|
-
def goto(_url, _params=nil)
|
36
|
-
_url += "?#{_params.to_query}" if _params
|
37
|
-
driver.get(_url)
|
38
|
-
end
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|
data/lib/crabfarm/dsl/surfer.rb
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
require 'crabfarm/dsl/surfer/search_context'
|
2
|
-
require 'crabfarm/dsl/surfer/surf_context'
|
3
|
-
|
4
|
-
module Crabfarm
|
5
|
-
module Dsl
|
6
|
-
module Surfer
|
7
|
-
|
8
|
-
class Error < StandardError
|
9
|
-
attr_reader :source
|
10
|
-
|
11
|
-
def initialize(_message, _ctx)
|
12
|
-
super _message
|
13
|
-
@ctx = _ctx
|
14
|
-
@source = _ctx.root.page_source rescue nil # cache page source for future reference
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
class EmptySetError < Error; end
|
19
|
-
class WebdriverError < Error; end
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|