crabfarm 0.5.3 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/crabfarm/adapters/driver_wrapper/pincers.rb +17 -0
- data/lib/crabfarm/adapters/parser/pincers.rb +23 -0
- data/lib/crabfarm/base_navigator.rb +6 -0
- data/lib/crabfarm/configuration.rb +4 -4
- data/lib/crabfarm/modes/publisher.rb +1 -1
- data/lib/crabfarm/templates/Crabfile.erb +4 -4
- data/lib/crabfarm/templates/Gemfile.erb +3 -5
- data/lib/crabfarm/version.rb +1 -1
- data/lib/crabfarm.rb +2 -1
- metadata +19 -6
- data/lib/crabfarm/adapters/driver_wrapper/surfer.rb +0 -13
- data/lib/crabfarm/dsl/surfer/search_context.rb +0 -152
- data/lib/crabfarm/dsl/surfer/surf_context.rb +0 -42
- data/lib/crabfarm/dsl/surfer.rb +0 -22
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 425cdb9cbc7e43b16e7ab9a7d8ef5f187f95d07d
|
4
|
+
data.tar.gz: a8f7360685f9febdb8e9076586ab1e856f6c0323
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 203b29582a08881d693923620e7cb601433be93edf3551f8daebc414fff2aca12e3bfcc34b3e81b0d045921cfbb5ad74a47e7d3858925c31bf300e995107b33d
|
7
|
+
data.tar.gz: 8b2215150d4976599385b2da16b1b5cb89f169ef6b0ba41b3e719cd56fc8d68b2c061d015cece7523ad72ec95378c9c49ad0faa292259a5f7645c17cc4af273d
|
@@ -0,0 +1,17 @@
|
|
1
|
+
class Pincers::Core::SearchContext
|
2
|
+
def webdriver_elements
|
3
|
+
elements
|
4
|
+
end
|
5
|
+
end
|
6
|
+
|
7
|
+
module Crabfarm
|
8
|
+
module Adapters
|
9
|
+
module DriverWrapper
|
10
|
+
class Pincers
|
11
|
+
def self.wrap(_driver)
|
12
|
+
::Pincers.for_webdriver _driver
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Crabfarm
|
2
|
+
module Adapters
|
3
|
+
module Parser
|
4
|
+
class Pincers
|
5
|
+
def self.format
|
6
|
+
'html'
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.parse(_raw)
|
10
|
+
::Pincers.for_nokogiri ::Nokogiri::HTML _raw
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.preprocess_parsing_target(_target)
|
14
|
+
if _target.respond_to? :to_html
|
15
|
+
_target.to_html
|
16
|
+
else
|
17
|
+
_target
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -21,6 +21,12 @@ module Crabfarm
|
|
21
21
|
@params = _params
|
22
22
|
end
|
23
23
|
|
24
|
+
def navigate(_name, _params={})
|
25
|
+
TransitionService.transition(@context, _name, params.merge(_params)).navigator
|
26
|
+
end
|
27
|
+
|
28
|
+
alias :nav :navigate
|
29
|
+
|
24
30
|
def browser(_name=nil)
|
25
31
|
@context.pool.driver(_name)
|
26
32
|
end
|
@@ -15,10 +15,10 @@ module Crabfarm
|
|
15
15
|
[:webdriver_host, :string, 'Remote host, only available in driver: remote'],
|
16
16
|
[:webdriver_port, :integer, 'Remote port, only available in driver: remote'],
|
17
17
|
[:webdriver_capabilities, :mixed, 'Driver capabilities, depends on selected driver.'],
|
18
|
-
[:webdriver_remote_timeout, :float, 'Request timeout in seconds, only available for remote or
|
18
|
+
[:webdriver_remote_timeout, :float, 'Request timeout in seconds, only available for remote or phantomjs driver.'],
|
19
19
|
[:webdriver_window_width, :integer, 'Initial browser window width.'],
|
20
20
|
[:webdriver_window_height, :integer, 'Initial browser window height.'],
|
21
|
-
[:webdriver_dsl, :string, 'Webdriver wrapper to use, built in options are
|
21
|
+
[:webdriver_dsl, :string, 'Webdriver wrapper to use, built in options are pincers and watir'],
|
22
22
|
|
23
23
|
# Phantom launcher configuration
|
24
24
|
[:phantom_load_images, :boolean, 'Phantomjs image loading, only for phantomjs driver.'],
|
@@ -53,7 +53,7 @@ module Crabfarm
|
|
53
53
|
def reset
|
54
54
|
@values = {
|
55
55
|
browser: 'phantomjs',
|
56
|
-
parser: :
|
56
|
+
parser: :pincers,
|
57
57
|
driver_factory: nil,
|
58
58
|
log_path: nil,
|
59
59
|
proxy: nil,
|
@@ -63,7 +63,7 @@ module Crabfarm
|
|
63
63
|
webdriver_remote_timeout: 120,
|
64
64
|
webdriver_window_width: 1280,
|
65
65
|
webdriver_window_height: 800,
|
66
|
-
webdriver_dsl: :
|
66
|
+
webdriver_dsl: :pincers,
|
67
67
|
phantom_load_images: false,
|
68
68
|
phantom_ssl: 'any',
|
69
69
|
phantom_bin_path: 'phantomjs',
|
@@ -3,8 +3,8 @@
|
|
3
3
|
set_browser :phantomjs
|
4
4
|
|
5
5
|
# The default parser engine for reducers that do not specify one.
|
6
|
-
# Available options are :nokogiri and :pdf_parser. :pdf_parser requires an additional gem to be added to Gemfile
|
7
|
-
set_parser :
|
6
|
+
# Available options are :pincers, :nokogiri and :pdf_parser. :pdf_parser requires an additional gem to be added to Gemfile
|
7
|
+
set_parser :pincers
|
8
8
|
|
9
9
|
# The path where every crawler log is stored.
|
10
10
|
set_log_path 'logs'
|
@@ -17,9 +17,9 @@ set_log_path 'logs'
|
|
17
17
|
|
18
18
|
# The following parameters only apply if using a webdriver based driver
|
19
19
|
|
20
|
-
# Selects the webdriver wrapper library to be used, options are :
|
20
|
+
# Selects the webdriver wrapper library to be used, options are :pincers, :watir and :capybara.
|
21
21
|
# Both watir and capybara require an additional gem to be added to Gemfile
|
22
|
-
set_webdriver_dsl :
|
22
|
+
set_webdriver_dsl :pincers
|
23
23
|
|
24
24
|
# Set the selected webdriver capabilities (check the driver documentation for more details)
|
25
25
|
# set_webdriver_capabilities
|
@@ -1,16 +1,14 @@
|
|
1
1
|
source 'https://rubygems.org'
|
2
2
|
|
3
3
|
gem "crabfarm", '<%= version %>'
|
4
|
+
gem "pincers", '~> 0.2.0'
|
4
5
|
|
5
|
-
# Comment this
|
6
|
+
# Comment this if not using a nokogiri based parser or browser
|
6
7
|
gem 'nokogiri', "~> 1.6.6"
|
7
8
|
|
8
|
-
# Comment this
|
9
|
+
# Comment this if not using a selenium webdriver based driver
|
9
10
|
gem "selenium-webdriver", "~> 2.45"
|
10
11
|
|
11
|
-
# Comment this if using other webdriver dsl
|
12
|
-
gem "watir-webdriver"
|
13
|
-
|
14
12
|
group :test do
|
15
13
|
gem "rspec", "~> 3.2.0"
|
16
14
|
gem "rspec-nc"
|
data/lib/crabfarm/version.rb
CHANGED
data/lib/crabfarm.rb
CHANGED
@@ -101,11 +101,12 @@ module Crabfarm
|
|
101
101
|
register :browser, :noop, 'Crabfarm::Adapters::Browser::Noop'
|
102
102
|
|
103
103
|
# bundled webdriver dsl adapters
|
104
|
-
register :webdriver_dsl, :
|
104
|
+
register :webdriver_dsl, :pincers, 'Crabfarm::Adapters::DriverWrapper::Pincers', dependencies: ['pincers']
|
105
105
|
register :webdriver_dsl, :watir, 'Crabfarm::Adapters::DriverWrapper::Watir', dependencies: ['watir-webdriver']
|
106
106
|
register :webdriver_dsl, :capybara, 'Crabfarm::Adapters::DriverWrapper::Capybara', dependencies: ['capybara']
|
107
107
|
|
108
108
|
# bundled parsers dsl adapters
|
109
|
+
register :parser, :pincers, 'Crabfarm::Adapters::Parser::Pincers', dependencies: ['pincers', 'nokogiri']
|
109
110
|
register :parser, :nokogiri, 'Crabfarm::Adapters::Parser::Nokogiri', dependencies: ['nokogiri']
|
110
111
|
register :parser, :pdf_reader, 'Crabfarm::Adapters::Parser::PdfReader', dependencies: ['pdf-reader']
|
111
112
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crabfarm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ignacio Baixas
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-08-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -184,6 +184,20 @@ dependencies:
|
|
184
184
|
- - ~>
|
185
185
|
- !ruby/object:Gem::Version
|
186
186
|
version: 1.6.6
|
187
|
+
- !ruby/object:Gem::Dependency
|
188
|
+
name: pincers
|
189
|
+
requirement: !ruby/object:Gem::Requirement
|
190
|
+
requirements:
|
191
|
+
- - ~>
|
192
|
+
- !ruby/object:Gem::Version
|
193
|
+
version: 0.2.0
|
194
|
+
type: :development
|
195
|
+
prerelease: false
|
196
|
+
version_requirements: !ruby/object:Gem::Requirement
|
197
|
+
requirements:
|
198
|
+
- - ~>
|
199
|
+
- !ruby/object:Gem::Version
|
200
|
+
version: 0.2.0
|
187
201
|
- !ruby/object:Gem::Dependency
|
188
202
|
name: bundler
|
189
203
|
requirement: !ruby/object:Gem::Requirement
|
@@ -409,10 +423,11 @@ files:
|
|
409
423
|
- lib/crabfarm/adapters/browser/phantom_js.rb
|
410
424
|
- lib/crabfarm/adapters/browser/remote_webdriver.rb
|
411
425
|
- lib/crabfarm/adapters/driver_wrapper/capybara.rb
|
412
|
-
- lib/crabfarm/adapters/driver_wrapper/
|
426
|
+
- lib/crabfarm/adapters/driver_wrapper/pincers.rb
|
413
427
|
- lib/crabfarm/adapters/driver_wrapper/watir.rb
|
414
428
|
- lib/crabfarm/adapters/parser/nokogiri.rb
|
415
429
|
- lib/crabfarm/adapters/parser/pdf_reader.rb
|
430
|
+
- lib/crabfarm/adapters/parser/pincers.rb
|
416
431
|
- lib/crabfarm/assertion/context.rb
|
417
432
|
- lib/crabfarm/assertion/fields.rb
|
418
433
|
- lib/crabfarm/assertion/parsers.rb
|
@@ -428,9 +443,6 @@ files:
|
|
428
443
|
- lib/crabfarm/crabtrap_context.rb
|
429
444
|
- lib/crabfarm/crabtrap_runner.rb
|
430
445
|
- lib/crabfarm/driver_pool.rb
|
431
|
-
- lib/crabfarm/dsl/surfer/search_context.rb
|
432
|
-
- lib/crabfarm/dsl/surfer/surf_context.rb
|
433
|
-
- lib/crabfarm/dsl/surfer.rb
|
434
446
|
- lib/crabfarm/engines/async_state_manager.rb
|
435
447
|
- lib/crabfarm/engines/sync_state_manager.rb
|
436
448
|
- lib/crabfarm/errors.rb
|
@@ -528,3 +540,4 @@ signing_key:
|
|
528
540
|
specification_version: 4
|
529
541
|
summary: Crabfarm crawler creation framework
|
530
542
|
test_files: []
|
543
|
+
has_rdoc:
|
@@ -1,152 +0,0 @@
|
|
1
|
-
module Crabfarm
|
2
|
-
module Dsl
|
3
|
-
module Surfer
|
4
|
-
class SearchContext
|
5
|
-
include Enumerable
|
6
|
-
extend Forwardable
|
7
|
-
|
8
|
-
TIMEOUT = 10.0 # Default timeout for waiting operations
|
9
|
-
|
10
|
-
attr_accessor :elements, :parent
|
11
|
-
|
12
|
-
def_delegators :elements, :length, :count, :empty?
|
13
|
-
|
14
|
-
def initialize(_elements, _parent)
|
15
|
-
@elements = _elements
|
16
|
-
@parent = _parent
|
17
|
-
end
|
18
|
-
|
19
|
-
def webdriver_elements
|
20
|
-
@elements
|
21
|
-
end
|
22
|
-
|
23
|
-
def root
|
24
|
-
@parent.root
|
25
|
-
end
|
26
|
-
|
27
|
-
def each
|
28
|
-
elements.each { |el| yield child_context [el] }
|
29
|
-
end
|
30
|
-
|
31
|
-
def [](*args)
|
32
|
-
if args[0].is_a? String or args[0].is_a? Symbol
|
33
|
-
attribute args[0]
|
34
|
-
else
|
35
|
-
child_context Array(elements.send(:[],*args))
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
def first
|
40
|
-
if elements.first.nil? then nil else child_context [elements.first] end
|
41
|
-
end
|
42
|
-
|
43
|
-
def last
|
44
|
-
if elements.last.nil? then nil else child_context [elements.last] end
|
45
|
-
end
|
46
|
-
|
47
|
-
def element!
|
48
|
-
raise EmptySetError.new("This set is empty", self) if empty?
|
49
|
-
elements.first
|
50
|
-
end
|
51
|
-
|
52
|
-
def classes
|
53
|
-
wrap_errors { (element!['class'] || '').split(' ') }
|
54
|
-
end
|
55
|
-
|
56
|
-
def search(_selector=nil, _options={})
|
57
|
-
_options[:css] = _selector if _selector
|
58
|
-
|
59
|
-
wait_mode = _options.delete :wait
|
60
|
-
if wait_mode
|
61
|
-
|
62
|
-
# retrieve timeout
|
63
|
-
timeout = _options.delete :timeout
|
64
|
-
timeout = TIMEOUT if timeout.nil?
|
65
|
-
|
66
|
-
# use a selenium timeout
|
67
|
-
wrap_errors do
|
68
|
-
wait = Selenium::WebDriver::Wait.new(timeout: timeout)
|
69
|
-
wait.until do
|
70
|
-
new_elements = search_elements _options
|
71
|
-
|
72
|
-
# test wait condition
|
73
|
-
ok = case wait_mode
|
74
|
-
when :present then (new_elements.length > 0)
|
75
|
-
when :visible then (new_elements.length > 0 and new_elements.first.displayed?)
|
76
|
-
when :enabled then (new_elements.length > 0 and new_elements.first.displayed? and new_elements.first.enabled?)
|
77
|
-
when :not_present then (new_elements.length == 0)
|
78
|
-
when :not_visible then (not new_elements.any? { |e| e.displayed? })
|
79
|
-
else
|
80
|
-
raise SetupError.new "Invalid wait mode '#{wait_mode}'"
|
81
|
-
end
|
82
|
-
|
83
|
-
child_context new_elements if ok
|
84
|
-
end
|
85
|
-
end
|
86
|
-
else
|
87
|
-
child_context search_elements(_options)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
def fill(_value)
|
92
|
-
wrap_errors do
|
93
|
-
element!.clear
|
94
|
-
element!.send_keys _value
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
def to_html
|
99
|
-
elements.map { |e| e['outerHTML'] }.join
|
100
|
-
end
|
101
|
-
|
102
|
-
# Any methods missing are forwarded to the main element (first).
|
103
|
-
def method_missing(_method, *_args, &_block)
|
104
|
-
wrap_errors do
|
105
|
-
m = /^(.*)_all$/.match _method.to_s
|
106
|
-
if m then
|
107
|
-
return [] if empty?
|
108
|
-
elements.map { |e| e.send(m[1], *_args, &_block) }
|
109
|
-
else
|
110
|
-
element!.send(_method, *_args, &_block)
|
111
|
-
end
|
112
|
-
end
|
113
|
-
end
|
114
|
-
|
115
|
-
def respond_to?(_method, _include_all=false)
|
116
|
-
return true if super
|
117
|
-
m = /^.*_all$/.match _method.to_s
|
118
|
-
if m then
|
119
|
-
return true if empty?
|
120
|
-
elements.first.respond_to? m[1], _include_all
|
121
|
-
else
|
122
|
-
return true if empty?
|
123
|
-
elements.first.respond_to? _method, _include_all
|
124
|
-
end
|
125
|
-
end
|
126
|
-
|
127
|
-
private
|
128
|
-
|
129
|
-
def child_context(_elements)
|
130
|
-
SearchContext.new _elements, self
|
131
|
-
end
|
132
|
-
|
133
|
-
def wrap_errors
|
134
|
-
begin
|
135
|
-
yield
|
136
|
-
rescue Selenium::WebDriver::Error::WebDriverError => e
|
137
|
-
raise WebdriverError.new e, self
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
def search_elements(_options)
|
142
|
-
wrap_errors do
|
143
|
-
elements.inject([]) do |r, element|
|
144
|
-
r + element.find_elements(_options)
|
145
|
-
end
|
146
|
-
end
|
147
|
-
end
|
148
|
-
|
149
|
-
end
|
150
|
-
end
|
151
|
-
end
|
152
|
-
end
|
@@ -1,42 +0,0 @@
|
|
1
|
-
module Crabfarm
|
2
|
-
module Dsl
|
3
|
-
module Surfer
|
4
|
-
class SurfContext < SearchContext
|
5
|
-
|
6
|
-
attr_reader :driver
|
7
|
-
|
8
|
-
def_delegators 'driver.navigate', :back, :forward, :refresh
|
9
|
-
|
10
|
-
def initialize(_driver)
|
11
|
-
super nil, self
|
12
|
-
@driver = _driver
|
13
|
-
end
|
14
|
-
|
15
|
-
def root
|
16
|
-
self
|
17
|
-
end
|
18
|
-
|
19
|
-
def elements
|
20
|
-
[driver]
|
21
|
-
end
|
22
|
-
|
23
|
-
def to_html
|
24
|
-
driver.page_source
|
25
|
-
end
|
26
|
-
|
27
|
-
def current_uri
|
28
|
-
URI.parse driver.current_url
|
29
|
-
end
|
30
|
-
|
31
|
-
def cookies
|
32
|
-
driver.manage.all_cookies
|
33
|
-
end
|
34
|
-
|
35
|
-
def goto(_url, _params=nil)
|
36
|
-
_url += "?#{_params.to_query}" if _params
|
37
|
-
driver.get(_url)
|
38
|
-
end
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|
42
|
-
end
|
data/lib/crabfarm/dsl/surfer.rb
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
require 'crabfarm/dsl/surfer/search_context'
|
2
|
-
require 'crabfarm/dsl/surfer/surf_context'
|
3
|
-
|
4
|
-
module Crabfarm
|
5
|
-
module Dsl
|
6
|
-
module Surfer
|
7
|
-
|
8
|
-
class Error < StandardError
|
9
|
-
attr_reader :source
|
10
|
-
|
11
|
-
def initialize(_message, _ctx)
|
12
|
-
super _message
|
13
|
-
@ctx = _ctx
|
14
|
-
@source = _ctx.root.page_source rescue nil # cache page source for future reference
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
class EmptySetError < Error; end
|
19
|
-
class WebdriverError < Error; end
|
20
|
-
end
|
21
|
-
end
|
22
|
-
end
|