crabfarm 0.7.9 → 0.7.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/crabfarm/adapters/browser/abstract_webdriver.rb +2 -1
- data/lib/crabfarm/adapters/browser/chenso.rb +3 -4
- data/lib/crabfarm/adapters/browser/noop.rb +1 -3
- data/lib/crabfarm/configuration.rb +5 -2
- data/lib/crabfarm/context.rb +7 -3
- data/lib/crabfarm/crabtrap_context.rb +4 -3
- data/lib/crabfarm/live/context.rb +5 -1
- data/lib/crabfarm/modes/generator.rb +1 -0
- data/lib/crabfarm/support/webdriver_factory.rb +30 -41
- data/lib/crabfarm/templates/Crabfile.erb +6 -0
- data/lib/crabfarm/templates/Gemfile.erb +1 -1
- data/lib/crabfarm/templates/README.md.erb +44 -0
- data/lib/crabfarm/version.rb +1 -1
- metadata +10 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 61b08ac4fbd03cce9915e677accfd155a32df06d
|
4
|
+
data.tar.gz: c6dfcfde7084e9a169c3b841b0db323f91e618f0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a6e29253f64e0c783b7ad257cbcd66f247b50310f275b3ac1384ced3c5fb8fba1942dfc64da72833c40c414e2251405d39f01ee40d1b1de2f0cd10dad873b86
|
7
|
+
data.tar.gz: 84ac0447cfdacf3bb6e04ea73b937a157a13954ba5a5325a10b60d61d8261b9a46315acdd38fd522daa7e592dc6375cfedece54fb6779ae2569f41b26b6198cc
|
@@ -7,9 +7,10 @@ module Crabfarm
|
|
7
7
|
|
8
8
|
attr_accessor :config
|
9
9
|
|
10
|
-
def initialize(_proxy=nil)
|
10
|
+
def initialize(_proxy = nil, _proxy_auth = nil)
|
11
11
|
@config = load_driver_config
|
12
12
|
@config[:proxy] = _proxy
|
13
|
+
@config[:proxy_auth] = _proxy_auth
|
13
14
|
end
|
14
15
|
|
15
16
|
def build_driver(_session_id)
|
@@ -4,10 +4,10 @@ module Crabfarm
|
|
4
4
|
module Adapters
|
5
5
|
module Browser
|
6
6
|
class Chenso < Base
|
7
|
-
|
8
|
-
def initialize(_proxy=nil)
|
7
|
+
def initialize(_proxy = nil, _proxy_auth = nil)
|
9
8
|
@config = load_chenso_config
|
10
9
|
@config[:proxy] = _proxy
|
10
|
+
@config[:proxy_auth] = _proxy_auth
|
11
11
|
end
|
12
12
|
|
13
13
|
def build_driver(_session_id)
|
@@ -22,10 +22,9 @@ module Crabfarm
|
|
22
22
|
|
23
23
|
def load_chenso_config
|
24
24
|
{
|
25
|
-
|
25
|
+
user_agent: Crabfarm.config.user_agent
|
26
26
|
}
|
27
27
|
end
|
28
|
-
|
29
28
|
end
|
30
29
|
end
|
31
30
|
end
|
@@ -4,14 +4,12 @@ module Crabfarm
|
|
4
4
|
module Adapters
|
5
5
|
module Browser
|
6
6
|
class Noop < Base
|
7
|
-
|
8
|
-
def initialize(_proxy=nil)
|
7
|
+
def initialize(_proxy = nil, _proxy_user = nil)
|
9
8
|
end
|
10
9
|
|
11
10
|
def build_driver(_session_id)
|
12
11
|
_session_id || :noop
|
13
12
|
end
|
14
|
-
|
15
13
|
end
|
16
14
|
end
|
17
15
|
end
|
@@ -10,6 +10,8 @@ module Crabfarm
|
|
10
10
|
[:parser, :string, 'Default parser engine used by reducers'],
|
11
11
|
[:log_path, :string, 'Path where logs should be stored'],
|
12
12
|
[:proxy, :string, 'If given, a proxy is used to connect to the internet if driver supports it'],
|
13
|
+
[:proxy_auth, :string, 'Proxy authentication parameters as user:password'],
|
14
|
+
[:user_agent, :string, 'Allows overriding default driver user agent, only available in chenso'],
|
13
15
|
|
14
16
|
# Webdriver configuration parameters
|
15
17
|
[:webdriver_host, :string, 'Remote host, only available in driver: remote'],
|
@@ -56,6 +58,8 @@ module Crabfarm
|
|
56
58
|
driver_factory: nil,
|
57
59
|
log_path: nil,
|
58
60
|
proxy: nil,
|
61
|
+
proxy_auth: nil,
|
62
|
+
user_agent: nil,
|
59
63
|
webdriver_capabilities: nil,
|
60
64
|
webdriver_host: 'localhost',
|
61
65
|
webdriver_port: '8080',
|
@@ -87,8 +91,7 @@ module Crabfarm
|
|
87
91
|
def crabtrap_config
|
88
92
|
{
|
89
93
|
bin_path: crabtrap_bin_path,
|
90
|
-
log_level: crabtrap_log_level
|
91
|
-
proxy: proxy
|
94
|
+
log_level: crabtrap_log_level
|
92
95
|
}
|
93
96
|
end
|
94
97
|
|
data/lib/crabfarm/context.rb
CHANGED
@@ -48,7 +48,7 @@ module Crabfarm
|
|
48
48
|
|
49
49
|
def init_browser_adapter
|
50
50
|
if @browser_adapter.nil?
|
51
|
-
@browser_adapter = build_browser_adapter proxy
|
51
|
+
@browser_adapter = build_browser_adapter proxy, proxy_auth
|
52
52
|
@browser_adapter.prepare_driver_services
|
53
53
|
end
|
54
54
|
end
|
@@ -67,14 +67,18 @@ module Crabfarm
|
|
67
67
|
@pool = nil
|
68
68
|
end
|
69
69
|
|
70
|
-
def build_browser_adapter(_proxy)
|
71
|
-
Strategies.load(:browser, config.browser).new
|
70
|
+
def build_browser_adapter(_proxy, _proxy_auth)
|
71
|
+
Strategies.load(:browser, config.browser).new(_proxy, _proxy_auth)
|
72
72
|
end
|
73
73
|
|
74
74
|
def proxy
|
75
75
|
config.proxy
|
76
76
|
end
|
77
77
|
|
78
|
+
def proxy_auth
|
79
|
+
config.proxy_auth
|
80
|
+
end
|
81
|
+
|
78
82
|
def config
|
79
83
|
Crabfarm.config
|
80
84
|
end
|
@@ -2,7 +2,6 @@ require 'crabfarm/crabtrap_runner'
|
|
2
2
|
|
3
3
|
module Crabfarm
|
4
4
|
class CrabtrapContext < Context
|
5
|
-
|
6
5
|
attr_accessor :mode
|
7
6
|
|
8
7
|
def initialize(_mode=:pass, _path=nil)
|
@@ -78,13 +77,15 @@ module Crabfarm
|
|
78
77
|
end
|
79
78
|
|
80
79
|
def proxy
|
81
|
-
# just step over configuration proxy
|
82
80
|
proxy_address
|
83
81
|
end
|
84
82
|
|
83
|
+
def proxy_auth
|
84
|
+
nil
|
85
|
+
end
|
86
|
+
|
85
87
|
def proxy_address
|
86
88
|
"127.0.0.1:#{@port}"
|
87
89
|
end
|
88
|
-
|
89
90
|
end
|
90
91
|
end
|
@@ -15,9 +15,13 @@ module Crabfarm
|
|
15
15
|
"127.0.0.1:#{@manager.proxy_port}"
|
16
16
|
end
|
17
17
|
|
18
|
+
def proxy_auth
|
19
|
+
nil
|
20
|
+
end
|
21
|
+
|
18
22
|
private
|
19
23
|
|
20
|
-
def build_browser_adapter(_proxy)
|
24
|
+
def build_browser_adapter(_proxy, _proxy_auth)
|
21
25
|
# use a special browser adapter to override primary driver
|
22
26
|
return BrowserAdapter.new @manager
|
23
27
|
end
|
@@ -32,6 +32,7 @@ module Crabfarm
|
|
32
32
|
path(_name, 'spec', 'mementos', '.gitkeep').render('dot_gitkeep')
|
33
33
|
path(_name, 'spec', 'integration', '.gitkeep').render('dot_gitkeep')
|
34
34
|
path(_name, 'logs', '.gitkeep').render('dot_gitkeep')
|
35
|
+
path(_name, 'README.md').render('README.md', binding)
|
35
36
|
end
|
36
37
|
end
|
37
38
|
|
@@ -1,61 +1,50 @@
|
|
1
1
|
require 'selenium-webdriver'
|
2
2
|
|
3
3
|
module Crabfarm
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
module Support
|
5
|
+
module WebdriverFactory
|
6
|
+
extend self
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
def build_chrome_driver(_options={})
|
9
|
+
capabilities = Selenium::WebDriver::Remote::Capabilities.chrome
|
10
|
+
capabilities.proxy = build_proxy(_options) if _options[:proxy].present?
|
10
11
|
|
11
|
-
|
12
|
-
capabilities.proxy = Selenium::WebDriver::Proxy.new({
|
13
|
-
:http => _options[:proxy],
|
14
|
-
:ssl => _options[:proxy]
|
15
|
-
})
|
12
|
+
setup_webdriver Selenium::WebDriver.for(:chrome, detach: false, desired_capabilities: capabilities), _options
|
16
13
|
end
|
17
14
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
def build_firefox_driver(_options={})
|
22
|
-
capabilities = Selenium::WebDriver::Remote::Capabilities.firefox
|
15
|
+
def build_firefox_driver(_options={})
|
16
|
+
capabilities = Selenium::WebDriver::Remote::Capabilities.firefox
|
17
|
+
capabilities.proxy = build_proxy(_options) if _options[:proxy].present?
|
23
18
|
|
24
|
-
|
25
|
-
capabilities.proxy = Selenium::WebDriver::Proxy.new({
|
26
|
-
:http => _options[:proxy],
|
27
|
-
:ssl => _options[:proxy]
|
28
|
-
})
|
19
|
+
setup_webdriver Selenium::WebDriver.for(:firefox, desired_capabilities: capabilities), _options
|
29
20
|
end
|
30
21
|
|
31
|
-
|
32
|
-
|
22
|
+
def build_remote_driver(_options={})
|
23
|
+
client = Selenium::WebDriver::Remote::Http::Default.new
|
24
|
+
client.timeout = _options[:remote_timeout]
|
25
|
+
client.proxy = build_proxy(_options) if _options[:proxy].present?
|
26
|
+
|
27
|
+
setup_webdriver(Selenium::WebDriver.for(:remote, {
|
28
|
+
:url => _options[:remote_host],
|
29
|
+
:http_client => client,
|
30
|
+
:desired_capabilities => _options[:capabilities] || Selenium::WebDriver::Remote::Capabilities.firefox
|
31
|
+
}), _options)
|
32
|
+
end
|
33
33
|
|
34
|
-
|
35
|
-
client = Selenium::WebDriver::Remote::Http::Default.new
|
36
|
-
client.timeout = _options[:remote_timeout]
|
34
|
+
private
|
37
35
|
|
38
|
-
|
39
|
-
|
36
|
+
def build_proxy(_options)
|
37
|
+
# TODO: support authentication
|
38
|
+
Selenium::WebDriver::Proxy.new({
|
40
39
|
:http => _options[:proxy],
|
41
40
|
:ssl => _options[:proxy]
|
42
41
|
})
|
43
42
|
end
|
44
43
|
|
45
|
-
|
46
|
-
:
|
47
|
-
|
48
|
-
|
49
|
-
}), _options)
|
50
|
-
end
|
51
|
-
|
52
|
-
private
|
53
|
-
|
54
|
-
def common_setup(_driver, _options)
|
55
|
-
_driver.manage.window.resize_to(_options[:window_width], _options[:window_height]) rescue nil
|
56
|
-
return _driver
|
44
|
+
def setup_webdriver(_driver, _options)
|
45
|
+
_driver.manage.window.resize_to(_options[:window_width], _options[:window_height]) rescue nil
|
46
|
+
return _driver
|
47
|
+
end
|
57
48
|
end
|
58
|
-
|
59
49
|
end
|
60
|
-
end
|
61
50
|
end
|
@@ -12,6 +12,12 @@ set_log_path 'logs'
|
|
12
12
|
# Set crawler proxy; this setting is overridden when running the crawler in crabfarm.io
|
13
13
|
# set_proxy 'the.proxy.address'
|
14
14
|
|
15
|
+
# Set crawler proxy authentication; this setting is overridden when running the crawler in crabfarm.io
|
16
|
+
# set_proxy_auth 'user:password'
|
17
|
+
|
18
|
+
# Set the crawler's user agent string
|
19
|
+
# set_user_agent 'MyCrawler'
|
20
|
+
|
15
21
|
# General webdriver configuration
|
16
22
|
########################################
|
17
23
|
|
@@ -0,0 +1,44 @@
|
|
1
|
+

|
2
|
+
|
3
|
+
## <%= name %>
|
4
|
+
|
5
|
+
This is a crawler created using the [Crabfarm framework](http://crabfarm.io/#/teaser). It is composed of navigators which allow you to access different sections of a website to extract information or perform actions.
|
6
|
+
|
7
|
+
To learn more about how this crawler was put together read the [documentation](http://github.com/platanus/crabfarm-gem).
|
8
|
+
|
9
|
+
### Deploy
|
10
|
+
|
11
|
+
To deploy <%= name %> first you must have an active account for the [Crabfarm Grid](https://grid.crabfarm.io).
|
12
|
+
|
13
|
+
If you already have an account, simply do:
|
14
|
+
```shell
|
15
|
+
crabfarm p
|
16
|
+
```
|
17
|
+
|
18
|
+
This will ask you for your credentials if it hasn't already, and then upload the crawler to the grid.
|
19
|
+
|
20
|
+
### Use
|
21
|
+
|
22
|
+
To use the crawler from Ruby you can install cangrejo-gem:
|
23
|
+
|
24
|
+
```
|
25
|
+
gem install 'cangrejo'
|
26
|
+
```
|
27
|
+
|
28
|
+
and then
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
require 'cangrejo'
|
32
|
+
Cangrejo.connect 'org/repo' do |session|
|
33
|
+
session.navigate(:navigator_name, parameter_name: 'hello')
|
34
|
+
end
|
35
|
+
```
|
36
|
+
|
37
|
+
For more information, visit the [cangrejo-gem repository](https://github.com/platanus/cangrejo-gem).
|
38
|
+
|
39
|
+
If you prefer NodeJS instead, there is [Camaron](https://github.com/platanus/camaron)
|
40
|
+
|
41
|
+
|
42
|
+
### Navigators
|
43
|
+
|
44
|
+
Take a look at the [usage examples](tree/master/spec/navigators)
|
data/lib/crabfarm/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crabfarm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ignacio Baixas
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-05-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -223,6 +223,9 @@ dependencies:
|
|
223
223
|
- - ~>
|
224
224
|
- !ruby/object:Gem::Version
|
225
225
|
version: '0.7'
|
226
|
+
- - '>='
|
227
|
+
- !ruby/object:Gem::Version
|
228
|
+
version: 0.7.11
|
226
229
|
type: :development
|
227
230
|
prerelease: false
|
228
231
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -230,6 +233,9 @@ dependencies:
|
|
230
233
|
- - ~>
|
231
234
|
- !ruby/object:Gem::Version
|
232
235
|
version: '0.7'
|
236
|
+
- - '>='
|
237
|
+
- !ruby/object:Gem::Version
|
238
|
+
version: 0.7.11
|
233
239
|
- !ruby/object:Gem::Dependency
|
234
240
|
name: bundler
|
235
241
|
requirement: !ruby/object:Gem::Requirement
|
@@ -535,6 +541,7 @@ files:
|
|
535
541
|
- lib/crabfarm/support/webdriver_factory.rb
|
536
542
|
- lib/crabfarm/templates/Crabfile.erb
|
537
543
|
- lib/crabfarm/templates/Gemfile.erb
|
544
|
+
- lib/crabfarm/templates/README.md.erb
|
538
545
|
- lib/crabfarm/templates/boot.rb.erb
|
539
546
|
- lib/crabfarm/templates/crabfarm_bin.erb
|
540
547
|
- lib/crabfarm/templates/dot_crabfarm.erb
|
@@ -576,7 +583,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
576
583
|
version: '0'
|
577
584
|
requirements: []
|
578
585
|
rubyforge_project:
|
579
|
-
rubygems_version: 2.
|
586
|
+
rubygems_version: 2.6.1
|
580
587
|
signing_key:
|
581
588
|
specification_version: 4
|
582
589
|
summary: Crabfarm is a TDD oriented web scraping framework
|