crabfarm 0.7.9 → 0.7.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/crabfarm/adapters/browser/abstract_webdriver.rb +2 -1
- data/lib/crabfarm/adapters/browser/chenso.rb +3 -4
- data/lib/crabfarm/adapters/browser/noop.rb +1 -3
- data/lib/crabfarm/configuration.rb +5 -2
- data/lib/crabfarm/context.rb +7 -3
- data/lib/crabfarm/crabtrap_context.rb +4 -3
- data/lib/crabfarm/live/context.rb +5 -1
- data/lib/crabfarm/modes/generator.rb +1 -0
- data/lib/crabfarm/support/webdriver_factory.rb +30 -41
- data/lib/crabfarm/templates/Crabfile.erb +6 -0
- data/lib/crabfarm/templates/Gemfile.erb +1 -1
- data/lib/crabfarm/templates/README.md.erb +44 -0
- data/lib/crabfarm/version.rb +1 -1
- metadata +10 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 61b08ac4fbd03cce9915e677accfd155a32df06d
|
4
|
+
data.tar.gz: c6dfcfde7084e9a169c3b841b0db323f91e618f0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1a6e29253f64e0c783b7ad257cbcd66f247b50310f275b3ac1384ced3c5fb8fba1942dfc64da72833c40c414e2251405d39f01ee40d1b1de2f0cd10dad873b86
|
7
|
+
data.tar.gz: 84ac0447cfdacf3bb6e04ea73b937a157a13954ba5a5325a10b60d61d8261b9a46315acdd38fd522daa7e592dc6375cfedece54fb6779ae2569f41b26b6198cc
|
@@ -7,9 +7,10 @@ module Crabfarm
|
|
7
7
|
|
8
8
|
attr_accessor :config
|
9
9
|
|
10
|
-
def initialize(_proxy=nil)
|
10
|
+
def initialize(_proxy = nil, _proxy_auth = nil)
|
11
11
|
@config = load_driver_config
|
12
12
|
@config[:proxy] = _proxy
|
13
|
+
@config[:proxy_auth] = _proxy_auth
|
13
14
|
end
|
14
15
|
|
15
16
|
def build_driver(_session_id)
|
@@ -4,10 +4,10 @@ module Crabfarm
|
|
4
4
|
module Adapters
|
5
5
|
module Browser
|
6
6
|
class Chenso < Base
|
7
|
-
|
8
|
-
def initialize(_proxy=nil)
|
7
|
+
def initialize(_proxy = nil, _proxy_auth = nil)
|
9
8
|
@config = load_chenso_config
|
10
9
|
@config[:proxy] = _proxy
|
10
|
+
@config[:proxy_auth] = _proxy_auth
|
11
11
|
end
|
12
12
|
|
13
13
|
def build_driver(_session_id)
|
@@ -22,10 +22,9 @@ module Crabfarm
|
|
22
22
|
|
23
23
|
def load_chenso_config
|
24
24
|
{
|
25
|
-
|
25
|
+
user_agent: Crabfarm.config.user_agent
|
26
26
|
}
|
27
27
|
end
|
28
|
-
|
29
28
|
end
|
30
29
|
end
|
31
30
|
end
|
@@ -4,14 +4,12 @@ module Crabfarm
|
|
4
4
|
module Adapters
|
5
5
|
module Browser
|
6
6
|
class Noop < Base
|
7
|
-
|
8
|
-
def initialize(_proxy=nil)
|
7
|
+
def initialize(_proxy = nil, _proxy_user = nil)
|
9
8
|
end
|
10
9
|
|
11
10
|
def build_driver(_session_id)
|
12
11
|
_session_id || :noop
|
13
12
|
end
|
14
|
-
|
15
13
|
end
|
16
14
|
end
|
17
15
|
end
|
@@ -10,6 +10,8 @@ module Crabfarm
|
|
10
10
|
[:parser, :string, 'Default parser engine used by reducers'],
|
11
11
|
[:log_path, :string, 'Path where logs should be stored'],
|
12
12
|
[:proxy, :string, 'If given, a proxy is used to connect to the internet if driver supports it'],
|
13
|
+
[:proxy_auth, :string, 'Proxy authentication parameters as user:password'],
|
14
|
+
[:user_agent, :string, 'Allows overriding default driver user agent, only available in chenso'],
|
13
15
|
|
14
16
|
# Webdriver configuration parameters
|
15
17
|
[:webdriver_host, :string, 'Remote host, only available in driver: remote'],
|
@@ -56,6 +58,8 @@ module Crabfarm
|
|
56
58
|
driver_factory: nil,
|
57
59
|
log_path: nil,
|
58
60
|
proxy: nil,
|
61
|
+
proxy_auth: nil,
|
62
|
+
user_agent: nil,
|
59
63
|
webdriver_capabilities: nil,
|
60
64
|
webdriver_host: 'localhost',
|
61
65
|
webdriver_port: '8080',
|
@@ -87,8 +91,7 @@ module Crabfarm
|
|
87
91
|
def crabtrap_config
|
88
92
|
{
|
89
93
|
bin_path: crabtrap_bin_path,
|
90
|
-
log_level: crabtrap_log_level
|
91
|
-
proxy: proxy
|
94
|
+
log_level: crabtrap_log_level
|
92
95
|
}
|
93
96
|
end
|
94
97
|
|
data/lib/crabfarm/context.rb
CHANGED
@@ -48,7 +48,7 @@ module Crabfarm
|
|
48
48
|
|
49
49
|
def init_browser_adapter
|
50
50
|
if @browser_adapter.nil?
|
51
|
-
@browser_adapter = build_browser_adapter proxy
|
51
|
+
@browser_adapter = build_browser_adapter proxy, proxy_auth
|
52
52
|
@browser_adapter.prepare_driver_services
|
53
53
|
end
|
54
54
|
end
|
@@ -67,14 +67,18 @@ module Crabfarm
|
|
67
67
|
@pool = nil
|
68
68
|
end
|
69
69
|
|
70
|
-
def build_browser_adapter(_proxy)
|
71
|
-
Strategies.load(:browser, config.browser).new
|
70
|
+
def build_browser_adapter(_proxy, _proxy_auth)
|
71
|
+
Strategies.load(:browser, config.browser).new(_proxy, _proxy_auth)
|
72
72
|
end
|
73
73
|
|
74
74
|
def proxy
|
75
75
|
config.proxy
|
76
76
|
end
|
77
77
|
|
78
|
+
def proxy_auth
|
79
|
+
config.proxy_auth
|
80
|
+
end
|
81
|
+
|
78
82
|
def config
|
79
83
|
Crabfarm.config
|
80
84
|
end
|
@@ -2,7 +2,6 @@ require 'crabfarm/crabtrap_runner'
|
|
2
2
|
|
3
3
|
module Crabfarm
|
4
4
|
class CrabtrapContext < Context
|
5
|
-
|
6
5
|
attr_accessor :mode
|
7
6
|
|
8
7
|
def initialize(_mode=:pass, _path=nil)
|
@@ -78,13 +77,15 @@ module Crabfarm
|
|
78
77
|
end
|
79
78
|
|
80
79
|
def proxy
|
81
|
-
# just step over configuration proxy
|
82
80
|
proxy_address
|
83
81
|
end
|
84
82
|
|
83
|
+
def proxy_auth
|
84
|
+
nil
|
85
|
+
end
|
86
|
+
|
85
87
|
def proxy_address
|
86
88
|
"127.0.0.1:#{@port}"
|
87
89
|
end
|
88
|
-
|
89
90
|
end
|
90
91
|
end
|
@@ -15,9 +15,13 @@ module Crabfarm
|
|
15
15
|
"127.0.0.1:#{@manager.proxy_port}"
|
16
16
|
end
|
17
17
|
|
18
|
+
def proxy_auth
|
19
|
+
nil
|
20
|
+
end
|
21
|
+
|
18
22
|
private
|
19
23
|
|
20
|
-
def build_browser_adapter(_proxy)
|
24
|
+
def build_browser_adapter(_proxy, _proxy_auth)
|
21
25
|
# use a special browser adapter to override primary driver
|
22
26
|
return BrowserAdapter.new @manager
|
23
27
|
end
|
@@ -32,6 +32,7 @@ module Crabfarm
|
|
32
32
|
path(_name, 'spec', 'mementos', '.gitkeep').render('dot_gitkeep')
|
33
33
|
path(_name, 'spec', 'integration', '.gitkeep').render('dot_gitkeep')
|
34
34
|
path(_name, 'logs', '.gitkeep').render('dot_gitkeep')
|
35
|
+
path(_name, 'README.md').render('README.md', binding)
|
35
36
|
end
|
36
37
|
end
|
37
38
|
|
@@ -1,61 +1,50 @@
|
|
1
1
|
require 'selenium-webdriver'
|
2
2
|
|
3
3
|
module Crabfarm
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
module Support
|
5
|
+
module WebdriverFactory
|
6
|
+
extend self
|
7
7
|
|
8
|
-
|
9
|
-
|
8
|
+
def build_chrome_driver(_options={})
|
9
|
+
capabilities = Selenium::WebDriver::Remote::Capabilities.chrome
|
10
|
+
capabilities.proxy = build_proxy(_options) if _options[:proxy].present?
|
10
11
|
|
11
|
-
|
12
|
-
capabilities.proxy = Selenium::WebDriver::Proxy.new({
|
13
|
-
:http => _options[:proxy],
|
14
|
-
:ssl => _options[:proxy]
|
15
|
-
})
|
12
|
+
setup_webdriver Selenium::WebDriver.for(:chrome, detach: false, desired_capabilities: capabilities), _options
|
16
13
|
end
|
17
14
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
def build_firefox_driver(_options={})
|
22
|
-
capabilities = Selenium::WebDriver::Remote::Capabilities.firefox
|
15
|
+
def build_firefox_driver(_options={})
|
16
|
+
capabilities = Selenium::WebDriver::Remote::Capabilities.firefox
|
17
|
+
capabilities.proxy = build_proxy(_options) if _options[:proxy].present?
|
23
18
|
|
24
|
-
|
25
|
-
capabilities.proxy = Selenium::WebDriver::Proxy.new({
|
26
|
-
:http => _options[:proxy],
|
27
|
-
:ssl => _options[:proxy]
|
28
|
-
})
|
19
|
+
setup_webdriver Selenium::WebDriver.for(:firefox, desired_capabilities: capabilities), _options
|
29
20
|
end
|
30
21
|
|
31
|
-
|
32
|
-
|
22
|
+
def build_remote_driver(_options={})
|
23
|
+
client = Selenium::WebDriver::Remote::Http::Default.new
|
24
|
+
client.timeout = _options[:remote_timeout]
|
25
|
+
client.proxy = build_proxy(_options) if _options[:proxy].present?
|
26
|
+
|
27
|
+
setup_webdriver(Selenium::WebDriver.for(:remote, {
|
28
|
+
:url => _options[:remote_host],
|
29
|
+
:http_client => client,
|
30
|
+
:desired_capabilities => _options[:capabilities] || Selenium::WebDriver::Remote::Capabilities.firefox
|
31
|
+
}), _options)
|
32
|
+
end
|
33
33
|
|
34
|
-
|
35
|
-
client = Selenium::WebDriver::Remote::Http::Default.new
|
36
|
-
client.timeout = _options[:remote_timeout]
|
34
|
+
private
|
37
35
|
|
38
|
-
|
39
|
-
|
36
|
+
def build_proxy(_options)
|
37
|
+
# TODO: support authentication
|
38
|
+
Selenium::WebDriver::Proxy.new({
|
40
39
|
:http => _options[:proxy],
|
41
40
|
:ssl => _options[:proxy]
|
42
41
|
})
|
43
42
|
end
|
44
43
|
|
45
|
-
|
46
|
-
:
|
47
|
-
|
48
|
-
|
49
|
-
}), _options)
|
50
|
-
end
|
51
|
-
|
52
|
-
private
|
53
|
-
|
54
|
-
def common_setup(_driver, _options)
|
55
|
-
_driver.manage.window.resize_to(_options[:window_width], _options[:window_height]) rescue nil
|
56
|
-
return _driver
|
44
|
+
def setup_webdriver(_driver, _options)
|
45
|
+
_driver.manage.window.resize_to(_options[:window_width], _options[:window_height]) rescue nil
|
46
|
+
return _driver
|
47
|
+
end
|
57
48
|
end
|
58
|
-
|
59
49
|
end
|
60
|
-
end
|
61
50
|
end
|
@@ -12,6 +12,12 @@ set_log_path 'logs'
|
|
12
12
|
# Set crawler proxy; this setting is overridden when running the crawler in crabfarm.io
|
13
13
|
# set_proxy 'the.proxy.address'
|
14
14
|
|
15
|
+
# Set crawler proxy authentication; this setting is overridden when running the crawler in crabfarm.io
|
16
|
+
# set_proxy_auth 'user:password'
|
17
|
+
|
18
|
+
# Set the crawler's user agent string
|
19
|
+
# set_user_agent 'MyCrawler'
|
20
|
+
|
15
21
|
# General webdriver configuration
|
16
22
|
########################################
|
17
23
|
|
@@ -0,0 +1,44 @@
|
|
1
|
+
![Crabfarm](http://crabfarm.io/img/teaser-bg-3.png)
|
2
|
+
|
3
|
+
## <%= name %>
|
4
|
+
|
5
|
+
This is a crawler created using the [Crabfarm framework](http://crabfarm.io/#/teaser). It is composed of navigators which allow you to access different sections of a website to extract information or perform actions.
|
6
|
+
|
7
|
+
To learn more about how this crawler was put together read the [documentation](http://github.com/platanus/crabfarm-gem).
|
8
|
+
|
9
|
+
### Deploy
|
10
|
+
|
11
|
+
To deploy <%= name %> first you must have an active account for the [Crabfarm Grid](https://grid.crabfarm.io).
|
12
|
+
|
13
|
+
If you already have an account, simply do:
|
14
|
+
```shell
|
15
|
+
crabfarm p
|
16
|
+
```
|
17
|
+
|
18
|
+
This will ask you for your credentials if it hasn't already, and then upload the crawler to the grid.
|
19
|
+
|
20
|
+
### Use
|
21
|
+
|
22
|
+
To use the crawler from Ruby you can install cangrejo-gem:
|
23
|
+
|
24
|
+
```
|
25
|
+
gem install 'cangrejo'
|
26
|
+
```
|
27
|
+
|
28
|
+
and then
|
29
|
+
|
30
|
+
```ruby
|
31
|
+
require 'cangrejo'
|
32
|
+
Cangrejo.connect 'org/repo' do |session|
|
33
|
+
session.navigate(:navigator_name, parameter_name: 'hello')
|
34
|
+
end
|
35
|
+
```
|
36
|
+
|
37
|
+
For more information, visit the [cangrejo-gem repository](https://github.com/platanus/cangrejo-gem).
|
38
|
+
|
39
|
+
If you prefer Node.js instead, there is [Camaron](https://github.com/platanus/camaron).
|
40
|
+
|
41
|
+
|
42
|
+
### Navigators
|
43
|
+
|
44
|
+
Take a look at the [usage examples](tree/master/spec/navigators)
|
data/lib/crabfarm/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crabfarm
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.
|
4
|
+
version: 0.7.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ignacio Baixas
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-05-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -223,6 +223,9 @@ dependencies:
|
|
223
223
|
- - ~>
|
224
224
|
- !ruby/object:Gem::Version
|
225
225
|
version: '0.7'
|
226
|
+
- - '>='
|
227
|
+
- !ruby/object:Gem::Version
|
228
|
+
version: 0.7.11
|
226
229
|
type: :development
|
227
230
|
prerelease: false
|
228
231
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -230,6 +233,9 @@ dependencies:
|
|
230
233
|
- - ~>
|
231
234
|
- !ruby/object:Gem::Version
|
232
235
|
version: '0.7'
|
236
|
+
- - '>='
|
237
|
+
- !ruby/object:Gem::Version
|
238
|
+
version: 0.7.11
|
233
239
|
- !ruby/object:Gem::Dependency
|
234
240
|
name: bundler
|
235
241
|
requirement: !ruby/object:Gem::Requirement
|
@@ -535,6 +541,7 @@ files:
|
|
535
541
|
- lib/crabfarm/support/webdriver_factory.rb
|
536
542
|
- lib/crabfarm/templates/Crabfile.erb
|
537
543
|
- lib/crabfarm/templates/Gemfile.erb
|
544
|
+
- lib/crabfarm/templates/README.md.erb
|
538
545
|
- lib/crabfarm/templates/boot.rb.erb
|
539
546
|
- lib/crabfarm/templates/crabfarm_bin.erb
|
540
547
|
- lib/crabfarm/templates/dot_crabfarm.erb
|
@@ -576,7 +583,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
576
583
|
version: '0'
|
577
584
|
requirements: []
|
578
585
|
rubyforge_project:
|
579
|
-
rubygems_version: 2.
|
586
|
+
rubygems_version: 2.6.1
|
580
587
|
signing_key:
|
581
588
|
specification_version: 4
|
582
589
|
summary: Crabfarm is a TDD oriented web scrapping framework
|