cangrejo 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/cangrejo.rb +23 -0
- data/lib/cangrejo/configurator.rb +26 -0
- data/lib/cangrejo/errors.rb +7 -0
- data/lib/cangrejo/modes/git.rb +64 -0
- data/lib/cangrejo/modes/local.rb +35 -0
- data/lib/cangrejo/modes/remote.rb +33 -0
- data/lib/cangrejo/net/socket_http.rb +24 -0
- data/lib/cangrejo/net/socket_uri.rb +46 -0
- data/lib/cangrejo/restclient/json_resource.rb +38 -0
- data/lib/cangrejo/restclient/request_extensions.rb +27 -0
- data/lib/cangrejo/session.rb +62 -0
- data/lib/cangrejo/support/launcher.rb +43 -0
- data/lib/cangrejo/version.rb +3 -0
- metadata +113 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b7efa941c84bc31f5de7d5b6da7e8d0a04167ed0
|
4
|
+
data.tar.gz: fca6d854265bc9eb67d7e85a6e13190e8ac0afa2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e0ae8d46ac60dd3b41144a95a1f4d0e1bc04b9aafddeeb927a5c942b7c65a5f71d94668282394a953bd5ee1ab6202d24520e25414c538f8fa0216fb2ea606d1c
|
7
|
+
data.tar.gz: 34065108e6e15336eda42d5fcbf543c537bccd581e4f0114cee0b3571a35973f5ab345cebf56d98a43be067944a10e27fca050f34e96f67072f89fc03816737e
|
data/lib/cangrejo.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require "cangrejo/version"
|
2
|
+
require "cangrejo/errors"
|
3
|
+
require "cangrejo/configurator"
|
4
|
+
require "cangrejo/session"
|
5
|
+
|
6
|
+
# Top-level namespace of the Crabfarm client. Holds the process-wide settings
# object and the entry point used to change it.
module Cangrejo

  # Shared settings, stored in a module-level instance variable rather than a
  # class variable so the state belongs to this module only.
  @config = OpenStruct.new(
    crabfarm_host: 'http://www.crabfarm.com',
    crawler_cache_path: 'tmp/crawler_cache',
    temp_path: 'tmp',
    hold_by_default: false,
    crawlers: {}
  )

  # Returns the shared settings object (an OpenStruct).
  def self.config
    @config
  end

  # Yields a Configurator wrapping the shared settings, so callers can change
  # them through its set_* / crawler methods.
  def self.configure
    yield Configurator.new(@config)
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Cangrejo
  # Small DSL object that writes settings onto the configuration object it
  # wraps. Exposes one set_<option> method per simple option plus #crawler
  # for registering per-crawler options.
  class Configurator

    # _config - settings object the setters write to; it must respond to the
    #           <option>= writers and to #crawlers.
    def initialize(_config)
      @config = _config
    end

    # Generate set_crabfarm_host, set_crawler_cache_path, set_temp_path and
    # set_hold_by_default; each forwards its value to the matching writer on
    # the wrapped settings object.
    %i[crabfarm_host crawler_cache_path temp_path hold_by_default].each do |option|
      writer = "#{option}="
      define_method("set_#{option}") { |value| @config.send(writer, value) }
    end

    # Stores _options under _name in the settings' crawlers table.
    def crawler(_name, _options)
      @config.crawlers[_name] = _options
    end

  end

end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require "git"
|
2
|
+
require "cangrejo/modes/local"
|
3
|
+
|
4
|
+
require "digest" # Digest::MD5 derives the default clone directory name below

module Cangrejo
  module Modes
    # Local mode whose crawler code is obtained from a git repository: the
    # repository is cloned under the configured crawler cache path, moved to
    # the requested commit, bundled if anything changed, and then launched
    # through Local#setup.
    class Git < Local

      # _url           - repository url handed to Git.clone
      # _commit        - sha the working copy is expected to be at
      # _relative_path - optional sub-directory of the repository holding the
      #                  crawler; nil or blank means the repository root
      # _name          - directory name for the clone; defaults to the MD5 hex
      #                  digest of the url
      def initialize(_url, _commit, _relative_path, _name=nil)
        @url = _url
        @commit = _commit
        @relative_path = _relative_path
        @needs_bundle = false
        @name = _name || Digest::MD5.hexdigest(@url)
        super deploy_path
      end

      # Makes sure the clone exists, is at the requested commit and has its
      # dependencies installed, then delegates to Local#setup.
      def setup
        ensure_repo_clone
        ensure_repo_commit
        ensure_deps
        super
      end

      private

      # Directory handed to Local: the clone itself, or the relative path
      # inside it when one was given.
      def deploy_path
        # Plain Ruby blank check; String#present? only exists when
        # ActiveSupport is loaded, which this gem does not depend on.
        return repo_path if @relative_path.to_s.strip.empty?

        File.join repo_path, @relative_path
      end

      def cache_path
        Cangrejo.config.crawler_cache_path
      end

      def repo_path
        File.join cache_path, @name
      end

      # Clones the repository unless a .git directory is already present.
      def ensure_repo_clone
        # File.exist? replaces File.exists?, which was removed in Ruby 3.2.
        return if File.exist? File.join(repo_path, '.git')

        ::Git.clone(@url, @name, :path => cache_path)
        @needs_bundle = true
      end

      # Fetches and checks out @commit when the first log entry's sha differs.
      def ensure_repo_commit
        g = ::Git.open repo_path
        if g.log.first.sha != @commit
          g.fetch
          g.checkout @commit
          @needs_bundle = true
        end
      end

      # Runs `bundle install` in the deploy directory, but only after a fresh
      # clone or a checkout.
      def ensure_deps
        %x[cd '#{deploy_path}' && bundle install] if @needs_bundle
        @needs_bundle = false
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require "cangrejo/restclient/request_extensions"
|
2
|
+
require "cangrejo/restclient/json_resource"
|
3
|
+
require "cangrejo/support/launcher"
|
4
|
+
|
5
|
+
module Cangrejo
  module Modes
    # Runs a crawler from a directory on this machine: a Support::Launcher
    # starts the server for the given path and a JSON rest resource is pointed
    # at the launcher's socket.
    class Local

      # _path - directory passed to the launcher.
      def initialize(_path)
        @path = _path
      end

      # Launches the crawler and returns the JsonResource bound to the
      # '/api/state' path on the launcher's socket.
      def setup
        init_launcher
        init_rest_client
      end

      # Stops the launched crawler. Does nothing when #setup was never run
      # (for instance a held session that is released without starting), where
      # the previous code failed with NoMethodError on nil.
      def release
        @launcher.kill unless @launcher.nil?
      end

      private

      def init_launcher
        @launcher = Support::Launcher.new @path
        @launcher.launch
      end

      def init_rest_client
        RestClient::JsonResource.new Net::SocketUri.new(@launcher.host, '/api/state')
      end

    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require "cangrejo/restclient/json_resource"
|
2
|
+
|
3
|
+
module Cangrejo
  module Modes
    # Runs a crawler through the remote host configured in
    # Cangrejo.config.crabfarm_host.
    class Remote

      # _name - crawler name used to build the sessions url.
      def initialize(_name)
        @name = _name
      end

      # Posts to "api/crawlers/<name>/sessions" and returns a JsonResource for
      # "api/sessions/<id>", where id is read from the post response.
      def setup
        sessions = prepare_resource "api/crawlers/#{@name}/sessions"
        # JsonResource#post serializes its payload itself; passing an already
        # encoded string would send the JSON string "{}" instead of an object.
        sessions.post({})
        prepare_resource "api/sessions/#{sessions.id}"
      end

      def release
        # nothing
      end

      private

      # Builds a JsonResource for _path joined onto the remote host url.
      def prepare_resource(_path)
        RestClient::JsonResource.new URI.join(remote_host, _path)
      end

      def remote_host
        Cangrejo.config.crabfarm_host
      end

    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Net
  # Net::HTTP variant whose connection is a unix domain socket.
  class SocketHttp < HTTP
    attr_reader :socket_path

    # _socket_path - filesystem path of the socket the server listens on.
    # _dummy       - second positional argument, forwarded untouched.
    #
    # Both arguments are passed on to Net::HTTP#initialize by the bare super.
    def initialize(_socket_path, _dummy=nil)
      @socket_path = _socket_path
      super
    end

    # Opens the unix socket, wraps it in a BufferedIO and runs on_connect.
    def connect
      unix_socket = UNIXSocket.new(socket_path)
      @socket = Net::BufferedIO.new(unix_socket)
      on_connect
    end

    # Returns the socket file's base name in place of an "address:port" pair.
    def addr_port
      File.basename(socket_path)
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Net
  # Minimal stand-in for a URI object that addresses a path on a server
  # listening on a unix domain socket. It answers the reader methods defined
  # below (host, path, port, ...).
  class SocketUri

    # _socket_path - socket location, optionally prefixed with "unix://"
    # _path        - request path on the server
    def initialize(_socket_path, _path)
      # Strip the scheme only when it is really there; slicing a fixed seven
      # characters mangled any value given without the prefix.
      @socket_path = _socket_path.sub(%r{\Aunix://}, '')
      @path = _path
    end

    # Socket file path; plays the role of the host.
    def host
      @socket_path
    end

    alias_method :hostname, :host

    attr_reader :path

    def request_uri
      @path
    end

    def empty?
      false
    end

    def port
      nil
    end

    def user
      nil
    end

    def password
      nil
    end

    # "<socket path>::<request path>". The request path is now interpolated;
    # the literal text "@path" used to be emitted instead.
    def to_s
      "#{@socket_path}::#{@path}"
    end

  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require "rest_client"
|
2
|
+
|
3
|
+
require "json" # JSON.parse / #to_json are used below; do not rely on a transitive load

module RestClient

  # Resource that sends and receives JSON. The parsed body of the last
  # response is kept in @state, and its top-level keys can be read as methods
  # on the resource (see #method_missing).
  class JsonResource < Resource

    # Performs a GET with the 'Accept' header set to 'json' and stores the
    # parsed response. The caller's headers hash is left untouched.
    def get(additional_headers={}, &block)
      r = super additional_headers.merge('Accept' => 'json'), &block
      @state = JSON.parse r
    end

    # Serializes payload with #to_json, POSTs it as application/json and
    # stores the parsed response.
    def post(payload, additional_headers={}, &block)
      r = super payload.to_json, decorate_headers(additional_headers), &block
      @state = JSON.parse r
    end

    # Serializes payload with #to_json, PUTs it as application/json and
    # stores the parsed response.
    def put(payload, additional_headers={}, &block)
      r = super payload.to_json, decorate_headers(additional_headers), &block
      @state = JSON.parse r
    end

    private

    # Returns a copy of _headers with the JSON content type set, so the hash
    # passed in by the caller is not modified.
    def decorate_headers(_headers)
      _headers.merge('Content-Type' => 'application/json')
    end

    # Exposes the keys of the last parsed response as reader methods when that
    # response is a Hash; anything else falls through to the default handling.
    def method_missing(_method, *_args, &_block)
      key = _method.to_s
      return @state[key] if @state.is_a?(Hash) && @state.key?(key)

      super
    end

    # Keeps respond_to? consistent with #method_missing.
    def respond_to_missing?(_method, _include_private = false)
      (@state.is_a?(Hash) && @state.key?(_method.to_s)) || super
    end

  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require "cangrejo/net/socket_http"
|
2
|
+
require "cangrejo/net/socket_uri"
|
3
|
+
require "rest_client"
|
4
|
+
|
5
|
+
module RestClient
  # Overrides prepended to RestClient::Request so that a url given as an
  # object (anything that is not a String) is passed through untouched, and so
  # that Net::SocketUri urls select Net::SocketHttp as the http class.
  module RequestExtensions

    # String urls get the stock processing; other objects are returned as is.
    def process_url_params(_url, _headers)
      return _url unless _url.is_a? String

      super
    end

    # String urls get the stock parsing; other objects are returned as is.
    def parse_url(_url)
      _url.is_a?(String) ? super : _url
    end

    # Net::SocketHttp for socket urls, the stock class otherwise.
    def net_http_class
      return Net::SocketHttp if url.is_a? Net::SocketUri

      super
    end

  end

  class Request
    prepend RequestExtensions
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require "ostruct"
|
2
|
+
require "cangrejo/modes/remote"
|
3
|
+
require "cangrejo/modes/git"
|
4
|
+
require "cangrejo/modes/local"
|
5
|
+
|
6
|
+
module Cangrejo
  # A session against one named crawler. The options (those registered for the
  # crawler in Cangrejo.config.crawlers, overridden by the ones given here)
  # decide how the crawler is reached: :path selects the local mode,
  # :git_remote the git mode, anything else the remote mode.
  class Session

    # Result of the last #crawl, wrapped in an OpenStruct.
    attr_reader :doc

    # _name    - crawler name, also the key into Cangrejo.config.crawlers
    # _options - per-session overrides; :hold (default
    #            Cangrejo.config.hold_by_default) prevents the session from
    #            starting right away
    def initialize(_name, _options={})
      @name = _name
      options = Cangrejo.config.crawlers.fetch(_name, {}).merge _options
      select_mode options
      # Read :hold from the merged options so a value registered for the
      # crawler is honoured too, not only one passed to this call.
      start unless options.fetch :hold, Cangrejo.config.hold_by_default
    end

    # Sets the mode up and keeps the rest resource it returns.
    # Raises ConfigurationError when the session was already started.
    def start
      raise ConfigurationError, 'Session already started' unless @rest.nil?
      @rest = @mode.setup
      self
    end

    # The three readers below return the `name`, `params` and `doc` values of
    # the rest resource. Each raises ConfigurationError before #start.
    def state_name
      ensure_started
      @rest.name
    end

    def state_params
      ensure_started
      @rest.params
    end

    def raw_doc
      ensure_started
      @rest.doc
    end

    # Puts the requested state and params to the rest resource and stores the
    # resulting doc in #doc. Returns self.
    # Raises ConfigurationError before #start.
    def crawl(_state, _params={})
      ensure_started
      @rest.put(name: _state, params: _params)
      @doc = OpenStruct.new @rest.doc
      self
    end

    # Releases the mode's resources. Returns self.
    def release
      @mode.release
      self
    end

    private

    # Single guard for every operation that needs the rest resource.
    def ensure_started
      raise ConfigurationError, 'Session not started' if @rest.nil?
    end

    def select_mode(_options)
      @mode = if _options.has_key? :path
        Modes::Local.new _options[:path]
      elsif _options.has_key? :git_remote
        Modes::Git.new _options[:git_remote], _options[:git_commit], _options[:relative_path], @name
      else
        Modes::Remote.new @name
      end
    end

  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Cangrejo
  module Support
    # Starts `bin/crabfarm s` inside a crawler directory, telling it to listen
    # on a randomly named socket file under Cangrejo.config.temp_path, and
    # stops it again on request.
    class Launcher

      # _path - crawler directory; the process is spawned with it as working
      #         directory and its Gemfile is exported as BUNDLE_GEMFILE.
      def initialize(_path)
        @path = _path
        select_socket_file
      end

      # Socket address handed to the server, in "unix://<socket file>" form.
      def host
        "unix://#{@socket_file}"
      end

      # Spawns the server and blocks until its socket file shows up.
      # Raises RuntimeError when the process exits before creating the socket.
      def launch
        gem_path = File.join(@path, 'Gemfile')
        # TODO: for some reason, the gemfile path must be specified here, maybe because of rbenv?
        # Arguments are passed as a list so the socket path is never split on
        # whitespace or interpreted by a shell.
        @pid = Process.spawn({ 'BUNDLE_GEMFILE' => gem_path }, 'bin/crabfarm', 's', '--host', host, chdir: @path)
        wait_for_socket
      end

      # Interrupts the spawned process and reaps it. Safe to call when nothing
      # was launched, and safe to call more than once.
      def kill
        return if @pid.nil?

        begin
          Process.kill 'INT', @pid
          Process.wait @pid
        rescue Errno::ESRCH, Errno::ECHILD
          # The process is already gone; nothing left to stop.
        ensure
          @pid = nil
        end
      end

      private

      # Picks a socket file name that does not exist yet.
      def select_socket_file
        @socket_file = random_filename while @socket_file.nil? || File.exist?(@socket_file)
      end

      # NOTE(review): with a relative temp_path this name is checked relative
      # to the current working directory, while the server is spawned with
      # @path as its working directory -- confirm both resolve to the same file.
      def random_filename
        File.join(Cangrejo.config.temp_path, "csocket-#{Random.rand(1000000)}.sock")
      end

      # Polls for the socket file every 0.1s. Gives up as soon as the child
      # has exited, instead of waiting forever for a socket that cannot appear.
      def wait_for_socket
        until File.exist? @socket_file
          if Process.waitpid(@pid, Process::WNOHANG)
            @pid = nil # already reaped; keeps #kill from signalling a dead pid
            raise "crabfarm server exited before creating socket #{@socket_file}"
          end
          sleep 0.1
        end
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cangrejo
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ignacio Baixas
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-01-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rest-client
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.7.2
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.7.2
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: git
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ~>
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.6'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.6'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description:
|
70
|
+
email:
|
71
|
+
- ignacio@platan.us
|
72
|
+
executables: []
|
73
|
+
extensions: []
|
74
|
+
extra_rdoc_files: []
|
75
|
+
files:
|
76
|
+
- lib/cangrejo/configurator.rb
|
77
|
+
- lib/cangrejo/errors.rb
|
78
|
+
- lib/cangrejo/modes/git.rb
|
79
|
+
- lib/cangrejo/modes/local.rb
|
80
|
+
- lib/cangrejo/modes/remote.rb
|
81
|
+
- lib/cangrejo/net/socket_http.rb
|
82
|
+
- lib/cangrejo/net/socket_uri.rb
|
83
|
+
- lib/cangrejo/restclient/json_resource.rb
|
84
|
+
- lib/cangrejo/restclient/request_extensions.rb
|
85
|
+
- lib/cangrejo/session.rb
|
86
|
+
- lib/cangrejo/support/launcher.rb
|
87
|
+
- lib/cangrejo/version.rb
|
88
|
+
- lib/cangrejo.rb
|
89
|
+
homepage: ''
|
90
|
+
licenses:
|
91
|
+
- MIT
|
92
|
+
metadata: {}
|
93
|
+
post_install_message:
|
94
|
+
rdoc_options: []
|
95
|
+
require_paths:
|
96
|
+
- lib
|
97
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
98
|
+
requirements:
|
99
|
+
- - '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - '>='
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0'
|
107
|
+
requirements: []
|
108
|
+
rubyforge_project:
|
109
|
+
rubygems_version: 2.0.14
|
110
|
+
signing_key:
|
111
|
+
specification_version: 4
|
112
|
+
summary: Crabfarm client for ruby
|
113
|
+
test_files: []
|