cangrejo 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/cangrejo.rb +23 -0
- data/lib/cangrejo/configurator.rb +26 -0
- data/lib/cangrejo/errors.rb +7 -0
- data/lib/cangrejo/modes/git.rb +64 -0
- data/lib/cangrejo/modes/local.rb +35 -0
- data/lib/cangrejo/modes/remote.rb +33 -0
- data/lib/cangrejo/net/socket_http.rb +24 -0
- data/lib/cangrejo/net/socket_uri.rb +46 -0
- data/lib/cangrejo/restclient/json_resource.rb +38 -0
- data/lib/cangrejo/restclient/request_extensions.rb +27 -0
- data/lib/cangrejo/session.rb +62 -0
- data/lib/cangrejo/support/launcher.rb +43 -0
- data/lib/cangrejo/version.rb +3 -0
- metadata +113 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b7efa941c84bc31f5de7d5b6da7e8d0a04167ed0
|
4
|
+
data.tar.gz: fca6d854265bc9eb67d7e85a6e13190e8ac0afa2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e0ae8d46ac60dd3b41144a95a1f4d0e1bc04b9aafddeeb927a5c942b7c65a5f71d94668282394a953bd5ee1ab6202d24520e25414c538f8fa0216fb2ea606d1c
|
7
|
+
data.tar.gz: 34065108e6e15336eda42d5fcbf543c537bccd581e4f0114cee0b3571a35973f5ab345cebf56d98a43be067944a10e27fca050f34e96f67072f89fc03816737e
|
data/lib/cangrejo.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require "cangrejo/version"
|
2
|
+
require "cangrejo/errors"
|
3
|
+
require "cangrejo/configurator"
|
4
|
+
require "cangrejo/session"
|
5
|
+
|
6
|
+
# Top-level namespace of the Crabfarm client. Owns the process-wide settings
# object and the entry point used to change it.
module Cangrejo

  # Default settings. Kept in a module-level instance variable (not a class
  # variable) so the state belongs to this module alone.
  @config = OpenStruct.new(
    crabfarm_host: 'http://www.crabfarm.com',
    crawler_cache_path: 'tmp/crawler_cache',
    temp_path: 'tmp',
    hold_by_default: false,
    crawlers: {}
  )

  # Returns the shared OpenStruct holding the current settings.
  def self.config
    @config
  end

  # Yields a Configurator wrapping the shared settings, so the caller's block
  # can change them. Returns whatever the block returns.
  def self.configure
    yield Configurator.new(@config)
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Cangrejo
  # Write-only facade over the settings object; this is what a
  # Cangrejo.configure block receives.
  class Configurator

    # _config is the settings object every writer below forwards to.
    def initialize(_config)
      @config = _config
    end

    # Generates one set_<name>(value) method per simple setting. Each one
    # calls the matching <name>= writer on the wrapped settings object.
    %w(crabfarm_host crawler_cache_path temp_path hold_by_default).each do |setting|
      define_method("set_#{setting}") do |value|
        @config.send("#{setting}=", value)
      end
    end

    # Stores _options as the configuration of the crawler called _name.
    def crawler(_name, _options)
      @config.crawlers[_name] = _options
    end

  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require "git"
|
2
|
+
require "cangrejo/modes/local"
|
3
|
+
|
4
|
+
module Cangrejo
  module Modes
    # Local mode backed by a git checkout: clones the crawler repository into
    # the crawler cache, moves it to the requested commit, runs `bundle install`
    # when the checkout changed, and then starts it like any Local crawler.
    class Git < Local

      # _url::           git remote to clone from
      # _commit::        sha that the cached clone's latest log entry must match
      # _relative_path:: optional sub-directory of the repository that holds
      #                  the crawler; nil or blank means the repository root
      # _name::          directory name inside the cache; defaults to the MD5
      #                  hex digest of the url
      #
      # NOTE(review): Digest is used here but this file does not require it --
      # confirm that it is loaded elsewhere.
      def initialize(_url, _commit, _relative_path, _name=nil)
        @url = _url
        @commit = _commit
        @relative_path = _relative_path
        @needs_bundle = false
        @name = _name || Digest::MD5.hexdigest(@url)
        super deploy_path
      end

      # Makes sure the clone exists, sits on the requested commit and has its
      # bundle installed, then hands over to Local#setup.
      def setup
        ensure_repo_clone
        ensure_repo_commit
        ensure_deps
        super
      end

      private

      # Directory the crawler is launched from.
      def deploy_path
        # Plain-Ruby blank check: `present?` only exists when ActiveSupport is
        # loaded, and this gem does not depend on it.
        if @relative_path.nil? || @relative_path.to_s.strip.empty?
          repo_path
        else
          File.join repo_path, @relative_path
        end
      end

      def cache_path
        Cangrejo.config.crawler_cache_path
      end

      def repo_path
        File.join cache_path, @name
      end

      # Clones the repository on first use and flags the bundle as stale.
      def ensure_repo_clone
        # File.exist? instead of the removed File.exists? alias
        unless File.exist? File.join(repo_path, '.git')
          ::Git.clone(@url, @name, :path => cache_path)
          @needs_bundle = true
        end
      end

      # Fetches and checks out the requested commit when the latest log entry
      # differs from it, flagging the bundle as stale.
      def ensure_repo_commit
        g = ::Git.open repo_path
        if g.log.first.sha != @commit
          g.fetch
          g.checkout @commit
          @needs_bundle = true
        end
      end

      # Runs `bundle install` inside the deploy directory when needed.
      def ensure_deps
        # Argument-array form with :chdir, so the path never goes through a
        # shell. As before, the command's stdout is captured and discarded.
        IO.popen(%w(bundle install), chdir: deploy_path, &:read) if @needs_bundle
        @needs_bundle = false
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require "cangrejo/restclient/request_extensions"
|
2
|
+
require "cangrejo/restclient/json_resource"
|
3
|
+
require "cangrejo/support/launcher"
|
4
|
+
|
5
|
+
module Cangrejo
  module Modes
    # Runs a crawler that lives in a directory on this machine: starts its
    # server through Support::Launcher and talks to it over the launcher's
    # unix socket.
    class Local

      # _path is the directory handed to the launcher.
      def initialize(_path)
        @path = _path
      end

      # Starts the server and returns the JSON resource bound to its
      # '/api/state' endpoint.
      def setup
        init_launcher
        init_rest_client
      end

      # Stops the server started by #setup. Does nothing when #setup has not
      # run yet (for example a held session that was never started), instead
      # of failing on a nil launcher.
      def release
        @launcher.kill unless @launcher.nil?
      end

      private

      def init_launcher
        @launcher = Support::Launcher.new @path
        @launcher.launch
      end

      def init_rest_client
        RestClient::JsonResource.new Net::SocketUri.new(@launcher.host, '/api/state')
      end

    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require "cangrejo/restclient/json_resource"
|
2
|
+
|
3
|
+
module Cangrejo
  module Modes
    # Runs a crawler hosted on the configured crabfarm host: opens a session
    # for the named crawler over HTTP and returns a resource bound to it.
    class Remote

      # _name is the crawler name used in the sessions url.
      def initialize(_name)
        @name = _name
      end

      # Creates a remote session and returns the JSON resource that points at
      # "api/sessions/<id>", where <id> comes from the creation response.
      def setup
        sessions = prepare_resource "api/crawlers/#{@name}/sessions"
        # JsonResource#post serializes its payload itself; passing an already
        # encoded string would send a JSON string ("{}") instead of an object.
        sessions.post({})
        prepare_resource "api/sessions/#{sessions.id}"
      end

      # Nothing is released on the client side in this mode.
      def release
        # nothing
      end

      private

      def prepare_resource(_path)
        RestClient::JsonResource.new URI.join(remote_host, _path)
      end

      def remote_host
        Cangrejo.config.crabfarm_host
      end

    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Net
  # Net::HTTP variant whose connection is a unix domain socket.
  class SocketHttp < HTTP
    # Filesystem path of the socket the server is listening on.
    attr_reader :socket_path

    # _socket_path is the filesystem path of the server's socket. Both
    # arguments are handed on to Net::HTTP's own initializer unchanged.
    def initialize(_socket_path, _dummy=nil)
      @socket_path = _socket_path
      super(_socket_path, _dummy)
    end

    # Opens the unix socket, wraps it in a Net::BufferedIO and then runs
    # the on_connect hook.
    def connect
      unix_socket = UNIXSocket.new(socket_path)
      @socket = Net::BufferedIO.new(unix_socket)
      on_connect
    end

    # Overridden so no address/port text has to be built from the relative
    # URI objects; the socket's file name is reported instead.
    def addr_port
      File.basename socket_path
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module Net
  # Minimal URI-like object addressing a request path served over a unix
  # domain socket. It answers the handful of URI readers defined below, with
  # the socket's filesystem path standing in for the host.
  class SocketUri

    # Filesystem path of the socket (the "unix://" prefix removed).
    attr_reader :host
    alias_method :hostname, :host

    # Request path on the HTTP server.
    attr_reader :path
    alias_method :request_uri, :path

    # _socket_path:: socket location, normally "unix://<filesystem path>".
    #                The prefix is stripped only when it is actually present,
    #                so a bare filesystem path is kept intact.
    # _path::        request path on the HTTP server
    def initialize(_socket_path, _path)
      @socket_path = _socket_path.sub(/\Aunix:\/\//, '')
      @host = @socket_path
      @path = _path
    end

    def empty?
      false
    end

    def port
      nil
    end

    def user
      nil
    end

    def password
      nil
    end

    # "<socket path>:<request path>". The request path is now interpolated;
    # previously the literal text "@path" was emitted.
    def to_s
      "#{@socket_path}:#{@path}"
    end

  end
end
|
@@ -0,0 +1,38 @@
|
|
1
|
+
require "json"
require "rest_client"
|
2
|
+
|
3
|
+
module RestClient

  # Resource that speaks JSON: payloads are serialized with to_json, every
  # response body is parsed with JSON.parse and remembered, and the top-level
  # keys of the last parsed response can be read as methods.
  class JsonResource < Resource

    # GET with an 'Accept' => 'json' header added. Returns the parsed body.
    # The caller's header hash is left untouched.
    def get(additional_headers={}, &block)
      r = super additional_headers.merge('Accept' => 'json'), &block
      @state = JSON.parse r
    end

    # POST payload (serialized with to_json). Returns the parsed body.
    def post(payload, additional_headers={}, &block)
      r = super payload.to_json, decorate_headers(additional_headers), &block
      @state = JSON.parse r
    end

    # PUT payload (serialized with to_json). Returns the parsed body.
    def put(payload, additional_headers={}, &block)
      r = super payload.to_json, decorate_headers(additional_headers), &block
      @state = JSON.parse r
    end

    private

    # Returns a copy of _headers with the JSON content type set, so the hash
    # passed in by the caller is not modified.
    def decorate_headers(_headers)
      _headers.merge('Content-Type' => 'application/json')
    end

    # True when the last parsed response is a Hash holding _name as a key.
    def state_key?(_name)
      @state.is_a?(Hash) && @state.has_key?(_name.to_s)
    end

    # Exposes the keys of the last parsed response as reader methods.
    def method_missing(_method, *_args, &_block)
      if state_key? _method
        @state[_method.to_s]
      else super end
    end

    # Keeps respond_to? in agreement with method_missing.
    def respond_to_missing?(_method, _include_private=false)
      state_key?(_method) || super
    end

  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require "cangrejo/net/socket_http"
|
2
|
+
require "cangrejo/net/socket_uri"
|
3
|
+
require "rest_client"
|
4
|
+
|
5
|
+
module RestClient
  # Overrides prepended onto RestClient::Request so a request may be addressed
  # with a non-String url object: such objects skip the String-based url
  # handling, and a Net::SocketUri selects Net::SocketHttp as the HTTP class.
  module RequestExtensions

    # Non-String urls are returned untouched; Strings take the regular path.
    def process_url_params(_url, _headers)
      return _url unless _url.is_a? String
      super(_url, _headers)
    end

    # Non-String urls are returned untouched; Strings take the regular path.
    def parse_url(_url)
      return _url unless _url.is_a? String
      super(_url)
    end

    # Net::SocketHttp for socket urls, the regular class otherwise.
    def net_http_class
      return Net::SocketHttp if url.is_a? Net::SocketUri
      super
    end

  end

  class Request
    prepend RequestExtensions
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require "ostruct"
|
2
|
+
require "cangrejo/modes/remote"
|
3
|
+
require "cangrejo/modes/git"
|
4
|
+
require "cangrejo/modes/local"
|
5
|
+
|
6
|
+
module Cangrejo
  # A crawling session against one named crawler. The options decide how the
  # crawler is reached: a :path runs it locally, a :git_remote runs it from a
  # git checkout, anything else goes to the remote crabfarm host.
  class Session
    # NOTE(review): Forwardable is mixed in but no delegators are declared in
    # this class, and 'forwardable' is not required by this file -- confirm it
    # is still needed and loaded elsewhere.
    include Forwardable

    # Document of the last #crawl wrapped in an OpenStruct (nil before that).
    attr_reader :doc

    # _name::    crawler name; also the key into Cangrejo.config.crawlers
    # _options:: per-session overrides merged over the crawler's configured
    #            options. Unless :hold is truthy the session starts at once.
    def initialize(_name, _options={})
      @name = _name
      options = Cangrejo.config.crawlers.fetch(_name, {}).merge _options
      select_mode options
      # Read :hold from the merged options so a crawler-level setting is
      # honoured too, not only one passed straight to the constructor.
      start unless options.fetch :hold, Cangrejo.config.hold_by_default
    end

    # Sets the selected mode up. Raises ConfigurationError when the session is
    # already running. Returns self.
    def start
      raise ConfigurationError, 'Session already started' unless @rest.nil?
      @rest = @mode.setup
      self
    end

    # 'name' entry of the last response.
    def state_name
      ensure_started
      @rest.name
    end

    # 'params' entry of the last response.
    def state_params
      ensure_started
      @rest.params
    end

    # 'doc' entry of the last response, unwrapped.
    def raw_doc
      ensure_started
      @rest.doc
    end

    # Sends { name: _state, params: _params } to the crawler with PUT and
    # stores the resulting document in #doc. Raises ConfigurationError when
    # the session has not been started. Returns self.
    def crawl(_state, _params={})
      ensure_started
      @rest.put(name: _state, params: _params)
      @doc = OpenStruct.new @rest.doc
      self
    end

    # Releases whatever the mode holds. Returns self.
    def release
      @mode.release
      self
    end

    private

    # Shared guard so every state reader fails the same way as #crawl instead
    # of hitting a NoMethodError on nil.
    def ensure_started
      raise ConfigurationError, 'Session not started' if @rest.nil?
    end

    def select_mode(_options)
      @mode = if _options.has_key? :path
        Modes::Local.new _options[:path]
      elsif _options.has_key? :git_remote
        Modes::Git.new _options[:git_remote], _options[:git_commit], _options[:relative_path], @name
      else
        Modes::Remote.new @name
      end
    end

  end
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
module Cangrejo
  module Support
    # Starts a crabfarm server for a crawler directory, listening on a freshly
    # chosen unix socket, and stops it again on request.
    class Launcher

      # _path is the crawler directory the server is started in.
      def initialize(_path)
        @path = _path
        select_socket_file
      end

      # Socket location in the "unix://<path>" form passed to the server.
      def host
        "unix://#{@socket_file}"
      end

      # Spawns the server inside the crawler directory and blocks until its
      # socket file exists. Raises RuntimeError if the server exits first.
      def launch
        gem_path = File.join(@path, 'Gemfile')
        # TODO: for some reason, the gemfile path must be specified here, maybe because of rbenv?
        # Argument-list form: the socket path is passed as a single argument
        # whatever characters it contains.
        @pid = Process.spawn({ 'BUNDLE_GEMFILE' => gem_path }, 'bin/crabfarm', 's', '--host', host, chdir: @path)
        wait_for_socket
      end

      # Interrupts the server and waits for it to exit. Safe to call before
      # #launch, more than once, or after the server has already gone away.
      def kill
        return if @pid.nil?
        Process.kill 'INT', @pid
        Process.wait @pid
      rescue Errno::ESRCH, Errno::ECHILD
        # the process no longer exists or was already reaped
        nil
      ensure
        @pid = nil
      end

      private

      def select_socket_file
        @socket_file = random_filename while @socket_file.nil? || File.exist?(@socket_file)
      end

      # Absolute path: the server runs with its working directory changed to
      # the crawler directory, so a relative name would point at a different
      # file there than it does in this process.
      def random_filename
        File.expand_path File.join(Cangrejo.config.temp_path, "csocket-#{Random.rand(1000000)}.sock")
      end

      def wait_for_socket
        until File.exist? @socket_file
          # Give up if the server died before it could create the socket,
          # instead of polling forever.
          if Process.wait(@pid, Process::WNOHANG)
            @pid = nil
            raise "crabfarm server exited before creating #{@socket_file}"
          end
          sleep 0.1
        end
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cangrejo
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Ignacio Baixas
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-01-02 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rest-client
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.7.2
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 1.7.2
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: git
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: bundler
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ~>
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.6'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '1.6'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description:
|
70
|
+
email:
|
71
|
+
- ignacio@platan.us
|
72
|
+
executables: []
|
73
|
+
extensions: []
|
74
|
+
extra_rdoc_files: []
|
75
|
+
files:
|
76
|
+
- lib/cangrejo/configurator.rb
|
77
|
+
- lib/cangrejo/errors.rb
|
78
|
+
- lib/cangrejo/modes/git.rb
|
79
|
+
- lib/cangrejo/modes/local.rb
|
80
|
+
- lib/cangrejo/modes/remote.rb
|
81
|
+
- lib/cangrejo/net/socket_http.rb
|
82
|
+
- lib/cangrejo/net/socket_uri.rb
|
83
|
+
- lib/cangrejo/restclient/json_resource.rb
|
84
|
+
- lib/cangrejo/restclient/request_extensions.rb
|
85
|
+
- lib/cangrejo/session.rb
|
86
|
+
- lib/cangrejo/support/launcher.rb
|
87
|
+
- lib/cangrejo/version.rb
|
88
|
+
- lib/cangrejo.rb
|
89
|
+
homepage: ''
|
90
|
+
licenses:
|
91
|
+
- MIT
|
92
|
+
metadata: {}
|
93
|
+
post_install_message:
|
94
|
+
rdoc_options: []
|
95
|
+
require_paths:
|
96
|
+
- lib
|
97
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
98
|
+
requirements:
|
99
|
+
- - '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
103
|
+
requirements:
|
104
|
+
- - '>='
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0'
|
107
|
+
requirements: []
|
108
|
+
rubyforge_project:
|
109
|
+
rubygems_version: 2.0.14
|
110
|
+
signing_key:
|
111
|
+
specification_version: 4
|
112
|
+
summary: Crabfarm client for ruby
|
113
|
+
test_files: []
|