cangrejo 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b7efa941c84bc31f5de7d5b6da7e8d0a04167ed0
4
+ data.tar.gz: fca6d854265bc9eb67d7e85a6e13190e8ac0afa2
5
+ SHA512:
6
+ metadata.gz: e0ae8d46ac60dd3b41144a95a1f4d0e1bc04b9aafddeeb927a5c942b7c65a5f71d94668282394a953bd5ee1ab6202d24520e25414c538f8fa0216fb2ea606d1c
7
+ data.tar.gz: 34065108e6e15336eda42d5fcbf543c537bccd581e4f0114cee0b3571a35973f5ab345cebf56d98a43be067944a10e27fca050f34e96f67072f89fc03816737e
@@ -0,0 +1,23 @@
1
+ require "cangrejo/version"
2
+ require "cangrejo/errors"
3
+ require "cangrejo/configurator"
4
+ require "cangrejo/session"
5
+
6
# Top-level namespace of the Crabfarm client. It also owns the library-wide
# settings object that the rest of the gem reads through Cangrejo.config.
module Cangrejo
  # Default settings. They live in a module-level instance variable rather
  # than a class variable, so the state is not shared with anything that
  # includes this module.
  @config = OpenStruct.new(
    crabfarm_host: 'http://www.crabfarm.com',
    crawler_cache_path: 'tmp/crawler_cache',
    temp_path: 'tmp',
    hold_by_default: false,
    crawlers: {}
  )

  # Returns the OpenStruct that holds the current settings. The same object
  # is returned on every call, so changes made through it are global.
  def self.config
    @config
  end

  # Yields a Configurator wrapping the global settings to the given block and
  # returns whatever the block returns.
  def self.configure
    yield Configurator.new(@config)
  end
end
@@ -0,0 +1,26 @@
1
module Cangrejo
  # Object yielded by Cangrejo.configure. Every call made on it is written
  # straight into the settings object handed to the constructor.
  class Configurator
    # Generates one +set_<name>+ method per simple setting; each forwards the
    # given value to the matching +<name>=+ writer on the settings object.
    %w[crabfarm_host crawler_cache_path temp_path hold_by_default].each do |setting|
      define_method("set_#{setting}") do |value|
        @config.send("#{setting}=", value)
      end
    end

    # _config is the settings object to write into.
    def initialize(_config)
      @config = _config
    end

    # Stores _options under _name in the settings' crawlers table, replacing
    # any previous entry for that name.
    def crawler(_name, _options)
      @config.crawlers[_name] = _options
    end
  end
end
@@ -0,0 +1,7 @@
1
module Cangrejo
  # Root of the gem's exception hierarchy.
  Error = Class.new(StandardError)

  # Raised by the gem for configuration and usage mistakes; a kind of Error.
  ConfigurationError = Class.new(Error)
end
@@ -0,0 +1,64 @@
1
+ require "git"
2
+ require "cangrejo/modes/local"
3
+
4
require "digest" # Digest::MD5 is used below to derive a default checkout name

module Cangrejo
  module Modes
    # Local mode whose crawler lives in a git repository. Before delegating to
    # Local#setup it clones the repository into the configured crawler cache,
    # checks out the requested commit and runs `bundle install` when the
    # checkout changed.
    class Git < Local

      # _url           - repository url handed to ::Git.clone.
      # _commit        - value compared against the sha of the newest log entry.
      # _relative_path - optional sub directory of the repository that holds
      #                  the crawler; nil or blank means the repository root.
      # _name          - directory name used inside the cache; defaults to the
      #                  MD5 hex digest of the url.
      def initialize(_url, _commit, _relative_path, _name=nil)
        @url = _url
        @commit = _commit
        @relative_path = _relative_path
        @needs_bundle = false
        # to_s so that a Symbol crawler name can be used in File.join below.
        @name = (_name || Digest::MD5.hexdigest(@url)).to_s
        super deploy_path
      end

      # Makes sure clone, commit and dependencies are in place, then launches
      # the crawler through Local#setup and returns its result.
      def setup
        ensure_repo_clone
        ensure_repo_commit
        ensure_deps
        super
      end

      private

      # Directory handed to Local as the crawler path.
      def deploy_path
        # Plain-Ruby blank check; `present?` is only available with ActiveSupport.
        return repo_path if @relative_path.nil? || @relative_path.to_s.strip.empty?

        File.join repo_path, @relative_path
      end

      def cache_path
        Cangrejo.config.crawler_cache_path
      end

      def repo_path
        File.join cache_path, @name
      end

      # Clones the repository unless a .git directory is already present.
      def ensure_repo_clone
        return if File.exist? File.join(repo_path, '.git')

        ::Git.clone(@url, @name, :path => cache_path)
        @needs_bundle = true
      end

      # Fetches and checks out @commit when it differs from the sha of the
      # newest log entry.
      # NOTE(review): the comparison is against a full sha, so a branch name or
      # abbreviated sha in @commit presumably triggers a fetch on every setup.
      def ensure_repo_commit
        g = ::Git.open repo_path
        return if g.log.first.sha == @commit

        g.fetch
        g.checkout @commit
        @needs_bundle = true
      end

      # Installs the crawler's gems after a fresh clone or a checkout change.
      def ensure_deps
        if @needs_bundle
          # Argument-vector form plus :chdir avoids building a shell command
          # out of the path; stdout is discarded as the backtick form did.
          system('bundle', 'install', chdir: deploy_path, out: File::NULL)
        end
        @needs_bundle = false
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ require "cangrejo/restclient/request_extensions"
2
+ require "cangrejo/restclient/json_resource"
3
+ require "cangrejo/support/launcher"
4
+
5
module Cangrejo
  module Modes
    # Runs a crawler that lives in a local directory: a Support::Launcher
    # starts it and the returned resource talks to it.
    class Local

      # _path is the crawler directory handed to the launcher.
      def initialize(_path)
        @path = _path
      end

      # Launches the crawler and returns a RestClient::JsonResource addressed
      # at '/api/state' on the launcher's host.
      def setup
        init_launcher
        init_rest_client
      end

      # Stops the launched crawler. Does nothing (and returns nil) when no
      # launcher is active, i.e. before setup or after a previous release.
      def release
        # Forget the launcher first so that a repeated release cannot signal
        # the same process twice.
        launcher, @launcher = @launcher, nil
        launcher.kill if launcher
      end

      private

      def init_launcher
        @launcher = Support::Launcher.new @path
        @launcher.launch
      end

      def init_rest_client
        RestClient::JsonResource.new Net::SocketUri.new(@launcher.host, '/api/state')
      end

    end
  end
end
@@ -0,0 +1,33 @@
1
+ require "cangrejo/restclient/json_resource"
2
+
3
module Cangrejo
  module Modes
    # Runs a crawler on the configured crabfarm host instead of a local process.
    class Remote

      # _name is the crawler name used to build the sessions url.
      def initialize(_name)
        @name = _name
      end

      # Creates a session for the crawler on the remote host and returns a
      # resource pointing at that session.
      def setup
        sessions = prepare_resource "api/crawlers/#{@name}/sessions"
        # JsonResource#post serialises the payload itself, so the hash is
        # passed as is; pre-encoding it would send the JSON string "{}".
        sessions.post({})
        prepare_resource "api/sessions/#{sessions.id}"
      end

      # Nothing is torn down on the client side for remote sessions.
      def release
        # nothing
      end

      private

      # Builds a JSON resource for _path resolved against the crabfarm host.
      def prepare_resource(_path)
        RestClient::JsonResource.new URI.join(remote_host, _path)
      end

      def remote_host
        Cangrejo.config.crabfarm_host
      end

    end
  end
end
@@ -0,0 +1,24 @@
1
module Net
  # Net::HTTP subclass whose connect step opens a unix domain socket instead
  # of the connection Net::HTTP would normally make.
  class SocketHttp < HTTP
    attr_reader :socket_path

    # _socket_path is the filesystem path of the socket the server listens on.
    # _dummy fills the second positional slot of the parent constructor; both
    # values are passed on to it unchanged.
    def initialize(_socket_path, _dummy=nil)
      @socket_path = _socket_path
      super(_socket_path, _dummy)
    end

    # Opens the unix socket, wraps it in the buffered IO object used for the
    # connection and then runs the on_connect hook.
    def connect
      unix_socket = UNIXSocket.new(socket_path)
      @socket = Net::BufferedIO.new(unix_socket)
      on_connect
    end

    # Reports the socket file's base name in place of an "address:port" pair,
    # which prevents errors when relative URI objects are concatenated.
    def addr_port
      File.basename(socket_path)
    end
  end
end
@@ -0,0 +1,46 @@
1
module Net
  # Minimal URI look-alike that addresses a request path on a server reached
  # through a unix domain socket. It answers the reader methods defined below
  # so it can stand in where a URI object is expected.
  class SocketUri

    # Request path on the server, as given to the constructor.
    attr_reader :path

    # _socket_path - socket location, with or without a leading "unix://".
    # _path        - request path on the server.
    def initialize(_socket_path, _path)
      # Strip the scheme only when it is really there; slicing off a fixed
      # seven characters would mangle a bare filesystem path.
      @socket_path = _socket_path.to_s.sub(%r{\Aunix://}, '')
      @path = _path
    end

    # The socket's filesystem path takes the place of the host name.
    def host
      @socket_path
    end
    alias_method :hostname, :host

    # Same value as #path.
    alias_method :request_uri, :path

    def empty?
      false
    end

    # Port and credentials are always nil.
    def port
      nil
    end

    def user
      nil
    end

    def password
      nil
    end

    # "<socket path>:<request path>"
    def to_s
      "#{@socket_path}:#{@path}"
    end

  end
end
@@ -0,0 +1,38 @@
1
+ require "rest_client"
2
+
3
require "json" # JSON.parse and to_json are used below

module RestClient

  # Resource that sends and receives JSON. The parsed body of the most recent
  # response is kept, and when it is a Hash its top-level keys can be read as
  # methods on the resource (e.g. +resource.id+).
  class JsonResource < Resource

    # Performs a GET with an 'Accept' header of 'json', stores the parsed
    # response and returns it. The caller's header hash is left untouched.
    def get(additional_headers={}, &block)
      r = super additional_headers.merge('Accept' => 'json'), &block
      @state = JSON.parse r
    end

    # Serialises payload with to_json, POSTs it with a JSON content type,
    # stores the parsed response and returns it.
    def post(payload, additional_headers={}, &block)
      r = super payload.to_json, decorate_headers(additional_headers), &block
      @state = JSON.parse r
    end

    # Same as #post, but using PUT.
    def put(payload, additional_headers={}, &block)
      r = super payload.to_json, decorate_headers(additional_headers), &block
      @state = JSON.parse r
    end

    private

    # Returns a copy of _headers with the JSON content type set, so the hash
    # supplied by the caller is not modified.
    def decorate_headers(_headers)
      _headers.merge('Content-Type' => 'application/json')
    end

    # Exposes the keys of the last parsed response as reader methods.
    def method_missing(_method, *_args, &_block)
      key = _method.to_s
      if @state.is_a?(Hash) && @state.has_key?(key)
        @state[key]
      else
        super
      end
    end

    # Keeps respond_to? consistent with the readers served by method_missing.
    def respond_to_missing?(_method, _include_private=false)
      (@state.is_a?(Hash) && @state.has_key?(_method.to_s)) || super
    end

  end

end
@@ -0,0 +1,27 @@
1
+ require "cangrejo/net/socket_http"
2
+ require "cangrejo/net/socket_uri"
3
+ require "rest_client"
4
+
5
module RestClient
  # Overrides prepended to RestClient::Request. Url values that are not
  # Strings skip the regular url handling, and requests whose url is a
  # Net::SocketUri use Net::SocketHttp.
  module RequestExtensions

    # Non-String urls are returned untouched; Strings take the regular path.
    def process_url_params(_url, _headers)
      return _url unless _url.is_a? String

      super(_url, _headers)
    end

    # Non-String urls are returned untouched; Strings take the regular path.
    def parse_url(_url)
      return _url unless _url.is_a? String

      super(_url)
    end

    # Picks the socket-based HTTP class when the request url is a
    # Net::SocketUri, otherwise defers to the regular choice.
    def net_http_class
      return Net::SocketHttp if url.is_a? Net::SocketUri

      super
    end

  end

  class Request
    prepend RequestExtensions
  end
end
@@ -0,0 +1,62 @@
1
+ require "ostruct"
2
+ require "cangrejo/modes/remote"
3
+ require "cangrejo/modes/git"
4
+ require "cangrejo/modes/local"
5
+
6
require "forwardable" # the Forwardable mixin below is otherwise only loaded by chance

module Cangrejo
  # A crawling session against one crawler. Depending on the options it runs
  # the crawler from a local path, from a git checkout or on the remote
  # crabfarm host.
  class Session
    include Forwardable

    # OpenStruct built from the document returned by the last #crawl.
    attr_reader :doc

    # _name    - crawler name; also the key into Cangrejo.config.crawlers.
    # _options - per-session options, merged over the options configured for
    #            the crawler. :path selects local mode, :git_remote (with
    #            :git_commit and :relative_path) selects git mode, anything
    #            else remote mode. A truthy :hold defers #start.
    def initialize(_name, _options={})
      @name = _name
      options = Cangrejo.config.crawlers.fetch(_name, {}).merge _options
      select_mode options
      # Read :hold from the merged options so that a value configured for the
      # crawler is honoured, just like every other option.
      start unless options.fetch :hold, Cangrejo.config.hold_by_default
    end

    # Sets up the selected mode. Raises ConfigurationError when the session is
    # already running. Returns self.
    def start
      raise ConfigurationError, 'Session already started' unless @rest.nil?
      @rest = @mode.setup
      self
    end

    def state_name
      @rest.name
    end

    def state_params
      @rest.params
    end

    def raw_doc
      @rest.doc
    end

    # Moves the crawler to _state with _params and stores the resulting
    # document in #doc. Raises ConfigurationError when the session has not
    # been started. Returns self.
    def crawl(_state, _params={})
      raise ConfigurationError, 'Session not started' if @rest.nil?
      @rest.put(name: _state, params: _params)
      @doc = OpenStruct.new @rest.doc
      self
    end

    # Releases the mode's resources and marks the session as stopped, so it
    # can be started again and #crawl reports 'Session not started' instead
    # of talking to a resource that was torn down. Returns self.
    def release
      @mode.release
      @rest = nil
      self
    end

    private

    def select_mode(_options)
      @mode = if _options.has_key? :path
        Modes::Local.new _options[:path]
      elsif _options.has_key? :git_remote
        Modes::Git.new _options[:git_remote], _options[:git_commit], _options[:relative_path], @name
      else
        Modes::Remote.new @name
      end
    end

  end
end
@@ -0,0 +1,43 @@
1
module Cangrejo
  module Support
    # Starts a crawler's `bin/crabfarm` server as a child process listening on
    # a freshly chosen unix socket, and stops it again.
    class Launcher

      # _path is the crawler directory; the server is spawned inside it.
      def initialize(_path)
        @path = _path
        select_socket_file
      end

      # Address the server is told to listen on ("unix://" + socket file).
      def host
        "unix://#{@socket_file}"
      end

      # Spawns the server and blocks until its socket file shows up. Raises
      # Cangrejo::Error if the server exits before creating it.
      def launch
        gem_path = File.join(@path, 'Gemfile')
        # TODO: for some reason, the gemfile path must be specified here, maybe because of rbenv?
        # Argument-vector form: no shell is involved, so spaces in the socket
        # path cannot split the command line.
        @pid = Process.spawn({ 'BUNDLE_GEMFILE' => gem_path }, 'bin/crabfarm', 's', '--host', host, chdir: @path)
        wait_for_socket
      end

      # Interrupts the server and waits for it to exit. Does nothing when no
      # server is running; a process that is already gone is ignored.
      def kill
        return if @pid.nil?

        pid, @pid = @pid, nil
        begin
          Process.kill 'INT', pid
          Process.wait pid
        rescue Errno::ESRCH, Errno::ECHILD
          nil
        end
      end

      private

      # Picks a socket file name that does not exist yet.
      def select_socket_file
        @socket_file = random_filename while @socket_file.nil? || File.exist?(@socket_file)
      end

      # Absolute path inside the configured temp directory. It has to be
      # absolute because the server runs with @path as its working directory
      # while this process checks for the file from its own.
      def random_filename
        File.expand_path File.join(Cangrejo.config.temp_path, "csocket-#{Random.rand(1000000)}.sock")
      end

      # Polls for the socket file, giving up as soon as the child has exited
      # instead of waiting forever.
      def wait_for_socket
        until File.exist? @socket_file
          if Process.wait(@pid, Process::WNOHANG)
            @pid = nil # already reaped; nothing left for #kill to do
            raise Error, "crabfarm exited before creating #{@socket_file}"
          end
          sleep 0.1
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Cangrejo
  # Version string of the cangrejo gem.
  VERSION = "0.0.1"
end
metadata ADDED
@@ -0,0 +1,113 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cangrejo
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Ignacio Baixas
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-01-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rest-client
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 1.7.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 1.7.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: git
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.6'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.6'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description:
70
+ email:
71
+ - ignacio@platan.us
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - lib/cangrejo/configurator.rb
77
+ - lib/cangrejo/errors.rb
78
+ - lib/cangrejo/modes/git.rb
79
+ - lib/cangrejo/modes/local.rb
80
+ - lib/cangrejo/modes/remote.rb
81
+ - lib/cangrejo/net/socket_http.rb
82
+ - lib/cangrejo/net/socket_uri.rb
83
+ - lib/cangrejo/restclient/json_resource.rb
84
+ - lib/cangrejo/restclient/request_extensions.rb
85
+ - lib/cangrejo/session.rb
86
+ - lib/cangrejo/support/launcher.rb
87
+ - lib/cangrejo/version.rb
88
+ - lib/cangrejo.rb
89
+ homepage: ''
90
+ licenses:
91
+ - MIT
92
+ metadata: {}
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - '>='
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.0.14
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: Crabfarm client for ruby
113
+ test_files: []