cangrejo 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b7efa941c84bc31f5de7d5b6da7e8d0a04167ed0
4
+ data.tar.gz: fca6d854265bc9eb67d7e85a6e13190e8ac0afa2
5
+ SHA512:
6
+ metadata.gz: e0ae8d46ac60dd3b41144a95a1f4d0e1bc04b9aafddeeb927a5c942b7c65a5f71d94668282394a953bd5ee1ab6202d24520e25414c538f8fa0216fb2ea606d1c
7
+ data.tar.gz: 34065108e6e15336eda42d5fcbf543c537bccd581e4f0114cee0b3571a35973f5ab345cebf56d98a43be067944a10e27fca050f34e96f67072f89fc03816737e
@@ -0,0 +1,23 @@
1
+ require "cangrejo/version"
2
+ require "cangrejo/errors"
3
+ require "cangrejo/configurator"
4
+ require "cangrejo/session"
5
+
require "ostruct"

# Top-level namespace of the gem. Owns the single, process-wide configuration
# object and the entry point used to change it.
module Cangrejo

  # Default settings. Kept in a module-level instance variable (rather than a
  # class variable) so the state belongs to this module only.
  @config = OpenStruct.new(
    crabfarm_host: 'http://www.crabfarm.com',
    crawler_cache_path: 'tmp/crawler_cache',
    temp_path: 'tmp',
    hold_by_default: false,
    crawlers: {}
  )

  # Returns the shared configuration object (an OpenStruct holding the
  # settings listed above).
  def self.config
    @config
  end

  # Yields a Configurator wrapping the shared configuration so the caller can
  # change settings and register crawlers. Returns whatever the block returns.
  def self.configure
    yield Configurator.new(@config)
  end
end
@@ -0,0 +1,26 @@
module Cangrejo
  # Small facade handed to the block given to Cangrejo.configure. It exposes
  # one `set_<name>` method per scalar setting plus `crawler` to register
  # per-crawler options on the wrapped configuration object.
  class Configurator

    # Settings that get a generated `set_<name>(value)` writer.
    SETTINGS = %i[
      crabfarm_host
      crawler_cache_path
      temp_path
      hold_by_default
    ].freeze

    SETTINGS.each do |name|
      define_method("set_#{name}") do |value|
        # public_send: only the public `<name>=` writer of the config object
        # is meant to be reached from here.
        @config.public_send("#{name}=", value)
      end
    end

    # _config - object holding the settings; it must respond to `<name>=` for
    #           every entry in SETTINGS and expose a Hash-like `crawlers`.
    def initialize(_config)
      @config = _config
    end

    # Registers the options for the crawler called _name, replacing any
    # previous entry stored under the same key.
    #
    # _name    - key under which the options are stored.
    # _options - options for that crawler (defaults to an empty Hash).
    #
    # Returns the stored options.
    def crawler(_name, _options = {})
      @config.crawlers[_name] = _options
    end

  end

end
@@ -0,0 +1,7 @@
module Cangrejo

  # Root of the gem's exception hierarchy; rescue this to catch any error
  # class declared by the gem.
  Error = Class.new(StandardError)

  # Specialization of Error used for configuration / usage problems.
  ConfigurationError = Class.new(Error)

end
@@ -0,0 +1,64 @@
1
+ require "git"
2
+ require "cangrejo/modes/local"
3
+
require "digest"

module Cangrejo
  module Modes
    # Local mode whose crawler source comes from a git repository. The
    # repository is cloned under the configured crawler cache path, moved to
    # the requested commit, bundled when something changed, and then started
    # through the Local mode setup.
    class Git < Local

      # _url           - url of the git repository to clone.
      # _commit        - sha compared against the newest log entry; checked
      #                  out when they differ.
      # _relative_path - optional sub directory of the repository holding the
      #                  crawler; nil or blank means the repository root.
      # _name          - directory name used inside the cache; defaults to the
      #                  MD5 hex digest of the url.
      def initialize(_url, _commit, _relative_path, _name=nil)
        @url = _url
        @commit = _commit
        @relative_path = _relative_path
        @needs_bundle = false
        @name = _name || Digest::MD5.hexdigest(@url)
        super deploy_path
      end

      # Makes sure the repository is cloned, at the right commit and bundled,
      # then delegates to Local#setup and returns its result.
      def setup
        ensure_repo_clone
        ensure_repo_commit
        ensure_deps
        super
      end

    private

      # Directory the crawler is launched from.
      def deploy_path
        # Plain-Ruby blank check: `present?` is an ActiveSupport extension and
        # ActiveSupport is not a dependency of this gem.
        return repo_path if @relative_path.nil? || @relative_path.to_s.strip.empty?
        File.join repo_path, @relative_path
      end

      def cache_path
        Cangrejo.config.crawler_cache_path
      end

      def repo_path
        File.join cache_path, @name
      end

      # Clones the repository unless a `.git` directory is already present in
      # the cache, flagging that dependencies must be installed.
      def ensure_repo_clone
        # File.exist? (not the removed File.exists? alias).
        return if File.exist? File.join(repo_path, '.git')
        ::Git.clone(@url, @name, :path => cache_path)
        @needs_bundle = true
      end

      # Fetches and checks out @commit when the newest log entry has a
      # different sha, flagging that dependencies must be reinstalled.
      def ensure_repo_commit
        repo = ::Git.open repo_path
        return if repo.log.first.sha == @commit
        repo.fetch
        repo.checkout @commit
        @needs_bundle = true
      end

      # Runs `bundle install` inside the deploy path when flagged, then clears
      # the flag.
      def ensure_deps
        # Argument-array form with :chdir so the path never goes through a
        # shell (a quote or space in it cannot break the command). Output is
        # read and discarded.
        IO.popen(%w[bundle install], chdir: deploy_path, &:read) if @needs_bundle
        @needs_bundle = false
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ require "cangrejo/restclient/request_extensions"
2
+ require "cangrejo/restclient/json_resource"
3
+ require "cangrejo/support/launcher"
4
+
module Cangrejo
  module Modes
    # Mode that runs the crawler from a directory on this machine: a
    # Support::Launcher is started for the path and requests go to the
    # launcher's host through a RestClient::JsonResource.
    class Local

      # _path - directory handed to the launcher.
      def initialize(_path)
        @path = _path
      end

      # Starts the launcher and returns the JSON resource pointing at
      # '/api/state' on the launcher's host.
      def setup
        init_launcher
        init_rest_client
      end

      # Stops the launcher started by #setup. Does nothing (and returns nil)
      # when no launcher is active, so it is safe to call before #setup or
      # more than once.
      def release
        return if @launcher.nil?
        # Forget the launcher before killing it so a repeated release cannot
        # signal the same process twice.
        launcher, @launcher = @launcher, nil
        launcher.kill
      end

    private

      def init_launcher
        @launcher = Support::Launcher.new @path
        @launcher.launch
      end

      def init_rest_client
        RestClient::JsonResource.new Net::SocketUri.new(@launcher.host, '/api/state')
      end

    end
  end
end
@@ -0,0 +1,33 @@
1
+ require "cangrejo/restclient/json_resource"
2
+
module Cangrejo
  module Modes
    # Mode that uses a crawler hosted on the configured crabfarm host: a
    # session is created through the HTTP api and later requests go to that
    # session's resource.
    class Remote

      # _name - crawler name, used to build the sessions url.
      def initialize(_name)
        @name = _name
      end

      # Creates a session for the crawler and returns the JSON resource for
      # the new session.
      def setup
        sessions = prepare_resource "api/crawlers/#{@name}/sessions"
        # JsonResource#post serializes the payload with to_json itself, so the
        # raw Hash is passed; pre-encoding it would send a JSON string
        # ("{}" in quotes) instead of an empty object.
        sessions.post({})
        prepare_resource "api/sessions/#{sessions.id}"
      end

      # Nothing to release on this side.
      def release
        # nothing
      end

    private

      # Builds a JSON resource for _path relative to the remote host.
      def prepare_resource(_path)
        RestClient::JsonResource.new URI.join(remote_host, _path)
      end

      def remote_host
        Cangrejo.config.crabfarm_host
      end

    end
  end
end
@@ -0,0 +1,24 @@
require "net/http"
require "socket"

module Net
  # Net::HTTP variant whose #connect opens a unix domain socket instead of a
  # TCP connection.
  class SocketHttp < HTTP
    attr_reader :socket_path

    # _socket_path - filesystem path of the socket the server listens on.
    # _dummy       - unused here; both arguments are forwarded unchanged to
    #                Net::HTTP#initialize by the zero-argument super call.
    def initialize(_socket_path, _dummy=nil)
      @socket_path = _socket_path
      super
    end

    # Creates the socket object from the unix socket at socket_path.
    def connect
      @socket = Net::BufferedIO.new UNIXSocket.new socket_path
      on_connect
    end

    # Override to prevent errors concatenating relative URI objects; returns
    # the socket file's base name.
    def addr_port
      File.basename(socket_path)
    end
  end
end
@@ -0,0 +1,46 @@
module Net
  # Minimal stand-in for a URI object that addresses a path served over a
  # unix domain socket. It answers the URI-style readers (host, port, path,
  # user, ...) with the socket file as host.
  class SocketUri

    # Scheme prefix removed from the socket location given to #initialize.
    SCHEME_PREFIX = /\Aunix:\/\//

    # Request path on the server.
    attr_reader :path

    # _socket_path - socket location, normally prefixed with "unix://"; the
    #                prefix is removed when present and the value is kept
    #                as-is otherwise.
    # _path        - request path on the server.
    def initialize(_socket_path, _path)
      @socket_path = _socket_path.sub(SCHEME_PREFIX, '')
      @path = _path
    end

    # Filesystem path of the socket.
    def hostname
      @socket_path
    end

    alias_method :host, :hostname

    # Same value as #path.
    def request_uri
      @path
    end

    def empty?
      false
    end

    # The readers below have no meaning for a socket address.
    def port
      nil
    end

    def user
      nil
    end

    def password
      nil
    end

    # Readable form: socket path and request path joined by "::".
    def to_s
      "#{@socket_path}::#{@path}"
    end

  end
end
@@ -0,0 +1,38 @@
1
+ require "rest_client"
2
+
require "json"

module RestClient

  # Resource that sends and receives JSON. The parsed body of the last
  # response is kept as the resource state, and the top-level keys of that
  # state can be read as methods (resource.id, resource.doc, ...).
  class JsonResource < Resource

    # Performs a GET asking for json and stores/returns the parsed response.
    # The caller's header hash is not modified.
    def get(additional_headers={}, &block)
      headers = additional_headers.merge('Accept' => 'json')
      response = super(headers, &block)
      @state = JSON.parse response
    end

    # Performs a POST with the payload serialized through to_json and
    # stores/returns the parsed response.
    def post(payload, additional_headers={}, &block)
      response = super(payload.to_json, decorate_headers(additional_headers), &block)
      @state = JSON.parse response
    end

    # Performs a PUT with the payload serialized through to_json and
    # stores/returns the parsed response.
    def put(payload, additional_headers={}, &block)
      response = super(payload.to_json, decorate_headers(additional_headers), &block)
      @state = JSON.parse response
    end

  private

    # Returns a copy of _headers with the JSON content type set.
    def decorate_headers(_headers)
      _headers.merge('Content-Type' => 'application/json')
    end

    # True when the last parsed response is a Hash containing _method as key.
    def state_key?(_method)
      @state.is_a?(Hash) && @state.key?(_method.to_s)
    end

    # Keeps respond_to? consistent with the keys served by method_missing.
    def respond_to_missing?(_method, _include_private = false)
      state_key?(_method) || super
    end

    # Exposes the top-level keys of the last parsed response as readers.
    def method_missing(_method, *_args, &_block)
      return @state[_method.to_s] if state_key?(_method)
      super
    end

  end

end
@@ -0,0 +1,27 @@
1
+ require "cangrejo/net/socket_http"
2
+ require "cangrejo/net/socket_uri"
3
+ require "rest_client"
4
+
module RestClient
  # Overrides prepended onto RestClient::Request so that a url which is not a
  # String is passed through untouched, and so that requests addressed with a
  # Net::SocketUri use Net::SocketHttp as their http class.
  module RequestExtensions

    # Only String urls go through the regular parameter processing.
    def process_url_params(_url, _headers)
      return _url unless _url.is_a? String
      super(_url, _headers)
    end

    # Only String urls are parsed; any other object is returned as given.
    def parse_url(_url)
      return _url unless _url.is_a? String
      super(_url)
    end

    # Picks the socket based http class for socket uris and defers to the
    # regular choice for everything else.
    def net_http_class
      return Net::SocketHttp if url.is_a? Net::SocketUri
      super
    end

  end

  class Request
    prepend RequestExtensions
  end
end
@@ -0,0 +1,62 @@
1
+ require "ostruct"
2
+ require "cangrejo/modes/remote"
3
+ require "cangrejo/modes/git"
4
+ require "cangrejo/modes/local"
5
+
require "forwardable"

module Cangrejo
  # A crawling session against one crawler. The options given here are merged
  # over the ones registered for the crawler in the configuration and decide
  # which mode backs the session: a local path, a git repository, or the
  # remote crabfarm host.
  class Session
    # NOTE(review): no delegators are declared in this class and Forwardable
    # is normally used through `extend` - confirm whether this is needed.
    include Forwardable

    # Document produced by the last #crawl, wrapped in an OpenStruct.
    attr_reader :doc

    # _name    - crawler name; also the key used to look up registered options.
    # _options - per-session options, merged over the registered ones. With
    #            :hold set the session is not started automatically.
    def initialize(_name, _options={})
      @name = _name
      options = Cangrejo.config.crawlers.fetch(_name, {}).merge _options
      select_mode options
      # NOTE(review): :hold is read from the options passed to this call only,
      # not from the merged crawler options - confirm that is intended.
      start unless _options.fetch :hold, Cangrejo.config.hold_by_default
    end

    # Sets up the selected mode and keeps the resource it returns.
    #
    # Raises ConfigurationError when the session was already started.
    # Returns self.
    def start
      raise ConfigurationError, 'Session already started' unless @rest.nil?
      @rest = @mode.setup
      self
    end

    # The three readers below expose values of the last response and raise
    # ConfigurationError when the session has not been started.
    def state_name
      ensure_started
      @rest.name
    end

    def state_params
      ensure_started
      @rest.params
    end

    def raw_doc
      ensure_started
      @rest.doc
    end

    # Asks the crawler to move to _state with _params and stores the
    # resulting document in #doc.
    #
    # Raises ConfigurationError when the session has not been started.
    # Returns self.
    def crawl(_state, _params={})
      ensure_started
      @rest.put(name: _state, params: _params)
      @doc = OpenStruct.new @rest.doc
      self
    end

    # Releases whatever the mode holds. Returns self.
    def release
      @mode.release
      self
    end

  private

    # Guard shared by every method that needs the started resource.
    def ensure_started
      raise ConfigurationError, 'Session not started' if @rest.nil?
    end

    # Chooses the mode from the merged options: :path wins over :git_remote,
    # and the remote mode is used when neither is present.
    def select_mode(_options)
      @mode = if _options.key? :path
        Modes::Local.new _options[:path]
      elsif _options.key? :git_remote
        Modes::Git.new _options[:git_remote], _options[:git_commit], _options[:relative_path], @name
      else
        Modes::Remote.new @name
      end
    end

  end
end
@@ -0,0 +1,43 @@
module Cangrejo
  module Support
    # Starts a crabfarm server process for a crawler directory, listening on a
    # randomly named unix socket file, and stops it on demand.
    class Launcher

      # _path - crawler directory; the server is spawned with it as working
      #         directory.
      def initialize(_path)
        @path = _path
        select_socket_file
      end

      # Address handed to the server: the socket file with a unix:// prefix.
      def host
        "unix://#{@socket_file}"
      end

      # Spawns the server and blocks until its socket file shows up.
      #
      # Raises Cangrejo::Error if the process exits before creating the socket.
      def launch
        gem_path = File.join(@path, 'Gemfile')
        # TODO: for some reason, the gemfile path must be specified here, maybe because of rbenv?
        @pid = Process.spawn({ 'BUNDLE_GEMFILE' => gem_path }, "bin/crabfarm s --host #{host}", chdir: @path)
        wait_for_socket
      end

      # Interrupts the server and waits for it to exit. Does nothing when no
      # process is being tracked, so repeated calls are safe.
      def kill
        return if @pid.nil?
        # Forget the pid first: a second call must not signal or wait on a
        # process that was already reaped.
        pid, @pid = @pid, nil
        Process.kill 'INT', pid
        Process.wait pid
      end

    private

      # Picks a socket file name that does not exist yet.
      def select_socket_file
        @socket_file = random_filename while @socket_file.nil? or File.exist? @socket_file
      end

      def random_filename
        File.join(Cangrejo.config.temp_path, "csocket-#{Random.rand(1000000)}.sock")
      end

      # Polls until the socket file exists, giving up when the child exits.
      #
      # NOTE(review): the file is checked relative to this process' working
      # directory while the server runs with chdir: @path - with a relative
      # temp_path these may be different locations; confirm.
      def wait_for_socket
        until File.exist? @socket_file
          # WNOHANG makes the wait non-blocking: it returns the pid only once
          # the child has exited, in which case the socket will never appear.
          if Process.wait(@pid, Process::WNOHANG)
            @pid = nil
            raise Error, "crabfarm process exited before creating #{@socket_file}"
          end
          sleep 0.1
        end
      end
    end
  end
end
@@ -0,0 +1,3 @@
module Cangrejo
  # Version of the gem. Frozen so the shared string cannot be modified in
  # place.
  VERSION = "0.0.1".freeze
end
metadata ADDED
@@ -0,0 +1,113 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cangrejo
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Ignacio Baixas
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2015-01-02 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rest-client
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: 1.7.2
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: 1.7.2
27
+ - !ruby/object:Gem::Dependency
28
+ name: git
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: bundler
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ~>
46
+ - !ruby/object:Gem::Version
47
+ version: '1.6'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ~>
53
+ - !ruby/object:Gem::Version
54
+ version: '1.6'
55
+ - !ruby/object:Gem::Dependency
56
+ name: rake
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :development
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ description:
70
+ email:
71
+ - ignacio@platan.us
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - lib/cangrejo/configurator.rb
77
+ - lib/cangrejo/errors.rb
78
+ - lib/cangrejo/modes/git.rb
79
+ - lib/cangrejo/modes/local.rb
80
+ - lib/cangrejo/modes/remote.rb
81
+ - lib/cangrejo/net/socket_http.rb
82
+ - lib/cangrejo/net/socket_uri.rb
83
+ - lib/cangrejo/restclient/json_resource.rb
84
+ - lib/cangrejo/restclient/request_extensions.rb
85
+ - lib/cangrejo/session.rb
86
+ - lib/cangrejo/support/launcher.rb
87
+ - lib/cangrejo/version.rb
88
+ - lib/cangrejo.rb
89
+ homepage: ''
90
+ licenses:
91
+ - MIT
92
+ metadata: {}
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ requirements:
99
+ - - '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ required_rubygems_version: !ruby/object:Gem::Requirement
103
+ requirements:
104
+ - - '>='
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ requirements: []
108
+ rubyforge_project:
109
+ rubygems_version: 2.0.14
110
+ signing_key:
111
+ specification_version: 4
112
+ summary: Crabfarm client for ruby
113
+ test_files: []