solrizer-rabbit 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,19 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+
19
+ *.swp
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use default@solrizer-rabbit
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
source 'https://rubygems.org'

# Specify your gem's dependencies in solrizer-rabbit.gemspec
gemspec

# solrizer-fedora is pinned to a specific upstream commit. It is fetched over
# HTTPS because GitHub no longer serves the unauthenticated git:// protocol.
gem 'solrizer-fedora', :git => 'https://github.com/projecthydra/solrizer-fedora.git', :ref => '1dbb815'
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 TODO: Write your name
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,36 @@
1
+ # Solrizer::Rabbit
2
+
3
+ Solrizer-rabbit is a gem for indexing ActiveFedora objects into solr by using RabbitMQ.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'solrizer-rabbit'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install solrizer-rabbit
18
+
19
+ ## Usage
20
+
21
+ <code>
22
+ # search fedora for a list of pids, write them into the queue
23
+ rake solrizer:rabbit:enqueue
24
+
25
+ # read the pids from the queue, load each object from fedora and index it. threads defaults to 1
26
+ rake solrizer:rabbit:index threads=7
27
+ </code>
28
+
29
+
30
+ ## Contributing
31
+
32
+ 1. Fork it
33
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
34
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
35
+ 4. Push to the branch (`git push origin my-new-feature`)
36
+ 5. Create new Pull Request
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
@@ -0,0 +1,59 @@
1
+ require "solrizer-rabbit/version"
2
+
3
+ require "solrizer-fedora"
4
+ require "solrizer-rabbit/queue_index_worker"
5
+ require "solrizer-rabbit/buffered_indexer"
6
+
7
module Solrizer
  # Entry points for indexing Fedora objects into Solr through a RabbitMQ queue:
  # .enqueue publishes pids to the queue, .work starts workers that consume them.
  module Rabbit
    # Name of the queue that pids are published to and consumed from.
    #
    # @return [String] the `queue` environment variable, or 'index' when it is unset
    def self.queue_name
      ENV['queue'] || 'index'
    end

    # Publishes the pid of every object yielded by each connection's search to
    # the queue. The Carrot connection is stopped even when a search or publish
    # raises, so a failed run does not leave it open.
    def self.enqueue
      q = Carrot.queue(queue_name)

      begin
        connections.each do |conn|
          conn.search(nil) { |object| q.publish(object.pid) }
        end
      ensure
        Carrot.stop
      end
    end

    # Starts one QueueIndexWorker thread per requested worker, waits for all of
    # them to finish and then sends a commit to Solr. The number of workers is
    # read from the `threads` environment variable and defaults to 1.
    # An INT signal asks every worker to stop.
    def self.work
      worker_count = (ENV['threads'] || 1).to_i

      workers = []
      threads = []
      # Each worker is constructed on this thread and started right away,
      # before the next worker is constructed.
      worker_count.times do
        worker = Solrizer::Rabbit::QueueIndexWorker.new
        workers << worker
        threads << Thread.new { worker.run }
      end

      Signal.trap("INT") { workers.each { |w| w.stop } }

      threads.each { |thread| thread.join }
      puts "Sending commit to solr"
      ActiveFedora::SolrService.instance.conn.commit
    end

    # Builds one Rubydora connection per configured credential set.
    #
    # @return [Array] a connection per shard when the ActiveFedora config is
    #   sharded, otherwise a one-element array
    def self.connections
      config = ActiveFedora.config
      if config.sharded?
        config.credentials.map { |cred| ActiveFedora::RubydoraConnection.new(cred).connection }
      else
        [ActiveFedora::RubydoraConnection.new(config.credentials).connection]
      end
    end
    # A bare `private` has no effect on `def self.` methods, so hide it explicitly.
    private_class_method :connections
  end
end
57
+
58
+ load File.join(File.dirname(__FILE__),"tasks/solrizer-rabbit.rake") if defined?(Rake)
59
+
@@ -0,0 +1,89 @@
1
require 'timeout'

module Solrizer
  module Rabbit
    # Collects Solr add and delete requests in memory and sends them to the
    # given Solr connection in batches.
    class BufferedIndexer
      include ActiveSupport::Benchmarkable

      # Number of buffered operations (adds plus deletes) that triggers a flush.
      BUFFER_SIZE = 1000
      # Commit once more than this many flushes have happened; 0 disables
      # these periodic commits.
      COMMIT_EVERY = 0
      # Number of attempts made for a batch that keeps timing out.
      MAX_TRIES = 5

      # @param conn the Solr connection; it must respond to add, delete_by_id and commit
      def initialize(conn)
        @count = 0
        @batch_count = 0
        @add_buffer = []
        @delete_buffer = []
        @solr = conn
      end

      # Sends all buffered adds and deletes to Solr and empties the buffers.
      #
      # @param commit [Boolean] when true a Solr commit is forced after the
      #   flush; otherwise a commit only happens according to COMMIT_EVERY
      def flush(commit = false)
        try_to_add unless @add_buffer.empty?
        @add_buffer = []
        try_to_delete unless @delete_buffer.empty?
        @delete_buffer = []
        @count = 0
        maybe_commit(commit)
      end

      # Sends the buffered documents to Solr, retrying on timeouts.
      #
      # @raise [RuntimeError] when the request timed out MAX_TRIES times
      def try_to_add
        with_timeout_retries("Adding") do
          benchmark "#{$$} -- #{Rails.env} solr add" do
            solr.add @add_buffer
          end
        end
      end

      # Sends the buffered delete-by-id requests to Solr, retrying on timeouts.
      #
      # @raise [RuntimeError] when the request timed out MAX_TRIES times
      def try_to_delete
        with_timeout_retries("Deleting") do
          benchmark "#{$$} -- #{Rails.env} solr delete" do
            solr.delete_by_id @delete_buffer
          end
        end
      end

      # Commits to Solr when forced, or when periodic commits are enabled and
      # more than COMMIT_EVERY flushes have happened since the last commit.
      #
      # @param force [Boolean] commit regardless of COMMIT_EVERY
      def maybe_commit(force = false)
        return if COMMIT_EVERY == 0 && !force
        @batch_count += 1
        if force || @batch_count > COMMIT_EVERY
          solr.commit
          @batch_count = 0
        end
      end

      # Buffers a document to be added; flushes once BUFFER_SIZE is reached.
      def add(doc)
        @add_buffer << doc
        increment
      end

      # Buffers an id to be deleted; flushes once BUFFER_SIZE is reached.
      def delete_by_id(doc)
        @delete_buffer << doc
        increment
      end

      private

      attr_reader :solr

      def increment
        @count += 1
        flush if @count >= BUFFER_SIZE
      end

      # Runs the block, retrying with a growing pause whenever it times out.
      # Timeout::Error is used because the top-level TimeoutError constant
      # no longer exists in current Ruby versions.
      #
      # @param action [String] verb used in the final error message
      # @raise [RuntimeError] after MAX_TRIES timeouts
      def with_timeout_retries(action)
        tries = 0
        begin
          yield
        rescue Timeout::Error
          ## The timeout is set in this parameter. It is 60 seconds by default.
          # rsolr.connection.connection.read_timeout = 60
          tries += 1
          puts "Timeout #{tries}"
          # Give up before sleeping: a pause ahead of the final raise is wasted time.
          raise "#{action} docs timed out #{tries} times. Quitting." if tries >= MAX_TRIES
          sleep(10 * tries) # wait a little longer each time through
          retry
        end
      end
    end
  end
end
89
+
@@ -0,0 +1,43 @@
1
+ require 'carrot'
2
module Solrizer
  module Rabbit
    # Pops pids off the queue, builds a Solr document for each Fedora object
    # and hands the document to a BufferedIndexer.
    class QueueIndexWorker
      def initialize
        # NOTE(review): this runs on the thread that constructs the worker, not
        # on the thread that later calls #run -- confirm that Carrot picks up
        # the intended connection. A host can be passed, e.g. :host=>'mediashelf.eu'.
        Thread.current[:carrot] = Carrot.new
        @q = Carrot.queue(Solrizer::Rabbit.queue_name)

        @buff = BufferedIndexer.new(ActiveFedora::SolrService.instance.conn)
        @stopped = false
      end

      # Asks the worker to finish; #run stops before popping the next message.
      def stop
        puts "finishing writes"
        @stopped = true
      end

      # Processes messages until the queue returns nothing or #stop was called,
      # then flushes the buffer with a forced commit. Carrot is stopped even
      # when the final flush raises, so the connection is not left open.
      def run
        begin
          indexer = Solrizer::Fedora::Indexer.new
          while !@stopped && (msg = @q.pop)
            index_message(indexer, msg)
          end
          puts "flushing buffers"
          @buff.flush(true)
        ensure
          Carrot.stop
        end
        puts "done"
      end

      private

      # Indexes the object identified by one queue message.
      # Solr HTTP errors and refused connections end the whole process;
      # any other StandardError is logged and the message is skipped.
      def index_message(indexer, msg)
        obj = Solrizer::Fedora::Repository.get_object(msg)
        solr_doc = indexer.create_document(obj)
        @buff.add(solr_doc)
      rescue RSolr::Error::Http, Errno::ECONNREFUSED => exception
        puts "Fatal #{exception.class} exception, see log"
        logger.fatal("\n\n#{exception.class} (#{exception.message})\n\n")
        logger.flush if logger.respond_to? :flush # Rails logger is flushable, mediashelf-loggable isn't
        # exit! ends the process immediately: ensure blocks and at_exit handlers
        # are skipped, so whatever is still buffered is not flushed.
        exit!
      rescue StandardError => exception
        puts "Caught #{exception.class}, while processing `#{msg}` see log"
        logger.fatal("\n\n#{exception.class} (#{exception.message}) while processing `#{msg}`:\n " + exception.backtrace.join("\n ") + "\n\n")
        logger.flush if logger.respond_to? :flush # Rails logger is flushable, mediashelf-loggable isn't
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
require 'digest/md5'
require 'yaml'

module Solrizer
  module Rabbit
    # Spreads documents over several Solr connections, one BufferedIndexer per
    # shard. The shard is picked from the MD5 digest of the document id, so the
    # same id always goes to the same buffer.
    class ShardedIndexer
      # Entries read from config/shards.yml for the current Rails environment;
      # each entry is passed to RSolr.connect as the :url option.
      attr_accessor :shards

      # Reads the shard list for the current Rails environment and opens one
      # buffered connection per shard.
      def initialize
        self.shards = YAML.load_file(Rails.root + 'config/shards.yml')[Rails.env]
        @buffers = shards.map do |conf|
          # BufferedIndexer resolves to Solrizer::Rabbit::BufferedIndexer.
          BufferedIndexer.new(RSolr.connect(:url => conf))
        end
      end

      # Buffers a document for addition on the shard chosen by its 'id'.
      def add(doc)
        buffer(doc['id']).add(doc)
      end

      # Buffers a delete-by-id on the shard chosen by the document's 'id'.
      def delete(doc)
        buffer(doc['id']).delete_by_id(doc['id'])
      end

      # Flushes every shard's buffer.
      #
      # @param commit [Boolean] passed through to each buffer's flush
      def flush(commit = false)
        @buffers.each { |b| b.flush(commit) }
      end

      # Returns the buffer responsible for the given id.
      #
      # @raise [RuntimeError] when id is nil or false
      def buffer(id)
        raise "No id" unless id
        n = Digest::MD5.hexdigest(id.to_s).hex % @buffers.count
        @buffers[n]
      end
    end
  end
end
32
+
33
+
@@ -0,0 +1,5 @@
1
module Solrizer
  module Rabbit
    # Version of the solrizer-rabbit gem; frozen so it cannot be mutated in place.
    VERSION = "0.0.1".freeze
  end
end
@@ -0,0 +1,14 @@
1
# Rake tasks for filling and draining the indexing queue.
# Both tasks run the :environment task first.
namespace :solrizer do
  namespace :rabbit do
    desc "Enqueue all pids"
    task(:enqueue => [:environment]) { Solrizer::Rabbit.enqueue }

    desc "Run the index worker"
    task(:index => [:environment]) { Solrizer::Rabbit.work }
  end
end
14
+
@@ -0,0 +1,23 @@
1
# -*- encoding: utf-8 -*-
require File.expand_path('../lib/solrizer-rabbit/version', __FILE__)

# Gem specification for solrizer-rabbit.
Gem::Specification.new do |gem|
  gem.name          = "solrizer-rabbit"
  gem.version       = Solrizer::Rabbit::VERSION
  gem.authors       = ["Justin Coyne"]
  gem.email         = ["justin.coyne@yourmediashelf.com"]
  gem.description   = %q{Solrize fedora objects using a queue}
  gem.summary       = %q{Solrize fedora objects using a queue}
  gem.homepage      = ""
  # The bundled LICENSE file is the MIT License.
  gem.license       = "MIT"

  # Use a NUL-separated listing: splitting on $\ (nil by default) splits on
  # any whitespace and breaks file names that contain spaces.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  gem.add_dependency('solrizer-fedora', '~> 2.1')
  gem.add_dependency('carrot')

  gem.add_development_dependency('rspec')
end
@@ -0,0 +1,2 @@
1
+ require 'solrizer-rabbit'
2
+
@@ -0,0 +1,28 @@
1
+ require 'spec_helper'
2
# Runs the worker against a stubbed queue holding two pids and checks that each
# object is turned into a document, buffered, and that the buffer is flushed.
describe Solrizer::Rabbit::QueueIndexWorker do
  let(:buffer) { mock() }
  let(:queue)  { stub('queue') }

  before do
    Solrizer::Rabbit::BufferedIndexer.should_receive(:new).with(kind_of RSolr::Client).and_return(buffer)
    # The trailing nil ends the worker's loop.
    queue.should_receive(:pop).and_return('foo:123', 'foo:231', nil)
    Carrot.should_receive(:queue).and_return(queue)
  end

  it "should run" do
    indexer = mock("indexer")
    Solrizer::Fedora::Indexer.should_receive(:new).and_return(indexer)

    fixtures = [
      ['obj1', 'foo:123', "document 1"],
      ['obj2', 'foo:231', "document 2"]
    ]
    fixtures.each do |name, pid, document|
      object = stub(name, :pid => pid)
      Solrizer::Fedora::Repository.should_receive(:get_object).with(pid).and_return(object)
      indexer.should_receive(:create_document).with(object).and_return(document)
      buffer.should_receive(:add).with(document)
    end
    buffer.should_receive(:flush)

    subject.run
  end
end
metadata ADDED
@@ -0,0 +1,110 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: solrizer-rabbit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Justin Coyne
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-05-24 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: solrizer-fedora
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '2.1'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '2.1'
30
+ - !ruby/object:Gem::Dependency
31
+ name: carrot
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: Solrize fedora objects using a queue
63
+ email:
64
+ - justin.coyne@yourmediashelf.com
65
+ executables: []
66
+ extensions: []
67
+ extra_rdoc_files: []
68
+ files:
69
+ - .gitignore
70
+ - .rvmrc
71
+ - Gemfile
72
+ - LICENSE
73
+ - README.md
74
+ - Rakefile
75
+ - lib/solrizer-rabbit.rb
76
+ - lib/solrizer-rabbit/buffered_indexer.rb
77
+ - lib/solrizer-rabbit/queue_index_worker.rb
78
+ - lib/solrizer-rabbit/sharded_indexer.rb
79
+ - lib/solrizer-rabbit/version.rb
80
+ - lib/tasks/solrizer-rabbit.rake
81
+ - solrizer-rabbit.gemspec
82
+ - spec/spec_helper.rb
83
+ - spec/unit/queue_index_worker_spec.rb
84
+ homepage: ''
85
+ licenses: []
86
+ post_install_message:
87
+ rdoc_options: []
88
+ require_paths:
89
+ - lib
90
+ required_ruby_version: !ruby/object:Gem::Requirement
91
+ none: false
92
+ requirements:
93
+ - - ! '>='
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ required_rubygems_version: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ requirements: []
103
+ rubyforge_project:
104
+ rubygems_version: 1.8.24
105
+ signing_key:
106
+ specification_version: 3
107
+ summary: Solrize fedora objects using a queue
108
+ test_files:
109
+ - spec/spec_helper.rb
110
+ - spec/unit/queue_index_worker_spec.rb