solrizer-rabbit 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+
19
+ *.swp
data/.rvmrc ADDED
@@ -0,0 +1 @@
1
+ rvm use default@solrizer-rabbit
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
source 'https://rubygems.org'

# Runtime and development dependencies are declared in solrizer-rabbit.gemspec.
gemspec

# solrizer-fedora is pinned to a specific commit. The repository is fetched
# over HTTPS: the unauthenticated git:// protocol offers no transport security
# and is no longer served by GitHub.
gem 'solrizer-fedora', :git => 'https://github.com/projecthydra/solrizer-fedora.git', :ref => '1dbb815'
data/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 TODO: Write your name
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,36 @@
1
+ # Solrizer::Rabbit
2
+
3
+ Solrizer-rabbit is a gem for indexing ActiveFedora objects into Solr, using a RabbitMQ queue to hand pids to the index workers.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ gem 'solrizer-rabbit'
10
+
11
+ And then execute:
12
+
13
+ $ bundle
14
+
15
+ Or install it yourself as:
16
+
17
+ $ gem install solrizer-rabbit
18
+
19
+ ## Usage
20
+
21
+ <code>
22
+ # search fedora for a list of pids, write them into the queue
23
+ rake solrizer:rabbit:enqueue
24
+
25
+ # read the pids from the queue, load each object from fedora and index it. threads defaults to 1
26
+ rake solrizer:rabbit:index threads=7
27
+ </code>
28
+
29
+
30
+ ## Contributing
31
+
32
+ 1. Fork it
33
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
34
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
35
+ 4. Push to the branch (`git push origin my-new-feature`)
36
+ 5. Create new Pull Request
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
@@ -0,0 +1,59 @@
1
+ require "solrizer-rabbit/version"
2
+
3
+ require "solrizer-fedora"
4
+ require "solrizer-rabbit/queue_index_worker"
5
+ require "solrizer-rabbit/buffered_indexer"
6
+
7
module Solrizer
  # Entry points for queue-based indexing: .enqueue publishes the pid of every
  # object found in Fedora to a queue, and .work starts workers that consume
  # that queue.
  module Rabbit
    # Name of the queue used by both .enqueue and the workers.
    #
    # @return [String] ENV['queue'] when set, otherwise 'index'
    def self.queue_name
      ENV['queue'] || 'index'
    end

    # Searches every configured Fedora connection and publishes the pid of
    # each object found to the queue.
    #
    # The Carrot connection is stopped even when searching or publishing
    # raises, so a failed run does not leave the connection open.
    def self.enqueue
      queue = Carrot.queue(queue_name)
      begin
        connections.each do |conn|
          conn.search(nil) { |object| queue.publish(object.pid) }
        end
      ensure
        Carrot.stop
      end
    end

    # Starts ENV['threads'] (default 1) QueueIndexWorker instances, each on its
    # own thread, waits for all of them to finish and then sends a commit to
    # Solr. An INT signal asks every worker to stop.
    def self.work
      worker_count = (ENV['threads'] || 1).to_i

      workers = []
      threads = worker_count.times.map do
        worker = Solrizer::Rabbit::QueueIndexWorker.new
        workers << worker
        Thread.new { worker.run }
      end

      Signal.trap("INT") { workers.each { |w| w.stop } }

      threads.each(&:join)
      puts "Sending commit to solr"
      ActiveFedora::SolrService.instance.conn.commit
    end

    # Builds one Rubydora connection per credential set. When the ActiveFedora
    # config reports itself as sharded its credentials are treated as a
    # collection; otherwise they are treated as a single credential set.
    #
    # @return [Array] the connections to search
    def self.connections
      config = ActiveFedora.config
      credentials = config.sharded? ? config.credentials : [config.credentials]
      credentials.map { |cred| ActiveFedora::RubydoraConnection.new(cred).connection }
    end
    # A bare `private` does not apply to `def self.` methods, so the singleton
    # method has to be hidden explicitly.
    private_class_method :connections
  end
end

load File.join(File.dirname(__FILE__), "tasks/solrizer-rabbit.rake") if defined?(Rake)
59
+
@@ -0,0 +1,89 @@
1
require 'timeout'

module Solrizer
  module Rabbit
    # Collects Solr add and delete requests in memory and sends them to the
    # wrapped connection in batches.
    #
    # A flush happens automatically once BUFFER_SIZE operations (adds plus
    # deletes) have been queued, or on demand through #flush.
    #
    # NOTE(review): #benchmark comes from ActiveSupport::Benchmarkable, which
    # presumably expects the including object to respond to `logger`; no such
    # method is defined in this class -- confirm one is provided elsewhere.
    class BufferedIndexer
      include ActiveSupport::Benchmarkable

      # Number of queued operations that triggers an automatic flush.
      BUFFER_SIZE = 1000
      # 0 disables periodic commits; with a value N > 0 a commit is sent once
      # more than N flushes have happened since the last commit.
      COMMIT_EVERY = 0
      # How many timeouts are tolerated before a batch is abandoned.
      MAX_TRIES = 5

      # @param conn the Solr connection that receives add, delete_by_id and commit
      def initialize(conn)
        @count = 0
        @batch_count = 0
        @add_buffer = []
        @delete_buffer = []
        @solr = conn
      end

      # Sends all buffered adds and deletes to Solr and empties the buffers.
      #
      # @param commit [Boolean] when true a Solr commit is forced after the
      #   buffers have been written (previously this argument was ignored)
      def flush(commit = false)
        try_to_add unless @add_buffer.empty?
        @add_buffer = []
        try_to_delete unless @delete_buffer.empty?
        @delete_buffer = []
        @count = 0
        maybe_commit(commit)
      end

      # Sends the buffered documents to Solr, retrying on timeouts.
      #
      # @raise [RuntimeError] after MAX_TRIES timeouts
      def try_to_add
        with_timeout_retry("Adding") do
          benchmark "#{$$} -- #{Rails.env} solr add" do
            solr.add @add_buffer
          end
        end
      end

      # Sends the buffered ids to Solr for deletion, retrying on timeouts.
      #
      # @raise [RuntimeError] after MAX_TRIES timeouts
      def try_to_delete
        with_timeout_retry("Deleting") do
          benchmark "#{$$} -- #{Rails.env} solr delete" do
            solr.delete_by_id @delete_buffer
          end
        end
      end

      # Commits when forced, or when periodic commits are enabled and more than
      # COMMIT_EVERY flushes have accumulated since the last commit.
      #
      # @param force [Boolean] commit regardless of COMMIT_EVERY
      def maybe_commit(force = false)
        return if COMMIT_EVERY == 0 && !force

        @batch_count += 1
        return unless force || @batch_count > COMMIT_EVERY

        solr.commit
        @batch_count = 0
      end

      # Queues a document for addition; may trigger a flush.
      def add(doc)
        @add_buffer << doc
        increment
      end

      # Queues an id for deletion; may trigger a flush.
      def delete_by_id(doc)
        @delete_buffer << doc
        increment
      end

      private

      # Runs the block, retrying with a growing pause (10s, 20s, ...) each time
      # it times out. The original comments note that the timeout itself is the
      # connection's read timeout (60 seconds by default), e.g.
      #   rsolr.connection.connection.read_timeout = 60
      #
      # @param action [String] verb used in the failure message
      # @raise [RuntimeError] once MAX_TRIES timeouts have occurred
      def with_timeout_retry(action)
        tries = 0
        begin
          yield
        rescue Timeout::Error
          tries += 1
          puts "Timeout #{tries}"
          # Give up immediately rather than sleeping once more before raising.
          raise "#{action} docs timed out #{tries} times. Qutting." if tries >= MAX_TRIES
          sleep(10 * tries) # wait a little longer each time through
          retry
        end
      end

      def increment
        @count += 1
        flush if @count >= BUFFER_SIZE
      end

      def solr
        @solr
      end
    end
  end
end
89
+
@@ -0,0 +1,43 @@
1
+ require 'carrot'
2
module Solrizer
  module Rabbit
    # Pops pids off the queue named by Solrizer::Rabbit.queue_name, builds a
    # Solr document for each one and passes it to a BufferedIndexer.
    class QueueIndexWorker
      def initialize
        # NOTE(review): this thread-local is set on whichever thread constructs
        # the worker; Solrizer::Rabbit.work constructs workers on the calling
        # thread and invokes #run on a new one -- confirm that is intended.
        # The original author left a `:host=>'mediashelf.eu'` option commented
        # out here.
        Thread.current[:carrot] = Carrot.new
        @q = Carrot.queue(Solrizer::Rabbit.queue_name)

        @buff = BufferedIndexer.new(ActiveFedora::SolrService.instance.conn)
        @stopped = false
      end

      # Asks the worker to finish: the run loop exits before popping the next
      # message.
      def stop
        puts "finishing writes"
        @stopped = true
      end

      # Processes messages until the queue returns nothing or #stop has been
      # called, then flushes the buffer (forcing a commit) and stops Carrot.
      #
      # The flush and the Carrot shutdown also run when popping from the queue
      # raises, so documents that were already buffered are still written.
      def run
        indexer = Solrizer::Fedora::Indexer.new
        begin
          while !@stopped && (msg = @q.pop)
            index_message(indexer, msg)
          end
        ensure
          begin
            puts "flushing buffers"
            @buff.flush(true)
          ensure
            Carrot.stop
          end
        end
        puts "done"
      end

      private

      # Indexes the object identified by +msg+. Solr HTTP errors and refused
      # connections terminate the process immediately; any other StandardError
      # is logged and the message is skipped.
      #
      # NOTE(review): `logger` is not defined in this class; presumably it is
      # supplied by the host application or a logging mixin -- confirm.
      def index_message(indexer, msg)
        obj = Solrizer::Fedora::Repository.get_object(msg)
        solr_doc = indexer.create_document(obj)
        @buff.add(solr_doc)
      rescue RSolr::Error::Http, Errno::ECONNREFUSED => exception
        puts "Fatal #{exception.class}, exception see log"
        log_fatal("\n\n#{exception.class} (#{exception.message})\n\n")
        exit!
      rescue StandardError => exception
        puts "Caught #{exception.class}, while procesing `#{msg}` see log"
        log_fatal("\n\n#{exception.class} (#{exception.message}) while procesing `#{msg}`:\n " + exception.backtrace.join("\n ") + "\n\n")
      end

      # Writes +text+ at fatal level and flushes the logger when it supports it
      # (Rails logger is flushable, mediashelf-loggable isn't).
      def log_fatal(text)
        logger.fatal(text)
        logger.flush if logger.respond_to? :flush
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
require 'digest/md5'
require 'yaml'

module Solrizer
  module Rabbit
    # Spreads Solr documents over several shards. One BufferedIndexer is
    # created per shard URL, and each document is routed to a shard by hashing
    # its id.
    class ShardedIndexer
      # Shard URLs read from config/shards.yml for the current Rails environment.
      attr_accessor :shards

      # Loads the shard list from config/shards.yml (keyed by Rails.env) and
      # opens one buffered connection per shard.
      #
      # @raise [RuntimeError] when no shards are configured for the environment
      def initialize
        config_path = Rails.root + 'config/shards.yml'
        self.shards = YAML.load_file(config_path)[Rails.env]
        if shards.nil? || shards.empty?
          raise "No shards configured for #{Rails.env} in #{config_path}"
        end

        # BufferedIndexer resolves to Solrizer::Rabbit::BufferedIndexer; the
        # previous DTU:: namespace is not defined by this gem.
        @buffers = shards.map { |url| BufferedIndexer.new(RSolr.connect(:url => url)) }
      end

      # Queues +doc+ for addition on the shard selected by doc['id'].
      def add(doc)
        buffer(doc['id']).add(doc)
      end

      # Queues the deletion of doc['id'] on the shard selected by that id.
      def delete(doc)
        buffer(doc['id']).delete_by_id(doc['id'])
      end

      # Flushes every shard buffer.
      #
      # @param commit [Boolean] passed through to each buffer's flush
      def flush(commit = false)
        @buffers.each { |b| b.flush(commit) }
      end

      # Picks the buffer for +id+: the MD5 digest of the id, read as an
      # integer, modulo the number of buffers.
      #
      # @raise [RuntimeError] "No id" when +id+ is nil or false
      def buffer(id)
        raise "No id" unless id
        n = Digest::MD5.hexdigest(id.to_s).hex % @buffers.count
        @buffers[n]
      end
    end
  end
end
32
+
33
+
@@ -0,0 +1,5 @@
1
module Solrizer
  module Rabbit
    # Gem version; read by solrizer-rabbit.gemspec. Frozen so the constant
    # string cannot be mutated in place.
    VERSION = "0.0.1".freeze
  end
end
@@ -0,0 +1,14 @@
1
# Rake entry points for queue-based indexing. Both tasks depend on the
# :environment task, which must be provided by the host application.
namespace :solrizer do
  namespace :rabbit do
    desc "Enqueue all pids"
    task(:enqueue => :environment) { Solrizer::Rabbit.enqueue }

    desc "Run the index worker"
    task(:index => :environment) { Solrizer::Rabbit.work }
  end
end
14
+
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/solrizer-rabbit/version', __FILE__)
3
+
4
# Gem specification for solrizer-rabbit. The version comes from
# Solrizer::Rabbit::VERSION, which must already be loaded when this is evaluated.
Gem::Specification.new do |gem|
  gem.authors       = ["Justin Coyne"]
  gem.email         = ["justin.coyne@yourmediashelf.com"]
  gem.description   = %q{Solrize fedora objects using a queue}
  gem.summary       = %q{Solrize fedora objects using a queue}
  gem.homepage      = ""
  # The package ships an MIT LICENSE file; declare it so it appears in the
  # gem metadata.
  gem.license       = "MIT"

  # Split on newlines explicitly: `$\` is nil by default, which makes
  # String#split break on any whitespace and mangles paths containing spaces.
  gem.files         = `git ls-files`.split("\n")
  gem.executables   = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.name          = "solrizer-rabbit"
  gem.require_paths = ["lib"]
  gem.version       = Solrizer::Rabbit::VERSION

  gem.add_dependency('solrizer-fedora', '~> 2.1')
  gem.add_dependency('carrot')

  gem.add_development_dependency('rspec')
end
@@ -0,0 +1,2 @@
1
+ require 'solrizer-rabbit'
2
+
@@ -0,0 +1,28 @@
1
+ require 'spec_helper'
2
# Exercises QueueIndexWorker#run with every collaborator replaced by a test
# double: two pids are popped from the queue, turned into documents and added
# to the buffer, and the buffer is flushed when the queue runs dry.
#
# Doubles are created with `double` rather than `mock`/`stub`: those aliases
# for creating doubles were removed in RSpec 3, and the rspec dependency is
# not pinned to a version.
describe Solrizer::Rabbit::QueueIndexWorker do

  before do
    @mock_buffer = double('buffer')
    Solrizer::Rabbit::BufferedIndexer.should_receive(:new).with(kind_of(RSolr::Client)).and_return(@mock_buffer)
    @stub_queue = double('queue')
    # The trailing nil ends the worker's run loop.
    @stub_queue.should_receive(:pop).and_return('foo:123', 'foo:231', nil)
    Carrot.should_receive(:queue).and_return(@stub_queue)
  end

  it "should run" do
    @mock_buffer.should_receive(:flush)
    @mock_buffer.should_receive(:add).with("document 1")
    @mock_buffer.should_receive(:add).with("document 2")

    obj1 = double('obj1', :pid => 'foo:123')
    obj2 = double('obj2', :pid => 'foo:231')
    @mock_indexer = double("indexer")
    @mock_indexer.should_receive(:create_document).with(obj1).and_return("document 1")
    @mock_indexer.should_receive(:create_document).with(obj2).and_return("document 2")
    Solrizer::Fedora::Indexer.should_receive(:new).and_return(@mock_indexer)
    Solrizer::Fedora::Repository.should_receive(:get_object).with('foo:123').and_return(obj1)
    Solrizer::Fedora::Repository.should_receive(:get_object).with('foo:231').and_return(obj2)

    subject.run
  end

end
metadata ADDED
@@ -0,0 +1,110 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: solrizer-rabbit
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Justin Coyne
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-05-24 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: solrizer-fedora
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: '2.1'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ~>
28
+ - !ruby/object:Gem::Version
29
+ version: '2.1'
30
+ - !ruby/object:Gem::Dependency
31
+ name: carrot
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ! '>='
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
46
+ - !ruby/object:Gem::Dependency
47
+ name: rspec
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ description: Solrize fedora objects using a queue
63
+ email:
64
+ - justin.coyne@yourmediashelf.com
65
+ executables: []
66
+ extensions: []
67
+ extra_rdoc_files: []
68
+ files:
69
+ - .gitignore
70
+ - .rvmrc
71
+ - Gemfile
72
+ - LICENSE
73
+ - README.md
74
+ - Rakefile
75
+ - lib/solrizer-rabbit.rb
76
+ - lib/solrizer-rabbit/buffered_indexer.rb
77
+ - lib/solrizer-rabbit/queue_index_worker.rb
78
+ - lib/solrizer-rabbit/sharded_indexer.rb
79
+ - lib/solrizer-rabbit/version.rb
80
+ - lib/tasks/solrizer-rabbit.rake
81
+ - solrizer-rabbit.gemspec
82
+ - spec/spec_helper.rb
83
+ - spec/unit/queue_index_worker_spec.rb
84
+ homepage: ''
85
+ licenses: []
86
+ post_install_message:
87
+ rdoc_options: []
88
+ require_paths:
89
+ - lib
90
+ required_ruby_version: !ruby/object:Gem::Requirement
91
+ none: false
92
+ requirements:
93
+ - - ! '>='
94
+ - !ruby/object:Gem::Version
95
+ version: '0'
96
+ required_rubygems_version: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ requirements: []
103
+ rubyforge_project:
104
+ rubygems_version: 1.8.24
105
+ signing_key:
106
+ specification_version: 3
107
+ summary: Solrize fedora objects using a queue
108
+ test_files:
109
+ - spec/spec_helper.rb
110
+ - spec/unit/queue_index_worker_spec.rb