acts_as_background_solr 0.5

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,86 @@
1
+ = acts_as_background_solr Rails plugin
2
+ This plugin extends the functionality of the acts_as_solr plugin to
3
+ provide for a disconnected background job that synchronizes data with
4
+ Solr in batch. acts_as_solr works by sending changes to Solr for each
5
+ object immediately following any change. While this is nice as changes
6
+ are immediately viewable, it has a few drawbacks:
7
+
8
+ * Invoking commit on solr requires a new searcher to be opened which
9
+ is slow
10
+
11
+ * There is no way to keep track of an object that was saved when
12
+ notification to solr failed (or when the database transaction
13
+ rolled back)
14
+
15
+ Acts as background solr extends the acts_as_solr plugin to focus on
16
+ background processing.
17
+
18
+ There is one other major change. Acts as solr calls Model.find for
19
+ each result from the result set. Acts as background solr will
20
+ reconstitute your objects from the attributes stored in solr
21
+ completely avoiding any required database hits when searching against
22
+ solr. This requires that you modify schema.xml to store the fields you
23
+ are indexing.
24
+
25
+ == Installation
26
+ Use this in place of acts_as_solr in your models, e.g.
27
+
28
+ acts_as_background_solr
29
+
30
+ The options :background, :if and :auto_commit will be automatically
31
+ overridden.
32
+
33
+ Each model can track changes in one of two ways:
34
+
35
+ * Default: Explicitly log changes from the model using listeners
36
+
37
+ * Database triggers: You can instead use db triggers on your model
38
+ tables to track changes. If you use this method, set the option
39
+ :db_triggers => true
40
+
41
+ Example:
42
+
43
+ class User < ActiveRecord::Base
44
+
45
+ acts_as_background_solr :additional_fields => [:first_name, :last_name], :exclude_fields => ['encrypted_password'], :db_triggers => true
46
+
47
+ end
48
+
49
+ This plugin depends on the following table structure (this is written
50
+ for postgresql):
51
+
52
+ create sequence solr_sync_records_seq start with 1;
53
+ create table solr_sync_records (
54
+ id integer constraint solr_sync_records_id_pk primary key default nextval('solr_sync_records_seq'),
55
+ model varchar(50) not null,
56
+ model_id integer not null,
57
+ created_at timestamp default now() not null
58
+ );
59
+
60
+ -- common access path
61
+ create index solr_sync_records_model_id_idx on solr_sync_records(model, model_id);
62
+
63
+ To update the actual data stored in Solr, you need to invoke the
64
+ following method:
65
+
66
+ SolrBatch.process_all
67
+
68
+ We're using openwfe to schedule this job to run every few minutes, but
69
+ any scheduler should work.
70
+
71
+ This method updates records in bulk, issuing a single commit when
72
+ records are updated. The current algorithm updates up to 5,000 records
73
+ per model in a single call to this method. The way this works is each
74
+ call to SolrBatch.process_all has a default batch size of 500. Each
75
+ call will attempt up to 10 iterations, w/ each iteration updating up
76
+ to 500 records.
77
+
78
+ If you want to change the batch size, provide your own implementation
79
+ of SolrBatch.process_all
80
+
81
+ == Authors
82
+ Michael Bryzek
83
+ mbryzek<at>alum.mit.edu
84
+
85
+ == Release Information
86
+ Released under the MIT license.
data/init.rb ADDED
@@ -0,0 +1,21 @@
1
+ # Copyright (c) 2007 Michael Bryzek
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in all
11
+ # copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ # SOFTWARE.
20
+
21
+ require 'acts_as_background_solr'
@@ -0,0 +1,177 @@
1
+ # Copyright (c) 2007 Michael Bryzek
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in all
11
+ # copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ # SOFTWARE.
20
+
21
+ require File.dirname(__FILE__) + '/solr_batch'
22
+ require File.dirname(__FILE__) + '/class_methods'
23
+ require File.dirname(__FILE__) + '/instance_methods'
24
+
25
+ # Example:
26
+ # class User < CachedModel
27
+ # acts_as_background_solr :additional_fields => [:first_name, :last_name], :exclude_fields => ['encrypted_password'], :db_triggers => true
28
+ # end
29
+
30
module ActsAsBackgroundSolr
  # Macro layer that ActiveRecord::Base is extended with. Besides the
  # acts_as_background_solr macro it carries the private helpers used to turn
  # values stored in Solr back into Ruby values.
  module ActsMethods

    # Strings accepted as "true" when rebuilding a boolean column from a
    # Solr document. Anything else is treated as false.
    SOLR_TRUE_VALUES = ['true', 't', '1'].freeze

    # Declares the model as indexed in Solr with background synchronization.
    #
    # options      - acts_as_solr options plus :db_triggers. The keys
    #                :auto_commit, :if and :background are always overridden.
    # solr_options - passed through to acts_as_solr unchanged.
    #
    # Unless :db_triggers is set, save/destroy callbacks are registered that
    # flag the record as pending synchronization.
    def acts_as_background_solr(options={}, solr_options={})
      extend ClassMethods
      include InstanceMethods

      # Work on a copy: forcing these keys on the caller's own hash would leak
      # the overrides into whatever else that hash is used for.
      options = options.merge(:auto_commit => false, :if => true, :background => false)

      acts_as_solr(options, solr_options)

      unless options[:db_triggers]
        ## Without database triggers the model itself has to log its changes
        after_save :solr_flag_record
        after_destroy :solr_flag_record
      end


      ## Override implementations of acts_as_solr/parse_* - there were
      ## no easy hooks to override how to prevent the database queries
      ## from occurring.

      # Builds and executes the Solr request for a search.
      #
      # Returns nil when query is nil. Raises RuntimeError for unknown option
      # keys, and wraps any failure while building/executing the request in a
      # RuntimeError as well.
      def self.parse_query(query=nil, options={}, models=nil)
        valid_options = [:offset, :limit, :facets, :models, :results_format, :order, :scores, :operator]
        query_options = {}
        return if query.nil?
        invalid_keys = options.keys - valid_options
        raise "Invalid parameters: #{invalid_keys.join(',')}" unless invalid_keys.empty?
        begin
          ActsAsSolr::Deprecation.validate_query(options)
          query_options[:start] = options[:offset]
          query_options[:rows] = options[:limit]
          query_options[:operator] = options[:operator]

          # first steps on the facet parameter processing
          facets = options[:facets]
          if facets
            query_options[:facets] = {}
            query_options[:facets][:limit] = -1  # TODO: make this configurable
            query_options[:facets][:sort] = :count if facets[:sort]
            query_options[:facets][:mincount] = (facets[:zeros] == false) ? 1 : 0
            query_options[:facets][:fields] = facets[:fields].collect { |k| "#{k}_facet" } if facets[:fields]
            # Non-destructive sub: sub! rewrote the caller's strings in place
            # (so a reused options hash gained "_facet_facet:") and produced
            # an empty string whenever the pattern did not match.
            if facets[:browse]
              query_options[:filter_queries] = replace_types(facets[:browse].collect { |k| k.sub(/ *: */, "_facet:") })
            end
            if facets[:query]
              query_options[:facets][:queries] = replace_types(facets[:query].collect { |k| k.sub(/ *: */, "_t:") })
            end
          end

          query_options[:field_list] = ['*', 'score']
          query = "(#{query.gsub(/ *: */,"_t:")}) #{models}"
          query.gsub!('pk_t','pk_i')
          ## Automatically filter to this model
          query = "type_t:#{self} AND #{query}" unless query.match(/type_t:/)
          logger.debug("SOLR QUERY: #{query}")
          order = options[:order].split(/\s*,\s*/).collect { |e| e.gsub(/\s+/,'_t ') }.join(',') if options[:order]
          query_options[:query] = replace_types([query])[0] # TODO adjust replace_types to work with String or Array

          if options[:order]
            # TODO: set the sort parameter instead of the old ;order. style.
            query_options[:query] = "#{query_options[:query]};#{replace_types([order], false)[0]}"
          end

          ActsAsSolr::Post.execute(Solr::Request::Standard.new(query_options))
        rescue => e
          raise "There was a problem executing your search: #{e}"
        end
      end


      ## Override parse_results to reconstruct objects from the actual
      ## query results instead of having to hit the database.
      #
      # For any :format other than :objects this defers to the inherited
      # implementation.
      def self.parse_results(solr_data, options = {})
        results = {
          :docs => [],
          :total => 0
        }
        configuration = {
          :format => :objects
        }
        results.update(:facets => {'facet_fields' => []}) if options[:facets]
        return ActsAsSolr::SearchResults.new(results) if solr_data.total == 0
        configuration.update(options) if options.is_a?(Hash)
        return super unless configuration[:format] == :objects

        result = []
        solr_data.docs.each do |doc|
          object = new
          doc.each do |k, v|
            next if k == "id"
            if k == 'pk_i'
              ## TODO: Not sure if there is a potential for more than one value
              object.id = v[0]
            else
              # Drop the trailing type suffix ("name_t" -> "name") to get the
              # attribute name; fields without a writer are ignored.
              field = k.sub(/\_[^_]+$/, '')
              object.send("#{field}=", convert_value(object.class, field, v)) if object.respond_to?("#{field}=")
            end
          end
          result << object
        end

        add_scores(result, solr_data) if configuration[:format] == :objects && options[:scores]

        results.update(:facets => solr_data.data['facet_counts']) if options[:facets]
        results.update({:docs => result, :total => solr_data.total, :max_score => solr_data.max_score})
        ActsAsSolr::SearchResults.new(results)
      end
    end

    private

    # Converts a value read from a Solr document into the Ruby value for
    # klass's column named field.
    #
    # - nil stays nil; single-element arrays are unwrapped, other arrays are
    #   returned untouched.
    # - Non-string values (including false) are returned as they are.
    # - Strings are converted according to the column's sql_type; strings for
    #   unknown columns or unhandled types are returned as strings.
    #
    # NOTE(review): dispatch is on column.sql_type, which is the database's
    # own type name; types such as "timestamp without time zone" never match
    # below and stay strings. Switching to the adapter-independent column
    # type looks preferable - confirm against the ActiveRecord version in use.
    def convert_value(klass, field, value)
      return nil if value.nil?
      if value.is_a?(Array)
        return value if value.size != 1
        value = value[0]
      end
      return value unless value.is_a?(String)
      column = klass.columns_hash[field]
      return value.to_str unless column ## Occurs for included fields

      case column.sql_type.intern
      when :integer then value.to_i
      when :numeric then BigDecimal(value)
      when :float then value.to_f
      when :date then Date.parse(value)
      when :datetime, :time then parse_datetime(value)
      # There is no Boolean class in Ruby; compare against the accepted
      # spellings of "true" instead.
      when :boolean then SOLR_TRUE_VALUES.include?(value.strip.downcase)
      else value.to_str
      end
    end

    # Parses a timestamp string, raising a RuntimeError that names the
    # offending value when it cannot be parsed.
    def parse_datetime(value)
      Time.parse(value)
    rescue StandardError => e
      raise "Error parsing date \"#{value}\": #{e}"
    end

  end
end
175
+
176
# Extend ActiveRecord::Base with ActsMethods so that every model class can
# call the acts_as_background_solr macro in its class body.
ActiveRecord::Base.extend ActsAsBackgroundSolr::ActsMethods
@@ -0,0 +1,34 @@
1
+ # Copyright (c) 2007 Michael Bryzek
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in all
11
+ # copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ # SOFTWARE.
20
+
21
module ActsAsBackgroundSolr #:nodoc:

  # Class-level entry points added to a model by acts_as_background_solr.
  # Both simply delegate to SolrBatch, passing the model class itself.
  module ClassMethods

    # Pushes this model's pending queue entries to Solr, reading the queue in
    # chunks of batch_size rows.
    def solr_sync_pending_records(batch_size=500)
      sync_options = { :batch_size => batch_size }
      SolrBatch.sync_pending_records(self, sync_options)
    end

    # Queues every row of this model for synchronization with Solr.
    def solr_requeue_all
      SolrBatch.requeue_all(self)
    end

  end

end
@@ -0,0 +1,11 @@
1
module ActsAsBackgroundSolr #:nodoc:

  # Instance-level behaviour added to a model by acts_as_background_solr.
  module InstanceMethods

    # Records this object (by class and id) in the synchronization queue.
    # Registered as the after_save / after_destroy callback when the model
    # does not rely on database triggers.
    def solr_flag_record
      model_class = self.class
      SolrBatch.flag_record(model_class, id)
    end

  end

end
data/lib/solr_batch.rb ADDED
@@ -0,0 +1,136 @@
1
# Batch synchronization between the solr_sync_records queue table and Solr.
#
# All methods are class methods. The helpers in the lower half are internal,
# but they stay publicly callable, as they were before.
class SolrBatch

  # Queue rows fetched per iteration when no usable :batch_size is given.
  DEFAULT_BATCH_SIZE = 500

  # Maximum number of batches processed by one sync_pending_records call.
  MAX_ITERATIONS = 10

  ## This procedure should be run in the background with whatever job
  ## scheduler you are comfortable with.
  ##
  ## Synchronizes every model that has queue entries, then commits and
  ## optimizes once at the very end.
  def SolrBatch.process_all
    models = ActiveRecord::Base.connection.select_values("select distinct model from solr_sync_records")
    if models.size == 0
      RAILS_DEFAULT_LOGGER.debug("SolrBatch.process_all: No records pending synchronization")
    else
      first_klass = nil
      models.each do |model|
        klass = resolve_model(model)
        first_klass ||= klass
        # Per-model commit/optimize is switched off; both happen once below.
        SolrBatch.sync_pending_records(klass, { :do_commit => false, :do_optimize => false })
      end

      ## Commit/optimize after ALL records have been processed
      first_klass.send(:solr_commit)
      first_klass.send(:solr_optimize)
    end
  end

  ## Flags an individual record as pending synchronization with Solr
  def SolrBatch.flag_record(model_name, model_id)
    sql = 'insert into solr_sync_records (model, model_id) values (?, ?)'
    ActiveRecord::Base.connection.execute(sanitize_sql_array([sql, model_name, model_id]))
  end

  ## Marks all records for this model as pending synchronization with Solr.
  ## Rows that are already queued are not queued a second time.
  def SolrBatch.requeue_all(ar_model)
    sql = <<-endsql
      insert into solr_sync_records
      (model, model_id)
      select ?, id from #{ar_model.send(:table_name)}
      EXCEPT
      select ?, model_id from solr_sync_records where model = ?
    endsql
    ActiveRecord::Base.connection.execute(sanitize_sql_array([sql, ar_model, ar_model, ar_model]))
  end

  # Pushes pending queue entries for ar_model to Solr.
  #
  # opts may contain:
  #   :batch_size  - rows per iteration (default 500; values < 1 fall back)
  #   :do_commit   - commit when done (default true)
  #   :do_optimize - optimize when done (default true)
  # Any other key raises a RuntimeError. The caller's hash is not modified.
  #
  # Up to 10 iterations run per call, counted from +iteration+. Commit and
  # optimize are issued once, after the last batch, and only if at least one
  # batch was processed.
  #
  # Ex: from within ActiveRecord Model, call:
  #   SolrBatch.sync_pending_records(self, { :batch_size => 500 } );
  def SolrBatch.sync_pending_records(ar_model, opts = {}, iteration=1)
    opts = opts.dup
    batch_size = opts.delete(:batch_size)
    batch_size = DEFAULT_BATCH_SIZE if batch_size.nil? || batch_size < 1
    # "opts.delete(...) || true" can never yield false, so check for the key.
    do_commit = opts.has_key?(:do_commit) ? opts.delete(:do_commit) : true
    do_optimize = opts.has_key?(:do_optimize) ? opts.delete(:do_optimize) : true
    raise "Invalid option keys: #{opts.keys}" unless opts.keys.size == 0

    RAILS_DEFAULT_LOGGER.debug("SolrBatch.sync_pending_records(#{ar_model})")

    synced_any = false
    loop do
      items = get_pending_records(ar_model, batch_size)
      if items.size == 0
        solr_log(ar_model, 'No records to process')
        break
      end

      sync_batch(ar_model, items)
      synced_any = true

      ## A short batch means the queue is drained; otherwise keep going until
      ## the iteration cap is reached.
      break unless items.size >= batch_size && iteration < MAX_ITERATIONS
      iteration += 1
    end

    return unless synced_any
    ar_model.send(:solr_commit) if do_commit
    ar_model.send(:solr_optimize) if do_optimize
  end

  ## Sends one batch of queue rows to Solr and removes them from the queue.
  ##
  ## NOTE(review): rows flagged as deleted have no matching model row, so
  ## their attributes (including id) come back NULL from the outer join and
  ## solr_id is presumably built from a nil id. The selected
  ## solr_sync_records_model_id looks like the value that should be used for
  ## deletions - confirm against acts_as_solr's solr_id format.
  def SolrBatch.sync_batch(ar_model, items)
    delete_batch = []
    add_batch = []
    processed = {}
    items.each do |o|
      solr_id = o.send(:solr_id)
      next if processed.has_key?(solr_id) ## Duplicates are possible in the log table
      processed[solr_id] = true
      if o.solr_sync_record_deleted_p == 't'
        delete_batch << solr_id
      else
        add_batch << o.to_solr_doc
      end
    end

    # We delete in bulk with a single query
    ActsAsSolr::Post.execute(Solr::Request::Delete.new(:query => id_query(delete_batch))) if delete_batch.size > 0
    ar_model.send(:solr_add, add_batch) if add_batch.size > 0
    SolrBatch.delete_from_queue(ar_model, items.last.solr_sync_records_log_id)

    solr_log(ar_model, "#{items.size} items have been batch added to index.")
  end

  ## Turns a model name read from the queue table into its class by constant
  ## lookup, so the stored string is never evaluated as code. Raises
  ## NameError for names that do not resolve.
  def SolrBatch.resolve_model(model_name)
    parts = model_name.to_s.split('::').reject { |part| part.empty? }
    parts.inject(Object) { |namespace, const_name| namespace.const_get(const_name) }
  end

  def SolrBatch.solr_log(ar_model, message)
    RAILS_DEFAULT_LOGGER.debug("Solr Sync for #{ar_model.name}: #{message}")
  end

  ## Removes all records from the solr pending sync queue for this
  ## model up to the specified sync id
  def SolrBatch.delete_from_queue(ar_model, last_sync_id)
    sql = 'delete from solr_sync_records where model = ? and id <= ?'
    ActiveRecord::Base.connection.execute(sanitize_sql_array([sql, ar_model, last_sync_id]))
  end

  ## Turns a list of IDs into a single solr query matching all those
  ## IDs. Every colon is escaped, so ids of namespaced models work too.
  ## Returns nil for an empty list.
  def SolrBatch.id_query(solr_ids)
    return nil if solr_ids.size == 0
    solr_ids.collect { |id| "id:" + id.to_s.gsub(':') { '\:' } }.join(' OR ')
  end

  ## Returns an array of records for the specified active record model
  ## that are pending synchronization, oldest queue entry first. Each record
  ## carries the extra attributes solr_sync_records_log_id,
  ## solr_sync_records_model_id and solr_sync_record_deleted_p ('t' when the
  ## model row no longer exists).
  def SolrBatch.get_pending_records(ar_model, batch_size)
    # Restrict to this model's queue entries: without the filter, rows queued
    # for other models are joined against this model's table as well.
    sql = <<-endsql
      select log.id as solr_sync_records_log_id, log.model_id as solr_sync_records_model_id,
             case when ar.id is null then 't' else 'f' end as solr_sync_record_deleted_p,
             ar.*
        from solr_sync_records log
        left outer join #{ar_model.send('table_name')} ar on ar.id = log.model_id
       where log.model = ?
       order by log.id asc
       limit #{batch_size.to_i}
    endsql

    ar_model.send(:find_by_sql, sanitize_sql_array([sql, ar_model]))
  end

  ## Replaces each '?' in the statement (the first array element) with the
  ## next remaining element, quoted as a SQL string literal. Placeholders
  ## without a matching value are left in place. Neither the array nor the
  ## statement string is modified; a new string is returned.
  def SolrBatch.sanitize_sql_array(ary)
    statement = ary.first
    values = ary[1..-1] || []
    connection = ActiveRecord::Base.connection
    # Block form: one left-to-right pass (a '?' inside an inserted value is
    # never substituted again) and the replacement is taken literally (no
    # backslash interpretation of the quoted value).
    statement.gsub('?') do
      values.empty? ? '?' : "'#{connection.quote_string(values.shift.to_s)}'"
    end
  end

end
metadata ADDED
@@ -0,0 +1,51 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.4
3
+ specification_version: 1
4
+ name: acts_as_background_solr
5
+ version: !ruby/object:Gem::Version
6
+ version: "0.5"
7
+ date: 2007-09-09 00:00:00 -04:00
8
+ summary: Extends the functionality of the acts_as_solr plugin to provide for disconnected background job processing
9
+ require_paths:
10
+ - lib
11
+ email: mbryzek@alum.mit.edu
12
+ homepage:
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: acts_as_background_solr
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Michael Bryzek
31
+ files:
32
+ - init.rb
33
+ - lib/acts_as_background_solr.rb
34
+ - lib/class_methods.rb
35
+ - lib/instance_methods.rb
36
+ - lib/solr_batch.rb
37
+ - README
38
+ test_files: []
39
+
40
+ rdoc_options: []
41
+
42
+ extra_rdoc_files:
43
+ - README
44
+ executables: []
45
+
46
+ extensions: []
47
+
48
+ requirements: []
49
+
50
+ dependencies: []
51
+