acts_as_background_solr 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README ADDED
@@ -0,0 +1,86 @@
1
+ = acts_as_background_solr Rails plugin
2
+ This plugin extends the functionality of the acts_as_solr plugin to
3
+ provide for a disconnected background job that synchronizes data with
4
+ Solr in batch. acts_as_solr works by sending changes to Solr for each
5
+ object immediately following any change. While this is nice as changes
6
+ are immediately viewable, it has a few drawbacks:
7
+
8
+ * Invoking commit on solr requires a new searcher to be opened which
9
+ is slow
10
+
11
+ * There is no way to keep track of an object that was saved when
12
+ notification to solr failed (or when the database transaction
13
+ rolled back)
14
+
15
+ Acts as background solr extends the acts_as_solr plugin to focus on
16
+ background processing.
17
+
18
+ There is one other major change. Acts as solr calls Model.find for
19
+ each result from the result set. Acts as background solr will
20
+ reconstitute your objects from the attributes stored in solr
21
+ completely avoiding any required database hits when searching against
22
+ solr. This requires that you modify schema.xml to store the fields you
23
+ are indexing.
24
+
25
+ == Installation
26
+ Use this in place of acts_as_solr in your models, e.g.
27
+
28
+ acts_as_background_solr
29
+
30
+ The options :background, :if and :auto_commit will be automatically
31
+ overridden.
32
+
33
+ Each model can track changes in one of two ways:
34
+
35
+ * Default: Explicitly log changes from the model using listeners
36
+
37
+ * Database triggers: You can instead use db triggers on your model
38
+ tables to track changes. If you use this method, set the option
39
+ :db_triggers => true
40
+
41
+ Example:
42
+
43
+ class User < ActiveRecord::Base
44
+
45
+ acts_as_background_solr :additional_fields => [:first_name, :last_name], :exclude_fields => ['encrypted_password'], :db_triggers => true
46
+
47
+ end
48
+
49
+ This plugin depends on the following table structure (this is written
50
+ for postgresql):
51
+
52
+ create sequence solr_sync_records_seq start with 1;
53
+ create table solr_sync_records (
54
+ id integer constraint solr_sync_records_id_pk primary key default nextval('solr_sync_records_seq'),
55
+ model varchar(50) not null,
56
+ model_id integer not null,
57
+ created_at timestamp default now() not null
58
+ );
59
+
60
+ -- common access path
61
+ create index solr_sync_records_model_id_idx on solr_sync_records(model, model_id);
62
+
63
+ To update the actual data stored in Solr, you need to invoke the
64
+ following method:
65
+
66
+ SolrBatch.process_all
67
+
68
+ We're using openwfe to schedule this job to run every few minutes, but
69
+ any scheduler should work.
70
+
71
+ This method updates records in bulk, issuing a single commit when
72
+ records are updated. The current algorithm updates up to 5,000 records
73
+ per model in a single call to this method. The way this works is each
74
+ call to SolrBatch.process_all has a default batch size of 500. Each
75
+ call will attempt up to 10 iterations, w/ each iteration updating up
76
+ to 500 records.
77
+
78
+ If you want to change the batch size, provide your own implementation
79
+ of SolrBatch.process_all
80
+
81
+ == Authors
82
+ Michael Bryzek
83
+ mbryzek<at>alum.mit.edu
84
+
85
+ == Release Information
86
+ Released under the MIT license.
data/init.rb ADDED
@@ -0,0 +1,21 @@
1
+ # Copyright (c) 2007 Michael Bryzek
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in all
11
+ # copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ # SOFTWARE.
20
+
21
+ require 'acts_as_background_solr'
@@ -0,0 +1,177 @@
1
+ # Copyright (c) 2007 Michael Bryzek
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in all
11
+ # copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ # SOFTWARE.
20
+
21
+ require File.dirname(__FILE__) + '/solr_batch'
22
+ require File.dirname(__FILE__) + '/class_methods'
23
+ require File.dirname(__FILE__) + '/instance_methods'
24
+
25
+ # Example:
26
+ # class User < CachedModel
27
+ # acts_as_background_solr :additional_fields => [:first_name, :last_name], :exclude_fields => ['encrypted_password'], :db_triggers => true
28
+ # end
29
+
30
module ActsAsBackgroundSolr
  # Macro module that is extended onto ActiveRecord::Base. It provides the
  # +acts_as_background_solr+ declaration plus the private helpers used to
  # turn stored Solr field values back into Ruby values.
  module ActsMethods

    # Declares the calling model as Solr-indexed with background
    # synchronization.
    #
    # options::      passed to +acts_as_solr+ after :auto_commit, :if and
    #                :background have been overwritten here. When
    #                :db_triggers is not set, after_save / after_destroy
    #                callbacks are registered to flag changed records.
    # solr_options:: passed to +acts_as_solr+ unchanged.
    #
    # Calling this also defines +parse_query+ and +parse_results+ directly
    # on the calling class.
    def acts_as_background_solr(options={}, solr_options={})
      extend ClassMethods
      include InstanceMethods

      options[:auto_commit] = false
      options[:if] = true
      options[:background] = false

      acts_as_solr(options, solr_options)

      unless options[:db_triggers]
        ## No database triggers are monitoring this table, so flag
        ## changed records from the model callbacks instead.
        after_save :solr_flag_record
        after_destroy :solr_flag_record
      end

      ## Override implementations of acts_as_solr/parse_* - there were
      ## no easy hooks to override how to prevent the database queries
      ## from occurring.

      # Builds and executes the Solr request for a search.
      #
      # query::   query string; nil makes the method return nil immediately.
      # options:: only the keys listed in valid_options are accepted; any
      #           other key raises a RuntimeError.
      # models::  string appended after the rewritten query.
      #
      # Any error raised while building or executing the request is
      # re-raised as a RuntimeError with a "There was a problem executing
      # your search" message.
      def self.parse_query(query=nil, options={}, models=nil)
        valid_options = [:offset, :limit, :facets, :models, :results_format, :order, :scores, :operator]
        return if query.nil?
        unknown_options = options.keys - valid_options
        raise "Invalid parameters: #{unknown_options.join(',')}" unless unknown_options.empty?
        begin
          ActsAsSolr::Deprecation.validate_query(options)
          query_options = {
            :start => options[:offset],
            :rows => options[:limit],
            :operator => options[:operator]
          }

          # first steps on the facet parameter processing
          facets = options[:facets]
          if facets
            facet_options = { :limit => -1 } # TODO: make this configurable
            facet_options[:sort] = :count if facets[:sort]
            facet_options[:mincount] = (facets[:zeros] == false) ? 1 : 0
            facet_options[:fields] = facets[:fields].collect { |k| "#{k}_facet" } if facets[:fields]
            query_options[:facets] = facet_options

            # Non-destructive sub: the caller's strings are left untouched
            # (and may be frozen); an entry without a colon is passed
            # through unchanged.
            if facets[:browse]
              browse = facets[:browse].collect { |k| k.sub(/ *: */, "_facet:") }
              query_options[:filter_queries] = replace_types(browse)
            end
            if facets[:query]
              facet_queries = facets[:query].collect { |k| k.sub(/ *: */, "_t:") }
              facet_options[:queries] = replace_types(facet_queries)
            end
          end

          query_options[:field_list] = ['*', 'score']
          query = "(#{query.gsub(/ *: */,"_t:")}) #{models}"
          query.gsub!('pk_t','pk_i')
          ## Automatically filter to this model
          query = "type_t:#{self} AND #{query}" unless query.match(/type_t:/)
          logger.debug("SOLR QUERY: #{query}")
          query_options[:query] = replace_types([query])[0] # TODO adjust replace_types to work with String or Array

          if options[:order]
            # TODO: set the sort parameter instead of the old ;order. style.
            order = options[:order].split(/\s*,\s*/).collect { |e| e.gsub(/\s+/,'_t ') }.join(',')
            query_options[:query] << ';' << replace_types([order], false)[0]
          end

          ActsAsSolr::Post.execute(Solr::Request::Standard.new(query_options))
        rescue
          raise "There was a problem executing your search: #{$!}"
        end
      end

      ## Override parse_results to reconstruct objects from the actual
      ## query results instead of having to hit the database.
      #
      # solr_data:: response object; +total+, +docs+, +max_score+ and
      #             +data+ are read from it.
      # options::   :facets and :scores are honoured; a :format other than
      #             :objects defers to the inherited implementation.
      #
      # Returns an ActsAsSolr::SearchResults.
      def self.parse_results(solr_data, options = {})
        results = { :docs => [], :total => 0 }
        configuration = { :format => :objects }
        results.update(:facets => {'facet_fields' => []}) if options[:facets]
        return ActsAsSolr::SearchResults.new(results) if solr_data.total == 0
        configuration.update(options) if options.is_a?(Hash)
        return super unless configuration[:format] == :objects

        result = solr_data.docs.collect do |doc|
          object = self.send(:new)
          doc.each do |k, v|
            next if k == "id"
            if k == 'pk_i'
              ## TODO: Not sure if there is a potential for more than one value
              # Guard the scalar case: Integer#[] would return a single bit.
              object.id = v.is_a?(Array) ? v[0] : v
            else
              # Strip the trailing "_<suffix>" to recover the attribute name.
              field = k.sub(/\_[^_]+$/, '')
              setter = "#{field}="
              object.send(setter, convert_value(object.class, field, v)) if object.respond_to?(setter)
            end
          end
          object
        end

        # Previous database-backed implementation, kept for reference:
        # ids = solr_data.docs.collect {|doc| doc["#{solr_configuration[:primary_key_field]}"]}.flatten
        # conditions = [ "#{self.table_name}.#{primary_key} in (?)", ids ]
        # result = configuration[:format] == :objects ? reorder(self.find(:all, :conditions => conditions), ids) : ids
        add_scores(result, solr_data) if options[:scores]

        results.update(:facets => solr_data.data['facet_counts']) if options[:facets]
        results.update({:docs => result, :total => solr_data.total, :max_score => solr_data.max_score})
        ActsAsSolr::SearchResults.new(results)
      end
    end

    private

    # Converts a value read from a Solr document into the Ruby type implied
    # by the model column named +field+.
    #
    # klass:: object responding to +columns_hash+ (field name => column).
    # field:: attribute name used as the +columns_hash+ key.
    # value:: nil, a scalar, or an Array. A one-element Array is unwrapped;
    #         Arrays of any other size are returned as-is.
    #
    # Non-String values, and Strings with no matching column or an
    # unrecognised column type, are returned unchanged.
    def convert_value(klass, field, value)
      return nil unless value
      if value.is_a?(Array)
        return value if value.size != 1
        value = value[0]
      end
      return value unless value.is_a?(String)
      column = klass.columns_hash[field]
      return value.to_str unless column ## Occurs for included fields

      # Dispatch on the leading word of the raw SQL type so that
      # "numeric(10,2)" or "timestamp without time zone" are recognised as
      # well as the bare type names.
      case column.sql_type.to_s.downcase[/\A[a-z]+/]
      when 'integer', 'int', 'bigint', 'smallint' then value.to_i
      when 'numeric', 'decimal' then BigDecimal(value)
      when 'float', 'double', 'real' then value.to_f
      when 'date' then Date.parse(value)
      when 'datetime', 'timestamp', 'time' then parse_datetime(value)
      when 'boolean', 'bool' then %w[true t 1].include?(value.downcase)
      else value.to_str
      end
    end

    # Parses +value+ with Time.parse; a parse failure is re-raised as a
    # RuntimeError that includes the offending string.
    def parse_datetime(value)
      Time.parse(value)
    rescue StandardError => e
      raise "Error parsing date \"#{value}\": #{e}"
    end

  end
end
175
+
176
+ # reopen ActiveRecord and include the acts_as_background_solr method
177
+ ActiveRecord::Base.extend ActsAsBackgroundSolr::ActsMethods
@@ -0,0 +1,34 @@
1
+ # Copyright (c) 2007 Michael Bryzek
2
+ #
3
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ # of this software and associated documentation files (the "Software"), to deal
5
+ # in the Software without restriction, including without limitation the rights
6
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ # copies of the Software, and to permit persons to whom the Software is
8
+ # furnished to do so, subject to the following conditions:
9
+ #
10
+ # The above copyright notice and this permission notice shall be included in all
11
+ # copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ # SOFTWARE.
20
+
21
module ActsAsBackgroundSolr #:nodoc:

  # Class-level entry points that hand the receiving class to SolrBatch.
  module ClassMethods

    # Asks SolrBatch to synchronize this class's pending records, passing
    # +batch_size+ (default 500) as the :batch_size option.
    def solr_sync_pending_records(batch_size=500)
      sync_options = { :batch_size => batch_size }
      SolrBatch.sync_pending_records(self, sync_options)
    end

    # Asks SolrBatch to requeue every record of this class.
    def solr_requeue_all
      SolrBatch.requeue_all(self)
    end

  end

end
@@ -0,0 +1,11 @@
1
module ActsAsBackgroundSolr #:nodoc:

  # Instance-level hook mixed into models by acts_as_background_solr.
  module InstanceMethods

    # Hands this record's class and id to SolrBatch.flag_record.
    def solr_flag_record
      record_class = self.class
      SolrBatch.flag_record(record_class, id)
    end

  end

end
data/lib/solr_batch.rb ADDED
@@ -0,0 +1,136 @@
1
# Maintains the solr_sync_records queue table and pushes the queued changes
# to Solr in batches. Every method is defined on the class itself.
class SolrBatch

  ## This procedure should be run in the background with whatever job
  ## scheduler you are comfortable with.
  #
  # Synchronizes every model that has rows in solr_sync_records, then
  # commits and optimizes once, through the first model processed.
  def self.process_all
    models = ActiveRecord::Base.connection.select_values("select distinct model from solr_sync_records")
    if models.size == 0
      RAILS_DEFAULT_LOGGER.debug("SolrBatch.process_all: No records pending synchronization")
      return
    end

    first_klass = nil
    models.each do |model|
      klass = resolve_model(model)
      first_klass ||= klass
      # Per-model commit/optimize is switched off; both happen once below.
      sync_pending_records(klass, { :do_commit => false, :do_optimize => false })
    end

    ## Commit/optimize after ALL records have been processed
    first_klass.send(:solr_commit)
    first_klass.send(:solr_optimize)
  end

  ## Flags an individual record as pending synchronization with Solr
  def self.flag_record(model_name, model_id)
    sql = 'insert into solr_sync_records (model, model_id) values (?, ?)'
    ActiveRecord::Base.connection.execute(sanitize_sql_array([sql, model_name, model_id]))
  end

  ## Marks all records for this model as pending synchronization with Solr.
  ## Rows already present in the queue for this model are not duplicated.
  def self.requeue_all(ar_model)
    sql = <<-endsql
      insert into solr_sync_records
      (model, model_id)
      select ?, id from #{ar_model.send(:table_name)}
      EXCEPT
      select ?, model_id from solr_sync_records where model = ?
    endsql
    ActiveRecord::Base.connection.execute(sanitize_sql_array([sql, ar_model, ar_model, ar_model]))
  end

  # Ex: from within ActiveRecord Model, call:
  #   SolrBatch.sync_pending_records(self, { :batch_size => 500 } );
  #
  # Pushes pending records for +ar_model+ to Solr in batches.
  #
  # opts::      :batch_size  - records per batch (500 when nil or < 1)
  #             :do_commit   - commit when done (default true)
  #             :do_optimize - optimize when done (default true)
  #             Any other key raises a RuntimeError. The caller's hash is
  #             not modified.
  # iteration:: starting iteration number; batches keep being fetched while
  #             a batch comes back full and the counter is below 10.
  #
  # Commit and optimize run only if at least one batch was pushed.
  def self.sync_pending_records(ar_model, opts = {}, iteration=1)
    opts = opts.dup
    batch_size = opts.delete(:batch_size)
    batch_size = 500 if batch_size.nil? || batch_size < 1
    # An explicit false must survive, so only nil falls back to the default.
    do_commit = opts.delete(:do_commit)
    do_commit = true if do_commit.nil?
    do_optimize = opts.delete(:do_optimize)
    do_optimize = true if do_optimize.nil?
    raise "Invalid option keys: #{opts.keys}" unless opts.keys.size == 0

    RAILS_DEFAULT_LOGGER.debug("SolrBatch.sync_pending_records(#{ar_model})")

    pushed_any = false
    loop do
      items = get_pending_records(ar_model, batch_size)
      if items.size == 0
        solr_log(ar_model, 'No records to process')
        break
      end

      push_batch(ar_model, items)
      pushed_any = true
      solr_log(ar_model, "#{items.size} items have been batch added to index.")

      ## A full batch means there may be more records of this class
      break unless items.size >= batch_size && iteration < 10
      iteration += 1
    end

    return unless pushed_any
    ar_model.send(:solr_commit) if do_commit
    ar_model.send(:solr_optimize) if do_optimize
  end

  ## The methods below are internal helpers. They remain publicly callable:
  ## a bare +private+ does not apply to methods defined on the class itself.

  # Resolves a model name read from the queue table to its constant by
  # walking the "::"-separated parts, so the stored string is never
  # evaluated as code. Raises NameError for an unknown constant.
  def self.resolve_model(model_name)
    parts = model_name.to_s.split('::').reject { |part| part.empty? }
    parts.inject(Object) { |scope, part| scope.const_get(part) }
  end

  # Sends one batch of pending rows to Solr and then removes the processed
  # rows from the queue.
  def self.push_batch(ar_model, items)
    delete_batch = []
    add_batch = []
    processed = {}
    items.each do |o|
      # NOTE(review): for rows flagged as deleted every ar.* column is NULL
      # in get_pending_records' query, so solr_id is computed on an object
      # without its own attributes - confirm it still yields the id that is
      # stored in the index.
      solr_id = o.send(:solr_id)
      next if processed.has_key?(solr_id) ## Duplicates are possible in the log table
      processed[solr_id] = true
      if o.solr_sync_record_deleted_p == 't'
        delete_batch << solr_id
      else
        add_batch << o.to_solr_doc
      end
    end

    # We delete in bulk with a single query
    ActsAsSolr::Post.execute(Solr::Request::Delete.new(:query => id_query(delete_batch))) if delete_batch.size > 0
    ar_model.send(:solr_add, add_batch) if add_batch.size > 0
    delete_from_queue(ar_model, items.last.solr_sync_records_log_id)
  end

  # Writes a debug line prefixed with the model's name.
  def self.solr_log(ar_model, message)
    RAILS_DEFAULT_LOGGER.debug("Solr Sync for #{ar_model.name}: #{message}")
  end

  ## Removes all records from the solr pending sync queue for this
  ## model up to the specified sync id
  def self.delete_from_queue(ar_model, last_sync_id)
    sql = 'delete from solr_sync_records where model = ? and id <= ?'
    ActiveRecord::Base.connection.execute(sanitize_sql_array([sql, ar_model, last_sync_id]))
  end

  ## Turns a list of IDs into a single solr query matching all those
  ## IDs. Every colon inside an id is backslash-escaped. Returns nil for
  ## an empty list.
  def self.id_query(solr_ids)
    return nil if solr_ids.empty?
    solr_ids.collect { |id| "id:" + id.gsub(':') { '\:' } }.join(' OR ')
  end

  ## Returns an array of records for the specified active record model
  ## that are pending synchronization, oldest queue entry first. Each
  ## record also carries solr_sync_records_log_id,
  ## solr_sync_records_model_id and solr_sync_record_deleted_p ('t' when
  ## the model row no longer exists).
  def self.get_pending_records(ar_model, batch_size)
    sql = <<-endsql
      select log.id as solr_sync_records_log_id, log.model_id as solr_sync_records_model_id,
             case when ar.id is null then 't' else 'f' end as solr_sync_record_deleted_p,
             ar.*
        from solr_sync_records log
        left outer join #{ar_model.send('table_name')} ar on ar.id = log.model_id
       where log.model = ?
       order by log.id asc
       limit #{batch_size.to_i}
    endsql

    # Restricting to this model's queue rows keeps other models' entries
    # from being joined against this model's table.
    ar_model.send(:find_by_sql, sanitize_sql_array([sql, ar_model]))
  end

  # Substitutes each '?' in the statement (first element of +ary+) with the
  # corresponding remaining element, quoted through the connection. Returns
  # a new string; neither +ary+ nor the statement is modified. Placeholders
  # beyond the supplied values are left as '?'.
  def self.sanitize_sql_array(ary)
    statement, *values = ary
    connection = ActiveRecord::Base.connection
    # Block form: the quoted value is inserted verbatim (no back-reference
    # interpretation) and already-inserted text is never rescanned.
    statement.gsub('?') do
      values.empty? ? '?' : "'#{connection.quote_string(values.shift.to_s)}'"
    end
  end

end
metadata ADDED
@@ -0,0 +1,51 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.4
3
+ specification_version: 1
4
+ name: acts_as_background_solr
5
+ version: !ruby/object:Gem::Version
6
+ version: "0.5"
7
+ date: 2007-09-09 00:00:00 -04:00
8
+ summary: Extends the functionality of the acts_as_solr plugin to provide for disconnected background job processing
9
+ require_paths:
10
+ - lib
11
+ email: mbryzek@alum.mit.edu
12
+ homepage:
13
+ rubyforge_project:
14
+ description:
15
+ autorequire: acts_as_background_solr
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Michael Bryzek
31
+ files:
32
+ - init.rb
33
+ - lib/acts_as_background_solr.rb
34
+ - lib/class_methods.rb
35
+ - lib/instance_methods.rb
36
+ - lib/solr_batch.rb
37
+ - README
38
+ test_files: []
39
+
40
+ rdoc_options: []
41
+
42
+ extra_rdoc_files:
43
+ - README
44
+ executables: []
45
+
46
+ extensions: []
47
+
48
+ requirements: []
49
+
50
+ dependencies: []
51
+