acts_as_xapian 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +148 -0
- data/Rakefile +46 -0
- data/VERSION +1 -0
- data/acts_as_xapian.gemspec +70 -0
- data/generators/acts_as_xapian/USAGE +1 -0
- data/generators/acts_as_xapian/acts_as_xapian_generator.rb +14 -0
- data/generators/acts_as_xapian/templates/migrations/migration.rb +14 -0
- data/generators/acts_as_xapian/templates/tasks/xapian.rake +42 -0
- data/lib/acts_as_xapian/base.rb +215 -0
- data/lib/acts_as_xapian/core_ext/array.rb +24 -0
- data/lib/acts_as_xapian/index.rb +45 -0
- data/lib/acts_as_xapian/query_base.rb +159 -0
- data/lib/acts_as_xapian/readable_index.rb +117 -0
- data/lib/acts_as_xapian/search.rb +67 -0
- data/lib/acts_as_xapian/similar.rb +61 -0
- data/lib/acts_as_xapian/writeable_index.rb +152 -0
- data/lib/acts_as_xapian.rb +5 -0
- data/spec/acts_as_xapian_spec.rb +7 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- metadata +111 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
module ActsAsXapian
  # State shared by the readable and writeable indexes: the model
  # registrations collected through +init+, the settings read from
  # config/xapian.yml, the database path and the stemmer.
  class Index
    @@db_path = nil
    @@init_values = []

    cattr_reader :config, :db_path, :stemmer

    class <<self
      ######################################################################
      # Initialisation

      # Records a model class and its options so they are available later,
      # when the database is actually opened. A nil classname is ignored.
      def init(classname = nil, options = nil)
        @@init_values.push([classname, options]) unless classname.nil?
      end

      # Reads the config file (if any) and sets up the path to the database
      # we'll be using. Does nothing once the database path has been set.
      #
      # Raises a RuntimeError when the Rails environment cannot be determined.
      def prepare_environment
        return unless @@db_path.nil?

        # barf if we can't figure out the environment; guard the constant so
        # a missing RAILS_ENV produces the helpful message below rather than
        # a NameError
        environment = ENV['RAILS_ENV'] || (defined?(RAILS_ENV) ? RAILS_ENV : nil)
        raise "Set RAILS_ENV, so acts_as_xapian can find the right Xapian database" unless environment

        # check for a config file; an empty file or one without a section
        # for this environment falls back to an empty configuration
        config_file = File.join(RAILS_ROOT, 'config', 'xapian.yml')
        settings = File.exist?(config_file) ? YAML.load_file(config_file) : nil
        @@config = (settings.is_a?(Hash) && settings[environment]) || {}

        # figure out where the DBs should go
        if @@config['base_db_path']
          db_parent_path = File.join(RAILS_ROOT, @@config['base_db_path'])
        else
          db_parent_path = File.join(RAILS_ROOT, 'db', 'xapiandbs')
        end

        # make the directory for the xapian databases to go in
        Dir.mkdir(db_parent_path) unless File.exist?(db_parent_path)

        @@db_path = File.join(db_parent_path, environment)

        # make some things that don't depend on the db
        # XXX this gets made once for each acts_as_xapian. Oh well.
        @@stemmer = Xapian::Stem.new('english')
      end
    end
  end
end
|
@@ -0,0 +1,159 @@
|
|
1
|
+
module ActsAsXapian
  # Base class for Search and Similar below.
  #
  # Subclasses are expected to call +initialize_db+, assign +query+, and then
  # call +initialize_query+; they must also provide +log_description+, which
  # +results+ uses when a logger is configured.
  class QueryBase
    attr_accessor :offset, :limit, :query, :query_models, :runtime, :cached_results
    @@unlimited = 1000000

    # Return a description of the query
    def description
      self.query.description
    end

    # Returns the mset for the query. The result is memoised; pass
    # +reload+ = true to run the query again.
    #
    # An IOError mentioning DatabaseModifiedError is retried once per call
    # after resetting the enquire session; any other error, or a second
    # failure, is re-raised.
    def matches(reload = false)
      return @matches unless @matches.nil? || reload

      # Per-call flag: a retry used by an earlier call (or by a subclass
      # constructor) must not disable the retry for this call.
      retried = false
      begin
        self.runtime += Benchmark::realtime do
          # If using find_options conditions have Xapian return the entire match set
          # TODO Revisit. This is extremely inefficient for large indices
          @matches = @index.enquire.mset(@postpone_limit ? 0 : @offset, @postpone_limit ? @@unlimited : @limit, @check_at_least)
        end
        @matches
      rescue IOError => e
        if !retried && /DatabaseModifiedError/.match(e.message.to_s)
          retried = true
          @index.reset_enquire!
          initialize_enquire
          retry
        end
        raise
      end
    end

    # Estimate total number of results
    # Note: Unreliable if using find_options with conditions or joins
    def matches_estimated
      @matches_estimated || self.matches.matches_estimated
    end

    # Return query string with spelling correction, or nil when the query
    # parser has no correction to offer
    def spelling_correction
      correction = @index.query_parser.get_corrected_query_string
      correction.empty? ? nil : correction
    end

    # Return array of models found.
    #
    # Each element is a hash with :data, :percent, :weight and
    # :collapse_count taken from the Xapian match, plus :model holding the
    # record looked up for it (nil when the lookup did not return that id).
    # The array is cached in +cached_results+.
    def results
      # If they've already pulled out the results, just return them.
      return self.cached_results unless self.cached_results.nil?

      docs = nil
      self.runtime += Benchmark::realtime do
        # Pull out all the results
        docs = self.matches.matches.map {|doc| {:data => doc.document.data, :percent => doc.percent, :weight => doc.weight, :collapse_count => doc.collapse_count} }
      end

      # Log time taken, excluding database lookups below which will be displayed separately by ActiveRecord
      ActiveRecord::Base.logger.debug(" Xapian query (%.5fs) #{self.log_description.gsub('%','%%')}" % self.runtime) if ActiveRecord::Base.logger

      # Group the ids by the model they belong to
      lhash = group_ids_by_model(docs)

      if @postpone_limit
        # Ask the database which of the matched records satisfy find_options,
        # fetching only the primary key
        found = lhash.map do |(class_name, ids)|
          model = class_name.constantize # constantize is expensive do once
          model.with_xapian_scope(ids) { model.find(:all, @find_options.merge(:select => "#{model.table_name}.#{model.primary_key}")) }.map {|m| m.xapian_document_term }
        end.flatten

        self.runtime += Benchmark::realtime do
          found = found.inject({}) {|s,i| s[i] = true; s } # hash key searching is MUCH faster than an array sequential scan
          docs.delete_if {|doc| !found.delete(doc[:data]) }

          @matches_estimated = docs.size

          # offset/limit were not given to Xapian in this mode, apply them now
          docs = docs[@offset,@limit] || []

          lhash = group_ids_by_model(docs)
        end
      end

      # for each class, look up the associated ids
      chash = lhash.inject({}) do |out, (class_name, ids)|
        model = class_name.constantize # constantize is expensive do once
        found = model.with_xapian_scope(ids) { model.find(:all, @find_options) }
        out[class_name] = found.inject({}) {|s,f| s[f.id] = f; s }
        out
      end

      # add the model to each doc
      docs.each do |doc|
        model_name, id = doc[:data].split('-')
        doc[:model] = chash[model_name][id.to_i]
      end

      self.cached_results = docs
    end

    protected

    # Resets the runtime counter and fetches the index for +models+.
    # Raises a RuntimeError when no index is returned.
    def initialize_db(models)
      self.runtime = 0.0

      @index = ReadableIndex.index_for(models)

      raise "ActsAsXapian::ReadableIndex not initialized" if @index.nil?
    end

    # Set self.query before calling this
    #
    # Reads :offset, :limit, :check_at_least, :sort_by_prefix,
    # :sort_by_ascending, :collapse_by_prefix and :find_options from
    # +options+, clears any cached results and configures the enquire object.
    def initialize_query(options)
      self.runtime += Benchmark::realtime do
        @offset = options[:offset].to_i
        @limit = (options[:limit] || @@unlimited).to_i
        @check_at_least = (options[:check_at_least] || 100).to_i
        @sort_by_prefix = options[:sort_by_prefix]
        @sort_by_ascending = options[:sort_by_ascending].nil? ? true : options[:sort_by_ascending]
        @collapse_by_prefix = options[:collapse_by_prefix]
        @find_options = options[:find_options]
        # Limits are applied after the database lookup when conditions or
        # joins could filter out some of the Xapian matches
        @postpone_limit = !(@find_options.blank? || (@find_options[:conditions].blank? && @find_options[:joins].blank?))

        self.cached_results = nil
      end

      initialize_enquire
    end

    # Pushes the query, sort order and collapse key onto the index's enquire
    # object. Raises a RuntimeError for an unknown sort or collapse prefix.
    # Always returns true.
    def initialize_enquire
      self.runtime += Benchmark::realtime do
        @index.enquire.query = self.query

        if @sort_by_prefix.nil?
          @index.enquire.sort_by_relevance!
        else
          value = @index.values_by_prefix[@sort_by_prefix]
          raise "couldn't find prefix '#{@sort_by_prefix}'" if value.nil?
          # Xapian has inverted the meaning of ascending order to handle relevence sorting
          # "keys which sort higher by string compare are better"
          @index.enquire.sort_by_value_then_relevance!(value, !@sort_by_ascending)
        end

        if @collapse_by_prefix.nil?
          @index.enquire.collapse_key = Xapian.BAD_VALUENO
        else
          value = @index.values_by_prefix[@collapse_by_prefix]
          raise "couldn't find prefix '#{@collapse_by_prefix}'" if value.nil?
          @index.enquire.collapse_key = value
        end
      end
      true
    end

    # Builds a hash of model name => array of id strings from result hashes,
    # splitting each doc's :data on '-'.
    def group_ids_by_model(docs)
      grouped = {}
      docs.each do |doc|
        model_name, id = doc[:data].split('-')
        (grouped[model_name] ||= []) << id
      end
      grouped
    end
  end
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
module ActsAsXapian
  # Read-only view of the Xapian database together with a query parser
  # configured from the xapian_options of a given set of models.
  class ReadableIndex < Index
    @@available_indicies = {}

    attr_reader :enquire, :query_parser, :values_by_prefix

    # Takes an array of model classes and returns an index object to be
    # used for searching across the given models
    #
    # Prevents query parser interaction across multiple models unless
    # performing a multi model search
    #
    # Indexes are cached per sorted list of model names; a cached index gets
    # a fresh enquire session before it is returned.
    def self.index_for(models)
      index_key = models.map {|m| m.to_s }.sort.join('---')
      index = @@available_indicies[index_key]
      if index
        index.reset_enquire!
      else
        index = @@available_indicies[index_key] = self.new(models)
      end
      index
    end

    # Opens the db for reading and builds the query parser
    #
    # Raises NoXapianRubyBindingsError when the bindings are unavailable and
    # a RuntimeError when no model has registered itself or the database
    # cannot be opened.
    def initialize(models)
      raise NoXapianRubyBindingsError.new("Xapian Ruby bindings not installed") unless ActsAsXapian.bindings_available
      raise "acts_as_xapian hasn't been called in any models" if @@init_values.empty?

      self.class.prepare_environment

      # basic Xapian objects
      begin
        @db = Xapian::Database.new(@@db_path)
        @enquire = Xapian::Enquire.new(@db)
      rescue IOError
        raise "Xapian database not opened; have you built it with rake xapian:rebuild_index ?"
      end

      init_query_parser(models)
    end

    # Creates a new search session
    def reset_enquire!
      @db.reopen # This grabs the latest db updates
      @enquire = Xapian::Enquire.new(@db)
    rescue IOError
      raise "Xapian database not opened; have you built it with rake xapian:rebuild_index ?"
    end

    protected

    # Make a new query parser and tell it about the terms and values that
    # each model declares in its xapian_options, checking that the
    # declarations are consistent between models.
    def init_query_parser(models)
      # for queries
      @query_parser = Xapian::QueryParser.new
      @query_parser.stemmer = @@stemmer
      @query_parser.stemming_strategy = Xapian::QueryParser::STEM_SOME
      @query_parser.database = @db
      @query_parser.default_op = Xapian::Query::OP_AND

      @terms_by_capital = {}
      @values_by_number = {}
      @values_by_prefix = {}
      @value_ranges_store = []

      # The model/id prefixes are the same for every model, register them once
      @query_parser.add_boolean_prefix("model", "M")
      @query_parser.add_boolean_prefix("modelid", "I")

      models.each do |klass|
        options = klass.xapian_options
        register_terms(options[:terms] || [])
        register_values(options[:values] || [])
      end

      @values_by_prefix.freeze # This can be read outside the instance. Make sure it can't be changed there
    end

    # Validates each term declaration (term[1] is the capital-letter code,
    # term[2] the prefix) and adds it to the query parser.
    def register_terms(terms)
      terms.each do |term|
        # \A and \z anchor the whole string; ^ and $ would accept a code
        # followed by further lines
        raise "Use up to 3 single capital letters for term code" unless term[1].match(/\A[A-Z]{1,3}\z/)
        raise "M and I are reserved for use as the model/id term" if term[1] == "M" || term[1] == "I"
        raise "model and modelid are reserved for use as the model/id prefixes" if term[2] == "model" || term[2] == "modelid"
        raise "Z is reserved for stemming terms" if term[1] == "Z"
        raise "Already have code '#{term[1]}' in another model but with different prefix '#{@terms_by_capital[term[1]]}'" if @terms_by_capital.key?(term[1]) && @terms_by_capital[term[1]] != term[2]
        @terms_by_capital[term[1]] = term[2]
        @query_parser.add_prefix(term[2], term[1])
      end
    end

    # Validates each value declaration (value[1] is the value index,
    # value[2] the prefix, value[3] the type) and adds a range processor for
    # every index not seen before. :number values are handled first.
    def register_values(values)
      numbers, others = values.partition {|i| i[3] == :number }
      (numbers + others).each do |value|
        # Integer rather than Fixnum: Fixnum no longer exists in current Ruby
        raise "Value index '#{value[1]}' must be an integer, is #{value[1].class}" unless value[1].is_a?(Integer)
        raise "Already have value index '#{value[1]}' in another model but with different prefix '#{@values_by_number[value[1]]}'" if @values_by_number.key?(value[1]) && @values_by_number[value[1]] != value[2]
        raise "Already have value prefix '#{value[2]}' in another model but with different index '#{@values_by_prefix[value[2]]}'" if value[3] == :number && @values_by_prefix.key?(value[2]) && @values_by_prefix[value[2]] != value[1]

        # date types are special, mark them so the first model they're seen for
        if !@values_by_number.key?(value[1])
          value_range = case value[3]
            when :date
              Xapian::DateValueRangeProcessor.new(value[1])
            when :string
              Xapian::StringValueRangeProcessor.new(value[1])
            when :number
              Xapian::NumberValueRangeProcessor.new(value[1],"#{value[2]}:",true)
            else
              raise "Unknown value type '#{value[3]}'"
            end

          @query_parser.add_valuerangeprocessor(value_range)

          # stop it being garbage collected, as
          # add_valuerangeprocessor ref is outside Ruby's GC
          @value_ranges_store.push(value_range)
        end

        @values_by_number[value[1]] = value[2]
        @values_by_prefix[value[2]] = value[1]
      end
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module ActsAsXapian
  # Runs a query string against the index and yields an array of hashes in
  # result order. Each hash carries the Rails object under :model alongside
  # relevancy details under the other keys.
  class Search < QueryBase
    attr_accessor :query_string

    # Query parser features enabled for every user supplied query string
    @@parse_query_flags = [
      Xapian::QueryParser::FLAG_BOOLEAN,
      Xapian::QueryParser::FLAG_PHRASE,
      Xapian::QueryParser::FLAG_LOVEHATE,
      Xapian::QueryParser::FLAG_WILDCARD,
      Xapian::QueryParser::FLAG_SPELLING_CORRECTION
    ].inject {|flags, flag| flags | flag }

    # Passing model_classes is more than a convenience: it guarantees the
    # classes are loaded, so acts_as_xapian has run on them and the query
    # parser knows their fields.
    #
    # model_classes - model classes to search within, e.g. [PublicBody,
    # User]. A single class is accepted, and classes may be named with
    # strings.
    # query_string - query typed by the user, with syntax much like Google Search
    #
    # options include
    # - :limit - limit the number of records returned
    # - :offset - start with this record number
    # - :check_at_least - used for total match estimates. Set higher for greater accuracy at the cost of slower queries. default: 100
    # - :sort_by_prefix - determines which data field to sort by. default: sort by relevance
    # - :sort_by_ascending - determines which direction to sort. default: true (ascending sort)
    # - :collapse_by_prefix - groups the return set by this prefix
    # - :find_options - These options are passed through to the active record find. Be careful if searching against multiple model classes.
    def initialize(model_classes, query_string, options = {})
      # Turn whatever was passed into an array of actual classes
      resolved_classes = Array(model_classes).map do |candidate|
        klass = candidate.instance_of?(String) ? candidate.constantize : candidate
        raise "pass in the model class itself, or a string containing its name" unless klass.instance_of?(Class)
        klass
      end

      self.initialize_db(resolved_classes)

      self.query_string = query_string

      # Only documents belonging to one of the requested models may match
      model_terms = resolved_classes.map {|mc| "M#{mc}" }
      restrict_to_models = Xapian::Query.new(Xapian::Query::OP_OR, model_terms)
      parsed_input = @index.query_parser.parse_query(self.query_string, @@parse_query_flags)
      self.query = Xapian::Query.new(Xapian::Query::OP_AND, restrict_to_models, parsed_input)

      # Hand over to the shared set-up in the base class
      self.initialize_query(options)
    end

    # Plain words from the query, leaving out operators and anything that
    # looks like a prefix, range or path. Handy for cheap highlighting with
    # TextHelper::highlight, and excerpt.
    def words_to_highlight
      cleaned = self.query_string.gsub(/[^\w:\.\/_]/i, " ")
      cleaned = cleaned.gsub(/\s+/, " ")
      # Keep only tokens without a :, . or / that are not boolean operators
      cleaned.split(" ").select {|word| !word.match(/(:|\.|\/)|^(AND|NOT|OR|XOR)$/) }
    end

    # Text for lines in log file
    def log_description
      "Search: #{self.query_string}"
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module ActsAsXapian
  # Search for models which contain the important terms taken from a specified
  # list of models. i.e. Use to find documents similar to one (or more)
  # documents, or use to refine searches.
  class Similar < QueryBase
    attr_accessor :query_models
    attr_accessor :important_terms

    # model_classes - model classes to search within, e.g. [PublicBody, User].
    # A single class is accepted as well, as in Search.
    # query_models - list of models you want to find things similar to
    # options - passed on to initialize_query
    def initialize(model_classes, query_models, options = {})
      # Accept a lone class the same way Search does
      model_classes = Array(model_classes)

      self.initialize_db(model_classes)

      self.runtime += Benchmark::realtime do
        # Case of an array, searching for models similar to those models in the array
        self.query_models = query_models

        # Find the documents by their unique term
        input_models_query = Xapian::Query.new(Xapian::Query::OP_OR, query_models.map {|m| "I#{m.xapian_document_term}" })

        # Local flag so that a retry here does not affect later retries
        # elsewhere on this object
        retried = false
        begin
          @index.enquire.query = input_models_query

          # Get set of relevant terms for those documents
          selection = Xapian::RSet.new()
          @index.enquire.mset(0, 100, 100).matches.each {|m| selection.add_document(m.docid) } # XXX so this whole method will only work with 100 docs

          # Bit weird that the function to make esets is part of the enquire
          # object. This explains what exactly it does, which is to exclude
          # terms in the existing query.
          # http://thread.gmane.org/gmane.comp.search.xapian.general/3673/focus=3681
          #
          # Do main search for them
          self.important_terms = @index.enquire.eset(40, selection).terms.map {|e| e.name }
        rescue IOError => e
          # Retry once after reopening when the database changed underneath us
          if !retried && /DatabaseModifiedError/.match(e.message.to_s)
            retried = true
            @index.reset_enquire!
            retry
          end
          raise
        end

        similar_query = Xapian::Query.new(Xapian::Query::OP_OR, self.important_terms)
        # Exclude original
        combined_query = Xapian::Query.new(Xapian::Query::OP_AND_NOT, similar_query, input_models_query)

        # Restrain to model classes
        model_query = Xapian::Query.new(Xapian::Query::OP_OR, model_classes.map {|mc| "M#{mc}" })
        self.query = Xapian::Query.new(Xapian::Query::OP_AND, model_query, combined_query)
      end

      # Call base class constructor
      self.initialize_query(options)
    end

    # Text for lines in log file
    def log_description
      "Similar: #{self.query_models}"
    end
  end
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
module ActsAsXapian
  # Class-level access to the writable Xapian database: applies queued
  # ActsAsXapianJob records and rebuilds the whole database.
  class WriteableIndex < Index
    @@writable_db = nil
    @@writable_suffix = nil

    # Marker files found in a Xapian database directory, one per on-disk
    # backend. Used to make sure only real Xapian databases get deleted.
    XAPIAN_BACKEND_MARKERS = %w[iamflint iamchert iambrass iamglass]

    cattr_reader :term_generator

    class << self
      # Forwards to delete_document on the writable database
      def delete_document(*args)
        @@writable_db.delete_document(*args)
      end

      # Forwards to replace_document on the writable database
      def replace_document(*args)
        @@writable_db.replace_document(*args)
      end

      # Opens (creating if needed) the writable database at db_path + suffix
      # and sets up the term generator. Does nothing when a writable database
      # is already open.
      #
      # Raises NoXapianRubyBindingsError when the bindings are unavailable and
      # a RuntimeError when no model has registered itself.
      def writable_init(suffix = "")
        raise NoXapianRubyBindingsError.new("Xapian Ruby bindings not installed") unless ActsAsXapian.bindings_available
        raise "acts_as_xapian hasn't been called in any models" if @@init_values.empty?

        # if DB is not nil, then we're already initialised, so don't do it again
        return unless @@writable_db.nil?

        prepare_environment

        new_path = @@db_path + suffix
        # NOTE(review): @@writable_suffix is only assigned below, after the
        # database has been opened, so with the early return above this check
        # looks unreachable in normal use - confirm before relying on it
        raise "writable_suffix/suffix inconsistency" if @@writable_suffix && @@writable_suffix != suffix

        # for indexing
        @@writable_db = Xapian::WritableDatabase.new(new_path, Xapian::DB_CREATE_OR_OPEN)
        @@term_generator = Xapian::TermGenerator.new()
        @@term_generator.set_flags(Xapian::TermGenerator::FLAG_SPELLING, 0)
        @@term_generator.database = @@writable_db
        @@term_generator.stemmer = @@stemmer
        @@writable_suffix = suffix
      end

      ######################################################################
      # Index

      # Update index with any changes needed, call this offline. Only call it
      # from a script that exits - otherwise Xapian's writable database won't
      # flush your changes. Specifying flush will reduce performance, but
      # make sure that each index update is definitely saved to disk before
      # logging in the database that it has been.
      #
      # Failures of individual jobs are written to STDERR and do not stop the
      # remaining jobs from being processed.
      def update_index(flush = false, verbose = false)
        # puts "start of self.update_index" if verbose

        # Before calling writable_init we have to make sure every model class has been initialized.
        # i.e. has had its class code loaded, so acts_as_xapian has been called inside it, and
        # we have the info from acts_as_xapian.
        model_classes = ActsAsXapianJob.find(:all, :select => 'model', :group => 'model').map {|a| a.model.constantize }
        # If there are no models in the queue, then nothing to do
        return if model_classes.empty?

        self.writable_init

        ids_to_refresh = ActsAsXapianJob.find(:all, :select => 'id').map { |i| i.id }
        ids_to_refresh.each do |id|
          begin
            ActsAsXapianJob.transaction do
              job = ActsAsXapianJob.find(id, :lock =>true)
              puts "ActsAsXapian::WriteableIndex.update_index #{job.action} #{job.model} #{job.model_id.to_s}" if verbose
              begin
                case job.action
                when 'update'
                  # XXX Index functions may reference other models, so we could eager load here too?
                  model = job.model.constantize.find(job.model_id) # :include => cls.constantize.xapian_options[:include]
                  model.xapian_index
                when 'destroy'
                  # Make dummy model with right id, just for destruction
                  model = job.model.constantize.new
                  model.id = job.model_id
                  model.xapian_destroy
                else
                  raise "unknown ActsAsXapianJob action '#{job.action}'"
                end
              rescue ActiveRecord::RecordNotFound
                # The record is gone, so remove it from the index instead.
                # Only fall back once: if the destroy path itself fails this
                # way, re-raise rather than retrying forever.
                raise if job.action == 'destroy'
                job.action = 'destroy'
                retry
              end
              job.destroy

              @@writable_db.flush if flush
            end
          rescue => detail
            # print any error, and carry on so other things are indexed
            # XXX If item is later deleted, this should give up, and it
            # won't. It will keep trying (assuming update_index called from
            # regular cron job) and mayhap cause trouble.
            STDERR.puts("#{detail.backtrace.join("\n")}\nFAILED ActsAsXapian::WriteableIndex.update_index job #{id} #{$!}")
          end
        end
      end

      # You must specify *all* the models here, this totally rebuilds the Xapian database.
      # You'll want any readers to reopen the database after this.
      #
      # The new database is built next to the live one with a ".new" suffix
      # and then renamed into place. Jobs queued up to the start of the
      # rebuild are deleted afterwards.
      #
      # Raises a RuntimeError when a writable database is already open in this
      # process, or when a leftover directory in the way does not look like a
      # Xapian database.
      def rebuild_index(model_classes, verbose = false)
        raise "when rebuilding all, please call as first and only thing done in process / task" unless @@writable_db.nil?

        prepare_environment

        # Delete any existing .new database, and open a new one
        new_path = "#{self.db_path}.new"
        remove_xapian_database(new_path, "found existing #{new_path} which is not Xapian flint database, please delete for me")
        self.writable_init(".new")

        # Index everything

        # Remember the newest job now: anything queued later happened during
        # the rebuild and must still be applied afterwards
        most_recent_job = ActsAsXapianJob.find(:first, :order => 'id DESC')
        batch_size = 1000
        model_classes.each do |model_class|
          all_ids = model_class.find(:all, :select => model_class.primary_key, :order => model_class.primary_key).map {|i| i.id }
          all_ids.each_slice(batch_size) do |ids|
            puts "ActsAsXapian::WriteableIndex: New batch. Including ids #{ids.first} to #{ids.last}" if verbose
            models = model_class.find(:all, :conditions => {model_class.primary_key => ids})
            models.each do |model|
              puts "ActsAsXapian::WriteableIndex.rebuild_index #{model_class} #{model.id}" if verbose
              model.xapian_index
            end
          end
        end

        @@writable_db.flush

        # Rename into place
        old_path = self.db_path
        temp_path = "#{old_path}.tmp"
        remove_xapian_database(temp_path, "temporary database found #{temp_path} which is not Xapian flint database, please delete for me")
        FileUtils.mv(old_path, temp_path) if File.exist?(old_path)
        FileUtils.mv(new_path, old_path)

        # Delete old database
        remove_xapian_database(temp_path, "old database now at #{temp_path} is not Xapian flint database, please delete for me")

        ActsAsXapianJob.delete_all ['id <= ?', most_recent_job.id] if most_recent_job

        # You'll want to restart your FastCGI or Mongrel processes after this,
        # so they get the new db
      end

      private

      # True when +path+ contains the marker file of any known Xapian backend
      def xapian_database_dir?(path)
        XAPIAN_BACKEND_MARKERS.any? {|marker| File.exist?(File.join(path, marker)) }
      end

      # Recursively deletes +path+ if it exists, raising +complaint+ instead
      # when it does not look like a Xapian database.
      def remove_xapian_database(path, complaint)
        return unless File.exist?(path)
        raise complaint unless xapian_database_dir?(path)
        FileUtils.rm_r(path)
      end
    end
  end
end
|
@@ -0,0 +1,5 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'acts_as_xapian', 'base' )
|
2
|
+
require File.join(File.dirname(__FILE__), 'acts_as_xapian', 'query_base' )
|
3
|
+
require File.join(File.dirname(__FILE__), 'acts_as_xapian', 'search' )
|
4
|
+
require File.join(File.dirname(__FILE__), 'acts_as_xapian', 'similar' )
|
5
|
+
require File.join(File.dirname(__FILE__), 'acts_as_xapian', 'core_ext/array' )
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|