acts_as_xapian 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +148 -0
- data/Rakefile +46 -0
- data/VERSION +1 -0
- data/acts_as_xapian.gemspec +70 -0
- data/generators/acts_as_xapian/USAGE +1 -0
- data/generators/acts_as_xapian/acts_as_xapian_generator.rb +14 -0
- data/generators/acts_as_xapian/templates/migrations/migration.rb +14 -0
- data/generators/acts_as_xapian/templates/tasks/xapian.rake +42 -0
- data/lib/acts_as_xapian/base.rb +215 -0
- data/lib/acts_as_xapian/core_ext/array.rb +24 -0
- data/lib/acts_as_xapian/index.rb +45 -0
- data/lib/acts_as_xapian/query_base.rb +159 -0
- data/lib/acts_as_xapian/readable_index.rb +117 -0
- data/lib/acts_as_xapian/search.rb +67 -0
- data/lib/acts_as_xapian/similar.rb +61 -0
- data/lib/acts_as_xapian/writeable_index.rb +152 -0
- data/lib/acts_as_xapian.rb +5 -0
- data/spec/acts_as_xapian_spec.rb +7 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- metadata +111 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'fileutils'

module ActsAsXapian
  # State shared by every Xapian index in the process: the database path, the
  # environment's section of config/xapian.yml, the stemmer, and the list of
  # [class, options] pairs registered through +init+.
  #
  # The values are kept in class variables on purpose: the subclasses
  # (ReadableIndex, WriteableIndex) read @@db_path, @@stemmer and
  # @@init_values directly, so they must all see the same storage.
  class Index
    @@db_path = nil
    @@init_values = []

    cattr_reader :config, :db_path, :stemmer

    class << self
      ######################################################################
      # Initialisation

      # Remembers a model class and the options it was declared with, for use
      # later when a database is opened. Calls without a classname are ignored.
      def init(classname = nil, options = nil)
        @@init_values.push([classname, options]) unless classname.nil?
      end

      # Reads the config file (if any), makes sure the parent directory for
      # the databases exists, and builds the stemmer. Does nothing once the
      # database path has been set.
      #
      # Raises RuntimeError when the Rails environment cannot be determined.
      def prepare_environment
        return unless @@db_path.nil?

        # defined? guard: without it a missing RAILS_ENV constant raises
        # NameError and the explanatory message below is never shown.
        environment = ENV['RAILS_ENV'] || (defined?(RAILS_ENV) ? RAILS_ENV : nil)
        raise "Set RAILS_ENV, so acts_as_xapian can find the right Xapian database" unless environment

        @@config = load_config(File.join(RAILS_ROOT, 'config', 'xapian.yml'), environment)

        # figure out where the DBs should go
        db_parent_path = if @@config['base_db_path']
          File.join(RAILS_ROOT, @@config['base_db_path'])
        else
          File.join(RAILS_ROOT, 'db', 'xapiandbs')
        end

        # mkdir_p copes with nested base_db_path values and with the
        # directory already existing.
        FileUtils.mkdir_p(db_parent_path)

        # make some things that don't depend on the db
        @@stemmer = Xapian::Stem.new('english')

        # Assigned last: @@db_path doubles as the "already prepared" flag, so
        # a failure above leaves it nil and the next call tries again.
        @@db_path = File.join(db_parent_path, environment)
      end

      private

      # Returns the given environment's section of the YAML config file, or an
      # empty hash when the file is missing, is not a mapping, or has no
      # section for that environment.
      def load_config(config_file, environment)
        return {} unless File.exist?(config_file)

        settings = YAML.load_file(config_file)
        (settings.is_a?(Hash) && settings[environment]) || {}
      end
    end
  end
end
|
@@ -0,0 +1,159 @@
|
|
1
|
+
module ActsAsXapian
  # Base class for Search and Similar.
  #
  # Subclasses are expected to call initialize_db, assign +query+, and then
  # call initialize_query; they must also provide +log_description+, which
  # +results+ uses when logging.
  class QueryBase
    attr_accessor :offset, :limit, :query, :query_models, :runtime, :cached_results
    @@unlimited = 1000000

    # Return a description of the query
    def description
      self.query.description
    end

    # Returns the mset for the query, running it on first use or when
    # +reload+ is true.
    #
    # An IOError whose message mentions DatabaseModifiedError is retried once
    # per call after the index has been reopened; any other error, or a
    # second failure, is re-raised.
    def matches(reload = false)
      return @matches unless @matches.nil? || reload

      # Local rather than an instance variable, so every call gets its own
      # single retry instead of one retry for the lifetime of the object.
      retried = false
      begin
        self.runtime += Benchmark.realtime do
          # If using find_options conditions have Xapian return the entire match set
          # TODO Revisit. This is extremely inefficient for large indices
          first, maximum = @postpone_limit ? [0, @@unlimited] : [@offset, @limit]
          @matches = @index.enquire.mset(first, maximum, @check_at_least)
        end
        @matches
      rescue IOError => e
        raise if retried || e.message.to_s !~ /DatabaseModifiedError/

        retried = true
        @index.reset_enquire!
        initialize_enquire
        retry
      end
    end

    # Estimate total number of results
    # Note: Unreliable if using find_options with conditions or joins
    def matches_estimated
      @matches_estimated || self.matches.matches_estimated
    end

    # Return query string with spelling correction, or nil when the query
    # parser has no correction to offer
    def spelling_correction
      correction = @index.query_parser.get_corrected_query_string
      correction.empty? ? nil : correction
    end

    # Return an array of hashes, one per match, with keys :data, :percent,
    # :weight, :collapse_count and :model (the record looked up through the
    # model class, or nil when the lookup did not return it).
    # The array is cached after the first call.
    def results
      # If they've already pulled out the results, just return them.
      return self.cached_results unless self.cached_results.nil?

      docs = nil
      self.runtime += Benchmark.realtime do
        # Pull out all the results
        docs = self.matches.matches.map do |match|
          {:data => match.document.data, :percent => match.percent, :weight => match.weight, :collapse_count => match.collapse_count}
        end
      end

      # Log time taken, excluding database lookups below which will be displayed separately by ActiveRecord
      ActiveRecord::Base.logger.debug(" Xapian query (%.5fs) #{self.log_description.gsub('%','%%')}" % self.runtime) if ActiveRecord::Base.logger

      ids_by_model = group_ids_by_model(docs)

      if @postpone_limit
        # Xapian returned the whole match set; filter it through the find
        # options first and only then apply offset and limit.
        docs = restrict_to_find_options(docs, ids_by_model)
        ids_by_model = group_ids_by_model(docs)
      end

      # for each class, look up the associated ids
      records_by_model = ids_by_model.inject({}) do |out, (class_name, ids)|
        model = class_name.constantize # constantize is expensive do once
        records = model.with_xapian_scope(ids) { model.find(:all, @find_options) }
        out[class_name] = records.inject({}) {|by_id, record| by_id[record.id] = record; by_id }
        out
      end

      # add the model to each doc
      docs.each do |doc|
        model_name, id = split_document_term(doc[:data])
        doc[:model] = records_by_model[model_name][id.to_i]
      end

      self.cached_results = docs
    end

    protected

    # Resets the runtime counter and fetches the readable index for +models+.
    # Raises RuntimeError if no index is returned.
    def initialize_db(models)
      self.runtime = 0.0

      @index = ReadableIndex.index_for(models)

      raise "ActsAsXapian::ReadableIndex not initialized" if @index.nil?
    end

    # Set self.query before calling this.
    #
    # Recognised options: :offset, :limit, :check_at_least (default 100),
    # :sort_by_prefix, :sort_by_ascending (default true),
    # :collapse_by_prefix and :find_options.
    def initialize_query(options)
      self.runtime += Benchmark.realtime do
        @offset = options[:offset].to_i
        @limit = (options[:limit] || @@unlimited).to_i
        @check_at_least = (options[:check_at_least] || 100).to_i
        @sort_by_prefix = options[:sort_by_prefix]
        @sort_by_ascending = options[:sort_by_ascending].nil? ? true : options[:sort_by_ascending]
        @collapse_by_prefix = options[:collapse_by_prefix]
        @find_options = options[:find_options]
        # Offset/limit can only be applied by Xapian when the find options
        # carry neither conditions nor joins.
        @postpone_limit = !(@find_options.blank? || (@find_options[:conditions].blank? && @find_options[:joins].blank?))

        self.cached_results = nil
      end

      initialize_enquire
    end

    # Pushes the query, sort order and collapse key onto the index's enquire
    # object. Raises RuntimeError for an unknown sort or collapse prefix.
    def initialize_enquire
      self.runtime += Benchmark.realtime do
        @index.enquire.query = self.query

        if @sort_by_prefix.nil?
          @index.enquire.sort_by_relevance!
        else
          value = @index.values_by_prefix[@sort_by_prefix]
          raise "couldn't find prefix '#{@sort_by_prefix}'" if value.nil?
          # Xapian has inverted the meaning of ascending order to handle relevance sorting
          # "keys which sort higher by string compare are better"
          @index.enquire.sort_by_value_then_relevance!(value, !@sort_by_ascending)
        end

        if @collapse_by_prefix.nil?
          @index.enquire.collapse_key = Xapian.BAD_VALUENO
        else
          value = @index.values_by_prefix[@collapse_by_prefix]
          raise "couldn't find prefix '#{@collapse_by_prefix}'" if value.nil?
          @index.enquire.collapse_key = value
        end
      end
      true
    end

    # Splits a document term of the form "ModelName-id" into its two parts.
    # Only the first hyphen separates, so an id containing "-" stays whole.
    def split_document_term(term)
      term.split('-', 2)
    end

    # Groups the ids found in +docs+ by model name:
    # {"ModelName" => ["1", "2", ...]}
    def group_ids_by_model(docs)
      docs.inject({}) do |grouped, doc|
        model_name, id = split_document_term(doc[:data])
        (grouped[model_name] ||= []) << id
        grouped
      end
    end

    # Drops from +docs+ every document whose record is not returned by a find
    # with @find_options, records the number of survivors in
    # @matches_estimated, and returns the slice selected by @offset/@limit.
    def restrict_to_find_options(docs, ids_by_model)
      surviving_terms = ids_by_model.map do |(class_name, ids)|
        model = class_name.constantize # constantize is expensive do once
        key_only = @find_options.merge(:select => "#{model.table_name}.#{model.primary_key}")
        model.with_xapian_scope(ids) { model.find(:all, key_only) }.map {|m| m.xapian_document_term }
      end.flatten

      page = nil
      self.runtime += Benchmark.realtime do
        # hash key searching is MUCH faster than an array sequential scan
        surviving = surviving_terms.inject({}) {|seen, term| seen[term] = true; seen }
        docs.delete_if {|doc| !surviving.delete(doc[:data]) }

        @matches_estimated = docs.size

        page = docs[@offset, @limit] || []
      end
      page
    end
  end
end
|
@@ -0,0 +1,117 @@
|
|
1
|
+
module ActsAsXapian
  # A read-only handle on the Xapian database together with a query parser
  # configured for one particular set of model classes. Instances are cached
  # per model set (see index_for).
  class ReadableIndex < Index
    DATABASE_NOT_OPENED = "Xapian database not opened; have you built it with rake xapian:rebuild_index ?".freeze

    @@available_indicies = {}

    attr_reader :enquire, :query_parser, :values_by_prefix

    # Takes an array of model classes and returns an index object to be
    # used for searching across the given models
    #
    # Prevents query parser interaction across multiple models unless
    # performing a multi model search
    #
    # An index that is already cached is handed back after reset_enquire!.
    def self.index_for(models)
      index_key = models.map {|m| m.to_s }.sort.join('---')
      index = @@available_indicies[index_key]
      if index
        index.reset_enquire!
      else
        index = @@available_indicies[index_key] = new(models)
      end
      index
    end

    # Opens the db for reading and builds the query parser.
    #
    # Raises NoXapianRubyBindingsError when the bindings are unavailable and
    # RuntimeError when no model has registered itself or the database
    # cannot be opened.
    def initialize(models)
      raise NoXapianRubyBindingsError, "Xapian Ruby bindings not installed" unless ActsAsXapian.bindings_available
      raise "acts_as_xapian hasn't been called in any models" if @@init_values.empty?

      self.class.prepare_environment

      # basic Xapian objects
      begin
        @db = Xapian::Database.new(@@db_path)
        @enquire = Xapian::Enquire.new(@db)
      rescue IOError
        raise DATABASE_NOT_OPENED
      end

      init_query_parser(models)
    end

    # Creates a new search session
    def reset_enquire!
      @db.reopen # This grabs the latest db updates
      @enquire = Xapian::Enquire.new(@db)
    rescue IOError
      raise DATABASE_NOT_OPENED
    end

    protected

    # Make a new query parser and tell it about the term prefixes and value
    # slots declared by every model in +models+.
    def init_query_parser(models)
      # for queries
      @query_parser = Xapian::QueryParser.new
      @query_parser.stemmer = @@stemmer
      @query_parser.stemming_strategy = Xapian::QueryParser::STEM_SOME
      @query_parser.database = @db
      @query_parser.default_op = Xapian::Query::OP_AND

      @terms_by_capital = {}
      @values_by_number = {}
      @values_by_prefix = {}
      @value_ranges_store = []

      # The model/id prefixes are the same for every model, so register them
      # once instead of once per model.
      @query_parser.add_boolean_prefix("model", "M")
      @query_parser.add_boolean_prefix("modelid", "I")

      # go through the various field types, and tell query parser about them,
      # and error check them - i.e. check for consistency between models
      models.each do |klass|
        options = klass.xapian_options
        register_terms(options[:terms] || [])
        register_values(options[:values] || [])
      end

      @values_by_prefix.freeze # This can be read outside the instance. Make sure it can't be changed there
    end

    # Validates each term declaration (element 1 is the term code, element 2
    # the query prefix) and registers the prefix with the query parser.
    def register_terms(terms)
      terms.each do |term|
        code, prefix = term[1], term[2]
        # \A and \z anchor the whole string; ^ and $ would accept a code that
        # merely contains a matching line.
        raise "Use up to 3 single capital letters for term code" unless code.match(/\A[A-Z]{1,3}\z/)
        raise "M and I are reserved for use as the model/id term" if code == "M" || code == "I"
        raise "model and modelid are reserved for use as the model/id prefixes" if prefix == "model" || prefix == "modelid"
        raise "Z is reserved for stemming terms" if code == "Z"
        raise "Already have code '#{code}' in another model but with different prefix '#{@terms_by_capital[code]}'" if @terms_by_capital.key?(code) && @terms_by_capital[code] != prefix
        @terms_by_capital[code] = prefix
        @query_parser.add_prefix(prefix, code)
      end
    end

    # Validates each value declaration (element 1 is the value index,
    # element 2 the prefix, element 3 the type) and adds a range processor
    # the first time an index is seen. :number values are handled first.
    def register_values(values)
      numeric, other = values.partition {|value| value[3] == :number }
      (numeric + other).each do |value|
        slot, prefix, type = value[1], value[2], value[3]
        # Integer instead of Fixnum: the Fixnum constant no longer exists in
        # current Ruby versions.
        raise "Value index '#{slot}' must be an integer, is #{slot.class}" unless slot.is_a?(Integer)
        raise "Already have value index '#{slot}' in another model but with different prefix '#{@values_by_number[slot]}'" if @values_by_number.key?(slot) && @values_by_number[slot] != prefix
        raise "Already have value prefix '#{prefix}' in another model but with different index '#{@values_by_prefix[prefix]}'" if type == :number && @values_by_prefix.key?(prefix) && @values_by_prefix[prefix] != slot

        # Only the first model that uses a value index gets to create its
        # range processor.
        unless @values_by_number.key?(slot)
          value_range = build_value_range_processor(slot, prefix, type)

          @query_parser.add_valuerangeprocessor(value_range)

          # stop it being garbage collected, as
          # add_valuerangeprocessor ref is outside Ruby's GC
          @value_ranges_store.push(value_range)
        end

        @values_by_number[slot] = prefix
        @values_by_prefix[prefix] = slot
      end
    end

    # Returns the Xapian value range processor matching +type+ for the given
    # value index. Raises RuntimeError for an unknown type.
    def build_value_range_processor(slot, prefix, type)
      case type
      when :date
        Xapian::DateValueRangeProcessor.new(slot)
      when :string
        Xapian::StringValueRangeProcessor.new(slot)
      when :number
        Xapian::NumberValueRangeProcessor.new(slot, "#{prefix}:", true)
      else
        raise "Unknown value type '#{type}'"
      end
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module ActsAsXapian
  # Runs a user-supplied query string against one or more models. The result
  # (see QueryBase#results) is an array of hashes in result order; each hash
  # holds the Rails object under :model plus relevancy details under the
  # other keys.
  class Search < QueryBase
    attr_accessor :query_string

    @@parse_query_flags = Xapian::QueryParser::FLAG_BOOLEAN | Xapian::QueryParser::FLAG_PHRASE |
      Xapian::QueryParser::FLAG_LOVEHATE | Xapian::QueryParser::FLAG_WILDCARD |
      Xapian::QueryParser::FLAG_SPELLING_CORRECTION

    # Passing the model classes is not just a convenience: it guarantees the
    # classes have been loaded, and therefore that acts_as_xapian has run in
    # them, so the query parser knows their fields.
    #
    # model_classes - model classes to search within, e.g. [PublicBody,
    #   User]. A single class is accepted, and classes may be given by name
    #   as strings.
    # query_string - user-entered query string, with syntax much like Google Search
    #
    # options include
    # - :limit - limit the number of records returned
    # - :offset - start with this record number
    # - :check_at_least - used for total match estimates. Set higher for greater accuracy at the cost of slower queries. default: 100
    # - :sort_by_prefix - determines which data field to sort by. default: sort by relevance
    # - :sort_by_ascending - determines which direction to sort. default: true (ascending sort)
    # - :collapse_by_prefix - groups the return set by this prefix
    # - :find_options - These options are passed through to the active record find. Be careful if searching against multiple model classes.
    def initialize(model_classes, query_string, options = {})
      # Normalise the argument into an array of real classes
      classes = Array(model_classes).map do |candidate|
        resolved = candidate.instance_of?(String) ? candidate.constantize : candidate
        raise "pass in the model class itself, or a string containing its name" unless resolved.instance_of?(Class)
        resolved
      end

      initialize_db(classes)

      self.query_string = query_string

      # Only documents belonging to one of the requested models ...
      model_terms = classes.map {|klass| "M#{klass}" }
      within_models = Xapian::Query.new(Xapian::Query::OP_OR, model_terms)
      # ... that also match what the user typed
      parsed = @index.query_parser.parse_query(self.query_string, @@parse_query_flags)
      self.query = Xapian::Query.new(Xapian::Query::OP_AND, within_models, parsed)

      # Hand over to the shared QueryBase setup
      initialize_query(options)
    end

    # The plain words of the query: no boolean operators and nothing that
    # contains a ":", "." or "/" (prefixed terms, ranges and the like).
    # Intended for cheap highlighting with TextHelper::highlight and excerpt.
    def words_to_highlight
      cleaned = self.query_string.gsub(/[^\w:\.\/_]/i, " ").gsub(/\s+/, " ")
      cleaned.split(" ").select {|word| !word.match(/(:|\.|\/)|^(AND|NOT|OR|XOR)$/) }
    end

    # Text for lines in log file
    def log_description
      "Search: #{self.query_string}"
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
module ActsAsXapian
  # Search for models which contain the important terms taken from a
  # specified list of models. i.e. Use to find documents similar to one (or
  # more) documents, or use to refine searches.
  class Similar < QueryBase
    attr_accessor :query_models
    attr_accessor :important_terms

    # model_classes - model classes to search within, e.g. [PublicBody, User].
    #   As with Search, a single class is accepted and classes may be given
    #   by name as strings.
    # query_models - list of models you want to find things similar to
    # options - passed on to QueryBase#initialize_query
    def initialize(model_classes, query_models, options = {})
      model_classes = Array(model_classes).map do |model_class|
        model_class.instance_of?(String) ? model_class.constantize : model_class
      end

      self.initialize_db(model_classes)

      self.runtime += Benchmark.realtime do
        # Case of an array, searching for models similar to those models in the array
        self.query_models = query_models

        # Find the documents by their unique term
        input_models_query = Xapian::Query.new(Xapian::Query::OP_OR, query_models.map {|m| "I#{m.xapian_document_term}" })
        self.important_terms = important_terms_for(input_models_query)

        similar_query = Xapian::Query.new(Xapian::Query::OP_OR, self.important_terms)
        # Exclude original
        combined_query = Xapian::Query.new(Xapian::Query::OP_AND_NOT, similar_query, input_models_query)

        # Restrain to model classes
        model_query = Xapian::Query.new(Xapian::Query::OP_OR, model_classes.map {|mc| "M#{mc}" })
        self.query = Xapian::Query.new(Xapian::Query::OP_AND, model_query, combined_query)
      end

      # Call base class constructor
      self.initialize_query(options)
    end

    # Text for lines in log file
    def log_description
      "Similar: #{self.query_models}"
    end

    protected

    # Runs +input_models_query+ and returns the names of the terms Xapian
    # considers most relevant to the documents it matched.
    #
    # An IOError mentioning DatabaseModifiedError is retried once after the
    # index has been reopened. The retry flag is a local, so a retry here
    # does not use up the retry that QueryBase#matches allows itself later.
    def important_terms_for(input_models_query)
      retried = false
      begin
        @index.enquire.query = input_models_query

        # Get set of relevant terms for those documents
        selection = Xapian::RSet.new
        @index.enquire.mset(0, 100, 100).matches.each {|m| selection.add_document(m.docid) } # XXX so this whole method will only work with 100 docs

        # Bit weird that the function to make esets is part of the enquire
        # object. This explains what exactly it does, which is to exclude
        # terms in the existing query.
        # http://thread.gmane.org/gmane.comp.search.xapian.general/3673/focus=3681
        @index.enquire.eset(40, selection).terms.map {|e| e.name }
      rescue IOError => e
        raise if retried || e.message.to_s !~ /DatabaseModifiedError/

        retried = true
        @index.reset_enquire!
        retry
      end
    end
  end
end
|
@@ -0,0 +1,152 @@
|
|
1
|
+
module ActsAsXapian
  # Class-level access to the single writable Xapian database of the
  # process, plus the offline jobs that keep it in step with the models:
  # update_index (works through the ActsAsXapianJob queue) and rebuild_index
  # (recreates the whole database).
  class WriteableIndex < Index
    # Marker files used to recognise a directory as a Xapian database before
    # it is deleted, one per on-disk backend (flint, chert, glass, brass).
    XAPIAN_BACKEND_MARKERS = %w(iamflint iamchert iamglass iambrass).freeze

    @@writable_db = nil
    @@writable_suffix = nil

    cattr_reader :term_generator

    class << self
      # Forwards to delete_document on the writable database.
      def delete_document(*args)
        @@writable_db.delete_document(*args)
      end

      # Forwards to replace_document on the writable database.
      def replace_document(*args)
        @@writable_db.replace_document(*args)
      end

      # Opens (creating it if necessary) the writable database at the
      # configured path plus +suffix+ and sets up the term generator.
      # Returns without doing anything when a database is already open.
      #
      # Raises NoXapianRubyBindingsError when the bindings are unavailable
      # and RuntimeError when no model has registered itself.
      def writable_init(suffix = "")
        raise NoXapianRubyBindingsError, "Xapian Ruby bindings not installed" unless ActsAsXapian.bindings_available
        raise "acts_as_xapian hasn't been called in any models" if @@init_values.empty?

        # if DB is not nil, then we're already initialised, so don't do it again
        return unless @@writable_db.nil?

        prepare_environment

        new_path = @@db_path + suffix
        # Because of the early return above, this check is only reached
        # while no database is open.
        raise "writable_suffix/suffix inconsistency" if @@writable_suffix && @@writable_suffix != suffix

        # for indexing
        @@writable_db = Xapian::WritableDatabase.new(new_path, Xapian::DB_CREATE_OR_OPEN)
        @@term_generator = Xapian::TermGenerator.new
        @@term_generator.set_flags(Xapian::TermGenerator::FLAG_SPELLING, 0)
        @@term_generator.database = @@writable_db
        @@term_generator.stemmer = @@stemmer
        @@writable_suffix = suffix
      end

      ######################################################################
      # Index

      # Update index with any changes needed, call this offline. Only call it
      # from a script that exits - otherwise Xapian's writable database won't
      # flush your changes. Specifying flush will reduce performance, but
      # make sure that each index update is definitely saved to disk before
      # logging in the database that it has been.
      #
      # A failing job is reported on STDERR and left in the queue; the
      # remaining jobs are still processed.
      def update_index(flush = false, verbose = false)
        # Before calling writable_init we have to make sure every model class has been initialized.
        # i.e. has had its class code loaded, so acts_as_xapian has been called inside it, and
        # we have the info from acts_as_xapian.
        model_classes = ActsAsXapianJob.find(:all, :select => 'model', :group => 'model').map {|a| a.model.constantize }
        # If there are no models in the queue, then nothing to do
        return if model_classes.empty?

        self.writable_init

        ids_to_refresh = ActsAsXapianJob.find(:all, :select => 'id').map {|i| i.id }
        ids_to_refresh.each do |id|
          begin
            ActsAsXapianJob.transaction do
              job = ActsAsXapianJob.find(id, :lock => true)
              puts "ActsAsXapian::WriteableIndex.update_index #{job.action} #{job.model} #{job.model_id.to_s}" if verbose
              run_job(job)
              job.destroy

              @@writable_db.flush if flush
            end
          rescue => detail
            # print any error, and carry on so other things are indexed
            # XXX If item is later deleted, this should give up, and it
            # won't. It will keep trying (assuming update_index called from
            # regular cron job) and mayhap cause trouble.
            STDERR.puts("#{detail.backtrace.join("\n")}\nFAILED ActsAsXapian::WriteableIndex.update_index job #{id} #{detail}")
          end
        end
      end

      # You must specify *all* the models here, this totally rebuilds the Xapian database.
      # You'll want any readers to reopen the database after this.
      #
      # The new database is built next to the live one (suffix ".new") and
      # renamed into place; queue entries that existed when the rebuild
      # started are deleted afterwards.
      def rebuild_index(model_classes, verbose = false)
        raise "when rebuilding all, please call as first and only thing done in process / task" unless @@writable_db.nil?

        prepare_environment

        # Delete any existing .new database, and open a new one
        new_path = "#{self.db_path}.new"
        remove_database_directory(new_path, "found existing #{new_path} which is not a Xapian database, please delete for me")
        self.writable_init(".new")

        # Index everything

        most_recent_job = ActsAsXapianJob.find(:first, :order => 'id DESC')
        batch_size = 1000
        model_classes.each do |model_class|
          all_ids = model_class.find(:all, :select => model_class.primary_key, :order => model_class.primary_key).map {|i| i.id }
          all_ids.each_slice(batch_size) do |ids|
            puts "ActsAsXapian::WriteableIndex: New batch. Including ids #{ids.first} to #{ids.last}" if verbose
            models = model_class.find(:all, :conditions => {model_class.primary_key => ids})
            models.each do |model|
              puts "ActsAsXapian::WriteableIndex.rebuild_index #{model_class} #{model.id}" if verbose
              model.xapian_index
            end
          end
        end

        @@writable_db.flush

        # Rename into place
        old_path = self.db_path
        temp_path = "#{old_path}.tmp"
        remove_database_directory(temp_path, "temporary database found #{temp_path} which is not a Xapian database, please delete for me")
        FileUtils.mv(old_path, temp_path) if File.exist?(old_path)
        FileUtils.mv(new_path, old_path)

        # Delete old database
        remove_database_directory(temp_path, "old database now at #{temp_path} is not a Xapian database, please delete for me")

        ActsAsXapianJob.delete_all ['id <= ?', most_recent_job.id] if most_recent_job

        # You'll want to restart your FastCGI or Mongrel processes after this,
        # so they get the new db
      end

      private

      # Applies one queued job to the index. An 'update' whose record no
      # longer exists is turned into a 'destroy' and tried again; a
      # RecordNotFound raised while already destroying is passed on, so the
      # retry cannot loop forever.
      def run_job(job)
        case job.action
        when 'update'
          # XXX Index functions may reference other models, so we could eager load here too?
          model = job.model.constantize.find(job.model_id) # :include => cls.constantize.xapian_options[:include]
          model.xapian_index
        when 'destroy'
          # Make dummy model with right id, just for destruction
          model = job.model.constantize.new
          model.id = job.model_id
          model.xapian_destroy
        else
          raise "unknown ActsAsXapianJob action '#{job.action}'"
        end
      rescue ActiveRecord::RecordNotFound
        raise if job.action == 'destroy'

        job.action = 'destroy'
        retry
      end

      # True when +path+ contains the marker file of a known Xapian backend.
      def xapian_database?(path)
        XAPIAN_BACKEND_MARKERS.any? {|marker| File.exist?(File.join(path, marker)) }
      end

      # Removes the directory at +path+ if it exists. Raises RuntimeError
      # with +complaint+ instead when it does not look like a Xapian
      # database, so that nothing unrelated is ever deleted.
      def remove_database_directory(path, complaint)
        return unless File.exist?(path)
        raise complaint unless xapian_database?(path)

        FileUtils.rm_r(path)
      end
    end
  end
end
|
@@ -0,0 +1,5 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'acts_as_xapian', 'base' )
|
2
|
+
require File.join(File.dirname(__FILE__), 'acts_as_xapian', 'query_base' )
|
3
|
+
require File.join(File.dirname(__FILE__), 'acts_as_xapian', 'search' )
|
4
|
+
require File.join(File.dirname(__FILE__), 'acts_as_xapian', 'similar' )
|
5
|
+
require File.join(File.dirname(__FILE__), 'acts_as_xapian', 'core_ext/array' )
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|