ultrasphinx 1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
1
+
2
+
3
+ module Ultrasphinx
4
+
5
+ =begin rdoc
6
+ == Spelling support
7
+
8
+ In order to spellcheck your users' queries, Ultrasphinx bundles a small spelling module. First, make sure Aspell 0.6, an appropriate Aspell dictionary, and the Rubygem 'raspell' are all installed.
9
+
10
+ Then, copy <tt>examples/app.multi</tt> into your Aspell dictionary folder. It allows you to use Sphinx to generate a custom wordlist for your app. Modify it if you don't want to also use the default American English dictionary.
11
+
12
+ Then, to build the custom wordlist, run:
13
+ rake ultrasphinx:spelling:build
14
+
15
+ Now you can check whether a query is correctly spelled like so:
16
+ @correction = Ultrasphinx::Spell.correct(@search.query)
17
+
18
+ If @correction is not nil, go ahead and suggest it to the user. Otherwise, the query was already correct.
19
+
20
+ =end
21
+
22
+ module Spell
23
+ SP = Aspell.new("app") # one speller shared by every call, built at load time with the "app" argument
24
+ SP.suggestion_mode = Aspell::NORMAL # select Aspell's NORMAL suggestion mode
25
+ SP.set_option("ignore-case", "true") # turn on Aspell's "ignore-case" option (presumably makes checks case-insensitive)
26
+
27
# Spellchecks every word of +string+ and returns the corrected string, or nil
# when no word had to be changed.
#
# A word is a run of word characters and apostrophes. A word the speller
# rejects is replaced by the speller's first suggestion; when the speller has
# no suggestion at all the word is kept as typed instead of being removed
# from the query.
def self.correct string
  correction = string.gsub(/[\w\']+/) do |word|
    if SP.check(word)
      word
    else
      # suggest can come back empty; a nil block result would erase the word
      SP.suggest(word).first || word
    end
  end

  correction if correction != string
end
38
+
39
+ end
40
+ end
41
+
@@ -0,0 +1,276 @@
1
+
2
+ module Ultrasphinx
3
+
4
+ class Exception < ::Exception #:nodoc:
5
+ end # root of the plugin's exception classes; it derives from ::Exception, so a bare rescue does not catch it
6
+ class ConfigurationError < Exception #:nodoc:
7
+ end # raised below for a missing base file, missing config heading, unsupported adapter or unknown association
8
+ class DaemonError < Exception #:nodoc:
9
+ end # raised by the daemon rake tasks when the daemon is already running or not running
10
+
11
+ # internal file paths
12
+
13
+ SUBDIR = "config/ultrasphinx" # configuration directory, relative to the application root
14
+
15
+ DIR = "#{RAILS_ROOT}/#{SUBDIR}" # the same directory as an absolute path
16
+
17
+ CONF_PATH = "#{DIR}/#{RAILS_ENV}.conf" # per-environment conf file written by configure
18
+
19
+ ENV_BASE_PATH = "#{DIR}/#{RAILS_ENV}.base" # environment-specific base file
20
+
21
+ GENERIC_BASE_PATH = "#{DIR}/default.base" # fallback base file used for any environment
22
+
23
+ BASE_PATH = (File.exist?(ENV_BASE_PATH) ? ENV_BASE_PATH : GENERIC_BASE_PATH) # prefer the environment-specific base file when it exists
24
+
25
+ raise ConfigurationError, "Please create a '#{SUBDIR}/#{RAILS_ENV}.base' or '#{SUBDIR}/default.base' file in order to use Ultrasphinx in your #{RAILS_ENV} environment." unless File.exist? BASE_PATH # XXX lame; runs at load time, so the module cannot load when neither base file exists
26
+
27
+ # some miscellaneous constants
28
+
29
+ MAX_INT = 2**32-1 # largest unsigned 32-bit integer
30
+
31
+ MAX_WORDS = 2**16 # maximum number of stopwords built
32
+
33
+ UNIFIED_INDEX_NAME = "complete" # name of the single index that configure writes and the index task rebuilds
34
+
35
+ COLUMN_TYPES = {:string => 'text', :text => 'text', :integer => 'numeric', :date => 'date', :datetime => 'date' } # column type => field type ('text', 'numeric' or 'date')
36
+
37
+ CONFIG_MAP = {:username => 'sql_user', # connection setting key => Sphinx source option name
38
+ :password => 'sql_pass',
39
+ :host => 'sql_host',
40
+ :database => 'sql_db',
41
+ :port => 'sql_port',
42
+ :socket => 'sql_sock'}
43
+
44
+ OPTIONAL_SPHINX_KEYS = ['morphology', 'stopwords', 'min_word_len', 'charset_type', 'charset_table', 'docinfo'] # plugin settings copied into the index section only when present
45
+
46
+ # some default settings for the sphinx conf files
47
+
48
+ SOURCE_DEFAULTS = %(
49
+ strip_html = 0
50
+ index_html_attrs =
51
+ sql_query_post =
52
+ sql_range_step = 20000
53
+ ) # written verbatim at the top of every generated source section
54
+
55
+ ADAPTER_DEFAULTS = { # keyed by the connection's :adapter value; configure rejects adapters missing from this hash
56
+ "mysql" => %(
57
+ type = mysql
58
+ sql_query_pre = SET SESSION group_concat_max_len = 65535
59
+ sql_query_pre = SET NAMES utf8
60
+ ),
61
+ "postgresql" => %(
62
+ type = pgsql
63
+ )} # each value is written verbatim after SOURCE_DEFAULTS
64
+
65
+
66
+ # Configuration file parser.
67
# Reads the file at +path+, finds the first section whose heading line starts
# with +heading+ and returns the "key = value" lines between its braces as a
# Hash of String => String. Everything from a '#' onwards is dropped from a
# value and the value is stripped; lines without '=' are ignored.
#
# Logs a hint and raises ConfigurationError when no such section exists.
def self.options_for(heading, path)
  # File.read closes the handle again (open(path).read left it to the GC)
  # and, unlike Kernel#open, never treats a "|..." path as a command
  section = File.read(path)[/^#{heading}.*?\{(.*?)\}/m, 1]
  unless section
    Ultrasphinx.say "#{path} appears to be corrupted; please delete file"
    raise ConfigurationError, "Missing heading #{heading.inspect}"
  end

  options = section.split("\n").map do |line|
    line =~ /\s*(.*?)\s*=\s*([^\#]*)/
    # $1 is nil when the line has no '=', which contributes nothing
    $1 ? [$1, $2.strip] : []
  end

  Hash[*options.flatten]
end
82
+
83
+ # introspect on the existing generated conf files
84
+
85
# Options from the "ultrasphinx" section of the base configuration file.
PLUGIN_SETTINGS = options_for('ultrasphinx', BASE_PATH)

# Options from the "searchd" section of the base configuration file.
DAEMON_SETTINGS = options_for('searchd', BASE_PATH)

# Stopword list inside the configured index directory. The filename used to
# end in a stray "}" ("stopwords.txt}").
STOPWORDS_PATH = "#{Ultrasphinx::PLUGIN_SETTINGS['path']}/stopwords.txt"

# Model name => indexing options; configure writes one source per entry.
MODEL_CONFIGURATION = {}
92
+
93
+
94
+
95
+ class << self
96
+
97
+ # Logger.
98
+ def say msg # prints msg to $stderr behind a "** ultrasphinx: " prefix
99
+ $stderr.puts "** ultrasphinx: #{msg}"
100
+ end
101
+
102
+ # Force all the indexed models to load and fill the MODEL_CONFIGURATION hash.
103
+ def load_constants # loads each model file that mentions is_indexed, then configures Fields from MODEL_CONFIGURATION
104
+
105
+ Dir["#{RAILS_ROOT}/app/models/**/*.rb"].each do |filename|
106
+ next if filename =~ /\/(\.svn|CVS|\.bzr)\// # skip files inside version-control directories
107
+ begin
108
+ open(filename) {|file| load filename if file.grep(/is_indexed/).any?} # only load files whose text contains "is_indexed"
109
+ rescue Object => e # catches every raised object, not only StandardError; the failure is logged and the loop continues
110
+ say "warning; possibly critical autoload error on #{filename}"
111
+ say e.inspect
112
+ end
113
+ end
114
+
115
+ # build the field-to-type mappings
116
+ Fields.instance.configure(MODEL_CONFIGURATION)
117
+ end
118
+
119
+ # Complain if the database names go out of sync.
120
+ def verify_database_name # warns when the generated conf names a different database than the ActiveRecord connection
121
+ if File.exist? CONF_PATH # nothing to verify until a conf file has been generated
122
+ if options_for("source", CONF_PATH)['sql_db'] != ActiveRecord::Base.connection.instance_variable_get("@config")[:database] # sql_db of the first "source" section vs. the connection's :database
123
+ say "warning; configured database name is out-of-date"
124
+ say "please run 'rake ultrasphinx:configure'"
125
+ end rescue nil # the rescue modifier swallows any StandardError raised during the comparison
126
+ end
127
+ end
128
+
129
+
130
+ # Main SQL builder.
131
# Loads the indexed models, then regenerates CONF_PATH from scratch: a copy of
# BASE_PATH without its "ultrasphinx" section, one "source" section per entry
# of MODEL_CONFIGURATION, and one unified index that reads every source.
#
# Raises ConfigurationError for an unsupported database adapter and for an
# include or concat whose association cannot be resolved.
def configure
  load_constants

  puts "Rebuilding Ultrasphinx configurations for #{ENV['RAILS_ENV']} environment"
  puts "Available models are #{MODEL_CONFIGURATION.keys.to_sentence}"
  File.open(CONF_PATH, "w") do |conf|
    conf.puts "\n# Auto-generated at #{Time.now}.\n# Hand modifications will be overwritten.\n"

    conf.puts "\n# #{BASE_PATH}"
    # copy the base file minus the plugin-only section; File.read closes the
    # handle, which open(BASE_PATH).read did not
    conf.puts File.read(BASE_PATH).sub(/^ultrasphinx.*?\{.*?\}/m, '') + "\n"

    sphinx_source_list = []

    conf.puts "\n# Source configuration\n\n"

    puts "Generating SQL"
    MODEL_CONFIGURATION.each_with_index do |model_options, class_id|
      model, options = model_options
      klass, source = model.constantize, model.tableize

      # puts "SQL for #{model}"

      sphinx_source_list << source

      conf.puts "source #{source}\n{"
      conf.puts SOURCE_DEFAULTS

      # apparently we're supporting postgres now
      connection_settings = klass.connection.instance_variable_get("@config")

      adapter_defaults = ADAPTER_DEFAULTS[connection_settings[:adapter]]
      raise ConfigurationError, "Unsupported database adapter" unless adapter_defaults
      conf.puts adapter_defaults

      # translate the connection settings Sphinx understands (see CONFIG_MAP)
      connection_settings.each do |key, value|
        conf.puts "#{CONFIG_MAP[key]} = #{value}" if CONFIG_MAP[key]
      end

      table, pkey = klass.table_name, klass.primary_key
      condition_strings, join_strings = Array(options[:conditions]).map{|condition| "(#{condition})"}, []
      # document id = pkey * model count + class_id, so ids from different
      # models never collide; sql_query_info below reverses the formula
      column_strings = ["(#{table}.#{pkey} * #{MODEL_CONFIGURATION.size} + #{class_id}) AS id",
        "#{class_id} AS class_id", "'#{klass.name}' AS class"]
      remaining_columns = Fields.instance.keys - ["class", "class_id"]

      conf.puts "\nsql_query_range = SELECT MIN(#{pkey}), MAX(#{pkey}) FROM #{table}"

      # plain columns of the model's own table
      options[:fields].to_a.each do |f|
        column, as = f.is_a?(Hash) ? [f[:field], f[:as]] : [f, f]
        column_strings << Fields.instance.cast("#{table}.#{column}", as)
        remaining_columns.delete(as)
      end

      # single columns pulled in through a belongs_to/has_one join or explicit SQL
      options[:includes].to_a.each do |join|
        join_klass = join[:model].constantize
        association = klass.reflect_on_association(join[:model].underscore.to_sym)
        if not association
          if not join[:association_sql]
            raise ConfigurationError, "Unknown association from #{klass} to #{join[:model]}"
          else
            join_strings << join[:association_sql]
          end
        else
          join_strings << "LEFT OUTER JOIN #{join_klass.table_name} ON " +
            if (macro = association.macro) == :belongs_to
              "#{join_klass.table_name}.#{join_klass.primary_key} = #{table}.#{association.primary_key_name}"
            elsif macro == :has_one
              "#{table}.#{klass.primary_key} = #{join_klass.table_name}.#{association.instance_variable_get('@foreign_key_name')}"
            else
              raise ConfigurationError, "Unidentified association macro #{macro.inspect}"
            end
        end
        column_strings << "#{join_klass.table_name}.#{join[:field]} AS #{join[:as] or join[:field]}"
        remaining_columns.delete(join[:as] || join[:field])
      end

      # values of an associated model concatenated into one column
      options[:concats].to_a.select{|concat| concat[:model] and concat[:field]}.each do |group|
        # only has_many's or explicit sql right now
        join_klass = group[:model].constantize
        if group[:association_sql]
          join_strings << group[:association_sql]
        else
          association = klass.reflect_on_association(group[:association_name] ? group[:association_name].to_sym : group[:model].underscore.pluralize.to_sym)
          # fail with the same error as the includes branch instead of a
          # NoMethodError on nil
          raise ConfigurationError, "Unknown association from #{klass} to #{group[:model]}" unless association
          join_strings << "LEFT OUTER JOIN #{join_klass.table_name} ON #{table}.#{klass.primary_key} = #{join_klass.table_name}.#{association.primary_key_name}" + (" AND (#{group[:conditions]})" if group[:conditions]).to_s # XXX make sure foreign key is right for polymorphic relationships
        end
        column_strings << Fields.instance.cast("GROUP_CONCAT(#{join_klass.table_name}.#{group[:field]} SEPARATOR ' ')", group[:as])
        remaining_columns.delete(group[:as])
      end

      # several columns of the model's own table concatenated into one
      options[:concats].to_a.select{|concat| concat[:fields]}.each do |concat|
        column_strings << Fields.instance.cast("CONCAT_WS(' ', #{concat[:fields].map{|field| "#{table}.#{field}"}.join(', ')})", concat[:as])
        remaining_columns.delete(concat[:as])
      end

      # puts "#{model} has #{remaining_columns.inspect} remaining"
      # fields this model does not provide still get a placeholder column
      remaining_columns.each do |field|
        column_strings << Fields.instance.null(field)
      end

      query_strings = ["SELECT", column_strings.sort_by do |string|
        # sphinx wants them always in the same order, but "id" must be first
        (field = string[/.*AS (.*)/, 1]) == "id" ? "*" : field
      end.join(", ")]
      query_strings << "FROM #{table}"
      query_strings += join_strings.uniq
      query_strings << "WHERE #{table}.#{pkey} >= $start AND #{table}.#{pkey} <= $end"
      query_strings += condition_strings.uniq.map{|s| "AND #{s}"}
      query_strings << "GROUP BY id"

      conf.puts "sql_query = #{query_strings.join(" ")}"

      groups = []
      # group and date sorting params... this really only would have to be run once
      Fields.instance.each do |field, type|
        case type
          when 'numeric'
            groups << "sql_group_column = #{field}"
          when 'date'
            groups << "sql_date_column = #{field}"
        end
      end
      conf.puts "\n" + groups.sort_by{|s| s[/= (.*)/, 1]}.join("\n")
      conf.puts "\nsql_query_info = SELECT * FROM #{table} WHERE #{table}.#{pkey} = (($id - #{class_id}) / #{MODEL_CONFIGURATION.size})"
      conf.puts "}\n\n"
    end

    conf.puts "\n# Index configuration\n\n"


    # only output the unified index; no one uses the individual ones anyway

    conf.puts "index #{UNIFIED_INDEX_NAME}"
    conf.puts "{"
    conf.puts sphinx_source_list.map {|s| "source = #{s}" }

    OPTIONAL_SPHINX_KEYS.each do |key|
      conf.puts "#{key} = #{PLUGIN_SETTINGS[key]}" if PLUGIN_SETTINGS[key]
    end

    conf.puts "path = #{PLUGIN_SETTINGS["path"]}/sphinx_index_#{UNIFIED_INDEX_NAME}"
    conf.puts "}\n\n"
  end

end
273
+ end
274
+
275
+ end
276
+ end
@@ -0,0 +1,125 @@
1
+
2
+ ENV['RAILS_ENV'] ||= "development" # default to the development environment when RAILS_ENV is not set
3
+
4
+ namespace :ultrasphinx do
5
+
6
# Runs configure, index and the daemon start task in order. The start task is
# defined in the nested :daemon namespace, so it has to be named
# "daemon:start" here; a bare :start does not resolve from this namespace.
desc "Bootstrap a full Sphinx environment"
task :bootstrap => [:environment, :configure, :index, "daemon:start"] do
end
9
+
10
+ desc "Rebuild the configuration file for this particular environment."
11
+ task :configure => :environment do # the :environment prerequisite runs before the body
12
+ Ultrasphinx::configure # rewrites Ultrasphinx::CONF_PATH
13
+ end
14
+
15
+ desc "Reindex the database and send an update signal to the search daemon."
16
+ task :index => :environment do
17
+ cmd = "indexer --config #{Ultrasphinx::CONF_PATH}" # build the indexer command line piece by piece
18
+ cmd << " #{ENV['OPTS']} " if ENV['OPTS'] # extra indexer options supplied through the OPTS environment variable
19
+ cmd << " --rotate" if ultrasphinx_daemon_running? # --rotate is passed only while the daemon is running
20
+ cmd << " #{Ultrasphinx::UNIFIED_INDEX_NAME}" # the index to rebuild
21
+ puts cmd # echo the command before running it
22
+ system cmd
23
+ end
24
+
25
# Tasks that start, stop and inspect the searchd daemon.
namespace :daemon do
  desc "Start the search daemon"
  task :start => :environment do
    raise Ultrasphinx::DaemonError, "Already running" if ultrasphinx_daemon_running?
    # remove lockfiles; the glob needs its own "/" because the other uses of
    # the "path" setting append "/..." to it, i.e. it is a bare directory
    Dir[Ultrasphinx::PLUGIN_SETTINGS["path"] + "/*spl"].each {|file| File.delete(file)}
    system "searchd --config #{Ultrasphinx::CONF_PATH}"
    sleep(2) # give daemon a chance to write the pid file
    if ultrasphinx_daemon_running?
      puts "Started successfully"
    else
      puts "Failed to start"
    end
  end

  desc "Stop the search daemon"
  task :stop => [:environment] do
    raise Ultrasphinx::DaemonError, "Doesn't seem to be running" unless ultrasphinx_daemon_running?
    # the message is printed whether or not kill succeeded
    system "kill #{pid = ultrasphinx_daemon_pid}"
    puts "Stopped #{pid}."
  end

  desc "Restart the search daemon"
  task :restart => [:environment, :stop, :start] do
  end

  desc "Tail queries in the log"
  task :tail => :environment do
    require 'file/tail'
    puts "Tailing #{filename = Ultrasphinx::DAEMON_SETTINGS['query_log']}"
    File.open(filename) do |log|
      log.extend(File::Tail)
      log.interval = 1
      log.backward(10)
      last = nil
      log.tail do |line|
        # print the text after the "[*]" marker, skipping immediate repeats
        current = line[/\[\*\](.*)$/, 1]
        last = current and puts current unless current == last
      end
    end
  end

  desc "Check if the search daemon is running"
  task :status => :environment do
    if ultrasphinx_daemon_running?
      puts "Daemon is running."
    else
      puts "Daemon is stopped."
    end
  end
end
76
+
77
+
78
+
79
# Builds a custom Aspell word list from the words Sphinx finds in the index.
namespace :spelling do
  desc "Rebuild custom spelling dictionary"
  task :build => :environment do
    # have the indexer write a word-frequency list to STOPWORDS_PATH
    ENV['OPTS'] = "--buildstops #{Ultrasphinx::STOPWORDS_PATH} #{Ultrasphinx::MAX_WORDS} --buildfreqs"
    Rake::Task["ultrasphinx:index"].invoke
    tmpfile = "/tmp/custom_words.txt"
    words = []
    puts "Filtering"
    # block form closes the stopwords file once it has been read
    File.open(Ultrasphinx::STOPWORDS_PATH) do |stopwords|
      stopwords.each do |line|
        # keep words of 4+ characters without whitespace, digits or underscores
        if line =~ /^([^\s\d_]{4,}) (\d+)/
          # XXX should be configurable
          words << $1 if $2.to_i > 40
          # ideally we would also skip words within X edit distance of a correction
          # by aspell-en, in order to not add typos to the dictionary
        end
      end
    end
    puts "Writing #{words.size} words"
    # the list must be flushed and closed before aspell reads it; the
    # unclosed File.open(...).write could leave the file empty at that point
    File.open(tmpfile, 'w') {|file| file.write(words.join("\n"))}
    puts "Loading into aspell"
    system("aspell --lang=en create master custom.rws < #{tmpfile}")
  end
end
101
+
102
+ end
103
+
104
+ # task shortcuts
105
+ namespace :us do # short aliases; each task only has the full-name task as its prerequisite
106
+ task :start => ["ultrasphinx:daemon:start"]
107
+ task :restart => ["ultrasphinx:daemon:restart"]
108
+ task :stop => ["ultrasphinx:daemon:stop"]
109
+ task :in => ["ultrasphinx:index"] # "in" is short for index
110
+ task :spell => ["ultrasphinx:spelling:build"]
111
+ task :conf => ["ultrasphinx:configure"]
112
+ task :boot => ["ultrasphinx:bootstrap"]
113
+ end
114
+
115
+ # support methods
116
+
117
# Returns the search daemon's pid as a String: the first line of the file
# named by the first pid_file setting found in Ultrasphinx::BASE_PATH.
# Returns nil when the setting is absent or either file cannot be read.
def ultrasphinx_daemon_pid
  pid_file = File.readlines(Ultrasphinx::BASE_PATH).map do |line|
    line[/^\s*pid_file\s*=\s*([^\s\#]*)/, 1]
  end.compact.first
  return nil unless pid_file

  # File.open with a block closes the handle, and unlike Kernel#open it does
  # not run a configured value that starts with "|" as a command
  File.open(pid_file) {|file| file.readline.chomp}
rescue StandardError
  # same best-effort contract as before: any read problem means "no pid"
  nil
end
122
+
123
# True when the pid file holds a numeric pid that `ps` lists; false when
# there is no pid, the pid is not purely numeric, or `ps` does not list it.
def ultrasphinx_daemon_running?
  # read the pid file once so the check and the ps call use the same value
  pid = ultrasphinx_daemon_pid
  # the pid is interpolated into a shell command, so accept digits only
  return false unless pid.to_s =~ /\A\d+\z/

  # ps prints a header line plus one line per matching process
  `ps #{pid} | wc`.to_i > 1
end
@@ -0,0 +1,40 @@
1
+ =Sphinx Client API 0.3.0
2
+
3
+ Patched for Ultrasphinx.
4
+
5
+ This document gives an overview of what Sphinx is and how to use it
6
+ within Ruby on Rails. For more information or documentation,
7
+ please go to http://www.sphinxsearch.com
8
+
9
+ ==Sphinx
10
+
11
+ Sphinx is a standalone full-text search engine, meant to provide fast,
12
+ size-efficient and relevant fulltext search functions to other applications.
13
+ Sphinx was specially designed to integrate well with SQL databases and
14
+ scripting languages. Currently built-in data sources support fetching data
15
+ either via direct connection to MySQL, or from an XML pipe.
16
+
17
+ The simplest way to communicate with Sphinx is to use <tt>searchd</tt> -
18
+ a daemon to search through fulltext indices from external software.
19
+
20
+ ==Documentation
21
+
22
+ You can create the documentation by running:
23
+
24
+ rake rdoc
25
+
26
+ ==Latest version
27
+
28
+ You can always get the latest version from
29
+ http://kpumuk.info/projects/ror-plugins/sphinx
30
+
31
+ ==Credits
32
+
33
+ Dmytro Shteflyuk <kpumuk@kpumuk.info> http://kpumuk.info
34
+
35
+ Special thanks to Alexey Kovyrin <alexey@kovyrin.net> http://blog.kovyrin.net
36
+
37
+ ==License
38
+
39
+ This library is distributed under the terms of the Ruby license.
40
+ You can freely distribute/modify this library.