bdimcheff-dm-sphinx-adapter 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,619 @@
1
+ require 'riddle/client/filter'
2
+ require 'riddle/client/message'
3
+ require 'riddle/client/response'
4
+
5
module Riddle
  # Raised when searchd announces a protocol version this client cannot use.
  class VersionError < StandardError; end
  # Raised when searchd returns an error status or an unusable response.
  class ResponseError < StandardError; end
  # Raised by Client when connecting to searchd times out. Only defined here
  # when nothing else has provided it, so an existing definition is respected.
  class ConnectionError < StandardError; end unless const_defined?(:ConnectionError)

  # This class was heavily based on the existing Client API by Dmytro Shteflyuk
  # and Alexy Kovyrin. Their code worked fine, I just wanted something a bit
  # more Ruby-ish (ie. lowercase and underscored method names). I also have
  # used a few helper classes, just to neaten things up.
  #
  # Feel free to use it wherever. Send bug reports, patches, comments and
  # suggestions to pat at freelancing-gods dot com.
  #
  # Most properties of the client are accessible through attribute accessors,
  # and where relevant use symbols instead of the long constants common in
  # other clients.
  # Some examples:
  #
  #   client.sort_mode  = :extended
  #   client.sort_by    = "birthday DESC"
  #   client.match_mode = :extended
  #
  # To add a filter, you will need to create a Filter object:
  #
  #   client.filters << Riddle::Client::Filter.new("birthday",
  #     Time.local(1975, 1, 1).to_i..Time.local(1985, 1, 1).to_i, false)
  #
  class Client
    # Command codes sent in the request header.
    Commands = {
      :search   => 0, # SEARCHD_COMMAND_SEARCH
      :excerpt  => 1, # SEARCHD_COMMAND_EXCERPT
      :update   => 2, # SEARCHD_COMMAND_UPDATE
      :keywords => 3  # SEARCHD_COMMAND_KEYWORDS
    }

    # Per-command protocol versions sent in the request header and compared
    # against the version searchd reports back.
    Versions = {
      :search   => 0x113, # VER_COMMAND_SEARCH
      :excerpt  => 0x100, # VER_COMMAND_EXCERPT
      :update   => 0x101, # VER_COMMAND_UPDATE
      :keywords => 0x100  # VER_COMMAND_KEYWORDS
    }

    # Status codes found in response headers and in each search result.
    Statuses = {
      :ok      => 0, # SEARCHD_OK
      :error   => 1, # SEARCHD_ERROR
      :retry   => 2, # SEARCHD_RETRY
      :warning => 3  # SEARCHD_WARNING
    }

    # Valid values for +match_mode+.
    MatchModes = {
      :all       => 0, # SPH_MATCH_ALL
      :any       => 1, # SPH_MATCH_ANY
      :phrase    => 2, # SPH_MATCH_PHRASE
      :boolean   => 3, # SPH_MATCH_BOOLEAN
      :extended  => 4, # SPH_MATCH_EXTENDED
      :fullscan  => 5, # SPH_MATCH_FULLSCAN
      :extended2 => 6  # SPH_MATCH_EXTENDED2
    }

    # Valid values for +rank_mode+.
    RankModes = {
      :proximity_bm25 => 0, # SPH_RANK_PROXIMITY_BM25
      :bm25           => 1, # SPH_RANK_BM25
      :none           => 2, # SPH_RANK_NONE
      :wordcount      => 3  # SPH_RANK_WORDCOUNT
    }

    # Valid values for +sort_mode+.
    SortModes = {
      :relevance     => 0, # SPH_SORT_RELEVANCE
      :attr_desc     => 1, # SPH_SORT_ATTR_DESC
      :attr_asc      => 2, # SPH_SORT_ATTR_ASC
      :time_segments => 3, # SPH_SORT_TIME_SEGMENTS
      :extended      => 4, # SPH_SORT_EXTENDED
      :expr          => 5  # SPH_SORT_EXPR
    }

    # Attribute type codes reported by searchd for each result attribute.
    AttributeTypes = {
      :integer   => 1,         # SPH_ATTR_INTEGER
      :timestamp => 2,         # SPH_ATTR_TIMESTAMP
      :ordinal   => 3,         # SPH_ATTR_ORDINAL
      :bool      => 4,         # SPH_ATTR_BOOL
      :float     => 5,         # SPH_ATTR_FLOAT
      :multi     => 0x40000000 # SPH_ATTR_MULTI
    }

    # Valid values for +group_function+.
    GroupFunctions = {
      :day      => 0, # SPH_GROUPBY_DAY
      :week     => 1, # SPH_GROUPBY_WEEK
      :month    => 2, # SPH_GROUPBY_MONTH
      :year     => 3, # SPH_GROUPBY_YEAR
      :attr     => 4, # SPH_GROUPBY_ATTR
      :attrpair => 5  # SPH_GROUPBY_ATTRPAIR
    }

    # Filter type codes.
    FilterTypes = {
      :values      => 0, # SPH_FILTER_VALUES
      :range       => 1, # SPH_FILTER_RANGE
      :float_range => 2  # SPH_FILTER_FLOATRANGE
    }

    attr_accessor :server, :port, :offset, :limit, :max_matches,
      :match_mode, :sort_mode, :sort_by, :weights, :id_range, :filters,
      :group_by, :group_function, :group_clause, :group_distinct, :cut_off,
      :retry_count, :retry_delay, :anchor, :index_weights, :rank_mode,
      :max_query_time, :field_weights, :timeout
    attr_reader :queue

    # Can instantiate with a specific server and port - otherwise it assumes
    # defaults of localhost and 3312 respectively. All other settings can be
    # accessed and changed via the attribute accessors.
    def initialize(server = nil, port = nil)
      @server = server || "localhost"
      @port   = port   || 3312

      reset

      @queue = []
    end

    # Reset attributes and settings to defaults. The server, port and the
    # queue of pending queries are left untouched.
    def reset
      @offset         = 0
      @limit          = 20
      @max_matches    = 1000
      @match_mode     = :all
      @sort_mode      = :relevance
      @sort_by        = ''
      @weights        = []
      @id_range       = 0..0
      @filters        = []
      @group_by       = ''
      @group_function = :day
      @group_clause   = '@group desc'
      @group_distinct = ''
      @cut_off        = 0
      @retry_count    = 0
      @retry_delay    = 0
      @anchor         = {}
      # string keys are index names, integer values are weightings
      @index_weights  = {}
      @rank_mode      = :proximity_bm25
      @max_query_time = 0
      # string keys are field names, integer values are weightings
      @field_weights  = {}
      @timeout        = 0
    end

    # Set the geo-anchor point - with the names of the attributes that contain
    # the latitude and longitude (in radians), and the reference position.
    # Note that for geocoding to work properly, you must also set
    # match_mode to :extended. To sort results by distance, you will need to
    # set sort_mode to :extended and sort_by to '@geodist asc', for example
    # (sort_mode must be one of the SortModes keys; sort_by carries the
    # clause). Sphinx expects latitude and longitude to be returned from your
    # SQL source in radians.
    #
    # Example:
    #   client.set_anchor('lat', -0.6591741, 'long', 2.530770)
    #
    def set_anchor(lat_attr, lat, long_attr, long)
      @anchor = {
        :latitude_attribute  => lat_attr,
        :latitude            => lat,
        :longitude_attribute => long_attr,
        :longitude           => long
      }
    end

    # Append a query to the queue. This uses the same parameters as the query
    # method.
    def append_query(search, index = '*', comments = '')
      @queue << query_message(search, index, comments)
    end

    # Run all the queries currently in the queue. This will return an array of
    # results hashes, one per queued query and in the same order. The queue is
    # emptied once every result has been read.
    def run
      response = Response.new request(:search, @queue)

      results = @queue.collect { parse_result(response) }

      @queue.clear
      results
    end

    # Query the Sphinx daemon - defaulting to all indexes, but you can specify
    # a specific one if you wish. The search parameter should be a string
    # following Sphinx's expectations.
    #
    # The object returned from this method is a hash with the following keys:
    #
    # * :matches
    # * :fields
    # * :attributes
    # * :attribute_names
    # * :words
    # * :total
    # * :total_found
    # * :time
    # * :status
    # * :warning (if appropriate)
    # * :error (if appropriate)
    #
    # The key <tt>:matches</tt> returns an array of hashes - the actual search
    # results. Each hash has the document id (<tt>:doc</tt>), the result
    # weighting (<tt>:weight</tt>), the position of the match within the
    # result set (<tt>:index</tt>), and a hash of the attributes for the
    # document (<tt>:attributes</tt>).
    #
    # The <tt>:fields</tt> and <tt>:attribute_names</tt> keys return list of
    # fields and attributes for the documents. The key <tt>:attributes</tt>
    # will return a hash of attribute name and type pairs, and <tt>:words</tt>
    # returns a hash of hashes representing the words from the search, with the
    # number of documents and hits for each, along the lines of:
    #
    #   results[:words]["Pat"] #=> {:docs => 12, :hits => 15}
    #
    # <tt>:total</tt>, <tt>:total_found</tt> and <tt>:time</tt> return the
    # number of matches available, the total number of matches (which may be
    # greater than the maximum available, depending on the number of matches
    # and your sphinx configuration), and the time the query took to run, in
    # seconds (the value sent by searchd is divided by 1000 and rounded to
    # three decimal places).
    #
    # <tt>:status</tt> is the error code for the query - and if there was a
    # related warning, it will be under the <tt>:warning</tt> key. Fatal errors
    # will be described under <tt>:error</tt>; in that case the remaining keys
    # keep their empty defaults and :total, :total_found and :time are absent.
    #
    def query(search, index = '*', comments = '')
      @queue << query_message(search, index, comments)
      self.run.first
    end

    # Build excerpts from search terms (the +words+) and the text of documents.
    # Excerpts are bodies of text that have the +words+ highlighted.
    # They may also be abbreviated to fit within a word limit.
    #
    # As part of the options hash, you will need to
    # define:
    # * :docs
    # * :words
    # * :index
    #
    # Optional settings include:
    # * :before_match (defaults to <span class="match">)
    # * :after_match (defaults to </span>)
    # * :chunk_separator (defaults to ' &#8230; ' - which is an HTML ellipsis)
    # * :limit (defaults to 256)
    # * :around (defaults to 5)
    # * :exact_phrase (defaults to false)
    # * :single_passage (defaults to false)
    # * :use_boundaries (off unless given)
    # * :weight_order (off unless given)
    #
    # The defaults differ from the official PHP client, as I've opted for
    # semantic HTML markup.
    #
    # The hash passed in is not modified; defaults are applied to a copy.
    #
    # Example:
    #
    #   client.excerpts(:docs => ["Pat Allan, Pat Cash"], :words => 'Pat', :index => 'pats')
    #   #=> ["<span class=\"match\">Pat</span> Allan, <span class=\"match\">Pat</span> Cash"]
    #
    #   lorem_lipsum = "Lorem ipsum dolor..."
    #
    #   client.excerpts(:docs => ["Pat Allan, #{lorem_lipsum} Pat Cash"], :words => 'Pat', :index => 'pats')
    #   #=> ["<span class=\"match\">Pat</span> Allan, Lorem ipsum dolor sit amet, consectetur adipisicing
    #          elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua &#8230; . Excepteur
    #          sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est
    #          laborum. <span class=\"match\">Pat</span> Cash"]
    #
    # Workflow:
    #
    # Excerpt creation is completely isolated from searching the index. The nominated index is only used to
    # discover encoding and charset information.
    #
    # Therefore, the workflow goes:
    #
    # 1. Do the sphinx query.
    # 2. Fetch the documents found by sphinx from their repositories.
    # 3. Pass the documents' text to +excerpts+ for marking up of matched terms.
    #
    def excerpts(options = {})
      # Work on a copy so the caller's hash is never touched.
      settings = options.dup
      settings[:index]           ||= '*'
      settings[:before_match]    ||= '<span class="match">'
      settings[:after_match]     ||= '</span>'
      settings[:chunk_separator] ||= ' &#8230; ' # ellipsis
      settings[:limit]           ||= 256
      settings[:around]          ||= 5
      settings[:exact_phrase]    ||= false
      settings[:single_passage]  ||= false

      response = Response.new request(:excerpt, excerpts_message(settings))

      # One excerpt comes back per document, in the order they were sent.
      settings[:docs].collect { response.next }
    end

    # Update attributes - first parameter is the relevant index, second is an
    # array of attributes to be updated, and the third is a hash, where the
    # keys are the document ids, and the values are arrays with the attribute
    # values - in the same order as the second parameter.
    #
    # Returns the integer read from the daemon's response.
    #
    # Example:
    #
    #   client.update('people', ['birthday'], {1 => [Time.local(1982, 8, 20).to_i]})
    #
    def update(index, attributes, values_by_doc)
      response = Response.new request(
        :update,
        update_message(index, attributes, values_by_doc)
      )

      response.next_int
    end

    # Generates a keyword list for a given query. Each keyword is represented
    # by a hash, with keys :tokenised and :normalised. If return_hits is set to
    # true it will also report on the number of hits and documents for each
    # keyword (see :hits and :docs keys respectively).
    def keywords(query, index, return_hits = false)
      response = Response.new request(
        :keywords,
        keywords_message(query, index, return_hits)
      )

      (0...response.next_int).collect do
        keyword = {
          :tokenised  => response.next,
          :normalised => response.next
        }

        if return_hits
          keyword[:docs] = response.next_int
          keyword[:hits] = response.next_int
        end

        keyword
      end
    end

    private

    # Connects to the Sphinx daemon, and yields a socket to use. The socket is
    # closed at the end of the block, even if the block raises.
    #
    # When +timeout+ is non-zero the connection attempt (including the version
    # handshake) is limited to that many seconds, and Riddle::ConnectionError
    # is raised if it takes longer.
    def connect
      socket = nil
      if @timeout == 0
        socket = initialise_connection
      else
        begin
          Timeout.timeout(@timeout) { socket = initialise_connection }
        rescue Timeout::Error
          raise Riddle::ConnectionError,
            "Connection to #{@server} on #{@port} timed out after #{@timeout} seconds"
        end
      end

      begin
        yield socket
      ensure
        socket.close
      end
    end

    # Opens the TCP connection and performs the version handshake: reads the
    # daemon's version, then sends the client's own version (1).
    #
    # Raises VersionError (after closing the socket) when the daemon reports a
    # version below 1 or sends no version at all.
    def initialise_connection
      socket = TCPSocket.new @server, @port

      # Checking version. A closed or silent peer yields fewer than four
      # bytes, in which case unpack gives nil rather than an integer.
      version = socket.recv(4).unpack('N*').first
      if version.nil? || version < 1
        socket.close
        raise VersionError, "Can only connect to searchd version 1.0 or better, not version #{version.inspect}"
      end

      # Send version
      socket.send [1].pack('N'), 0

      socket
    end

    # Reads from +socket+ until +count+ bytes have arrived or the peer stops
    # sending, and returns whatever was collected.
    def read_bytes(socket, count)
      buffer = String.new
      while buffer.bytesize < count
        part = socket.recv(count - buffer.bytesize)
        # recv hands back an empty string once the peer has closed the
        # connection; stop instead of spinning forever.
        break if part.nil? || part.empty?
        buffer << part
      end
      buffer
    end

    # Send a collection of messages, for a command type (eg, search, excerpts,
    # update), to the Sphinx daemon, and return the body of its response.
    #
    # Raises ResponseError when the response is missing, empty or truncated,
    # or when searchd answers with an error, retry or unknown status. Warnings
    # from searchd, and a notice when its command version is older than this
    # client's, are written to standard output.
    def request(command, messages)
      response = String.new
      status   = -1
      version  = 0
      length   = 0
      message  = Array(messages).join("")
      # Lengths on the wire are byte counts, so treat the payload as raw bytes.
      message.force_encoding('BINARY') if message.respond_to?(:force_encoding)

      connect do |socket|
        case command
        when :search
          # Message length is +4 to account for the following count value for
          # the number of messages (well, that's what I'm assuming).
          socket.send [
            Commands[command], Versions[command],
            4 + message.bytesize, messages.length
          ].pack("nnNN") + message, 0
        else
          socket.send [
            Commands[command], Versions[command], message.bytesize
          ].pack("nnN") + message, 0
        end

        header = read_bytes(socket, 8)
        if header.bytesize < 8
          raise ResponseError, "No response from searchd (status: #{status}, version: #{version})"
        end
        status, version, length = header.unpack('n2N')

        response = read_bytes(socket, length)
      end

      if response.empty? || response.bytesize != length
        raise ResponseError, "No response from searchd (status: #{status}, version: #{version})"
      end

      case status
      when Statuses[:ok]
        if version < Versions[command]
          puts format("searchd command v.%d.%d older than client (v.%d.%d)",
            version >> 8, version & 0xff,
            Versions[command] >> 8, Versions[command] & 0xff)
        end
        response
      when Statuses[:warning]
        # The body starts with a length-prefixed warning; the real payload
        # follows it.
        length = response[0, 4].unpack('N*').first
        puts response[4, length]
        response[4 + length, response.bytesize - 4 - length]
      when Statuses[:error], Statuses[:retry]
        raise ResponseError, "searchd error (status: #{status}): #{response[4, response.bytesize - 4]}"
      else
        raise ResponseError, "Unknown searchd error (status: #{status})"
      end
    end

    # Reads a single query result from the response and returns it as a hash.
    # When the result carries the error status, only :status and :error are
    # filled in and reading stops there.
    def parse_result(response)
      result = {
        :matches         => [],
        :fields          => [],
        :attributes      => {},
        :attribute_names => [],
        :words           => {}
      }

      result[:status] = response.next_int
      case result[:status]
      when Statuses[:warning]
        result[:warning] = response.next
      when Statuses[:error]
        result[:error] = response.next
        return result
      end

      result[:fields] = response.next_array

      response.next_int.times do
        attribute_name = response.next
        type           = response.next_int

        result[:attributes][attribute_name] = type
        result[:attribute_names] << attribute_name
      end

      match_count = response.next_int
      is_64_bit   = response.next_int
      match_count.times do |position|
        doc    = is_64_bit > 0 ? response.next_64bit_int : response.next_int
        weight = response.next_int

        match = {:doc => doc, :weight => weight, :index => position, :attributes => {}}
        # Attribute values arrive in the same order the names were declared.
        result[:attribute_names].each do |attr|
          match[:attributes][attr] = attribute_from_type(
            result[:attributes][attr], response
          )
        end
        result[:matches] << match
      end

      result[:total]       = response.next_int.to_i
      result[:total_found] = response.next_int.to_i
      result[:time]        = ('%.3f' % (response.next_int / 1000.0)).to_f

      response.next_int.times do
        word = response.next
        docs = response.next_int
        hits = response.next_int
        result[:words][word] = {:docs => docs, :hits => hits}
      end

      result
    end

    # Generation of the message to send to Sphinx for a search.
    def query_message(search, index, comments = '')
      message = Message.new

      # Mode, Limits, Sort Mode
      message.append_ints @offset, @limit, MatchModes[@match_mode],
        RankModes[@rank_mode], SortModes[@sort_mode]
      message.append_string @sort_by

      # Query
      message.append_string search

      # Weights
      message.append_int @weights.length
      message.append_ints(*@weights)

      # Index
      message.append_string index

      # ID Range
      message.append_int 1
      message.append_64bit_ints @id_range.first, @id_range.last

      # Filters
      message.append_int @filters.length
      @filters.each { |filter| message.append filter.query_message }

      # Grouping
      message.append_int GroupFunctions[@group_function]
      message.append_string @group_by
      message.append_int @max_matches
      message.append_string @group_clause
      message.append_ints @cut_off, @retry_count, @retry_delay
      message.append_string @group_distinct

      # Anchor Point
      if @anchor.empty?
        message.append_int 0
      else
        message.append_int 1
        message.append_string @anchor[:latitude_attribute]
        message.append_string @anchor[:longitude_attribute]
        message.append_floats @anchor[:latitude], @anchor[:longitude]
      end

      # Per Index Weights
      message.append_int @index_weights.length
      @index_weights.each do |key, val|
        message.append_string key.to_s
        message.append_int val
      end

      # Max Query Time
      message.append_int @max_query_time

      # Per Field Weights
      message.append_int @field_weights.length
      @field_weights.each do |key, val|
        message.append_string key.to_s
        message.append_int val
      end

      message.append_string comments

      message.to_s
    end

    # Generation of the message to send to Sphinx for an excerpts request.
    def excerpts_message(options)
      message = Message.new

      # Bit 1 is always set; the remaining bits switch on optional behaviour.
      flags = 1
      flags |= 2  if options[:exact_phrase]
      flags |= 4  if options[:single_passage]
      flags |= 8  if options[:use_boundaries]
      flags |= 16 if options[:weight_order]

      message.append [0, flags].pack('N2') # 0 = mode
      message.append_string options[:index]
      message.append_string options[:words]

      # options
      message.append_string options[:before_match]
      message.append_string options[:after_match]
      message.append_string options[:chunk_separator]
      message.append_ints options[:limit], options[:around]

      message.append_array options[:docs]

      message.to_s
    end

    # Generation of the message to send to Sphinx to update attributes of a
    # document.
    def update_message(index, attributes, values_by_doc)
      message = Message.new

      message.append_string index
      message.append_array attributes

      message.append_int values_by_doc.length
      values_by_doc.each do |key, values|
        message.append_64bit_int key # document ID
        message.append_ints(*values) # array of new values (integers)
      end

      message.to_s
    end

    # Generates the simple message to send to the daemon for a keywords request.
    def keywords_message(query, index, return_hits)
      message = Message.new

      message.append_string query
      message.append_string index
      message.append_int return_hits ? 1 : 0

      message.to_s
    end

    # Reads the next attribute value from the response, choosing the reader
    # from the attribute's type code. Codes above AttributeTypes[:multi] are
    # read as arrays of the underlying type; floats use the float readers and
    # every other type is read as an integer.
    def attribute_from_type(type, response)
      is_multi = type > AttributeTypes[:multi]
      type -= AttributeTypes[:multi] if is_multi

      case type
      when AttributeTypes[:float]
        is_multi ? response.next_float_array : response.next_float
      else
        is_multi ? response.next_int_array : response.next_int
      end
    end
  end
end