ultrasphinx 1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +3 -0
- data/LICENSE +184 -0
- data/Manifest +21 -0
- data/README +94 -0
- data/Rakefile +21 -0
- data/examples/app.multi +2 -0
- data/examples/default.base +34 -0
- data/init.rb +2 -0
- data/lib/ultrasphinx.rb +20 -0
- data/lib/ultrasphinx/autoload.rb +13 -0
- data/lib/ultrasphinx/core_extensions.rb +51 -0
- data/lib/ultrasphinx/fields.rb +78 -0
- data/lib/ultrasphinx/is_indexed.rb +89 -0
- data/lib/ultrasphinx/search.rb +441 -0
- data/lib/ultrasphinx/spell.rb +41 -0
- data/lib/ultrasphinx/ultrasphinx.rb +276 -0
- data/tasks/ultrasphinx.rake +125 -0
- data/vendor/sphinx/README +40 -0
- data/vendor/sphinx/Rakefile +21 -0
- data/vendor/sphinx/init.rb +1 -0
- data/vendor/sphinx/lib/client.rb +647 -0
- metadata +66 -0
# Rake build script for the bundled Sphinx client plugin.
#
# NOTE(review): the original task descriptions and RDoc title referenced
# the unrelated "magic_enum" plugin ("Test the magic_enum plugin.",
# title 'MagicEnum') -- an apparent copy/paste leftover from another
# plugin's Rakefile. They are corrected here to describe this plugin;
# the task graph itself is unchanged.
require 'rake'
require 'spec/rake/spectask'
require 'rake/rdoctask'

desc 'Default: run unit tests.'
task :default => :spec

desc 'Test the Sphinx client plugin.'
Spec::Rake::SpecTask.new(:spec) do |t|
  t.libs << 'lib'
  t.pattern = 'spec/*_spec.rb'
end

desc 'Generate documentation for the Sphinx client plugin.'
Rake::RDocTask.new(:rdoc) do |rdoc|
  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = 'Sphinx Client'
  rdoc.options << '--line-numbers' << '--inline-source'
  rdoc.rdoc_files.include('README')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
@@ -0,0 +1 @@
|
|
1
|
+
# Plugin entry point: load the bundled Sphinx client library relative to this file.
require File.dirname(__FILE__) + '/lib/client'
|
@@ -0,0 +1,647 @@
|
|
1
|
+
# = client.rb - Sphinx Client API
|
2
|
+
#
|
3
|
+
# Author:: Dmytro Shteflyuk <mailto:kpumuk@kpumuk.info>.
|
4
|
+
# Copyright:: Copyright (c) 2006 - 2007 Dmytro Shteflyuk
|
5
|
+
# License:: Distributes under the same terms as Ruby
|
6
|
+
# Version:: 0.3.0
|
7
|
+
# Website:: http://kpumuk.info/projects/ror-plugins/sphinx
|
8
|
+
#
|
9
|
+
# This library is distributed under the terms of the Ruby license.
|
10
|
+
# You can freely distribute/modify this library.
|
11
|
+
|
12
|
+
# ==Sphinx Client API
|
13
|
+
#
|
14
|
+
# The Sphinx Client API is used to communicate with <tt>searchd</tt>
|
15
|
+
# daemon and get search results from Sphinx.
|
16
|
+
#
|
17
|
+
# ===Usage
|
18
|
+
#
|
19
|
+
# sphinx = Sphinx::Client.new
|
20
|
+
# result = sphinx.Query('test')
|
21
|
+
# ids = result['matches'].map { |id, value| id }.join(',')
|
22
|
+
# posts = Post.find :all, :conditions => "id IN (#{ids})"
|
23
|
+
#
|
24
|
+
# docs = posts.map(&:body)
|
25
|
+
# excerpts = sphinx.BuildExcerpts(docs, 'index', 'test')
|
26
|
+
module Sphinx
  # :stopdoc:

  # Base class for every error raised by this library, so callers can
  # rescue SphinxError to catch them all.
  class SphinxError < StandardError; end
  class SphinxArgumentError < SphinxError; end
  # connection or protocol handshake with searchd failed
  class SphinxConnectError < SphinxError; end
  # malformed or truncated reply from searchd
  class SphinxResponseError < SphinxError; end
  # searchd reported a general failure (SEARCHD_ERROR)
  class SphinxInternalError < SphinxError; end
  # searchd reported a temporary failure (SEARCHD_RETRY)
  class SphinxTemporaryError < SphinxError; end
  # searchd returned a status code this client does not recognize
  class SphinxUnknownError < SphinxError; end

  # :startdoc:

  class Client

    # :stopdoc:

    # Known searchd commands

    # search command
    SEARCHD_COMMAND_SEARCH = 0
    # excerpt command
    SEARCHD_COMMAND_EXCERPT = 1
    # update command
    SEARCHD_COMMAND_UPDATE = 2

    # Current client-side command implementation versions

    # search command version
    VER_COMMAND_SEARCH = 0x107
    # excerpt command version
    VER_COMMAND_EXCERPT = 0x100
    # update command version
    VER_COMMAND_UPDATE = 0x100

    # Known searchd status codes

    # general success, command-specific reply follows
    SEARCHD_OK = 0
    # general failure, command-specific reply may follow
    SEARCHD_ERROR = 1
    # temporary failure, client should retry later
    SEARCHD_RETRY = 2
    # general success, warning message and command-specific reply follow
    SEARCHD_WARNING = 3

    # :startdoc:

    # Known match modes

    # match all query words
    SPH_MATCH_ALL = 0
    # match any query word
    SPH_MATCH_ANY = 1
    # match this exact phrase
    SPH_MATCH_PHRASE = 2
    # match this boolean query
    SPH_MATCH_BOOLEAN = 3
    # match this extended query
    SPH_MATCH_EXTENDED = 4

    # Known sort modes

    # sort by document relevance desc, then by date
    SPH_SORT_RELEVANCE = 0
    # sort by the SetSortMode sort-by attribute desc, then by relevance desc
    # (original comment said "document date"; the attribute is configurable)
    SPH_SORT_ATTR_DESC = 1
    # sort by the SetSortMode sort-by attribute asc, then by relevance desc
    SPH_SORT_ATTR_ASC = 2
    # sort by time segments (hour/day/week/etc) desc, then by relevance desc
    SPH_SORT_TIME_SEGMENTS = 3
    # sort by SQL-like expression (eg. "@relevance DESC, price ASC, @id DESC")
    SPH_SORT_EXTENDED = 4

    # Known attribute types

    # this attr is just an integer
    SPH_ATTR_INTEGER = 1
    # this attr is a timestamp
    SPH_ATTR_TIMESTAMP = 2

    # Known grouping functions

    # group by day
    SPH_GROUPBY_DAY = 0
    # group by week
    SPH_GROUPBY_WEEK = 1
    # group by month
    SPH_GROUPBY_MONTH = 2
    # group by year
    SPH_GROUPBY_YEAR = 3
    # group by attribute value
    SPH_GROUPBY_ATTR = 4
|
120
|
+
# Constructs the <tt>Sphinx::Client</tt> object and sets options to their default values.
# No connection is opened here; Connect is called lazily by Query,
# BuildExcerpts, and UpdateAttributes.
def initialize
  @host = 'localhost' # searchd host (default is "localhost")
  @port = 3312 # searchd port (default is 3312)
  @offset = 0 # how many records to seek from result-set start (default is 0)
  @limit = 20 # how many records to return from result-set starting at offset (default is 20)
  @mode = SPH_MATCH_ALL # query matching mode (default is SPH_MATCH_ALL)
  @weights = [] # per-field weights (default is 1 for all fields)
  @sort = SPH_SORT_RELEVANCE # match sorting mode (default is SPH_SORT_RELEVANCE)
  @sortby = '' # attribute to sort by (default is "")
  @min_id = 0 # min ID to match (default is 0)
  @max_id = 0xFFFFFFFF # max ID to match (default is UINT_MAX)
  @filters = [] # search filters
  @groupby = '' # group-by attribute name
  @groupfunc = SPH_GROUPBY_DAY # function to pre-process group-by attribute value with
  @groupsort = '@group desc' # group-by sorting clause (to sort groups in result set with)
  @maxmatches = 1000 # max matches to retrieve

  @error = '' # last error message
  @warning = '' # last warning message
end
|
141
|
+
|
142
|
+
# Get last error message (also carried by the exception that reported it).
def GetLastError
  @error
end

# Get last warning message (set by GetResponse on SEARCHD_WARNING or
# on a server/client version mismatch).
def GetLastWarning
  @warning
end

# Set searchd server host and port to use for subsequent requests.
def SetServer(host, port)
  assert { host.instance_of? String }
  assert { port.instance_of? Fixnum }

  @host = host
  @port = port
end
|
160
|
+
|
161
|
+
# Set match offset, count, and max number to retrieve.
#
# * <tt>offset</tt> -- how many matches to skip from the result-set start
# * <tt>limit</tt>  -- how many matches to return (must be positive)
# * <tt>max</tt>    -- when positive, overrides the max-matches ceiling
def SetLimits(offset, limit, max = 0)
  [offset, limit, max].each do |value|
    assert { value.instance_of? Fixnum }
  end
  assert { offset >= 0 }
  assert { limit > 0 }
  assert { max >= 0 }

  @offset = offset
  @limit = limit
  @maxmatches = max if max > 0
end

# Set match mode; must be one of the SPH_MATCH_* constants.
def SetMatchMode(mode)
  known_modes = [SPH_MATCH_ALL, SPH_MATCH_ANY, SPH_MATCH_PHRASE,
                 SPH_MATCH_BOOLEAN, SPH_MATCH_EXTENDED]
  assert { known_modes.include?(mode) }

  @mode = mode
end
|
185
|
+
|
186
|
+
# Set matches sorting mode; must be one of the SPH_SORT_* constants.
#
# Every mode except SPH_SORT_RELEVANCE also requires a non-empty
# <tt>sortby</tt> clause (an attribute name or sort expression).
def SetSortMode(mode, sortby = '')
  known_modes = [SPH_SORT_RELEVANCE, SPH_SORT_ATTR_DESC, SPH_SORT_ATTR_ASC,
                 SPH_SORT_TIME_SEGMENTS, SPH_SORT_EXTENDED]
  assert { known_modes.include?(mode) }
  assert { sortby.instance_of? String }
  assert { mode == SPH_SORT_RELEVANCE || !sortby.empty? }

  @sort = mode
  @sortby = sortby
end
|
199
|
+
|
200
|
+
# Set per-field weights: an array of integers, one per full-text field,
# in index field order.
def SetWeights(weights)
  assert { weights.instance_of? Array }
  weights.each { |w| assert { w.instance_of? Fixnum } }

  @weights = weights
end
|
209
|
+
|
210
|
+
# Set document IDs range to match.
#
# Only match those records where the document ID is between
# <tt>min</tt> and <tt>max</tt>, inclusive at both ends.
def SetIDRange(min, max)
  [min, max].each { |bound| assert { bound.instance_of? Fixnum } }
  assert { min <= max }

  @min_id = min
  @max_id = max
end
|
222
|
+
|
223
|
+
# Set values filter.
#
# Only match those records where the <tt>attribute</tt> column value
# is contained in the given <tt>values</tt> set; pass
# <tt>exclude = true</tt> to invert the filter.
def SetFilter(attribute, values, exclude = false)
  assert { attribute.instance_of? String }
  assert { values.instance_of? Array }
  assert { !values.empty? }

  # The guard repeats the assert conditions so that empty or non-array
  # value sets are silently ignored when assertions are disabled
  # (i.e. when $DEBUG is off).
  if values.instance_of?(Array) && values.size > 0
    values.each { |value| assert { value.instance_of? Fixnum } }

    @filters << { 'attr' => attribute, 'exclude' => exclude, 'values' => values }
  end
end

# Set range filter.
#
# Only match those records where the <tt>attribute</tt> column value
# is between <tt>min</tt> and <tt>max</tt>, inclusive at both ends;
# pass <tt>exclude = true</tt> to invert the filter.
def SetFilterRange(attribute, min, max, exclude = false)
  assert { attribute.instance_of? String }
  [min, max].each { |bound| assert { bound.instance_of? Fixnum } }
  assert { min <= max }

  @filters << { 'attr' => attribute, 'exclude' => exclude, 'min' => min, 'max' => max }
end
|
253
|
+
|
254
|
+
# Set grouping attribute and function.
#
# In grouping mode, all matches are assigned to different groups based on
# the grouping function value. Each group keeps track of its total match
# count and of the best match (according to the current sorting function);
# the final result set contains one best match per group, with the
# grouping function value and the match count attached.
#
# Groups in the result set can be sorted by any sorting clause, including
# both document attributes and these special internal Sphinx attributes:
#
# * @id - match document ID;
# * @weight, @rank, @relevance - match weight;
# * @group - groupby function value;
# * @count - amount of matches in group.
#
# The default is to sort by groupby value in descending order,
# ie. by '@group desc'. 'total_found' then contains the total amount of
# matching groups over the whole index.
#
# WARNING: grouping is done in fixed memory and its results are therefore
# approximate; total_found may over-report the number of groups and
# @count may be underestimated.
#
# For example, when sorting by relevance and grouping by a "published"
# attribute with the SPH_GROUPBY_DAY function, the result set contains the
# most relevant match per each day on which anything was published, with
# the day number and per-day match count attached, sorted by day number
# in descending order (ie. recent days first).
def SetGroupBy(attribute, func, groupsort = '@group desc')
  assert { attribute.instance_of? String }
  assert { groupsort.instance_of? String }
  known_funcs = [SPH_GROUPBY_DAY, SPH_GROUPBY_WEEK, SPH_GROUPBY_MONTH,
                 SPH_GROUPBY_YEAR, SPH_GROUPBY_ATTR]
  assert { known_funcs.include?(func) }

  @groupby = attribute
  @groupfunc = func
  @groupsort = groupsort
end
|
303
|
+
|
304
|
+
# Connect to searchd server and run given search query.
#
# * <tt>query</tt> -- query string
# * <tt>index</tt> -- index name to query, default is "*" which means to query all indexes
#
# returns hash which has the following keys on success:
#
# * <tt>'matches'</tt> -- hash which maps found document_id to ('weight', 'group') hash
# * <tt>'total'</tt> -- total amount of matches retrieved (upto SPH_MAX_MATCHES, see sphinx.h)
# * <tt>'total_found'</tt> -- total amount of matching documents in index
# * <tt>'time'</tt> -- search time
# * <tt>'words'</tt> -- hash which maps query terms (stemmed!) to ('docs', 'hits') hash
#
# Raises a SphinxError subclass (via Connect/GetResponse) on connection
# or server-reported failure. All integers on the wire are big-endian
# 32-bit ('N') and strings are length-prefixed.
def Query(query, index = '*')
  sock = self.Connect

  # build request

  # mode and limits
  req = [@offset, @limit, @mode, @sort].pack('NNNN')
  req << [@sortby.length].pack('N') + @sortby
  # query itself
  req << [query.length].pack('N') + query
  # weights
  req << [@weights.length].pack('N')
  req << @weights.pack('N' * @weights.length)
  # indexes
  req << [index.length].pack('N') + index
  # id range
  req << [@min_id.to_i, @max_id.to_i].pack('NN')

  # filters: a values filter has a 'values' key, a range filter encodes
  # a zero count followed by min/max
  req << [@filters.length].pack('N')
  @filters.each do |filter|
    req << [filter['attr'].length].pack('N') + filter['attr']

    unless filter['values'].nil?
      req << [filter['values'].length].pack('N')
      req << filter['values'].pack('N' * filter['values'].length)
    else
      req << [0, filter['min'], filter['max']].pack('NNN')
    end
    req << [filter['exclude'] ? 1 : 0].pack('N')
  end

  # group-by, max matches, sort-by-group flag
  req << [@groupfunc, @groupby.length].pack('NN') + @groupby
  req << [@maxmatches].pack('N')
  req << [@groupsort.length].pack('N') + @groupsort

  # send query, get response
  len = req.length
  # add header
  req = [SEARCHD_COMMAND_SEARCH, VER_COMMAND_SEARCH, len].pack('nnN') + req
  sock.send(req, 0)

  response = GetResponse(sock, VER_COMMAND_SEARCH)

  # parse response
  result = {}
  max = response.length # protection from broken response

  # read schema: field names, then attribute name/type pairs
  p = 0
  fields = []
  attrs = {}
  attrs_names_in_order = []

  nfields = response[p, 4].unpack('N*').first; p += 4
  while nfields > 0 and p < max
    nfields -= 1
    len = response[p, 4].unpack('N*').first; p += 4
    fields << response[p, len]; p += len
  end
  result['fields'] = fields

  nattrs = response[p, 4].unpack('N*').first; p += 4
  while nattrs > 0 && p < max
    nattrs -= 1
    len = response[p, 4].unpack('N*').first; p += 4
    attr = response[p, len]; p += len
    type = response[p, 4].unpack('N*').first; p += 4
    attrs[attr] = type
    attrs_names_in_order << attr
  end
  result['attrs'] = attrs

  # read match count
  count = response[p, 4].unpack('N*').first; p += 4

  # read matches
  # NOTE: from here on the local `index` (result-set position counter)
  # shadows the `index` method parameter, which is no longer needed.
  result['matches'], index = {}, 0
  while count > 0 and p < max
    count -= 1
    doc, weight = response[p, 8].unpack('N*N*'); p += 8

    result['matches'][doc] ||= {}
    result['matches'][doc]['weight'] = weight
    result['matches'][doc]['index'] = index
    # per-match attribute values, one 32-bit value per schema attribute,
    # in schema order
    attrs_names_in_order.each do |attr|
      val = response[p, 4].unpack('N*').first; p += 4
      result['matches'][doc]['attrs'] ||= {}
      result['matches'][doc]['attrs'][attr] = val
    end
    index += 1
  end
  result['total'], result['total_found'], msecs, words = response[p, 16].unpack('N*N*N*N*'); p += 16
  # server reports milliseconds; expose seconds as a "%.3f" string
  result['time'] = '%.3f' % (msecs / 1000.0)

  # per-word statistics: docs and hits for each (stemmed) query term
  result['words'] = {}
  while words > 0 and p < max
    words -= 1
    len = response[p, 4].unpack('N*').first; p += 4
    word = response[p, len]; p += len
    docs, hits = response[p, 8].unpack('N*N*'); p += 8
    result['words'][word] = { 'docs' => docs, 'hits' => hits }
  end

  result
end
|
423
|
+
|
424
|
+
# Connect to searchd server and generate excerpts from given documents.
#
# * <tt>docs</tt> -- an array of strings which represent the documents' contents
# * <tt>index</tt> -- a string specifying the index which settings will be used
# for stemming, lexing and case folding
# * <tt>words</tt> -- a string which contains the words to highlight
# * <tt>opts</tt> is a hash which contains additional optional highlighting parameters.
#
# You can use following parameters:
# * <tt>'before_match'</tt> -- a string to insert before a set of matching words, default is "<b>"
# * <tt>'after_match'</tt> -- a string to insert after a set of matching words, default is "</b>"
# * <tt>'chunk_separator'</tt> -- a string to insert between excerpts chunks, default is " ... "
# * <tt>'limit'</tt> -- max excerpt size in symbols (codepoints), default is 256
# * <tt>'around'</tt> -- how much words to highlight around each match, default is 5
#
# Returns an array of string excerpts on success (one per input document).
# Note that missing opts keys are filled in on the caller's hash in place.
def BuildExcerpts(docs, index, words, opts = {})
  assert { docs.instance_of? Array }
  assert { index.instance_of? String }
  assert { words.instance_of? String }
  assert { opts.instance_of? Hash }

  sock = self.Connect

  # fixup options
  opts['before_match'] ||= '<b>';
  opts['after_match'] ||= '</b>';
  opts['chunk_separator'] ||= ' ... ';
  opts['limit'] ||= 256;
  opts['around'] ||= 5;

  # build request

  # v.1.0 req
  req = [0, 1].pack('N2'); # mode=0, flags=1 (remove spaces)
  # req index
  req << [index.length].pack('N') + index
  # req words
  req << [words.length].pack('N') + words

  # options (all strings are length-prefixed)
  req << [opts['before_match'].length].pack('N') + opts['before_match']
  req << [opts['after_match'].length].pack('N') + opts['after_match']
  req << [opts['chunk_separator'].length].pack('N') + opts['chunk_separator']
  req << [opts['limit'].to_i, opts['around'].to_i].pack('NN')

  # documents
  req << [docs.size].pack('N');
  docs.each do |doc|
    assert { doc.instance_of? String }

    req << [doc.length].pack('N') + doc
  end

  # send query, get response
  len = req.length
  # add header
  req = [SEARCHD_COMMAND_EXCERPT, VER_COMMAND_EXCERPT, len].pack('nnN') + req
  sock.send(req, 0)

  response = GetResponse(sock, VER_COMMAND_EXCERPT)

  # parse response: one length-prefixed excerpt per requested document
  p = 0
  res = []
  rlen = response.length
  docs.each do |doc|
    len = response[p, 4].unpack('N*').first; p += 4
    if p + len > rlen
      @error = 'incomplete reply'
      raise SphinxResponseError, @error
    end
    res << response[p, len]; p += len
  end
  return res
end
|
500
|
+
|
501
|
+
# Attribute updates
#
# Update specified attributes on specified documents.
#
# * <tt>index</tt> is a name of the index to be updated
# * <tt>attrs</tt> is an array of attribute name strings.
# * <tt>values</tt> is a hash where key is document id, and value is an array of
# new attribute values
#
# Returns number of actually updated documents (0 or more) on success.
# Returns -1 on failure.
#
# NOTE(review): as written, failures surface as SphinxError exceptions
# raised by Connect/GetResponse rather than as a -1 return value; the
# "-1 on failure" sentence above appears to describe the C/PHP API.
#
# Usage example:
#    sphinx.UpdateAttributes('index', ['group'], { 123 => [456] })
def UpdateAttributes(index, attrs, values)
  # verify everything
  assert { index.instance_of? String }

  assert { attrs.instance_of? Array }
  attrs.each do |attr|
    assert { attr.instance_of? String }
  end

  assert { values.instance_of? Hash }
  values.each do |id, entry|
    assert { id.instance_of? Fixnum }
    assert { entry.instance_of? Array }
    # every document must supply exactly one value per attribute name
    assert { entry.length == attrs.length }
    entry.each do |v|
      assert { v.instance_of? Fixnum }
    end
  end

  # build request
  req = [index.length].pack('N') + index

  req << [attrs.length].pack('N')
  attrs.each do |attr|
    req << [attr.length].pack('N') + attr
  end

  req << [values.length].pack('N')
  values.each do |id, entry|
    req << [id].pack('N')
    req << entry.pack('N' * entry.length)
  end

  # connect, send query, get response
  sock = self.Connect
  len = req.length
  req = [SEARCHD_COMMAND_UPDATE, VER_COMMAND_UPDATE, len].pack('nnN') + req # add header
  sock.send(req, 0)

  response = self.GetResponse(sock, VER_COMMAND_UPDATE)

  # parse response: first 32-bit word is the updated-documents count
  response[0, 4].unpack('N*').first
end
|
559
|
+
|
560
|
+
protected

  # Connect to searchd server.
  #
  # Performs the protocol handshake: reads the server's 32-bit protocol
  # version (must be >= 1), then sends our own version (1). Returns the
  # connected TCPSocket; raises SphinxConnectError on any failure.
  # NOTE(review): relies on TCPSocket being available -- presumably
  # 'socket' is required elsewhere in the plugin; verify at load time.
  def Connect
    begin
      sock = TCPSocket.new(@host, @port)
    rescue
      @error = "connection to #{@host}:#{@port} failed"
      raise SphinxConnectError, @error
    end

    v = sock.recv(4).unpack('N*').first
    if v < 1
      sock.close
      @error = "expected searchd protocol version 1+, got version '#{v}'"
      raise SphinxConnectError, @error
    end

    sock.send([1].pack('N'), 0)
    sock
  end
|
581
|
+
|
582
|
+
# Get and check response packet from searchd server.
#
# Reads the 8-byte header (status, version, payload length), then the
# payload, closes the socket, and validates status. Returns the payload
# (with the warning prefix stripped on SEARCHD_WARNING); raises
# SphinxResponseError / SphinxInternalError / SphinxTemporaryError /
# SphinxUnknownError otherwise.
def GetResponse(sock, client_version)
  header = sock.recv(8)
  status, ver, len = header.unpack('n2N')
  response = ''
  left = len
  # NOTE(review): if recv ever returns an empty string instead of
  # raising EOFError, `left` never decreases and this loop spins --
  # the short-read case below is only reached via EOFError.
  while left > 0 do
    begin
      chunk = sock.recv(left)
      if chunk
        response << chunk
        left -= chunk.length
      end
    rescue EOFError
      break
    end
  end
  sock.close

  # check response
  read = response.length
  if response.empty? or read != len
    @error = len \
      ? "failed to read searchd response (status=#{status}, ver=#{ver}, len=#{len}, read=#{read})" \
      : 'received zero-sized searchd response'
    raise SphinxResponseError, @error
  end

  # check status
  # on warning: first word is warning length, then warning text, then payload
  if (status == SEARCHD_WARNING)
    wlen = response[0, 4].unpack('N*').first
    @warning = response[4, wlen]
    return response[4 + wlen, response.length - 4 - wlen]
  end

  if status == SEARCHD_ERROR
    @error = 'searchd error: ' + response[4, response.length - 4]
    raise SphinxInternalError, @error
  end

  if status == SEARCHD_RETRY
    @error = 'temporary searchd error: ' + response[4, response.length - 4]
    raise SphinxTemporaryError, @error
  end

  unless status == SEARCHD_OK
    @error = "unknown status code: '#{status}'"
    raise SphinxUnknownError, @error
  end

  # check version: an older server is tolerated, but flagged as a warning
  if ver < client_version
    @warning = "searchd command v.#{ver >> 8}.#{ver & 0xff} older than client's " +
      "v.#{client_version >> 8}.#{client_version & 0xff}, some options might not work"
  end

  return response
end
|
640
|
+
|
641
|
+
# :stopdoc:
# Internal sanity check: when Ruby runs with debugging enabled ($DEBUG,
# i.e. the -d switch), evaluates the given block and raises if it yields
# a falsy value. When $DEBUG is off the block is never evaluated and the
# method is a no-op.
def assert
  raise 'Assertion failed!' if $DEBUG && !yield
end
|
645
|
+
# :startdoc:
|
646
|
+
end
|
647
|
+
end
|