RubyGems - ferret - Versions diffs - 0.9.6 → 0.10.0 - Mend

ferret 0.9.6 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (295) hide show

data/MIT-LICENSE +1 -1
data/README +12 -24
data/Rakefile +38 -54
data/TODO +14 -17
data/ext/analysis.c +982 -823
data/ext/analysis.h +133 -76
data/ext/array.c +96 -58
data/ext/array.h +40 -13
data/ext/bitvector.c +476 -118
data/ext/bitvector.h +264 -22
data/ext/compound_io.c +217 -229
data/ext/defines.h +49 -0
data/ext/document.c +107 -317
data/ext/document.h +31 -65
data/ext/except.c +81 -36
data/ext/except.h +117 -55
data/ext/extconf.rb +2 -9
data/ext/ferret.c +211 -104
data/ext/ferret.h +22 -11
data/ext/filter.c +97 -82
data/ext/fs_store.c +348 -367
data/ext/global.c +226 -188
data/ext/global.h +44 -26
data/ext/hash.c +474 -391
data/ext/hash.h +441 -68
data/ext/hashset.c +124 -96
data/ext/hashset.h +169 -20
data/ext/helper.c +56 -5
data/ext/helper.h +7 -0
data/ext/inc/lang.h +29 -49
data/ext/inc/threading.h +31 -0
data/ext/ind.c +288 -278
data/ext/ind.h +68 -0
data/ext/index.c +5688 -0
data/ext/index.h +663 -616
data/ext/lang.h +29 -49
data/ext/libstemmer.c +3 -3
data/ext/mem_pool.c +84 -0
data/ext/mem_pool.h +35 -0
data/ext/posh.c +1006 -0
data/ext/posh.h +1007 -0
data/ext/priorityqueue.c +117 -194
data/ext/priorityqueue.h +135 -39
data/ext/q_boolean.c +1305 -1108
data/ext/q_const_score.c +106 -93
data/ext/q_filtered_query.c +138 -135
data/ext/q_fuzzy.c +206 -242
data/ext/q_match_all.c +94 -80
data/ext/q_multi_term.c +663 -0
data/ext/q_parser.c +667 -593
data/ext/q_phrase.c +992 -555
data/ext/q_prefix.c +72 -61
data/ext/q_range.c +235 -210
data/ext/q_span.c +1480 -1166
data/ext/q_term.c +273 -246
data/ext/q_wildcard.c +127 -114
data/ext/r_analysis.c +1720 -711
data/ext/r_index.c +3049 -0
data/ext/r_qparser.c +433 -146
data/ext/r_search.c +2934 -1993
data/ext/r_store.c +372 -143
data/ext/r_utils.c +941 -0
data/ext/ram_store.c +330 -326
data/ext/search.c +1291 -668
data/ext/search.h +403 -702
data/ext/similarity.c +91 -113
data/ext/similarity.h +45 -30
data/ext/sort.c +721 -484
data/ext/stopwords.c +361 -273
data/ext/store.c +556 -58
data/ext/store.h +706 -126
data/ext/tags +3578 -2780
data/ext/term_vectors.c +352 -0
data/ext/threading.h +31 -0
data/ext/win32.h +54 -0
data/lib/ferret.rb +5 -17
data/lib/ferret/document.rb +130 -2
data/lib/ferret/index.rb +577 -26
data/lib/ferret/number_tools.rb +157 -0
data/lib/ferret_version.rb +3 -0
data/test/test_helper.rb +5 -13
data/test/unit/analysis/tc_analyzer.rb +513 -1
data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
data/test/unit/index/tc_index.rb +183 -240
data/test/unit/index/tc_index_reader.rb +312 -479
data/test/unit/index/tc_index_writer.rb +397 -13
data/test/unit/index/th_doc.rb +269 -206
data/test/unit/query_parser/tc_query_parser.rb +40 -33
data/test/unit/search/tc_filter.rb +59 -71
data/test/unit/search/tc_fuzzy_query.rb +24 -16
data/test/unit/search/tc_index_searcher.rb +23 -201
data/test/unit/search/tc_multi_searcher.rb +78 -226
data/test/unit/search/tc_search_and_sort.rb +93 -81
data/test/unit/search/tc_sort.rb +23 -23
data/test/unit/search/tc_sort_field.rb +7 -7
data/test/unit/search/tc_spans.rb +51 -47
data/test/unit/search/tm_searcher.rb +339 -0
data/test/unit/store/tc_fs_store.rb +1 -1
data/test/unit/store/tm_store_lock.rb +3 -3
data/test/unit/tc_document.rb +81 -0
data/test/unit/ts_analysis.rb +1 -1
data/test/unit/ts_utils.rb +1 -1
data/test/unit/utils/tc_bit_vector.rb +288 -0
data/test/unit/utils/tc_number_tools.rb +117 -0
data/test/unit/utils/tc_priority_queue.rb +106 -0
metadata +140 -301
data/CHANGELOG +0 -9
data/ext/dummy.exe +0 -0
data/ext/field.c +0 -408
data/ext/frtio.h +0 -13
data/ext/inc/except.h +0 -90
data/ext/index_io.c +0 -382
data/ext/index_rw.c +0 -2658
data/ext/lang.c +0 -41
data/ext/nix_io.c +0 -134
data/ext/q_multi_phrase.c +0 -380
data/ext/r_doc.c +0 -582
data/ext/r_index_io.c +0 -1021
data/ext/r_term.c +0 -219
data/ext/term.c +0 -820
data/ext/termdocs.c +0 -611
data/ext/vector.c +0 -637
data/ext/w32_io.c +0 -150
data/lib/ferret/analysis.rb +0 -11
data/lib/ferret/analysis/analyzers.rb +0 -112
data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
data/lib/ferret/analysis/token.rb +0 -100
data/lib/ferret/analysis/token_filters.rb +0 -86
data/lib/ferret/analysis/token_stream.rb +0 -26
data/lib/ferret/analysis/tokenizers.rb +0 -112
data/lib/ferret/analysis/word_list_loader.rb +0 -27
data/lib/ferret/document/document.rb +0 -152
data/lib/ferret/document/field.rb +0 -312
data/lib/ferret/index/compound_file_io.rb +0 -338
data/lib/ferret/index/document_writer.rb +0 -289
data/lib/ferret/index/field_infos.rb +0 -279
data/lib/ferret/index/fields_io.rb +0 -181
data/lib/ferret/index/index.rb +0 -675
data/lib/ferret/index/index_file_names.rb +0 -33
data/lib/ferret/index/index_reader.rb +0 -503
data/lib/ferret/index/index_writer.rb +0 -534
data/lib/ferret/index/multi_reader.rb +0 -377
data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
data/lib/ferret/index/segment_infos.rb +0 -130
data/lib/ferret/index/segment_merge_info.rb +0 -49
data/lib/ferret/index/segment_merge_queue.rb +0 -16
data/lib/ferret/index/segment_merger.rb +0 -358
data/lib/ferret/index/segment_reader.rb +0 -412
data/lib/ferret/index/segment_term_enum.rb +0 -169
data/lib/ferret/index/segment_term_vector.rb +0 -58
data/lib/ferret/index/term.rb +0 -53
data/lib/ferret/index/term_buffer.rb +0 -83
data/lib/ferret/index/term_doc_enum.rb +0 -291
data/lib/ferret/index/term_enum.rb +0 -52
data/lib/ferret/index/term_info.rb +0 -37
data/lib/ferret/index/term_infos_io.rb +0 -321
data/lib/ferret/index/term_vector_offset_info.rb +0 -20
data/lib/ferret/index/term_vectors_io.rb +0 -553
data/lib/ferret/query_parser.rb +0 -312
data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
data/lib/ferret/search.rb +0 -50
data/lib/ferret/search/boolean_clause.rb +0 -100
data/lib/ferret/search/boolean_query.rb +0 -299
data/lib/ferret/search/boolean_scorer.rb +0 -294
data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
data/lib/ferret/search/conjunction_scorer.rb +0 -99
data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
data/lib/ferret/search/explanation.rb +0 -41
data/lib/ferret/search/field_cache.rb +0 -215
data/lib/ferret/search/field_doc.rb +0 -31
data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
data/lib/ferret/search/filter.rb +0 -11
data/lib/ferret/search/filtered_query.rb +0 -130
data/lib/ferret/search/filtered_term_enum.rb +0 -79
data/lib/ferret/search/fuzzy_query.rb +0 -154
data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
data/lib/ferret/search/hit_collector.rb +0 -34
data/lib/ferret/search/hit_queue.rb +0 -11
data/lib/ferret/search/index_searcher.rb +0 -200
data/lib/ferret/search/match_all_query.rb +0 -104
data/lib/ferret/search/multi_phrase_query.rb +0 -216
data/lib/ferret/search/multi_searcher.rb +0 -261
data/lib/ferret/search/multi_term_query.rb +0 -65
data/lib/ferret/search/non_matching_scorer.rb +0 -22
data/lib/ferret/search/phrase_positions.rb +0 -55
data/lib/ferret/search/phrase_query.rb +0 -214
data/lib/ferret/search/phrase_scorer.rb +0 -152
data/lib/ferret/search/prefix_query.rb +0 -54
data/lib/ferret/search/query.rb +0 -140
data/lib/ferret/search/query_filter.rb +0 -51
data/lib/ferret/search/range_filter.rb +0 -103
data/lib/ferret/search/range_query.rb +0 -139
data/lib/ferret/search/req_excl_scorer.rb +0 -125
data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
data/lib/ferret/search/score_doc.rb +0 -38
data/lib/ferret/search/score_doc_comparator.rb +0 -114
data/lib/ferret/search/scorer.rb +0 -91
data/lib/ferret/search/similarity.rb +0 -278
data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
data/lib/ferret/search/sort.rb +0 -112
data/lib/ferret/search/sort_comparator.rb +0 -60
data/lib/ferret/search/sort_field.rb +0 -91
data/lib/ferret/search/spans.rb +0 -12
data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
data/lib/ferret/search/spans/span_first_query.rb +0 -79
data/lib/ferret/search/spans/span_near_query.rb +0 -108
data/lib/ferret/search/spans/span_not_query.rb +0 -130
data/lib/ferret/search/spans/span_or_query.rb +0 -176
data/lib/ferret/search/spans/span_query.rb +0 -25
data/lib/ferret/search/spans/span_scorer.rb +0 -74
data/lib/ferret/search/spans/span_term_query.rb +0 -105
data/lib/ferret/search/spans/span_weight.rb +0 -84
data/lib/ferret/search/spans/spans_enum.rb +0 -44
data/lib/ferret/search/term_query.rb +0 -128
data/lib/ferret/search/term_scorer.rb +0 -183
data/lib/ferret/search/top_docs.rb +0 -36
data/lib/ferret/search/top_field_docs.rb +0 -17
data/lib/ferret/search/weight.rb +0 -54
data/lib/ferret/search/wildcard_query.rb +0 -26
data/lib/ferret/search/wildcard_term_enum.rb +0 -61
data/lib/ferret/stemmers.rb +0 -1
data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
data/lib/ferret/store.rb +0 -5
data/lib/ferret/store/buffered_index_io.rb +0 -190
data/lib/ferret/store/directory.rb +0 -141
data/lib/ferret/store/fs_store.rb +0 -381
data/lib/ferret/store/index_io.rb +0 -245
data/lib/ferret/store/ram_store.rb +0 -286
data/lib/ferret/utils.rb +0 -8
data/lib/ferret/utils/bit_vector.rb +0 -123
data/lib/ferret/utils/date_tools.rb +0 -138
data/lib/ferret/utils/number_tools.rb +0 -91
data/lib/ferret/utils/parameter.rb +0 -41
data/lib/ferret/utils/priority_queue.rb +0 -120
data/lib/ferret/utils/string_helper.rb +0 -47
data/lib/ferret/utils/thread_local.rb +0 -28
data/lib/ferret/utils/weak_key_hash.rb +0 -60
data/lib/rferret.rb +0 -37
data/rake_utils/code_statistics.rb +0 -106
data/test/benchmark/tb_ram_store.rb +0 -76
data/test/benchmark/tb_rw_vint.rb +0 -26
data/test/functional/thread_safety_index_test.rb +0 -81
data/test/functional/thread_safety_test.rb +0 -137
data/test/longrunning/tc_numbertools.rb +0 -60
data/test/longrunning/tm_store.rb +0 -19
data/test/unit/analysis/ctc_analyzer.rb +0 -532
data/test/unit/analysis/data/wordfile +0 -6
data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
data/test/unit/analysis/rtc_stop_filter.rb +0 -14
data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
data/test/unit/analysis/tc_token.rb +0 -25
data/test/unit/document/rtc_field.rb +0 -28
data/test/unit/document/tc_document.rb +0 -47
data/test/unit/document/tc_field.rb +0 -98
data/test/unit/index/rtc_compound_file_io.rb +0 -107
data/test/unit/index/rtc_field_infos.rb +0 -127
data/test/unit/index/rtc_fields_io.rb +0 -167
data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
data/test/unit/index/rtc_segment_infos.rb +0 -74
data/test/unit/index/rtc_segment_term_docs.rb +0 -17
data/test/unit/index/rtc_segment_term_enum.rb +0 -60
data/test/unit/index/rtc_segment_term_vector.rb +0 -71
data/test/unit/index/rtc_term_buffer.rb +0 -57
data/test/unit/index/rtc_term_info.rb +0 -19
data/test/unit/index/rtc_term_infos_io.rb +0 -192
data/test/unit/index/rtc_term_vectors_io.rb +0 -108
data/test/unit/index/tc_term.rb +0 -27
data/test/unit/index/tc_term_voi.rb +0 -18
data/test/unit/search/rtc_similarity.rb +0 -37
data/test/unit/search/rtc_sort_field.rb +0 -14
data/test/unit/search/tc_multi_searcher2.rb +0 -126
data/test/unit/store/rtc_fs_store.rb +0 -62
data/test/unit/store/rtc_ram_store.rb +0 -15
data/test/unit/store/rtm_store.rb +0 -150
data/test/unit/store/rtm_store_lock.rb +0 -2
data/test/unit/ts_document.rb +0 -2
data/test/unit/utils/rtc_bit_vector.rb +0 -73
data/test/unit/utils/rtc_date_tools.rb +0 -50
data/test/unit/utils/rtc_number_tools.rb +0 -59
data/test/unit/utils/rtc_parameter.rb +0 -40
data/test/unit/utils/rtc_priority_queue.rb +0 -62
data/test/unit/utils/rtc_string_helper.rb +0 -21
data/test/unit/utils/rtc_thread.rb +0 -61
data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
data/test/utils/number_to_spoken.rb +0 -132

data/lib/ferret/search/filter.rb DELETED Viewed

@@ -1,11 +0,0 @@
-module Ferret::Search
-  # Abstract base class providing a mechanism to restrict searches to a subset
-  # of an index.
-  class Filter
-    # Returns a BitSet with true for documents which should be permitted in
-    # search results, and false for those that should not.
-    def bits(reader)
-      raise NotImplementedError
-    end
-  end
-end

data/lib/ferret/search/filtered_query.rb DELETED Viewed

@@ -1,130 +0,0 @@
-module Ferret::Search
-  # A query that applies a filter to the results of another query.
-  #
-  # Note: the bits are retrieved from the filter each time this
-  # query is used in a search - use a CachingWrapperFilter to avoid
-  # regenerating the bits every time.
-  class FilteredQuery < Query
-    attr_accessor :sub_query
-    attr_reader :filter
-    # Constructs a new query which applies a filter to the results of the
-    # original query.
-    #
-    # Filter.bits() will be called every time this query is used in a search.
-    #
-    # query::  Query to be filtered, cannot be +nil+.
-    # filter:: Filter to apply to query results, cannot be +nil+.
-    def initialize(query, filter)
-      super()
-      @sub_query = query
-      @filter = filter
-    end
-    # Returns a Weight that applies the filter to the enclosed query's Weight.
-    # This is accomplished by overriding the Scorer returned by the Weight.
-    def create_weight(searcher)
-      sub_weight = @sub_query.create_weight(searcher)
-      similarity = @sub_query.similarity(searcher)
-      return FilteredWeight.new(self, sub_weight, similarity)
-    end
-    class FilteredScorer < Scorer
-      def initialize(sub_scorer, bits, similarity)
-        super(similarity)
-        @sub_scorer = sub_scorer
-        @bits = bits
-      end
-      # pass these methods through to the enclosed scorer
-      def next?() return @sub_scorer.next?; end
-      def doc() return @sub_scorer.doc; end
-      def skip_to(i) return @sub_scorer.skip_to(i); end
-      # if the document has been filtered out, set score to 0.0
-      def score()
-        return (@bits.get(@sub_scorer.doc) ? @sub_scorer.score() : 0.0)
-      end
-      # add an explanation about whether the document was filtered
-      def explain(i)
-        exp = @sub_scorer.explain(i)
-        if (@bits.get(i))
-          exp.description = "allowed by filter: #{exp.description}"
-        else
-          exp.description = "removed by filter: #{exp.description}"
-        end
-        return exp
-      end
-    end
-    class FilteredWeight < Weight
-      attr_reader :query
-      def initialize(query, sub_weight, similarity)
-        @query = query
-        @sub_weight = sub_weight
-        @similarity = similarity
-      end
-      # pass these methods through to enclosed query's weight
-      def value()
-        return @sub_weight.value
-      end
-      def sum_of_squared_weights()
-        return @sub_weight.sum_of_squared_weights
-      end
-      def normalize(v)
-        return @sub_weight.normalize(v)
-      end
-      def explain(ir, i)
-        return @sub_weight.explain(ir, i)
-      end
-      # return a scorer that overrides the enclosed query's score if
-      # the given hit has been filtered out.
-      def scorer(reader)
-        scorer = @sub_weight.scorer(reader)
-        bits = @query.filter.bits(reader)
-        return FilteredScorer.new(scorer, bits, @similarity)
-      end
-    end
-    # Rewrites the wrapped query.
-    def rewrite(reader)
-      rewritten = @sub_query.rewrite(reader)
-      if (rewritten != @sub_query)
-        clone = self.clone()
-        clone.query = rewritten
-        return clone
-      else
-        return self
-      end
-    end
-    # inherit javadoc
-    def extract_terms(terms)
-      @sub_query.extract_terms(terms)
-    end
-    # Prints a user-readable version of this query.
-    def to_s(f = nil)
-      return "filtered(#{@sub_query.to_s(f)})->#{@filter}"
-    end
-    # Returns true iff +o+ is equal to this.
-    def eql?(o)
-      return (o.instance_of?(FilteredQuery) and
-        (@sub_query == o.sub_query) and (@filter == o.filter))
-    end
-    alias :== :eql?
-    # Returns a hash code value for this object.
-    def hash()
-      return @sub_query.hash ^ @filter.hash
-    end
-  end
-end

data/lib/ferret/search/filtered_term_enum.rb DELETED Viewed

@@ -1,79 +0,0 @@
-module Ferret::Search
-  # Abstract class for enumerating a subset of all terms.
-  #
-  # Term enumerations are always ordered by Term.<=>().  Each term in
-  # the enumeration is greater than all that precede it.
-  class FilteredTermEnum < Ferret::Index::TermEnum
-    # Returns the current Term in the enumeration.
-    # Returns nil if no Term matches or all terms have been enumerated.
-    attr_reader :term
-    def initialize()
-      @term = nil
-      @enum = nil
-      @reader = nil
-    end
-    # Equality compare on the term
-    def term_compare(term)
-      raise NotImplementedError
-    end
-    # Equality measure on the term
-    def difference()
-      raise NotImplementedError
-    end
-    # Indiciates the end of the enumeration has been reached
-    def end_enum()
-      raise NotImplementedError
-    end
-    def enum=(enum)
-      @enum = enum
-      # Find the first term that matches
-      term = @enum.term()
-      if (term != nil and term_compare(term))
-        @term = term
-      else
-        next?
-      end
-    end
-    # Returns the doc_freq of the current Term in the enumeration.
-    # Returns -1 if no Term matches or all terms have been enumerated.
-    def doc_freq()
-      if (@enum == nil)
-        return -1
-      end
-      return @enum.doc_freq()
-    end
-    # Increments the enumeration to the next element.  True if one exists.
-    def next?()
-      return false if (@enum == nil) # enum not initialized
-      @term = nil
-      while @term.nil?
-        if end_enum() or ! @enum.next?
-          return false
-        end
-        term = @enum.term()
-        if (term_compare(term))
-          @term = term
-          return true
-        end
-      end
-      @term = nil
-      return false
-    end
-    # Closes the enumeration to further activity, freeing resources.
-    def close()
-      @enum.close()
-      @term = nil
-      @enum = nil
-    end
-  end
-end

data/lib/ferret/search/fuzzy_query.rb DELETED Viewed

@@ -1,154 +0,0 @@
-module Ferret::Search
-  # Implements the fuzzy search query. The similiarity measurement
-  # is based on the Levenshtein (distance) algorithm.
-  class FuzzyQuery < MultiTermQuery
-    @@default_min_similarity = 0.5
-    @@default_prefix_length = 0
-    def FuzzyQuery.default_min_similarity()
-      return @@default_min_similarity
-    end
-    def FuzzyQuery.default_min_similarity=(minimum_similarity)
-      if (minimum_similarity >= 1.0)
-        raise ArgumentError, "minimum_similarity cannot be greater than or equal to 1"
-      elsif (minimum_similarity < 0.0)
-        raise ArgumentError, "minimum_similarity cannot be less than 0"
-      end
-      @@default_min_similarity = minimum_similarity
-    end
-    def FuzzyQuery.default_prefix_length()
-      return @@default_prefix_length
-    end
-    def FuzzyQuery.default_prefix_length=(prefix_length)
-      if (prefix_length < 0)
-        raise ArgumentError, "prefix_length cannot be less than 0"
-      end
-      @@default_prefix_length = prefix_length
-    end
-    attr_reader :prefix_length, :minimum_similarity
-    # Create a new FuzzyQuery that will match terms with a similarity
-    # of at least +minimum_similarity+ to +term+.
-    # If a +prefix_length+ > 0 is specified, a common prefix
-    # of that length is also required.
-    #
-    # term::               the term to search for
-    # minimum_similarity:: a value between 0 and 1 to set the required
-    #                      similarity between the query term and the matching
-    #                      terms. For example, for a +minimum_similarity+ of
-    #                      <tt>0.5</tt> a term of the same length as the query
-    #                      term is considered similar to the query term if the
-    #                      edit distance between both terms is less than
-    #                      <tt>length(term)*0.5</tt>
-    # prefix_length::      length of common (non-fuzzy) prefix. This is the
-    #                      number of characters at the start of a term that
-    #                      must be identical (fuzzy) to the query term if the
-    #                      query is to match that term.
-    # raises::             ArgumentError if minimum_similarity is >= 1 or < 0
-    #                      or if prefix_length < 0
-    def initialize(term,
-                   minimum_similarity = @@default_min_similarity,
-                   prefix_length = @@default_prefix_length)
-      super(term)
-      if (minimum_similarity >= 1.0)
-        raise ArgumentError, "minimum_similarity >= 1"
-      elsif (minimum_similarity < 0.0)
-        raise ArgumentError, "minimum_similarity < 0"
-      end
-      if (prefix_length < 0)
-        raise ArgumentError, "prefix_length < 0"
-      end
-      @minimum_similarity = minimum_similarity
-      @prefix_length = prefix_length
-    end
-    def get_term_enum(reader)
-      return FuzzyTermEnum.new(reader, @term, @minimum_similarity, @prefix_length)
-    end
-    def rewrite(reader)
-      fuzzy_enum = get_term_enum(reader)
-      max_clause_count = BooleanQuery.max_clause_count
-      st_queue = ScoreTermQueue.new(max_clause_count)
-      begin
-        begin
-          min_score = 0.0
-          score = 0.0
-          t = fuzzy_enum.term()
-          if t
-            score = fuzzy_enum.difference()
-            # terms come in alphabetical order, therefore if queue is full and score
-            # not bigger than min_score, we can skip
-            if(st_queue.size < max_clause_count or score > min_score)
-              st_queue.insert(ScoreTerm.new(t, score))
-              min_score = st_queue.top.score # maintain min_score
-            end
-          end
-        end while fuzzy_enum.next?
-      ensure
-        fuzzy_enum.close()
-      end
-      bq = BooleanQuery.new(true)
-      st_queue.size.times do |i|
-        st = st_queue.pop()
-        tq = TermQuery.new(st.term)                     # found a match
-        tq.boost = boost() * st.score                   # set the boost
-        bq.add_query(tq, BooleanClause::Occur::SHOULD)  # add to query
-      end
-      return bq
-    end
-    def to_s(field = nil)
-      buffer = ""
-      buffer << "#{@term.field}:" if @term.field != field
-      buffer << "#{@term.text}~"
-      buffer << minimum_similarity.to_s if minimum_similarity != 0.5
-      buffer << "^#{boost()}" if (boost() != 1.0)
-      return buffer
-    end
-    class ScoreTerm
-      attr_accessor :term, :score
-      def initialize(term, score)
-        @term = term
-        @score = score
-      end
-    end
-    class ScoreTermQueue < Ferret::Utils::PriorityQueue
-      # See PriorityQueue#less_than(o1, o2)
-      def less_than(st1, st2)
-        if (st1.score == st1.score)
-          return st1.term > st2.term
-        else
-          return st1.score < st2.score
-        end
-      end
-    end
-    def eql?(o)
-      return (o.instance_of?(FuzzyQuery) and super(o) and
-              (@minimum_similarity == o.minimum_similarity) and
-              (@prefix_length == fuzzyQuery.prefix_length))
-    end
-    alias :== :eql?
-    def hash()
-      return super ^ @minimum_similarity.hash ^ @prefix_length.hash
-    end
-  end
-end

data/lib/ferret/search/fuzzy_term_enum.rb DELETED Viewed

@@ -1,247 +0,0 @@
-require 'monitor'
-module Ferret::Search
-  # Subclass of FilteredTermEnum for enumerating all terms that are similiar
-  # to the specified filter term.
-  #
-  # Term enumerations are always ordered by Term.compareTo().  Each term in
-  # the enumeration is greater than all that precede it.
-  class FuzzyTermEnum < FilteredTermEnum
-    include MonitorMixin
-    include Ferret::Index
-    attr_reader :end_enum
-    # This should be somewhere around the average long word.
-    # If it is longer, we waste time and space. If it is shorter, we waste a
-    # little bit of time growing the array as we encounter longer words.
-    TYPICAL_LONGEST_WORD_IN_INDEX = 19
-    # Constructor for enumeration of all terms from specified +reader+ which
-    # share a prefix of length +prefix_length+ with +term+ and which have a
-    # fuzzy similarity > +min_similarity+.
-    #
-    # After calling the constructor the enumeration is already pointing to the
-    # first valid term if such a term exists.
-    #
-    # reader:: Delivers terms.
-    # term:: Pattern term.
-    # min_similarity:: Minimum required similarity for terms from the reader.
-    # Default value is 0.5.
-    # prefix_length:: Length of required common prefix. Default value is 0.
-    def initialize(reader, term,
-                   minimum_similarity = FuzzyQuery.default_min_similarity,
-                   prefix_length = FuzzyQuery.default_prefix_length)
-      super()
-      @reader = reader
-      @end_enum = false
-      @max_distances = Array.new(TYPICAL_LONGEST_WORD_IN_INDEX)
-      if (minimum_similarity >= 1.0)
-        raise ArgumentError, "minimum_similarity cannot be greater than or equal to 1"
-      elsif (minimum_similarity < 0.0)
-        raise ArgumentError, "minimum_similarity cannot be less than 0"
-      end
-      if(prefix_length < 0)
-        raise ArgumentError, "prefix_length cannot be less than 0"
-      end
-      @minimum_similarity = minimum_similarity
-      @scale_factor = 1.0 / (1.0 - @minimum_similarity)
-      @search_term = term
-      @field = @search_term.field
-      # The prefix could be longer than the word.
-      # It's kind of silly though.  It means we must match the entire word.
-      term_length = @search_term.text.length
-      if prefix_length > term_length
-        @prefix_length = term_length
-      else
-        @prefix_length = prefix_length
-      end
-      @text = @search_term.text[@prefix_length..-1]
-      @prefix = @search_term.text[0, @prefix_length]
-      initialize_max_distances()
-      # Allows us save time required to create a new array
-      # everytime similarity is called.
-      @d = init_distance_array()
-      self.enum = reader.terms_from(Term.new(@search_term.field, @prefix))
-    end
-    # The term_compare method in FuzzyTermEnum uses Levenshtein distance to
-    # calculate the distance between the given term and the comparing term.
-    def term_compare(term)
-      if (@field == term.field and term.text[0, @prefix_length] == @prefix)
-        target = term.text[@prefix_length..-1]
-        @similarity = similarity(target)
-        return (@similarity > @minimum_similarity)
-      end
-      @end_enum = true
-      return false
-    end
-    def difference()
-      return  (@scale_factor * (@similarity - @minimum_similarity))
-    end
-    # ****************************
-    # Compute Levenshtein distance
-    # ****************************
-    # Finds and returns the smallest of three integers
-    def min(a, b, c)
-      t = (a < b) ? a : b
-      return (t < c) ? t : c
-    end
-    def init_distance_array()
-      return Array.new(@text.length() + 1) {Array.new(TYPICAL_LONGEST_WORD_IN_INDEX)}
-    end
-    # Similarity returns a number that is 1.0 or less (including negative
-    # numbers) based on how similar the Term is compared to a target term.  It
-    # returns exactly 0.0 when
-    #
-    #    edit_distance < maximum_edit_distance
-    #
-    # Otherwise it returns:
-    #
-    #    1 - (edit_distance / length)
-    #
-    # where length is the length of the shortest term (text or target)
-    # including a prefix that are identical and edit_distance is the
-    # Levenshtein distance for the two words.
-    #
-    # Embedded within this algorithm is a fail-fast Levenshtein distance
-    # algorithm.  The fail-fast algorithm differs from the standard
-    # Levenshtein distance algorithm in that it is aborted if it is discovered
-    # that the mimimum distance between the words is greater than some
-    # threshold.
-    #
-    # To calculate the maximum distance threshold we use the following formula:
-    #
-    #    (1 - minimum_similarity) * length
-    #
-    # where length is the shortest term including any prefix that is not part
-    # of the similarity comparision.  This formula was derived by solving for
-    # what maximum value of distance returns false for the following
-    # statements:
-    #
-    #    similarity = 1 - (distance / (prefix_length + [textlen, targetlen].min))
-    #    return (similarity > minimum_similarity)
-    #
-    # where distance is the Levenshtein distance for the two words.
-    #
-    # Levenshtein distance (also known as edit distance) is a measure of
-    # similiarity between two strings where the distance is measured as the
-    # number of character deletions, insertions or substitutions required to
-    # transform one string to the other string.
-    #
-    # target:: the target word or phrase
-    # returns:: the similarity,  0.0 or less indicates that it matches less
-    #    than the required threshold and 1.0 indicates that the text and
-    #    target are identical
-    def similarity(target)
-      synchronize do
-        m = target.length
-        n = @text.length
-        if (n == 0)
-          # we don't have anything to compare.  That means if we just add the
-          # letters for m we get the new word
-          return (@prefix_length == 0) ? 0.0 : 1.0 - (m.to_f / @prefix_length)
-        end
-        if (m == 0)
-          return (@prefix_length == 0) ? 0.0 : 1.0 - (n.to_f / @prefix_length)
-        end
-        max_distance = max_distance(m)
-        if (max_distance < (m-n).abs)
-          #just adding the characters of m to n or vice-versa results in
-          #too many edits
-          #for example "pre" length is 3 and "prefixes" length is 8.  We can see that
-          #given this optimal circumstance, the edit distance cannot be less than 5.
-          #which is 8-3 or more precisesly Math.abs(3-8).
-          #if our maximum edit distance is 4, then we can discard this word
-          #without looking at it.
-          return 0.0
-        end
-        #let's make sure we have enough room in our array to do the distance calculations.
-        if (@d[0].length <= m)
-          grow_distance_array(m)
-        end
-        # init matrix d
-        (n+1).times {|i| @d[i][0] = i}
-        (m+1).times {|j| @d[0][j] = j}
-        # start computing edit distance
-        1.upto(n) do |i|
-          best_possible_edit_distance = m
-          s_i = @text[i-1]
-          1.upto(m) do |j|
-            if (s_i != target[j-1])
-              @d[i][j] = min(@d[i-1][j], @d[i][j-1], @d[i-1][j-1])+1
-            else
-              @d[i][j] = min(@d[i-1][j]+1, @d[i][j-1]+1, @d[i-1][j-1])
-            end
-            if @d[i][j] < best_possible_edit_distance
-              best_possible_edit_distance = @d[i][j]
-            end
-          end
-          # After calculating row i, the best possible edit distance can be
-          # found by found by finding the smallest value in a given column.
-          # If the best_possible_edit_distance is greater than the max distance,
-          # abort.
-          if (i > max_distance and best_possible_edit_distance > max_distance)
-            # equal is okay, but not greater
-            # the closest the target can be to the text is just too far away.
-            # this target is leaving the party early.
-            return 0.0
-          end
-        end
-        # this will return less than 0.0 when the edit distance is
-        # greater than the number of characters in the shorter word.
-        # but this was the formula that was previously used in FuzzyTermEnum,
-        # so it has not been changed (even though minimum_similarity must be
-        # greater than 0.0)
-        return 1.0 - (@d[n][m].to_f / (@prefix_length + (n < m ? n : m)))
-      end
-    end
-    # Grow the second dimension of the array, so that we can calculate the
-    # Levenshtein difference.
-    def grow_distance_array(m)
-      @d = @d.map {Array.new(m+1)}
-    end
-    # The max Distance is the maximum Levenshtein distance for the text
-    # compared to some other value that results in score that is
-    # better than the minimum similarity.
-    # m:: the length of the "other value"
-    # returns:: the maximum levenshtein distance that we care about
-    def max_distance(m)
-      return @max_distances[m] ||= calculate_max_distance(m)
-    end
-    def initialize_max_distances()
-      @max_distances.length.times do |i|
-        @max_distances[i] = calculate_max_distance(i)
-      end
-    end
-    def calculate_max_distance(m)
-      return ((1-@minimum_similarity) * ([@text.length, m].min + @prefix_length))
-    end
-  end
-end