ferret 0.9.6 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/MIT-LICENSE +1 -1
- data/README +12 -24
- data/Rakefile +38 -54
- data/TODO +14 -17
- data/ext/analysis.c +982 -823
- data/ext/analysis.h +133 -76
- data/ext/array.c +96 -58
- data/ext/array.h +40 -13
- data/ext/bitvector.c +476 -118
- data/ext/bitvector.h +264 -22
- data/ext/compound_io.c +217 -229
- data/ext/defines.h +49 -0
- data/ext/document.c +107 -317
- data/ext/document.h +31 -65
- data/ext/except.c +81 -36
- data/ext/except.h +117 -55
- data/ext/extconf.rb +2 -9
- data/ext/ferret.c +211 -104
- data/ext/ferret.h +22 -11
- data/ext/filter.c +97 -82
- data/ext/fs_store.c +348 -367
- data/ext/global.c +226 -188
- data/ext/global.h +44 -26
- data/ext/hash.c +474 -391
- data/ext/hash.h +441 -68
- data/ext/hashset.c +124 -96
- data/ext/hashset.h +169 -20
- data/ext/helper.c +56 -5
- data/ext/helper.h +7 -0
- data/ext/inc/lang.h +29 -49
- data/ext/inc/threading.h +31 -0
- data/ext/ind.c +288 -278
- data/ext/ind.h +68 -0
- data/ext/index.c +5688 -0
- data/ext/index.h +663 -616
- data/ext/lang.h +29 -49
- data/ext/libstemmer.c +3 -3
- data/ext/mem_pool.c +84 -0
- data/ext/mem_pool.h +35 -0
- data/ext/posh.c +1006 -0
- data/ext/posh.h +1007 -0
- data/ext/priorityqueue.c +117 -194
- data/ext/priorityqueue.h +135 -39
- data/ext/q_boolean.c +1305 -1108
- data/ext/q_const_score.c +106 -93
- data/ext/q_filtered_query.c +138 -135
- data/ext/q_fuzzy.c +206 -242
- data/ext/q_match_all.c +94 -80
- data/ext/q_multi_term.c +663 -0
- data/ext/q_parser.c +667 -593
- data/ext/q_phrase.c +992 -555
- data/ext/q_prefix.c +72 -61
- data/ext/q_range.c +235 -210
- data/ext/q_span.c +1480 -1166
- data/ext/q_term.c +273 -246
- data/ext/q_wildcard.c +127 -114
- data/ext/r_analysis.c +1720 -711
- data/ext/r_index.c +3049 -0
- data/ext/r_qparser.c +433 -146
- data/ext/r_search.c +2934 -1993
- data/ext/r_store.c +372 -143
- data/ext/r_utils.c +941 -0
- data/ext/ram_store.c +330 -326
- data/ext/search.c +1291 -668
- data/ext/search.h +403 -702
- data/ext/similarity.c +91 -113
- data/ext/similarity.h +45 -30
- data/ext/sort.c +721 -484
- data/ext/stopwords.c +361 -273
- data/ext/store.c +556 -58
- data/ext/store.h +706 -126
- data/ext/tags +3578 -2780
- data/ext/term_vectors.c +352 -0
- data/ext/threading.h +31 -0
- data/ext/win32.h +54 -0
- data/lib/ferret.rb +5 -17
- data/lib/ferret/document.rb +130 -2
- data/lib/ferret/index.rb +577 -26
- data/lib/ferret/number_tools.rb +157 -0
- data/lib/ferret_version.rb +3 -0
- data/test/test_helper.rb +5 -13
- data/test/unit/analysis/tc_analyzer.rb +513 -1
- data/test/unit/analysis/{ctc_tokenstream.rb → tc_token_stream.rb} +23 -0
- data/test/unit/index/tc_index.rb +183 -240
- data/test/unit/index/tc_index_reader.rb +312 -479
- data/test/unit/index/tc_index_writer.rb +397 -13
- data/test/unit/index/th_doc.rb +269 -206
- data/test/unit/query_parser/tc_query_parser.rb +40 -33
- data/test/unit/search/tc_filter.rb +59 -71
- data/test/unit/search/tc_fuzzy_query.rb +24 -16
- data/test/unit/search/tc_index_searcher.rb +23 -201
- data/test/unit/search/tc_multi_searcher.rb +78 -226
- data/test/unit/search/tc_search_and_sort.rb +93 -81
- data/test/unit/search/tc_sort.rb +23 -23
- data/test/unit/search/tc_sort_field.rb +7 -7
- data/test/unit/search/tc_spans.rb +51 -47
- data/test/unit/search/tm_searcher.rb +339 -0
- data/test/unit/store/tc_fs_store.rb +1 -1
- data/test/unit/store/tm_store_lock.rb +3 -3
- data/test/unit/tc_document.rb +81 -0
- data/test/unit/ts_analysis.rb +1 -1
- data/test/unit/ts_utils.rb +1 -1
- data/test/unit/utils/tc_bit_vector.rb +288 -0
- data/test/unit/utils/tc_number_tools.rb +117 -0
- data/test/unit/utils/tc_priority_queue.rb +106 -0
- metadata +140 -301
- data/CHANGELOG +0 -9
- data/ext/dummy.exe +0 -0
- data/ext/field.c +0 -408
- data/ext/frtio.h +0 -13
- data/ext/inc/except.h +0 -90
- data/ext/index_io.c +0 -382
- data/ext/index_rw.c +0 -2658
- data/ext/lang.c +0 -41
- data/ext/nix_io.c +0 -134
- data/ext/q_multi_phrase.c +0 -380
- data/ext/r_doc.c +0 -582
- data/ext/r_index_io.c +0 -1021
- data/ext/r_term.c +0 -219
- data/ext/term.c +0 -820
- data/ext/termdocs.c +0 -611
- data/ext/vector.c +0 -637
- data/ext/w32_io.c +0 -150
- data/lib/ferret/analysis.rb +0 -11
- data/lib/ferret/analysis/analyzers.rb +0 -112
- data/lib/ferret/analysis/standard_tokenizer.rb +0 -71
- data/lib/ferret/analysis/token.rb +0 -100
- data/lib/ferret/analysis/token_filters.rb +0 -86
- data/lib/ferret/analysis/token_stream.rb +0 -26
- data/lib/ferret/analysis/tokenizers.rb +0 -112
- data/lib/ferret/analysis/word_list_loader.rb +0 -27
- data/lib/ferret/document/document.rb +0 -152
- data/lib/ferret/document/field.rb +0 -312
- data/lib/ferret/index/compound_file_io.rb +0 -338
- data/lib/ferret/index/document_writer.rb +0 -289
- data/lib/ferret/index/field_infos.rb +0 -279
- data/lib/ferret/index/fields_io.rb +0 -181
- data/lib/ferret/index/index.rb +0 -675
- data/lib/ferret/index/index_file_names.rb +0 -33
- data/lib/ferret/index/index_reader.rb +0 -503
- data/lib/ferret/index/index_writer.rb +0 -534
- data/lib/ferret/index/multi_reader.rb +0 -377
- data/lib/ferret/index/multiple_term_doc_pos_enum.rb +0 -98
- data/lib/ferret/index/segment_infos.rb +0 -130
- data/lib/ferret/index/segment_merge_info.rb +0 -49
- data/lib/ferret/index/segment_merge_queue.rb +0 -16
- data/lib/ferret/index/segment_merger.rb +0 -358
- data/lib/ferret/index/segment_reader.rb +0 -412
- data/lib/ferret/index/segment_term_enum.rb +0 -169
- data/lib/ferret/index/segment_term_vector.rb +0 -58
- data/lib/ferret/index/term.rb +0 -53
- data/lib/ferret/index/term_buffer.rb +0 -83
- data/lib/ferret/index/term_doc_enum.rb +0 -291
- data/lib/ferret/index/term_enum.rb +0 -52
- data/lib/ferret/index/term_info.rb +0 -37
- data/lib/ferret/index/term_infos_io.rb +0 -321
- data/lib/ferret/index/term_vector_offset_info.rb +0 -20
- data/lib/ferret/index/term_vectors_io.rb +0 -553
- data/lib/ferret/query_parser.rb +0 -312
- data/lib/ferret/query_parser/query_parser.tab.rb +0 -928
- data/lib/ferret/search.rb +0 -50
- data/lib/ferret/search/boolean_clause.rb +0 -100
- data/lib/ferret/search/boolean_query.rb +0 -299
- data/lib/ferret/search/boolean_scorer.rb +0 -294
- data/lib/ferret/search/caching_wrapper_filter.rb +0 -40
- data/lib/ferret/search/conjunction_scorer.rb +0 -99
- data/lib/ferret/search/disjunction_sum_scorer.rb +0 -205
- data/lib/ferret/search/exact_phrase_scorer.rb +0 -32
- data/lib/ferret/search/explanation.rb +0 -41
- data/lib/ferret/search/field_cache.rb +0 -215
- data/lib/ferret/search/field_doc.rb +0 -31
- data/lib/ferret/search/field_sorted_hit_queue.rb +0 -184
- data/lib/ferret/search/filter.rb +0 -11
- data/lib/ferret/search/filtered_query.rb +0 -130
- data/lib/ferret/search/filtered_term_enum.rb +0 -79
- data/lib/ferret/search/fuzzy_query.rb +0 -154
- data/lib/ferret/search/fuzzy_term_enum.rb +0 -247
- data/lib/ferret/search/hit_collector.rb +0 -34
- data/lib/ferret/search/hit_queue.rb +0 -11
- data/lib/ferret/search/index_searcher.rb +0 -200
- data/lib/ferret/search/match_all_query.rb +0 -104
- data/lib/ferret/search/multi_phrase_query.rb +0 -216
- data/lib/ferret/search/multi_searcher.rb +0 -261
- data/lib/ferret/search/multi_term_query.rb +0 -65
- data/lib/ferret/search/non_matching_scorer.rb +0 -22
- data/lib/ferret/search/phrase_positions.rb +0 -55
- data/lib/ferret/search/phrase_query.rb +0 -214
- data/lib/ferret/search/phrase_scorer.rb +0 -152
- data/lib/ferret/search/prefix_query.rb +0 -54
- data/lib/ferret/search/query.rb +0 -140
- data/lib/ferret/search/query_filter.rb +0 -51
- data/lib/ferret/search/range_filter.rb +0 -103
- data/lib/ferret/search/range_query.rb +0 -139
- data/lib/ferret/search/req_excl_scorer.rb +0 -125
- data/lib/ferret/search/req_opt_sum_scorer.rb +0 -70
- data/lib/ferret/search/score_doc.rb +0 -38
- data/lib/ferret/search/score_doc_comparator.rb +0 -114
- data/lib/ferret/search/scorer.rb +0 -91
- data/lib/ferret/search/similarity.rb +0 -278
- data/lib/ferret/search/sloppy_phrase_scorer.rb +0 -47
- data/lib/ferret/search/sort.rb +0 -112
- data/lib/ferret/search/sort_comparator.rb +0 -60
- data/lib/ferret/search/sort_field.rb +0 -91
- data/lib/ferret/search/spans.rb +0 -12
- data/lib/ferret/search/spans/near_spans_enum.rb +0 -304
- data/lib/ferret/search/spans/span_first_query.rb +0 -79
- data/lib/ferret/search/spans/span_near_query.rb +0 -108
- data/lib/ferret/search/spans/span_not_query.rb +0 -130
- data/lib/ferret/search/spans/span_or_query.rb +0 -176
- data/lib/ferret/search/spans/span_query.rb +0 -25
- data/lib/ferret/search/spans/span_scorer.rb +0 -74
- data/lib/ferret/search/spans/span_term_query.rb +0 -105
- data/lib/ferret/search/spans/span_weight.rb +0 -84
- data/lib/ferret/search/spans/spans_enum.rb +0 -44
- data/lib/ferret/search/term_query.rb +0 -128
- data/lib/ferret/search/term_scorer.rb +0 -183
- data/lib/ferret/search/top_docs.rb +0 -36
- data/lib/ferret/search/top_field_docs.rb +0 -17
- data/lib/ferret/search/weight.rb +0 -54
- data/lib/ferret/search/wildcard_query.rb +0 -26
- data/lib/ferret/search/wildcard_term_enum.rb +0 -61
- data/lib/ferret/stemmers.rb +0 -1
- data/lib/ferret/stemmers/porter_stemmer.rb +0 -218
- data/lib/ferret/store.rb +0 -5
- data/lib/ferret/store/buffered_index_io.rb +0 -190
- data/lib/ferret/store/directory.rb +0 -141
- data/lib/ferret/store/fs_store.rb +0 -381
- data/lib/ferret/store/index_io.rb +0 -245
- data/lib/ferret/store/ram_store.rb +0 -286
- data/lib/ferret/utils.rb +0 -8
- data/lib/ferret/utils/bit_vector.rb +0 -123
- data/lib/ferret/utils/date_tools.rb +0 -138
- data/lib/ferret/utils/number_tools.rb +0 -91
- data/lib/ferret/utils/parameter.rb +0 -41
- data/lib/ferret/utils/priority_queue.rb +0 -120
- data/lib/ferret/utils/string_helper.rb +0 -47
- data/lib/ferret/utils/thread_local.rb +0 -28
- data/lib/ferret/utils/weak_key_hash.rb +0 -60
- data/lib/rferret.rb +0 -37
- data/rake_utils/code_statistics.rb +0 -106
- data/test/benchmark/tb_ram_store.rb +0 -76
- data/test/benchmark/tb_rw_vint.rb +0 -26
- data/test/functional/thread_safety_index_test.rb +0 -81
- data/test/functional/thread_safety_test.rb +0 -137
- data/test/longrunning/tc_numbertools.rb +0 -60
- data/test/longrunning/tm_store.rb +0 -19
- data/test/unit/analysis/ctc_analyzer.rb +0 -532
- data/test/unit/analysis/data/wordfile +0 -6
- data/test/unit/analysis/rtc_letter_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_filter.rb +0 -20
- data/test/unit/analysis/rtc_lower_case_tokenizer.rb +0 -27
- data/test/unit/analysis/rtc_per_field_analyzer_wrapper.rb +0 -39
- data/test/unit/analysis/rtc_porter_stem_filter.rb +0 -16
- data/test/unit/analysis/rtc_standard_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_standard_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_stop_analyzer.rb +0 -20
- data/test/unit/analysis/rtc_stop_filter.rb +0 -14
- data/test/unit/analysis/rtc_white_space_analyzer.rb +0 -21
- data/test/unit/analysis/rtc_white_space_tokenizer.rb +0 -20
- data/test/unit/analysis/rtc_word_list_loader.rb +0 -32
- data/test/unit/analysis/tc_token.rb +0 -25
- data/test/unit/document/rtc_field.rb +0 -28
- data/test/unit/document/tc_document.rb +0 -47
- data/test/unit/document/tc_field.rb +0 -98
- data/test/unit/index/rtc_compound_file_io.rb +0 -107
- data/test/unit/index/rtc_field_infos.rb +0 -127
- data/test/unit/index/rtc_fields_io.rb +0 -167
- data/test/unit/index/rtc_multiple_term_doc_pos_enum.rb +0 -83
- data/test/unit/index/rtc_segment_infos.rb +0 -74
- data/test/unit/index/rtc_segment_term_docs.rb +0 -17
- data/test/unit/index/rtc_segment_term_enum.rb +0 -60
- data/test/unit/index/rtc_segment_term_vector.rb +0 -71
- data/test/unit/index/rtc_term_buffer.rb +0 -57
- data/test/unit/index/rtc_term_info.rb +0 -19
- data/test/unit/index/rtc_term_infos_io.rb +0 -192
- data/test/unit/index/rtc_term_vectors_io.rb +0 -108
- data/test/unit/index/tc_term.rb +0 -27
- data/test/unit/index/tc_term_voi.rb +0 -18
- data/test/unit/search/rtc_similarity.rb +0 -37
- data/test/unit/search/rtc_sort_field.rb +0 -14
- data/test/unit/search/tc_multi_searcher2.rb +0 -126
- data/test/unit/store/rtc_fs_store.rb +0 -62
- data/test/unit/store/rtc_ram_store.rb +0 -15
- data/test/unit/store/rtm_store.rb +0 -150
- data/test/unit/store/rtm_store_lock.rb +0 -2
- data/test/unit/ts_document.rb +0 -2
- data/test/unit/utils/rtc_bit_vector.rb +0 -73
- data/test/unit/utils/rtc_date_tools.rb +0 -50
- data/test/unit/utils/rtc_number_tools.rb +0 -59
- data/test/unit/utils/rtc_parameter.rb +0 -40
- data/test/unit/utils/rtc_priority_queue.rb +0 -62
- data/test/unit/utils/rtc_string_helper.rb +0 -21
- data/test/unit/utils/rtc_thread.rb +0 -61
- data/test/unit/utils/rtc_weak_key_hash.rb +0 -25
- data/test/utils/number_to_spoken.rb +0 -132
--- data/lib/ferret/analysis/token_filters.rb
+++ /dev/null
@@ -1,86 +0,0 @@
-module Ferret::Analysis
-  # A TokenFilter is a TokenStream whose input is another token stream.
-  #
-  # This is an abstract class.
-  class TokenFilter < TokenStream
-    # Close the input TokenStream.
-    def close()
-      @input.close()
-    end
-
-    protected
-      # Construct a token stream filtering the given input.
-      def initialize(input)
-        @input = input
-      end
-  end
-
-  # Normalizes token text to lower case.
-  class LowerCaseFilter < TokenFilter
-    def next()
-      t = @input.next()
-
-      if (t == nil)
-        return nil
-      end
-
-      t.text = t.text.downcase()
-
-      return t
-    end
-  end
-
-  # Removes stop words from a token stream. You will need to pass your own
-  # set of stopwords to use this stop filter. If you wish to use the default
-  # list of stopwords then use the StopAnalyzer.
-  class StopFilter < TokenFilter
-    # Constructs a filter which removes words from the input
-    # TokenStream that are named in the array of words.
-    def initialize(input, stop_set)
-      super(input)
-      @stop_set = stop_set
-    end
-
-    def StopFilter.new_with_file(input, path)
-      ws = WordListLoader.word_set_from_file(path)
-      return StopFilter.new(input, ws)
-    end
-
-    # Returns the next input Token whose text is not a stop word.
-    def next()
-      # return the first non-stop word found
-      while token = @input.next()
-        return token if ! @stop_set.include?(token.text)
-      end
-      return nil
-    end
-  end
-
-  # Transforms the token stream as per the Porter stemming algorithm.
-  # Note: the input to the stemming filter must already be in lower case,
-  # so you will need to use LowerCaseFilter or LowerCaseTokenizer further
-  # down the Tokenizer chain in order for this to work properly!
-  #
-  # To use this filter with other analyzers, you'll want to write an
-  # Analyzer class that sets up the TokenStream chain as you want it.
-  # To use this with LowerCaseTokenizer, for example, you'd write an
-  # analyzer like this:
-  #
-  #   class MyAnalyzer < Analyzer
-  #     def token_stream(field, reader)
-  #       return PorterStemFilter.new(LowerCaseTokenizer.new(reader))
-  #     end
-  #   end
-  class PorterStemFilter < TokenFilter
-    # Returns the next input Token, after being stemmed
-    def next()
-      token = @input.next()
-      if (token == nil)
-        return nil
-      else
-        token.text = Stemmable.stem_porter(token.text)
-      end
-      token
-    end
-  end
-end
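Each removed TokenFilter wraps another TokenStream and rewrites tokens as they pass through, so analysis chains were built by nesting. A minimal sketch of such a chain against the removed 0.9.x API, using only classes deleted in this diff (the sample text and stop list are illustrative):

    require 'ferret'   # 0.9.x, where the pure-Ruby analysis classes exist
    include Ferret::Analysis

    # Tokenize and lowercase, drop stop words, then stem what remains.
    stream = LowerCaseTokenizer.new("The Quick BROWN foxes")
    stream = StopFilter.new(stream, ["the"])
    stream = PorterStemFilter.new(stream)

    stream.each { |token| puts token.text }   # => quick, brown, fox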
--- data/lib/ferret/analysis/token_stream.rb
+++ /dev/null
@@ -1,26 +0,0 @@
-module Ferret::Analysis
-  # A TokenStream enumerates the sequence of tokens, either from
-  # fields of a document or from query text.
-  #
-  # This is an abstract class. Concrete subclasses are:
-  # * Tokenizer, a TokenStream whose input is a Reader; and
-  # * TokenFilter, a TokenStream whose input is another TokenStream.
-  class TokenStream
-    # Returns the next token in the stream, or null at EOS.
-    def next
-      raise NotImplementedError
-    end
-
-    # Releases resources associated with this stream.
-    def close
-      raise NotImplementedError
-    end
-
-    # Iterates through the tokens in the field
-    def each # :yields: token
-      while (n = self.next())
-        yield n
-      end
-    end
-  end
-end
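Any object honouring this next/close contract could stand in wherever a TokenStream was expected. A sketch of a trivial concrete subclass against the removed 0.9.x API (the class name and token are invented for illustration; Token.new(text, start, end) follows the signature used in tokenizers.rb below):

    require 'ferret'

    # Replays a fixed list of tokens, then returns nil at end-of-stream.
    class CannedTokenStream < Ferret::Analysis::TokenStream
      def initialize(tokens)
        @tokens = tokens.dup
      end

      def next
        @tokens.shift   # nil once the list is exhausted
      end

      def close
        @tokens.clear
      end
    end

    stream = CannedTokenStream.new([Ferret::Analysis::Token.new("hello", 0, 5)])
    stream.each { |t| puts t.text }   # => hello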
--- data/lib/ferret/analysis/tokenizers.rb
+++ /dev/null
@@ -1,112 +0,0 @@
-require 'strscan'
-
-module Ferret::Analysis
-  # A Tokenizer is a TokenStream whose input is a Reader.
-  #
-  # This is an abstract class.
-  class Tokenizer < TokenStream
-    # By default, closes the input Reader.
-    def close()
-      @input.close()
-    end
-
-    protected
-      # Construct a token stream processing the given input.
-      def initialize(input)
-        @input = input
-      end
-  end
-
-  # An abstract base class for simple regular expression oriented
-  # tokenizers. Very powerful tokenizers can be created using this class as
-  # can be seen from the StandardTokenizer class. Below is an example of a
-  # simple implementation of a LetterTokenizer using a RegExpTokenizer.
-  # Basically, a token is a sequence of alphabetic characters separated by
-  # one or more non-alphabetic characters.
-  #
-  #   class LetterTokenizer < RegExpTokenizer
-  #     def token_re()
-  #       /[[:alpha:]]+/
-  #     end
-  #   end
-  class RegExpTokenizer < Tokenizer
-
-    # Initialize with an IO implementing input such as a file.
-    #
-    # input:: must have a read(count) method which returns an array or string
-    #         of _count_ chars.
-    def initialize(input)
-      #@token_buffer = Token.new("", 0, 0)
-      if input.is_a? String
-        @ss = StringScanner.new(input)
-      else
-        @ss = StringScanner.new(input.read())
-      end
-    end
-
-    # Returns the next token in the stream, or nil at EOS.
-    def next()
-      if @ss.scan_until(token_re)
-        term = @ss.matched
-        term_end = @ss.pos
-        term_start = term_end - term.size
-      else
-        return nil
-      end
-
-      #return @token_buffer.set!(normalize(term), term_start, term_end)
-      return Token.new(normalize(term), term_start, term_end)
-    end
-
-    def close()
-      @ss = nil
-    end
-
-    protected
-      # returns the regular expression used to find the next token
-      TOKEN_RE = /[[:alpha:]]+/
-      def token_re
-        TOKEN_RE
-      end
-
-      # Called on each token to normalize it before it is added to the
-      # token stream. The default implementation does nothing. Subclasses
-      # may use this to, e.g., lowercase tokens.
-      def normalize(str) return str end
-  end
-
-
-  # A LetterTokenizer is a tokenizer that divides text at non-letters.
-  # That's to say, it defines tokens as maximal strings of adjacent letters,
-  # as defined by the regular expression _/[[:alpha:]]+/_.
-  class LetterTokenizer < RegExpTokenizer
-    protected
-      # Collects only characters which satisfy the regular expression
-      # _/[[:alpha:]]+/_.
-      TOKEN_RE = /[[:alpha:]]+/
-      def token_re
-        TOKEN_RE
-      end
-  end
-
-  # LowerCaseTokenizer performs the function of LetterTokenizer
-  # and LowerCaseFilter together. It divides text at non-letters and converts
-  # them to lower case.
-  class LowerCaseTokenizer < LetterTokenizer
-    protected
-      def normalize(str)
-        return str.downcase
-      end
-  end
-
-  # A WhiteSpaceTokenizer is a tokenizer that divides text at whitespace.
-  # Adjacent sequences of non-whitespace characters form tokens.
-  class WhiteSpaceTokenizer < RegExpTokenizer
-    protected
-      # Collects only characters which are not spaces, tabs or carriage returns
-      TOKEN_RE = /\S+/
-      def token_re
-        TOKEN_RE
-      end
-  end
-end
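New tokenizers only had to supply token_re, as the LetterTokenizer example in the comments above shows. A sketch of a hypothetical digit tokenizer built the same way against the removed 0.9.x API:

    require 'ferret'

    # Treats maximal runs of digits as tokens; everything else separates them.
    class DigitTokenizer < Ferret::Analysis::RegExpTokenizer
      protected
        TOKEN_RE = /[0-9]+/
        def token_re
          TOKEN_RE
        end
    end

    DigitTokenizer.new("born 1974, shipped 2006").each { |t| puts t.text }
    # => 1974, 2006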
--- data/lib/ferret/analysis/word_list_loader.rb
+++ /dev/null
@@ -1,27 +0,0 @@
-require 'set'
-module Ferret::Analysis
-  # Loader for text files that represent a list of stopwords.
-  module WordListLoader
-    # Loads a text file and adds every line as an entry to a HashSet (omitting
-    # leading and trailing whitespace). Every line of the file should contain only
-    # one word. The words need to be in lowercase if you make use of an
-    # Analyzer which uses LowerCaseFilter (like GermanAnalyzer).
-    #
-    # path:: path to file containing the wordlist
-    # return:: A HashSet with the file's words
-    def WordListLoader.word_set_from_file(path)
-      result = Set.new()
-      File.open(path) do |word_file|
-        # we have to strip the end of line characters
-        word_file.each {|line| result << line[0..-2] }
-      end
-      return result
-    end
-
-    def WordListLoader.word_set_from_array(word_array)
-      result = Set.new()
-      word_array.each {|word| result << word }
-      return result
-    end
-  end
-end
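This loader is what StopFilter.new_with_file builds on. A sketch of both entry points against the removed 0.9.x API ("stopwords.txt" stands for any hypothetical one-word-per-line file):

    require 'ferret'
    include Ferret::Analysis

    stops = WordListLoader.word_set_from_array(%w(the a an))
    # or, from a file containing one lowercase word per line:
    # stops = WordListLoader.word_set_from_file("stopwords.txt")

    stream = StopFilter.new(LowerCaseTokenizer.new("An apple a day"), stops)
    stream.each { |t| puts t.text }   # => apple, day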
--- data/lib/ferret/document/document.rb
+++ /dev/null
@@ -1,152 +0,0 @@
-module Ferret::Document
-  # Documents are the unit of indexing and search.
-  #
-  # A Document is a set of fields. Each field has a name and a textual
-  # value. A field may be Field#stored?() with the document, in which case
-  # it is returned with search hits on the document. Thus each document
-  # should typically contain one or more stored fields which uniquely
-  # identify it.
-  #
-  # Note that fields which are _not_ Field#stored?() are _not_ available in
-  # documents retrieved from the index, e.g. with Hits#doc, Searcher#doc or
-  # IndexReader#document.
-  #
-  # Several fields may be added with the same name. In this case, if the
-  # fields are indexed, their text is treated as though appended for the
-  # purposes of search.
-  #
-  # Note that add, like the remove_field(s) methods, only makes sense prior
-  # to adding a document to an index. These methods cannot be used to change
-  # the content of an existing index! In order to achieve this, a document
-  # has to be deleted from an index and a new changed version of that
-  # document has to be added.
-  class Document
-    attr_accessor :boost
-
-    # Constructs a new document with no fields.
-    def initialize()
-      # Values are multiplied into the value of Field#boost of each field in
-      # this document. Thus, this attribute in effect sets a default boost
-      # for the fields of this document.
-      #
-      # The default value is 1.0.
-      #
-      # Note: This value is not stored directly with the document in the
-      # index. Documents returned from IndexReader#document and Hits#doc
-      # may thus not have the same value present as when this document was
-      # indexed.
-      @boost = 1.0
-      @fields = {}
-    end
-
-    # Returns an array of all fields. Note that it is possible for two
-    # fields to appear with the same field name. These will be concatenated
-    # in the index.
-    def all_fields
-      @fields.values.flatten
-    end
-
-    # Returns the number of distinct fields held within the document. This
-    # counts fields which have multiple entries as one.
-    def field_count()
-      return @fields.size
-    end
-
-    # Returns the number of entries held within the document. This counts
-    # all sections, so for fields which have multiple entries, each entry
-    # is counted.
-    def entry_count()
-      return @fields.values.flatten.size
-    end
-
-    # Adds a field to a document. Several fields may be added with the same
-    # name. In this case, if the fields are indexed, their text is treated
-    # as though appended for the purposes of search.
-    #
-    # Note that add, like the remove_field(s) methods, only makes sense
-    # prior to adding a document to an index. These methods cannot be used
-    # to change the content of an existing index! In order to achieve this,
-    # a document has to be deleted from an index and a new changed version
-    # of that document has to be added.
-    def add_field(field)
-      (@fields[field.name.to_s] ||= []) << field
-    end
-    alias :<< :add_field
-
-    # Removes the first field of this name if it exists.
-    def remove_field(name)
-      @fields[name.to_s].delete_at(0)
-    end
-
-    # Removes all fields with the given name from the document.
-    #
-    # If there is no field with the specified name, the document remains
-    # unchanged.
-    #
-    # Note that the remove_field(s) methods, like the add method, only make
-    # sense prior to adding a document to an index. These methods cannot be
-    # used to change the content of an existing index! In order to achieve
-    # this, a document has to be deleted from an index and a new changed
-    # version of that document has to be added.
-    def remove_fields(name)
-      @fields.delete(name.to_s)
-    end
-
-    # Returns the first field with the given name.
-    # This method can return _nil_.
-    #
-    # name:: the name of the field
-    # Return:: a _Field_
-    def field(name)
-      @fields[name.to_s] ? @fields[name.to_s][0] : nil
-    end
-
-    # Returns an array of all fields with the given name.
-    # This method can return _nil_.
-    #
-    # name:: the name of the field
-    # Return:: a _Field_ array
-    def fields(name)
-      @fields[name.to_s]
-    end
-
-    # Returns the values of the field specified as the method parameter,
-    # joined into a single string. This method can return _nil_.
-    #
-    # name:: the name of the field
-    # Return:: a _String_ of field values
-    def values(name)
-      return nil if @fields[name.to_s].nil?
-      @fields[name.to_s].map {|f| f.data if not f.binary? }.join(" ")
-    end
-    alias :[] :values
-
-    # Sets the data in field +field_name+ to +data+. If there is more than
-    # one field of that name then it will set the data in the first field of
-    # that name. If there is no field of that name, a new one will be created.
-    def []=(field_name, data)
-      field = field(field_name.to_s)
-      if field
-        field.data = data
-      else
-        add_field(Field.new(field_name.to_s, data))
-      end
-    end
-
-    # Returns an array of the binary values of the field specified as the
-    # method parameter. This method can return _nil_.
-    #
-    # name:: the name of the field
-    # Return:: an _Array_ of binary field values
-    def binaries(name)
-      binaries = []
-      @fields[name.to_s].each {|f| binaries << f.data if f.binary? }
-      return binaries
-    end
-
-    # Prints the fields of a document for human consumption.
-    def to_s()
-      return "Document{\n #{@fields.values.join("\n ")}\n}"
-    end
-  end
-end
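Putting the accessors above together, a sketch of the removed Ruby-level Document API (field names and values are illustrative):

    require 'ferret'
    include Ferret::Document

    doc = Document.new
    doc << Field.new("title", "Ferret in Action")
    doc << Field.new("author", "Dave")
    doc << Field.new("author", "Anna")     # repeated names are allowed

    doc["author"]        # => "Dave Anna"  (non-binary values joined with " ")
    doc.field_count      # => 2            (distinct field names)
    doc.entry_count      # => 3            (total entries)
    doc["title"] = "Ferret in Practice"    # updates the first "title" field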
--- data/lib/ferret/document/field.rb
+++ /dev/null
@@ -1,312 +0,0 @@
-module Ferret::Document
-  # A field is a section of a Document. Each field has two parts, a name
-  # and a value. Values may be free text, provided as a String or as a
-  # Reader, or they may be atomic keywords, which are not further processed.
-  # Such keywords may be used to represent dates, URLs, etc. Fields are
-  # optionally stored in the index, so that they may be returned with hits
-  # on the document.
-  class Field
-
-    # This value will be
-    # multiplied into the score of all hits on this field of this
-    # document.
-    #
-    # The boost is multiplied by Document#boost of the document
-    # containing this field. If a document has multiple fields with the same
-    # name, all such values are multiplied together. This product is then
-    # multiplied by the value Similarity#length_norm(String,int), and
-    # rounded by Similarity#encode_norm(float) before it is stored in the
-    # index. One should attempt to ensure that this product does not overflow
-    # the range of that encoding.
-    #
-    # See Document#set_boost(float)
-    # See Similarity#length_norm(String, int)
-    # See Similarity#encode_norm(float)
-    #
-    # Note: this value is not stored directly with the document in the index.
-    # Documents returned from IndexReader#document(int) and
-    # Hits#doc(int) may thus not have the same value present as when this field
-    # was indexed.
-    attr_accessor :boost, :data
-    attr_reader :name
-
-    # True iff the value of the field is to be stored in the index for
-    # return with search hits. It is an error for this to be true if a
-    # field is Reader-valued.
-    def stored?() return @stored end
-
-    # True iff the value of the field is to be indexed, so that it may be
-    # searched on.
-    def indexed?() return @indexed end
-
-    # True iff the value of the field should be tokenized as text prior to
-    # indexing. Un-tokenized fields are indexed as a single word and may
-    # not be Reader-valued.
-    def tokenized?() return @tokenized end
-
-    # True if the field is to be stored as a binary value. This can be used
-    # to store images or other binary data in the index if you wish.
-    def binary?() return @binary end
-
-    # True if you want to compress the data that you store. This is a good
-    # idea for really large text fields. The Ruby Zlib library is used to do
-    # the compression.
-    def compressed?() return @compressed end
-
-    # True iff the term or terms used to index this field are stored as a
-    # term vector, available from IndexReader#term_freq_vector(). These
-    # methods do not provide access to the original content of the field,
-    # only to terms used to index it. If the original content must be
-    # preserved, use the _stored_ attribute instead.
-    #
-    # See IndexReader#term_freq_vector()
-    def store_term_vector?() return @store_term_vector end
-
-    # True if the positions of the indexed terms in this field are stored.
-    def store_positions?() return @store_position end
-
-    # True if the offsets of this field are stored. The offsets are the
-    # positions of the start and end characters of the token in the whole
-    # field string.
-    def store_offsets?() return @store_offset end
-
-    # True if the norms are not stored for this field. No norms means that
-    # index-time boosting and field length normalization will be disabled.
-    # The benefit is less memory usage as norms take up one byte per indexed
-    # field for every document in the index.
-    def omit_norms?() return @omit_norms end
-
-    class Store < Ferret::Utils::Parameter
-      # Store the original field value in the index in a compressed form.
-      # This is useful for long documents and for binary valued fields.
-      COMPRESS = Store.new("COMPRESS")
-
-      # Store the original field value in the index. This is useful for
-      # short texts like a document's title which should be displayed with
-      # the results. The value is stored in its original form, i.e. no
-      # analyzer is used before it is stored.
-      YES = Store.new("YES")
-
-      # Do not store the field value in the index.
-      NO = Store.new("NO")
-    end
-
-    class Index < Ferret::Utils::Parameter
-      # Do not index the field value. This field can thus not be searched,
-      # but one can still access its contents provided it is stored (see
-      # Field::Store).
-      NO = Index.new("NO")
-
-      # Index the field's value so it can be searched. An Analyzer will be
-      # used to tokenize and possibly further normalize the text before its
-      # terms will be stored in the index. This is useful for common text.
-      TOKENIZED = Index.new("TOKENIZED")
-
-      # Index the field's value without using an Analyzer, so it can be
-      # searched. As no analyzer is used the value will be stored as a
-      # single term. This is useful for unique Ids like product numbers.
-      UNTOKENIZED = Index.new("UNTOKENIZED")
-
-      # Index the field's value without an Analyzer, and disable the storing
-      # of norms. No norms means that index-time boosting and field length
-      # normalization will be disabled. The benefit is less memory usage as
-      # norms take up one byte per indexed field for every document in the
-      # index.
-      NO_NORMS = Index.new("NO_NORMS")
-    end
-
-    class TermVector < Ferret::Utils::Parameter
-      # Do not store term vectors.
-      NO = TermVector.new("NO")
-
-      # Store the term vectors of each document. A term vector is a list of
-      # the document's terms and their number of occurrences in that
-      # document.
-      YES = TermVector.new("YES")
-
-      # Store the term vector + token position information
-      #
-      # See #YES
-      WITH_POSITIONS = TermVector.new("WITH_POSITIONS")
-
-      # Store the term vector + Token offset information
-      #
-      # See #YES
-      WITH_OFFSETS = TermVector.new("WITH_OFFSETS")
-
-      # Store the term vector + Token position and offset information
-      #
-      # See #YES, #WITH_POSITIONS and #WITH_OFFSETS
-      WITH_POSITIONS_OFFSETS = TermVector.new("WITH_POSITIONS_OFFSETS")
-    end
-
-    # Create a field by specifying its name, value and how it will
-    # be saved in the index.
-    #
-    # name:: The name of the field
-    # value:: The string to process
-    # store:: Whether _value_ should be stored in the index
-    # index:: Whether the field should be indexed, and if so, if it should
-    #         be tokenized before indexing
-    #
-    # store_term_vector:: Whether term vector should be stored. An ArgumentError
-    #                     is raised if the field is neither stored nor indexed,
-    #                     or if it is not indexed but term_vector is not _TermVector::NO_.
-    #
-    # binary:: Whether you want to store binary data in this field. Default is
-    #          false
-    # boost:: the boost for this field. Default is 1.0. A larger number makes
-    #         this field more important.
-    def initialize(name,
-                   value,
-                   store = Store::YES,
-                   index = Index::UNTOKENIZED,
-                   term_vector = TermVector::NO,
-                   binary = false,
-                   boost = 1.0)
-      if (index == Index::NO and store == Store::NO)
-        raise ArgumentError, "it doesn't make sense to have a field that " +
-          "is neither indexed nor stored"
-      end
-      if (index == Index::NO && term_vector != TermVector::NO)
-        raise ArgumentError, "cannot store term vector information for a " +
-          "field that is not indexed"
-      end
-
-      # The name of the field (e.g., "date", "subject", "title", or "body")
-      @name = name.to_s
-
-      # the one and only data object for all different kind of field values
-      @data = value
-      self.store = store
-      self.index = index
-      self.term_vector = term_vector
-      @binary = binary
-      @boost = boost
-    end
-
-    def store=(store)
-      case store
-      when Store::YES
-        @stored = true
-        @compressed = false
-      when Store::COMPRESS
-        @stored = true
-        @compressed = true
-      when Store::NO
-        @stored = false
-        @compressed = false
-      else
-        raise "unknown stored parameter " + store.to_s
-      end
-    end
-
-    def index=(index)
-      @omit_norms = false
-      case index
-      when Index::NO
-        @indexed = false
-        @tokenized = false
-      when Index::TOKENIZED
-        @indexed = true
-        @tokenized = true
-      when Index::UNTOKENIZED
-        @indexed = true
-        @tokenized = false
-      when Index::NO_NORMS
-        @indexed = true
-        @tokenized = false
-        @omit_norms = true
-      else
-        raise "unknown index parameter " + index.to_s
-      end
-    end
-
-    def term_vector=(term_vector)
-      case term_vector
-      when TermVector::NO
-        @store_term_vector = false
-        @store_position = false
-        @store_offset = false
-      when TermVector::YES
-        @store_term_vector = true
-        @store_position = false
-        @store_offset = false
-      when TermVector::WITH_POSITIONS
-        @store_term_vector = true
-        @store_position = true
-        @store_offset = false
-      when TermVector::WITH_OFFSETS
-        @store_term_vector = true
-        @store_position = false
-        @store_offset = true
-      when TermVector::WITH_POSITIONS_OFFSETS
-        @store_term_vector = true
-        @store_position = true
-        @store_offset = true
-      else
-        raise "unknown term_vector parameter " + term_vector.to_s
-      end
-    end
-
-    # Returns the string value of the data that is stored in this field
-    def string_value
-      if @data.instance_of? String
-        return @data
-      elsif @data.respond_to? :read
-        return @data.read()
-      else
-        # if it is a binary object try to return a string representation
-        return @data.to_s
-      end
-    end
-
-    # If the data is stored as a binary, just return it.
-    def binary_value
-      return @data
-    end
-
-    # Returns the reader value of the data that is stored in this field
-    def reader_value
-      if @data.respond_to? :read
-        return @data
-      elsif @data.instance_of? String
-        return Ferret::Utils::StringHelper::StringReader.new(@data)
-      else
-        # if it is a binary object try to return a string representation
-        return Ferret::Utils::StringHelper::StringReader.new(@data.to_s)
-      end
-    end
-
-    # Create a stored field with binary value. Optionally the value
-    # may be compressed. But it obviously won't be tokenized or
-    # term vectored or anything like that.
-    #
-    # name:: The name of the field
-    # value:: The binary value
-    # stored:: How _value_ should be stored (compressed or not).
-    def Field.new_binary_field(name, value, stored)
-      if (stored == Store::NO)
-        raise ArgumentError, "binary values can't be unstored"
-      end
-      Field.new(name, value, stored, Index::NO, TermVector::NO, true)
-    end
-
-    # Prints a Field for human consumption.
-    def to_s()
-      str = ""
-      if (@stored)
-        str << "stored"
-        str << (@compressed ? "/compressed," : "/uncompressed,")
-      end
-      str << "indexed," if (@indexed)
-      str << "tokenized," if (@tokenized)
-      str << "store_term_vector," if (@store_term_vector)
-      str << "store_offsets," if (@store_offset)
-      str << "store_positions," if (@store_position)
-      str << "omit_norms," if (@omit_norms)
-      str << "binary," if (@binary)
-      str << "<#{@name}:#{@binary ? '=bin_data=' : data}>"
-    end
-  end
-end
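The Store, Index and TermVector parameters above combine per field. A sketch of the three axes in use against the removed 0.9.x API (names and data are illustrative; per the constructor, binary values must be stored, and Store::NO together with Index::NO raises an ArgumentError):

    require 'ferret'
    include Ferret::Document

    # Analyzed body text with full term-vector information.
    body = Field.new("body", "the quick brown fox",
                     Field::Store::YES, Field::Index::TOKENIZED,
                     Field::TermVector::WITH_POSITIONS_OFFSETS)

    # Atomic keyword: indexed as a single term, no analyzer.
    id = Field.new("id", "prod-0042", Field::Store::YES,
                   Field::Index::UNTOKENIZED)

    # Binary payload, compressed; never tokenized or term-vectored.
    # ("thumb.png" is a stand-in for any binary file.)
    thumb = Field.new_binary_field("thumb", File.read("thumb.png"),
                                   Field::Store::COMPRESS)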