RubyGems - picky - Versions diffs - 4.6.5 → 4.6.6 - Mend

picky 4.6.5 → 4.6.6

Files changed (7) hide show

data/lib/picky/api/tokenizer/stemmer.rb +22 -0
data/lib/picky/loader.rb +1 -0
data/lib/picky/tokenizer.rb +15 -1
data/spec/functional/stemming_spec.rb +74 -0
data/spec/lib/api/tokenizer/stemmer_spec.rb +34 -0
data/spec/lib/tokenizer_spec.rb +4 -1
metadata +9 -4

data/lib/picky/api/tokenizer/stemmer.rb ADDED Viewed

@@ -0,0 +1,22 @@
+module Picky
+  module API
+    module Tokenizer
+      module Stemmer
+        def extract_stemmer thing
+          if thing.respond_to? :stem
+            thing
+          else
+            raise ArgumentError.new <<-ERROR
+The stems_with option needs a stemmer,
+which responds to #stem(text) and returns stemmed_text."
+ERROR
+          end
+        end
+      end
+    end
+  end
+end

data/lib/picky/loader.rb CHANGED Viewed

@@ -200,6 +200,7 @@ module Picky
       #
       def load_api
         load_relative 'api/tokenizer/character_substituter',
+                      'api/tokenizer/stemmer',
                       'api/search/boost'
       end

data/lib/picky/tokenizer.rb CHANGED Viewed

@@ -8,6 +8,7 @@ module Picky
     extend Picky::Helpers::Identification
     include API::Tokenizer::CharacterSubstituter
+    include API::Tokenizer::Stemmer
     def self.default_indexing_with options = {}
       @indexing = from options
@@ -51,6 +52,7 @@ Splits text on:     #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on
 Normalizes words:   #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
 Rejects tokens?     #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
 Substitutes chars?  #{@substituter ? "Yes, using #{@substituter}." : '-' }
+Stems?              #{@stemmer ? "Yes, using #{@stemmer}." : '-' }
 Case sensitive?     #{@case_sensitive ? "Yes." : "-"}
       TOKENIZER
     end
@@ -135,6 +137,15 @@ Case sensitive?     #{@case_sensitive ? "Yes." : "-"}
     def substitute_characters text
       substituter?? substituter.substitute(text) : text
     end
+    # Stems tokens with this stemmer.
+    #
+    def stems_with stemmer
+      @stemmer = extract_stemmer stemmer
+    end
+    def stem text
+      stemmer?? stemmer.stem(text) : text
+    end
     # Reject tokens after tokenizing based on the given criteria.
     #
@@ -175,8 +186,9 @@ Case sensitive?     #{@case_sensitive ? "Yes." : "-"}
       raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
     end
-    attr_reader :substituter
+    attr_reader :substituter, :stemmer
     alias substituter? substituter
+    alias stemmer? stemmer
     def initialize options = {}
       options = default_options.merge options
@@ -196,6 +208,7 @@ A short overview:
   normalizes_words            [[/replace (this)/, 'with this \\1'], ...]
   rejects_token_if            Proc/lambda, default :blank?.to_proc
   substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String)
+  stems_with                  Instance responds to #stem(String)
   case_sensitive              true/false
 ERROR
@@ -259,6 +272,7 @@ ERROR
     #
     def tokens_for words
       words.collect! { |word| word.downcase!; word } if downcase?
+      words.collect! { |word| stem word } if stemmer?
       words
     end

data/spec/functional/stemming_spec.rb ADDED Viewed

@@ -0,0 +1,74 @@
+# encoding: utf-8
+#
+require 'spec_helper'
+require 'stemmer'
+describe 'stemming' do
+  let(:stemmer) {
+    # Fast stemmer does not conform to the API: it provides stem_word, not stem.
+    #
+    module Stemmer
+      class << self
+        alias_method :stem, :stem_word
+      end
+    end
+    Stemmer
+  }
+  describe 'examples' do
+    it 'works correctly' do
+      tokenizer = Picky::Tokenizer.new(stems_with: stemmer)
+      # Is this really correct? Shouldn't we split after normalizing?
+      #
+      # Yes – we split using more information.
+      #
+      tokenizer.stem('computers').should == 'comput'
+      tokenizer.stem('computing').should == 'comput'
+      tokenizer.stem('computed').should  == 'comput'
+      tokenizer.stem('computer').should  == 'comput'
+    end
+    # This tests stemming in both indexing and searching.
+    #
+    it 'stems right' do
+      # Fix the Stemmer API.
+      #
+      module Stemmer
+        class << self
+          # stem_word is a bit silly, what else would you stem???
+          #
+          alias_method :stem, :stem_word
+        end
+      end
+      index = Picky::Index.new :stemming do
+        # Be aware that if exclamation marks are not removed
+        # from e.g. "Lemming!", then stemming won't work.
+        #
+        indexing removes_characters: /[^a-z\s]/i,
+                 stems_with: Stemmer
+        category :text
+      end
+      index.replace_from id: 1, text: "Hello good Sirs, these things here need stems to work!"
+      index.replace_from id: 2, text: "Stemming Lemming!"
+      try = Picky::Search.new index
+      # If you don't stem in the search, it should not be found!
+      #
+      try.search("text:stemming").ids.should == []
+      try = Picky::Search.new index do
+        searching stems_with: Stemmer
+      end
+      # With stemming in search AND indexing, it works :)
+      #
+      try.search("text:stemming").ids.should == [2, 1]
+      try.search("text:lem").ids.should == [2]
+    end
+  end
+end

data/spec/lib/api/tokenizer/stemmer_spec.rb ADDED Viewed

@@ -0,0 +1,34 @@
+require 'spec_helper'
+describe Picky::API::Tokenizer do
+  let(:object) do
+    Class.new do
+      include Picky::API::Tokenizer::Stemmer
+    end.new
+  end
+  context 'extract_character_substituter' do
+    context 'with a substituter' do
+      let(:stemmer) do
+        Class.new do
+          def stem text
+            text.gsub /computers/, 'comput' # a simple one word stemmer ;)
+          end
+        end.new
+      end
+      it 'creates a tokenizer' do
+        object.extract_stemmer(stemmer).
+          stem("computers").should == 'comput'
+      end
+    end
+    context 'invalid tokenizer' do
+      it 'raises with a nice error message' do
+        expect {
+          object.extract_stemmer Object.new
+        }.to raise_error(<<-ERROR)
+The stems_with option needs a stemmer,
+which responds to #stem(text) and returns stemmed_text."
+ERROR
+      end
+    end
+  end
+end

data/spec/lib/tokenizer_spec.rb CHANGED Viewed

@@ -18,6 +18,7 @@ A short overview:
   normalizes_words            [[/replace (this)/, 'with this \\1'], ...]
   rejects_token_if            Proc/lambda, default :blank?.to_proc
   substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String)
+  stems_with                  Instance responds to #stem(String)
   case_sensitive              true/false
 MESSAGE
@@ -39,8 +40,9 @@ Removes characters: -
 Stopwords:          -
 Splits text on:     /\\s/
 Normalizes words:   -
-Rejects tokens?     Yes, see line 28 in app/application.rb
+Rejects tokens?     Yes, see line 29 in app/application.rb
 Substitutes chars?  -
+Stems?              -
 Case sensitive?     Yes.
 EXPECTED
       end
@@ -59,6 +61,7 @@ Splits text on:     /\\s/
 Normalizes words:   -
 Rejects tokens?     -
 Substitutes chars?  -
+Stems?              -
 Case sensitive?     -
 EXPECTED
           end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: picky
 version: !ruby/object:Gem::Version
-  version: 4.6.5
+  version: 4.6.6
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-09-24 00:00:00.000000000 Z
+date: 2012-10-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec
@@ -34,7 +34,7 @@ dependencies:
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: 4.6.5
+        version: 4.6.6
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
@@ -42,7 +42,7 @@ dependencies:
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: 4.6.5
+        version: 4.6.6
 - !ruby/object:Gem::Dependency
   name: text
   requirement: !ruby/object:Gem::Requirement
@@ -138,6 +138,7 @@ files:
 - lib/picky/analyzer.rb
 - lib/picky/api/search/boost.rb
 - lib/picky/api/tokenizer/character_substituter.rb
+- lib/picky/api/tokenizer/stemmer.rb
 - lib/picky/backends/backend.rb
 - lib/picky/backends/file/basic.rb
 - lib/picky/backends/file/json.rb
@@ -300,6 +301,7 @@ files:
 - spec/functional/regression_spec.rb
 - spec/functional/remap_qualifiers_spec.rb
 - spec/functional/speed_spec.rb
+- spec/functional/stemming_spec.rb
 - spec/functional/terminate_early_spec.rb
 - spec/functional/tokenizer_spec.rb
 - spec/functional/unique_ids_search_spec.rb
@@ -308,6 +310,7 @@ files:
 - spec/lib/analyzer_spec.rb
 - spec/lib/api/search/boost_spec.rb
 - spec/lib/api/tokenizer/character_substituter_spec.rb
+- spec/lib/api/tokenizer/stemmer_spec.rb
 - spec/lib/backends/backend_spec.rb
 - spec/lib/backends/file/basic_spec.rb
 - spec/lib/backends/file_spec.rb
@@ -457,6 +460,7 @@ test_files:
 - spec/functional/regression_spec.rb
 - spec/functional/remap_qualifiers_spec.rb
 - spec/functional/speed_spec.rb
+- spec/functional/stemming_spec.rb
 - spec/functional/terminate_early_spec.rb
 - spec/functional/tokenizer_spec.rb
 - spec/functional/unique_ids_search_spec.rb
@@ -465,6 +469,7 @@ test_files:
 - spec/lib/analyzer_spec.rb
 - spec/lib/api/search/boost_spec.rb
 - spec/lib/api/tokenizer/character_substituter_spec.rb
+- spec/lib/api/tokenizer/stemmer_spec.rb
 - spec/lib/backends/backend_spec.rb
 - spec/lib/backends/file/basic_spec.rb
 - spec/lib/backends/file_spec.rb