picky 4.6.5 → 4.6.6

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,22 @@
1
+ module Picky
2
+ module API
3
+ module Tokenizer
4
+
5
+ module Stemmer
6
+
7
+ def extract_stemmer thing
8
+ if thing.respond_to? :stem
9
+ thing
10
+ else
11
+ raise ArgumentError.new <<-ERROR
12
+ The stems_with option needs a stemmer,
13
+ which responds to #stem(text) and returns stemmed_text."
14
+ ERROR
15
+ end
16
+ end
17
+
18
+ end
19
+
20
+ end
21
+ end
22
+ end
data/lib/picky/loader.rb CHANGED
@@ -200,6 +200,7 @@ module Picky
200
200
  #
201
201
  def load_api
202
202
  load_relative 'api/tokenizer/character_substituter',
203
+ 'api/tokenizer/stemmer',
203
204
  'api/search/boost'
204
205
  end
205
206
 
@@ -8,6 +8,7 @@ module Picky
8
8
 
9
9
  extend Picky::Helpers::Identification
10
10
  include API::Tokenizer::CharacterSubstituter
11
+ include API::Tokenizer::Stemmer
11
12
 
12
13
  def self.default_indexing_with options = {}
13
14
  @indexing = from options
@@ -51,6 +52,7 @@ Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on
51
52
  Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
52
53
  Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
53
54
  Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
55
+ Stems? #{@stemmer ? "Yes, using #{@stemmer}." : '-' }
54
56
  Case sensitive? #{@case_sensitive ? "Yes." : "-"}
55
57
  TOKENIZER
56
58
  end
@@ -135,6 +137,15 @@ Case sensitive? #{@case_sensitive ? "Yes." : "-"}
135
137
  def substitute_characters text
136
138
  substituter?? substituter.substitute(text) : text
137
139
  end
140
+
141
+ # Sets the stemmer used to stem tokens; it must respond to #stem(text).
142
+ #
143
+ def stems_with stemmer
144
+ @stemmer = extract_stemmer stemmer
145
+ end
146
+ def stem text
147
+ stemmer?? stemmer.stem(text) : text
148
+ end
138
149
 
139
150
  # Reject tokens after tokenizing based on the given criteria.
140
151
  #
@@ -175,8 +186,9 @@ Case sensitive? #{@case_sensitive ? "Yes." : "-"}
175
186
  raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
176
187
  end
177
188
 
178
- attr_reader :substituter
189
+ attr_reader :substituter, :stemmer
179
190
  alias substituter? substituter
191
+ alias stemmer? stemmer
180
192
 
181
193
  def initialize options = {}
182
194
  options = default_options.merge options
@@ -196,6 +208,7 @@ A short overview:
196
208
  normalizes_words [[/replace (this)/, 'with this \\1'], ...]
197
209
  rejects_token_if Proc/lambda, default :blank?.to_proc
198
210
  substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String)
211
+ stems_with Instance responds to #stem(String)
199
212
  case_sensitive true/false
200
213
 
201
214
  ERROR
@@ -259,6 +272,7 @@ ERROR
259
272
  #
260
273
  def tokens_for words
261
274
  words.collect! { |word| word.downcase!; word } if downcase?
275
+ words.collect! { |word| stem word } if stemmer?
262
276
  words
263
277
  end
264
278
 
@@ -0,0 +1,74 @@
1
+ # encoding: utf-8
2
+ #
3
+ require 'spec_helper'
4
+
5
+ require 'stemmer'
6
+
7
+ describe 'stemming' do
8
+ let(:stemmer) {
9
+ # Fast stemmer does not conform to the API: it offers stem_word, but the API needs stem.
10
+ #
11
+ module Stemmer
12
+ class << self
13
+ alias_method :stem, :stem_word
14
+ end
15
+ end
16
+ Stemmer
17
+ }
18
+
19
+ describe 'examples' do
20
+ it 'works correctly' do
21
+ tokenizer = Picky::Tokenizer.new(stems_with: stemmer)
22
+
23
+ # Is this really correct? Shouldn't we split after normalizing?
24
+ #
25
+ # Yes – we split using more information.
26
+ #
27
+ tokenizer.stem('computers').should == 'comput'
28
+ tokenizer.stem('computing').should == 'comput'
29
+ tokenizer.stem('computed').should == 'comput'
30
+ tokenizer.stem('computer').should == 'comput'
31
+ end
32
+
33
+ # This tests the stems_with option.
34
+ #
35
+ it 'stems right' do
36
+ # Fix the Stemmer API.
37
+ #
38
+ module Stemmer
39
+ class << self
40
+ # stem_word is a bit silly, what else would you stem???
41
+ #
42
+ alias_method :stem, :stem_word
43
+ end
44
+ end
45
+
46
+ index = Picky::Index.new :stemming do
47
+ # Be aware that if "!" characters are not removed
48
+ # (e.g. from "Lemming!"), then stemming won't work.
49
+ #
50
+ indexing removes_characters: /[^a-z\s]/i,
51
+ stems_with: Stemmer
52
+ category :text
53
+ end
54
+
55
+ index.replace_from id: 1, text: "Hello good Sirs, these things here need stems to work!"
56
+ index.replace_from id: 2, text: "Stemming Lemming!"
57
+
58
+ try = Picky::Search.new index
59
+
60
+ # If you don't stem in the search, it should not be found!
61
+ #
62
+ try.search("text:stemming").ids.should == []
63
+
64
+ try = Picky::Search.new index do
65
+ searching stems_with: Stemmer
66
+ end
67
+
68
+ # With stemming in search AND indexing, it works :)
69
+ #
70
+ try.search("text:stemming").ids.should == [2, 1]
71
+ try.search("text:lem").ids.should == [2]
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,34 @@
1
+ require 'spec_helper'
2
+
3
+ describe Picky::API::Tokenizer do
4
+ let(:object) do
5
+ Class.new do
6
+ include Picky::API::Tokenizer::Stemmer
7
+ end.new
8
+ end
9
+ context 'extract_character_substituter' do
10
+ context 'with a substituter' do
11
+ let(:stemmer) do
12
+ Class.new do
13
+ def stem text
14
+ text.gsub /computers/, 'comput' # a simple one word stemmer ;)
15
+ end
16
+ end.new
17
+ end
18
+ it 'creates a tokenizer' do
19
+ object.extract_stemmer(stemmer).
20
+ stem("computers").should == 'comput'
21
+ end
22
+ end
23
+ context 'invalid tokenizer' do
24
+ it 'raises with a nice error message' do
25
+ expect {
26
+ object.extract_stemmer Object.new
27
+ }.to raise_error(<<-ERROR)
28
+ The stems_with option needs a stemmer,
29
+ which responds to #stem(text) and returns stemmed_text."
30
+ ERROR
31
+ end
32
+ end
33
+ end
34
+ end
@@ -18,6 +18,7 @@ A short overview:
18
18
  normalizes_words [[/replace (this)/, 'with this \\1'], ...]
19
19
  rejects_token_if Proc/lambda, default :blank?.to_proc
20
20
  substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String)
21
+ stems_with Instance responds to #stem(String)
21
22
  case_sensitive true/false
22
23
 
23
24
  MESSAGE
@@ -39,8 +40,9 @@ Removes characters: -
39
40
  Stopwords: -
40
41
  Splits text on: /\\s/
41
42
  Normalizes words: -
42
- Rejects tokens? Yes, see line 28 in app/application.rb
43
+ Rejects tokens? Yes, see line 29 in app/application.rb
43
44
  Substitutes chars? -
45
+ Stems? -
44
46
  Case sensitive? Yes.
45
47
  EXPECTED
46
48
  end
@@ -59,6 +61,7 @@ Splits text on: /\\s/
59
61
  Normalizes words: -
60
62
  Rejects tokens? -
61
63
  Substitutes chars? -
64
+ Stems? -
62
65
  Case sensitive? -
63
66
  EXPECTED
64
67
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: picky
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.6.5
4
+ version: 4.6.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-24 00:00:00.000000000 Z
12
+ date: 2012-10-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -34,7 +34,7 @@ dependencies:
34
34
  requirements:
35
35
  - - ~>
36
36
  - !ruby/object:Gem::Version
37
- version: 4.6.5
37
+ version: 4.6.6
38
38
  type: :development
39
39
  prerelease: false
40
40
  version_requirements: !ruby/object:Gem::Requirement
@@ -42,7 +42,7 @@ dependencies:
42
42
  requirements:
43
43
  - - ~>
44
44
  - !ruby/object:Gem::Version
45
- version: 4.6.5
45
+ version: 4.6.6
46
46
  - !ruby/object:Gem::Dependency
47
47
  name: text
48
48
  requirement: !ruby/object:Gem::Requirement
@@ -138,6 +138,7 @@ files:
138
138
  - lib/picky/analyzer.rb
139
139
  - lib/picky/api/search/boost.rb
140
140
  - lib/picky/api/tokenizer/character_substituter.rb
141
+ - lib/picky/api/tokenizer/stemmer.rb
141
142
  - lib/picky/backends/backend.rb
142
143
  - lib/picky/backends/file/basic.rb
143
144
  - lib/picky/backends/file/json.rb
@@ -300,6 +301,7 @@ files:
300
301
  - spec/functional/regression_spec.rb
301
302
  - spec/functional/remap_qualifiers_spec.rb
302
303
  - spec/functional/speed_spec.rb
304
+ - spec/functional/stemming_spec.rb
303
305
  - spec/functional/terminate_early_spec.rb
304
306
  - spec/functional/tokenizer_spec.rb
305
307
  - spec/functional/unique_ids_search_spec.rb
@@ -308,6 +310,7 @@ files:
308
310
  - spec/lib/analyzer_spec.rb
309
311
  - spec/lib/api/search/boost_spec.rb
310
312
  - spec/lib/api/tokenizer/character_substituter_spec.rb
313
+ - spec/lib/api/tokenizer/stemmer_spec.rb
311
314
  - spec/lib/backends/backend_spec.rb
312
315
  - spec/lib/backends/file/basic_spec.rb
313
316
  - spec/lib/backends/file_spec.rb
@@ -457,6 +460,7 @@ test_files:
457
460
  - spec/functional/regression_spec.rb
458
461
  - spec/functional/remap_qualifiers_spec.rb
459
462
  - spec/functional/speed_spec.rb
463
+ - spec/functional/stemming_spec.rb
460
464
  - spec/functional/terminate_early_spec.rb
461
465
  - spec/functional/tokenizer_spec.rb
462
466
  - spec/functional/unique_ids_search_spec.rb
@@ -465,6 +469,7 @@ test_files:
465
469
  - spec/lib/analyzer_spec.rb
466
470
  - spec/lib/api/search/boost_spec.rb
467
471
  - spec/lib/api/tokenizer/character_substituter_spec.rb
472
+ - spec/lib/api/tokenizer/stemmer_spec.rb
468
473
  - spec/lib/backends/backend_spec.rb
469
474
  - spec/lib/backends/file/basic_spec.rb
470
475
  - spec/lib/backends/file_spec.rb