picky 4.6.5 → 4.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+ module Picky
2
+ module API
3
+ module Tokenizer
4
+
5
+ module Stemmer
6
+
7
+ def extract_stemmer thing
8
+ if thing.respond_to? :stem
9
+ thing
10
+ else
11
+ raise ArgumentError.new <<-ERROR
12
+ The stems_with option needs a stemmer,
13
+ which responds to #stem(text) and returns stemmed_text."
14
+ ERROR
15
+ end
16
+ end
17
+
18
+ end
19
+
20
+ end
21
+ end
22
+ end
data/lib/picky/loader.rb CHANGED
@@ -200,6 +200,7 @@ module Picky
200
200
  #
201
201
  def load_api
202
202
  load_relative 'api/tokenizer/character_substituter',
203
+ 'api/tokenizer/stemmer',
203
204
  'api/search/boost'
204
205
  end
205
206
 
@@ -8,6 +8,7 @@ module Picky
8
8
 
9
9
  extend Picky::Helpers::Identification
10
10
  include API::Tokenizer::CharacterSubstituter
11
+ include API::Tokenizer::Stemmer
11
12
 
12
13
  def self.default_indexing_with options = {}
13
14
  @indexing = from options
@@ -51,6 +52,7 @@ Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on
51
52
  Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
52
53
  Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
53
54
  Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
55
+ Stems? #{@stemmer ? "Yes, using #{@stemmer}." : '-' }
54
56
  Case sensitive? #{@case_sensitive ? "Yes." : "-"}
55
57
  TOKENIZER
56
58
  end
@@ -135,6 +137,15 @@ Case sensitive? #{@case_sensitive ? "Yes." : "-"}
135
137
  def substitute_characters text
136
138
  substituter?? substituter.substitute(text) : text
137
139
  end
140
+
141
+ # Stems tokens with this stemmer.
142
+ #
143
+ def stems_with stemmer
144
+ @stemmer = extract_stemmer stemmer
145
+ end
146
+ def stem text
147
+ stemmer?? stemmer.stem(text) : text
148
+ end
138
149
 
139
150
  # Reject tokens after tokenizing based on the given criteria.
140
151
  #
@@ -175,8 +186,9 @@ Case sensitive? #{@case_sensitive ? "Yes." : "-"}
175
186
  raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
176
187
  end
177
188
 
178
- attr_reader :substituter
189
+ attr_reader :substituter, :stemmer
179
190
  alias substituter? substituter
191
+ alias stemmer? stemmer
180
192
 
181
193
  def initialize options = {}
182
194
  options = default_options.merge options
@@ -196,6 +208,7 @@ A short overview:
196
208
  normalizes_words [[/replace (this)/, 'with this \\1'], ...]
197
209
  rejects_token_if Proc/lambda, default :blank?.to_proc
198
210
  substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String)
211
+ stems_with Instance responds to #stem(String)
199
212
  case_sensitive true/false
200
213
 
201
214
  ERROR
@@ -259,6 +272,7 @@ ERROR
259
272
  #
260
273
  def tokens_for words
261
274
  words.collect! { |word| word.downcase!; word } if downcase?
275
+ words.collect! { |word| stem word } if stemmer?
262
276
  words
263
277
  end
264
278
 
@@ -0,0 +1,74 @@
1
+ # encoding: utf-8
2
+ #
3
+ require 'spec_helper'
4
+
5
+ require 'stemmer'
6
+
7
+ describe 'stemming' do
8
+ let(:stemmer) {
9
+ # Fast stemmer does not conform with the API.
10
+ #
11
+ module Stemmer
12
+ class << self
13
+ alias_method :stem, :stem_word
14
+ end
15
+ end
16
+ Stemmer
17
+ }
18
+
19
+ describe 'examples' do
20
+ it 'works correctly' do
21
+ tokenizer = Picky::Tokenizer.new(stems_with: stemmer)
22
+
23
+ # Different inflections of the same word should all
24
+ #
25
+ # be reduced to one common stem by the tokenizer.
26
+ #
27
+ tokenizer.stem('computers').should == 'comput'
28
+ tokenizer.stem('computing').should == 'comput'
29
+ tokenizer.stem('computed').should == 'comput'
30
+ tokenizer.stem('computer').should == 'comput'
31
+ end
32
+
33
+ # This tests the stems_with option in both indexing and searching.
34
+ #
35
+ it 'stems right' do
36
+ # Fix the Stemmer API.
37
+ #
38
+ module Stemmer
39
+ class << self
40
+ # stem_word is a bit silly, what else would you stem???
41
+ #
42
+ alias_method :stem, :stem_word
43
+ end
44
+ end
45
+
46
+ index = Picky::Index.new :stemming do
47
+ # Be aware that if !s are not removed from
48
+ # eg. Lemming!, then stemming won't work.
49
+ #
50
+ indexing removes_characters: /[^a-z\s]/i,
51
+ stems_with: Stemmer
52
+ category :text
53
+ end
54
+
55
+ index.replace_from id: 1, text: "Hello good Sirs, these things here need stems to work!"
56
+ index.replace_from id: 2, text: "Stemming Lemming!"
57
+
58
+ try = Picky::Search.new index
59
+
60
+ # If you don't stem in the search, it should not be found!
61
+ #
62
+ try.search("text:stemming").ids.should == []
63
+
64
+ try = Picky::Search.new index do
65
+ searching stems_with: Stemmer
66
+ end
67
+
68
+ # With stemming in search AND indexing, it works :)
69
+ #
70
+ try.search("text:stemming").ids.should == [2, 1]
71
+ try.search("text:lem").ids.should == [2]
72
+ end
73
+ end
74
+ end
@@ -0,0 +1,34 @@
1
+ require 'spec_helper'
2
+
3
+ describe Picky::API::Tokenizer do
4
+ let(:object) do
5
+ Class.new do
6
+ include Picky::API::Tokenizer::Stemmer
7
+ end.new
8
+ end
9
+ context 'extract_character_substituter' do
10
+ context 'with a substituter' do
11
+ let(:stemmer) do
12
+ Class.new do
13
+ def stem text
14
+ text.gsub /computers/, 'comput' # a simple one word stemmer ;)
15
+ end
16
+ end.new
17
+ end
18
+ it 'creates a tokenizer' do
19
+ object.extract_stemmer(stemmer).
20
+ stem("computers").should == 'comput'
21
+ end
22
+ end
23
+ context 'invalid tokenizer' do
24
+ it 'raises with a nice error message' do
25
+ expect {
26
+ object.extract_stemmer Object.new
27
+ }.to raise_error(<<-ERROR)
28
+ The stems_with option needs a stemmer,
29
+ which responds to #stem(text) and returns stemmed_text."
30
+ ERROR
31
+ end
32
+ end
33
+ end
34
+ end
@@ -18,6 +18,7 @@ A short overview:
18
18
  normalizes_words [[/replace (this)/, 'with this \\1'], ...]
19
19
  rejects_token_if Proc/lambda, default :blank?.to_proc
20
20
  substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String)
21
+ stems_with Instance responds to #stem(String)
21
22
  case_sensitive true/false
22
23
 
23
24
  MESSAGE
@@ -39,8 +40,9 @@ Removes characters: -
39
40
  Stopwords: -
40
41
  Splits text on: /\\s/
41
42
  Normalizes words: -
42
- Rejects tokens? Yes, see line 28 in app/application.rb
43
+ Rejects tokens? Yes, see line 29 in app/application.rb
43
44
  Substitutes chars? -
45
+ Stems? -
44
46
  Case sensitive? Yes.
45
47
  EXPECTED
46
48
  end
@@ -59,6 +61,7 @@ Splits text on: /\\s/
59
61
  Normalizes words: -
60
62
  Rejects tokens? -
61
63
  Substitutes chars? -
64
+ Stems? -
62
65
  Case sensitive? -
63
66
  EXPECTED
64
67
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: picky
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.6.5
4
+ version: 4.6.6
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-09-24 00:00:00.000000000 Z
12
+ date: 2012-10-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rspec
@@ -34,7 +34,7 @@ dependencies:
34
34
  requirements:
35
35
  - - ~>
36
36
  - !ruby/object:Gem::Version
37
- version: 4.6.5
37
+ version: 4.6.6
38
38
  type: :development
39
39
  prerelease: false
40
40
  version_requirements: !ruby/object:Gem::Requirement
@@ -42,7 +42,7 @@ dependencies:
42
42
  requirements:
43
43
  - - ~>
44
44
  - !ruby/object:Gem::Version
45
- version: 4.6.5
45
+ version: 4.6.6
46
46
  - !ruby/object:Gem::Dependency
47
47
  name: text
48
48
  requirement: !ruby/object:Gem::Requirement
@@ -138,6 +138,7 @@ files:
138
138
  - lib/picky/analyzer.rb
139
139
  - lib/picky/api/search/boost.rb
140
140
  - lib/picky/api/tokenizer/character_substituter.rb
141
+ - lib/picky/api/tokenizer/stemmer.rb
141
142
  - lib/picky/backends/backend.rb
142
143
  - lib/picky/backends/file/basic.rb
143
144
  - lib/picky/backends/file/json.rb
@@ -300,6 +301,7 @@ files:
300
301
  - spec/functional/regression_spec.rb
301
302
  - spec/functional/remap_qualifiers_spec.rb
302
303
  - spec/functional/speed_spec.rb
304
+ - spec/functional/stemming_spec.rb
303
305
  - spec/functional/terminate_early_spec.rb
304
306
  - spec/functional/tokenizer_spec.rb
305
307
  - spec/functional/unique_ids_search_spec.rb
@@ -308,6 +310,7 @@ files:
308
310
  - spec/lib/analyzer_spec.rb
309
311
  - spec/lib/api/search/boost_spec.rb
310
312
  - spec/lib/api/tokenizer/character_substituter_spec.rb
313
+ - spec/lib/api/tokenizer/stemmer_spec.rb
311
314
  - spec/lib/backends/backend_spec.rb
312
315
  - spec/lib/backends/file/basic_spec.rb
313
316
  - spec/lib/backends/file_spec.rb
@@ -457,6 +460,7 @@ test_files:
457
460
  - spec/functional/regression_spec.rb
458
461
  - spec/functional/remap_qualifiers_spec.rb
459
462
  - spec/functional/speed_spec.rb
463
+ - spec/functional/stemming_spec.rb
460
464
  - spec/functional/terminate_early_spec.rb
461
465
  - spec/functional/tokenizer_spec.rb
462
466
  - spec/functional/unique_ids_search_spec.rb
@@ -465,6 +469,7 @@ test_files:
465
469
  - spec/lib/analyzer_spec.rb
466
470
  - spec/lib/api/search/boost_spec.rb
467
471
  - spec/lib/api/tokenizer/character_substituter_spec.rb
472
+ - spec/lib/api/tokenizer/stemmer_spec.rb
468
473
  - spec/lib/backends/backend_spec.rb
469
474
  - spec/lib/backends/file/basic_spec.rb
470
475
  - spec/lib/backends/file_spec.rb