picky 4.6.5 → 4.6.6
Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,22 @@
|
|
1
|
+
module Picky
|
2
|
+
module API
|
3
|
+
module Tokenizer
|
4
|
+
|
5
|
+
module Stemmer
|
6
|
+
|
7
|
+
def extract_stemmer thing
|
8
|
+
if thing.respond_to? :stem
|
9
|
+
thing
|
10
|
+
else
|
11
|
+
raise ArgumentError.new <<-ERROR
|
12
|
+
The stems_with option needs a stemmer,
|
13
|
+
which responds to #stem(text) and returns stemmed_text."
|
14
|
+
ERROR
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/picky/loader.rb
CHANGED
data/lib/picky/tokenizer.rb
CHANGED
@@ -8,6 +8,7 @@ module Picky
|
|
8
8
|
|
9
9
|
extend Picky::Helpers::Identification
|
10
10
|
include API::Tokenizer::CharacterSubstituter
|
11
|
+
include API::Tokenizer::Stemmer
|
11
12
|
|
12
13
|
def self.default_indexing_with options = {}
|
13
14
|
@indexing = from options
|
@@ -51,6 +52,7 @@ Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on
|
|
51
52
|
Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
|
52
53
|
Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
|
53
54
|
Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
|
55
|
+
Stems? #{@stemmer ? "Yes, using #{@stemmer}." : '-' }
|
54
56
|
Case sensitive? #{@case_sensitive ? "Yes." : "-"}
|
55
57
|
TOKENIZER
|
56
58
|
end
|
@@ -135,6 +137,15 @@ Case sensitive? #{@case_sensitive ? "Yes." : "-"}
|
|
135
137
|
def substitute_characters text
|
136
138
|
substituter?? substituter.substitute(text) : text
|
137
139
|
end
|
140
|
+
|
141
|
+
# Stems tokens with this stemmer.
|
142
|
+
#
|
143
|
+
def stems_with stemmer
|
144
|
+
@stemmer = extract_stemmer stemmer
|
145
|
+
end
|
146
|
+
def stem text
|
147
|
+
stemmer?? stemmer.stem(text) : text
|
148
|
+
end
|
138
149
|
|
139
150
|
# Reject tokens after tokenizing based on the given criteria.
|
140
151
|
#
|
@@ -175,8 +186,9 @@ Case sensitive? #{@case_sensitive ? "Yes." : "-"}
|
|
175
186
|
raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
|
176
187
|
end
|
177
188
|
|
178
|
-
attr_reader :substituter
|
189
|
+
attr_reader :substituter, :stemmer
|
179
190
|
alias substituter? substituter
|
191
|
+
alias stemmer? stemmer
|
180
192
|
|
181
193
|
def initialize options = {}
|
182
194
|
options = default_options.merge options
|
@@ -196,6 +208,7 @@ A short overview:
|
|
196
208
|
normalizes_words [[/replace (this)/, 'with this \\1'], ...]
|
197
209
|
rejects_token_if Proc/lambda, default :blank?.to_proc
|
198
210
|
substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String)
|
211
|
+
stems_with Instance responds to #stem(String)
|
199
212
|
case_sensitive true/false
|
200
213
|
|
201
214
|
ERROR
|
@@ -259,6 +272,7 @@ ERROR
|
|
259
272
|
#
|
260
273
|
def tokens_for words
|
261
274
|
words.collect! { |word| word.downcase!; word } if downcase?
|
275
|
+
words.collect! { |word| stem word } if stemmer?
|
262
276
|
words
|
263
277
|
end
|
264
278
|
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
require 'stemmer'
|
6
|
+
|
7
|
+
describe 'stemming' do
|
8
|
+
let(:stemmer) {
|
9
|
+
# Fast stemmer does not conform with the API.
|
10
|
+
#
|
11
|
+
module Stemmer
|
12
|
+
class << self
|
13
|
+
alias_method :stem, :stem_word
|
14
|
+
end
|
15
|
+
end
|
16
|
+
Stemmer
|
17
|
+
}
|
18
|
+
|
19
|
+
describe 'examples' do
|
20
|
+
it 'works correctly' do
|
21
|
+
tokenizer = Picky::Tokenizer.new(stems_with: stemmer)
|
22
|
+
|
23
|
+
# Is this really correct? Shouldn't we split after normalizing?
|
24
|
+
#
|
25
|
+
# Yes – we split using more information.
|
26
|
+
#
|
27
|
+
tokenizer.stem('computers').should == 'comput'
|
28
|
+
tokenizer.stem('computing').should == 'comput'
|
29
|
+
tokenizer.stem('computed').should == 'comput'
|
30
|
+
tokenizer.stem('computer').should == 'comput'
|
31
|
+
end
|
32
|
+
|
33
|
+
# This tests the stems_with option (for both indexing and searching).
|
34
|
+
#
|
35
|
+
it 'stems right' do
|
36
|
+
# Fix the Stemmer API.
|
37
|
+
#
|
38
|
+
module Stemmer
|
39
|
+
class << self
|
40
|
+
# stem_word is a bit silly, what else would you stem???
|
41
|
+
#
|
42
|
+
alias_method :stem, :stem_word
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
index = Picky::Index.new :stemming do
|
47
|
+
# Be aware that if exclamation marks are not removed
|
48
|
+
# (e.g. from "Lemming!"), then stemming won't work.
|
49
|
+
#
|
50
|
+
indexing removes_characters: /[^a-z\s]/i,
|
51
|
+
stems_with: Stemmer
|
52
|
+
category :text
|
53
|
+
end
|
54
|
+
|
55
|
+
index.replace_from id: 1, text: "Hello good Sirs, these things here need stems to work!"
|
56
|
+
index.replace_from id: 2, text: "Stemming Lemming!"
|
57
|
+
|
58
|
+
try = Picky::Search.new index
|
59
|
+
|
60
|
+
# If you don't stem in the search, it should not be found!
|
61
|
+
#
|
62
|
+
try.search("text:stemming").ids.should == []
|
63
|
+
|
64
|
+
try = Picky::Search.new index do
|
65
|
+
searching stems_with: Stemmer
|
66
|
+
end
|
67
|
+
|
68
|
+
# With stemming in search AND indexing, it works :)
|
69
|
+
#
|
70
|
+
try.search("text:stemming").ids.should == [2, 1]
|
71
|
+
try.search("text:lem").ids.should == [2]
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Picky::API::Tokenizer do
|
4
|
+
let(:object) do
|
5
|
+
Class.new do
|
6
|
+
include Picky::API::Tokenizer::Stemmer
|
7
|
+
end.new
|
8
|
+
end
|
9
|
+
context 'extract_character_substituter' do
|
10
|
+
context 'with a substituter' do
|
11
|
+
let(:stemmer) do
|
12
|
+
Class.new do
|
13
|
+
def stem text
|
14
|
+
text.gsub /computers/, 'comput' # a simple one word stemmer ;)
|
15
|
+
end
|
16
|
+
end.new
|
17
|
+
end
|
18
|
+
it 'creates a tokenizer' do
|
19
|
+
object.extract_stemmer(stemmer).
|
20
|
+
stem("computers").should == 'comput'
|
21
|
+
end
|
22
|
+
end
|
23
|
+
context 'invalid tokenizer' do
|
24
|
+
it 'raises with a nice error message' do
|
25
|
+
expect {
|
26
|
+
object.extract_stemmer Object.new
|
27
|
+
}.to raise_error(<<-ERROR)
|
28
|
+
The stems_with option needs a stemmer,
|
29
|
+
which responds to #stem(text) and returns stemmed_text."
|
30
|
+
ERROR
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
data/spec/lib/tokenizer_spec.rb
CHANGED
@@ -18,6 +18,7 @@ A short overview:
|
|
18
18
|
normalizes_words [[/replace (this)/, 'with this \\1'], ...]
|
19
19
|
rejects_token_if Proc/lambda, default :blank?.to_proc
|
20
20
|
substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String)
|
21
|
+
stems_with Instance responds to #stem(String)
|
21
22
|
case_sensitive true/false
|
22
23
|
|
23
24
|
MESSAGE
|
@@ -39,8 +40,9 @@ Removes characters: -
|
|
39
40
|
Stopwords: -
|
40
41
|
Splits text on: /\\s/
|
41
42
|
Normalizes words: -
|
42
|
-
Rejects tokens? Yes, see line
|
43
|
+
Rejects tokens? Yes, see line 29 in app/application.rb
|
43
44
|
Substitutes chars? -
|
45
|
+
Stems? -
|
44
46
|
Case sensitive? Yes.
|
45
47
|
EXPECTED
|
46
48
|
end
|
@@ -59,6 +61,7 @@ Splits text on: /\\s/
|
|
59
61
|
Normalizes words: -
|
60
62
|
Rejects tokens? -
|
61
63
|
Substitutes chars? -
|
64
|
+
Stems? -
|
62
65
|
Case sensitive? -
|
63
66
|
EXPECTED
|
64
67
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: picky
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.6.
|
4
|
+
version: 4.6.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -34,7 +34,7 @@ dependencies:
|
|
34
34
|
requirements:
|
35
35
|
- - ~>
|
36
36
|
- !ruby/object:Gem::Version
|
37
|
-
version: 4.6.
|
37
|
+
version: 4.6.6
|
38
38
|
type: :development
|
39
39
|
prerelease: false
|
40
40
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -42,7 +42,7 @@ dependencies:
|
|
42
42
|
requirements:
|
43
43
|
- - ~>
|
44
44
|
- !ruby/object:Gem::Version
|
45
|
-
version: 4.6.
|
45
|
+
version: 4.6.6
|
46
46
|
- !ruby/object:Gem::Dependency
|
47
47
|
name: text
|
48
48
|
requirement: !ruby/object:Gem::Requirement
|
@@ -138,6 +138,7 @@ files:
|
|
138
138
|
- lib/picky/analyzer.rb
|
139
139
|
- lib/picky/api/search/boost.rb
|
140
140
|
- lib/picky/api/tokenizer/character_substituter.rb
|
141
|
+
- lib/picky/api/tokenizer/stemmer.rb
|
141
142
|
- lib/picky/backends/backend.rb
|
142
143
|
- lib/picky/backends/file/basic.rb
|
143
144
|
- lib/picky/backends/file/json.rb
|
@@ -300,6 +301,7 @@ files:
|
|
300
301
|
- spec/functional/regression_spec.rb
|
301
302
|
- spec/functional/remap_qualifiers_spec.rb
|
302
303
|
- spec/functional/speed_spec.rb
|
304
|
+
- spec/functional/stemming_spec.rb
|
303
305
|
- spec/functional/terminate_early_spec.rb
|
304
306
|
- spec/functional/tokenizer_spec.rb
|
305
307
|
- spec/functional/unique_ids_search_spec.rb
|
@@ -308,6 +310,7 @@ files:
|
|
308
310
|
- spec/lib/analyzer_spec.rb
|
309
311
|
- spec/lib/api/search/boost_spec.rb
|
310
312
|
- spec/lib/api/tokenizer/character_substituter_spec.rb
|
313
|
+
- spec/lib/api/tokenizer/stemmer_spec.rb
|
311
314
|
- spec/lib/backends/backend_spec.rb
|
312
315
|
- spec/lib/backends/file/basic_spec.rb
|
313
316
|
- spec/lib/backends/file_spec.rb
|
@@ -457,6 +460,7 @@ test_files:
|
|
457
460
|
- spec/functional/regression_spec.rb
|
458
461
|
- spec/functional/remap_qualifiers_spec.rb
|
459
462
|
- spec/functional/speed_spec.rb
|
463
|
+
- spec/functional/stemming_spec.rb
|
460
464
|
- spec/functional/terminate_early_spec.rb
|
461
465
|
- spec/functional/tokenizer_spec.rb
|
462
466
|
- spec/functional/unique_ids_search_spec.rb
|
@@ -465,6 +469,7 @@ test_files:
|
|
465
469
|
- spec/lib/analyzer_spec.rb
|
466
470
|
- spec/lib/api/search/boost_spec.rb
|
467
471
|
- spec/lib/api/tokenizer/character_substituter_spec.rb
|
472
|
+
- spec/lib/api/tokenizer/stemmer_spec.rb
|
468
473
|
- spec/lib/backends/backend_spec.rb
|
469
474
|
- spec/lib/backends/file/basic_spec.rb
|
470
475
|
- spec/lib/backends/file_spec.rb
|