picky 4.6.5 → 4.6.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
|
|
1
|
+
module Picky
|
2
|
+
module API
|
3
|
+
module Tokenizer
|
4
|
+
|
5
|
+
module Stemmer
|
6
|
+
|
7
|
+
def extract_stemmer thing
|
8
|
+
if thing.respond_to? :stem
|
9
|
+
thing
|
10
|
+
else
|
11
|
+
raise ArgumentError.new <<-ERROR
|
12
|
+
The stems_with option needs a stemmer,
|
13
|
+
which responds to #stem(text) and returns stemmed_text."
|
14
|
+
ERROR
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
19
|
+
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
data/lib/picky/loader.rb
CHANGED
data/lib/picky/tokenizer.rb
CHANGED
@@ -8,6 +8,7 @@ module Picky
|
|
8
8
|
|
9
9
|
extend Picky::Helpers::Identification
|
10
10
|
include API::Tokenizer::CharacterSubstituter
|
11
|
+
include API::Tokenizer::Stemmer
|
11
12
|
|
12
13
|
def self.default_indexing_with options = {}
|
13
14
|
@indexing = from options
|
@@ -51,6 +52,7 @@ Splits text on: #{@splits_text_on.respond_to?(:source) ? "/#{@splits_text_on
|
|
51
52
|
Normalizes words: #{@normalizes_words_regexp_replaces ? @normalizes_words_regexp_replaces : '-'}
|
52
53
|
Rejects tokens? #{reject_condition_location ? "Yes, see line #{reject_condition_location} in app/application.rb" : '-'}
|
53
54
|
Substitutes chars? #{@substituter ? "Yes, using #{@substituter}." : '-' }
|
55
|
+
Stems? #{@stemmer ? "Yes, using #{@stemmer}." : '-' }
|
54
56
|
Case sensitive? #{@case_sensitive ? "Yes." : "-"}
|
55
57
|
TOKENIZER
|
56
58
|
end
|
@@ -135,6 +137,15 @@ Case sensitive? #{@case_sensitive ? "Yes." : "-"}
|
|
135
137
|
def substitute_characters text
|
136
138
|
substituter?? substituter.substitute(text) : text
|
137
139
|
end
|
140
|
+
|
141
|
+
# Stems tokens with this stemmer.
|
142
|
+
#
|
143
|
+
def stems_with stemmer
|
144
|
+
@stemmer = extract_stemmer stemmer
|
145
|
+
end
|
146
|
+
def stem text
|
147
|
+
stemmer?? stemmer.stem(text) : text
|
148
|
+
end
|
138
149
|
|
139
150
|
# Reject tokens after tokenizing based on the given criteria.
|
140
151
|
#
|
@@ -175,8 +186,9 @@ Case sensitive? #{@case_sensitive ? "Yes." : "-"}
|
|
175
186
|
raise ArgumentError.new "Application##{method} takes a #{type} as argument, not a #{argument.class}." unless type === argument
|
176
187
|
end
|
177
188
|
|
178
|
-
attr_reader :substituter
|
189
|
+
attr_reader :substituter, :stemmer
|
179
190
|
alias substituter? substituter
|
191
|
+
alias stemmer? stemmer
|
180
192
|
|
181
193
|
def initialize options = {}
|
182
194
|
options = default_options.merge options
|
@@ -196,6 +208,7 @@ A short overview:
|
|
196
208
|
normalizes_words [[/replace (this)/, 'with this \\1'], ...]
|
197
209
|
rejects_token_if Proc/lambda, default :blank?.to_proc
|
198
210
|
substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String)
|
211
|
+
stems_with Instance responds to #stem(String)
|
199
212
|
case_sensitive true/false
|
200
213
|
|
201
214
|
ERROR
|
@@ -259,6 +272,7 @@ ERROR
|
|
259
272
|
#
|
260
273
|
def tokens_for words
|
261
274
|
words.collect! { |word| word.downcase!; word } if downcase?
|
275
|
+
words.collect! { |word| stem word } if stemmer?
|
262
276
|
words
|
263
277
|
end
|
264
278
|
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
#
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
require 'stemmer'
|
6
|
+
|
7
|
+
describe 'stemming' do
|
8
|
+
let(:stemmer) {
|
9
|
+
# Fast stemmer does not conform with the API.
|
10
|
+
#
|
11
|
+
module Stemmer
|
12
|
+
class << self
|
13
|
+
alias_method :stem, :stem_word
|
14
|
+
end
|
15
|
+
end
|
16
|
+
Stemmer
|
17
|
+
}
|
18
|
+
|
19
|
+
describe 'examples' do
|
20
|
+
it 'works correctly' do
|
21
|
+
tokenizer = Picky::Tokenizer.new(stems_with: stemmer)
|
22
|
+
|
23
|
+
# Is this really correct? Shouldn't we split after normalizing?
|
24
|
+
#
|
25
|
+
# Yes – we split using more information.
|
26
|
+
#
|
27
|
+
tokenizer.stem('computers').should == 'comput'
|
28
|
+
tokenizer.stem('computing').should == 'comput'
|
29
|
+
tokenizer.stem('computed').should == 'comput'
|
30
|
+
tokenizer.stem('computer').should == 'comput'
|
31
|
+
end
|
32
|
+
|
33
|
+
# This tests the weights option.
|
34
|
+
#
|
35
|
+
it 'stems right' do
|
36
|
+
# Fix the Stemmer API.
|
37
|
+
#
|
38
|
+
module Stemmer
|
39
|
+
class << self
|
40
|
+
# stem_word is a bit silly, what else would you stem???
|
41
|
+
#
|
42
|
+
alias_method :stem, :stem_word
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
index = Picky::Index.new :stemming do
|
47
|
+
# Be aware that if !s are not removed from
|
48
|
+
# eg. Lemming!, then stemming won't work.
|
49
|
+
#
|
50
|
+
indexing removes_characters: /[^a-z\s]/i,
|
51
|
+
stems_with: Stemmer
|
52
|
+
category :text
|
53
|
+
end
|
54
|
+
|
55
|
+
index.replace_from id: 1, text: "Hello good Sirs, these things here need stems to work!"
|
56
|
+
index.replace_from id: 2, text: "Stemming Lemming!"
|
57
|
+
|
58
|
+
try = Picky::Search.new index
|
59
|
+
|
60
|
+
# If you don't stem in the search, it should not be found!
|
61
|
+
#
|
62
|
+
try.search("text:stemming").ids.should == []
|
63
|
+
|
64
|
+
try = Picky::Search.new index do
|
65
|
+
searching stems_with: Stemmer
|
66
|
+
end
|
67
|
+
|
68
|
+
# With stemming in search AND indexing, it works :)
|
69
|
+
#
|
70
|
+
try.search("text:stemming").ids.should == [2, 1]
|
71
|
+
try.search("text:lem").ids.should == [2]
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe Picky::API::Tokenizer do
|
4
|
+
let(:object) do
|
5
|
+
Class.new do
|
6
|
+
include Picky::API::Tokenizer::Stemmer
|
7
|
+
end.new
|
8
|
+
end
|
9
|
+
context 'extract_character_substituter' do
|
10
|
+
context 'with a substituter' do
|
11
|
+
let(:stemmer) do
|
12
|
+
Class.new do
|
13
|
+
def stem text
|
14
|
+
text.gsub /computers/, 'comput' # a simple one word stemmer ;)
|
15
|
+
end
|
16
|
+
end.new
|
17
|
+
end
|
18
|
+
it 'creates a tokenizer' do
|
19
|
+
object.extract_stemmer(stemmer).
|
20
|
+
stem("computers").should == 'comput'
|
21
|
+
end
|
22
|
+
end
|
23
|
+
context 'invalid tokenizer' do
|
24
|
+
it 'raises with a nice error message' do
|
25
|
+
expect {
|
26
|
+
object.extract_stemmer Object.new
|
27
|
+
}.to raise_error(<<-ERROR)
|
28
|
+
The stems_with option needs a stemmer,
|
29
|
+
which responds to #stem(text) and returns stemmed_text."
|
30
|
+
ERROR
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
data/spec/lib/tokenizer_spec.rb
CHANGED
@@ -18,6 +18,7 @@ A short overview:
|
|
18
18
|
normalizes_words [[/replace (this)/, 'with this \\1'], ...]
|
19
19
|
rejects_token_if Proc/lambda, default :blank?.to_proc
|
20
20
|
substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String)
|
21
|
+
stems_with Instance responds to #stem(String)
|
21
22
|
case_sensitive true/false
|
22
23
|
|
23
24
|
MESSAGE
|
@@ -39,8 +40,9 @@ Removes characters: -
|
|
39
40
|
Stopwords: -
|
40
41
|
Splits text on: /\\s/
|
41
42
|
Normalizes words: -
|
42
|
-
Rejects tokens? Yes, see line
|
43
|
+
Rejects tokens? Yes, see line 29 in app/application.rb
|
43
44
|
Substitutes chars? -
|
45
|
+
Stems? -
|
44
46
|
Case sensitive? Yes.
|
45
47
|
EXPECTED
|
46
48
|
end
|
@@ -59,6 +61,7 @@ Splits text on: /\\s/
|
|
59
61
|
Normalizes words: -
|
60
62
|
Rejects tokens? -
|
61
63
|
Substitutes chars? -
|
64
|
+
Stems? -
|
62
65
|
Case sensitive? -
|
63
66
|
EXPECTED
|
64
67
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: picky
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 4.6.5
|
4
|
+
version: 4.6.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-10-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
@@ -34,7 +34,7 @@ dependencies:
|
|
34
34
|
requirements:
|
35
35
|
- - ~>
|
36
36
|
- !ruby/object:Gem::Version
|
37
|
-
version: 4.6.5
|
37
|
+
version: 4.6.6
|
38
38
|
type: :development
|
39
39
|
prerelease: false
|
40
40
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -42,7 +42,7 @@ dependencies:
|
|
42
42
|
requirements:
|
43
43
|
- - ~>
|
44
44
|
- !ruby/object:Gem::Version
|
45
|
-
version: 4.6.5
|
45
|
+
version: 4.6.6
|
46
46
|
- !ruby/object:Gem::Dependency
|
47
47
|
name: text
|
48
48
|
requirement: !ruby/object:Gem::Requirement
|
@@ -138,6 +138,7 @@ files:
|
|
138
138
|
- lib/picky/analyzer.rb
|
139
139
|
- lib/picky/api/search/boost.rb
|
140
140
|
- lib/picky/api/tokenizer/character_substituter.rb
|
141
|
+
- lib/picky/api/tokenizer/stemmer.rb
|
141
142
|
- lib/picky/backends/backend.rb
|
142
143
|
- lib/picky/backends/file/basic.rb
|
143
144
|
- lib/picky/backends/file/json.rb
|
@@ -300,6 +301,7 @@ files:
|
|
300
301
|
- spec/functional/regression_spec.rb
|
301
302
|
- spec/functional/remap_qualifiers_spec.rb
|
302
303
|
- spec/functional/speed_spec.rb
|
304
|
+
- spec/functional/stemming_spec.rb
|
303
305
|
- spec/functional/terminate_early_spec.rb
|
304
306
|
- spec/functional/tokenizer_spec.rb
|
305
307
|
- spec/functional/unique_ids_search_spec.rb
|
@@ -308,6 +310,7 @@ files:
|
|
308
310
|
- spec/lib/analyzer_spec.rb
|
309
311
|
- spec/lib/api/search/boost_spec.rb
|
310
312
|
- spec/lib/api/tokenizer/character_substituter_spec.rb
|
313
|
+
- spec/lib/api/tokenizer/stemmer_spec.rb
|
311
314
|
- spec/lib/backends/backend_spec.rb
|
312
315
|
- spec/lib/backends/file/basic_spec.rb
|
313
316
|
- spec/lib/backends/file_spec.rb
|
@@ -457,6 +460,7 @@ test_files:
|
|
457
460
|
- spec/functional/regression_spec.rb
|
458
461
|
- spec/functional/remap_qualifiers_spec.rb
|
459
462
|
- spec/functional/speed_spec.rb
|
463
|
+
- spec/functional/stemming_spec.rb
|
460
464
|
- spec/functional/terminate_early_spec.rb
|
461
465
|
- spec/functional/tokenizer_spec.rb
|
462
466
|
- spec/functional/unique_ids_search_spec.rb
|
@@ -465,6 +469,7 @@ test_files:
|
|
465
469
|
- spec/lib/analyzer_spec.rb
|
466
470
|
- spec/lib/api/search/boost_spec.rb
|
467
471
|
- spec/lib/api/tokenizer/character_substituter_spec.rb
|
472
|
+
- spec/lib/api/tokenizer/stemmer_spec.rb
|
468
473
|
- spec/lib/backends/backend_spec.rb
|
469
474
|
- spec/lib/backends/file/basic_spec.rb
|
470
475
|
- spec/lib/backends/file_spec.rb
|