uea-stemmer 0.10.3 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/README.rdoc +60 -34
- data/VERSION +1 -1
- data/lib/uea-stemmer/rule.rb +3 -3
- data/lib/uea-stemmer/string_helpers.rb +2 -2
- data/lib/uea-stemmer/word.rb +2 -2
- data/lib/uea-stemmer.rb +66 -54
- metadata +20 -95
- data/.document +0 -5
- data/.ruby-gemset +0 -1
- data/.ruby-version +0 -1
- data/Gemfile +0 -9
- data/Gemfile.lock +0 -83
- data/Rakefile +0 -40
- data/test/test_helper.rb +0 -10
- data/test/uea_stemmer_test.rb +0 -184
- data/uea-stemmer.gemspec +0 -60
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
|
-
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: e33e3ab687d2232940f67f69bc4ec2cc72e402fd8ddfcc3c69f2c96748978970
|
|
4
|
+
data.tar.gz: cbafb7985b745a15a9c002560fbad2ae7deae42a2d31ff6416e6985b6f1e9f93
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f600e07381586e02a83657a381b4285c0e20005dcd7b3a5e363425dccfffd83ba58105863f918468b1a7807c48d9f88486454f5ae04ef7fd504cf12ca645daee
|
|
7
|
+
data.tar.gz: 310737fcd31fc0318d44d6a82e81507bf478272629a3170c5c2c76db0f3976b3ad32f04e4b13bbe3a9fa501c4ffd6184526d352c69be9f34c309b9eea9a2ad63
|
data/README.rdoc
CHANGED
|
@@ -1,74 +1,100 @@
|
|
|
1
1
|
= uea-stemmer
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Ruby implementation of the UEA-Lite stemmer for conservative stemming in
|
|
4
|
+
search and indexing workloads. The gem has no runtime dependencies.
|
|
4
5
|
|
|
5
|
-
|
|
6
|
+
UEA-Lite[https://web.archive.org/web/20120728132949/http://www.uea.ac.uk/cmp/research/graphicsvisionspeech/speech/WordStemming]
|
|
7
|
+
uses a rule set to normalize suffixes while avoiding aggressive stemming.
|
|
6
8
|
|
|
7
|
-
|
|
9
|
+
== Behavior Notes
|
|
8
10
|
|
|
9
|
-
|
|
11
|
+
The stemmer operates on a single token at a time and returns a stemmed token.
|
|
10
12
|
|
|
11
|
-
|
|
13
|
+
Notable behavior of this implementation:
|
|
12
14
|
|
|
13
|
-
|
|
15
|
+
* possessive apostrophes are removed
|
|
16
|
+
* contractions are expanded by default (for example, <tt>don't</tt> becomes
|
|
17
|
+
<tt>do not</tt>)
|
|
18
|
+
* tokens beginning with uppercase letters are preserved, and pluralized
|
|
19
|
+
acronyms ending in a lowercase <tt>s</tt> are singularized
|
|
20
|
+
* pure numbers, and tokens containing hyphens/underscores, are passed through
|
|
21
|
+
unchanged
|
|
22
|
+
|
|
23
|
+
This is a port to Ruby from the Java port of the original Perl script by
|
|
24
|
+
Marie-Claire Jenkins and Dr. Dan J. Smith at the University of East Anglia.
|
|
14
25
|
|
|
15
26
|
== Installation
|
|
16
27
|
|
|
28
|
+
Requires Ruby 3.1 or newer.
|
|
29
|
+
|
|
17
30
|
Install the gem:
|
|
18
31
|
|
|
19
32
|
gem install uea-stemmer
|
|
20
33
|
|
|
21
|
-
Install
|
|
34
|
+
Install from source:
|
|
22
35
|
|
|
23
|
-
git clone
|
|
36
|
+
git clone https://github.com/ealdent/uea-stemmer.git
|
|
24
37
|
cd uea-stemmer
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
Depending on your setup, you may need to use +sudo+ for either of these methods.
|
|
38
|
+
gem build uea-stemmer.gemspec
|
|
39
|
+
gem install ./uea-stemmer-*.gem
|
|
28
40
|
|
|
29
41
|
== Example Usage
|
|
30
42
|
|
|
31
|
-
|
|
43
|
+
Basic usage:
|
|
32
44
|
|
|
33
|
-
require
|
|
45
|
+
require "uea-stemmer"
|
|
34
46
|
stemmer = UEAStemmer.new
|
|
35
47
|
|
|
36
|
-
stemmer.stem(
|
|
37
|
-
stemmer.stem(
|
|
38
|
-
stemmer.stem(
|
|
48
|
+
stemmer.stem("helpers") # => "helper"
|
|
49
|
+
stemmer.stem("dying") # => "die"
|
|
50
|
+
stemmer.stem("scarred") # => "scar"
|
|
51
|
+
|
|
52
|
+
You can extract the matching rule with +stem_with_rule+:
|
|
53
|
+
|
|
54
|
+
result = stemmer.stem_with_rule("invited")
|
|
55
|
+
result.word # => "invite"
|
|
56
|
+
result.rule_num # => "22.3"
|
|
57
|
+
result.rule # => #<UEAStemmer::Rule ...>
|
|
58
|
+
|
|
59
|
+
Disable contraction expansion:
|
|
39
60
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
'ordained'.stem # ordain
|
|
61
|
+
UEAStemmer.new(nil, nil, skip_contractions: true).stem("don't")
|
|
62
|
+
# => "don't"
|
|
43
63
|
|
|
44
|
-
|
|
64
|
+
Use the singleton instance:
|
|
45
65
|
|
|
46
|
-
stem
|
|
47
|
-
puts stem.rule # rule #22.3 (remove -d when the word ends in -ited)
|
|
66
|
+
DefaultUEAStemmer.instance.stem("running") # => "run"
|
|
48
67
|
|
|
49
|
-
==
|
|
68
|
+
== Development
|
|
50
69
|
|
|
51
|
-
|
|
52
|
-
|
|
70
|
+
This project does not require Bundler or Rake for normal development. Run the
|
|
71
|
+
tests directly:
|
|
53
72
|
|
|
54
|
-
|
|
73
|
+
ruby -Itest test/uea_stemmer_test.rb
|
|
74
|
+
|
|
75
|
+
Build the gem package:
|
|
76
|
+
|
|
77
|
+
gem build uea-stemmer.gemspec
|
|
78
|
+
|
|
79
|
+
GitHub Actions runs the test suite and gem build on supported Ruby versions.
|
|
80
|
+
|
|
81
|
+
== Contributing
|
|
55
82
|
|
|
56
83
|
* Fork the project.
|
|
57
84
|
* Make your feature addition or bug fix.
|
|
58
|
-
* Add
|
|
59
|
-
|
|
60
|
-
*
|
|
61
|
-
|
|
62
|
-
* Send me a pull request. Bonus points for topic branches.
|
|
85
|
+
* Add or update tests.
|
|
86
|
+
* Run +ruby -Itest test/uea_stemmer_test.rb+.
|
|
87
|
+
* Run +gem build uea-stemmer.gemspec+.
|
|
88
|
+
* Send a pull request.
|
|
63
89
|
|
|
64
90
|
== Relevant Web Pages
|
|
65
91
|
|
|
66
92
|
* https://web.archive.org/web/20120728132949/http://www.uea.ac.uk/cmp/research/graphicsvisionspeech/speech/WordStemming
|
|
67
|
-
* Stemming[
|
|
93
|
+
* Stemming[https://en.wikipedia.org/wiki/Stemming]
|
|
68
94
|
|
|
69
95
|
== Copyright
|
|
70
96
|
|
|
71
97
|
Copyright (c) 2005 by the University of East Anglia and authored by Marie-Claire Jenkins and Dr. Dan J Smith. This port to Ruby was done by Jason Adams using the port to Java by Richard Churchill.
|
|
72
98
|
|
|
73
|
-
This project is distributed under the Apache 2.0
|
|
74
|
-
|
|
99
|
+
This project is distributed under the Apache 2.0
|
|
100
|
+
License[https://www.apache.org/licenses/LICENSE-2.0]. See LICENSE for details.
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.10.3
|
|
1
|
+
0.11.0
|
data/lib/uea-stemmer/rule.rb
CHANGED
|
@@ -29,7 +29,7 @@ class UEAStemmer
|
|
|
29
29
|
def initialize(pattern, suffix_size, rule_num)
|
|
30
30
|
@pattern = pattern.dup.freeze
|
|
31
31
|
@suffix_size = suffix_size
|
|
32
|
-
@rule_num = rule_num
|
|
32
|
+
@rule_num = rule_num.to_s.freeze
|
|
33
33
|
end
|
|
34
34
|
|
|
35
35
|
def handle(word)
|
|
@@ -45,7 +45,7 @@ class UEAStemmer
|
|
|
45
45
|
attr_reader :original_pattern
|
|
46
46
|
|
|
47
47
|
def initialize(pattern, suffix_size, rule_num)
|
|
48
|
-
super(/^.*#{pattern}$/, suffix_size, rule_num)
|
|
48
|
+
super(/^.*#{Regexp.escape(pattern)}$/, suffix_size, rule_num)
|
|
49
49
|
@original_pattern = pattern.dup.freeze
|
|
50
50
|
end
|
|
51
51
|
|
|
@@ -105,4 +105,4 @@ class UEAStemmer
|
|
|
105
105
|
stemmed_word
|
|
106
106
|
end
|
|
107
107
|
end
|
|
108
|
-
end
|
|
108
|
+
end
|
data/lib/uea-stemmer/word.rb
CHANGED
|
@@ -24,12 +24,12 @@ class UEAStemmer
|
|
|
24
24
|
|
|
25
25
|
def initialize(word, rule_num, rule = nil)
|
|
26
26
|
@word = word.dup.freeze
|
|
27
|
-
@rule_num = rule_num
|
|
27
|
+
@rule_num = rule_num.to_s.freeze
|
|
28
28
|
@rule = rule
|
|
29
29
|
end
|
|
30
30
|
|
|
31
31
|
def to_s
|
|
32
|
-
if @rule_num
|
|
32
|
+
if @rule_num != '0'
|
|
33
33
|
"#{@word} (Rule ##{@rule_num} #{@rule})"
|
|
34
34
|
else
|
|
35
35
|
"#{@word} (No rule)"
|
data/lib/uea-stemmer.rb
CHANGED
|
@@ -26,41 +26,46 @@ require 'singleton'
|
|
|
26
26
|
class UEAStemmer
|
|
27
27
|
include StringHelpers
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
|
|
29
|
+
APOSTROPHE_PATTERN = /['’]/
|
|
30
|
+
PROBLEM_WORDS = %w[is as this has was during menses].freeze
|
|
31
|
+
SPECIAL_RULE_COUNT = 4
|
|
32
|
+
|
|
33
|
+
attr_reader :max_acronym_length, :max_word_length
|
|
31
34
|
|
|
32
35
|
def initialize(max_word_length = nil, max_acronym_length = nil, options = {})
|
|
33
36
|
@max_word_length = max_word_length || 'deoxyribonucleicacid'.size
|
|
34
37
|
@max_acronym_length = max_acronym_length || 'CAVASSOO'.size
|
|
35
|
-
@options = options.
|
|
38
|
+
@options = options.transform_keys(&:to_sym).freeze
|
|
36
39
|
|
|
37
|
-
@rules =
|
|
40
|
+
@rules = []
|
|
38
41
|
create_rules
|
|
39
42
|
end
|
|
40
43
|
|
|
44
|
+
def rules
|
|
45
|
+
@rules.dup.freeze
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
def options
|
|
49
|
+
@options
|
|
50
|
+
end
|
|
51
|
+
|
|
41
52
|
def stem_with_rule(word)
|
|
42
|
-
stemmed_word = word.dup
|
|
43
|
-
ruleno = 0;
|
|
53
|
+
stemmed_word = word.dup
|
|
44
54
|
|
|
45
55
|
if problem_word?(word)
|
|
46
56
|
Word.new(word, 94)
|
|
47
57
|
elsif (word.size > @max_acronym_length && word =~ /^[A-Z]+$/) || (word.size > (@max_acronym_length + 1) && word =~ /^[A-Z]+s$/)
|
|
48
|
-
Word.new(word, 96)
|
|
58
|
+
Word.new(word, 96)
|
|
49
59
|
elsif word.size > @max_word_length
|
|
50
60
|
Word.new(word, 95)
|
|
51
|
-
elsif word
|
|
52
|
-
if word =~ /^.*'
|
|
61
|
+
elsif word =~ APOSTROPHE_PATTERN
|
|
62
|
+
if word =~ /^.*['’]s$/i
|
|
53
63
|
stemmed_word = remove_suffix(stemmed_word, 2)
|
|
54
|
-
elsif word =~ /^.*'$/
|
|
64
|
+
elsif word =~ /^.*['’]$/
|
|
55
65
|
stemmed_word = remove_suffix(stemmed_word, 1)
|
|
56
66
|
end
|
|
57
67
|
|
|
58
|
-
unless options[:skip_contractions]
|
|
59
|
-
stemmed_word.gsub!(/n't/, ' not')
|
|
60
|
-
stemmed_word.gsub!(/'ve/, ' have')
|
|
61
|
-
stemmed_word.gsub!(/'re/, ' are')
|
|
62
|
-
stemmed_word.gsub!(/'m/, ' am')
|
|
63
|
-
end
|
|
68
|
+
stemmed_word = expand_contractions(stemmed_word) unless options[:skip_contractions]
|
|
64
69
|
|
|
65
70
|
Word.new(stemmed_word, 93)
|
|
66
71
|
else
|
|
@@ -74,7 +79,7 @@ class UEAStemmer
|
|
|
74
79
|
end
|
|
75
80
|
|
|
76
81
|
def num_rules
|
|
77
|
-
@rules.map { |r| r.rule_num }.uniq.size +
|
|
82
|
+
@rules.map { |r| r.rule_num }.uniq.size + SPECIAL_RULE_COUNT
|
|
78
83
|
end
|
|
79
84
|
|
|
80
85
|
def to_s
|
|
@@ -82,7 +87,7 @@ class UEAStemmer
|
|
|
82
87
|
end
|
|
83
88
|
|
|
84
89
|
def add_rule(rule)
|
|
85
|
-
if rule.
|
|
90
|
+
if rule.is_a?(Rule)
|
|
86
91
|
@rules << rule.dup.freeze
|
|
87
92
|
true
|
|
88
93
|
else
|
|
@@ -94,13 +99,48 @@ class UEAStemmer
|
|
|
94
99
|
|
|
95
100
|
def apply_rules(word)
|
|
96
101
|
@rules.each do |rule|
|
|
97
|
-
stemmed_word, rule_num,
|
|
98
|
-
return [stemmed_word, rule_num,
|
|
102
|
+
stemmed_word, rule_num, matched_rule = rule.handle(word)
|
|
103
|
+
return [stemmed_word, rule_num, matched_rule] if stemmed_word && rule_num
|
|
99
104
|
end
|
|
100
105
|
|
|
101
106
|
[word, 0, nil]
|
|
102
107
|
end
|
|
103
108
|
|
|
109
|
+
def expand_contractions(word)
|
|
110
|
+
normalized_word = word.tr('’', "'")
|
|
111
|
+
|
|
112
|
+
if normalized_word =~ /\Awon't\z/i
|
|
113
|
+
match_word_case(word, 'will not')
|
|
114
|
+
elsif normalized_word =~ /\Acan't\z/i
|
|
115
|
+
match_word_case(word, 'can not')
|
|
116
|
+
elsif normalized_word =~ /\Ashan't\z/i
|
|
117
|
+
match_word_case(word, 'shall not')
|
|
118
|
+
elsif normalized_word =~ /\A(.+)n't\z/i
|
|
119
|
+
"#{$1}#{match_suffix_case(word, ' not')}"
|
|
120
|
+
elsif normalized_word =~ /\A(.+)'ve\z/i
|
|
121
|
+
"#{$1}#{match_suffix_case(word, ' have')}"
|
|
122
|
+
elsif normalized_word =~ /\A(.+)'re\z/i
|
|
123
|
+
"#{$1}#{match_suffix_case(word, ' are')}"
|
|
124
|
+
elsif normalized_word =~ /\A(.+)'m\z/i
|
|
125
|
+
"#{$1}#{match_suffix_case(word, ' am')}"
|
|
126
|
+
else
|
|
127
|
+
word
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
def match_word_case(original_word, expanded_word)
|
|
132
|
+
return expanded_word.upcase if original_word == original_word.upcase
|
|
133
|
+
return expanded_word.sub(/\A[a-z]/) { |letter| letter.upcase } if original_word =~ /\A[A-Z]/
|
|
134
|
+
|
|
135
|
+
expanded_word
|
|
136
|
+
end
|
|
137
|
+
|
|
138
|
+
def match_suffix_case(original_word, suffix)
|
|
139
|
+
return suffix.upcase if original_word == original_word.upcase
|
|
140
|
+
|
|
141
|
+
suffix
|
|
142
|
+
end
|
|
143
|
+
|
|
104
144
|
def create_rules
|
|
105
145
|
@rules << Rule.new(/^\d+$/, 0, 90.3)
|
|
106
146
|
@rules << Rule.new(/^\w+-\w+$/, 0, 90.2)
|
|
@@ -117,8 +157,6 @@ class UEAStemmer
|
|
|
117
157
|
@rules << EndingRule.new('sis', 0, 4)
|
|
118
158
|
@rules << EndingRule.new('tis', 0, 5)
|
|
119
159
|
@rules << EndingRule.new('ss', 0, 6)
|
|
120
|
-
|
|
121
|
-
# plural change - this differs from Perl v1.03
|
|
122
160
|
@rules << EndingRule.new('eed', 0, 7)
|
|
123
161
|
@rules << EndingRule.new('eeds', 1, 7)
|
|
124
162
|
|
|
@@ -157,13 +195,9 @@ class UEAStemmer
|
|
|
157
195
|
@rules << EndingRule.new('eared', 2, 20.3)
|
|
158
196
|
@rules << EndingRule.new('tored', 2, 20.2)
|
|
159
197
|
@rules << EndingRule.new('ered', 2, 20.1)
|
|
160
|
-
|
|
161
|
-
# plural change - this differs from Perl v1.03
|
|
162
198
|
@rules << EndingRule.new('red', 1, 20)
|
|
163
199
|
@rules << EndingRule.new('reds', 2, 20)
|
|
164
200
|
@rules << EndingRule.new('tted', 3, 21)
|
|
165
|
-
|
|
166
|
-
# added some rules to handle invited vs. exited
|
|
167
201
|
@rules << EndingRule.new('noted', 1, 22.6)
|
|
168
202
|
@rules << EndingRule.new('leted', 1, 22.5)
|
|
169
203
|
@rules << Rule.new(/^.*[^vm]ited$/, 2, 22.4)
|
|
@@ -175,8 +209,6 @@ class UEAStemmer
|
|
|
175
209
|
@rules << EndingRule.new('anges', 1, 23)
|
|
176
210
|
@rules << EndingRule.new('aining', 3, 24)
|
|
177
211
|
@rules << EndingRule.new('acting', 3, 25)
|
|
178
|
-
|
|
179
|
-
# plural change - this differs from Perl v1.03
|
|
180
212
|
@rules << EndingRule.new('tting', 4, 26)
|
|
181
213
|
@rules << EndingRule.new('ttings', 5, 26)
|
|
182
214
|
|
|
@@ -184,9 +216,6 @@ class UEAStemmer
|
|
|
184
216
|
@rules << EndingRule.new('ssed', 2, 28)
|
|
185
217
|
@rules << EndingRule.new('sed', 1, 29)
|
|
186
218
|
@rules << EndingRule.new('titudes', 1, 30)
|
|
187
|
-
|
|
188
|
-
# added some additional rules to handle other vowels and consonants
|
|
189
|
-
# (added by Jason M. Adams)
|
|
190
219
|
@rules << EndingRule.new('oed', 1, 31.3)
|
|
191
220
|
@rules << EndingRule.new('does', 2, 31.2)
|
|
192
221
|
@rules << EndingRule.new('oes', 1, 31.2)
|
|
@@ -212,8 +241,6 @@ class UEAStemmer
|
|
|
212
241
|
@rules << EndingRule.new('ssing', 3, 37)
|
|
213
242
|
@rules << EndingRule.new('ssings', 4, 37)
|
|
214
243
|
@rules << EndingRule.new('ulting', 3, 38)
|
|
215
|
-
|
|
216
|
-
# plural change - this differs from Perl v1.03
|
|
217
244
|
@rules << ConcatenatingEndingRule.new('ving', 3, 39, 'e')
|
|
218
245
|
@rules << ConcatenatingEndingRule.new('vings', 4, 39, 'e')
|
|
219
246
|
|
|
@@ -231,8 +258,6 @@ class UEAStemmer
|
|
|
231
258
|
@rules << EndingRule.new('rdings', 4, 40.2)
|
|
232
259
|
@rules << EndingRule.new('nding', 3, 40.1)
|
|
233
260
|
@rules << EndingRule.new('ndings', 4, 40.1)
|
|
234
|
-
|
|
235
|
-
# plural change - this differs from Perl v1.03
|
|
236
261
|
@rules << ConcatenatingEndingRule.new('ding', 3, 40, 'e')
|
|
237
262
|
@rules << ConcatenatingEndingRule.new('dings', 4, 40, 'e')
|
|
238
263
|
|
|
@@ -255,8 +280,6 @@ class UEAStemmer
|
|
|
255
280
|
@rules << EndingRule.new('mmings', 5, 44.3)
|
|
256
281
|
@rules << EndingRule.new('rming', 3, 44.2)
|
|
257
282
|
@rules << EndingRule.new('lming', 3, 44.1)
|
|
258
|
-
|
|
259
|
-
# plural change - this differs from Perl v1.03
|
|
260
283
|
@rules << ConcatenatingEndingRule.new('ming', 3, 44, 'e')
|
|
261
284
|
@rules << ConcatenatingEndingRule.new('mings', 4, 44, 'e')
|
|
262
285
|
|
|
@@ -273,8 +296,6 @@ class UEAStemmer
|
|
|
273
296
|
@rules << EndingRule.new('oning', 3, 46.2)
|
|
274
297
|
@rules << EndingRule.new('rning', 3, 46.1)
|
|
275
298
|
@rules << ConcatenatingEndingRule.new('ning', 3, 46, 'e')
|
|
276
|
-
|
|
277
|
-
# plural change - this differs from Perl v1.03
|
|
278
299
|
@rules << EndingRule.new('sting', 3, 47)
|
|
279
300
|
@rules << EndingRule.new('stings', 4, 47)
|
|
280
301
|
@rules << EndingRule.new('eting', 3, 48.4)
|
|
@@ -287,13 +308,9 @@ class UEAStemmer
|
|
|
287
308
|
@rules << ConcatenatingEndingRule.new('ting', 3, 48, 'e')
|
|
288
309
|
@rules << ConcatenatingEndingRule.new('tings', 4, 48, 'e')
|
|
289
310
|
|
|
290
|
-
@rules << EndingRule.new('ssed', 2, 49)
|
|
291
311
|
@rules << EndingRule.new('les', 1, 50)
|
|
292
312
|
@rules << EndingRule.new('tes', 1, 51)
|
|
293
313
|
@rules << EndingRule.new('zed', 1, 52)
|
|
294
|
-
@rules << EndingRule.new('lled', 2, 53)
|
|
295
|
-
|
|
296
|
-
# plural change - this differs from Perl v1.03
|
|
297
314
|
@rules << ConcatenatingEndingRule.new('iring', 3, 54.4, 'e')
|
|
298
315
|
@rules << ConcatenatingEndingRule.new('irings', 4, 54.4, 'e')
|
|
299
316
|
@rules << ConcatenatingEndingRule.new('uring', 3, 54.3, 'e')
|
|
@@ -302,19 +319,15 @@ class UEAStemmer
|
|
|
302
319
|
@rules << ConcatenatingEndingRule.new('ncings', 4, 54.2, 'e')
|
|
303
320
|
|
|
304
321
|
@rules << ConcatenatingEndingRule.new('zing', 3, 54.1, 'e')
|
|
305
|
-
|
|
306
|
-
# plural change - this differs from Perl v1.03
|
|
307
322
|
@rules << ConcatenatingEndingRule.new('sing', 3, 54, 'e')
|
|
308
323
|
@rules << ConcatenatingEndingRule.new('sings', 4, 54, 'e')
|
|
309
324
|
|
|
310
325
|
@rules << EndingRule.new('lling', 3, 55)
|
|
311
326
|
@rules << ConcatenatingEndingRule.new('ied', 3, 56, 'y')
|
|
312
327
|
@rules << ConcatenatingEndingRule.new('ating', 3, 57, 'e')
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
@rules << ConcatenatingEndingRule.new('
|
|
316
|
-
@rules << ExhaustiveConcatenatingEndingRule.new('lying', 4, 58.2, 'ie') # added by JMA (lying vs flying)
|
|
317
|
-
@rules << ConcatenatingEndingRule.new('tying', 4, 58.2, 'ie') # added by JMA
|
|
328
|
+
@rules << ConcatenatingEndingRule.new('dying', 4, 58.2, 'ie')
|
|
329
|
+
@rules << ExhaustiveConcatenatingEndingRule.new('lying', 4, 58.2, 'ie')
|
|
330
|
+
@rules << ConcatenatingEndingRule.new('tying', 4, 58.2, 'ie')
|
|
318
331
|
@rules << EndingRule.new('thing', 0, 58.1)
|
|
319
332
|
@rules << EndingRule.new('things', 1, 58.1)
|
|
320
333
|
@rules << CustomRule.new(/.*\w\wings?$/, 3, 58)
|
|
@@ -325,10 +338,9 @@ class UEAStemmer
|
|
|
325
338
|
@rules << EndingRule.new('aped', 1, 61.3)
|
|
326
339
|
@rules << EndingRule.new('uded', 1, 61.2)
|
|
327
340
|
@rules << EndingRule.new('oded', 1, 61.1)
|
|
328
|
-
@rules << EndingRule.new('ated', 1, 61)
|
|
329
341
|
@rules << CustomRule.new(/.*\w\weds?$/, 2, 62)
|
|
330
|
-
@rules << EndingRule.new('des', 1, 63.10)
|
|
331
|
-
@rules << EndingRule.new('res', 1, 63.9)
|
|
342
|
+
@rules << EndingRule.new('des', 1, '63.10')
|
|
343
|
+
@rules << EndingRule.new('res', 1, 63.9)
|
|
332
344
|
@rules << EndingRule.new('pes', 1, 63.8)
|
|
333
345
|
@rules << EndingRule.new('mes', 1, 63.7)
|
|
334
346
|
@rules << EndingRule.new('ones', 1, 63.6)
|
|
@@ -351,7 +363,7 @@ class UEAStemmer
|
|
|
351
363
|
end
|
|
352
364
|
|
|
353
365
|
def problem_word?(word)
|
|
354
|
-
|
|
366
|
+
PROBLEM_WORDS.include?(word)
|
|
355
367
|
end
|
|
356
368
|
|
|
357
369
|
end
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: uea-stemmer
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.10.3
|
|
4
|
+
version: 0.11.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Marie-Claire Jenkins
|
|
@@ -11,106 +11,32 @@ authors:
|
|
|
11
11
|
autorequire:
|
|
12
12
|
bindir: bin
|
|
13
13
|
cert_chain: []
|
|
14
|
-
date:
|
|
15
|
-
dependencies:
|
|
16
|
-
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
- - ">="
|
|
21
|
-
- !ruby/object:Gem::Version
|
|
22
|
-
version: '0'
|
|
23
|
-
type: :development
|
|
24
|
-
prerelease: false
|
|
25
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
26
|
-
requirements:
|
|
27
|
-
- - ">="
|
|
28
|
-
- !ruby/object:Gem::Version
|
|
29
|
-
version: '0'
|
|
30
|
-
- !ruby/object:Gem::Dependency
|
|
31
|
-
name: test-unit
|
|
32
|
-
requirement: !ruby/object:Gem::Requirement
|
|
33
|
-
requirements:
|
|
34
|
-
- - ">="
|
|
35
|
-
- !ruby/object:Gem::Version
|
|
36
|
-
version: '0'
|
|
37
|
-
type: :development
|
|
38
|
-
prerelease: false
|
|
39
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
40
|
-
requirements:
|
|
41
|
-
- - ">="
|
|
42
|
-
- !ruby/object:Gem::Version
|
|
43
|
-
version: '0'
|
|
44
|
-
- !ruby/object:Gem::Dependency
|
|
45
|
-
name: shoulda
|
|
46
|
-
requirement: !ruby/object:Gem::Requirement
|
|
47
|
-
requirements:
|
|
48
|
-
- - ">="
|
|
49
|
-
- !ruby/object:Gem::Version
|
|
50
|
-
version: '0'
|
|
51
|
-
type: :development
|
|
52
|
-
prerelease: false
|
|
53
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
54
|
-
requirements:
|
|
55
|
-
- - ">="
|
|
56
|
-
- !ruby/object:Gem::Version
|
|
57
|
-
version: '0'
|
|
58
|
-
- !ruby/object:Gem::Dependency
|
|
59
|
-
name: rake
|
|
60
|
-
requirement: !ruby/object:Gem::Requirement
|
|
61
|
-
requirements:
|
|
62
|
-
- - ">="
|
|
63
|
-
- !ruby/object:Gem::Version
|
|
64
|
-
version: '0'
|
|
65
|
-
type: :development
|
|
66
|
-
prerelease: false
|
|
67
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
68
|
-
requirements:
|
|
69
|
-
- - ">="
|
|
70
|
-
- !ruby/object:Gem::Version
|
|
71
|
-
version: '0'
|
|
72
|
-
- !ruby/object:Gem::Dependency
|
|
73
|
-
name: awesome_print
|
|
74
|
-
requirement: !ruby/object:Gem::Requirement
|
|
75
|
-
requirements:
|
|
76
|
-
- - ">="
|
|
77
|
-
- !ruby/object:Gem::Version
|
|
78
|
-
version: '0'
|
|
79
|
-
type: :development
|
|
80
|
-
prerelease: false
|
|
81
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
82
|
-
requirements:
|
|
83
|
-
- - ">="
|
|
84
|
-
- !ruby/object:Gem::Version
|
|
85
|
-
version: '0'
|
|
86
|
-
description: Port of UEA-Lite Stemmer to Ruby, a conservative stemmer for search and
|
|
87
|
-
indexing.
|
|
88
|
-
email: jasonmadams@gmail.com
|
|
14
|
+
date: 2026-05-21 00:00:00.000000000 Z
|
|
15
|
+
dependencies: []
|
|
16
|
+
description: Ruby port of the UEA-Lite stemmer, designed to normalize common English
|
|
17
|
+
suffixes without aggressive stemming.
|
|
18
|
+
email:
|
|
19
|
+
- jasonmadams@gmail.com
|
|
89
20
|
executables: []
|
|
90
21
|
extensions: []
|
|
91
|
-
extra_rdoc_files:
|
|
92
|
-
- LICENSE
|
|
93
|
-
- README.rdoc
|
|
22
|
+
extra_rdoc_files: []
|
|
94
23
|
files:
|
|
95
|
-
- ".document"
|
|
96
|
-
- ".ruby-gemset"
|
|
97
|
-
- ".ruby-version"
|
|
98
|
-
- Gemfile
|
|
99
|
-
- Gemfile.lock
|
|
100
24
|
- LICENSE
|
|
101
25
|
- README.rdoc
|
|
102
|
-
- Rakefile
|
|
103
26
|
- VERSION
|
|
104
27
|
- lib/uea-stemmer.rb
|
|
105
28
|
- lib/uea-stemmer/rule.rb
|
|
106
29
|
- lib/uea-stemmer/string_helpers.rb
|
|
107
30
|
- lib/uea-stemmer/word.rb
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
31
|
+
homepage: https://github.com/ealdent/uea-stemmer
|
|
32
|
+
licenses:
|
|
33
|
+
- Apache-2.0
|
|
34
|
+
metadata:
|
|
35
|
+
bug_tracker_uri: https://github.com/ealdent/uea-stemmer/issues
|
|
36
|
+
changelog_uri: https://github.com/ealdent/uea-stemmer/releases
|
|
37
|
+
homepage_uri: https://github.com/ealdent/uea-stemmer
|
|
38
|
+
rubygems_mfa_required: 'true'
|
|
39
|
+
source_code_uri: https://github.com/ealdent/uea-stemmer
|
|
114
40
|
post_install_message:
|
|
115
41
|
rdoc_options: []
|
|
116
42
|
require_paths:
|
|
@@ -119,16 +45,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
119
45
|
requirements:
|
|
120
46
|
- - ">="
|
|
121
47
|
- !ruby/object:Gem::Version
|
|
122
|
-
version: '
|
|
48
|
+
version: '3.1'
|
|
123
49
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
124
50
|
requirements:
|
|
125
51
|
- - ">="
|
|
126
52
|
- !ruby/object:Gem::Version
|
|
127
53
|
version: '0'
|
|
128
54
|
requirements: []
|
|
129
|
-
|
|
130
|
-
rubygems_version: 2.4.8
|
|
55
|
+
rubygems_version: 3.0.3.1
|
|
131
56
|
signing_key:
|
|
132
57
|
specification_version: 4
|
|
133
|
-
summary:
|
|
58
|
+
summary: Conservative UEA-Lite stemming for search and indexing.
|
|
134
59
|
test_files: []
|
data/.document
DELETED
data/.ruby-gemset
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
uea-stemmer
|
data/.ruby-version
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
ruby-2.3.0
|
data/Gemfile
DELETED
data/Gemfile.lock
DELETED
|
@@ -1,83 +0,0 @@
|
|
|
1
|
-
GEM
|
|
2
|
-
remote: https://rubygems.org/
|
|
3
|
-
specs:
|
|
4
|
-
activesupport (5.1.3)
|
|
5
|
-
concurrent-ruby (~> 1.0, >= 1.0.2)
|
|
6
|
-
i18n (~> 0.7)
|
|
7
|
-
minitest (~> 5.1)
|
|
8
|
-
tzinfo (~> 1.1)
|
|
9
|
-
addressable (2.4.0)
|
|
10
|
-
awesome_print (1.8.0)
|
|
11
|
-
builder (3.2.3)
|
|
12
|
-
concurrent-ruby (1.0.5)
|
|
13
|
-
descendants_tracker (0.0.4)
|
|
14
|
-
thread_safe (~> 0.3, >= 0.3.1)
|
|
15
|
-
faraday (0.9.2)
|
|
16
|
-
multipart-post (>= 1.2, < 3)
|
|
17
|
-
git (1.3.0)
|
|
18
|
-
github_api (0.16.0)
|
|
19
|
-
addressable (~> 2.4.0)
|
|
20
|
-
descendants_tracker (~> 0.0.4)
|
|
21
|
-
faraday (~> 0.8, < 0.10)
|
|
22
|
-
hashie (>= 3.4)
|
|
23
|
-
mime-types (>= 1.16, < 3.0)
|
|
24
|
-
oauth2 (~> 1.0)
|
|
25
|
-
hashie (3.5.6)
|
|
26
|
-
highline (1.7.8)
|
|
27
|
-
i18n (0.8.6)
|
|
28
|
-
jeweler (2.3.7)
|
|
29
|
-
builder
|
|
30
|
-
bundler (>= 1)
|
|
31
|
-
git (>= 1.2.5)
|
|
32
|
-
github_api (~> 0.16.0)
|
|
33
|
-
highline (>= 1.6.15)
|
|
34
|
-
nokogiri (>= 1.5.10)
|
|
35
|
-
psych (~> 2.2)
|
|
36
|
-
rake
|
|
37
|
-
rdoc
|
|
38
|
-
semver2
|
|
39
|
-
jwt (1.5.6)
|
|
40
|
-
mime-types (2.99.3)
|
|
41
|
-
mini_portile2 (2.2.0)
|
|
42
|
-
minitest (5.10.3)
|
|
43
|
-
multi_json (1.12.1)
|
|
44
|
-
multi_xml (0.6.0)
|
|
45
|
-
multipart-post (2.0.0)
|
|
46
|
-
nokogiri (1.8.0)
|
|
47
|
-
mini_portile2 (~> 2.2.0)
|
|
48
|
-
oauth2 (1.4.0)
|
|
49
|
-
faraday (>= 0.8, < 0.13)
|
|
50
|
-
jwt (~> 1.0)
|
|
51
|
-
multi_json (~> 1.3)
|
|
52
|
-
multi_xml (~> 0.5)
|
|
53
|
-
rack (>= 1.2, < 3)
|
|
54
|
-
power_assert (1.0.2)
|
|
55
|
-
psych (2.2.4)
|
|
56
|
-
rack (2.0.3)
|
|
57
|
-
rake (12.0.0)
|
|
58
|
-
rdoc (5.1.0)
|
|
59
|
-
semver2 (3.4.2)
|
|
60
|
-
shoulda (3.5.0)
|
|
61
|
-
shoulda-context (~> 1.0, >= 1.0.1)
|
|
62
|
-
shoulda-matchers (>= 1.4.1, < 3.0)
|
|
63
|
-
shoulda-context (1.2.2)
|
|
64
|
-
shoulda-matchers (2.8.0)
|
|
65
|
-
activesupport (>= 3.0.0)
|
|
66
|
-
test-unit (3.2.5)
|
|
67
|
-
power_assert
|
|
68
|
-
thread_safe (0.3.6)
|
|
69
|
-
tzinfo (1.2.3)
|
|
70
|
-
thread_safe (~> 0.1)
|
|
71
|
-
|
|
72
|
-
PLATFORMS
|
|
73
|
-
ruby
|
|
74
|
-
|
|
75
|
-
DEPENDENCIES
|
|
76
|
-
awesome_print
|
|
77
|
-
jeweler
|
|
78
|
-
rake
|
|
79
|
-
shoulda
|
|
80
|
-
test-unit
|
|
81
|
-
|
|
82
|
-
BUNDLED WITH
|
|
83
|
-
1.15.3
|
data/Rakefile
DELETED
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
require 'rubygems'
|
|
2
|
-
require 'rake'
|
|
3
|
-
|
|
4
|
-
begin
|
|
5
|
-
require 'jeweler'
|
|
6
|
-
Jeweler::Tasks.new do |gem|
|
|
7
|
-
gem.name = "uea-stemmer"
|
|
8
|
-
gem.summary = %Q{Port of UEA-Lite Stemmer to Ruby, a conservative stemmer for search and indexing.}
|
|
9
|
-
gem.description = %Q{Port of UEA-Lite Stemmer to Ruby, a conservative stemmer for search and indexing.}
|
|
10
|
-
gem.email = "jasonmadams@gmail.com"
|
|
11
|
-
gem.homepage = "http://github.com/ealdent/uea-stemmer"
|
|
12
|
-
gem.authors = ["Marie-Claire Jenkins", "Dan J. Smith", "Richard Churchill", "Jason Adams"]
|
|
13
|
-
# gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
|
|
14
|
-
end
|
|
15
|
-
|
|
16
|
-
rescue LoadError
|
|
17
|
-
puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
|
|
18
|
-
end
|
|
19
|
-
|
|
20
|
-
require 'rake/testtask'
|
|
21
|
-
Rake::TestTask.new(:test) do |test|
|
|
22
|
-
test.libs << 'lib' << 'test'
|
|
23
|
-
test.pattern = 'test/**/*_test.rb'
|
|
24
|
-
test.verbose = true
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
begin
|
|
28
|
-
require 'rcov/rcovtask'
|
|
29
|
-
Rcov::RcovTask.new do |test|
|
|
30
|
-
test.libs << 'test'
|
|
31
|
-
test.pattern = 'test/**/*_test.rb'
|
|
32
|
-
test.verbose = true
|
|
33
|
-
end
|
|
34
|
-
rescue LoadError
|
|
35
|
-
task :rcov do
|
|
36
|
-
abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
|
|
37
|
-
end
|
|
38
|
-
end
|
|
39
|
-
|
|
40
|
-
task :default => :test
|
data/test/test_helper.rb
DELETED
data/test/uea_stemmer_test.rb
DELETED
|
@@ -1,184 +0,0 @@
|
|
|
1
|
-
require 'test_helper'
|
|
2
|
-
|
|
3
|
-
class UeaStemmerTest < Test::Unit::TestCase
|
|
4
|
-
context "A default UEAStemmer instance" do
|
|
5
|
-
setup do
|
|
6
|
-
@stemmer = UEAStemmer.new
|
|
7
|
-
end
|
|
8
|
-
|
|
9
|
-
should "have max word and max acronym sizes equivalent to deoxyribonucleicacid and CAVASSOO respectively" do
|
|
10
|
-
assert @stemmer.max_word_length == 'deoxyribonucleicacid'.size
|
|
11
|
-
assert @stemmer.max_acronym_length == 'CAVASSOO'.size
|
|
12
|
-
end
|
|
13
|
-
|
|
14
|
-
should "allow setting options" do
|
|
15
|
-
@stemmer.options[:test] = true
|
|
16
|
-
assert @stemmer.options[:test]
|
|
17
|
-
end
|
|
18
|
-
|
|
19
|
-
context "stem method" do
|
|
20
|
-
should "stem words as Strings" do
|
|
21
|
-
assert @stemmer.stem('word').is_a?(String)
|
|
22
|
-
end
|
|
23
|
-
|
|
24
|
-
should "stem base words to just the base word" do
|
|
25
|
-
assert_equal @stemmer.stem('man'), 'man'
|
|
26
|
-
assert_equal @stemmer.stem('happiness'), 'happiness'
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
should "stem theses as thesis but not bases as basis" do
|
|
30
|
-
assert_equal @stemmer.stem('theses'), 'thesis'
|
|
31
|
-
assert_not_equal @stemmer.stem('bases'), 'basis'
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
should "stem preterite words ending in -ed without the -ed" do
|
|
35
|
-
assert_equal @stemmer.stem('ordained'), 'ordain'
|
|
36
|
-
assert_equal @stemmer.stem('killed'), 'kill'
|
|
37
|
-
assert_equal @stemmer.stem('liked'), 'like'
|
|
38
|
-
assert_equal @stemmer.stem('helped'), 'help'
|
|
39
|
-
assert_equal @stemmer.stem('scarred'), 'scar'
|
|
40
|
-
assert_equal @stemmer.stem('invited'), 'invite'
|
|
41
|
-
assert_equal @stemmer.stem('exited'), 'exit'
|
|
42
|
-
assert_equal @stemmer.stem('exited'), 'exit'
|
|
43
|
-
assert_equal @stemmer.stem('debited'), 'debit'
|
|
44
|
-
assert_equal @stemmer.stem('smited'), 'smite'
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
should "stem progressive verbs and gerunds without the -ing" do
|
|
48
|
-
assert_equal @stemmer.stem('running'), 'run'
|
|
49
|
-
assert_equal @stemmer.stem('settings'), 'set'
|
|
50
|
-
assert_equal @stemmer.stem('timing'), 'time'
|
|
51
|
-
assert_equal @stemmer.stem('dying'), 'die'
|
|
52
|
-
assert_equal @stemmer.stem('harping'), 'harp'
|
|
53
|
-
assert_equal @stemmer.stem('charring'), 'char'
|
|
54
|
-
end
|
|
55
|
-
|
|
56
|
-
should "not stem false progressive verbs such as 'sing'" do
|
|
57
|
-
assert_equal @stemmer.stem('ring'), 'ring'
|
|
58
|
-
assert_equal @stemmer.stem('sing'), 'sing'
|
|
59
|
-
assert_equal @stemmer.stem('ring'), 'ring'
|
|
60
|
-
assert_equal @stemmer.stem('bring'), 'bring'
|
|
61
|
-
assert_equal @stemmer.stem('fling'), 'fling'
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
should "stem various plural nouns and 3rd-pres verbs without the -s/-es" do
|
|
65
|
-
assert_equal @stemmer.stem('changes'), 'change'
|
|
66
|
-
assert_equal @stemmer.stem('deaths'), 'death'
|
|
67
|
-
assert_equal @stemmer.stem('shadows'), 'shadow'
|
|
68
|
-
assert_equal @stemmer.stem('flies'), 'fly'
|
|
69
|
-
assert_equal @stemmer.stem('things'), 'thing'
|
|
70
|
-
assert_equal @stemmer.stem('nothings'), 'nothing' # as in 'sweet nothings'
|
|
71
|
-
assert_equal @stemmer.stem('witches'), 'witch'
|
|
72
|
-
assert_equal @stemmer.stem('makes'), 'make'
|
|
73
|
-
assert_equal @stemmer.stem('smokes'), 'smoke'
|
|
74
|
-
assert_equal @stemmer.stem('does'), 'do'
|
|
75
|
-
end
|
|
76
|
-
|
|
77
|
-
should "stem various words with -des suffix" do
|
|
78
|
-
assert_equal @stemmer.stem('abodes'), 'abode'
|
|
79
|
-
assert_equal @stemmer.stem('escapades'), 'escapade'
|
|
80
|
-
assert_equal @stemmer.stem('crusades'), 'crusade'
|
|
81
|
-
assert_equal @stemmer.stem('grades'), 'grade'
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
should "stem various words with -res suffix" do
|
|
85
|
-
assert_equal @stemmer.stem('wires'), 'wire'
|
|
86
|
-
assert_equal @stemmer.stem('acres'), 'acre'
|
|
87
|
-
assert_equal @stemmer.stem('fires'), 'fire'
|
|
88
|
-
assert_equal @stemmer.stem('cares'), 'care'
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
should "stem acronyms when pluralized otherwise they should be left alone" do
|
|
92
|
-
assert_equal @stemmer.stem('USA'), 'USA'
|
|
93
|
-
assert_equal @stemmer.stem('FLOSS'), 'FLOSS'
|
|
94
|
-
assert_equal @stemmer.stem('MREs'), 'MRE'
|
|
95
|
-
assert_equal @stemmer.stem('USAED'), 'USAED'
|
|
96
|
-
end
|
|
97
|
-
end
|
|
98
|
-
|
|
99
|
-
context "stem_with_rule method" do
|
|
100
|
-
should "return a Word instance" do
|
|
101
|
-
assert @stemmer.stem_with_rule('witches').is_a?(UEAStemmer::Word)
|
|
102
|
-
end
|
|
103
|
-
|
|
104
|
-
should "return a rule and the stemmed form" do
|
|
105
|
-
word = @stemmer.stem_with_rule('witches')
|
|
106
|
-
assert !word.rule.nil?
|
|
107
|
-
assert !word.word.nil?
|
|
108
|
-
end
|
|
109
|
-
end
|
|
110
|
-
|
|
111
|
-
context "other functionality" do
|
|
112
|
-
should "return the number of rules the stemmer is currently using" do
|
|
113
|
-
assert @stemmer.num_rules.is_a?(Numeric)
|
|
114
|
-
end
|
|
115
|
-
end
|
|
116
|
-
end
|
|
117
|
-
|
|
118
|
-
context "A modified UEAStemmer instance" do
|
|
119
|
-
setup do
|
|
120
|
-
@stemmer = UEAStemmer.new(5, 3) # max word length = 5, max acronym length = 3
|
|
121
|
-
end
|
|
122
|
-
|
|
123
|
-
should "have modified max word and max acronym sizes" do
|
|
124
|
-
assert @stemmer.max_word_length == 5
|
|
125
|
-
assert @stemmer.max_acronym_length == 3
|
|
126
|
-
end
|
|
127
|
-
|
|
128
|
-
should "reject a longer word with rule 95" do
|
|
129
|
-
word = @stemmer.stem_with_rule('deoxyribonucleicacid')
|
|
130
|
-
assert_equal word.rule_num, 95
|
|
131
|
-
end
|
|
132
|
-
|
|
133
|
-
should "reject a longer acronym with rule 96" do
|
|
134
|
-
word = @stemmer.stem_with_rule('CAVASSOO')
|
|
135
|
-
assert_equal word.rule_num, 96
|
|
136
|
-
end
|
|
137
|
-
end
|
|
138
|
-
|
|
139
|
-
context "A Word instance" do
|
|
140
|
-
setup do
|
|
141
|
-
@word = UEAStemmer::Word.new('helpers', 68, UEAStemmer::EndingRule.new('s', 1, 68)) # sample word
|
|
142
|
-
@stemmer = UEAStemmer.new
|
|
143
|
-
end
|
|
144
|
-
|
|
145
|
-
should "return the rule used to derive the stem" do
|
|
146
|
-
assert @word.rule.kind_of?(UEAStemmer::Rule)
|
|
147
|
-
end
|
|
148
|
-
|
|
149
|
-
should "return the number of the rule used to derive the stem" do
|
|
150
|
-
assert @word.rule_num.kind_of?(Numeric)
|
|
151
|
-
end
|
|
152
|
-
|
|
153
|
-
should "return the stemmed word as a String" do
|
|
154
|
-
assert @word.word.kind_of?(String)
|
|
155
|
-
end
|
|
156
|
-
end
|
|
157
|
-
|
|
158
|
-
context "A Rule instance" do
|
|
159
|
-
setup do
|
|
160
|
-
@rule = UEAStemmer::Rule.new(/.*s$/i, 1, 555)
|
|
161
|
-
end
|
|
162
|
-
|
|
163
|
-
should "return the rule number" do
|
|
164
|
-
assert @rule.rule_num.kind_of?(Numeric)
|
|
165
|
-
end
|
|
166
|
-
|
|
167
|
-
should "return the pattern being matched" do
|
|
168
|
-
assert @rule.pattern.kind_of?(String) || @rule.pattern.kind_of?(Regexp)
|
|
169
|
-
end
|
|
170
|
-
|
|
171
|
-
should "return the size of the suffix that is being removed" do
|
|
172
|
-
assert @rule.suffix_size.kind_of?(Numeric)
|
|
173
|
-
end
|
|
174
|
-
|
|
175
|
-
should "return a stemmed word, a rule number, and a rule on a successful match" do
|
|
176
|
-
word, rule_num, tmp_rule = @rule.handle('helps')
|
|
177
|
-
assert word.is_a?(String) && rule_num.is_a?(Numeric) && tmp_rule.is_a?(UEAStemmer::Rule)
|
|
178
|
-
end
|
|
179
|
-
|
|
180
|
-
should "return nil when match is unsuccessful" do
|
|
181
|
-
assert @rule.handle('help').nil?
|
|
182
|
-
end
|
|
183
|
-
end
|
|
184
|
-
end
|
data/uea-stemmer.gemspec
DELETED
|
@@ -1,60 +0,0 @@
|
|
|
1
|
-
# Generated by jeweler
|
|
2
|
-
# DO NOT EDIT THIS FILE DIRECTLY
|
|
3
|
-
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
|
4
|
-
# -*- encoding: utf-8 -*-
|
|
5
|
-
# stub: uea-stemmer 0.10.2 ruby lib
|
|
6
|
-
|
|
7
|
-
Gem::Specification.new do |s|
|
|
8
|
-
s.name = "uea-stemmer"
|
|
9
|
-
s.version = "0.10.2"
|
|
10
|
-
|
|
11
|
-
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
12
|
-
s.require_paths = ["lib"]
|
|
13
|
-
s.authors = ["Marie-Claire Jenkins", "Dan J. Smith", "Richard Churchill", "Jason Adams"]
|
|
14
|
-
s.date = "2017-08-14"
|
|
15
|
-
s.description = "Port of UEA-Lite Stemmer to Ruby, a conservative stemmer for search and indexing."
|
|
16
|
-
s.email = "jasonmadams@gmail.com"
|
|
17
|
-
s.extra_rdoc_files = [
|
|
18
|
-
"LICENSE",
|
|
19
|
-
"README.rdoc"
|
|
20
|
-
]
|
|
21
|
-
s.files = [
|
|
22
|
-
".document",
|
|
23
|
-
"LICENSE",
|
|
24
|
-
"README.rdoc",
|
|
25
|
-
"Rakefile",
|
|
26
|
-
"VERSION",
|
|
27
|
-
"lib/uea-stemmer.rb",
|
|
28
|
-
"lib/uea-stemmer/rule.rb",
|
|
29
|
-
"lib/uea-stemmer/string_helpers.rb",
|
|
30
|
-
"lib/uea-stemmer/word.rb",
|
|
31
|
-
"test/test_helper.rb",
|
|
32
|
-
"test/uea_stemmer_test.rb",
|
|
33
|
-
"uea-stemmer.gemspec"
|
|
34
|
-
]
|
|
35
|
-
s.homepage = "http://github.com/ealdent/uea-stemmer"
|
|
36
|
-
s.rubygems_version = "2.4.8"
|
|
37
|
-
s.summary = "Port of UEA-Lite Stemmer to Ruby, a conservative stemmer for search and indexing."
|
|
38
|
-
|
|
39
|
-
if s.respond_to? :specification_version then
|
|
40
|
-
s.specification_version = 4
|
|
41
|
-
|
|
42
|
-
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
|
43
|
-
s.add_runtime_dependency(%q<uea-stemmer>, [">= 0"])
|
|
44
|
-
s.add_development_dependency(%q<jeweler>, [">= 0"])
|
|
45
|
-
s.add_development_dependency(%q<test-unit>, [">= 0"])
|
|
46
|
-
s.add_development_dependency(%q<shoulda>, [">= 0"])
|
|
47
|
-
else
|
|
48
|
-
s.add_dependency(%q<uea-stemmer>, [">= 0"])
|
|
49
|
-
s.add_dependency(%q<jeweler>, [">= 0"])
|
|
50
|
-
s.add_dependency(%q<test-unit>, [">= 0"])
|
|
51
|
-
s.add_dependency(%q<shoulda>, [">= 0"])
|
|
52
|
-
end
|
|
53
|
-
else
|
|
54
|
-
s.add_dependency(%q<uea-stemmer>, [">= 0"])
|
|
55
|
-
s.add_dependency(%q<jeweler>, [">= 0"])
|
|
56
|
-
s.add_dependency(%q<test-unit>, [">= 0"])
|
|
57
|
-
s.add_dependency(%q<shoulda>, [">= 0"])
|
|
58
|
-
end
|
|
59
|
-
end
|
|
60
|
-
|