engtagger 0.3.2 → 0.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +72 -0
- data/.solargraph.yml +22 -0
- data/Gemfile +6 -2
- data/README.md +74 -42
- data/Rakefile +9 -1
- data/engtagger.gemspec +13 -10
- data/lib/engtagger/porter.rb +169 -192
- data/lib/engtagger/version.rb +3 -1
- data/lib/engtagger.rb +220 -206
- metadata +9 -8
- data/test/test_engtagger.rb +0 -246
data/lib/engtagger/porter.rb
CHANGED
@@ -1,192 +1,169 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
#
|
78
|
-
#
|
79
|
-
#
|
80
|
-
#
|
81
|
-
#
|
82
|
-
#
|
83
|
-
#
|
84
|
-
#
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
w
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
end
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
end
|
138
|
-
|
139
|
-
# Step
|
140
|
-
if w =~
|
141
|
-
stem = $`
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
end
|
147
|
-
|
148
|
-
#
|
149
|
-
if w =~
|
150
|
-
stem = $`
|
151
|
-
if stem =~ MGR1
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
if w =~ /ll$/ && w =~ MGR1
|
171
|
-
w.chop!
|
172
|
-
end
|
173
|
-
|
174
|
-
# and turn initial Y back to y
|
175
|
-
w[0] = 'y' if w[0] == ?Y
|
176
|
-
|
177
|
-
w
|
178
|
-
end
|
179
|
-
|
180
|
-
|
181
|
-
#
|
182
|
-
# make the stem_porter the default stem method, just in case we
|
183
|
-
# feel like having multiple stemmers available later.
|
184
|
-
#
|
185
|
-
alias stem stem_porter
|
186
|
-
|
187
|
-
end
|
188
|
-
|
189
|
-
# Add stem method to all Strings
|
190
|
-
class String
|
191
|
-
include Stemmable
|
192
|
-
end
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Porter stemming algorithm as a mixin.
#
# Including this module into a String-like class adds #stem_porter (and its
# alias #stem), which returns the Porter stem of the receiver without
# modifying the receiver itself.
module Stemmable
  # Step 2 suffix => replacement (applied when the remaining stem has m > 0).
  STEP_2_LIST = {
    "ational" => "ate", "tional" => "tion", "enci" => "ence", "anci" => "ance",
    "izer" => "ize", "bli" => "ble",
    "alli" => "al", "entli" => "ent", "eli" => "e", "ousli" => "ous",
    "ization" => "ize", "ation" => "ate",
    "ator" => "ate", "alism" => "al", "iveness" => "ive", "fulness" => "ful",
    "ousness" => "ous", "aliti" => "al",
    "iviti" => "ive", "biliti" => "ble", "logi" => "log"
  }.freeze

  # Step 3 suffix => replacement (applied when the remaining stem has m > 0).
  STEP_3_LIST = {
    "icate" => "ic", "ative" => "", "alize" => "al", "iciti" => "ic",
    "ical" => "ic", "ful" => "", "ness" => ""
  }.freeze

  # Suffixes handled in Step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
    ational |
    tional  |
    enci    |
    anci    |
    izer    |
    bli     |
    alli    |
    entli   |
    eli     |
    ousli   |
    ization |
    ation   |
    ator    |
    alism   |
    iveness |
    fulness |
    ousness |
    aliti   |
    iviti   |
    biliti  |
    logi)$/x.freeze

  # Suffixes removed in Step 4 (when the remaining stem has m > 1).
  SUFFIX_2_REGEXP = /(
    al    |
    ance  |
    ence  |
    er    |
    ic    |
    able  |
    ible  |
    ant   |
    ement |
    ment  |
    ent   |
    ou    |
    ism   |
    ate   |
    iti   |
    ous   |
    ive   |
    ize)$/x.freeze

  C = "[^aeiou]"             # consonant
  V = "[aeiouy]"             # vowel
  CC = "#{C}(?>[^aeiouy]*)"  # consonant sequence
  VV = "#{V}(?>[aeiou]*)"    # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/.freeze             # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/.freeze    # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/.freeze   # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/.freeze          # vowel in stem

  # Suffixes handled in Step 3 (the keys of STEP_3_LIST).
  STEP_3_REGEXP = /(icate|ative|alize|iciti|ical|ful|ness)$/.freeze
  # consonant(s) + vowel + consonant other than w, x or y (e.g. "hop", "fil").
  CVC_STEM = /^#{CC}#{V}[^aeiouwxy]$/.freeze
  private_constant :STEP_3_REGEXP, :CVC_STEM

  # Porter stemmer in Ruby.
  #
  # This is the Porter stemming algorithm, ported to Ruby from the
  # version coded up in Perl. It's easy to follow against the rules
  # in the original paper in:
  #
  #   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  #   no. 3, pp 130-137,
  #
  # See also http://www.tartarus.org/~martin/PorterStemmer
  #
  # Send comments to raypereda@hotmail.com
  #
  # The patterns are case-sensitive, so the word is expected in lower case;
  # upper-case letters are treated as consonants and are left as they are.
  #
  # Returns a new String holding the stem; the receiver is not modified.
  # Words shorter than three characters are returned unchanged (as a copy).
  def stem_porter
    # Work on a mutable copy so the receiver is never touched.
    w = dup.to_str

    return w if w.length < 3

    # Map an initial "y" to "Y" so that the patterns never treat it as a
    # vowel. Remember that we did so: only a "y" mapped here is restored at
    # the end, so a word that really starts with a capital "Y" keeps it.
    initial_y = w[0] == "y"
    w[0] = "Y" if initial_y

    # Step 1a: plurals ("sses" -> "ss", "ies" -> "i", "s" -> "").
    if (m = w.match(/(ss|i)es$/) || w.match(/([^s])s$/))
      w = m.pre_match + m[1]
    end

    # Step 1b: "eed" -> "ee"; strip "ed"/"ing" and tidy up what is left.
    if (m = w.match(/eed$/))
      w.chop! if MGR0.match?(m.pre_match)
    elsif (m = w.match(/(ed|ing)$/))
      stem = m.pre_match
      if VOWEL_IN_STEM.match?(stem)
        w = stem
        case w
        when /(at|bl|iz)$/ then w << "e"           # "conflat" -> "conflate"
        when /([^aeiouylsz])\1$/ then w.chop!      # "hopp"    -> "hop"
        when CVC_STEM then w << "e"                # "fil"     -> "file"
        end
      end
    end

    # Step 1c: terminal "y" -> "i" when the stem contains a vowel.
    if (m = w.match(/y$/)) && VOWEL_IN_STEM.match?(m.pre_match)
      w = "#{m.pre_match}i"
    end

    # Step 2
    if (m = w.match(SUFFIX_1_REGEXP)) && MGR0.match?(m.pre_match)
      w = m.pre_match + STEP_2_LIST[m[1]]
    end

    # Step 3
    if (m = w.match(STEP_3_REGEXP)) && MGR0.match?(m.pre_match)
      w = m.pre_match + STEP_3_LIST[m[1]]
    end

    # Step 4: "ion" is only removed after "s" or "t", which stays in the stem.
    if (m = w.match(SUFFIX_2_REGEXP))
      w = m.pre_match if MGR1.match?(m.pre_match)
    elsif (m = w.match(/(s|t)(ion)$/))
      stem = m.pre_match + m[1]
      w = stem if MGR1.match?(stem)
    end

    # Step 5: drop a final "e", then reduce a final "ll" to "l" when m > 1.
    if (m = w.match(/e$/))
      stem = m.pre_match
      w = stem if MGR1.match?(stem) || (MEQ1.match?(stem) && !CVC_STEM.match?(stem))
    end

    w.chop! if w.match?(/ll$/) && MGR1.match?(w)

    # and turn the initial Y back to y, but only if we were the ones who
    # upper-cased it
    w[0] = "y" if initial_y && w[0] == "Y"
    w
  end

  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  alias stem stem_porter
end
|
165
|
+
|
166
|
+
# Add stem method to all Strings
|
167
|
+
# Mix Stemmable into the core String class so that every string responds to
# the methods Stemmable defines.
String.include(Stemmable)
|
data/lib/engtagger/version.rb
CHANGED