engtagger 0.3.2 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +72 -0
- data/.solargraph.yml +22 -0
- data/Gemfile +6 -2
- data/README.md +74 -42
- data/Rakefile +9 -1
- data/engtagger.gemspec +13 -10
- data/lib/engtagger/porter.rb +169 -192
- data/lib/engtagger/version.rb +3 -1
- data/lib/engtagger.rb +220 -206
- metadata +9 -8
- data/test/test_engtagger.rb +0 -246
data/lib/engtagger/porter.rb
CHANGED
@@ -1,192 +1,169 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
#
|
78
|
-
#
|
79
|
-
#
|
80
|
-
#
|
81
|
-
#
|
82
|
-
#
|
83
|
-
#
|
84
|
-
#
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
w
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
end
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
end
|
138
|
-
|
139
|
-
# Step
|
140
|
-
if w =~
|
141
|
-
stem = $`
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
end
|
147
|
-
|
148
|
-
#
|
149
|
-
if w =~
|
150
|
-
stem = $`
|
151
|
-
if stem =~ MGR1
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
if w =~ /ll$/ && w =~ MGR1
|
171
|
-
w.chop!
|
172
|
-
end
|
173
|
-
|
174
|
-
# and turn initial Y back to y
|
175
|
-
w[0] = 'y' if w[0] == ?Y
|
176
|
-
|
177
|
-
w
|
178
|
-
end
|
179
|
-
|
180
|
-
|
181
|
-
#
|
182
|
-
# make the stem_porter the default stem method, just in case we
|
183
|
-
# feel like having multiple stemmers available later.
|
184
|
-
#
|
185
|
-
alias stem stem_porter
|
186
|
-
|
187
|
-
end
|
188
|
-
|
189
|
-
# Add stem method to all Strings
|
190
|
-
class String
|
191
|
-
include Stemmable
|
192
|
-
end
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Porter stemmer in Ruby.
#
# This is the Porter stemming algorithm, ported to Ruby from the
# version coded up in Perl. It's easy to follow against the rules
# in the original paper in:
#
#   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
#   no. 3, pp 130-137,
#
# See also http://www.tartarus.org/~martin/PorterStemmer
#
# Send comments to raypereda@hotmail.com
#
# The patterns below only list lowercase vowels, so any other character
# (including uppercase letters) is classified as a consonant.
module Stemmable
  # Step 2 rewrites: matched suffix => replacement.
  STEP_2_LIST = {
    "ational" => "ate", "tional" => "tion", "enci" => "ence", "anci" => "ance",
    "izer" => "ize", "bli" => "ble",
    "alli" => "al", "entli" => "ent", "eli" => "e", "ousli" => "ous",
    "ization" => "ize", "ation" => "ate",
    "ator" => "ate", "alism" => "al", "iveness" => "ive", "fulness" => "ful",
    "ousness" => "ous", "aliti" => "al",
    "iviti" => "ive", "biliti" => "ble", "logi" => "log"
  }.freeze

  # Step 3 rewrites: matched suffix => replacement.
  STEP_3_LIST = {
    "icate" => "ic", "ative" => "", "alize" => "al", "iciti" => "ic",
    "ical" => "ic", "ful" => "", "ness" => ""
  }.freeze

  # Suffixes handled by Step 2 (the keys of STEP_2_LIST).
  SUFFIX_1_REGEXP = /(
    ational |
    tional |
    enci |
    anci |
    izer |
    bli |
    alli |
    entli |
    eli |
    ousli |
    ization |
    ation |
    ator |
    alism |
    iveness |
    fulness |
    ousness |
    aliti |
    iviti |
    biliti |
    logi)$/x.freeze

  # Suffixes removed outright by Step 4.
  SUFFIX_2_REGEXP = /(
    al |
    ance |
    ence |
    er |
    ic |
    able |
    ible |
    ant |
    ement |
    ment |
    ent |
    ou |
    ism |
    ate |
    iti |
    ous |
    ive |
    ize)$/x.freeze

  # Suffixes handled by Step 3 (the keys of STEP_3_LIST).
  STEP_3_REGEXP = /(icate|ative|alize|iciti|ical|ful|ness)$/.freeze

  C = "[^aeiou]" # consonant
  V = "[aeiouy]" # vowel
  CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
  VV = "#{V}(?>[aeiou]*)" # vowel sequence

  MGR0 = /^(#{CC})?#{VV}#{CC}/o.freeze # [cc]vvcc... is m>0
  MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o.freeze # [cc]vvcc[vv] is m=1
  MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o.freeze # [cc]vvccvvcc... is m>1
  VOWEL_IN_STEM = /^(#{CC})?#{V}/o.freeze # vowel in stem

  # Whole stem is: consonant sequence, one vowel, then a single consonant
  # other than w, x or y. Used by Step 1b (restore a final "e") and by
  # Step 5 (keep a final "e").
  CVC_STEM = /^#{CC}#{V}[^aeiouwxy]$/o.freeze

  # Returns the Porter stem of the receiver as a new String; the receiver
  # itself is never modified.
  #
  # Receivers shorter than three characters are returned as an unchanged copy.
  #
  # Note: a leading "Y" is used internally as a marker for an initial "y"
  # and is turned back into "y" at the end, so an input that already starts
  # with an uppercase "Y" also comes back with a lowercase "y".
  #
  # @return [String] the stemmed copy
  def stem_porter
    # make a copy of the given object and convert it to a string.
    w = dup.to_str

    return w if w.length < 3

    # now map initial y to Y so that the patterns never treat it as vowel
    w[0] = "Y" if w[0] == "y"

    # Step 1a: both plural forms keep the captured tail and drop the rest.
    case w
    when /(ss|i)es$/, /([^s])s$/
      w = Regexp.last_match.pre_match + Regexp.last_match(1)
    end

    # Step 1b
    case w
    when /eed$/
      w.chop! if Regexp.last_match.pre_match.match?(MGR0)
    when /(ed|ing)$/
      stem = Regexp.last_match.pre_match
      if stem.match?(VOWEL_IN_STEM)
        w = stem
        # Tidy up the stem left behind after removing "ed"/"ing".
        case w
        when /(at|bl|iz)$/ then w << "e"
        when /([^aeiouylsz])\1$/ then w.chop!
        when CVC_STEM then w << "e"
        end
      end
    end

    # Step 1c: final y becomes i when the stem contains a vowel.
    if (m = /y$/.match(w))
      stem = m.pre_match
      w = stem + "i" if stem.match?(VOWEL_IN_STEM)
    end

    # Step 2
    if (m = SUFFIX_1_REGEXP.match(w))
      stem = m.pre_match
      w = stem + STEP_2_LIST.fetch(m[1]) if stem.match?(MGR0)
    end

    # Step 3
    if (m = STEP_3_REGEXP.match(w))
      stem = m.pre_match
      w = stem + STEP_3_LIST.fetch(m[1]) if stem.match?(MGR0)
    end

    # Step 4
    if (m = SUFFIX_2_REGEXP.match(w))
      stem = m.pre_match
      w = stem if stem.match?(MGR1)
    elsif (m = /(s|t)(ion)$/.match(w))
      # "ion" is only removed after s or t, and that letter stays in the stem.
      stem = m.pre_match + m[1]
      w = stem if stem.match?(MGR1)
    end

    # Step 5
    if (m = /e$/.match(w))
      stem = m.pre_match
      w = stem if stem.match?(MGR1) || (stem.match?(MEQ1) && !stem.match?(CVC_STEM))
    end

    w.chop! if w.match?(/ll$/) && w.match?(MGR1)

    # and turn initial Y back to y
    w[0] = "y" if w[0] == "Y"
    w
  end

  # make the stem_porter the default stem method, just in case we
  # feel like having multiple stemmers available later.
  alias stem stem_porter
end

# Add stem method to all Strings
class String
  include Stemmable
end
|
data/lib/engtagger/version.rb
CHANGED