engtagger 0.3.2 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,192 +1,169 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- module Stemmable
5
-
6
- STEP_2_LIST = {
7
- 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
8
- 'izer'=>'ize', 'bli'=>'ble',
9
- 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
10
- 'ization'=>'ize', 'ation'=>'ate',
11
- 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
12
- 'ousness'=>'ous', 'aliti'=>'al',
13
- 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
14
- }
15
-
16
- STEP_3_LIST = {
17
- 'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
18
- 'ical'=>'ic', 'ful'=>'', 'ness'=>''
19
- }
20
-
21
-
22
- SUFFIX_1_REGEXP = /(
23
- ational |
24
- tional |
25
- enci |
26
- anci |
27
- izer |
28
- bli |
29
- alli |
30
- entli |
31
- eli |
32
- ousli |
33
- ization |
34
- ation |
35
- ator |
36
- alism |
37
- iveness |
38
- fulness |
39
- ousness |
40
- aliti |
41
- iviti |
42
- biliti |
43
- logi)$/x
44
-
45
-
46
- SUFFIX_2_REGEXP = /(
47
- al |
48
- ance |
49
- ence |
50
- er |
51
- ic |
52
- able |
53
- ible |
54
- ant |
55
- ement |
56
- ment |
57
- ent |
58
- ou |
59
- ism |
60
- ate |
61
- iti |
62
- ous |
63
- ive |
64
- ize)$/x
65
-
66
-
67
- C = "[^aeiou]" # consonant
68
- V = "[aeiouy]" # vowel
69
- CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
70
- VV = "#{V}(?>[aeiou]*)" # vowel sequence
71
-
72
- MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
73
- MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
74
- MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
75
- VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
76
-
77
- #
78
- # Porter stemmer in Ruby.
79
- #
80
- # This is the Porter stemming algorithm, ported to Ruby from the
81
- # version coded up in Perl. It's easy to follow against the rules
82
- # in the original paper in:
83
- #
84
- # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
85
- # no. 3, pp 130-137,
86
- #
87
- # See also http://www.tartarus.org/~martin/PorterStemmer
88
- #
89
- # Send comments to raypereda@hotmail.com
90
- #
91
-
92
- def stem_porter
93
-
94
- # make a copy of the given object and convert it to a string.
95
- w = self.dup.to_str
96
-
97
- return w if w.length < 3
98
-
99
- # now map initial y to Y so that the patterns never treat it as vowel
100
- w[0] = 'Y' if w[0] == ?y
101
-
102
- # Step 1a
103
- if w =~ /(ss|i)es$/
104
- w = $` + $1
105
- elsif w =~ /([^s])s$/
106
- w = $` + $1
107
- end
108
-
109
- # Step 1b
110
- if w =~ /eed$/
111
- w.chop! if $` =~ MGR0
112
- elsif w =~ /(ed|ing)$/
113
- stem = $`
114
- if stem =~ VOWEL_IN_STEM
115
- w = stem
116
- case w
117
- when /(at|bl|iz)$/ then w << "e"
118
- when /([^aeiouylsz])\1$/ then w.chop!
119
- when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
120
- end
121
- end
122
- end
123
-
124
- if w =~ /y$/
125
- stem = $`
126
- w = stem + "i" if stem =~ VOWEL_IN_STEM
127
- end
128
-
129
- # Step 2
130
- if w =~ SUFFIX_1_REGEXP
131
- stem = $`
132
- suffix = $1
133
- # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
134
- if stem =~ MGR0
135
- w = stem + STEP_2_LIST[suffix]
136
- end
137
- end
138
-
139
- # Step 3
140
- if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
141
- stem = $`
142
- suffix = $1
143
- if stem =~ MGR0
144
- w = stem + STEP_3_LIST[suffix]
145
- end
146
- end
147
-
148
- # Step 4
149
- if w =~ SUFFIX_2_REGEXP
150
- stem = $`
151
- if stem =~ MGR1
152
- w = stem
153
- end
154
- elsif w =~ /(s|t)(ion)$/
155
- stem = $` + $1
156
- if stem =~ MGR1
157
- w = stem
158
- end
159
- end
160
-
161
- # Step 5
162
- if w =~ /e$/
163
- stem = $`
164
- if (stem =~ MGR1) ||
165
- (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
166
- w = stem
167
- end
168
- end
169
-
170
- if w =~ /ll$/ && w =~ MGR1
171
- w.chop!
172
- end
173
-
174
- # and turn initial Y back to y
175
- w[0] = 'y' if w[0] == ?Y
176
-
177
- w
178
- end
179
-
180
-
181
- #
182
- # make the stem_porter the default stem method, just in case we
183
- # feel like having multiple stemmers available later.
184
- #
185
- alias stem stem_porter
186
-
187
- end
188
-
189
- # Add stem method to all Strings
190
- class String
191
- include Stemmable
192
- end
1
+ # frozen_string_literal: true
2
+
3
+ module Stemmable
4
+ STEP_2_LIST = {
5
+ "ational" => "ate", "tional" => "tion", "enci" => "ence", "anci" => "ance",
6
+ "izer" => "ize", "bli" => "ble",
7
+ "alli" => "al", "entli" => "ent", "eli" => "e", "ousli" => "ous",
8
+ "ization" => "ize", "ation" => "ate",
9
+ "ator" => "ate", "alism" => "al", "iveness" => "ive", "fulness" => "ful",
10
+ "ousness" => "ous", "aliti" => "al",
11
+ "iviti" => "ive", "biliti" => "ble", "logi" => "log"
12
+ }.freeze
13
+
14
+ STEP_3_LIST = {
15
+ "icate" => "ic", "ative" => "", "alize" => "al", "iciti" => "ic",
16
+ "ical" => "ic", "ful" => "", "ness" => ""
17
+ }.freeze
18
+
19
+ SUFFIX_1_REGEXP = /(
20
+ ational |
21
+ tional |
22
+ enci |
23
+ anci |
24
+ izer |
25
+ bli |
26
+ alli |
27
+ entli |
28
+ eli |
29
+ ousli |
30
+ ization |
31
+ ation |
32
+ ator |
33
+ alism |
34
+ iveness |
35
+ fulness |
36
+ ousness |
37
+ aliti |
38
+ iviti |
39
+ biliti |
40
+ logi)$/x.freeze
41
+
42
+ SUFFIX_2_REGEXP = /(
43
+ al |
44
+ ance |
45
+ ence |
46
+ er |
47
+ ic |
48
+ able |
49
+ ible |
50
+ ant |
51
+ ement |
52
+ ment |
53
+ ent |
54
+ ou |
55
+ ism |
56
+ ate |
57
+ iti |
58
+ ous |
59
+ ive |
60
+ ize)$/x.freeze
61
+
62
+ C = "[^aeiou]" # consonant
63
+ V = "[aeiouy]" # vowel
64
+ CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
65
+ VV = "#{V}(?>[aeiou]*)" # vowel sequence
66
+
67
+ MGR0 = /^(#{CC})?#{VV}#{CC}/o.freeze # [cc]vvcc... is m>0
68
+ MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o.freeze # [cc]vvcc[vv] is m=1
69
+ MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o.freeze # [cc]vvccvvcc... is m>1
70
+ VOWEL_IN_STEM = /^(#{CC})?#{V}/o.freeze # vowel in stem
71
+
72
+ # Porter stemmer in Ruby.
73
+ #
74
+ # This is the Porter stemming algorithm, ported to Ruby from the
75
+ # version coded up in Perl. It's easy to follow against the rules
76
+ # in the original paper in:
77
+ #
78
+ # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
79
+ # no. 3, pp 130-137,
80
+ #
81
+ # See also http://www.tartarus.org/~martin/PorterStemmer
82
+ #
83
+ # Send comments to raypereda@hotmail.com
84
+ #
85
+
86
+ def stem_porter
87
+ # make a copy of the given object and convert it to a string.
88
+ w = dup.to_str
89
+
90
+ return w if w.length < 3
91
+
92
+ # now map initial y to Y so that the patterns never treat it as vowel
93
+ w[0] = "Y" if w[0] == "y"
94
+
95
+ # Step 1a
96
+ case w
97
+ when /(ss|i)es$/
98
+ w = $` + $1
99
+ when /([^s])s$/
100
+ w = $` + $1
101
+ end
102
+
103
+ # Step 1b
104
+ case w
105
+ when /eed$/
106
+ w.chop! if $` =~ MGR0
107
+ when /(ed|ing)$/
108
+ stem = $`
109
+ if stem =~ VOWEL_IN_STEM
110
+ w = stem
111
+ case w
112
+ when /(at|bl|iz)$/ then w << "e"
113
+ when /([^aeiouylsz])\1$/ then w.chop!
114
+ when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
115
+ end
116
+ end
117
+ end
118
+
119
+ if w =~ /y$/
120
+ stem = $`
121
+ w = stem + "i" if stem =~ VOWEL_IN_STEM
122
+ end
123
+
124
+ # Step 2
125
+ if w =~ SUFFIX_1_REGEXP
126
+ stem = $`
127
+ suffix = $1
128
+ # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
129
+ w = stem + STEP_2_LIST[suffix] if stem =~ MGR0
130
+ end
131
+
132
+ # Step 3
133
+ if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
134
+ stem = $`
135
+ suffix = $1
136
+ w = stem + STEP_3_LIST[suffix] if stem =~ MGR0
137
+ end
138
+
139
+ # Step 4
140
+ if w =~ SUFFIX_2_REGEXP
141
+ stem = $`
142
+ w = stem if stem =~ MGR1
143
+ elsif w =~ /(s|t)(ion)$/
144
+ stem = $` + $1
145
+ w = stem if stem =~ MGR1
146
+ end
147
+
148
+ # Step 5
149
+ if w =~ /e$/
150
+ stem = $`
151
+ w = stem if (stem =~ MGR1) || (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
152
+ end
153
+
154
+ w.chop! if w =~ /ll$/ && w =~ MGR1
155
+
156
+ # and turn initial Y back to y
157
+ w[0] = "y" if w[0] == "Y"
158
+ w
159
+ end
160
+
161
+ # make the stem_porter the default stem method, just in case we
162
+ # feel like having multiple stemmers available later.
163
+ alias stem stem_porter
164
+ end
165
+
166
+ # Add stem method to all Strings
167
+ class String
168
+ include Stemmable
169
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class EngTagger
2
- VERSION = "0.3.2"
4
+ VERSION = "0.4.1"
3
5
  end