engtagger 0.3.2 → 0.4.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,192 +1,169 @@
1
- #!/usr/bin/env ruby
2
- # -*- coding: utf-8 -*-
3
-
4
- module Stemmable
5
-
6
- STEP_2_LIST = {
7
- 'ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',
8
- 'izer'=>'ize', 'bli'=>'ble',
9
- 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e', 'ousli'=>'ous',
10
- 'ization'=>'ize', 'ation'=>'ate',
11
- 'ator'=>'ate', 'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful',
12
- 'ousness'=>'ous', 'aliti'=>'al',
13
- 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
14
- }
15
-
16
- STEP_3_LIST = {
17
- 'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
18
- 'ical'=>'ic', 'ful'=>'', 'ness'=>''
19
- }
20
-
21
-
22
- SUFFIX_1_REGEXP = /(
23
- ational |
24
- tional |
25
- enci |
26
- anci |
27
- izer |
28
- bli |
29
- alli |
30
- entli |
31
- eli |
32
- ousli |
33
- ization |
34
- ation |
35
- ator |
36
- alism |
37
- iveness |
38
- fulness |
39
- ousness |
40
- aliti |
41
- iviti |
42
- biliti |
43
- logi)$/x
44
-
45
-
46
- SUFFIX_2_REGEXP = /(
47
- al |
48
- ance |
49
- ence |
50
- er |
51
- ic |
52
- able |
53
- ible |
54
- ant |
55
- ement |
56
- ment |
57
- ent |
58
- ou |
59
- ism |
60
- ate |
61
- iti |
62
- ous |
63
- ive |
64
- ize)$/x
65
-
66
-
67
- C = "[^aeiou]" # consonant
68
- V = "[aeiouy]" # vowel
69
- CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
70
- VV = "#{V}(?>[aeiou]*)" # vowel sequence
71
-
72
- MGR0 = /^(#{CC})?#{VV}#{CC}/o # [cc]vvcc... is m>0
73
- MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o # [cc]vvcc[vv] is m=1
74
- MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o # [cc]vvccvvcc... is m>1
75
- VOWEL_IN_STEM = /^(#{CC})?#{V}/o # vowel in stem
76
-
77
- #
78
- # Porter stemmer in Ruby.
79
- #
80
- # This is the Porter stemming algorithm, ported to Ruby from the
81
- # version coded up in Perl. It's easy to follow against the rules
82
- # in the original paper in:
83
- #
84
- # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
85
- # no. 3, pp 130-137,
86
- #
87
- # See also http://www.tartarus.org/~martin/PorterStemmer
88
- #
89
- # Send comments to raypereda@hotmail.com
90
- #
91
-
92
- def stem_porter
93
-
94
- # make a copy of the given object and convert it to a string.
95
- w = self.dup.to_str
96
-
97
- return w if w.length < 3
98
-
99
- # now map initial y to Y so that the patterns never treat it as vowel
100
- w[0] = 'Y' if w[0] == ?y
101
-
102
- # Step 1a
103
- if w =~ /(ss|i)es$/
104
- w = $` + $1
105
- elsif w =~ /([^s])s$/
106
- w = $` + $1
107
- end
108
-
109
- # Step 1b
110
- if w =~ /eed$/
111
- w.chop! if $` =~ MGR0
112
- elsif w =~ /(ed|ing)$/
113
- stem = $`
114
- if stem =~ VOWEL_IN_STEM
115
- w = stem
116
- case w
117
- when /(at|bl|iz)$/ then w << "e"
118
- when /([^aeiouylsz])\1$/ then w.chop!
119
- when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
120
- end
121
- end
122
- end
123
-
124
- if w =~ /y$/
125
- stem = $`
126
- w = stem + "i" if stem =~ VOWEL_IN_STEM
127
- end
128
-
129
- # Step 2
130
- if w =~ SUFFIX_1_REGEXP
131
- stem = $`
132
- suffix = $1
133
- # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
134
- if stem =~ MGR0
135
- w = stem + STEP_2_LIST[suffix]
136
- end
137
- end
138
-
139
- # Step 3
140
- if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
141
- stem = $`
142
- suffix = $1
143
- if stem =~ MGR0
144
- w = stem + STEP_3_LIST[suffix]
145
- end
146
- end
147
-
148
- # Step 4
149
- if w =~ SUFFIX_2_REGEXP
150
- stem = $`
151
- if stem =~ MGR1
152
- w = stem
153
- end
154
- elsif w =~ /(s|t)(ion)$/
155
- stem = $` + $1
156
- if stem =~ MGR1
157
- w = stem
158
- end
159
- end
160
-
161
- # Step 5
162
- if w =~ /e$/
163
- stem = $`
164
- if (stem =~ MGR1) ||
165
- (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
166
- w = stem
167
- end
168
- end
169
-
170
- if w =~ /ll$/ && w =~ MGR1
171
- w.chop!
172
- end
173
-
174
- # and turn initial Y back to y
175
- w[0] = 'y' if w[0] == ?Y
176
-
177
- w
178
- end
179
-
180
-
181
- #
182
- # make the stem_porter the default stem method, just in case we
183
- # feel like having multiple stemmers available later.
184
- #
185
- alias stem stem_porter
186
-
187
- end
188
-
189
- # Add stem method to all Strings
190
- class String
191
- include Stemmable
192
- end
1
+ # frozen_string_literal: true
2
+
3
+ module Stemmable
4
+ STEP_2_LIST = {
5
+ "ational" => "ate", "tional" => "tion", "enci" => "ence", "anci" => "ance",
6
+ "izer" => "ize", "bli" => "ble",
7
+ "alli" => "al", "entli" => "ent", "eli" => "e", "ousli" => "ous",
8
+ "ization" => "ize", "ation" => "ate",
9
+ "ator" => "ate", "alism" => "al", "iveness" => "ive", "fulness" => "ful",
10
+ "ousness" => "ous", "aliti" => "al",
11
+ "iviti" => "ive", "biliti" => "ble", "logi" => "log"
12
+ }.freeze
13
+
14
+ STEP_3_LIST = {
15
+ "icate" => "ic", "ative" => "", "alize" => "al", "iciti" => "ic",
16
+ "ical" => "ic", "ful" => "", "ness" => ""
17
+ }.freeze
18
+
19
+ SUFFIX_1_REGEXP = /(
20
+ ational |
21
+ tional |
22
+ enci |
23
+ anci |
24
+ izer |
25
+ bli |
26
+ alli |
27
+ entli |
28
+ eli |
29
+ ousli |
30
+ ization |
31
+ ation |
32
+ ator |
33
+ alism |
34
+ iveness |
35
+ fulness |
36
+ ousness |
37
+ aliti |
38
+ iviti |
39
+ biliti |
40
+ logi)$/x.freeze
41
+
42
+ SUFFIX_2_REGEXP = /(
43
+ al |
44
+ ance |
45
+ ence |
46
+ er |
47
+ ic |
48
+ able |
49
+ ible |
50
+ ant |
51
+ ement |
52
+ ment |
53
+ ent |
54
+ ou |
55
+ ism |
56
+ ate |
57
+ iti |
58
+ ous |
59
+ ive |
60
+ ize)$/x.freeze
61
+
62
+ C = "[^aeiou]" # consonant
63
+ V = "[aeiouy]" # vowel
64
+ CC = "#{C}(?>[^aeiouy]*)" # consonant sequence
65
+ VV = "#{V}(?>[aeiou]*)" # vowel sequence
66
+
67
+ MGR0 = /^(#{CC})?#{VV}#{CC}/o.freeze # [cc]vvcc... is m>0
68
+ MEQ1 = /^(#{CC})?#{VV}#{CC}(#{VV})?$/o.freeze # [cc]vvcc[vv] is m=1
69
+ MGR1 = /^(#{CC})?#{VV}#{CC}#{VV}#{CC}/o.freeze # [cc]vvccvvcc... is m>1
70
+ VOWEL_IN_STEM = /^(#{CC})?#{V}/o.freeze # vowel in stem
71
+
72
+ # Porter stemmer in Ruby.
73
+ #
74
+ # This is the Porter stemming algorithm, ported to Ruby from the
75
+ # version coded up in Perl. It's easy to follow against the rules
76
+ # in the original paper:
77
+ #
78
+ # Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
79
+ # no. 3, pp. 130-137.
80
+ #
81
+ # See also http://www.tartarus.org/~martin/PorterStemmer
82
+ #
83
+ # Send comments to raypereda@hotmail.com
84
+ #
85
+
86
+ def stem_porter
87
+ # make a copy of the given object and convert it to a string.
88
+ w = dup.to_str
89
+
90
+ return w if w.length < 3
91
+
92
+ # now map initial y to Y so that the patterns never treat it as a vowel
93
+ w[0] = "Y" if w[0] == "y"
94
+
95
+ # Step 1a
96
+ case w
97
+ when /(ss|i)es$/
98
+ w = $` + $1
99
+ when /([^s])s$/
100
+ w = $` + $1
101
+ end
102
+
103
+ # Step 1b
104
+ case w
105
+ when /eed$/
106
+ w.chop! if $` =~ MGR0
107
+ when /(ed|ing)$/
108
+ stem = $`
109
+ if stem =~ VOWEL_IN_STEM
110
+ w = stem
111
+ case w
112
+ when /(at|bl|iz)$/ then w << "e"
113
+ when /([^aeiouylsz])\1$/ then w.chop!
114
+ when /^#{CC}#{V}[^aeiouwxy]$/o then w << "e"
115
+ end
116
+ end
117
+ end
118
+
119
+ if w =~ /y$/
120
+ stem = $`
121
+ w = stem + "i" if stem =~ VOWEL_IN_STEM
122
+ end
123
+
124
+ # Step 2
125
+ if w =~ SUFFIX_1_REGEXP
126
+ stem = $`
127
+ suffix = $1
128
+ # print "stem= " + stem + "\n" + "suffix=" + suffix + "\n"
129
+ w = stem + STEP_2_LIST[suffix] if stem =~ MGR0
130
+ end
131
+
132
+ # Step 3
133
+ if w =~ /(icate|ative|alize|iciti|ical|ful|ness)$/
134
+ stem = $`
135
+ suffix = $1
136
+ w = stem + STEP_3_LIST[suffix] if stem =~ MGR0
137
+ end
138
+
139
+ # Step 4
140
+ if w =~ SUFFIX_2_REGEXP
141
+ stem = $`
142
+ w = stem if stem =~ MGR1
143
+ elsif w =~ /(s|t)(ion)$/
144
+ stem = $` + $1
145
+ w = stem if stem =~ MGR1
146
+ end
147
+
148
+ # Step 5
149
+ if w =~ /e$/
150
+ stem = $`
151
+ w = stem if (stem =~ MGR1) || (stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
152
+ end
153
+
154
+ w.chop! if w =~ /ll$/ && w =~ MGR1
155
+
156
+ # and turn initial Y back to y
157
+ w[0] = "y" if w[0] == "Y"
158
+ w
159
+ end
160
+
161
+ # make stem_porter the default stem method, just in case we
162
+ # feel like having multiple stemmers available later.
163
+ alias stem stem_porter
164
+ end
165
+
166
+ # Add stem method to all Strings
167
+ class String
168
+ include Stemmable
169
+ end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class EngTagger
2
- VERSION = "0.3.2"
4
+ VERSION = "0.4.1"
3
5
  end