porter2stemmer 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +13 -0
- data/Gemfile.lock +20 -0
- data/LICENSE.txt +20 -0
- data/README.md +58 -0
- data/README.rdoc +67 -0
- data/Rakefile +54 -0
- data/VERSION +1 -0
- data/lib/porter2stemmer/constants.rb +114 -0
- data/lib/porter2stemmer/implementation.rb +326 -0
- data/lib/porter2stemmer.rb +7 -0
- data/pkg/porter2stemmer-1.0.0.gem +0 -0
- data/rdoc/Porter2.html +249 -0
- data/rdoc/README_rdoc.html +204 -0
- data/rdoc/String.html +1142 -0
- data/rdoc/created.rid +6 -0
- data/rdoc/index.html +94 -0
- data/rdoc/lib/porter2stemmer/constants_rb.html +55 -0
- data/rdoc/lib/porter2stemmer/implementation_rb.html +55 -0
- data/rdoc/lib/porter2stemmer_rb.html +59 -0
- data/rdoc/rdoc.css +706 -0
- data/test/helper.rb +7 -0
- data/test/test_porter2stemmer.rb +10 -0
- data/test/test_porter2stemmer_full.rb +29533 -0
- data/test/test_porter2stemmer_parts.rb +307 -0
- metadata +127 -0
data/Gemfile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Gem dependencies for porter2stemmer, resolved by Bundler.
#
# Gems are fetched over HTTPS so that downloads cannot be tampered with
# in transit.
source "https://rubygems.org"
# Add dependencies required to use your gem here.
# Example:
#   gem "activesupport", ">= 2.3.5"

# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
group :development do
  gem "shoulda", ">= 0"
  gem "bundler", "~> 1.0.0"
  gem "jeweler", "~> 1.5.2"
  gem "rcov", ">= 0"
end
|
data/Gemfile.lock
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
git (1.2.5)
|
5
|
+
jeweler (1.5.2)
|
6
|
+
bundler (~> 1.0.0)
|
7
|
+
git (>= 1.2.5)
|
8
|
+
rake
|
9
|
+
rake (0.9.2)
|
10
|
+
rcov (0.9.9)
|
11
|
+
shoulda (2.11.3)
|
12
|
+
|
13
|
+
PLATFORMS
|
14
|
+
ruby
|
15
|
+
|
16
|
+
DEPENDENCIES
|
17
|
+
bundler (~> 1.0.0)
|
18
|
+
jeweler (~> 1.5.2)
|
19
|
+
rcov
|
20
|
+
shoulda
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011 Neil Smith
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
The Porter 2 stemmer
|
2
|
+
====================
|
3
|
+
This is the Porter 2 stemming algorithm, as described at
|
4
|
+
http://snowball.tartarus.org/algorithms/english/stemmer.html
|
5
|
+
The original paper is:
|
6
|
+
|
7
|
+
Porter, 1980, "An algorithm for suffix stripping", _Program_, Vol. 14,
|
8
|
+
no. 3, pp 130-137
|
9
|
+
|
10
|
+
Features of this implementation
|
11
|
+
===============================
|
12
|
+
This stemmer is written in pure Ruby, making it easy to modify for language variants.
|
13
|
+
For instance, the original Porter stemmer only works for American English and does
|
14
|
+
not recognise British English's '-ise' as an alternate spelling of '-ize'. This
|
15
|
+
implementation has been extended to handle British English correctly.
|
16
|
+
|
17
|
+
This stemmer also features a comprehensive test set of over 29,000 words, taken from the
|
18
|
+
[Porter 2 stemmer website](http://snowball.tartarus.org/algorithms/english/stemmer.html).
|
19
|
+
|
20
|
+
Files
|
21
|
+
=====
|
22
|
+
Constants for the stemmer are in the Porter2 module.
|
23
|
+
|
24
|
+
Procedures that implement the stemmer are added to the String class.
|
25
|
+
|
26
|
+
The stemmer algorithm is implemented in the String#porter2_stem procedure.
|
27
|
+
|
28
|
+
Internationalisation
|
29
|
+
====================
|
30
|
+
There isn't much, as this is a stemmer that only works for English.
|
31
|
+
|
32
|
+
The `gb_english` flag to the various procedures allows the stemmer to treat the British
|
33
|
+
English '-ise' the same as the American English '-ize'.
|
34
|
+
|
35
|
+
Longest suffixes
|
36
|
+
================
|
37
|
+
Several places in the algorithm require matching the longest suffix of a word. The
|
38
|
+
regexp engine in Ruby 1.9 seems to handle alternatives in regexps by finding the
|
39
|
+
alternative that matches at the first position in the string. As we're only talking
|
40
|
+
about suffixes, that first match is also the longest suffix. If the regexp engine changes,
|
41
|
+
this behaviour may change and break the stemmer.
|
42
|
+
|
43
|
+
Usage
|
44
|
+
=====
|
45
|
+
Call the String#porter2_stem or String#stem methods on a string to return its stem
|
46
|
+
"consistency".stem # => "consist"
|
47
|
+
"knitting".stem # => "knit"
|
48
|
+
"articulated".stem # => "articul"
|
49
|
+
"nationalize".stem # => "nation"
|
50
|
+
"nationalise".stem # => "nationalis"
|
51
|
+
"nationalise".stem(true) # => "nation"
|
52
|
+
|
53
|
+
Author
|
54
|
+
======
|
55
|
+
The Porter 2 stemming algorithm was developed by
|
56
|
+
[Martin Porter](http://snowball.tartarus.org/algorithms/english/stemmer.html).
|
57
|
+
This implementation is by [Neil Smith](http://www.njae.me.uk).
|
58
|
+
|
data/README.rdoc
ADDED
@@ -0,0 +1,67 @@
|
|
1
|
+
= porter2stemmer
|
2
|
+
|
3
|
+
==The Porter 2 stemmer
|
4
|
+
This is the Porter 2 stemming algorithm, as described at
|
5
|
+
http://snowball.tartarus.org/algorithms/english/stemmer.html
|
6
|
+
The original paper is:
|
7
|
+
|
8
|
+
Porter, 1980, "An algorithm for suffix stripping", _Program_, Vol. 14, no. 3, pp 130-137
|
9
|
+
|
10
|
+
==Features of this implementation
|
11
|
+
This stemmer is written in pure Ruby, making it easy to modify for language variants.
|
12
|
+
For instance, the original Porter stemmer only works for American English and does
|
13
|
+
not recognise British English's '-ise' as an alternate spelling of '-ize'. This
|
14
|
+
implementation has been extended to handle British English correctly.
|
15
|
+
|
16
|
+
This stemmer also features a comprehensive test set of over 29,000 words, taken from the
|
17
|
+
{Porter 2 stemmer website}[http://snowball.tartarus.org/algorithms/english/stemmer.html].
|
18
|
+
|
19
|
+
==Files
|
20
|
+
Constants for the stemmer are in the Porter2 module.
|
21
|
+
|
22
|
+
Procedures that implement the stemmer are added to the String class.
|
23
|
+
|
24
|
+
The stemmer algorithm is implemented in the String#porter2_stem procedure.
|
25
|
+
|
26
|
+
==Internationalisation
|
27
|
+
There isn't much, as this is a stemmer that only works for English.
|
28
|
+
|
29
|
+
The +gb_english+ flag to the various procedures allows the stemmer to treat the British
|
30
|
+
English '-ise' the same as the American English '-ize'.
|
31
|
+
|
32
|
+
==Longest suffixes
|
33
|
+
Several places in the algorithm require matching the longest suffix of a word. The
|
34
|
+
regexp engine in Ruby 1.9 seems to handle alternatives in regexps by finding the
|
35
|
+
alternative that matches at the first position in the string. As we're only talking
|
36
|
+
about suffixes, that first match is also the longest suffix. If the regexp engine changes,
|
37
|
+
this behaviour may change and break the stemmer.
|
38
|
+
|
39
|
+
==Usage
|
40
|
+
Call the String#porter2_stem or String#stem methods on a string to return its stem
|
41
|
+
"consistency".stem # => "consist"
|
42
|
+
"knitting".stem # => "knit"
|
43
|
+
"articulated".stem # => "articul"
|
44
|
+
"nationalize".stem # => "nation"
|
45
|
+
"nationalise".stem # => "nationalis"
|
46
|
+
"nationalise".stem(true) # => "nation"
|
47
|
+
|
48
|
+
==Author
|
49
|
+
The Porter 2 stemming algorithm was developed by
|
50
|
+
{Martin Porter}[http://snowball.tartarus.org/algorithms/english/stemmer.html].
|
51
|
+
This implementation is by {Neil Smith}[http://www.njae.me.uk].
|
52
|
+
|
53
|
+
== Contributing to porter2stemmer
|
54
|
+
|
55
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
56
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
|
57
|
+
* Fork the project
|
58
|
+
* Start a feature/bugfix branch
|
59
|
+
* Commit and push until you are happy with your contribution
|
60
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
61
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or changing them is otherwise necessary, that is fine, but please isolate the change in its own commit so I can cherry-pick around it.
|
62
|
+
|
63
|
+
== Copyright
|
64
|
+
|
65
|
+
Copyright (c) 2011 Neil Smith. See LICENSE.txt for
|
66
|
+
further details.
|
67
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
# Rake tasks for the porter2stemmer gem: packaging (Jeweler), tests,
# coverage (rcov) and RDoc generation.
require 'rubygems'
require 'bundler'
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => e
  # Report what Bundler could not resolve and exit with its status code.
  $stderr.puts e.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit e.status_code
end
require 'rake'

require 'jeweler'
Jeweler::Tasks.new do |gem|
  # gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
  gem.name = "porter2stemmer"
  gem.homepage = "http://github.com/NeilNjae/porter2stemmer"
  gem.license = "MIT"
  gem.summary = %Q{A pure Ruby implementation of the Porter 2 stemmer}
  gem.description = %Q{This is the Porter 2 stemming algorithm, as described at
http://snowball.tartarus.org/algorithms/english/stemmer.html It removes suffixes from English words, revealing something closer to the stem of the word. For instance "acknowledgments".stem => "acknowledg".}
  gem.email = "neil.github@njae.me.uk"
  gem.authors = ["Neil Smith"]
  # Include your dependencies below. Runtime dependencies are required when using your gem,
  # and development dependencies are only needed for development (ie running rake tasks, tests, etc)
  #  gem.add_runtime_dependency 'jabber4r', '> 0.1'
  #  gem.add_development_dependency 'rspec', '> 1.2.3'
end
Jeweler::RubygemsDotOrgTasks.new

# `rake test` runs every test/**/test_*.rb file.
require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

# `rake rcov` runs the same test files under rcov.
require 'rcov/rcovtask'
Rcov::RcovTask.new do |test|
  test.libs << 'test'
  test.pattern = 'test/**/test_*.rb'
  test.verbose = true
end

task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  # Strip the VERSION contents so a trailing newline in the file does not
  # end up inside the generated documentation title.
  version = File.exist?('VERSION') ? File.read('VERSION').strip : ""

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "porter2stemmer #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
1.0.0
|
@@ -0,0 +1,114 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
# Constants for the Porter 2 stemmer
|
4
|
+
# Constants for the Porter 2 stemmer.
#
# The single-character and syllable constants are regexp fragments held as
# strings, intended to be interpolated into regexps. All constants are frozen
# so that the shared tables cannot be modified by accident; take a +dup+
# (or +merge+) to obtain a modifiable copy of a table.
module Porter2

  # A non-vowel
  C = "[^aeiouy]".freeze

  # A vowel: a e i o u y
  V = "[aeiouy]".freeze

  # A non-vowel other than w, x, or Y
  CW = "[^aeiouywxY]".freeze

  # Doubles created when adding a suffix: these are undoubled when stemmed
  Double = "(bb|dd|ff|gg|mm|nn|pp|rr|tt)".freeze

  # A valid letter that can come before 'li' (or 'ly')
  Valid_LI = "[cdeghkmnrt]".freeze

  # A specification for a short syllable.
  #
  # A short syllable in a word is either:
  # 1. a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or
  # 2. a vowel at the beginning of the word followed by a non-vowel.
  #
  # (The original document is silent on whether sequences of two or more non-vowels make a
  # syllable long. But as this specification is only used to find sequences of non-vowel -
  # vowel - non-vowel - end-of-word, this ambiguity does not have an effect.)
  SHORT_SYLLABLE = "((#{C}#{V}#{CW})|(^#{V}#{C}))".freeze

  # Suffix transformations used in porter2_step2.
  # (ogi, li endings dealt with in procedure)
  STEP_2_MAPS = { "tional" => "tion",
                  "enci" => "ence",
                  "anci" => "ance",
                  "abli" => "able",
                  "entli" => "ent",
                  "ization" => "ize",
                  "izer" => "ize",
                  "ational" => "ate",
                  "ation" => "ate",
                  "ator" => "ate",
                  "alism" => "al",
                  "aliti" => "al",
                  "alli" => "al",
                  "fulness" => "ful",
                  "ousli" => "ous",
                  "ousness" => "ous",
                  "iveness" => "ive",
                  "iviti" => "ive",
                  "biliti" => "ble",
                  "bli" => "ble",
                  "fulli" => "ful",
                  "lessli" => "less" }.freeze

  # Suffix transformations used in porter2_step3.
  # (ative ending dealt with in procedure)
  STEP_3_MAPS = { "tional" => "tion",
                  "ational" => "ate",
                  "alize" => "al",
                  "icate" => "ic",
                  "iciti" => "ic",
                  "ical" => "ic",
                  "ful" => "",
                  "ness" => "" }.freeze

  # Suffix transformations used in porter2_step4.
  # (ion ending dealt with in procedure)
  STEP_4_MAPS = { "al" => "",
                  "ance" => "",
                  "ence" => "",
                  "er" => "",
                  "ic" => "",
                  "able" => "",
                  "ible" => "",
                  "ant" => "",
                  "ement" => "",
                  "ment" => "",
                  "ent" => "",
                  "ism" => "",
                  "ate" => "",
                  "iti" => "",
                  "ous" => "",
                  "ive" => "",
                  "ize" => "" }.freeze

  # Special-case stemmings: irregular forms first, then words that are
  # returned unchanged.
  # (Only the hash itself is frozen; the value strings are left unfrozen
  # because they are handed back to callers as stems.)
  SPECIAL_CASES = { "skis" => "ski",
                    "skies" => "sky",

                    "dying" => "die",
                    "lying" => "lie",
                    "tying" => "tie",
                    "idly" => "idl",
                    "gently" => "gentl",
                    "ugly" => "ugli",
                    "early" => "earli",
                    "only" => "onli",
                    "singly" => "singl",

                    "sky" => "sky",
                    "news" => "news",
                    "howe" => "howe",
                    "atlas" => "atlas",
                    "cosmos" => "cosmos",
                    "bias" => "bias",
                    "andes" => "andes" }.freeze

  # Special case words to stop processing after step 1a.
  STEP_1A_SPECIAL_CASES = %w[ inning outing canning herring earring proceed exceed succeed ].freeze

end
|
114
|
+
|
@@ -0,0 +1,326 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
# Implementation of the Porter 2 stemmer. String#porter2_stem is the main stemming procedure.
|
4
|
+
|
5
|
+
# Porter 2 stemming procedures, added to String. String#porter2_stem (aliased
# as String#stem) is the entry point; the porter2_step* methods implement the
# individual steps of the algorithm and each return the transformed word
# without modifying the receiver. Regexp fragments and suffix tables come from
# the Porter2 module.
class String
  # Tidy up the word before we get down to the algorithm: strip surrounding
  # whitespace, downcase it, and map typographic quotes to plain apostrophes.
  # Returns a new string.
  def porter2_tidy
    preword = to_s.strip.downcase

    # map apostrophe-like characters to apostrophes
    preword.gsub!(/‘/, "'")
    preword.gsub!(/’/, "'")

    preword
  end


  # Preprocess the word.
  # Remove any initial ', if present. Then, set initial y, or y after a vowel, to Y
  #
  # (The comment to 'establish the regions R1 and R2' in the original description
  # is an implementation optimisation that identifies where the regions start. As
  # no modifications are made to the word that affect those positions, you may want
  # to cache them now. This implementation doesn't do that.)
  def porter2_preprocess
    w = dup

    # remove any initial apostrophe
    w.gsub!(/^'*(.)/, '\1')

    # set initial y, or y after a vowel, to Y
    w.gsub!(/^y/, "Y")
    w.gsub!(/(#{Porter2::V})y/, '\1Y')

    w
  end


  # R1 is the portion of the word after the first non-vowel after the first vowel
  # (with words beginning 'gener-', 'commun-', and 'arsen-' treated as special cases).
  # Returns "" when there is no such portion.
  def porter2_r1
    if self =~ /^(gener|commun|arsen)(?<r1>.*)/
      Regexp.last_match(:r1)
    else
      self =~ /#{Porter2::V}#{Porter2::C}(?<r1>.*)$/
      Regexp.last_match(:r1) || ""
    end
  end


  # R2 is the portion of R1 (porter2_r1) after the first non-vowel after the first vowel.
  # Returns "" when there is no such portion.
  def porter2_r2
    porter2_r1 =~ /#{Porter2::V}#{Porter2::C}(?<r2>.*)$/
    Regexp.last_match(:r2) || ""
  end


  # Returns true if the word ends with a short syllable
  def porter2_ends_with_short_syllable?
    self =~ /#{Porter2::SHORT_SYLLABLE}$/ ? true : false
  end


  # A word is short if it ends in a short syllable, and R1 is null
  def porter2_is_short_word?
    porter2_ends_with_short_syllable? && porter2_r1.empty?
  end


  # Search for the longest among the suffixes,
  # * '
  # * 's
  # * 's'
  # and remove if found.
  #
  # Returns a new string; the receiver is not modified, so this also works
  # on frozen strings.
  def porter2_step0
    sub(/(.)('s'|'s|')$/, '\1')
  end


  # Search for the longest among the following suffixes, and perform the action indicated.
  # sses:: replace by ss
  # ied, ies:: replace by i if preceded by more than one letter, otherwise by ie
  # s:: delete if the preceding word part contains a vowel not immediately before the s
  # us, ss:: do nothing
  def porter2_step1a
    case self
    when /sses$/ then sub(/sses$/, 'ss')
    when /..(ied|ies)$/ then sub(/(ied|ies)$/, 'i')
    when /(ied|ies)$/ then sub(/(ied|ies)$/, 'ie')
    when /(us|ss)$/ then self
    when /#{Porter2::V}.+s$/ then sub(/s$/, '')
    else self
    end
  end


  # Search for the longest among the following suffixes, and perform the action indicated.
  # eed, eedly:: replace by ee if the suffix is also in R1
  # ed, edly, ing, ingly:: delete if the preceding word part contains a vowel and,
  #                        after the deletion:
  #                        * if the word ends at, bl or iz: add e, or
  #                        * if the word ends with a double: remove the last letter, or
  #                        * if the word is short: add e
  #
  # (If gb_english is +true+, treat the 'is' suffix as 'iz' above.)
  def porter2_step1b(gb_english = false)
    if self =~ /(eed|eedly)$/
      return self unless porter2_r1 =~ /(eed|eedly)$/
      return sub(/(eed|eedly)$/, 'ee')
    end

    return dup unless self =~ /#{Porter2::V}.*(ed|edly|ing|ingly)$/

    w = sub(/(ed|edly|ing|ingly)$/, '')
    if w =~ /(at|bl|iz)$/ || (gb_english && w =~ /is$/)
      w + 'e'
    elsif w =~ /#{Porter2::Double}$/
      w.chop
    elsif w.porter2_is_short_word?
      w + 'e'
    else
      w
    end
  end


  # Replace a suffix of y or Y by i if it is preceded by a non-vowel which is
  # not the first letter of the word.
  def porter2_step1c
    if self =~ /.+#{Porter2::C}(y|Y)$/
      sub(/(y|Y)$/, 'i')
    else
      self
    end
  end


  # Search for the longest among the suffixes listed in the keys of Porter2::STEP_2_MAPS.
  # If one is found and that suffix occurs in R1, replace it with the value
  # found in STEP_2_MAPS.
  #
  # (Suffixes 'ogi' and 'li' are treated as special cases in the procedure.)
  #
  # (If gb_english is +true+, replace the 'iser' and 'isation' suffixes with
  # 'ise', similarly to how 'izer' and 'ization' are treated.)
  def porter2_step2(gb_english = false)
    r1 = porter2_r1
    s2m = Porter2::STEP_2_MAPS
    s2m = s2m.merge("iser" => "ise", "isation" => "ise") if gb_english

    suffix = porter2_longest_suffix(s2m.keys)
    if suffix
      # A listed suffix was found: it is replaced only when it lies in R1,
      # and the 'li' / 'ogi' rules below are not tried in either case.
      r1 =~ /#{suffix}$/ ? sub(/#{suffix}$/, s2m[suffix]) : self
    elsif r1 =~ /li$/ && self =~ /(#{Porter2::Valid_LI})li$/
      sub(/li$/, '')
    elsif r1 =~ /ogi$/ && self =~ /logi$/
      sub(/ogi$/, 'og')
    else
      self
    end
  end


  # Search for the longest among the suffixes listed in the keys of Porter2::STEP_3_MAPS.
  # If one is found and that suffix occurs in R1, replace it with the value
  # found in STEP_3_MAPS.
  #
  # (Suffix 'ative' is treated as a special case in the procedure.)
  #
  # (If gb_english is +true+, replace the 'alise' suffix with
  # 'al', similarly to how 'alize' is treated.)
  def porter2_step3(gb_english = false)
    return sub(/ative$/, '') if self =~ /ative$/ && porter2_r2 =~ /ative$/

    s3m = Porter2::STEP_3_MAPS
    s3m = s3m.merge("alise" => "al") if gb_english

    suffix = porter2_longest_suffix(s3m.keys)
    if suffix && porter2_r1 =~ /#{suffix}$/
      sub(/#{suffix}$/, s3m[suffix])
    else
      self
    end
  end


  # Search for the longest among the suffixes listed in the keys of Porter2::STEP_4_MAPS.
  # If one is found and that suffix occurs in R2, replace it with the value
  # found in STEP_4_MAPS.
  #
  # (Suffix 'ion' is treated as a special case in the procedure.)
  #
  # (If gb_english is +true+, delete the 'ise' suffix if found.)
  def porter2_step4(gb_english = false)
    r2 = porter2_r2
    return sub(/ion$/, '') if r2 =~ /ion$/ && self =~ /(s|t)ion$/

    s4m = Porter2::STEP_4_MAPS
    s4m = s4m.merge("ise" => "") if gb_english

    suffix = porter2_longest_suffix(s4m.keys)
    # The suffix must sit at the end of R2 (anchored, as in steps 2 and 3).
    if suffix && r2 =~ /#{suffix}$/
      sub(/#{suffix}$/, s4m[suffix])
    else
      self
    end
  end


  # Search for the following suffixes, and, if found, perform the action indicated.
  # e:: delete if in R2, or in R1 and not preceded by a short syllable
  # l:: delete if in R2 and preceded by l
  def porter2_step5
    r2 = porter2_r2
    if self =~ /ll$/ && r2 =~ /l$/
      sub(/ll$/, 'l')
    elsif self =~ /e$/ && r2 =~ /e$/
      sub(/e$/, '')
    elsif self =~ /e$/ && porter2_r1 =~ /e$/ && self !~ /#{Porter2::SHORT_SYLLABLE}e$/
      sub(/e$/, '')
    else
      self
    end
  end


  # Turn all Y letters into y
  def porter2_postprocess
    gsub(/Y/, 'y')
  end

  public

  # Perform the stemming procedure. If +gb_english+ is true, treat '-ise' and similar suffixes
  # as '-ize' in American English.
  #
  # Words of two letters or fewer are returned as tidied. Special-case stems
  # are returned as copies, so mutating the result never alters
  # Porter2::SPECIAL_CASES.
  def porter2_stem(gb_english = false)
    preword = porter2_tidy
    return preword if preword.length <= 2

    word = preword.porter2_preprocess
    return Porter2::SPECIAL_CASES[word].dup if Porter2::SPECIAL_CASES.key?(word)

    w1a = word.porter2_step0.porter2_step1a
    return w1a if Porter2::STEP_1A_SPECIAL_CASES.include?(w1a)

    w1a.porter2_step1b(gb_english)
       .porter2_step1c
       .porter2_step2(gb_english)
       .porter2_step3(gb_english)
       .porter2_step4(gb_english)
       .porter2_step5
       .porter2_postprocess
  end

  # A verbose version of porter2_stem that prints the output of each stage to STDOUT
  def porter2_stem_verbose(gb_english = false)
    preword = porter2_tidy
    puts "Preword: #{preword}"
    return preword if preword.length <= 2

    word = preword.porter2_preprocess
    puts "Preprocessed: #{word}"

    if Porter2::SPECIAL_CASES.key?(word)
      puts "Returning #{word} as special case #{Porter2::SPECIAL_CASES[word]}"
      return Porter2::SPECIAL_CASES[word].dup
    end

    r1 = word.porter2_r1
    r2 = word.porter2_r2
    puts "R1 = #{r1}, R2 = #{r2}"

    w0 = word.porter2_step0
    puts "After step 0:  #{w0} (R1 = #{w0.porter2_r1}, R2 = #{w0.porter2_r2})"
    w1a = w0.porter2_step1a
    puts "After step 1a: #{w1a} (R1 = #{w1a.porter2_r1}, R2 = #{w1a.porter2_r2})"

    if Porter2::STEP_1A_SPECIAL_CASES.include?(w1a)
      puts "Returning #{w1a} as 1a special case"
      return w1a
    end

    w1b = w1a.porter2_step1b(gb_english)
    puts "After step 1b: #{w1b} (R1 = #{w1b.porter2_r1}, R2 = #{w1b.porter2_r2})"
    w1c = w1b.porter2_step1c
    puts "After step 1c: #{w1c} (R1 = #{w1c.porter2_r1}, R2 = #{w1c.porter2_r2})"
    w2 = w1c.porter2_step2(gb_english)
    puts "After step 2:  #{w2} (R1 = #{w2.porter2_r1}, R2 = #{w2.porter2_r2})"
    w3 = w2.porter2_step3(gb_english)
    puts "After step 3:  #{w3} (R1 = #{w3.porter2_r1}, R2 = #{w3.porter2_r2})"
    w4 = w3.porter2_step4(gb_english)
    puts "After step 4:  #{w4} (R1 = #{w4.porter2_r1}, R2 = #{w4.porter2_r2})"
    w5 = w4.porter2_step5
    puts "After step 5:  #{w5}"
    wpost = w5.porter2_postprocess
    puts "After postprocess: #{wpost}"
    wpost
  end

  alias stem porter2_stem

  private

  # Returns the longest of +suffixes+ that the word ends with, or nil if the
  # word ends with none of them. Each suffix is anchored at the end of the
  # word, so the leftmost regexp match is also the longest matching suffix.
  def porter2_longest_suffix(suffixes)
    self[Regexp.union(suffixes.map { |s| Regexp.new(s + "$") })]
  end

end
|
326
|
+
|
Binary file
|