engtagger 0.2.0 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.yardopts +5 -0
- data/Gemfile +1 -2
- data/README.md +19 -25
- data/engtagger.gemspec +4 -4
- data/lib/engtagger/porter.rb +12 -12
- data/lib/engtagger/pos_tags.hash +0 -0
- data/lib/engtagger/pos_words.hash +0 -0
- data/lib/engtagger/version.rb +2 -2
- data/lib/engtagger.rb +341 -290
- data/test/test_engtagger.rb +246 -201
- metadata +7 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: b3f1fc1d4e6d89d2920a0774342478d951bacd4558ff8c4054da719730ed0b9c
|
4
|
+
data.tar.gz: 2c9061d018dd63d699ad18713edf0f8ba74720632574e2ed2b530965c501abc5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 475e5093d071bee1fac32a98713dd3eadc51262fc61cd090fe54fc98aad68d9d0c544aae0c10374aa38ac17676f0db0dbabc18a34f393747c1b9a51ff4d687ad
|
7
|
+
data.tar.gz: 4bfc9068df3ce8cf4688c0475600c326302c4df5ed1bb13848eb64c200ffc9e2fba61edb9f8cd64d1c6cb47015384cc3020bec707ddfc74e941874c310cbed83
|
data/.gitignore
CHANGED
data/.yardopts
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -4,13 +4,13 @@ English Part-of-Speech Tagger Library; a Ruby port of Lingua::EN::Tagger
|
|
4
4
|
|
5
5
|
### Description
|
6
6
|
|
7
|
-
A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
|
8
|
-
tagger that assigns POS tags to English text based on a lookup dictionary and
|
9
|
-
a set of probability values. The tagger assigns appropriate tags based on
|
10
|
-
conditional probabilities--it examines the preceding tag to determine the
|
11
|
-
appropriate tag for the current word. Unknown words are classified according to
|
12
|
-
word morphology or can be set to be treated as nouns or other parts of speech.
|
13
|
-
The tagger also extracts as many nouns and noun phrases as it can, using a set
|
7
|
+
A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
|
8
|
+
tagger that assigns POS tags to English text based on a lookup dictionary and
|
9
|
+
a set of probability values. The tagger assigns appropriate tags based on
|
10
|
+
conditional probabilities--it examines the preceding tag to determine the
|
11
|
+
appropriate tag for the current word. Unknown words are classified according to
|
12
|
+
word morphology or can be set to be treated as nouns or other parts of speech.
|
13
|
+
The tagger also extracts as many nouns and noun phrases as it can, using a set
|
14
14
|
of regular expressions.
|
15
15
|
|
16
16
|
### Features
|
@@ -21,7 +21,6 @@ of regular expressions.
|
|
21
21
|
|
22
22
|
### Synopsis:
|
23
23
|
|
24
|
-
require 'rubygems'
|
25
24
|
require 'engtagger'
|
26
25
|
|
27
26
|
# Create a parser object
|
@@ -34,20 +33,20 @@ of regular expressions.
|
|
34
33
|
tagged = tgr.add_tags(text)
|
35
34
|
|
36
35
|
#=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj><nn>cat</nn> <pp>.</pp>"
|
37
|
-
|
36
|
+
|
38
37
|
# Get a list of all nouns and noun phrases with occurrence counts
|
39
38
|
word_list = tgr.get_words(text)
|
40
39
|
|
41
40
|
#=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
|
42
|
-
|
41
|
+
|
43
42
|
# Get a readable version of the tagged text
|
44
43
|
readable = tgr.get_readable(text)
|
45
|
-
|
44
|
+
|
46
45
|
#=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
|
47
46
|
|
48
47
|
# Get all nouns from a tagged output
|
49
48
|
nouns = tgr.get_nouns(tagged)
|
50
|
-
|
49
|
+
|
51
50
|
#=> {"cat"=>1, "Alice"=>1}
|
52
51
|
|
53
52
|
# Get all proper nouns
|
@@ -73,13 +72,13 @@ of regular expressions.
|
|
73
72
|
|
74
73
|
### Tag Set
|
75
74
|
|
76
|
-
The set of POS tags used here is a modified version of the Penn Treebank tagset. Tags with non-letter characters have been redefined to work better in our data structures. Also, the "Determiner" tag (DET) has been changed from 'DT', in order to avoid confusion with the HTML tag, `<DT>`.
|
75
|
+
The set of POS tags used here is a modified version of the Penn Treebank tagset. Tags with non-letter characters have been redefined to work better in our data structures. Also, the "Determiner" tag (DET) has been changed from 'DT', in order to avoid confusion with the HTML tag, `<DT>`.
|
77
76
|
|
78
77
|
CC Conjunction, coordinating and, or
|
79
78
|
CD Adjective, cardinal number 3, fifteen
|
80
79
|
DET Determiner this, each, some
|
81
80
|
EX Pronoun, existential there there
|
82
|
-
FW Foreign words
|
81
|
+
FW Foreign words
|
83
82
|
IN Preposition / Conjunction for, of, although, that
|
84
83
|
JJ Adjective happy, bad
|
85
84
|
JJR Adjective, comparative happier, worse
|
@@ -111,7 +110,7 @@ The set of POS tags used here is a modified version of the Penn Treebank tagset.
|
|
111
110
|
WP Pronoun, question who, whoever
|
112
111
|
WPS Determiner, possessive & question whose
|
113
112
|
WRB Adverb, question when, how, however
|
114
|
-
|
113
|
+
|
115
114
|
PP Punctuation, sentence ender ., !, ?
|
116
115
|
PPC Punctuation, comma ,
|
117
116
|
PPD Punctuation, dollar sign $
|
@@ -121,29 +120,24 @@ The set of POS tags used here is a modified version of the Penn Treebank tagset.
|
|
121
120
|
LRB Punctuation, left bracket (, {, [
|
122
121
|
RRB Punctuation, right bracket ), }, ]
|
123
122
|
|
124
|
-
### Requirements
|
125
|
-
|
126
|
-
* [Hpricot](http://code.whytheluckystiff.net/hpricot/) (optional)
|
127
|
-
|
128
123
|
### Install
|
129
124
|
|
130
|
-
|
125
|
+
gem install engtagger
|
131
126
|
|
132
127
|
### Author
|
133
128
|
|
134
|
-
of this Ruby library
|
129
|
+
of this Ruby library
|
135
130
|
|
136
|
-
* Yoichiro Hasebe (yohasebe [at] gmail.com)
|
131
|
+
* Yoichiro Hasebe (yohasebe [at] gmail.com)
|
137
132
|
|
138
133
|
### Contributors
|
139
134
|
|
140
|
-
|
141
|
-
* Phil London
|
135
|
+
Many thanks to the collaborators listed in the right column of this GitHub page.
|
142
136
|
|
143
137
|
### Acknowledgement
|
144
138
|
|
145
139
|
This Ruby library is a direct port of Lingua::EN::Tagger available at CPAN.
|
146
|
-
The credit for the crucial part of its algorithm/design therefore goes to
|
140
|
+
The credit for the crucial part of its algorithm/design therefore goes to
|
147
141
|
Aaron Coburn, the author of the original Perl version.
|
148
142
|
|
149
143
|
### License
|
data/engtagger.gemspec
CHANGED
@@ -4,14 +4,14 @@ require File.expand_path('../lib/engtagger/version', __FILE__)
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
5
|
gem.authors = ["Yoichiro Hasebe"]
|
6
6
|
gem.email = ["yohasebe@gmail.com"]
|
7
|
-
gem.summary = %q{A probability based, corpus-trained English POS tagger}
|
8
|
-
gem.description = %q{A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values.}
|
9
|
-
gem.homepage = "http://github.com/yohasebe/engtagger"
|
7
|
+
gem.summary = %q{A probability based, corpus-trained English POS tagger}
|
8
|
+
gem.description = %q{A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values.}
|
9
|
+
gem.homepage = "http://github.com/yohasebe/engtagger"
|
10
10
|
|
11
11
|
gem.files = `git ls-files`.split($\)
|
12
12
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
13
13
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
14
14
|
gem.name = "engtagger"
|
15
15
|
gem.require_paths = ["lib"]
|
16
|
-
gem.version = EngTagger::VERSION
|
16
|
+
gem.version = EngTagger::VERSION
|
17
17
|
end
|
data/lib/engtagger/porter.rb
CHANGED
@@ -12,7 +12,7 @@ module Stemmable
|
|
12
12
|
'ousness'=>'ous', 'aliti'=>'al',
|
13
13
|
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
|
14
14
|
}
|
15
|
-
|
15
|
+
|
16
16
|
STEP_3_LIST = {
|
17
17
|
'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
|
18
18
|
'ical'=>'ic', 'ful'=>'', 'ness'=>''
|
@@ -48,7 +48,7 @@ module Stemmable
|
|
48
48
|
ance |
|
49
49
|
ence |
|
50
50
|
er |
|
51
|
-
ic |
|
51
|
+
ic |
|
52
52
|
able |
|
53
53
|
ible |
|
54
54
|
ant |
|
@@ -88,30 +88,30 @@ module Stemmable
|
|
88
88
|
#
|
89
89
|
# Send comments to raypereda@hotmail.com
|
90
90
|
#
|
91
|
-
|
91
|
+
|
92
92
|
def stem_porter
|
93
93
|
|
94
94
|
# make a copy of the given object and convert it to a string.
|
95
95
|
w = self.dup.to_str
|
96
|
-
|
96
|
+
|
97
97
|
return w if w.length < 3
|
98
|
-
|
98
|
+
|
99
99
|
# now map initial y to Y so that the patterns never treat it as vowel
|
100
100
|
w[0] = 'Y' if w[0] == ?y
|
101
|
-
|
101
|
+
|
102
102
|
# Step 1a
|
103
103
|
if w =~ /(ss|i)es$/
|
104
104
|
w = $` + $1
|
105
|
-
elsif w =~ /([^s])s$/
|
105
|
+
elsif w =~ /([^s])s$/
|
106
106
|
w = $` + $1
|
107
107
|
end
|
108
108
|
|
109
109
|
# Step 1b
|
110
110
|
if w =~ /eed$/
|
111
|
-
w.chop! if $` =~ MGR0
|
111
|
+
w.chop! if $` =~ MGR0
|
112
112
|
elsif w =~ /(ed|ing)$/
|
113
113
|
stem = $`
|
114
|
-
if stem =~ VOWEL_IN_STEM
|
114
|
+
if stem =~ VOWEL_IN_STEM
|
115
115
|
w = stem
|
116
116
|
case w
|
117
117
|
when /(at|bl|iz)$/ then w << "e"
|
@@ -121,9 +121,9 @@ module Stemmable
|
|
121
121
|
end
|
122
122
|
end
|
123
123
|
|
124
|
-
if w =~ /y$/
|
124
|
+
if w =~ /y$/
|
125
125
|
stem = $`
|
126
|
-
w = stem + "i" if stem =~ VOWEL_IN_STEM
|
126
|
+
w = stem + "i" if stem =~ VOWEL_IN_STEM
|
127
127
|
end
|
128
128
|
|
129
129
|
# Step 2
|
@@ -159,7 +159,7 @@ module Stemmable
|
|
159
159
|
end
|
160
160
|
|
161
161
|
# Step 5
|
162
|
-
if w =~ /e$/
|
162
|
+
if w =~ /e$/
|
163
163
|
stem = $`
|
164
164
|
if (stem =~ MGR1) ||
|
165
165
|
(stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
|
data/lib/engtagger/pos_tags.hash
CHANGED
Binary file
|
Binary file
|
data/lib/engtagger/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.
|
1
|
+
class EngTagger
|
2
|
+
VERSION = "0.3.0"
|
3
3
|
end
|