engtagger 0.2.2 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.yardopts +5 -0
- data/Gemfile +1 -2
- data/README.md +19 -26
- data/engtagger.gemspec +4 -4
- data/lib/engtagger/porter.rb +12 -12
- data/lib/engtagger/version.rb +2 -2
- data/lib/engtagger.rb +164 -70
- data/test/test_engtagger.rb +246 -233
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b3f1fc1d4e6d89d2920a0774342478d951bacd4558ff8c4054da719730ed0b9c
|
4
|
+
data.tar.gz: 2c9061d018dd63d699ad18713edf0f8ba74720632574e2ed2b530965c501abc5
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 475e5093d071bee1fac32a98713dd3eadc51262fc61cd090fe54fc98aad68d9d0c544aae0c10374aa38ac17676f0db0dbabc18a34f393747c1b9a51ff4d687ad
|
7
|
+
data.tar.gz: 4bfc9068df3ce8cf4688c0475600c326302c4df5ed1bb13848eb64c200ffc9e2fba61edb9f8cd64d1c6cb47015384cc3020bec707ddfc74e941874c310cbed83
|
data/.gitignore
CHANGED
data/.yardopts
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -4,13 +4,13 @@ English Part-of-Speech Tagger Library; a Ruby port of Lingua::EN::Tagger
|
|
4
4
|
|
5
5
|
### Description
|
6
6
|
|
7
|
-
A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
|
8
|
-
tagger that assigns POS tags to English text based on a lookup dictionary and
|
9
|
-
a set of probability values. The tagger assigns appropriate tags based on
|
10
|
-
conditional probabilities--it examines the preceding tag to determine the
|
11
|
-
appropriate tag for the current word. Unknown words are classified according to
|
12
|
-
word morphology or can be set to be treated as nouns or other parts of speech.
|
13
|
-
The tagger also extracts as many nouns and noun phrases as it can, using a set
|
7
|
+
A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
|
8
|
+
tagger that assigns POS tags to English text based on a lookup dictionary and
|
9
|
+
a set of probability values. The tagger assigns appropriate tags based on
|
10
|
+
conditional probabilities--it examines the preceding tag to determine the
|
11
|
+
appropriate tag for the current word. Unknown words are classified according to
|
12
|
+
word morphology or can be set to be treated as nouns or other parts of speech.
|
13
|
+
The tagger also extracts as many nouns and noun phrases as it can, using a set
|
14
14
|
of regular expressions.
|
15
15
|
|
16
16
|
### Features
|
@@ -21,7 +21,6 @@ of regular expressions.
|
|
21
21
|
|
22
22
|
### Synopsis:
|
23
23
|
|
24
|
-
require 'rubygems'
|
25
24
|
require 'engtagger'
|
26
25
|
|
27
26
|
# Create a parser object
|
@@ -34,20 +33,20 @@ of regular expressions.
|
|
34
33
|
tagged = tgr.add_tags(text)
|
35
34
|
|
36
35
|
#=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj><nn>cat</nn> <pp>.</pp>"
|
37
|
-
|
36
|
+
|
38
37
|
# Get a list of all nouns and noun phrases with occurrence counts
|
39
38
|
word_list = tgr.get_words(text)
|
40
39
|
|
41
40
|
#=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
|
42
|
-
|
41
|
+
|
43
42
|
# Get a readable version of the tagged text
|
44
43
|
readable = tgr.get_readable(text)
|
45
|
-
|
44
|
+
|
46
45
|
#=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
|
47
46
|
|
48
47
|
# Get all nouns from a tagged output
|
49
48
|
nouns = tgr.get_nouns(tagged)
|
50
|
-
|
49
|
+
|
51
50
|
#=> {"cat"=>1, "Alice"=>1}
|
52
51
|
|
53
52
|
# Get all proper nouns
|
@@ -73,13 +72,13 @@ of regular expressions.
|
|
73
72
|
|
74
73
|
### Tag Set
|
75
74
|
|
76
|
-
The set of POS tags used here is a modified version of the Penn Treebank tagset. Tags with non-letter characters have been redefined to work better in our data structures. Also, the "Determiner" tag (DET) has been changed from 'DT', in order to avoid confusion with the HTML tag, `<DT>`.
|
75
|
+
The set of POS tags used here is a modified version of the Penn Treebank tagset. Tags with non-letter characters have been redefined to work better in our data structures. Also, the "Determiner" tag (DET) has been changed from 'DT', in order to avoid confusion with the HTML tag, `<DT>`.
|
77
76
|
|
78
77
|
CC Conjunction, coordinating and, or
|
79
78
|
CD Adjective, cardinal number 3, fifteen
|
80
79
|
DET Determiner this, each, some
|
81
80
|
EX Pronoun, existential there there
|
82
|
-
FW Foreign words
|
81
|
+
FW Foreign words
|
83
82
|
IN Preposition / Conjunction for, of, although, that
|
84
83
|
JJ Adjective happy, bad
|
85
84
|
JJR Adjective, comparative happier, worse
|
@@ -111,7 +110,7 @@ The set of POS tags used here is a modified version of the Penn Treebank tagset.
|
|
111
110
|
WP Pronoun, question who, whoever
|
112
111
|
WPS Determiner, possessive & question whose
|
113
112
|
WRB Adverb, question when, how, however
|
114
|
-
|
113
|
+
|
115
114
|
PP Punctuation, sentence ender ., !, ?
|
116
115
|
PPC Punctuation, comma ,
|
117
116
|
PPD Punctuation, dollar sign $
|
@@ -121,30 +120,24 @@ The set of POS tags used here is a modified version of the Penn Treebank tagset.
|
|
121
120
|
LRB Punctuation, left bracket (, {, [
|
122
121
|
RRB Punctuation, right bracket ), }, ]
|
123
122
|
|
124
|
-
### Requirements
|
125
|
-
|
126
|
-
* [Hpricot](http://code.whytheluckystiff.net/hpricot/) (optional)
|
127
|
-
|
128
123
|
### Install
|
129
124
|
|
130
|
-
|
125
|
+
gem install engtagger
|
131
126
|
|
132
127
|
### Author
|
133
128
|
|
134
|
-
of this Ruby library
|
129
|
+
of this Ruby library
|
135
130
|
|
136
|
-
* Yoichiro Hasebe (yohasebe [at] gmail.com)
|
131
|
+
* Yoichiro Hasebe (yohasebe [at] gmail.com)
|
137
132
|
|
138
133
|
### Contributors
|
139
134
|
|
140
|
-
|
141
|
-
* Phil London
|
142
|
-
* Bazay (Baron Bloomer)
|
135
|
+
Many thanks to the collaborators listed in the right column of this GitHub page.
|
143
136
|
|
144
137
|
### Acknowledgement
|
145
138
|
|
146
139
|
This Ruby library is a direct port of Lingua::EN::Tagger available at CPAN.
|
147
|
-
The credit for the crucial part of its algorithm/design therefore goes to
|
140
|
+
The credit for the crucial part of its algorithm/design therefore goes to
|
148
141
|
Aaron Coburn, the author of the original Perl version.
|
149
142
|
|
150
143
|
### License
|
data/engtagger.gemspec
CHANGED
@@ -4,14 +4,14 @@ require File.expand_path('../lib/engtagger/version', __FILE__)
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
5
|
gem.authors = ["Yoichiro Hasebe"]
|
6
6
|
gem.email = ["yohasebe@gmail.com"]
|
7
|
-
gem.summary = %q{A probability based, corpus-trained English POS tagger}
|
8
|
-
gem.description = %q{A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values.}
|
9
|
-
gem.homepage = "http://github.com/yohasebe/engtagger"
|
7
|
+
gem.summary = %q{A probability based, corpus-trained English POS tagger}
|
8
|
+
gem.description = %q{A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values.}
|
9
|
+
gem.homepage = "http://github.com/yohasebe/engtagger"
|
10
10
|
|
11
11
|
gem.files = `git ls-files`.split($\)
|
12
12
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
13
13
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
14
14
|
gem.name = "engtagger"
|
15
15
|
gem.require_paths = ["lib"]
|
16
|
-
gem.version = EngTagger::VERSION
|
16
|
+
gem.version = EngTagger::VERSION
|
17
17
|
end
|
data/lib/engtagger/porter.rb
CHANGED
@@ -12,7 +12,7 @@ module Stemmable
|
|
12
12
|
'ousness'=>'ous', 'aliti'=>'al',
|
13
13
|
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
|
14
14
|
}
|
15
|
-
|
15
|
+
|
16
16
|
STEP_3_LIST = {
|
17
17
|
'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
|
18
18
|
'ical'=>'ic', 'ful'=>'', 'ness'=>''
|
@@ -48,7 +48,7 @@ module Stemmable
|
|
48
48
|
ance |
|
49
49
|
ence |
|
50
50
|
er |
|
51
|
-
ic |
|
51
|
+
ic |
|
52
52
|
able |
|
53
53
|
ible |
|
54
54
|
ant |
|
@@ -88,30 +88,30 @@ module Stemmable
|
|
88
88
|
#
|
89
89
|
# Send comments to raypereda@hotmail.com
|
90
90
|
#
|
91
|
-
|
91
|
+
|
92
92
|
def stem_porter
|
93
93
|
|
94
94
|
# make a copy of the given object and convert it to a string.
|
95
95
|
w = self.dup.to_str
|
96
|
-
|
96
|
+
|
97
97
|
return w if w.length < 3
|
98
|
-
|
98
|
+
|
99
99
|
# now map initial y to Y so that the patterns never treat it as vowel
|
100
100
|
w[0] = 'Y' if w[0] == ?y
|
101
|
-
|
101
|
+
|
102
102
|
# Step 1a
|
103
103
|
if w =~ /(ss|i)es$/
|
104
104
|
w = $` + $1
|
105
|
-
elsif w =~ /([^s])s$/
|
105
|
+
elsif w =~ /([^s])s$/
|
106
106
|
w = $` + $1
|
107
107
|
end
|
108
108
|
|
109
109
|
# Step 1b
|
110
110
|
if w =~ /eed$/
|
111
|
-
w.chop! if $` =~ MGR0
|
111
|
+
w.chop! if $` =~ MGR0
|
112
112
|
elsif w =~ /(ed|ing)$/
|
113
113
|
stem = $`
|
114
|
-
if stem =~ VOWEL_IN_STEM
|
114
|
+
if stem =~ VOWEL_IN_STEM
|
115
115
|
w = stem
|
116
116
|
case w
|
117
117
|
when /(at|bl|iz)$/ then w << "e"
|
@@ -121,9 +121,9 @@ module Stemmable
|
|
121
121
|
end
|
122
122
|
end
|
123
123
|
|
124
|
-
if w =~ /y$/
|
124
|
+
if w =~ /y$/
|
125
125
|
stem = $`
|
126
|
-
w = stem + "i" if stem =~ VOWEL_IN_STEM
|
126
|
+
w = stem + "i" if stem =~ VOWEL_IN_STEM
|
127
127
|
end
|
128
128
|
|
129
129
|
# Step 2
|
@@ -159,7 +159,7 @@ module Stemmable
|
|
159
159
|
end
|
160
160
|
|
161
161
|
# Step 5
|
162
|
-
if w =~ /e$/
|
162
|
+
if w =~ /e$/
|
163
163
|
stem = $`
|
164
164
|
if (stem =~ MGR1) ||
|
165
165
|
(stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
|
data/lib/engtagger/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.
|
1
|
+
class EngTagger
|
2
|
+
VERSION = "0.3.0"
|
3
3
|
end
|
data/lib/engtagger.rb
CHANGED
@@ -3,30 +3,17 @@
|
|
3
3
|
|
4
4
|
$LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
|
5
5
|
require 'rubygems'
|
6
|
-
require 'kconv'
|
7
6
|
require 'porter'
|
7
|
+
require 'lru_redux'
|
8
8
|
|
9
|
-
|
10
|
-
|
11
|
-
require 'hpricot'
|
12
|
-
rescue LoadError
|
13
|
-
$no_hpricot = true
|
14
|
-
end
|
15
|
-
|
16
|
-
# File paths
|
17
|
-
$lexpath = File.join(File.dirname(__FILE__), 'engtagger')
|
18
|
-
$word_path = File.join($lexpath, "pos_words.hash")
|
19
|
-
$tag_path = File.join($lexpath, "pos_tags.hash")
|
20
|
-
|
21
|
-
# for memoization (code snipet from http://eigenclass.org/hiki/bounded-space-memoization)
|
22
|
-
class Module
|
23
|
-
def memoize(method)
|
9
|
+
module BoundedSpaceMemoizable
|
10
|
+
def memoize(method, max_cache_size=100000)
|
24
11
|
# alias_method is faster than define_method + old.bind(self).call
|
25
12
|
alias_method "__memoized__#{method}", method
|
26
13
|
module_eval <<-EOF
|
27
|
-
def #{method}(*a
|
28
|
-
#
|
29
|
-
|
14
|
+
def #{method}(*a)
|
15
|
+
@__memoized_#{method}_cache ||= LruRedux::Cache.new(#{max_cache_size})
|
16
|
+
@__memoized_#{method}_cache[a] ||= __memoized__#{method}(*a)
|
30
17
|
end
|
31
18
|
EOF
|
32
19
|
end
|
@@ -34,17 +21,29 @@ end
|
|
34
21
|
|
35
22
|
# English part-of-speech tagger class
|
36
23
|
class EngTagger
|
24
|
+
extend BoundedSpaceMemoizable
|
25
|
+
|
26
|
+
# File paths
|
27
|
+
DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), 'engtagger')
|
28
|
+
DEFAULT_WORDPATH = File.join(DEFAULT_LEXPATH, "pos_words.hash")
|
29
|
+
DEFAULT_TAGPATH = File.join(DEFAULT_LEXPATH, "pos_tags.hash")
|
37
30
|
|
38
31
|
#################
|
39
32
|
# Class methods #
|
40
33
|
#################
|
41
34
|
|
42
|
-
# Return a class variable that holds probability data
|
35
|
+
# Return a class variable that holds probability data.
|
36
|
+
#
|
37
|
+
# @return [Hash] the probability data
|
38
|
+
#
|
43
39
|
def self.hmm
|
44
40
|
return @@hmm
|
45
41
|
end
|
46
42
|
|
47
|
-
# Return a class variable that holds lexical data
|
43
|
+
# Return a class variable that holds lexical data.
|
44
|
+
#
|
45
|
+
# @return [Hash] the lexicon
|
46
|
+
#
|
48
47
|
def self.lexicon
|
49
48
|
return @@lexicon
|
50
49
|
end
|
@@ -88,7 +87,12 @@ class EngTagger
|
|
88
87
|
IN = get_ext('in')
|
89
88
|
|
90
89
|
# Convert a Treebank-style, abbreviated tag into verbose definitions
|
90
|
+
#
|
91
|
+
# @param tag [#to_s] the tag in question
|
92
|
+
# @return [String] the definition, if available
|
93
|
+
#
|
91
94
|
def self.explain_tag(tag)
|
95
|
+
tag = tag.to_s.downcase
|
92
96
|
if TAGS[tag]
|
93
97
|
return TAGS[tag]
|
94
98
|
else
|
@@ -143,7 +147,7 @@ class EngTagger
|
|
143
147
|
"PPS", "Punctuation, colon, semicolon, elipsis",
|
144
148
|
"LRB", "Punctuation, left bracket",
|
145
149
|
"RRB", "Punctuation, right bracket"
|
146
|
-
|
150
|
+
]
|
147
151
|
tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
|
148
152
|
tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
|
149
153
|
TAGS = Hash[*tags]
|
@@ -196,12 +200,12 @@ class EngTagger
|
|
196
200
|
@conf[:tag_lex] = 'tags.yml'
|
197
201
|
@conf[:word_lex] = 'words.yml'
|
198
202
|
@conf[:unknown_lex] = 'unknown.yml'
|
199
|
-
@conf[:word_path] =
|
200
|
-
@conf[:tag_path] =
|
203
|
+
@conf[:word_path] = DEFAULT_WORDPATH
|
204
|
+
@conf[:tag_path] = DEFAULT_TAGPATH
|
201
205
|
@conf[:debug] = false
|
202
206
|
# assuming that we start analyzing from the beginninga new sentence...
|
203
207
|
@conf[:current_tag] = 'pp'
|
204
|
-
@conf.merge!(params)
|
208
|
+
@conf.merge!(params) if params
|
205
209
|
unless File.exist?(@conf[:word_path]) and File.exist?(@conf[:tag_path])
|
206
210
|
print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
|
207
211
|
@@hmm = Hash.new
|
@@ -221,6 +225,33 @@ class EngTagger
|
|
221
225
|
# Public methods #
|
222
226
|
##################
|
223
227
|
|
228
|
+
# Return an array of pairs of the form `["word", :tag]`.
|
229
|
+
#
|
230
|
+
# @param text [String] the input text
|
231
|
+
# @return [Array] the tagged words
|
232
|
+
#
|
233
|
+
def tag_pairs(text)
|
234
|
+
return [] unless valid_text(text)
|
235
|
+
|
236
|
+
out = clean_text(text).map do |word|
|
237
|
+
cleaned_word = clean_word word
|
238
|
+
tag = assign_tag(@conf[:current_tag], cleaned_word)
|
239
|
+
@conf[:current_tag] = tag = (tag and !tag.empty?) ? tag : 'nn'
|
240
|
+
[word, tag.to_sym]
|
241
|
+
end
|
242
|
+
|
243
|
+
# reset the tagger state
|
244
|
+
reset
|
245
|
+
|
246
|
+
out
|
247
|
+
end
|
248
|
+
|
249
|
+
# Examine the string provided and return it fully tagged in XML style.
|
250
|
+
#
|
251
|
+
# @param text [String] the input text
|
252
|
+
# @param verbose [false, true] whether to use verbose tags
|
253
|
+
# @return [String] the marked-up string
|
254
|
+
#
|
224
255
|
# Examine the string provided and return it fully tagged in XML style
|
225
256
|
def add_tags(text, verbose = false)
|
226
257
|
return nil unless valid_text(text)
|
@@ -260,10 +291,10 @@ class EngTagger
|
|
260
291
|
def get_readable(text, verbose = false)
|
261
292
|
return nil unless valid_text(text)
|
262
293
|
tagged = add_tags(text, verbose)
|
263
|
-
tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
|
294
|
+
tagged = tagged.gsub(/<\w+>([^<]+|[<\w>]+)<\/(\w+)>/o) do
|
295
|
+
#!!!# tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
|
264
296
|
$1 + '/' + $2.upcase
|
265
297
|
end
|
266
|
-
return tagged
|
267
298
|
end
|
268
299
|
|
269
300
|
# Return an array of sentences (without POS tags) from a text.
|
@@ -319,90 +350,151 @@ class EngTagger
|
|
319
350
|
|
320
351
|
# Given a POS-tagged text, this method returns all nouns and their
|
321
352
|
# occurrence frequencies.
|
353
|
+
#
|
354
|
+
# @param tagged [String] the tagged text
|
355
|
+
# @return [Hash] the hash of matches
|
356
|
+
#
|
322
357
|
def get_nouns(tagged)
|
323
358
|
return nil unless valid_text(tagged)
|
324
359
|
tags = [NN]
|
325
360
|
build_matches_hash(build_trimmed(tagged, tags))
|
326
361
|
end
|
327
362
|
|
328
|
-
# Returns all types of verbs and does not descriminate between the
|
329
|
-
#
|
363
|
+
# Returns all types of verbs and does not descriminate between the
|
364
|
+
# various kinds. Combines all other verb methods listed in this
|
365
|
+
# class.
|
366
|
+
#
|
367
|
+
# @param tagged [String] the tagged text
|
368
|
+
# @return [Hash] the hash of matches
|
369
|
+
#
|
330
370
|
def get_verbs(tagged)
|
331
371
|
return nil unless valid_text(tagged)
|
332
372
|
tags = [VB, VBD, VBG, PART, VBP, VBZ]
|
333
373
|
build_matches_hash(build_trimmed(tagged, tags))
|
334
374
|
end
|
335
375
|
|
376
|
+
#
|
377
|
+
# @param tagged [String] the tagged text
|
378
|
+
# @return [Hash] the hash of matches
|
379
|
+
#
|
380
|
+
|
336
381
|
def get_infinitive_verbs(tagged)
|
337
382
|
return nil unless valid_text(tagged)
|
338
383
|
tags = [VB]
|
339
384
|
build_matches_hash(build_trimmed(tagged, tags))
|
340
385
|
end
|
341
386
|
|
387
|
+
#
|
388
|
+
# @param tagged [String] the tagged text
|
389
|
+
# @return [Hash] the hash of matches
|
390
|
+
#
|
342
391
|
def get_past_tense_verbs(tagged)
|
343
392
|
return nil unless valid_text(tagged)
|
344
393
|
tags = [VBD]
|
345
394
|
build_matches_hash(build_trimmed(tagged, tags))
|
346
395
|
end
|
347
396
|
|
397
|
+
#
|
398
|
+
# @param tagged [String] the tagged text
|
399
|
+
# @return [Hash] the hash of matches
|
400
|
+
#
|
348
401
|
def get_gerund_verbs(tagged)
|
349
402
|
return nil unless valid_text(tagged)
|
350
403
|
tags = [VBG]
|
351
404
|
build_matches_hash(build_trimmed(tagged, tags))
|
352
405
|
end
|
353
406
|
|
407
|
+
#
|
408
|
+
# @param tagged [String] the tagged text
|
409
|
+
# @return [Hash] the hash of matches
|
410
|
+
#
|
354
411
|
def get_passive_verbs(tagged)
|
355
412
|
return nil unless valid_text(tagged)
|
356
413
|
tags = [PART]
|
357
414
|
build_matches_hash(build_trimmed(tagged, tags))
|
358
415
|
end
|
359
416
|
|
417
|
+
#
|
418
|
+
# @param tagged [String] the tagged text
|
419
|
+
# @return [Hash] the hash of matches
|
420
|
+
#
|
360
421
|
def get_base_present_verbs(tagged)
|
361
422
|
return nil unless valid_text(tagged)
|
362
423
|
tags = [VBP]
|
363
424
|
build_matches_hash(build_trimmed(tagged, tags))
|
364
425
|
end
|
365
426
|
|
427
|
+
#
|
428
|
+
# @param tagged [String] the tagged text
|
429
|
+
# @return [Hash] the hash of matches
|
430
|
+
#
|
366
431
|
def get_present_verbs(tagged)
|
367
432
|
return nil unless valid_text(tagged)
|
368
433
|
tags = [VBZ]
|
369
434
|
build_matches_hash(build_trimmed(tagged, tags))
|
370
435
|
end
|
371
436
|
|
437
|
+
#
|
438
|
+
# @param tagged [String] the tagged text
|
439
|
+
# @return [Hash] the hash of matches
|
440
|
+
#
|
372
441
|
def get_adjectives(tagged)
|
373
442
|
return nil unless valid_text(tagged)
|
374
443
|
tags = [JJ]
|
375
444
|
build_matches_hash(build_trimmed(tagged, tags))
|
376
445
|
end
|
377
446
|
|
447
|
+
#
|
448
|
+
# @param tagged [String] the tagged text
|
449
|
+
# @return [Hash] the hash of matches
|
450
|
+
#
|
378
451
|
def get_comparative_adjectives(tagged)
|
379
452
|
return nil unless valid_text(tagged)
|
380
453
|
tags = [JJR]
|
381
454
|
build_matches_hash(build_trimmed(tagged, tags))
|
382
455
|
end
|
383
456
|
|
457
|
+
#
|
458
|
+
# @param tagged [String] the tagged text
|
459
|
+
# @return [Hash] the hash of matches
|
460
|
+
#
|
384
461
|
def get_superlative_adjectives(tagged)
|
385
462
|
return nil unless valid_text(tagged)
|
386
463
|
tags = [JJS]
|
387
464
|
build_matches_hash(build_trimmed(tagged, tags))
|
388
465
|
end
|
389
466
|
|
467
|
+
#
|
468
|
+
# @param tagged [String] the tagged text
|
469
|
+
# @return [Hash] the hash of matches
|
470
|
+
#
|
390
471
|
def get_adverbs(tagged)
|
391
472
|
return nil unless valid_text(tagged)
|
392
473
|
tags = [RB, RBR, RBS, RP]
|
393
474
|
build_matches_hash(build_trimmed(tagged, tags))
|
394
475
|
end
|
395
476
|
|
477
|
+
#
|
478
|
+
# @param tagged [String] the tagged text
|
479
|
+
# @return [Hash] the hash of matches
|
480
|
+
#
|
396
481
|
def get_interrogatives(tagged)
|
397
482
|
return nil unless valid_text(tagged)
|
398
483
|
tags = [WRB, WDT, WP, WPS]
|
399
484
|
build_matches_hash(build_trimmed(tagged, tags))
|
400
485
|
end
|
401
|
-
|
486
|
+
|
487
|
+
# To be consistent with documentation's naming of 'interrogative'
|
488
|
+
# parts of speech as 'question'
|
402
489
|
alias_method :get_question_parts, :get_interrogatives
|
403
490
|
|
404
|
-
# Returns all types of conjunctions and does not discriminate
|
405
|
-
# E.g. coordinating, subordinating,
|
491
|
+
# Returns all types of conjunctions and does not discriminate
|
492
|
+
# between the various kinds. E.g. coordinating, subordinating,
|
493
|
+
# correlative...
|
494
|
+
#
|
495
|
+
# @param tagged [String] the tagged text
|
496
|
+
# @return [Hash] the hash of matches
|
497
|
+
#
|
406
498
|
def get_conjunctions(tagged)
|
407
499
|
return nil unless valid_text(tagged)
|
408
500
|
tags = [CC, IN]
|
@@ -410,7 +502,11 @@ class EngTagger
|
|
410
502
|
end
|
411
503
|
|
412
504
|
# Given a POS-tagged text, this method returns only the maximal noun phrases.
|
413
|
-
# May be called directly, but is also used by get_noun_phrases
|
505
|
+
# May be called directly, but is also used by `get_noun_phrases`.
|
506
|
+
#
|
507
|
+
# @param tagged [String] the tagged text
|
508
|
+
# @return [Hash] the hash of matches
|
509
|
+
#
|
414
510
|
def get_max_noun_phrases(tagged)
|
415
511
|
return nil unless valid_text(tagged)
|
416
512
|
tags = [@@mnp]
|
@@ -424,11 +520,15 @@ class EngTagger
|
|
424
520
|
end
|
425
521
|
|
426
522
|
# Similar to get_words, but requires a POS-tagged text as an argument.
|
523
|
+
#
|
524
|
+
# @param tagged [String] the tagged text
|
525
|
+
# @return [Hash] the hash of matches
|
526
|
+
#
|
427
527
|
def get_noun_phrases(tagged)
|
428
528
|
return nil unless valid_text(tagged)
|
429
529
|
found = Hash.new(0)
|
430
530
|
phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
|
431
|
-
|
531
|
+
scanned = tagged.scan(@@mnp)
|
432
532
|
# Find MNPs in the text, one sentence at a time
|
433
533
|
# Record and split if the phrase is extended by a (?:PREP|DET|NUM)
|
434
534
|
mn_phrases = []
|
@@ -437,9 +537,9 @@ class EngTagger
|
|
437
537
|
mn_phrases += m.split(phrase_ext)
|
438
538
|
end
|
439
539
|
mn_phrases.each do |mnp|
|
440
|
-
|
441
|
-
|
442
|
-
|
540
|
+
# Split the phrase into an array of words, and create a loop for each word,
|
541
|
+
# shortening the phrase by removing the word in the first position.
|
542
|
+
# Record the phrase and any single nouns that are found
|
443
543
|
words = mnp.split
|
444
544
|
words.length.times do |i|
|
445
545
|
found[words.join(' ')] += 1 if words.length > 1
|
@@ -484,7 +584,7 @@ class EngTagger
|
|
484
584
|
# Private methods #
|
485
585
|
###################
|
486
586
|
|
487
|
-
|
587
|
+
private
|
488
588
|
|
489
589
|
def build_trimmed(tagged, tags)
|
490
590
|
tags.map { |tag| tagged.scan(tag) }.flatten.map do |n|
|
@@ -554,17 +654,10 @@ class EngTagger
|
|
554
654
|
end
|
555
655
|
end
|
556
656
|
|
557
|
-
# Strip the provided text
|
558
|
-
# in preparation for tagging
|
657
|
+
# Strip the provided text and separate off any punctuation in preparation for tagging
|
559
658
|
def clean_text(text)
|
560
659
|
return false unless valid_text(text)
|
561
|
-
|
562
|
-
unless $no_hpricot
|
563
|
-
# Strip out any markup and convert entities to their proper form
|
564
|
-
cleaned_text = Hpricot(text).inner_text
|
565
|
-
else
|
566
|
-
cleaned_text = text
|
567
|
-
end
|
660
|
+
cleaned_text = text.encode('utf-8')
|
568
661
|
tokenized = []
|
569
662
|
# Tokenize the text (splitting on punctuation as you go)
|
570
663
|
cleaned_text.split(/\s+/).each do |line|
|
@@ -599,7 +692,8 @@ class EngTagger
|
|
599
692
|
end
|
600
693
|
words = Array.new
|
601
694
|
tokenized.each_with_index do |t, i|
|
602
|
-
if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and
|
695
|
+
if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and
|
696
|
+
tokenized[i] =~ /\A(.+)\.\z/
|
603
697
|
w = $1
|
604
698
|
# Don't separate the period off words that
|
605
699
|
# meet any of the following conditions:
|
@@ -607,7 +701,8 @@ class EngTagger
|
|
607
701
|
# 1. It is defined in one of the lists above
|
608
702
|
# 2. It is only one letter long: Alfred E. Sloan
|
609
703
|
# 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
|
610
|
-
unless abbr[w.downcase] or
|
704
|
+
unless abbr[w.downcase] or
|
705
|
+
[/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
|
611
706
|
words << w
|
612
707
|
words << '.'
|
613
708
|
next
|
@@ -641,7 +736,7 @@ class EngTagger
|
|
641
736
|
# Handle all other punctuation
|
642
737
|
text = text.gsub(/--+/o, " - ") # Convert and separate dashes
|
643
738
|
text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
|
644
|
-
text = text.gsub(/:/o, " :") # Shift semicolons off
|
739
|
+
text = text.gsub(/:/o, " : ") # Shift semicolons off
|
645
740
|
text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
|
646
741
|
text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
|
647
742
|
text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
|
@@ -718,8 +813,7 @@ class EngTagger
|
|
718
813
|
def classify_unknown_word(word)
|
719
814
|
if /[\(\{\[]/ =~ word # Left brackets
|
720
815
|
classified = "*LRB*"
|
721
|
-
elsif
|
722
|
-
/[\)\}\]]/ =~ word # Right brackets
|
816
|
+
elsif /[\)\}\]]/ =~ word # Right brackets
|
723
817
|
classified = "*RRB*"
|
724
818
|
elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
|
725
819
|
classified = "*NUM*"
|
@@ -763,28 +857,28 @@ class EngTagger
|
|
763
857
|
# from a POS-tagged text.
|
764
858
|
def get_max_noun_regex
|
765
859
|
regex = /
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
860
|
+
# optional number, gerund - adjective -participle
|
861
|
+
(?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
|
862
|
+
# Followed by one or more nouns
|
863
|
+
(?:#{NN})+
|
864
|
+
(?:
|
865
|
+
# Optional preposition, determinant, cardinal
|
866
|
+
(?:#{PREP})*(?:#{DET})?(?:#{NUM})?
|
867
|
+
# Optional gerund-adjective -participle
|
868
|
+
(?:#{GER}|#{ADJ}|#{PART})*
|
869
|
+
# one or more nouns
|
870
|
+
(?:#{NN})+
|
871
|
+
)*
|
872
|
+
/xo #/
|
873
|
+
return regex
|
780
874
|
end
|
781
875
|
|
782
876
|
# Load the 2-grams into a hash from YAML data: This is a naive (but fast)
|
783
877
|
# YAML data parser. It will load a YAML document with a collection of key:
|
784
878
|
# value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
|
785
879
|
# Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
|
786
|
-
def load_tags(lexicon)
|
787
|
-
path = File.join(
|
880
|
+
def load_tags(lexicon, lexpath = DEFAULT_LEXPATH)
|
881
|
+
path = File.join(lexpath, lexicon)
|
788
882
|
fh = File.open(path, 'r')
|
789
883
|
while line = fh.gets
|
790
884
|
/\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
|
@@ -806,8 +900,8 @@ class EngTagger
|
|
806
900
|
# YAML data parser. It will load a YAML document with a collection of key:
|
807
901
|
# value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
|
808
902
|
# Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
|
809
|
-
def load_words(lexicon)
|
810
|
-
path = File.join(
|
903
|
+
def load_words(lexicon, lexpath = DEFAULT_LEXPATH)
|
904
|
+
path = File.join(lexpath, lexicon)
|
811
905
|
fh = File.open(path, 'r')
|
812
906
|
while line = fh.gets
|
813
907
|
/\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
|
data/test/test_engtagger.rb
CHANGED
@@ -1,233 +1,246 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
@@
|
13
|
-
Lisa Raines
|
14
|
-
EOD
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
tagpath
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
def test_add_tags
|
40
|
-
assert_instance_of(String, @tagger.add_tags(@@untagged))
|
41
|
-
end
|
42
|
-
|
43
|
-
def test_assign_tag
|
44
|
-
models = []; tests = []
|
45
|
-
models += [@tagger.conf[:unknown_word_tag], "sym"]
|
46
|
-
tests += [["pp","-unknown-"], ["pp", "-sym-"]]
|
47
|
-
models.length.times do |i|
|
48
|
-
assert_equal(models[i],@tagger.assign_tag(*tests[i]))
|
49
|
-
end
|
50
|
-
tests = []
|
51
|
-
tests += [["vb","water"], ["nn", "runs"]]
|
52
|
-
models.length.times do |i|
|
53
|
-
result = @tagger.assign_tag(*tests[i])
|
54
|
-
assert(EngTagger.hmm.keys.index(result))
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def
|
59
|
-
|
60
|
-
|
61
|
-
assert_equal(
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
end
|
68
|
-
|
69
|
-
def
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
end
|
85
|
-
|
86
|
-
def
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
result = @tagger.
|
97
|
-
|
98
|
-
end
|
99
|
-
|
100
|
-
def
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
expected_result =
|
120
|
-
result
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
@tagger.
|
156
|
-
|
157
|
-
@tagger.
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
def
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
models
|
185
|
-
models << ["
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
models
|
194
|
-
models << ["
|
195
|
-
|
196
|
-
models << ["
|
197
|
-
|
198
|
-
models
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
end
|
217
|
-
|
218
|
-
def
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
end
|
232
|
-
|
233
|
-
|
1
|
+
$ENGTAGGER_LIB = File.join(File.dirname(__FILE__), '..', 'lib')
|
2
|
+
$LOAD_PATH << $ENGTAGGER_LIB
|
3
|
+
require 'test/unit' unless defined? $ZENTEST and $ZENTEST
|
4
|
+
require 'engtagger'
|
5
|
+
|
6
|
+
class TestEngTagger < Test::Unit::TestCase

  # Sample paragraph used as raw (untagged) input throughout the tests.
  @@untagged =<<EOD
Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
EOD

  # The first sentence of @@untagged, pre-tagged with POS markup.
  @@tagged =<<EOD
<nnp>Lisa</nnp> <nnp>Raines</nnp> <ppc>,</ppc> <det>a</det> <nn>lawyer</nn> <cc>and</cc> <nn>director</nn> <in>of</in> <nn>government</nn> <nns>relations</nns> <in>for</in> <det>the</det> <nnp>Industrial</nnp> <nnp>Biotechnical</nnp> <nnp>Association</nnp> <ppc>,</ppc> <vbz>contends</vbz> <in>that</in> <det>a</det> <nn>judge</nn> <jj>well-versed</jj> <in>in</in> <nn>patent</nn> <nn>law</nn> <cc>and</cc> <det>the</det> <nns>concerns</nns> <in>of</in> <jj>research-based</jj> <nns>industries</nns> <md>would</md> <vb>have</vb> <vbn>ruled</vbn> <rb>otherwise</rb> <pp>.</pp>
EOD

  # Testing class methods

  # Build a tagger and install the lexicon/ngram data files on first run.
  def setup
    @tagger = EngTagger.new
    tagpath = File.join($ENGTAGGER_LIB, @tagger.conf[:tag_path])
    wordpath = File.join($ENGTAGGER_LIB, @tagger.conf[:word_path])
    unless File.exist?(tagpath) && File.exist?(wordpath)
      @tagger.install
    end
  end

  # FIXME(review): named "text_..." instead of "test_...", so Test::Unit never
  # runs this method; the regex literal also contains a stray "}" in the
  # closing tag. Renaming it would activate an unverified assertion against
  # EngTagger.get_ext, so the dormant name is kept — confirm get_ext's
  # contract (string vs. Regexp return) before enabling.
  def text_get_ext
    model = '<cd>[^<]+</cd}>\s*'
    assert_equal(model, EngTagger.get_ext(model, "cd"))
  end

  def test_explain_tag
    assert_equal("noun", EngTagger.explain_tag("nn"))
    assert_equal("verb_infinitive", EngTagger.explain_tag("vb"))
  end

  # Testing public instance methods

  def test_add_tags
    assert_instance_of(String, @tagger.add_tags(@@untagged))
  end

  def test_assign_tag
    # Special tokens must map to their configured tags.
    models = []; tests = []
    models += [@tagger.conf[:unknown_word_tag], "sym"]
    tests += [["pp", "-unknown-"], ["pp", "-sym-"]]
    models.length.times do |i|
      assert_equal(models[i], @tagger.assign_tag(*tests[i]))
    end
    # Ordinary words must get some tag present in the HMM table.
    tests = []
    tests += [["vb", "water"], ["nn", "runs"]]
    tests.each do |test|
      result = @tagger.assign_tag(*test)
      assert(EngTagger.hmm.keys.index(result))
    end
  end

  def test_clean_text
    test = "I am 100.0% sure that Dr. Watson is too naive. I'm sorry."
    model = ["I", "am", "100.0", "%", "sure", "that", "Dr.", "Watson", "is", "too", "naive", ".", "I", "'m", "sorry", "."]
    assert_equal(model, @tagger.send(:clean_text, test))
  end

  def test_get_noun_phrases
    result = @tagger.get_noun_phrases(@@tagged)
    assert_instance_of(Hash, result)
  end

  def test_get_nouns
    result = @tagger.get_nouns(@@tagged)
    assert_instance_of(Hash, result)
  end

  def test_get_verbs
    expected_result = { "have" => 1, "ruled" => 1, "contends" => 1 }
    result = @tagger.get_verbs(@@tagged)
    assert_equal(expected_result, result)
  end

  def test_get_adverbs
    expected_result = { "otherwise" => 1 }
    result = @tagger.get_adverbs(@@tagged)
    assert_equal(expected_result, result)
  end

  def test_get_interrogatives
    tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
    expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
    result = @tagger.get_interrogatives(tagged)
    assert_equal(expected_result, result)
  end

  # get_question_parts is expected to behave identically to get_interrogatives.
  def test_get_question_parts
    tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
    expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
    result = @tagger.get_question_parts(tagged)
    assert_equal(expected_result, result)
  end

  def test_get_conjunctions
    expected_result = { "and" => 2, "of" => 2, "for" => 1, "that" => 1, "in" => 1 }
    result = @tagger.get_conjunctions(@@tagged)
    assert_equal(expected_result, result)
  end

  def test_get_proper_nouns
    test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
    result = @tagger.get_proper_nouns(test)
    assert_instance_of(Hash, result)
  end

  def test_get_readable
    test = "I woke up to the sound of pouring rain."
    result = @tagger.get_readable(test)
    # BUG FIX: the original used assert(String, result), which only asserts
    # the truthiness of the String class itself and could never fail.
    assert_instance_of(String, result)

    test = "I woke up to the sound of pouring rain."
    result = @tagger.get_readable(test)
    expected_result = "I/PRP woke/VBD up/RB to/TO the/DET sound/NN of/IN pouring/VBG rain/NN ./PP"
    assert_equal(expected_result, result)
    test = "I woke up with a <bad> word."
    result = @tagger.get_readable(test)
    expected_result = "I/PRP woke/VBD up/RB with/IN a/DET <bad>/NNP word/NN ./PP"
    assert_equal(expected_result, result)
  end

  def test_get_sentences
    result = @tagger.get_sentences(@@untagged)
    assert_equal(4, result.length)
  end

  def test_get_words
    # Exercise both single-word and multi-word noun-phrase extraction paths.
    @tagger.conf[:longest_noun_phrase] = 1
    result1 = @tagger.get_words(@@tagged)
    @tagger.conf[:longest_noun_phrase] = 10
    result2 = @tagger.get_words(@@tagged)
    assert_instance_of(Hash, result1)
    assert_instance_of(Hash, result2)
  end

  # Testing private instance methods

  def test_reset
    @tagger.conf[:current_tag] = 'nn'
    @tagger.send(:reset)
    assert_equal('pp', @tagger.conf[:current_tag])
  end

  def test_classify_unknown_word
    assert_equal("*LRB*", @tagger.send(:classify_unknown_word, "{"))
    assert_equal("*NUM*", @tagger.send(:classify_unknown_word, "123.4567"))
    assert_equal("*ORD*", @tagger.send(:classify_unknown_word, "40th"))
    assert_equal("-abr-", @tagger.send(:classify_unknown_word, "GT-R"))
    assert_equal("-hyp-adj-", @tagger.send(:classify_unknown_word, "extremely-high"))
    assert_equal("-sym-", @tagger.send(:classify_unknown_word, "&&"))
    assert_equal("-ing-", @tagger.send(:classify_unknown_word, "wikiing"))
    assert_equal("-unknown-", @tagger.send(:classify_unknown_word, "asefasdf"))
  end

  def test_clean_word
    models = []; tests = []
    models += ["*NUM*"]
    models += ["Plays"]
    models += ["pleadingly"]
    tests += ["1973.0820", "Plays", "Pleadingly"]
    models.length.times do |i|
      assert_equal(models[i], @tagger.send(:clean_word, tests[i]))
    end
  end

  def test_get_max_noun_phrases
    result = @tagger.send(:get_max_noun_phrases, @@tagged)
    assert_instance_of(Hash, result)
  end

  def test_get_max_noun_regex
    assert_instance_of(Regexp, @tagger.send(:get_max_noun_regex))
  end

  def test_split_punct
    models = []; texts = []
    models << ["`", "test"]; texts << "`test"
    models << ["``", "test"]; texts << "\"test"
    models << ["`", "test"]; texts << "'test"
    models << ["''"]; texts << '"'
    models << ["test", "'"]; texts << "test' "
    models << ["-", "test", "-"]; texts << "---test-----"
    models << ["test", ",", "test"]; texts << "test,test"
    models << ["123,456"]; texts << "123,456"
    models << ["test", ":", "test"]; texts << "test:test"
    models << ["123", ":", "456"]; texts << "123:456"
    models << ["test1", "...", "test2"]; texts << "test1...test2"
    models << ["{", "ab", "[", "(", "c", ")", "[", "d", "]", "]", "}"]; texts << "{ab[(c)[d]]}"
    models << ["test", "#", "test"]; texts << "test#test"
    models << ["I", "'d", "like"]; texts << "I'd like"
    models << ["is", "n't", "so"]; texts << "isn't so"
    models << ["we", "'re", "all"]; texts << "we're all"

    texts.each_with_index do |text, index|
      assert_equal(models[index], @tagger.send(:split_punct, text))
    end
  end

  def test_split_sentences
    models = []; tests = []
    models << ["He", "is", "a", "u.s.", "army", "officer", "."]
    tests << ["He", "is", "a", "u.s.", "army", "officer."]
    models << ["He", "is", "Mr.", "Johnson", ".", "He", "'s", "my", "friend", "."]
    tests << ["He", "is", "Mr.", "Johnson.", "He", "'s", "my", "friend."]
    models.length.times do |i|
      assert_equal(models[i], @tagger.send(:split_sentences, tests[i]))
    end
  end

  def test_stem
    word = "gets"
    old = @tagger.conf[:stem]
    @tagger.conf[:stem] = true
    assert_equal("get", @tagger.stem(word))
    # the following should not work since we memoize stem method
    # @tagger.conf[:stem] = false
    # assert_equal("gets", @tagger.stem(word))
    @tagger.conf[:stem] = old
  end

  def test_strip_tags
    assert_instance_of(String, @tagger.send(:strip_tags, @@tagged))
  end

  def test_valid_text
    text = nil
    assert(!@tagger.send(:valid_text, text))
    text = "this is test text"
    assert(@tagger.send(:valid_text, text))
    text = ""
    assert(!@tagger.send(:valid_text, text))
  end

  def test_override_default_params
    @tagger = EngTagger.new(:longest_noun_phrase => 3)
    assert_equal 3, @tagger.conf[:longest_noun_phrase]
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: engtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-06-21 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
|
14
14
|
tagger that assigns POS tags to English text based on a lookup dictionary and a
|
@@ -20,6 +20,7 @@ extensions: []
|
|
20
20
|
extra_rdoc_files: []
|
21
21
|
files:
|
22
22
|
- ".gitignore"
|
23
|
+
- ".yardopts"
|
23
24
|
- Gemfile
|
24
25
|
- LICENSE
|
25
26
|
- README.md
|