engtagger 0.2.2 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/.yardopts +5 -0
- data/Gemfile +1 -2
- data/README.md +19 -26
- data/engtagger.gemspec +6 -4
- data/lib/engtagger/porter.rb +12 -12
- data/lib/engtagger/version.rb +2 -2
- data/lib/engtagger.rb +165 -72
- data/test/test_engtagger.rb +246 -233
- metadata +18 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6aa6da6cfb58bffd900843f62675d5895e80428be7295ae056ed73327286233d
|
4
|
+
data.tar.gz: dd412266b905ba4d378521540247a368bc4f73dfa89e8d6e58c220625c46e40d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de1aa006ea943270e4dcea78690e8a10551c42819abbf3c27b6d2629d600745124ec5cfa6a6104d3cb4c87dbfc14d09e643e7b2143979dee27485841fd76b0fe
|
7
|
+
data.tar.gz: 3404a699868beb475daee809cc67788a70152c0d5eba045b7d3c007e3b3fccb66ee6bb432832a8e9872cd6d3faf281fab60bf151c01eaf1cf52d6275644012bb
|
data/.gitignore
CHANGED
data/.yardopts
ADDED
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -4,13 +4,13 @@ English Part-of-Speech Tagger Library; a Ruby port of Lingua::EN::Tagger
|
|
4
4
|
|
5
5
|
### Description
|
6
6
|
|
7
|
-
A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
|
8
|
-
tagger that assigns POS tags to English text based on a lookup dictionary and
|
9
|
-
a set of probability values. The tagger assigns appropriate tags based on
|
10
|
-
conditional probabilities--it examines the preceding tag to determine the
|
11
|
-
appropriate tag for the current word. Unknown words are classified according to
|
12
|
-
word morphology or can be set to be treated as nouns or other parts of speech.
|
13
|
-
The tagger also extracts as many nouns and noun phrases as it can, using a set
|
7
|
+
A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
|
8
|
+
tagger that assigns POS tags to English text based on a lookup dictionary and
|
9
|
+
a set of probability values. The tagger assigns appropriate tags based on
|
10
|
+
conditional probabilities--it examines the preceding tag to determine the
|
11
|
+
appropriate tag for the current word. Unknown words are classified according to
|
12
|
+
word morphology or can be set to be treated as nouns or other parts of speech.
|
13
|
+
The tagger also extracts as many nouns and noun phrases as it can, using a set
|
14
14
|
of regular expressions.
|
15
15
|
|
16
16
|
### Features
|
@@ -21,7 +21,6 @@ of regular expressions.
|
|
21
21
|
|
22
22
|
### Synopsis:
|
23
23
|
|
24
|
-
require 'rubygems'
|
25
24
|
require 'engtagger'
|
26
25
|
|
27
26
|
# Create a parser object
|
@@ -34,20 +33,20 @@ of regular expressions.
|
|
34
33
|
tagged = tgr.add_tags(text)
|
35
34
|
|
36
35
|
#=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj><nn>cat</nn> <pp>.</pp>"
|
37
|
-
|
36
|
+
|
38
37
|
# Get a list of all nouns and noun phrases with occurrence counts
|
39
38
|
word_list = tgr.get_words(text)
|
40
39
|
|
41
40
|
#=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
|
42
|
-
|
41
|
+
|
43
42
|
# Get a readable version of the tagged text
|
44
43
|
readable = tgr.get_readable(text)
|
45
|
-
|
44
|
+
|
46
45
|
#=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
|
47
46
|
|
48
47
|
# Get all nouns from a tagged output
|
49
48
|
nouns = tgr.get_nouns(tagged)
|
50
|
-
|
49
|
+
|
51
50
|
#=> {"cat"=>1, "Alice"=>1}
|
52
51
|
|
53
52
|
# Get all proper nouns
|
@@ -73,13 +72,13 @@ of regular expressions.
|
|
73
72
|
|
74
73
|
### Tag Set
|
75
74
|
|
76
|
-
The set of POS tags used here is a modified version of the Penn Treebank tagset. Tags with non-letter characters have been redefined to work better in our data structures. Also, the "Determiner" tag (DET) has been changed from 'DT', in order to avoid confusion with the HTML tag, `<DT>`.
|
75
|
+
The set of POS tags used here is a modified version of the Penn Treebank tagset. Tags with non-letter characters have been redefined to work better in our data structures. Also, the "Determiner" tag (DET) has been changed from 'DT', in order to avoid confusion with the HTML tag, `<DT>`.
|
77
76
|
|
78
77
|
CC Conjunction, coordinating and, or
|
79
78
|
CD Adjective, cardinal number 3, fifteen
|
80
79
|
DET Determiner this, each, some
|
81
80
|
EX Pronoun, existential there there
|
82
|
-
FW Foreign words
|
81
|
+
FW Foreign words
|
83
82
|
IN Preposition / Conjunction for, of, although, that
|
84
83
|
JJ Adjective happy, bad
|
85
84
|
JJR Adjective, comparative happier, worse
|
@@ -111,7 +110,7 @@ The set of POS tags used here is a modified version of the Penn Treebank tagset.
|
|
111
110
|
WP Pronoun, question who, whoever
|
112
111
|
WPS Determiner, possessive & question whose
|
113
112
|
WRB Adverb, question when, how, however
|
114
|
-
|
113
|
+
|
115
114
|
PP Punctuation, sentence ender ., !, ?
|
116
115
|
PPC Punctuation, comma ,
|
117
116
|
PPD Punctuation, dollar sign $
|
@@ -121,30 +120,24 @@ The set of POS tags used here is a modified version of the Penn Treebank tagset.
|
|
121
120
|
LRB Punctuation, left bracket (, {, [
|
122
121
|
RRB Punctuation, right bracket ), }, ]
|
123
122
|
|
124
|
-
### Requirements
|
125
|
-
|
126
|
-
* [Hpricot](http://code.whytheluckystiff.net/hpricot/) (optional)
|
127
|
-
|
128
123
|
### Install
|
129
124
|
|
130
|
-
|
125
|
+
gem install engtagger
|
131
126
|
|
132
127
|
### Author
|
133
128
|
|
134
|
-
of this Ruby library
|
129
|
+
of this Ruby library
|
135
130
|
|
136
|
-
* Yoichiro Hasebe (yohasebe [at] gmail.com)
|
131
|
+
* Yoichiro Hasebe (yohasebe [at] gmail.com)
|
137
132
|
|
138
133
|
### Contributors
|
139
134
|
|
140
|
-
|
141
|
-
* Phil London
|
142
|
-
* Bazay (Baron Bloomer)
|
135
|
+
Many thanks to the collaborators listed in the right column of this GitHub page.
|
143
136
|
|
144
137
|
### Acknowledgement
|
145
138
|
|
146
139
|
This Ruby library is a direct port of Lingua::EN::Tagger available at CPAN.
|
147
|
-
The credit for the crucial part of its algorithm/design therefore goes to
|
140
|
+
The credit for the crucial part of its algorithm/design therefore goes to
|
148
141
|
Aaron Coburn, the author of the original Perl version.
|
149
142
|
|
150
143
|
### License
|
data/engtagger.gemspec
CHANGED
@@ -4,14 +4,16 @@ require File.expand_path('../lib/engtagger/version', __FILE__)
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
5
|
gem.authors = ["Yoichiro Hasebe"]
|
6
6
|
gem.email = ["yohasebe@gmail.com"]
|
7
|
-
gem.summary = %q{A probability based, corpus-trained English POS tagger}
|
8
|
-
gem.description = %q{A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values.}
|
9
|
-
gem.homepage = "http://github.com/yohasebe/engtagger"
|
7
|
+
gem.summary = %q{A probability based, corpus-trained English POS tagger}
|
8
|
+
gem.description = %q{A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values.}
|
9
|
+
gem.homepage = "http://github.com/yohasebe/engtagger"
|
10
10
|
|
11
11
|
gem.files = `git ls-files`.split($\)
|
12
12
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
13
13
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
14
14
|
gem.name = "engtagger"
|
15
15
|
gem.require_paths = ["lib"]
|
16
|
-
gem.version = EngTagger::VERSION
|
16
|
+
gem.version = EngTagger::VERSION
|
17
|
+
|
18
|
+
gem.add_runtime_dependency 'lru_redux'
|
17
19
|
end
|
data/lib/engtagger/porter.rb
CHANGED
@@ -12,7 +12,7 @@ module Stemmable
|
|
12
12
|
'ousness'=>'ous', 'aliti'=>'al',
|
13
13
|
'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log'
|
14
14
|
}
|
15
|
-
|
15
|
+
|
16
16
|
STEP_3_LIST = {
|
17
17
|
'icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic',
|
18
18
|
'ical'=>'ic', 'ful'=>'', 'ness'=>''
|
@@ -48,7 +48,7 @@ module Stemmable
|
|
48
48
|
ance |
|
49
49
|
ence |
|
50
50
|
er |
|
51
|
-
ic |
|
51
|
+
ic |
|
52
52
|
able |
|
53
53
|
ible |
|
54
54
|
ant |
|
@@ -88,30 +88,30 @@ module Stemmable
|
|
88
88
|
#
|
89
89
|
# Send comments to raypereda@hotmail.com
|
90
90
|
#
|
91
|
-
|
91
|
+
|
92
92
|
def stem_porter
|
93
93
|
|
94
94
|
# make a copy of the given object and convert it to a string.
|
95
95
|
w = self.dup.to_str
|
96
|
-
|
96
|
+
|
97
97
|
return w if w.length < 3
|
98
|
-
|
98
|
+
|
99
99
|
# now map initial y to Y so that the patterns never treat it as vowel
|
100
100
|
w[0] = 'Y' if w[0] == ?y
|
101
|
-
|
101
|
+
|
102
102
|
# Step 1a
|
103
103
|
if w =~ /(ss|i)es$/
|
104
104
|
w = $` + $1
|
105
|
-
elsif w =~ /([^s])s$/
|
105
|
+
elsif w =~ /([^s])s$/
|
106
106
|
w = $` + $1
|
107
107
|
end
|
108
108
|
|
109
109
|
# Step 1b
|
110
110
|
if w =~ /eed$/
|
111
|
-
w.chop! if $` =~ MGR0
|
111
|
+
w.chop! if $` =~ MGR0
|
112
112
|
elsif w =~ /(ed|ing)$/
|
113
113
|
stem = $`
|
114
|
-
if stem =~ VOWEL_IN_STEM
|
114
|
+
if stem =~ VOWEL_IN_STEM
|
115
115
|
w = stem
|
116
116
|
case w
|
117
117
|
when /(at|bl|iz)$/ then w << "e"
|
@@ -121,9 +121,9 @@ module Stemmable
|
|
121
121
|
end
|
122
122
|
end
|
123
123
|
|
124
|
-
if w =~ /y$/
|
124
|
+
if w =~ /y$/
|
125
125
|
stem = $`
|
126
|
-
w = stem + "i" if stem =~ VOWEL_IN_STEM
|
126
|
+
w = stem + "i" if stem =~ VOWEL_IN_STEM
|
127
127
|
end
|
128
128
|
|
129
129
|
# Step 2
|
@@ -159,7 +159,7 @@ module Stemmable
|
|
159
159
|
end
|
160
160
|
|
161
161
|
# Step 5
|
162
|
-
if w =~ /e$/
|
162
|
+
if w =~ /e$/
|
163
163
|
stem = $`
|
164
164
|
if (stem =~ MGR1) ||
|
165
165
|
(stem =~ MEQ1 && stem !~ /^#{CC}#{V}[^aeiouwxy]$/o)
|
data/lib/engtagger/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "0.
|
1
|
+
class EngTagger
|
2
|
+
VERSION = "0.3.2"
|
3
3
|
end
|
data/lib/engtagger.rb
CHANGED
@@ -1,32 +1,18 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# -*- coding: utf-8 -*-
|
3
3
|
|
4
|
-
$LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
|
5
4
|
require 'rubygems'
|
6
|
-
require '
|
7
|
-
require '
|
8
|
-
|
9
|
-
# use hpricot for extracting English text from docs with XML like tags
|
10
|
-
begin
|
11
|
-
require 'hpricot'
|
12
|
-
rescue LoadError
|
13
|
-
$no_hpricot = true
|
14
|
-
end
|
15
|
-
|
16
|
-
# File paths
|
17
|
-
$lexpath = File.join(File.dirname(__FILE__), 'engtagger')
|
18
|
-
$word_path = File.join($lexpath, "pos_words.hash")
|
19
|
-
$tag_path = File.join($lexpath, "pos_tags.hash")
|
5
|
+
require 'engtagger/porter'
|
6
|
+
require 'lru_redux'
|
20
7
|
|
21
|
-
|
22
|
-
|
23
|
-
def memoize(method)
|
8
|
+
module BoundedSpaceMemoizable
|
9
|
+
def memoize(method, max_cache_size=100000)
|
24
10
|
# alias_method is faster than define_method + old.bind(self).call
|
25
11
|
alias_method "__memoized__#{method}", method
|
26
12
|
module_eval <<-EOF
|
27
|
-
def #{method}(*a
|
28
|
-
#
|
29
|
-
|
13
|
+
def #{method}(*a)
|
14
|
+
@__memoized_#{method}_cache ||= LruRedux::Cache.new(#{max_cache_size})
|
15
|
+
@__memoized_#{method}_cache[a] ||= __memoized__#{method}(*a)
|
30
16
|
end
|
31
17
|
EOF
|
32
18
|
end
|
@@ -34,17 +20,29 @@ end
|
|
34
20
|
|
35
21
|
# English part-of-speech tagger class
|
36
22
|
class EngTagger
|
23
|
+
extend BoundedSpaceMemoizable
|
24
|
+
|
25
|
+
# File paths
|
26
|
+
DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), 'engtagger')
|
27
|
+
DEFAULT_WORDPATH = File.join(DEFAULT_LEXPATH, "pos_words.hash")
|
28
|
+
DEFAULT_TAGPATH = File.join(DEFAULT_LEXPATH, "pos_tags.hash")
|
37
29
|
|
38
30
|
#################
|
39
31
|
# Class methods #
|
40
32
|
#################
|
41
33
|
|
42
|
-
# Return a class variable that holds probability data
|
34
|
+
# Return a class variable that holds probability data.
|
35
|
+
#
|
36
|
+
# @return [Hash] the probability data
|
37
|
+
#
|
43
38
|
def self.hmm
|
44
39
|
return @@hmm
|
45
40
|
end
|
46
41
|
|
47
|
-
# Return a class variable that holds lexical data
|
42
|
+
# Return a class variable that holds lexical data.
|
43
|
+
#
|
44
|
+
# @return [Hash] the lexicon
|
45
|
+
#
|
48
46
|
def self.lexicon
|
49
47
|
return @@lexicon
|
50
48
|
end
|
@@ -88,7 +86,12 @@ class EngTagger
|
|
88
86
|
IN = get_ext('in')
|
89
87
|
|
90
88
|
# Convert a Treebank-style, abbreviated tag into verbose definitions
|
89
|
+
#
|
90
|
+
# @param tag [#to_s] the tag in question
|
91
|
+
# @return [String] the definition, if available
|
92
|
+
#
|
91
93
|
def self.explain_tag(tag)
|
94
|
+
tag = tag.to_s.downcase
|
92
95
|
if TAGS[tag]
|
93
96
|
return TAGS[tag]
|
94
97
|
else
|
@@ -143,7 +146,7 @@ class EngTagger
|
|
143
146
|
"PPS", "Punctuation, colon, semicolon, elipsis",
|
144
147
|
"LRB", "Punctuation, left bracket",
|
145
148
|
"RRB", "Punctuation, right bracket"
|
146
|
-
|
149
|
+
]
|
147
150
|
tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
|
148
151
|
tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
|
149
152
|
TAGS = Hash[*tags]
|
@@ -196,12 +199,12 @@ class EngTagger
|
|
196
199
|
@conf[:tag_lex] = 'tags.yml'
|
197
200
|
@conf[:word_lex] = 'words.yml'
|
198
201
|
@conf[:unknown_lex] = 'unknown.yml'
|
199
|
-
@conf[:word_path] =
|
200
|
-
@conf[:tag_path] =
|
202
|
+
@conf[:word_path] = DEFAULT_WORDPATH
|
203
|
+
@conf[:tag_path] = DEFAULT_TAGPATH
|
201
204
|
@conf[:debug] = false
|
202
205
|
# assuming that we start analyzing from the beginning of a new sentence...
|
203
206
|
@conf[:current_tag] = 'pp'
|
204
|
-
@conf.merge!(params)
|
207
|
+
@conf.merge!(params) if params
|
205
208
|
unless File.exist?(@conf[:word_path]) and File.exist?(@conf[:tag_path])
|
206
209
|
print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
|
207
210
|
@@hmm = Hash.new
|
@@ -221,6 +224,33 @@ class EngTagger
|
|
221
224
|
# Public methods #
|
222
225
|
##################
|
223
226
|
|
227
|
+
# Return an array of pairs of the form `["word", :tag]`.
|
228
|
+
#
|
229
|
+
# @param text [String] the input text
|
230
|
+
# @return [Array] the tagged words
|
231
|
+
#
|
232
|
+
def tag_pairs(text)
|
233
|
+
return [] unless valid_text(text)
|
234
|
+
|
235
|
+
out = clean_text(text).map do |word|
|
236
|
+
cleaned_word = clean_word word
|
237
|
+
tag = assign_tag(@conf[:current_tag], cleaned_word)
|
238
|
+
@conf[:current_tag] = tag = (tag and !tag.empty?) ? tag : 'nn'
|
239
|
+
[word, tag.to_sym]
|
240
|
+
end
|
241
|
+
|
242
|
+
# reset the tagger state
|
243
|
+
reset
|
244
|
+
|
245
|
+
out
|
246
|
+
end
|
247
|
+
|
248
|
+
# Examine the string provided and return it fully tagged in XML style.
|
249
|
+
#
|
250
|
+
# @param text [String] the input text
|
251
|
+
# @param verbose [false, true] whether to use verbose tags
|
252
|
+
# @return [String] the marked-up string
|
253
|
+
#
|
224
254
|
# Examine the string provided and return it fully tagged in XML style
|
225
255
|
def add_tags(text, verbose = false)
|
226
256
|
return nil unless valid_text(text)
|
@@ -260,10 +290,10 @@ class EngTagger
|
|
260
290
|
def get_readable(text, verbose = false)
|
261
291
|
return nil unless valid_text(text)
|
262
292
|
tagged = add_tags(text, verbose)
|
263
|
-
tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
|
293
|
+
tagged = tagged.gsub(/<\w+>([^<]+|[<\w>]+)<\/(\w+)>/o) do
|
294
|
+
#!!!# tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
|
264
295
|
$1 + '/' + $2.upcase
|
265
296
|
end
|
266
|
-
return tagged
|
267
297
|
end
|
268
298
|
|
269
299
|
# Return an array of sentences (without POS tags) from a text.
|
@@ -319,90 +349,151 @@ class EngTagger
|
|
319
349
|
|
320
350
|
# Given a POS-tagged text, this method returns all nouns and their
|
321
351
|
# occurrence frequencies.
|
352
|
+
#
|
353
|
+
# @param tagged [String] the tagged text
|
354
|
+
# @return [Hash] the hash of matches
|
355
|
+
#
|
322
356
|
def get_nouns(tagged)
|
323
357
|
return nil unless valid_text(tagged)
|
324
358
|
tags = [NN]
|
325
359
|
build_matches_hash(build_trimmed(tagged, tags))
|
326
360
|
end
|
327
361
|
|
328
|
-
# Returns all types of verbs and does not discriminate between the
|
329
|
-
#
|
362
|
+
# Returns all types of verbs and does not discriminate between the
|
363
|
+
# various kinds. Combines all other verb methods listed in this
|
364
|
+
# class.
|
365
|
+
#
|
366
|
+
# @param tagged [String] the tagged text
|
367
|
+
# @return [Hash] the hash of matches
|
368
|
+
#
|
330
369
|
def get_verbs(tagged)
|
331
370
|
return nil unless valid_text(tagged)
|
332
371
|
tags = [VB, VBD, VBG, PART, VBP, VBZ]
|
333
372
|
build_matches_hash(build_trimmed(tagged, tags))
|
334
373
|
end
|
335
374
|
|
375
|
+
#
|
376
|
+
# @param tagged [String] the tagged text
|
377
|
+
# @return [Hash] the hash of matches
|
378
|
+
#
|
379
|
+
|
336
380
|
def get_infinitive_verbs(tagged)
|
337
381
|
return nil unless valid_text(tagged)
|
338
382
|
tags = [VB]
|
339
383
|
build_matches_hash(build_trimmed(tagged, tags))
|
340
384
|
end
|
341
385
|
|
386
|
+
#
|
387
|
+
# @param tagged [String] the tagged text
|
388
|
+
# @return [Hash] the hash of matches
|
389
|
+
#
|
342
390
|
def get_past_tense_verbs(tagged)
|
343
391
|
return nil unless valid_text(tagged)
|
344
392
|
tags = [VBD]
|
345
393
|
build_matches_hash(build_trimmed(tagged, tags))
|
346
394
|
end
|
347
395
|
|
396
|
+
#
|
397
|
+
# @param tagged [String] the tagged text
|
398
|
+
# @return [Hash] the hash of matches
|
399
|
+
#
|
348
400
|
def get_gerund_verbs(tagged)
|
349
401
|
return nil unless valid_text(tagged)
|
350
402
|
tags = [VBG]
|
351
403
|
build_matches_hash(build_trimmed(tagged, tags))
|
352
404
|
end
|
353
405
|
|
406
|
+
#
|
407
|
+
# @param tagged [String] the tagged text
|
408
|
+
# @return [Hash] the hash of matches
|
409
|
+
#
|
354
410
|
def get_passive_verbs(tagged)
|
355
411
|
return nil unless valid_text(tagged)
|
356
412
|
tags = [PART]
|
357
413
|
build_matches_hash(build_trimmed(tagged, tags))
|
358
414
|
end
|
359
415
|
|
416
|
+
#
|
417
|
+
# @param tagged [String] the tagged text
|
418
|
+
# @return [Hash] the hash of matches
|
419
|
+
#
|
360
420
|
def get_base_present_verbs(tagged)
|
361
421
|
return nil unless valid_text(tagged)
|
362
422
|
tags = [VBP]
|
363
423
|
build_matches_hash(build_trimmed(tagged, tags))
|
364
424
|
end
|
365
425
|
|
426
|
+
#
|
427
|
+
# @param tagged [String] the tagged text
|
428
|
+
# @return [Hash] the hash of matches
|
429
|
+
#
|
366
430
|
def get_present_verbs(tagged)
|
367
431
|
return nil unless valid_text(tagged)
|
368
432
|
tags = [VBZ]
|
369
433
|
build_matches_hash(build_trimmed(tagged, tags))
|
370
434
|
end
|
371
435
|
|
436
|
+
#
|
437
|
+
# @param tagged [String] the tagged text
|
438
|
+
# @return [Hash] the hash of matches
|
439
|
+
#
|
372
440
|
def get_adjectives(tagged)
|
373
441
|
return nil unless valid_text(tagged)
|
374
442
|
tags = [JJ]
|
375
443
|
build_matches_hash(build_trimmed(tagged, tags))
|
376
444
|
end
|
377
445
|
|
446
|
+
#
|
447
|
+
# @param tagged [String] the tagged text
|
448
|
+
# @return [Hash] the hash of matches
|
449
|
+
#
|
378
450
|
def get_comparative_adjectives(tagged)
|
379
451
|
return nil unless valid_text(tagged)
|
380
452
|
tags = [JJR]
|
381
453
|
build_matches_hash(build_trimmed(tagged, tags))
|
382
454
|
end
|
383
455
|
|
456
|
+
#
|
457
|
+
# @param tagged [String] the tagged text
|
458
|
+
# @return [Hash] the hash of matches
|
459
|
+
#
|
384
460
|
def get_superlative_adjectives(tagged)
|
385
461
|
return nil unless valid_text(tagged)
|
386
462
|
tags = [JJS]
|
387
463
|
build_matches_hash(build_trimmed(tagged, tags))
|
388
464
|
end
|
389
465
|
|
466
|
+
#
|
467
|
+
# @param tagged [String] the tagged text
|
468
|
+
# @return [Hash] the hash of matches
|
469
|
+
#
|
390
470
|
def get_adverbs(tagged)
|
391
471
|
return nil unless valid_text(tagged)
|
392
472
|
tags = [RB, RBR, RBS, RP]
|
393
473
|
build_matches_hash(build_trimmed(tagged, tags))
|
394
474
|
end
|
395
475
|
|
476
|
+
#
|
477
|
+
# @param tagged [String] the tagged text
|
478
|
+
# @return [Hash] the hash of matches
|
479
|
+
#
|
396
480
|
def get_interrogatives(tagged)
|
397
481
|
return nil unless valid_text(tagged)
|
398
482
|
tags = [WRB, WDT, WP, WPS]
|
399
483
|
build_matches_hash(build_trimmed(tagged, tags))
|
400
484
|
end
|
401
|
-
|
485
|
+
|
486
|
+
# To be consistent with documentation's naming of 'interrogative'
|
487
|
+
# parts of speech as 'question'
|
402
488
|
alias_method :get_question_parts, :get_interrogatives
|
403
489
|
|
404
|
-
# Returns all types of conjunctions and does not discriminate
|
405
|
-
# E.g. coordinating, subordinating,
|
490
|
+
# Returns all types of conjunctions and does not discriminate
|
491
|
+
# between the various kinds. E.g. coordinating, subordinating,
|
492
|
+
# correlative...
|
493
|
+
#
|
494
|
+
# @param tagged [String] the tagged text
|
495
|
+
# @return [Hash] the hash of matches
|
496
|
+
#
|
406
497
|
def get_conjunctions(tagged)
|
407
498
|
return nil unless valid_text(tagged)
|
408
499
|
tags = [CC, IN]
|
@@ -410,7 +501,11 @@ class EngTagger
|
|
410
501
|
end
|
411
502
|
|
412
503
|
# Given a POS-tagged text, this method returns only the maximal noun phrases.
|
413
|
-
# May be called directly, but is also used by get_noun_phrases
|
504
|
+
# May be called directly, but is also used by `get_noun_phrases`.
|
505
|
+
#
|
506
|
+
# @param tagged [String] the tagged text
|
507
|
+
# @return [Hash] the hash of matches
|
508
|
+
#
|
414
509
|
def get_max_noun_phrases(tagged)
|
415
510
|
return nil unless valid_text(tagged)
|
416
511
|
tags = [@@mnp]
|
@@ -424,11 +519,15 @@ class EngTagger
|
|
424
519
|
end
|
425
520
|
|
426
521
|
# Similar to get_words, but requires a POS-tagged text as an argument.
|
522
|
+
#
|
523
|
+
# @param tagged [String] the tagged text
|
524
|
+
# @return [Hash] the hash of matches
|
525
|
+
#
|
427
526
|
def get_noun_phrases(tagged)
|
428
527
|
return nil unless valid_text(tagged)
|
429
528
|
found = Hash.new(0)
|
430
529
|
phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
|
431
|
-
|
530
|
+
scanned = tagged.scan(@@mnp)
|
432
531
|
# Find MNPs in the text, one sentence at a time
|
433
532
|
# Record and split if the phrase is extended by a (?:PREP|DET|NUM)
|
434
533
|
mn_phrases = []
|
@@ -437,9 +536,9 @@ class EngTagger
|
|
437
536
|
mn_phrases += m.split(phrase_ext)
|
438
537
|
end
|
439
538
|
mn_phrases.each do |mnp|
|
440
|
-
|
441
|
-
|
442
|
-
|
539
|
+
# Split the phrase into an array of words, and create a loop for each word,
|
540
|
+
# shortening the phrase by removing the word in the first position.
|
541
|
+
# Record the phrase and any single nouns that are found
|
443
542
|
words = mnp.split
|
444
543
|
words.length.times do |i|
|
445
544
|
found[words.join(' ')] += 1 if words.length > 1
|
@@ -484,7 +583,7 @@ class EngTagger
|
|
484
583
|
# Private methods #
|
485
584
|
###################
|
486
585
|
|
487
|
-
|
586
|
+
private
|
488
587
|
|
489
588
|
def build_trimmed(tagged, tags)
|
490
589
|
tags.map { |tag| tagged.scan(tag) }.flatten.map do |n|
|
@@ -554,17 +653,10 @@ class EngTagger
|
|
554
653
|
end
|
555
654
|
end
|
556
655
|
|
557
|
-
# Strip the provided text
|
558
|
-
# in preparation for tagging
|
656
|
+
# Strip the provided text and separate off any punctuation in preparation for tagging
|
559
657
|
def clean_text(text)
|
560
658
|
return false unless valid_text(text)
|
561
|
-
|
562
|
-
unless $no_hpricot
|
563
|
-
# Strip out any markup and convert entities to their proper form
|
564
|
-
cleaned_text = Hpricot(text).inner_text
|
565
|
-
else
|
566
|
-
cleaned_text = text
|
567
|
-
end
|
659
|
+
cleaned_text = text.encode('utf-8')
|
568
660
|
tokenized = []
|
569
661
|
# Tokenize the text (splitting on punctuation as you go)
|
570
662
|
cleaned_text.split(/\s+/).each do |line|
|
@@ -599,7 +691,8 @@ class EngTagger
|
|
599
691
|
end
|
600
692
|
words = Array.new
|
601
693
|
tokenized.each_with_index do |t, i|
|
602
|
-
if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and
|
694
|
+
if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and
|
695
|
+
tokenized[i] =~ /\A(.+)\.\z/
|
603
696
|
w = $1
|
604
697
|
# Don't separate the period off words that
|
605
698
|
# meet any of the following conditions:
|
@@ -607,7 +700,8 @@ class EngTagger
|
|
607
700
|
# 1. It is defined in one of the lists above
|
608
701
|
# 2. It is only one letter long: Alfred E. Sloan
|
609
702
|
# 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
|
610
|
-
unless abbr[w.downcase] or
|
703
|
+
unless abbr[w.downcase] or
|
704
|
+
[/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
|
611
705
|
words << w
|
612
706
|
words << '.'
|
613
707
|
next
|
@@ -641,7 +735,7 @@ class EngTagger
|
|
641
735
|
# Handle all other punctuation
|
642
736
|
text = text.gsub(/--+/o, " - ") # Convert and separate dashes
|
643
737
|
text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
|
644
|
-
text = text.gsub(/:/o, " :") # Shift colons off
|
738
|
+
text = text.gsub(/:/o, " : ") # Shift colons off
|
645
739
|
text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
|
646
740
|
text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
|
647
741
|
text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
|
@@ -718,8 +812,7 @@ class EngTagger
|
|
718
812
|
def classify_unknown_word(word)
|
719
813
|
if /[\(\{\[]/ =~ word # Left brackets
|
720
814
|
classified = "*LRB*"
|
721
|
-
elsif
|
722
|
-
/[\)\}\]]/ =~ word # Right brackets
|
815
|
+
elsif /[\)\}\]]/ =~ word # Right brackets
|
723
816
|
classified = "*RRB*"
|
724
817
|
elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
|
725
818
|
classified = "*NUM*"
|
@@ -763,28 +856,28 @@ class EngTagger
|
|
763
856
|
# from a POS-tagged text.
|
764
857
|
def get_max_noun_regex
|
765
858
|
regex = /
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
859
|
+
# optional number, gerund - adjective -participle
|
860
|
+
(?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
|
861
|
+
# Followed by one or more nouns
|
862
|
+
(?:#{NN})+
|
863
|
+
(?:
|
864
|
+
# Optional preposition, determinant, cardinal
|
865
|
+
(?:#{PREP})*(?:#{DET})?(?:#{NUM})?
|
866
|
+
# Optional gerund-adjective -participle
|
867
|
+
(?:#{GER}|#{ADJ}|#{PART})*
|
868
|
+
# one or more nouns
|
869
|
+
(?:#{NN})+
|
870
|
+
)*
|
871
|
+
/xo #/
|
872
|
+
return regex
|
780
873
|
end
|
781
874
|
|
782
875
|
# Load the 2-grams into a hash from YAML data: This is a naive (but fast)
|
783
876
|
# YAML data parser. It will load a YAML document with a collection of key:
|
784
877
|
# value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
|
785
878
|
# Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
|
786
|
-
def load_tags(lexicon)
|
787
|
-
path = File.join(
|
879
|
+
def load_tags(lexicon, lexpath = DEFAULT_LEXPATH)
|
880
|
+
path = File.join(lexpath, lexicon)
|
788
881
|
fh = File.open(path, 'r')
|
789
882
|
while line = fh.gets
|
790
883
|
/\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
|
@@ -806,8 +899,8 @@ class EngTagger
|
|
806
899
|
# YAML data parser. It will load a YAML document with a collection of key:
|
807
900
|
# value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
|
808
901
|
# Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
|
809
|
-
def load_words(lexicon)
|
810
|
-
path = File.join(
|
902
|
+
def load_words(lexicon, lexpath = DEFAULT_LEXPATH)
|
903
|
+
path = File.join(lexpath, lexicon)
|
811
904
|
fh = File.open(path, 'r')
|
812
905
|
while line = fh.gets
|
813
906
|
/\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
|
data/test/test_engtagger.rb
CHANGED
@@ -1,233 +1,246 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
@@
|
13
|
-
Lisa Raines
|
14
|
-
EOD
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
tagpath
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
def test_add_tags
|
40
|
-
assert_instance_of(String, @tagger.add_tags(@@untagged))
|
41
|
-
end
|
42
|
-
|
43
|
-
def test_assign_tag
|
44
|
-
models = []; tests = []
|
45
|
-
models += [@tagger.conf[:unknown_word_tag], "sym"]
|
46
|
-
tests += [["pp","-unknown-"], ["pp", "-sym-"]]
|
47
|
-
models.length.times do |i|
|
48
|
-
assert_equal(models[i],@tagger.assign_tag(*tests[i]))
|
49
|
-
end
|
50
|
-
tests = []
|
51
|
-
tests += [["vb","water"], ["nn", "runs"]]
|
52
|
-
models.length.times do |i|
|
53
|
-
result = @tagger.assign_tag(*tests[i])
|
54
|
-
assert(EngTagger.hmm.keys.index(result))
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
def
|
59
|
-
|
60
|
-
|
61
|
-
assert_equal(
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
end
|
68
|
-
|
69
|
-
def
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
end
|
85
|
-
|
86
|
-
def
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
result = @tagger.
|
97
|
-
|
98
|
-
end
|
99
|
-
|
100
|
-
def
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
expected_result =
|
120
|
-
result
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
@tagger.
|
156
|
-
|
157
|
-
@tagger.
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
def
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
models
|
185
|
-
models << ["
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
models
|
194
|
-
models << ["
|
195
|
-
|
196
|
-
models << ["
|
197
|
-
|
198
|
-
models
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
end
|
217
|
-
|
218
|
-
def
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
end
|
232
|
-
|
233
|
-
|
1
|
+
$ENGTAGGER_LIB = File.join(File.dirname(__FILE__), '..', 'lib')
|
2
|
+
$LOAD_PATH << $ENGTAGGER_LIB
|
3
|
+
require 'test/unit' unless defined? $ZENTEST and $ZENTEST
|
4
|
+
require 'engtagger'
|
5
|
+
|
6
|
+
class TestEngTagger < Test::Unit::TestCase
|
7
|
+
|
8
|
+
@@untagged =<<EOD
|
9
|
+
Lisa Raines, a lawyer and director of government relations for the Industrial Biotechnical Association, contends that a judge well-versed in patent law and the concerns of research-based industries would have ruled otherwise. And Judge Newman, a former patent lawyer, wrote in her dissent when the court denied a motion for a rehearing of the case by the full court, "The panel's judicial legislation has affected an important high-technological industry, without regard to the consequences for research and innovation or the public interest." Says Ms. Raines, "[The judgement] confirms our concern that the absence of patent lawyers on the court could prove troublesome."
|
10
|
+
EOD
|
11
|
+
|
12
|
+
@@tagged =<<EOD
|
13
|
+
<nnp>Lisa</nnp> <nnp>Raines</nnp> <ppc>,</ppc> <det>a</det> <nn>lawyer</nn> <cc>and</cc> <nn>director</nn> <in>of</in> <nn>government</nn> <nns>relations</nns> <in>for</in> <det>the</det> <nnp>Industrial</nnp> <nnp>Biotechnical</nnp> <nnp>Association</nnp> <ppc>,</ppc> <vbz>contends</vbz> <in>that</in> <det>a</det> <nn>judge</nn> <jj>well-versed</jj> <in>in</in> <nn>patent</nn> <nn>law</nn> <cc>and</cc> <det>the</det> <nns>concerns</nns> <in>of</in> <jj>research-based</jj> <nns>industries</nns> <md>would</md> <vb>have</vb> <vbn>ruled</vbn> <rb>otherwise</rb> <pp>.</pp>
|
14
|
+
EOD
|
15
|
+
|
16
|
+
# Testing class methods
|
17
|
+
|
18
|
+
# Creates a fresh tagger before each test and runs its installer when either
# lexicon file (located via the :tag_path / :word_path configuration entries,
# relative to $ENGTAGGER_LIB) is missing.
def setup
  @tagger = EngTagger.new
  lexicon_files = [:tag_path, :word_path].map do |key|
    File.join($ENGTAGGER_LIB, @tagger.conf[key])
  end
  @tagger.install unless lexicon_files.all? { |path| File.exist?(path) }
end
|
26
|
+
|
27
|
+
# Compares the pattern string with the value returned by EngTagger.get_ext.
#
# NOTE(review): the method name starts with "text_" instead of "test_", unlike
# every other test in this class, so it is probably never collected by the
# runner. The pattern also contains a stray "}" ("</cd}>"). Both look like
# typos -- confirm against EngTagger.get_ext before renaming/enabling.
def text_get_ext
  pattern = '<cd>[^<]+</cd}>\s*'
  actual = EngTagger.get_ext(pattern, "cd")
  assert_equal(pattern, actual)
end
|
31
|
+
|
32
|
+
# EngTagger.explain_tag should turn a short tag into its descriptive name.
def test_explain_tag
  { "nn" => "noun", "vb" => "verb_infinitive" }.each do |tag, description|
    assert_equal(description, EngTagger.explain_tag(tag))
  end
end
|
36
|
+
|
37
|
+
# Testing public instance methods
|
38
|
+
|
39
|
+
# add_tags on the untagged fixture should yield a String.
def test_add_tags
  tagged_text = @tagger.add_tags(@@untagged)
  assert_instance_of(String, tagged_text)
end
|
42
|
+
|
43
|
+
# Exercises assign_tag in two ways: the first pair of argument lists must
# produce exact tags, the second pair only has to produce a tag that is one
# of the keys of EngTagger.hmm.
def test_assign_tag
  expected_tags = [@tagger.conf[:unknown_word_tag], "sym"]
  exact_cases = [["pp", "-unknown-"], ["pp", "-sym-"]]
  expected_tags.zip(exact_cases).each do |expected, args|
    assert_equal(expected, @tagger.assign_tag(*args))
  end

  [["vb", "water"], ["nn", "runs"]].each do |args|
    assigned = @tagger.assign_tag(*args)
    assert(EngTagger.hmm.keys.index(assigned))
  end
end
|
57
|
+
|
58
|
+
# The private clean_text helper should split the sentence into these tokens.
def test_clean_text
  sentence = "I am 100.0% sure that Dr. Watson is too naive. I'm sorry."
  expected_tokens = %w[I am 100.0 % sure that Dr. Watson is too naive . I 'm sorry .]
  assert_equal(expected_tokens, @tagger.send(:clean_text, sentence))
end
|
63
|
+
|
64
|
+
# get_noun_phrases on the tagged fixture should return a Hash.
def test_get_noun_phrases
  assert_instance_of(Hash, @tagger.get_noun_phrases(@@tagged))
end
|
68
|
+
|
69
|
+
# get_nouns on the tagged fixture should return a Hash.
def test_get_nouns
  assert_instance_of(Hash, @tagger.get_nouns(@@tagged))
end
|
73
|
+
|
74
|
+
# get_verbs should count each verb found in the tagged fixture.
def test_get_verbs
  verb_counts = @tagger.get_verbs(@@tagged)
  assert_equal({ "have" => 1, "ruled" => 1, "contends" => 1 }, verb_counts)
end
|
79
|
+
|
80
|
+
# get_adverbs should count each adverb found in the tagged fixture.
def test_get_adverbs
  adverb_counts = @tagger.get_adverbs(@@tagged)
  assert_equal({ "otherwise" => 1 }, adverb_counts)
end
|
85
|
+
|
86
|
+
# get_interrogatives should count every wdt/wp/wrb-tagged word in the sample.
def test_get_interrogatives
  sample = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
  counts = @tagger.get_interrogatives(sample)
  assert_equal(
    { "when" => 1, "how" => 1, "Which" => 1, "whatever" => 1, "who" => 1, "whoever" => 1 },
    counts
  )
end
|
92
|
+
|
93
|
+
# get_question_parts should count every wdt/wp/wrb-tagged word in the sample.
def test_get_question_parts
  sample = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
  counts = @tagger.get_question_parts(sample)
  assert_equal(
    { "when" => 1, "how" => 1, "Which" => 1, "whatever" => 1, "who" => 1, "whoever" => 1 },
    counts
  )
end
|
99
|
+
|
100
|
+
# get_conjunctions should return these word counts for the tagged fixture.
def test_get_conjunctions
  counts = @tagger.get_conjunctions(@@tagged)
  assert_equal({ "and" => 2, "of" => 2, "for" => 1, "that" => 1, "in" => 1 }, counts)
end
|
105
|
+
|
106
|
+
# get_proper_nouns should return a Hash for a short tagged sentence.
def test_get_proper_nouns
  tagged_sentence = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
  assert_instance_of(Hash, @tagger.get_proper_nouns(tagged_sentence))
end
|
111
|
+
|
112
|
+
# Checks get_readable: it must return a String, and the word/TAG rendering
# must match the expected output for a plain sentence and for a sentence
# containing an angle-bracketed token.
def test_get_readable
  test = "I woke up to the sound of pouring rain."
  result = @tagger.get_readable(test)
  # Was `assert(String, result)`, which can never fail: String is truthy and
  # `result` was only being passed as the failure message.
  assert_instance_of(String, result)

  # The same sentence is deliberately run a second time, as in the original test.
  test = "I woke up to the sound of pouring rain."
  result = @tagger.get_readable(test)
  expected_result = "I/PRP woke/VBD up/RB to/TO the/DET sound/NN of/IN pouring/VBG rain/NN ./PP"
  assert_equal(expected_result, result)

  test = "I woke up with a <bad> word."
  result = @tagger.get_readable(test)
  expected_result = "I/PRP woke/VBD up/RB with/IN a/DET <bad>/NNP word/NN ./PP"
  assert_equal(expected_result, result)
end
|
126
|
+
|
127
|
+
|
128
|
+
# The untagged fixture is expected to be split into four sentences.
def test_get_sentences
  sentences = @tagger.get_sentences(@@untagged)
  assert_equal(4, sentences.length)
end
|
132
|
+
|
133
|
+
# get_words should return a Hash with :longest_noun_phrase set to 1 and to 10.
def test_get_words
  word_sets = [1, 10].map do |limit|
    @tagger.conf[:longest_noun_phrase] = limit
    @tagger.get_words(@@tagged)
  end
  word_sets.each { |words| assert_instance_of(Hash, words) }
end
|
141
|
+
|
142
|
+
# Testing private instance methods
|
143
|
+
|
144
|
+
# After the private reset call, :current_tag should read 'pp' again.
def test_reset
  settings = @tagger.conf
  settings[:current_tag] = 'nn'
  @tagger.send(:reset)
  current = @tagger.conf[:current_tag]
  assert_equal('pp', current)
end
|
149
|
+
|
150
|
+
|
151
|
+
# Each sample word should be mapped to the listed class by the private
# classify_unknown_word helper.
def test_classify_unknown_word
  samples = [
    ["{", "*LRB*"],
    ["123.4567", "*NUM*"],
    ["40th", "*ORD*"],
    ["GT-R", "-abr-"],
    ["extremely-high", "-hyp-adj-"],
    ["&&", "-sym-"],
    ["wikiing", "-ing-"],
    ["asefasdf", "-unknown-"]
  ]
  samples.each do |word, expected_class|
    assert_equal(expected_class, @tagger.send(:classify_unknown_word, word))
  end
end
|
161
|
+
|
162
|
+
|
163
|
+
# Each raw word should come back from the private clean_word helper as listed.
def test_clean_word
  samples = [
    ["1973.0820", "*NUM*"],
    ["Plays", "Plays"],
    ["Pleadingly", "pleadingly"]
  ]
  samples.each do |raw, cleaned|
    assert_equal(cleaned, @tagger.send(:clean_word, raw))
  end
end
|
173
|
+
|
174
|
+
# The private get_max_noun_phrases helper should return a Hash.
def test_get_max_noun_phrases
  assert_instance_of(Hash, @tagger.send(:get_max_noun_phrases, @@tagged))
end
|
178
|
+
|
179
|
+
# The private get_max_noun_regex helper should return a Regexp.
def test_get_max_noun_regex
  pattern = @tagger.send(:get_max_noun_regex)
  assert_instance_of(Regexp, pattern)
end
|
182
|
+
|
183
|
+
# Each input string should be split by the private split_punct helper into
# the token list paired with it.
def test_split_punct
  samples = [
    ["`test", ["`", "test"]],
    ["\"test", ["``", "test"]],
    ["'test", ["`", "test"]],
    ['"', ["''"]],
    ["test' ", ["test", "'"]],
    ["---test-----", ["-", "test", "-"]],
    ["test,test", ["test", ",", "test"]],
    ["123,456", ["123,456"]],
    ["test:test", ["test", ":", "test"]],
    ["123:456", ["123", ":", "456"]],
    ["test1...test2", ["test1", "...", "test2"]],
    ["{ab[(c)[d]]}", ["{", "ab", "[", "(", "c", ")", "[", "d", "]", "]", "}"]],
    ["test#test", ["test", "#", "test"]],
    ["I'd like", ["I", "'d", "like"]],
    ["isn't so", ["is", "n't", "so"]],
    ["we're all", ["we", "'re", "all"]]
  ]

  samples.each do |text, expected_tokens|
    assert_equal(expected_tokens, @tagger.send(:split_punct, text))
  end
end
|
206
|
+
|
207
|
+
# The private split_sentences helper should detach sentence-final periods
# while leaving "u.s." and "Mr." intact.
def test_split_sentences
  samples = [
    [%w[He is a u.s. army officer.], %w[He is a u.s. army officer .]],
    [%w[He is Mr. Johnson. He 's my friend.], %w[He is Mr. Johnson . He 's my friend .]]
  ]
  samples.each do |tokens, expected|
    assert_equal(expected, @tagger.send(:split_sentences, tokens))
  end
end
|
217
|
+
|
218
|
+
# With :stem switched on, "gets" should stem to "get". The previous :stem
# setting is put back afterwards.
def test_stem
  previous_setting = @tagger.conf[:stem]
  @tagger.conf[:stem] = true
  assert_equal("get", @tagger.stem("gets"))
  # Turning :stem off again and expecting "gets" is intentionally not tested:
  # the stem method is memoized, so that check should not work.
  #   @tagger.conf[:stem] = false
  #   assert_equal("gets", @tagger.stem(word))
  @tagger.conf[:stem] = previous_setting
end
|
228
|
+
|
229
|
+
# The private strip_tags helper should return a String for the tagged fixture.
def test_strip_tags
  stripped = @tagger.send(:strip_tags, @@tagged)
  assert_instance_of(String, stripped)
end
|
232
|
+
|
233
|
+
# The private valid_text helper should reject nil and the empty string and
# accept ordinary text.
def test_valid_text
  samples = [
    [nil, false],
    ["this is test text", true],
    ["", false]
  ]
  samples.each do |text, should_be_valid|
    outcome = @tagger.send(:valid_text, text)
    assert(should_be_valid ? outcome : !outcome)
  end
end
|
241
|
+
|
242
|
+
# A configuration value passed to EngTagger.new should show up in conf.
def test_override_default_params
  @tagger = EngTagger.new(longest_noun_phrase: 3)
  configured_limit = @tagger.conf[:longest_noun_phrase]
  assert_equal(3, configured_limit)
end
|
246
|
+
end
|
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: engtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
12
|
-
dependencies:
|
11
|
+
date: 2022-08-05 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: lru_redux
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
13
27
|
description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
|
14
28
|
tagger that assigns POS tags to English text based on a lookup dictionary and a
|
15
29
|
set of probability values.
|
@@ -20,6 +34,7 @@ extensions: []
|
|
20
34
|
extra_rdoc_files: []
|
21
35
|
files:
|
22
36
|
- ".gitignore"
|
37
|
+
- ".yardopts"
|
23
38
|
- Gemfile
|
24
39
|
- LICENSE
|
25
40
|
- README.md
|