engtagger 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -0
- data/lib/engtagger.rb +193 -236
- data/lib/engtagger/version.rb +1 -1
- data/test/test_engtagger.rb +33 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8a32003a9025611e547589fcc5e8d9dd906f6540
+  data.tar.gz: dbdf70f16444fe60178b1013b72a5155a2ae66fd
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 00da48be968ddb0b7d314df05a5ba6bc2c33376ae8d1c5fda5b60c6cd72b9b8d44b73a5c553d13b1972e199f0d0d7b4a8de957ee92a03c39216f4a87c86df8a2
+  data.tar.gz: '08842a7e26fd7579fada8c63d1af4e2176015c470a297f1147183bbeeb91e4d4c00b8062873e53d3c815ee4d19abb45ad11104f02974a1a9cbd17e2f9c25ab5c'
data/README.md
CHANGED
data/lib/engtagger.rb
CHANGED
@@ -18,7 +18,7 @@ $lexpath = File.join(File.dirname(__FILE__), 'engtagger')
 $word_path = File.join($lexpath, "pos_words.hash")
 $tag_path = File.join($lexpath, "pos_tags.hash")
 
-# for memoization (code snipet from http://eigenclass.org/hiki/bounded-space-memoization)
+# for memoization (code snipet from http://eigenclass.org/hiki/bounded-space-memoization)
 class Module
   def memoize(method)
     # alias_method is faster than define_method + old.bind(self).call
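The hunk above only touches the comment line; the body of Module#memoize is not shown in this diff. For readers unfamiliar with the trick it names, here is a minimal sketch of alias_method-based memoization, an illustration of the technique rather than the gem's exact code (memoize_sketch and its cache layout are hypothetical):

    class Module
      def memoize_sketch(method)
        original = "__unmemoized_#{method}__"  # stash the real implementation
        alias_method original, method
        cache = {}                             # results keyed by argument list
        define_method(method) do |*args|
          cache.key?(args) ? cache[args] : (cache[args] = send(original, *args))
        end
      end
    end

The eigenclass.org page linked in the comment additionally bounds the cache size; the gem memoizes stem and assign_tag this way at the bottom of the file.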
@@ -38,7 +38,7 @@ class EngTagger
   #################
   # Class methods #
   #################
-
+
   # Return a class variable that holds probability data
   def self.hmm
     return @@hmm
@@ -48,13 +48,13 @@ class EngTagger
   def self.lexicon
     return @@lexicon
   end
-
-  # Return a regexp from a string argument that matches an XML-style pos tag
+
+  # Return a regexp from a string argument that matches an XML-style pos tag
   def self.get_ext(tag = nil)
     return nil unless tag
     return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
   end
-
+
   # Regexps to match XML-style part-of-speech tags
   NUM = get_ext('cd')
   GER = get_ext('vbg')
@@ -70,22 +70,32 @@ class EngTagger
   VB = get_ext('vb')
   VBG = get_ext('vbg')
   VBD = get_ext('vbd')
-  PART = get_ext('vbn')
+  PART = get_ext('vbn')
   VBP = get_ext('vbp')
   VBZ = get_ext('vbz')
   JJ = get_ext('jj')
   JJR = get_ext('jjr')
   JJS = get_ext('jjs')
+  RB = get_ext('rb')
+  RBR = get_ext('rbr')
+  RBS = get_ext('rbs')
+  RP = get_ext('rp')
+  WRB = get_ext('wrb')
+  WDT = get_ext('wdt')
+  WP = get_ext('wp')
+  WPS = get_ext('wps')
+  CC = get_ext('cc')
+  IN = get_ext('in')
 
-  # Convert a Treebank-style, abbreviated tag into verbose definitions
+  # Convert a Treebank-style, abbreviated tag into verbose definitions
   def self.explain_tag(tag)
     if TAGS[tag]
       return TAGS[tag]
     else
       return tag
     end
-  end
-
+  end
+
   # The folloging is to make a hash to convert a pos tag to its definition
   # used by the explain_tag method
   tags = [
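All of the new constants go through the existing get_ext factory shown earlier in the diff, so RB, WRB, CC, and the rest are just precompiled regexps that each match one XML-style tagged token plus trailing whitespace. For instance, given the get_ext definition above:

    RB = Regexp.new("<rb>[^<]+</rb>\s*")        # what get_ext('rb') produces
    "<rb>otherwise</rb> <vbz>is</vbz>".scan(RB) # => ["<rb>otherwise</rb> "]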
@@ -132,35 +142,35 @@ class EngTagger
     "PPR", "Punctuation, quotation mark right",
     "PPS", "Punctuation, colon, semicolon, elipsis",
     "LRB", "Punctuation, left bracket",
-    "RRB", "Punctuation, right bracket"
+    "RRB", "Punctuation, right bracket"
   ]
   tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
   tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
   TAGS = Hash[*tags]
-
+
   # Hash storing config values:
   #
   # * :unknown_word_tag
   #   => (String) Tag to assign to unknown words
-  # * :stem
+  # * :stem
   #   => (Boolean) Stem single words using Porter module
   # * :weight_noun_phrases
-  #   => (Boolean) When returning occurrence counts for a noun phrase, multiply
+  #   => (Boolean) When returning occurrence counts for a noun phrase, multiply
   #      the valuethe number of words in the NP.
-  # * :longest_noun_phrase
-  #   => (Integer) Will ignore noun phrases longer than this threshold. This
+  # * :longest_noun_phrase
+  #   => (Integer) Will ignore noun phrases longer than this threshold. This
   #      affects only the get_words() and get_nouns() methods.
-  # * :relax
-  #   => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
+  # * :relax
+  #   => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
   #      uncommon words, particularly words used polysemously
   # * :tag_lex
-  #   => (String) Name of the YAML file containing a hash of adjacent part of
+  #   => (String) Name of the YAML file containing a hash of adjacent part of
   #      speech tags and the probability of each
   # * :word_lex
-  #   => (String) Name of the YAML file containing a hash of words and corresponding
+  #   => (String) Name of the YAML file containing a hash of words and corresponding
   #      parts of speech
   # * :unknown_lex
-  #   => (String) Name of the YAML file containing a hash of tags for unknown
+  #   => (String) Name of the YAML file containing a hash of tags for unknown
   #      words and corresponding parts of speech
   # * :tag_path
   #   => (String) Directory path of tag_lex
@@ -169,12 +179,12 @@ class EngTagger
   # * :debug
   #   => (Boolean) Print debug messages
   attr_accessor :conf
-
+
   ###############
   # Constructor #
   ###############
-
-  # Take a hash of parameters that override default values.
+
+  # Take a hash of parameters that override default values.
   # See above for details.
   def initialize(params = {})
     @conf = Hash.new
@@ -187,10 +197,10 @@ class EngTagger
     @conf[:word_lex] = 'words.yml'
     @conf[:unknown_lex] = 'unknown.yml'
     @conf[:word_path] = $word_path
-    @conf[:tag_path] = $tag_path
+    @conf[:tag_path] = $tag_path
     @conf[:debug] = false
     # assuming that we start analyzing from the beginninga new sentence...
-    @conf[:current_tag] = 'pp'
+    @conf[:current_tag] = 'pp'
     @conf.merge!(params)
     unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
@@ -206,11 +216,11 @@ class EngTagger
     end
     @@mnp = get_max_noun_regex
   end
-
+
   ##################
   # Public methods #
   ##################
-
+
   # Examine the string provided and return it fully tagged in XML style
   def add_tags(text, verbose = false)
     return nil unless valid_text(text)
@@ -222,15 +232,15 @@ class EngTagger
       tag = assign_tag(@conf[:current_tag], cleaned_word)
       @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
       tag = EngTagger.explain_tag(tag) if verbose
-      tagged << '<' + tag + '>' + word + '</' + tag + '>'
+      tagged << '<' + tag + '>' + word + '</' + tag + '>'
     end
     reset
     return tagged.join(' ')
   end
-
-  # Given a text string, return as many nouns and noun phrases as possible.
+
+  # Given a text string, return as many nouns and noun phrases as possible.
   # Applies add_tags and involves three stages:
-  #
+  #
   # * Tag the text
   # * Extract all the maximal noun phrases
   # * Recursively extract all noun phrases from the MNPs
@@ -244,19 +254,19 @@ class EngTagger
       return get_noun_phrases(tagged)
     end
   end
-
-  # Return an easy-on-the-eyes tagged version of a text string.
+
+  # Return an easy-on-the-eyes tagged version of a text string.
   # Applies add_tags and reformats to be easier to read.
   def get_readable(text, verbose = false)
     return nil unless valid_text(text)
     tagged = add_tags(text, verbose)
-    tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
+    tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
       $1 + '/' + $2.upcase
     end
     return tagged
   end
-
-  # Return an array of sentences (without POS tags) from a text.
+
+  # Return an array of sentences (without POS tags) from a text.
   def get_sentences(text)
     return nil unless valid_text(text)
     tagged = add_tags(text)
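For context on the get_readable hunk above: it rewrites each <tag>word</tag> pair produced by add_tags into word/TAG form via the gsub shown. An illustrative session (the exact tags assigned depend on the trained lexicon):

    tgr = EngTagger.new
    tgr.add_tags("The dog runs")     # e.g. "<det>The</det> <nn>dog</nn> <vbz>runs</vbz>"
    tgr.get_readable("The dog runs") # e.g. "The/DET dog/NN runs/VBZ"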
@@ -270,25 +280,19 @@ class EngTagger
       sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
       sentence.gsub(Regexp.new(" (\W+)$")){$1}
       sentence.gsub(Regexp.new("^(`+) ")){$1}
-    end
+    end
     return sentences
   end
-
+
   # Given a POS-tagged text, this method returns a hash of all proper nouns
   # and their occurrence frequencies. The method is greedy and will
   # return multi-word phrases, if possible, so it would find ``Linguistic
-  # Data Consortium'' as a single unit, rather than as three individual
-  # proper nouns. This method does not stem the found words.
+  # Data Consortium'' as a single unit, rather than as three individual
+  # proper nouns. This method does not stem the found words.
   def get_proper_nouns(tagged)
     return nil unless valid_text(tagged)
-    trimmed = tagged.scan(NNP).map do |n|
-      strip_tags(n)
-    end
-    nnp = Hash.new(0)
-    trimmed.each do |n|
-      next unless n.length < 100 # sanity check on word length
-      nnp[n] += 1 unless n =~ /\A\s*\z/
-    end
+    tags = [NNP]
+    nnp = build_matches_hash(build_trimmed(tagged, tags))
     # Now for some fancy resolution stuff...
     nnp.keys.each do |key|
       words = key.split(/\s/)
@@ -301,7 +305,7 @@ class EngTagger
         /\A([a-z])[a-z]*\z/ =~ word
         $1
       end.join ''
-      # If that acronym has been seen,
+      # If that acronym has been seen,
       # remove it and add the values to
       # the full name
       if nnp[acronym]
@@ -312,167 +316,105 @@ class EngTagger
     end
     return nnp
   end
-
-  # Given a POS-tagged text, this method returns all nouns and their
-  # occurrence frequencies.
+
+  # Given a POS-tagged text, this method returns all nouns and their
+  # occurrence frequencies.
   def get_nouns(tagged)
     return nil unless valid_text(tagged)
-    trimmed = tagged.scan(NN).map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [NN]
+    build_matches_hash(build_trimmed(tagged, tags))
+  end
+
+  # Returns all types of verbs and does not descriminate between the various kinds.
+  # Is the combination of all other verb methods listed in this class.
+  def get_verbs(tagged)
+    return nil unless valid_text(tagged)
+    tags = [VB, VBD, VBG, PART, VBP, VBZ]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
   def get_infinitive_verbs(tagged)
     return nil unless valid_text(tagged)
-    trimmed = tagged.scan(VB).map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [VB]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
   def get_past_tense_verbs(tagged)
     return nil unless valid_text(tagged)
-    trimmed = tagged.scan(VBD).map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [VBD]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
   def get_gerund_verbs(tagged)
     return nil unless valid_text(tagged)
-    trimmed = tagged.scan(VBG).map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [VBG]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
   def get_passive_verbs(tagged)
     return nil unless valid_text(tagged)
-    trimmed = tagged.scan(PART).map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [PART]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
-
   def get_base_present_verbs(tagged)
     return nil unless valid_text(tagged)
-    trimmed = tagged.scan(VBP).map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [VBP]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
   def get_present_verbs(tagged)
     return nil unless valid_text(tagged)
-    trimmed = tagged.scan(VBZ).map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [VBZ]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
   def get_adjectives(tagged)
     return nil unless valid_text(tagged)
-    trimmed = tagged.scan(JJ).map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
+    tags = [JJ]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
   def get_comparative_adjectives(tagged)
     return nil unless valid_text(tagged)
-    trimmed = tagged.scan(JJR).map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
-  end
+    tags = [JJR]
+    build_matches_hash(build_trimmed(tagged, tags))
+  end
 
   def get_superlative_adjectives(tagged)
     return nil unless valid_text(tagged)
-    trimmed = tagged.scan(JJS).map do |n|
-      strip_tags(n)
-    end
-    ret = Hash.new(0)
-    trimmed.each do |n|
-      n = stem(n)
-      next unless n.length < 100 # sanity check on word length
-      ret[n] += 1 unless n =~ /\A\s*\z/
-    end
-    return ret
-  end
+    tags = [JJS]
+    build_matches_hash(build_trimmed(tagged, tags))
+  end
+
+  def get_adverbs(tagged)
+    return nil unless valid_text(tagged)
+    tags = [RB, RBR, RBS, RP]
+    build_matches_hash(build_trimmed(tagged, tags))
+  end
+
+  def get_interrogatives(tagged)
+    return nil unless valid_text(tagged)
+    tags = [WRB, WDT, WP, WPS]
+    build_matches_hash(build_trimmed(tagged, tags))
+  end
+  # To be consistent with documentation's naming of 'interrogative' parts of speech as 'question'
+  alias_method :get_question_parts, :get_interrogatives
+
+  # Returns all types of conjunctions and does not discriminate between the various kinds.
+  # E.g. coordinating, subordinating, correlative...
+  def get_conjunctions(tagged)
+    return nil unless valid_text(tagged)
+    tags = [CC, IN]
+    build_matches_hash(build_trimmed(tagged, tags))
   end
 
   # Given a POS-tagged text, this method returns only the maximal noun phrases.
   # May be called directly, but is also used by get_noun_phrases
   def get_max_noun_phrases(tagged)
-    return unless valid_text(tagged)
-    mn_phrases = tagged.scan(@@mnp).map do |m|
-      strip_tags(m)
-    end
+    return nil unless valid_text(tagged)
+    tags = [@@mnp]
+    mn_phrases = build_trimmed(tagged, tags)
     ret = Hash.new(0)
     mn_phrases.each do |p|
       p = stem(p) unless p =~ /\s/ # stem single words
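Taken together, the hunk above replaces roughly ten copies of the same scan/strip/stem/count boilerplate with calls to two new private helpers (build_trimmed and build_matches_hash, defined later in this diff) and adds four public getters: get_verbs, get_adverbs, get_interrogatives (aliased as get_question_parts), and get_conjunctions. A hedged usage sketch; the exact hashes returned depend on the lexicon and configuration:

    require 'engtagger'

    tgr = EngTagger.new
    tagged = tgr.add_tags("Which dog otherwise ruled and contends?")
    tgr.get_verbs(tagged)          # e.g. {"ruled" => 1, "contends" => 1}
    tgr.get_adverbs(tagged)        # e.g. {"otherwise" => 1}
    tgr.get_interrogatives(tagged) # e.g. {"Which" => 1}
    tgr.get_conjunctions(tagged)   # e.g. {"and" => 1}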
@@ -495,9 +437,9 @@ class EngTagger
       mn_phrases += m.split(phrase_ext)
     end
     mn_phrases.each do |mnp|
-      # Split the phrase into an array of words, and create a loop for each word,
+      # Split the phrase into an array of words, and create a loop for each word,
       # shortening the phrase by removing the word in the first position.
-      # Record the phrase and any single nouns that are found
+      # Record the phrase and any single nouns that are found
       words = mnp.split
       words.length.times do |i|
         found[words.join(' ')] += 1 if words.length > 1
@@ -519,12 +461,12 @@ class EngTagger
       multiplier = word_count if @conf[:weight_noun_phrases]
       ret[k] += multiplier * v
     end
-    return ret
+    return ret
   end
-
-  # Reads some included corpus data and saves it in a stored hash on the
-  # local file system. This is called automatically if the tagger can't
-  # find the stored lexicon.
+
+  # Reads some included corpus data and saves it in a stored hash on the
+  # local file system. This is called automatically if the tagger can't
+  # find the stored lexicon.
   def install
     puts "Creating part-of-speech lexicon" if @conf[:debug]
     load_tags(@conf[:tag_lex])
@@ -542,7 +484,23 @@ class EngTagger
   # Private methods #
   ###################
 
-  :private
+  :private
+
+  def build_trimmed(tagged, tags)
+    tags.map { |tag| tagged.scan(tag) }.flatten.map do |n|
+      strip_tags(n)
+    end
+  end
+
+  def build_matches_hash(trimmed)
+    ret = Hash.new(0)
+    trimmed.each do |n|
+      n = stem(n)
+      next unless n.length < 100 # sanity check on word length
+      ret[n] += 1 unless n =~ /\A\s*\z/
+    end
+    ret
+  end
 
   # Downcase the first letter of word
   def lcfirst(word)
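These two helpers carry the whole refactor: build_trimmed scans the tagged text once per regexp in tags and strips the markup from each hit, and build_matches_hash stems each surviving token and tallies it into a count hash. A standalone sketch of the same pipeline with the tag stripping inlined (strip_tags and stem normally come from the class):

    text    = "<jj>big</jj> <jj>big</jj> <nn>dog</nn>"
    trimmed = [/<jj>[^<]+<\/jj>\s*/].map { |tag| text.scan(tag) }.flatten
                                    .map { |n| n.gsub(/<[^>]+>/, '').strip }
    counts  = Hash.new(0)
    trimmed.each { |n| counts[n] += 1 unless n =~ /\A\s*\z/ }
    counts # => {"big" => 2}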
@@ -552,8 +510,8 @@ class EngTagger
   # Upcase the first letter of word
   def ucfirst(word)
     word.split(//)[0].upcase + word.split(//)[1..-1].join
-  end
-
+  end
+
   # Return the word stem as given by Stemmable module. This can be
   # turned off with the class parameter @conf[:stem] => false.
   def stem(word)
@@ -561,8 +519,8 @@ class EngTagger
     return word.stem
   end
 
-  # This method will reset the preceeding tag to a sentence ender (PP).
-  # This prepares the first word of a new sentence to be tagged correctly.
+  # This method will reset the preceeding tag to a sentence ender (PP).
+  # This prepares the first word of a new sentence to be tagged correctly.
   def reset
     @conf[:current_tag] = 'pp'
   end
@@ -581,7 +539,7 @@ class EngTagger
       return true
     end
   end
-
+
   # Return a text string with the part-of-speech tags removed
   def strip_tags(tagged, downcase = false)
     return nil unless valid_text(tagged)
@@ -595,8 +553,8 @@ class EngTagger
       return text
     end
   end
-
-  # Strip the provided text of HTML-style tags and separate off any punctuation
+
+  # Strip the provided text of HTML-style tags and separate off any punctuation
   # in preparation for tagging
   def clean_text(text)
     return false unless valid_text(text)
@@ -615,27 +573,27 @@ class EngTagger
     words = split_sentences(tokenized)
     return words
   end
-
-  # This handles all of the trailing periods, keeping those that
+
+  # This handles all of the trailing periods, keeping those that
   # belong on abbreviations and removing those that seem to be
   # at the end of sentences. This method makes some assumptions
   # about the use of capitalization in the incoming text
   def split_sentences(array)
     tokenized = array
-    people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+    people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
                 supt det mssrs rev)
     army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
     inst = %w(dept univ assn bros ph.d)
-    place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+    place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
                hwy hway la pde pd plz pl rd st tce)
     comp = %w(mfg inc ltd co corp)
-    state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
-               ind ia kans kan ken ky la me md is mass mich minn miss mo mont
-               neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
+    state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+               ind ia kans kan ken ky la me md is mass mich minn miss mo mont
+               neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
                va wash wis wisc wy wyo usafa alta man ont que sask yuk)
     month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
     misc = %w(vs etc no esp)
-    abbr = Hash.new
+    abbr = Hash.new
     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
       abbr[i] = true
     end
@@ -643,11 +601,11 @@ class EngTagger
     tokenized.each_with_index do |t, i|
       if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
         w = $1
-        # Don't separate the period off words that
+        # Don't separate the period off words that
         # meet any of the following conditions:
         #
        # 1. It is defined in one of the lists above
-        # 2. It is only one letter long: Alfred E. Sloan
+        # 2. It is only one letter long: Alfred E. Sloan
         # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
         unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
           words << w
@@ -664,8 +622,8 @@ class EngTagger
     end
     return words
   end
-
-  # Separate punctuation from words, where appropriate. This leaves trailing
+
+  # Separate punctuation from words, where appropriate. This leaves trailing
   # periods in place to be dealt with later. Called by the clean_text method.
   def split_punct(text)
     # If there's no punctuation, return immediately
@@ -675,27 +633,27 @@ class EngTagger
 
     # Put quotes into a standard format
     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
-    text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
-    text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+    text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
+    text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
     text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
     text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
-
+
     # Handle all other punctuation
     text = text.gsub(/--+/o, " - ") # Convert and separate dashes
     text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
     text = text.gsub(/:/o, " :") # Shift semicolons off
-    text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
+    text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
     text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
     text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
 
     # English-specific contractions
     text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
-    text = text.gsub(/n't\b/o, " n't") # Separate off n't
+    text = text.gsub(/n't\b/o, " n't") # Separate off n't
     text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
     result = text.split(' ')
     return result
-  end
-
+  end
+
   # Given a preceding tag, assign a tag word. Called by the add_tags method.
   # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
   def assign_tag(prev_tag, word)
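The net effect of split_punct, per the substitutions above: quotes are normalized to `` and '', contractions such as 'm and n't become separate tokens, and trailing sentence periods are deliberately left attached for split_sentences to adjudicate. An illustrative call (send is used because the method sits in the private section):

    tgr.send(:split_punct, "I'm sorry.") # => ["I", "'m", "sorry."]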
@@ -709,7 +667,7 @@ class EngTagger
     best_so_far = 0
     w = @@lexicon[word]
     t = @@hmm
-
+
     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
     # which is used in most POS taggers
     best_tag = ""
@@ -724,9 +682,9 @@ class EngTagger
       else
         next
       end
-
-      # Bayesian logic:
-      # P = P( tag | prev_tag ) * P( tag | word )
+
+      # Bayesian logic:
+      # P = P( tag | prev_tag ) * P( tag | word )
       probability = t[prev_tag][tag] * (pw + 1)
       # Set the tag with maximal probability
       if probability > best_so_far
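A worked instance of the probability line above, with hypothetical numbers: if the transition table holds t['pp']['nn'] = 0.5 and the word's lexicon score for 'nn' gives pw = 3, then probability = 0.5 * (3 + 1) = 2.0. The (pw + 1) term keeps an unseen word from zeroing out the product, so the result is a relative score rather than a normalized probability:

    t  = { 'pp' => { 'nn' => 0.5, 'vb' => 0.1 } } # P(tag | prev_tag), hypothetical
    pw = 3                                        # this word's count as 'nn', hypothetical
    probability = t['pp']['nn'] * (pw + 1)        # => 2.0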
@@ -735,18 +693,18 @@ class EngTagger
       end
     end
     return best_tag
-  end
-
-  # This method determines whether a word should be considered in its
+  end
+
+  # This method determines whether a word should be considered in its
   # lower or upper case form. This is useful in considering proper nouns
-  # and words that begin sentences. Called by add_tags.
+  # and words that begin sentences. Called by add_tags.
   def clean_word(word)
     lcf = lcfirst(word)
     # seen this word as it appears (lower or upper case)
     if @@lexicon[word]
       return word
     elsif @@lexicon[lcf]
-      # seen this word only as lower case
+      # seen this word only as lower case
       return lcf
     else
       # never seen this word. guess.
@@ -754,13 +712,13 @@ class EngTagger
     end
   end
 
-  # This changes any word not appearing in the lexicon to identifiable
-  # classes of words handled by a simple unknown word classification
+  # This changes any word not appearing in the lexicon to identifiable
+  # classes of words handled by a simple unknown word classification
   # metric. Called by the clean_word method.
   def classify_unknown_word(word)
     if /[\(\{\[]/ =~ word # Left brackets
       classified = "*LRB*"
-    elsif
+    elsif
       /[\)\}\]]/ =~ word # Right brackets
       classified = "*RRB*"
     elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
@@ -800,31 +758,31 @@ class EngTagger
     end
     return classified
   end
-
-  # This returns a compiled regexp for extracting maximal noun phrases
+
+  # This returns a compiled regexp for extracting maximal noun phrases
   # from a POS-tagged text.
   def get_max_noun_regex
     regex = /
       # optional number, gerund - adjective -participle
      (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
       # Followed by one or more nouns
-      (?:#{NN})+
+      (?:#{NN})+
       (?:
         # Optional preposition, determinant, cardinal
-        (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
+        (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
         # Optional gerund-adjective -participle
-        (?:#{GER}|#{ADJ}|#{PART})*
+        (?:#{GER}|#{ADJ}|#{PART})*
         # one or more nouns
-        (?:#{NN})+
+        (?:#{NN})+
       )*
     /xo #/
     return regex
-  end
-
-  # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
-  # YAML data parser. It will load a YAML document with a collection of key:
-  # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
-  # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
+  end
+
+  # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+  # YAML data parser. It will load a YAML document with a collection of key:
+  # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
+  # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
   def load_tags(lexicon)
     path = File.join($lexpath, lexicon)
     fh = File.open(path, 'r')
@@ -837,17 +795,17 @@ class EngTagger
       pairs = {}
       items.each do |i|
         /([^:]+):\s*(.+)/ =~ i
-        pairs[$1] = $2.to_f
+        pairs[$1] = $2.to_f
       end
       @@hmm[key] = pairs
     end
     fh.close
   end
 
-  # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
-  # YAML data parser. It will load a YAML document with a collection of key:
-  # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
-  # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
+  # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+  # YAML data parser. It will load a YAML document with a collection of key:
+  # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
+  # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
   def load_words(lexicon)
     path = File.join($lexpath, lexicon)
     fh = File.open(path, 'r')
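The "naive YAML parser" in load_tags (and load_words below) expects one mapping per line, as the comments describe. A sketch of what a single line turns into, assuming the braces have been stripped and the entries split on commas before the regexp runs:

    items = ["jj: 0.2", "nn: 0.5", "vb: 0.0002"] # from "det: { jj: 0.2, nn: 0.5, vb: 0.0002 }"
    pairs = {}
    items.each do |i|
      /([^:]+):\s*(.+)/ =~ i
      pairs[$1] = $2.to_f
    end
    pairs # => {"jj" => 0.2, "nn" => 0.5, "vb" => 0.0002}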
@@ -860,15 +818,14 @@ class EngTagger
       pairs = {}
       items.each do |i|
         /([^:]+):\s*(.+)/ =~ i
-        pairs[$1] = $2.to_f
+        pairs[$1] = $2.to_f
       end
       @@lexicon[key] = pairs
     end
     fh.close
   end
-
-  #memoize the stem and assign_tag methods
+
+  #memoize the stem and assign_tag methods
   memoize("stem")
-  memoize("assign_tag")
+  memoize("assign_tag")
 end
-
data/lib/engtagger/version.rb
CHANGED
data/test/test_engtagger.rb
CHANGED
@@ -69,7 +69,7 @@ EOD
   def test_clean_text
     test = "<html><body>I am <b>100% sure</b> that Dr. Watson is too naive. I'm sorry.</body></html>"
     model = ["I","am","100","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."]
-    assert_equal(model, @tagger.clean_text(test))
+    assert_equal(model, @tagger.clean_text(test)) unless $no_hpricot
   end
 
   def test_clean_word
@@ -102,6 +102,38 @@ EOD
     assert_instance_of(Hash, result)
   end
 
+  def test_get_verbs
+    expected_result = { "have" => 1, "ruled" => 1, "contends" => 1 }
+    result = @tagger.get_verbs(@@tagged)
+    assert_equal(expected_result, result)
+  end
+
+  def test_get_adverbs
+    expected_result = { "otherwise" => 1 }
+    result = @tagger.get_adverbs(@@tagged)
+    assert_equal(expected_result, result)
+  end
+
+  def test_get_interrogatives
+    tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
+    expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
+    result = @tagger.get_interrogatives(tagged)
+    assert_equal(expected_result, result)
+  end
+
+  def test_get_question_parts
+    tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
+    expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
+    result = @tagger.get_question_parts(tagged)
+    assert_equal(expected_result, result)
+  end
+
+  def test_get_conjunctions
+    expected_result = { "and" => 2, "of" => 2, "for" => 1, "that" => 1, "in" => 1 }
+    result = @tagger.get_conjunctions(@@tagged)
+    assert_equal(expected_result, result)
+  end
+
   def test_get_proper_nouns
     test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
     result = @tagger.get_proper_nouns(test)
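The new tests cover each added getter, using the shared @@tagged fixture for verbs, adverbs, and conjunctions, and inline fixtures for the interrogatives. Assuming a checkout of the gem source, they should run with something like:

    ruby -Ilib test/test_engtagger.rb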
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: engtagger
 version: !ruby/object:Gem::Version
-  version: 0.2.
+  version: 0.2.1
 platform: ruby
 authors:
 - Yoichiro Hasebe
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2016-10-12 00:00:00.000000000 Z
 dependencies: []
 description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
   tagger that assigns POS tags to English text based on a lookup dictionary and a
@@ -53,7 +53,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.5.1
 signing_key:
 specification_version: 4
 summary: A probability based, corpus-trained English POS tagger