engtagger 0.3.2 → 0.4.0
- checksums.yaml +4 -4
- data/.rubocop.yml +75 -0
- data/.solargraph.yml +22 -0
- data/Gemfile +6 -2
- data/README.md +33 -31
- data/Rakefile +9 -1
- data/engtagger.gemspec +13 -10
- data/lib/engtagger/porter.rb +38 -60
- data/lib/engtagger/version.rb +3 -1
- data/lib/engtagger.rb +220 -206
- metadata +9 -8
- data/test/test_engtagger.rb +0 -246
data/lib/engtagger.rb
CHANGED
@@ -1,20 +1,21 @@
 #!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
 
-
-
-require
+# frozen_string_literal: true
+
+require "rubygems"
+require "lru_redux"
+require_relative "engtagger/porter"
 
 module BoundedSpaceMemoizable
-  def memoize(method, max_cache_size=
+  def memoize(method, max_cache_size = 100_000)
     # alias_method is faster than define_method + old.bind(self).call
     alias_method "__memoized__#{method}", method
-    module_eval <<-
+    module_eval <<-MODEV
       def #{method}(*a)
        @__memoized_#{method}_cache ||= LruRedux::Cache.new(#{max_cache_size})
        @__memoized_#{method}_cache[a] ||= __memoized__#{method}(*a)
      end
-
+    MODEV
   end
 end
 
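The hunk above caps the memoization cache at 100_000 entries via lru_redux's LRU cache, so long-running taggers no longer grow without bound. A minimal sketch of how the module is used, assuming BoundedSpaceMemoizable from above is already loaded (the Slow class and its double method are hypothetical):

  require "lru_redux"

  class Slow
    def double(n)
      sleep 0.1 # stand-in for an expensive computation
      n * 2
    end
    extend BoundedSpaceMemoizable
    memoize("double") # must follow the def, since memoize aliases the existing method
  end

  s = Slow.new
  s.double(21) # slow on the first call; afterwards served from an LruRedux::Cache
               # keyed on the argument array, evicting least-recently-used entries

This is exactly the pattern EngTagger itself applies to stem and assign_tag at the bottom of the file.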
@@ -23,7 +24,7 @@ class EngTagger
   extend BoundedSpaceMemoizable
 
   # File paths
-  DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), 'engtagger')
+  DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), "engtagger")
   DEFAULT_WORDPATH = File.join(DEFAULT_LEXPATH, "pos_words.hash")
   DEFAULT_TAGPATH = File.join(DEFAULT_LEXPATH, "pos_tags.hash")
 
@@ -36,7 +37,7 @@ class EngTagger
   # @return [Hash] the probability data
   #
   def self.hmm
-    return @@hmm
+    @@hmm
   end
 
   # Return a class variable that holds lexical data.
@@ -44,46 +45,47 @@ class EngTagger
   # @return [Hash] the lexicon
   #
   def self.lexicon
-    return @@lexicon
+    @@lexicon
   end
 
   # Return a regexp from a string argument that matches an XML-style pos tag
   def self.get_ext(tag = nil)
     return nil unless tag
-    return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
+
+    Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
   end
 
   # Regexps to match XML-style part-of-speech tags
-  NUM = get_ext('cd')
-  GER = get_ext('vbg')
-  ADJ = get_ext('jj[rs]*')
-  NN = get_ext('nn[sp]*')
-  NNP = get_ext('nnp')
-  PREP = get_ext('in')
-  DET = get_ext('det')
-  PAREN = get_ext('[lr]rb')
-  QUOT = get_ext('ppr')
-  SEN = get_ext('pp')
-  WORD = get_ext('\w+')
-  VB = get_ext('vb')
-  VBG = get_ext('vbg')
-  VBD = get_ext('vbd')
-  PART = get_ext('vbn')
-  VBP = get_ext('vbp')
-  VBZ = get_ext('vbz')
-  JJ = get_ext('jj')
-  JJR = get_ext('jjr')
-  JJS = get_ext('jjs')
-  RB = get_ext('rb')
-  RBR = get_ext('rbr')
-  RBS = get_ext('rbs')
-  RP = get_ext('rp')
-  WRB = get_ext('wrb')
-  WDT = get_ext('wdt')
-  WP = get_ext('wp')
-  WPS = get_ext('wps')
-  CC = get_ext('cc')
-  IN = get_ext('in')
+  NUM = get_ext("cd")
+  GER = get_ext("vbg")
+  ADJ = get_ext("jj[rs]*")
+  NN = get_ext("nn[sp]*")
+  NNP = get_ext("nnp")
+  PREP = get_ext("in")
+  DET = get_ext("det")
+  PAREN = get_ext("[lr]rb")
+  QUOT = get_ext("ppr")
+  SEN = get_ext("pp")
+  WORD = get_ext("\w+")
+  VB = get_ext("vb")
+  VBG = get_ext("vbg")
+  VBD = get_ext("vbd")
+  PART = get_ext("vbn")
+  VBP = get_ext("vbp")
+  VBZ = get_ext("vbz")
+  JJ = get_ext("jj")
+  JJR = get_ext("jjr")
+  JJS = get_ext("jjs")
+  RB = get_ext("rb")
+  RBR = get_ext("rbr")
+  RBS = get_ext("rbs")
+  RP = get_ext("rp")
+  WRB = get_ext("wrb")
+  WDT = get_ext("wdt")
+  WP = get_ext("wp")
+  WPS = get_ext("wps")
+  CC = get_ext("cc")
+  IN = get_ext("in")
 
   # Convert a Treebank-style, abbreviated tag into verbose definitions
   #
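Only the quoting changed in the constants block: each one is still the compiled form of get_ext over the tagger's XML-style output, so NN matches <nn>…</nn> tokens (with the optional s/p suffix). A quick sketch (the inspect output is approximate):

  require "engtagger"

  EngTagger::NN # ~ /<nn[sp]*>[^<]+<\/nn[sp]*>\s*/
  "the <nn>cat</nn> sat".scan(EngTagger::NN) # => ["<nn>cat</nn> "]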
@@ -92,11 +94,7 @@ class EngTagger
   #
   def self.explain_tag(tag)
     tag = tag.to_s.downcase
-    if TAGS[tag]
-      return TAGS[tag]
-    else
-      return tag
-    end
+    TAGS[tag] || tag
   end
 
   # The folloging is to make a hash to convert a pos tag to its definition
@@ -147,8 +145,8 @@ class EngTagger
     "LRB", "Punctuation, left bracket",
     "RRB", "Punctuation, right bracket"
   ]
-  tags = tags.collect{|t| t.downcase.gsub(/[.,'\-\s]+/, '_')}
-  tags = tags.collect{|t| t.gsub(/&/, 'and').gsub(/\//, 'or')}
+  tags = tags.collect { |t| t.downcase.gsub(/[.,'\-\s]+/, "_") }
+  tags = tags.collect { |t| t.gsub("&", "and").gsub("/", "or") }
   TAGS = Hash[*tags]
 
   # Hash storing config values:
@@ -190,30 +188,30 @@ class EngTagger
   # Take a hash of parameters that override default values.
   # See above for details.
   def initialize(params = {})
-    @conf = Hash.new
-    @conf[:unknown_word_tag] = ''
+    @conf = {}
+    @conf[:unknown_word_tag] = ""
     @conf[:stem] = false
     @conf[:weight_noun_phrases] = false
     @conf[:longest_noun_phrase] = 5
     @conf[:relax] = false
-    @conf[:tag_lex] = 'tags.yml'
-    @conf[:word_lex] = 'words.yml'
-    @conf[:unknown_lex] = 'unknown.yml'
+    @conf[:tag_lex] = "tags.yml"
+    @conf[:word_lex] = "words.yml"
+    @conf[:unknown_lex] = "unknown.yml"
     @conf[:word_path] = DEFAULT_WORDPATH
     @conf[:tag_path] = DEFAULT_TAGPATH
     @conf[:debug] = false
     # assuming that we start analyzing from the beginninga new sentence...
-    @conf[:current_tag] = 'pp'
+    @conf[:current_tag] = "pp"
     @conf.merge!(params) if params
-
+
+    if !File.exist?(@conf[:word_path]) || !File.exist?(@conf[:tag_path])
       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
-      @@hmm = Hash.new
-      @@lexicon = Hash.new
+      @@hmm = {}
+      @@lexicon = {}
     else
-      lexf = File.open(@conf[:word_path], 'r')
+      lexf = File.open(@conf[:word_path], "r")
       @@lexicon = Marshal.load(lexf)
       lexf.close
-      hmmf = File.open(@conf[:tag_path], 'r')
+      hmmf = File.open(@conf[:tag_path], "r")
       @@hmm = Marshal.load(hmmf)
       hmmf.close
     end
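initialize takes the same configuration hash as in 0.3.2; only the literals were re-quoted. For example:

  require "engtagger"

  tgr = EngTagger.new(relax: true, longest_noun_phrase: 3)
  # :relax               - also consider open POS classes for words missing from the lexicon
  # :longest_noun_phrase - cap (in words) used by get_words/get_noun_phrases
  # :stem, :weight_noun_phrases, :debug, etc. default as listed above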
@@ -235,7 +233,7 @@ class EngTagger
     out = clean_text(text).map do |word|
       cleaned_word = clean_word word
       tag = assign_tag(@conf[:current_tag], cleaned_word)
-      @conf[:current_tag] = tag =
+      @conf[:current_tag] = tag = tag && !tag.empty? ? tag : "nn"
       [word, tag.to_sym]
     end
 
@@ -254,18 +252,18 @@ class EngTagger
   # Examine the string provided and return it fully tagged in XML style
   def add_tags(text, verbose = false)
     return nil unless valid_text(text)
+
     tagged = []
     words = clean_text(text)
-    tags = Array.new
     words.each do |word|
       cleaned_word = clean_word(word)
       tag = assign_tag(@conf[:current_tag], cleaned_word)
-      @conf[:current_tag] = tag =
+      @conf[:current_tag] = tag = tag && tag != "" ? tag : "nn"
       tag = EngTagger.explain_tag(tag) if verbose
-      tagged <<
+      tagged << "<#{tag}>#{word}</#{tag}>"
     end
     reset
-    return tagged.join(' ')
+    tagged.join(" ")
   end
 
   # Given a text string, return as many nouns and noun phrases as possible.
@@ -277,11 +275,12 @@ class EngTagger
   #
   def get_words(text)
     return false unless valid_text(text)
+
     tagged = add_tags(text)
-    if
-      return get_nouns(tagged)
+    if @conf[:longest_noun_phrase] <= 1
+      get_nouns(tagged)
     else
-      return get_noun_phrases(tagged)
+      get_noun_phrases(tagged)
     end
   end
 
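Typical calls into the two entry points above (tag strings depend on the shipped lexicon; output is illustrative):

  tgr = EngTagger.new
  tgr.add_tags("The dog runs")  # => "<det>The</det> <nn>dog</nn> <vbz>runs</vbz>" (roughly)
  tgr.get_words("The dog runs") # noun or noun-phrase counts, switched on :longest_noun_phrase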
@@ -289,29 +288,29 @@ class EngTagger
   # Applies add_tags and reformats to be easier to read.
   def get_readable(text, verbose = false)
     return nil unless valid_text(text)
+
     tagged = add_tags(text, verbose)
-    tagged
-
-      $1 + '/' + $2.upcase
+    tagged.gsub(%r{<\w+>([^<]+|[<\w>]+)</(\w+)>}o) do
+      "#{$1}/#{$2.upcase}"
     end
   end
 
   # Return an array of sentences (without POS tags) from a text.
   def get_sentences(text)
     return nil unless valid_text(text)
+
     tagged = add_tags(text)
-    sentences = Array.new
-    tagged.split(/<\/pp>/).each do |line|
+    sentences = []
+    tagged.split(%r{</pp>}).each do |line|
       sentences << strip_tags(line)
     end
     sentences = sentences.map do |sentence|
-      sentence.gsub(Regexp.new(" ('s?) ")){$1 + " "}
-      sentence.gsub(Regexp.new(" (\W+) ")){$1 + " "}
-      sentence.gsub(Regexp.new(" (`+) ")){" " + $1}
-      sentence.gsub(Regexp.new(" (\W+)$")){$1}
-      sentence.gsub(Regexp.new("^(`+) ")){$1}
+      sentence.gsub(Regexp.new(" ('s?) ")) { $1 + " " }
+      sentence.gsub(Regexp.new(" (\W+) ")) { $1 + " " }
+      sentence.gsub(Regexp.new(" (`+) ")) { " " + $1 }
+      sentence.gsub(Regexp.new(" (\W+)$")) { $1 }
+      sentence.gsub(Regexp.new("^(`+) ")) { $1 }
     end
-    return sentences
   end
 
   # Given a POS-tagged text, this method returns a hash of all proper nouns
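Typical calls for the two reader-facing methods in the hunk just above (again illustrative; exact tags depend on the lexicon):

  tgr = EngTagger.new
  tgr.get_readable("The dog runs")            # => "The/DET dog/NN runs/VBZ" (roughly)
  tgr.get_sentences("Hi there. How are you?") # => array of detagged sentence strings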
@@ -321,30 +320,31 @@ class EngTagger
   # proper nouns. This method does not stem the found words.
   def get_proper_nouns(tagged)
     return nil unless valid_text(tagged)
+
     tags = [NNP]
     nnp = build_matches_hash(build_trimmed(tagged, tags))
     # Now for some fancy resolution stuff...
-    nnp.keys.each do |key|
+    nnp.each_key do |key|
       words = key.split(/\s/)
       # Let's say this is an organization's name --
       # (and it's got at least three words)
       # is there a corresponding acronym in this hash?
-      if words.length > 2
-        # Make a (naive) acronym out of this name
-        acronym = words.map do |word|
-          /\A([a-z])[a-z]*\z/ =~ word
-          $1
-        end.join " "
-        # If that acronym has been seen,
-        # remove it and add the values to
-        # the full name
-        if nnp[acronym]
-          nnp[key] += nnp[acronym]
-          nnp.delete(acronym)
-        end
+      next if words.length <= 2
+
+      # Make a (naive) acronym out of this name
+      acronym = words.map do |word|
+        /\A([a-z])[a-z]*\z/ =~ word
+        $1
+      end.join " "
+      # If that acronym has been seen,
+      # remove it and add the values to
+      # the full name
+      if nnp[acronym]
+        nnp[key] += nnp[acronym]
+        nnp.delete(acronym)
      end
    end
-    return nnp
+    nnp
  end
 
  # Given a POS-tagged text, this method returns all nouns and their
@@ -355,6 +355,7 @@ class EngTagger
   #
   def get_nouns(tagged)
     return nil unless valid_text(tagged)
+
     tags = [NN]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -368,6 +369,7 @@ class EngTagger
   #
   def get_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VB, VBD, VBG, PART, VBP, VBZ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -379,6 +381,7 @@ class EngTagger
 
   def get_infinitive_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VB]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -389,6 +392,7 @@ class EngTagger
   #
   def get_past_tense_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBD]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -399,6 +403,7 @@ class EngTagger
   #
   def get_gerund_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBG]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -409,6 +414,7 @@ class EngTagger
   #
   def get_passive_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [PART]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -419,6 +425,7 @@ class EngTagger
   #
   def get_base_present_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBP]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -429,6 +436,7 @@ class EngTagger
   #
   def get_present_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBZ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -439,6 +447,7 @@ class EngTagger
   #
   def get_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -449,6 +458,7 @@ class EngTagger
   #
   def get_comparative_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJR]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -459,6 +469,7 @@ class EngTagger
   #
   def get_superlative_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJS]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -469,6 +480,7 @@ class EngTagger
   #
   def get_adverbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [RB, RBR, RBS, RP]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -479,13 +491,14 @@ class EngTagger
   #
   def get_interrogatives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [WRB, WDT, WP, WPS]
     build_matches_hash(build_trimmed(tagged, tags))
   end
 
   # To be consistent with documentation's naming of 'interrogative'
   # parts of speech as 'question'
-  alias_method :get_question_parts, :get_interrogatives
+  alias get_question_parts get_interrogatives
 
   # Returns all types of conjunctions and does not discriminate
   # between the various kinds. E.g. coordinating, subordinating,
@@ -496,6 +509,7 @@ class EngTagger
   #
   def get_conjunctions(tagged)
     return nil unless valid_text(tagged)
+
     tags = [CC, IN]
     build_matches_hash(build_trimmed(tagged, tags))
   end
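Every extractor in this block has the same shape: take a POS-tagged string, pass a tag list through build_trimmed/build_matches_hash, and return a count hash. For example (counts and tags depend on the lexicon):

  tgr = EngTagger.new
  tagged = tgr.add_tags("The quick brown fox jumped quickly")
  tgr.get_nouns(tagged)            # => {"fox" => 1} (stemmed if :stem is set)
  tgr.get_adjectives(tagged)       # => {"quick" => 1, "brown" => 1}
  tgr.get_past_tense_verbs(tagged) # => {"jumped" => 1}
  tgr.get_adverbs(tagged)          # => {"quickly" => 1}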
@@ -508,14 +522,15 @@ class EngTagger
   #
   def get_max_noun_phrases(tagged)
     return nil unless valid_text(tagged)
+
     tags = [@@mnp]
     mn_phrases = build_trimmed(tagged, tags)
     ret = Hash.new(0)
     mn_phrases.each do |p|
-      p = stem(p) unless p =~ /\s/
+      p = stem(p) unless p =~ /\s/ # stem single words
       ret[p] += 1 unless p =~ /\A\s*\z/
     end
-    return ret
+    ret
   end
 
   # Similar to get_words, but requires a POS-tagged text as an argument.
@@ -525,9 +540,10 @@ class EngTagger
   #
   def get_noun_phrases(tagged)
     return nil unless valid_text(tagged)
+
     found = Hash.new(0)
     phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
-
+    scanned = tagged.scan(@@mnp)
     # Find MNPs in the text, one sentence at a time
     # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
     mn_phrases = []
@@ -540,14 +556,14 @@ class EngTagger
       # shortening the phrase by removing the word in the first position.
       # Record the phrase and any single nouns that are found
       words = mnp.split
-      words.length.times do
-        found[words.join(' ')] += 1 if words.length > 1
+      words.length.times do
+        found[words.join(" ")] += 1 if words.length > 1
        w = words.shift
        found[w] += 1 if w =~ /#{NN}/
      end
    end
    ret = Hash.new(0)
-    found.keys.each do |f|
+    found.each_key do |f|
      k = strip_tags(f)
      v = found[f]
      # We weight by the word count to favor long noun phrases
@@ -555,12 +571,13 @@ class EngTagger
       word_count = space_count.length + 1
       # Throttle MNPs if necessary
       next if word_count > @conf[:longest_noun_phrase]
-      k = stem(k) unless word_count > 1
+
+      k = stem(k) unless word_count > 1 # stem single words
      multiplier = 1
      multiplier = word_count if @conf[:weight_noun_phrases]
      ret[k] += multiplier * v
    end
-    return ret
+    ret
  end
 
  # Reads some included corpus data and saves it in a stored hash on the
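Noun-phrase extraction scans for maximal noun phrases (@@mnp), counts every sub-phrase, and optionally weights by word count when :weight_noun_phrases is on. An illustrative call:

  tgr = EngTagger.new
  tagged = tgr.add_tags("The prime minister of Japan gave a press conference")
  tgr.get_noun_phrases(tagged)     # => e.g. {"prime minister" => ..., "press conference" => ...}
  tgr.get_max_noun_phrases(tagged) # maximal matches only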
@@ -571,10 +588,10 @@ class EngTagger
     load_tags(@conf[:tag_lex])
     load_words(@conf[:word_lex])
     load_words(@conf[:unknown_lex])
-    File.open(@conf[:word_path], 'w') do |f|
+    File.open(@conf[:word_path], "w") do |f|
       Marshal.dump(@@lexicon, f)
     end
-    File.open(@conf[:tag_path], 'w') do |f|
+    File.open(@conf[:tag_path], "w") do |f|
       Marshal.dump(@@hmm, f)
     end
   end
@@ -596,6 +613,7 @@ class EngTagger
     trimmed.each do |n|
       n = stem(n)
       next unless n.length < 100 # sanity check on word length
+
       ret[n] += 1 unless n =~ /\A\s*\z/
     end
     ret
@@ -603,25 +621,24 @@ class EngTagger
 
   # Downcase the first letter of word
   def lcfirst(word)
-    word.split(//)[0].downcase + word.split(//)[1..-1].join
+    word.split(//)[0].downcase + word.split(//)[1..].join
   end
 
   # Upcase the first letter of word
   def ucfirst(word)
-    word.split(//)[0].upcase + word.split(//)[1..-1].join
+    word.split(//)[0].upcase + word.split(//)[1..].join
   end
 
   # Return the word stem as given by Stemmable module. This can be
   # turned off with the class parameter @conf[:stem] => false.
   def stem(word)
-    return word unless @conf[:stem]
-    return word.stem
+    @conf[:stem] ? word.stem : word
   end
 
   # This method will reset the preceeding tag to a sentence ender (PP).
   # This prepares the first word of a new sentence to be tagged correctly.
   def reset
-    @conf[:current_tag] = 'pp'
+    @conf[:current_tag] = "pp"
   end
 
   # Check whether the text is a valid string
@@ -629,41 +646,38 @@ class EngTagger
     if !text
       # there's nothing to parse
       "method call on uninitialized variable" if @conf[:debug]
-      return false
+      false
     elsif /\A\s*\z/ =~ text
       # text is an empty string, nothing to parse
-      return false
+      false
     else
       # $text is valid
-      return true
+      true
     end
   end
 
   # Return a text string with the part-of-speech tags removed
   def strip_tags(tagged, downcase = false)
     return nil unless valid_text(tagged)
+
     text = tagged.gsub(/<[^>]+>/m, "")
     text = text.gsub(/\s+/m, " ")
     text = text.gsub(/\A\s*/, "")
     text = text.gsub(/\s*\z/, "")
-    if downcase
-      return text.downcase
-    else
-      return text
-    end
+    downcase ? text.downcase : text
   end
 
   # Strip the provided text and separate off any punctuation in preparation for tagging
   def clean_text(text)
     return false unless valid_text(text)
-
+
+    cleaned_text = text.encode("utf-8")
     tokenized = []
     # Tokenize the text (splitting on punctuation as you go)
     cleaned_text.split(/\s+/).each do |line|
       tokenized += split_punct(line)
     end
-    words = split_sentences(tokenized)
-    return words
+    split_sentences(tokenized)
   end
 
   # This handles all of the trailing periods, keeping those that
@@ -672,27 +686,26 @@ class EngTagger
   # about the use of capitalization in the incoming text
   def split_sentences(array)
     tokenized = array
-    people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
-                supt det mssrs rev)
-    army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
-    inst = %w(dept univ assn bros ph.d)
-    place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
-               hwy hway la pde pd plz pl rd st tce)
-    comp = %w(mfg inc ltd co corp)
-    state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+    people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+                supt det mssrs rev]
+    army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
+    inst = %w[dept univ assn bros ph.d]
+    place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+               hwy hway la pde pd plz pl rd st tce]
+    comp = %w[mfg inc ltd co corp]
+    state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
               ind ia kans kan ken ky la me md is mass mich minn miss mo mont
               neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
-               va wash wis wisc wy wyo usafa alta man ont que sask yuk)
-    month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
-    misc = %w(vs etc no esp)
-    abbr = Hash.new
+               va wash wis wisc wy wyo usafa alta man ont que sask yuk]
+    month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
+    misc = %w[vs etc no esp]
+    abbr = {}
     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
       abbr[i] = true
     end
-    words = Array.new
-    tokenized.each_with_index do |t, i|
-      if tokenized[i + 1]
-        tokenized[i] =~ /\A(.+)\.\z/
+    words = []
+    tokenized.each_with_index do |_, i|
+      if tokenized[i + 1] && tokenized[i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
        w = $1
        # Don't separate the period off words that
        # meet any of the following conditions:
@@ -700,21 +713,20 @@ class EngTagger
       # 1. It is defined in one of the lists above
       # 2. It is only one letter long: Alfred E. Sloan
       # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
-      unless abbr[w.downcase]
-             [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
+      unless abbr[w.downcase] || [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
        words << w
-        words << '.'
+        words << "."
        next
      end
    end
    words << tokenized[i]
  end
  # If the final word ends in a period..
-  if words[-1]
+  if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
    words[-1] = $1
-    words.push '.'
+    words.push "."
  end
-  return words
+  words
 end
 
 # Separate punctuation from words, where appropriate. This leaves trailing
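The period test in split_sentences keeps a trailing dot attached when the token is a listed abbreviation, a single letter, or a letter-dot run. Restated on its own, with abbr as built in the hunk (tiny subset shown):

  abbr = { "dr" => true, "prof" => true }
  keeps_period = lambda do |w|
    abbr[w.downcase] || [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match?(w) }
  end
  keeps_period.call("Dr")    # => true  ("Dr." stays one token)
  keeps_period.call("U.S.A") # => true  (repeating letter-dot)
  keeps_period.call("ends")  # => false (the "." is split off as a sentence ender)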
@@ -722,39 +734,40 @@ class EngTagger
   def split_punct(text)
     # If there's no punctuation, return immediately
     return [text] if /\A\w+\z/ =~ text
+
     # Sanity checks
     text = text.gsub(/\W{10,}/o, " ")
 
     # Put quotes into a standard format
     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
     text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
-    text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+    text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? $1 + " ` " : " ` " } # Convert left quotes to `
     text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
-    text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
+    text = text.gsub(/(\w)'(?!')(?=\W|$)/o) { $1 + " ' " } # Separate right single quotes
 
     # Handle all other punctuation
     text = text.gsub(/--+/o, " - ") # Convert and separate dashes
     text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
     text = text.gsub(/:/o, " : ") # Shift semicolons off
-    text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
-    text = text.gsub(/([(\[{}\])])/o){" " + $1 + " "} # Shift off brackets
-    text = text.gsub(/([!?#$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
+    text = text.gsub(/(\.\.\.+)/o) { " " + $1 + " " } # Shift ellipses off
+    text = text.gsub(/([(\[{}\])])/o) { " " + $1 + " " } # Shift off brackets
+    text = text.gsub(/([!?#$%;~|])/o) { " " + $1 + " " } # Shift off other ``standard'' punctuation
 
     # English-specific contractions
-    text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2}
-    text = text.gsub(/n't\b/o, " n't")
-    text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1}
-    result = text.split(' ')
-    return result
+    text = text.gsub(/([A-Za-z])'([dms])\b/o) { $1 + " '" + $2 } # Separate off 'd 'm 's
+    text = text.gsub(/n't\b/o, " n't") # Separate off n't
+    text = text.gsub(/'(ve|ll|re)\b/o) { " '" + $1 } # Separate off 've, 'll, 're
+    text.split(" ")
   end
 
   # Given a preceding tag, assign a tag word. Called by the add_tags method.
   # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
   def assign_tag(prev_tag, word)
-    if word == "-unknown-"
+    case word
+    when "-unknown-"
       # classify unknown words accordingly
       return @conf[:unknown_word_tag]
-    elsif word == "-sym-"
+    when "-sym-"
       # If this is a symbol, tag it as a symbol
       return "sym"
     end
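The contraction rules split clitics into their own tokens, Penn Treebank-style. Two of them, taken straight from the hunk:

  "don't".gsub(/n't\b/, " n't")              # => "do n't"
  "I'll".gsub(/'(ve|ll|re)\b/) { " '" + $1 } # => "I 'll"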
@@ -765,13 +778,13 @@ class EngTagger
     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
     # which is used in most POS taggers
     best_tag = ""
-    t[prev_tag].keys.each do |tag|
+    t[prev_tag].each_key do |tag|
       # With @config[:relax] set, this method
       # will also include any `open classes' of POS tags
       pw = 0
       if w[tag]
         pw = w[tag]
-      elsif @conf[:relax]
+      elsif @conf[:relax] && tag =~ /\A(?:jj|nn|rb|vb)/
         pw = 0
       else
         next
@@ -786,7 +799,7 @@ class EngTagger
         best_tag = tag
       end
     end
-    return best_tag
+    best_tag
   end
 
   # This method determines whether a word should be considered in its
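In effect, assign_tag keeps, among the tags allowed after prev_tag, the one with the best combined score of transition and emission probability. A toy restatement with made-up numbers, t and w standing in for the @@hmm and lexicon lookups above:

  t = { "pp" => { "det" => 0.4, "nn" => 0.2 } } # p(tag | prev_tag), toy values
  w = { "det" => 0.9, "nn" => 0.05 }            # p(word | tag) for one word, toy values
  t["pp"].keys.max_by { |tag| t["pp"][tag] * (w[tag] || 0) } # => "det"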
@@ -796,13 +809,13 @@ class EngTagger
     lcf = lcfirst(word)
     # seen this word as it appears (lower or upper case)
     if @@lexicon[word]
-      return word
+      word
     elsif @@lexicon[lcf]
       # seen this word only as lower case
-      return lcf
+      lcf
     else
       # never seen this word. guess.
-      return classify_unknown_word(word)
+      classify_unknown_word(word)
     end
   end
 
@@ -810,52 +823,52 @@ class EngTagger
   # classes of words handled by a simple unknown word classification
   # metric. Called by the clean_word method.
   def classify_unknown_word(word)
-    if word =~ /[({\[]/ # Left brackets
-      classified = "*LRB*"
-    elsif word =~ /[)}\]]/ # Right brackets
-      classified = "*RRB*"
-    elsif word =~ /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ # Floating point number
-      classified = "*NUM*"
-    elsif word =~ /\A\d+[\d\/:-]+\d\z/ # Other number constructs
-      classified = "*NUM*"
-    elsif word =~ /\A-?\d+\w+\z/o # Ordinal number
-      classified = "*ORD*"
-    elsif word =~ /\A[A-Z][A-Z.-]*\z/o # Abbreviation (all caps)
-      classified = "-abr-"
-    elsif word =~ /\w-\w/o # Hyphenated word
+    case word
+    when /[({\[]/ # Left brackets
+      "*LRB*"
+    when /[)}\]]/ # Right brackets
+      "*RRB*"
+    when /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ # Floating point number
+      "*NUM*"
+    when %r{\A\d+[\d/:-]+\d\z} # Other number constructs
+      "*NUM*"
+    when /\A-?\d+\w+\z/o # Ordinal number
+      "*ORD*"
+    when /\A[A-Z][A-Z.-]*\z/o # Abbreviation (all caps)
+      "-abr-"
+    when /\w-\w/o # Hyphenated word
       /-([^-]+)\z/ =~ word
       h_suffix = $1
-      if h_suffix
+      if h_suffix && (@@lexicon[h_suffix] && @@lexicon[h_suffix]["jj"])
        # last part of this is defined as an adjective
-        classified = "-hyp-adj-"
+        "-hyp-adj-"
      else
        # last part of this is not defined as an adjective
-        classified = "-hyp-"
+        "-hyp-"
      end
-    elsif word =~ /\A\W+\z/o
-      classified = "-sym-" # Symbol
-    elsif word == ucfirst(word)
-      classified = "-cap-" # Capitalized word
-    elsif word =~ /ing\z/o
-      classified = "-ing-" # Ends in "ing"
-    elsif word =~ /s\z/o
-      classified = "-s-" # Ends in "s"
-    elsif word =~ /tion\z/o
-      classified = "-tion-" # Ends in "tion"
-    elsif word =~ /ly\z/o
-      classified = "-ly-" # Ends in "ly"
-    elsif word =~ /ed\z/o
-      classified = "-ed-" # Ends in "ed"
+    when /\A\W+\z/o
+      "-sym-" # Symbol
+    when ucfirst(word)
+      "-cap-" # Capitalized word
+    when /ing\z/o
+      "-ing-" # Ends in "ing"
+    when /s\z/o
+      "-s-" # Ends in "s"
+    when /tion\z/o
+      "-tion-" # Ends in "tion"
+    when /ly\z/o
+      "-ly-" # Ends in "ly"
+    when /ed\z/o
+      "-ed-" # Ends in "ed"
    else
-      classified = "-unknown-" # Completely unknown
+      "-unknown-" # Completely unknown
    end
-    return classified
  end
 
  # This returns a compiled regexp for extracting maximal noun phrases
  # from a POS-tagged text.
  def get_max_noun_regex
-    regex = /
+    /
    # optional number, gerund - adjective -participle
    (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
    # Followed by one or more nouns
@@ -868,8 +881,7 @@ class EngTagger
       # one or more nouns
       (?:#{NN})+
     )*
-    /xo
-    return regex
+    /xo
   end
 
   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
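Read top to bottom, the case in classify_unknown_word buckets unseen tokens into coarse classes, e.g. (illustrative inputs):

  "("        => "*LRB*"      left bracket
  "3.14"     => "*NUM*"      floating point number
  "42nd"     => "*ORD*"      ordinal
  "NASA"     => "-abr-"      all-caps abbreviation
  "well-lit" => "-hyp-adj-"  or "-hyp-", depending on the lexicon entry for "lit"
  "running"  => "-ing-"
  "Paris"    => "-cap-"      capitalized and otherwise unknown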
@@ -878,12 +890,13 @@ class EngTagger
   # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
   def load_tags(lexicon, lexpath = DEFAULT_LEXPATH)
     path = File.join(lexpath, lexicon)
-    fh = File.open(path, 'r')
-    while line = fh.gets
+    fh = File.open(path, "r")
+    while (line = fh.gets)
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-      next unless $1
-      key = $1
-      data = $2
+      next unless $1 && $2
+
+      key = $1
+      data = $2
       items = data.split(/,\s+/)
       pairs = {}
       items.each do |i|
@@ -901,12 +914,13 @@ class EngTagger
   # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
   def load_words(lexicon, lexpath = DEFAULT_LEXPATH)
     path = File.join(lexpath, lexicon)
-    fh = File.open(path, 'r')
-    while line = fh.gets
+    fh = File.open(path, "r")
+    while (line = fh.gets)
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-      next unless $1
-      key = $1
-      data = $2
+      next unless $1 && $2
+
+      key = $1
+      data = $2
       items = data.split(/,\s+/)
       pairs = {}
       items.each do |i|
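Both loaders expect one flat map per line, exactly as their comments say. So a tags.yml line such as

  det: { jj: 0.2, nn: 0.5, vb: 0.0002 }

is matched by /\A"?([^{"]+)"?: \{ (.*) \}/ into the key ("det") and the body string, which is then split on commas into the pairs hash used to build @@hmm; load_words does the same into @@lexicon with count values like { jj: 103, nn: 34, vb: 1 }.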
@@ -918,7 +932,7 @@ class EngTagger
     fh.close
   end
 
-  #memoize the stem and assign_tag methods
+  # memoize the stem and assign_tag methods
   memoize("stem")
   memoize("assign_tag")
 end
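End to end, none of this changes the public API; typical 0.4.0 usage matches 0.3.2 (output shapes are illustrative):

  require "engtagger"

  tgr = EngTagger.new
  tagged = tgr.add_tags("The fox jumped over the lazy dog")
  tgr.get_nouns(tagged)              # count hash of nouns
  tgr.get_readable("A scenic trail") # slash-tag rendering, e.g. "A/DET scenic/JJ trail/NN"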
|