engtagger 0.3.2 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +72 -0
- data/.solargraph.yml +22 -0
- data/Gemfile +6 -2
- data/README.md +74 -42
- data/Rakefile +9 -1
- data/engtagger.gemspec +13 -10
- data/lib/engtagger/porter.rb +169 -192
- data/lib/engtagger/version.rb +3 -1
- data/lib/engtagger.rb +220 -206
- metadata +9 -8
- data/test/test_engtagger.rb +0 -246
data/lib/engtagger.rb
CHANGED
@@ -1,20 +1,21 @@
 #!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
 
-
-
-require
+# frozen_string_literal: true
+
+require "rubygems"
+require "lru_redux"
+require_relative "./engtagger/porter"
 
 module BoundedSpaceMemoizable
-  def memoize(method, max_cache_size=
+  def memoize(method, max_cache_size = 100_000)
     # alias_method is faster than define_method + old.bind(self).call
     alias_method "__memoized__#{method}", method
-    module_eval <<-
+    module_eval <<-MODEV
       def #{method}(*a)
        @__memoized_#{method}_cache ||= LruRedux::Cache.new(#{max_cache_size})
        @__memoized_#{method}_cache[a] ||= __memoized__#{method}(*a)
      end
-
+    MODEV
   end
 end
 
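The memoize above wraps an already-defined method in a bounded LRU cache: the original is preserved under an __memoized__ alias, and an eval'd replacement consults an LruRedux::Cache keyed on the argument array. A minimal sketch of a consumer — the class and method here are hypothetical, for illustration only:

    class Expensive
      extend BoundedSpaceMemoizable

      def slow_square(n)
        sleep 0.1 # stand-in for real work
        n * n
      end
      memoize("slow_square", 1_000) # cache at most 1,000 results
    end

    e = Expensive.new
    e.slow_square(12) # computed
    e.slow_square(12) # served from the per-method LRU cache

Because each cache is an LruRedux::Cache with fixed capacity, memory stays bounded no matter how many distinct argument lists are seen.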
@@ -23,7 +24,7 @@ class EngTagger
   extend BoundedSpaceMemoizable
 
   # File paths
-  DEFAULT_LEXPATH = File.join(File.dirname(__FILE__),
+  DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), "engtagger")
   DEFAULT_WORDPATH = File.join(DEFAULT_LEXPATH, "pos_words.hash")
   DEFAULT_TAGPATH = File.join(DEFAULT_LEXPATH, "pos_tags.hash")
 
@@ -36,7 +37,7 @@ class EngTagger
   # @return [Hash] the probability data
   #
   def self.hmm
-
+    @@hmm
   end
 
   # Return a class variable that holds lexical data.
@@ -44,46 +45,47 @@ class EngTagger
   # @return [Hash] the lexicon
   #
   def self.lexicon
-
+    @@lexicon
   end
 
   # Return a regexp from a string argument that matches an XML-style pos tag
   def self.get_ext(tag = nil)
     return nil unless tag
-
+
+    Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
   end
 
   # Regexps to match XML-style part-of-speech tags
-  NUM = get_ext(
-  GER = get_ext(
-  ADJ = get_ext(
-  NN = get_ext(
-  NNP = get_ext(
-  PREP = get_ext(
-  DET = get_ext(
-  PAREN = get_ext(
-  QUOT = get_ext(
-  SEN = get_ext(
-  WORD = get_ext(
-  VB = get_ext(
-  VBG = get_ext(
-  VBD = get_ext(
-  PART = get_ext(
-  VBP = get_ext(
-  VBZ = get_ext(
-  JJ = get_ext(
-  JJR = get_ext(
-  JJS = get_ext(
-  RB = get_ext(
-  RBR = get_ext(
-  RBS = get_ext(
-  RP = get_ext(
-  WRB = get_ext(
-  WDT = get_ext(
-  WP = get_ext(
-  WPS = get_ext(
-  CC = get_ext(
-  IN = get_ext(
+  NUM = get_ext("cd")
+  GER = get_ext("vbg")
+  ADJ = get_ext("jj[rs]*")
+  NN = get_ext("nn[sp]*")
+  NNP = get_ext("nnp")
+  PREP = get_ext("in")
+  DET = get_ext("det")
+  PAREN = get_ext("[lr]rb")
+  QUOT = get_ext("ppr")
+  SEN = get_ext("pp")
+  WORD = get_ext("\w+")
+  VB = get_ext("vb")
+  VBG = get_ext("vbg")
+  VBD = get_ext("vbd")
+  PART = get_ext("vbn")
+  VBP = get_ext("vbp")
+  VBZ = get_ext("vbz")
+  JJ = get_ext("jj")
+  JJR = get_ext("jjr")
+  JJS = get_ext("jjs")
+  RB = get_ext("rb")
+  RBR = get_ext("rbr")
+  RBS = get_ext("rbs")
+  RP = get_ext("rp")
+  WRB = get_ext("wrb")
+  WDT = get_ext("wdt")
+  WP = get_ext("wp")
+  WPS = get_ext("wps")
+  CC = get_ext("cc")
+  IN = get_ext("in")
 
   # Convert a Treebank-style, abbreviated tag into verbose definitions
   #
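Each constant here is a compiled regexp over the tagger's XML-style output, built by get_ext. As an illustration of the shape these patterns take (using String#[] to show the first match; a sketch against the definitions above):

    NN = EngTagger.get_ext("nn[sp]*")
    "<nn>cat</nn> <vbd>sat</vbd>"[NN] # => "<nn>cat</nn> "
    "<nns>cats</nns>"[NN]             # => "<nns>cats</nns>"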
@@ -92,11 +94,7 @@ class EngTagger
   #
   def self.explain_tag(tag)
     tag = tag.to_s.downcase
-
-      return TAGS[tag]
-    else
-      return tag
-    end
+    TAGS[tag] || tag
   end
 
   # The folloging is to make a hash to convert a pos tag to its definition
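The collapsed body is behavior-preserving: look the tag up in the TAGS hash, fall back to the tag itself. Using the two TAGS entries visible in the next hunk (the downcase/gsub normalization turns "Punctuation, left bracket" into "punctuation_left_bracket"):

    EngTagger.explain_tag("lrb") # => "punctuation_left_bracket"
    EngTagger.explain_tag("xyz") # => "xyz" (unknown tags pass through)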
@@ -147,8 +145,8 @@ class EngTagger
     "LRB", "Punctuation, left bracket",
     "RRB", "Punctuation, right bracket"
   ]
-  tags = tags.collect{|t| t.downcase.gsub(/[
-  tags = tags.collect{|t| t.gsub(
+  tags = tags.collect { |t| t.downcase.gsub(/[.,'\-\s]+/, "_") }
+  tags = tags.collect { |t| t.gsub("&", "and").gsub("/", "or") }
   TAGS = Hash[*tags]
 
   # Hash storing config values:
@@ -190,30 +188,30 @@ class EngTagger
   # Take a hash of parameters that override default values.
   # See above for details.
   def initialize(params = {})
-    @conf =
-    @conf[:unknown_word_tag] =
+    @conf = {}
+    @conf[:unknown_word_tag] = ""
     @conf[:stem] = false
     @conf[:weight_noun_phrases] = false
     @conf[:longest_noun_phrase] = 5
     @conf[:relax] = false
-    @conf[:tag_lex] =
-    @conf[:word_lex] =
-    @conf[:unknown_lex] =
+    @conf[:tag_lex] = "tags.yml"
+    @conf[:word_lex] = "words.yml"
+    @conf[:unknown_lex] = "unknown.yml"
     @conf[:word_path] = DEFAULT_WORDPATH
     @conf[:tag_path] = DEFAULT_TAGPATH
     @conf[:debug] = false
     # assuming that we start analyzing from the beginninga new sentence...
-    @conf[:current_tag] =
+    @conf[:current_tag] = "pp"
     @conf.merge!(params) if params
-
+    if !File.exist?(@conf[:word_path]) || !File.exist?(@conf[:tag_path])
       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
-      @@hmm =
-      @@lexicon =
+      @@hmm = {}
+      @@lexicon = {}
     else
-      lexf = File.open(@conf[:word_path],
+      lexf = File.open(@conf[:word_path], "r")
       @@lexicon = Marshal.load(lexf)
       lexf.close
-      hmmf = File.open(@conf[:tag_path],
+      hmmf = File.open(@conf[:tag_path], "r")
       @@hmm = Marshal.load(hmmf)
       hmmf.close
     end
@@ -235,7 +233,7 @@ class EngTagger
     out = clean_text(text).map do |word|
       cleaned_word = clean_word word
       tag = assign_tag(@conf[:current_tag], cleaned_word)
-      @conf[:current_tag] = tag =
+      @conf[:current_tag] = tag = tag && !tag.empty? ? tag : "nn"
       [word, tag.to_sym]
     end
 
@@ -254,18 +252,18 @@ class EngTagger
   # Examine the string provided and return it fully tagged in XML style
   def add_tags(text, verbose = false)
     return nil unless valid_text(text)
+
     tagged = []
     words = clean_text(text)
-    tags = Array.new
     words.each do |word|
       cleaned_word = clean_word(word)
       tag = assign_tag(@conf[:current_tag], cleaned_word)
-      @conf[:current_tag] = tag =
+      @conf[:current_tag] = tag = tag && tag != "" ? tag : "nn"
       tag = EngTagger.explain_tag(tag) if verbose
-      tagged <<
+      tagged << "<#{tag}>#{word}</#{tag}>"
     end
     reset
-
+    tagged.join(" ")
   end
 
   # Given a text string, return as many nouns and noun phrases as possible.
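For orientation, add_tags is the gem's main public entry point; per the README, usage looks like this (exact tags depend on the bundled lexicon, so the output is indicative only):

    require "engtagger"

    tgr = EngTagger.new
    tgr.add_tags("Alice chased the cat.")
    # => e.g. "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <nn>cat</nn> <pp>.</pp>"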
@@ -277,11 +275,12 @@ class EngTagger
   #
   def get_words(text)
     return false unless valid_text(text)
+
     tagged = add_tags(text)
-    if
-
+    if @conf[:longest_noun_phrase] <= 1
+      get_nouns(tagged)
     else
-
+      get_noun_phrases(tagged)
     end
   end
 
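The branch selects between single nouns and noun phrases based on the :longest_noun_phrase setting established in initialize. A sketch:

    text = "The fat cat sat on the mat"
    EngTagger.new(longest_noun_phrase: 1).get_words(text) # get_nouns path
    EngTagger.new.get_words(text)                         # default of 5: get_noun_phrases path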
@@ -289,29 +288,29 @@ class EngTagger
   # Applies add_tags and reformats to be easier to read.
   def get_readable(text, verbose = false)
     return nil unless valid_text(text)
+
     tagged = add_tags(text, verbose)
-    tagged
-
-      $1 + '/' + $2.upcase
+    tagged.gsub(%r{<\w+>([^<]+|[<\w>]+)</(\w+)>}o) do
+      "#{$1}/#{$2.upcase}"
     end
   end
 
   # Return an array of sentences (without POS tags) from a text.
   def get_sentences(text)
     return nil unless valid_text(text)
+
     tagged = add_tags(text)
-    sentences =
-    tagged.split(
+    sentences = []
+    tagged.split(%r{</pp>}).each do |line|
       sentences << strip_tags(line)
     end
     sentences = sentences.map do |sentence|
-      sentence.gsub(Regexp.new(" ('s?) ")){$1 +
-      sentence.gsub(Regexp.new(" (\W+) ")){$1 +
-      sentence.gsub(Regexp.new(" (`+) ")){
-      sentence.gsub(Regexp.new(" (\W+)$")){$1}
-      sentence.gsub(Regexp.new("^(`+) ")){$1}
+      sentence.gsub(Regexp.new(" ('s?) ")) { $1 + " " }
+      sentence.gsub(Regexp.new(" (\W+) ")) { $1 + " " }
+      sentence.gsub(Regexp.new(" (`+) ")) { " " + $1 }
+      sentence.gsub(Regexp.new(" (\W+)$")) { $1 }
+      sentence.gsub(Regexp.new("^(`+) ")) { $1 }
     end
-    return sentences
   end
 
   # Given a POS-tagged text, this method returns a hash of all proper nouns
@@ -321,30 +320,31 @@ class EngTagger
   # proper nouns. This method does not stem the found words.
   def get_proper_nouns(tagged)
     return nil unless valid_text(tagged)
+
     tags = [NNP]
     nnp = build_matches_hash(build_trimmed(tagged, tags))
     # Now for some fancy resolution stuff...
-    nnp.
+    nnp.each_key do |key|
       words = key.split(/\s/)
       # Let's say this is an organization's name --
       # (and it's got at least three words)
       # is there a corresponding acronym in this hash?
-      if words.length
-
-
-
-
-
-
-
-
-
-
-
-
+      next if words.length <= 2
+
+      # Make a (naive) acronym out of this name
+      acronym = words.map do |word|
+        /\A([a-z])[a-z]*\z/ =~ word
+        $1
+      end.join " "
+      # If that acronym has been seen,
+      # remove it and add the values to
+      # the full name
+      if nnp[acronym]
+        nnp[key] += nnp[acronym]
+        nnp.delete(acronym)
      end
    end
-
+    nnp
  end
 
   # Given a POS-tagged text, this method returns all nouns and their
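The rewritten loop guards with next, builds a naive space-joined acronym from each multi-word name's initials, and folds the acronym's count into the full name. A sketch of the effect on the nnp hash, with hypothetical keys and counts:

    nnp = { "world health organization" => 3, "w h o" => 2 }
    # "world health organization" yields the acronym "w h o"; since that
    # key exists, its count is absorbed and the short key deleted:
    # => { "world health organization" => 5 }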
@@ -355,6 +355,7 @@ class EngTagger
   #
   def get_nouns(tagged)
     return nil unless valid_text(tagged)
+
     tags = [NN]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -368,6 +369,7 @@ class EngTagger
   #
   def get_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VB, VBD, VBG, PART, VBP, VBZ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -379,6 +381,7 @@ class EngTagger
 
   def get_infinitive_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VB]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -389,6 +392,7 @@ class EngTagger
   #
   def get_past_tense_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBD]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -399,6 +403,7 @@ class EngTagger
   #
   def get_gerund_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBG]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -409,6 +414,7 @@ class EngTagger
   #
   def get_passive_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [PART]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -419,6 +425,7 @@ class EngTagger
   #
   def get_base_present_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBP]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -429,6 +436,7 @@ class EngTagger
   #
   def get_present_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBZ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -439,6 +447,7 @@ class EngTagger
   #
   def get_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -449,6 +458,7 @@ class EngTagger
   #
   def get_comparative_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJR]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -459,6 +469,7 @@ class EngTagger
   #
   def get_superlative_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJS]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -469,6 +480,7 @@ class EngTagger
   #
   def get_adverbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [RB, RBR, RBS, RP]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -479,13 +491,14 @@ class EngTagger
   #
   def get_interrogatives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [WRB, WDT, WP, WPS]
     build_matches_hash(build_trimmed(tagged, tags))
   end
 
   # To be consistent with documentation's naming of 'interrogative'
   # parts of speech as 'question'
-
+  alias get_question_parts get_interrogatives
 
   # Returns all types of conjunctions and does not discriminate
   # between the various kinds. E.g. coordinating, subordinating,
@@ -496,6 +509,7 @@ class EngTagger
   #
   def get_conjunctions(tagged)
     return nil unless valid_text(tagged)
+
     tags = [CC, IN]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -508,14 +522,15 @@ class EngTagger
   #
   def get_max_noun_phrases(tagged)
     return nil unless valid_text(tagged)
+
     tags = [@@mnp]
     mn_phrases = build_trimmed(tagged, tags)
     ret = Hash.new(0)
     mn_phrases.each do |p|
-      p = stem(p) unless p =~ /\s/
+      p = stem(p) unless p =~ /\s/ # stem single words
       ret[p] += 1 unless p =~ /\A\s*\z/
     end
-
+    ret
   end
 
   # Similar to get_words, but requires a POS-tagged text as an argument.
@@ -525,9 +540,10 @@ class EngTagger
   #
   def get_noun_phrases(tagged)
     return nil unless valid_text(tagged)
+
     found = Hash.new(0)
     phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
-
+    scanned = tagged.scan(@@mnp)
     # Find MNPs in the text, one sentence at a time
     # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
     mn_phrases = []
@@ -540,14 +556,14 @@ class EngTagger
       # shortening the phrase by removing the word in the first position.
       # Record the phrase and any single nouns that are found
       words = mnp.split
-      words.length.times do
-        found[words.join(
+      words.length.times do
+        found[words.join(" ")] += 1 if words.length > 1
         w = words.shift
         found[w] += 1 if w =~ /#{NN}/
       end
     end
     ret = Hash.new(0)
-    found.
+    found.each_key do |f|
       k = strip_tags(f)
       v = found[f]
       # We weight by the word count to favor long noun phrases
@@ -555,12 +571,13 @@ class EngTagger
       word_count = space_count.length + 1
       # Throttle MNPs if necessary
       next if word_count > @conf[:longest_noun_phrase]
-
+
+      k = stem(k) unless word_count > 1 # stem single words
       multiplier = 1
       multiplier = word_count if @conf[:weight_noun_phrases]
       ret[k] += multiplier * v
     end
-
+    ret
   end
 
   # Reads some included corpus data and saves it in a stored hash on the
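With :weight_noun_phrases enabled, each phrase's count is multiplied by its word count, favoring longer phrases; single words are stemmed before counting. A usage sketch — the counts are illustrative, since actual results depend on the bundled lexicon:

    tgr = EngTagger.new(weight_noun_phrases: true)
    tagged = tgr.add_tags("The fat cat sat on the fat mat")
    tgr.get_noun_phrases(tagged)
    # => e.g. { "fat cat" => 2, "cat" => 1, "fat mat" => 2, "mat" => 1 }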
@@ -571,10 +588,10 @@ class EngTagger
     load_tags(@conf[:tag_lex])
     load_words(@conf[:word_lex])
     load_words(@conf[:unknown_lex])
-    File.open(@conf[:word_path],
+    File.open(@conf[:word_path], "w") do |f|
       Marshal.dump(@@lexicon, f)
     end
-    File.open(@conf[:tag_path],
+    File.open(@conf[:tag_path], "w") do |f|
       Marshal.dump(@@hmm, f)
     end
   end
@@ -596,6 +613,7 @@ class EngTagger
     trimmed.each do |n|
       n = stem(n)
       next unless n.length < 100 # sanity check on word length
+
       ret[n] += 1 unless n =~ /\A\s*\z/
     end
     ret
@@ -603,25 +621,24 @@ class EngTagger
 
   # Downcase the first letter of word
   def lcfirst(word)
-    word.split(//)[0].downcase + word.split(//)[1
+    word.split(//)[0].downcase + word.split(//)[1..].join
   end
 
   # Upcase the first letter of word
   def ucfirst(word)
-    word.split(//)[0].upcase + word.split(//)[1
+    word.split(//)[0].upcase + word.split(//)[1..].join
   end
 
   # Return the word stem as given by Stemmable module. This can be
   # turned off with the class parameter @conf[:stem] => false.
   def stem(word)
-
-    return word.stem
+    @conf[:stem] ? word.stem : word
   end
 
   # This method will reset the preceeding tag to a sentence ender (PP).
   # This prepares the first word of a new sentence to be tagged correctly.
   def reset
-    @conf[:current_tag] =
+    @conf[:current_tag] = "pp"
   end
 
   # Check whether the text is a valid string
@@ -629,41 +646,38 @@ class EngTagger
     if !text
       # there's nothing to parse
       "method call on uninitialized variable" if @conf[:debug]
-
+      false
     elsif /\A\s*\z/ =~ text
       # text is an empty string, nothing to parse
-
+      false
     else
       # $text is valid
-
+      true
     end
   end
 
   # Return a text string with the part-of-speech tags removed
   def strip_tags(tagged, downcase = false)
     return nil unless valid_text(tagged)
+
     text = tagged.gsub(/<[^>]+>/m, "")
     text = text.gsub(/\s+/m, " ")
     text = text.gsub(/\A\s*/, "")
     text = text.gsub(/\s*\z/, "")
-
-      return text.downcase
-    else
-      return text
-    end
+    downcase ? text.downcase : text
   end
 
   # Strip the provided text and separate off any punctuation in preparation for tagging
   def clean_text(text)
     return false unless valid_text(text)
-
+
+    cleaned_text = text.encode("utf-8")
     tokenized = []
     # Tokenize the text (splitting on punctuation as you go)
     cleaned_text.split(/\s+/).each do |line|
       tokenized += split_punct(line)
     end
-
-    return words
+    split_sentences(tokenized)
   end
 
   # This handles all of the trailing periods, keeping those that
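strip_tags and clean_text now return their results directly instead of through explicit return statements. Their observable behavior, sketched against the signatures above:

    tgr = EngTagger.new
    tgr.strip_tags("<nn>Cat</nn> <vbd>sat</vbd>")       # => "Cat sat"
    tgr.strip_tags("<nn>Cat</nn> <vbd>sat</vbd>", true) # => "cat sat" (downcased)
    tgr.clean_text("Hello, world...")                   # => ["Hello", ",", "world", "..."]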
@@ -672,27 +686,26 @@ class EngTagger
   # about the use of capitalization in the incoming text
   def split_sentences(array)
     tokenized = array
-    people = %w
-      supt det mssrs rev
-    army = %w
-    inst = %w
-    place = %w
-      hwy hway la pde pd plz pl rd st tce
-    comp = %w
-    state = %w
+    people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+                supt det mssrs rev]
+    army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
+    inst = %w[dept univ assn bros ph.d]
+    place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+               hwy hway la pde pd plz pl rd st tce]
+    comp = %w[mfg inc ltd co corp]
+    state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
                ind ia kans kan ken ky la me md is mass mich minn miss mo mont
                neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
-               va wash wis wisc wy wyo usafa alta man ont que sask yuk
-    month = %w
-    misc = %w
-    abbr =
+               va wash wis wisc wy wyo usafa alta man ont que sask yuk]
+    month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
+    misc = %w[vs etc no esp]
+    abbr = {}
     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
       abbr[i] = true
     end
-    words =
-    tokenized.each_with_index do |
-      if tokenized[i + 1]
-        tokenized[i] =~ /\A(.+)\.\z/
+    words = []
+    tokenized.each_with_index do |_, i|
+      if tokenized[i + 1] && tokenized[i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
         w = $1
         # Don't separate the period off words that
         # meet any of the following conditions:
@@ -700,21 +713,20 @@ class EngTagger
         # 1. It is defined in one of the lists above
         # 2. It is only one letter long: Alfred E. Sloan
         # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
-        unless abbr[w.downcase]
-          [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
+        unless abbr[w.downcase] || [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
           words << w
-          words <<
+          words << "."
           next
         end
       end
       words << tokenized[i]
     end
     # If the final word ends in a period..
-    if words[-1]
+    if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
       words[-1] = $1
-      words.push
+      words.push "."
     end
-
+    words
   end
 
   # Separate punctuation from words, where appropriate. This leaves trailing
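The net effect of the abbreviation lists: a trailing period is split off as its own token (ending the sentence) unless the word is a known abbreviation, a single letter, or a letter-dot sequence. This is observable through the public get_sentences; output shape is approximate:

    tgr = EngTagger.new
    tgr.get_sentences("Micro Inc. was founded in 1999. Dr. Lee left.")
    # => e.g. ["Micro Inc. was founded in 1999.", "Dr. Lee left."]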
@@ -722,39 +734,40 @@ class EngTagger
   def split_punct(text)
     # If there's no punctuation, return immediately
     return [text] if /\A\w+\z/ =~ text
+
     # Sanity checks
     text = text.gsub(/\W{10,}/o, " ")
 
     # Put quotes into a standard format
     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
     text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
-    text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+    text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? $1 + " ` " : " ` " } # Convert left quotes to `
     text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
-    text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
+    text = text.gsub(/(\w)'(?!')(?=\W|$)/o) { $1 + " ' " } # Separate right single quotes
 
     # Handle all other punctuation
     text = text.gsub(/--+/o, " - ") # Convert and separate dashes
     text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
     text = text.gsub(/:/o, " : ") # Shift semicolons off
-    text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
-    text = text.gsub(/([
-    text = text.gsub(/([
+    text = text.gsub(/(\.\.\.+)/o) { " " + $1 + " " } # Shift ellipses off
+    text = text.gsub(/([(\[{}\])])/o) { " " + $1 + " " } # Shift off brackets
+    text = text.gsub(/([!?#$%;~|])/o) { " " + $1 + " " } # Shift off other ``standard'' punctuation
 
     # English-specific contractions
-    text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2}
-    text = text.gsub(/n't\b/o, " n't")
-    text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1}
-
-    return result
+    text = text.gsub(/([A-Za-z])'([dms])\b/o) { $1 + " '" + $2 } # Separate off 'd 'm 's
+    text = text.gsub(/n't\b/o, " n't") # Separate off n't
+    text = text.gsub(/'(ve|ll|re)\b/o) { " '" + $1 } # Separate off 've, 'll, 're
+    text.split(" ")
   end
 
   # Given a preceding tag, assign a tag word. Called by the add_tags method.
   # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
   def assign_tag(prev_tag, word)
-
+    case word
+    when "-unknown-"
       # classify unknown words accordingly
       return @conf[:unknown_word_tag]
-
+    when "-sym-"
       # If this is a symbol, tag it as a symbol
       return "sym"
     end
@@ -765,13 +778,13 @@ class EngTagger
     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
     # which is used in most POS taggers
     best_tag = ""
-    t[prev_tag].
+    t[prev_tag].each_key do |tag|
       # With @config[:relax] set, this method
       # will also include any `open classes' of POS tags
       pw = 0
       if w[tag]
         pw = w[tag]
-      elsif @conf[:relax]
+      elsif @conf[:relax] && tag =~ /\A(?:jj|nn|rb|vb)/
         pw = 0
       else
         next
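Each candidate tag is scored as the transition probability from the previous tag times the word's emission weight, and the argmax wins — a one-step, greedy variant of Viterbi. A sketch of the scoring rule (not the gem's literal code, which also covers unseen words and the :relax fallback shown above):

    # best_tag = argmax over tag of  t[prev_tag][tag] * pw(word, tag)
    best_tag, _best = t[prev_tag].keys
                                 .map { |tag| [tag, t[prev_tag][tag].to_f * (w[tag] || 0).to_f] }
                                 .max_by { |_, score| score }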
@@ -786,7 +799,7 @@ class EngTagger
         best_tag = tag
       end
     end
-
+    best_tag
   end
 
   # This method determines whether a word should be considered in its
@@ -796,13 +809,13 @@ class EngTagger
     lcf = lcfirst(word)
     # seen this word as it appears (lower or upper case)
     if @@lexicon[word]
-
+      word
     elsif @@lexicon[lcf]
       # seen this word only as lower case
-
+      lcf
     else
       # never seen this word. guess.
-
+      classify_unknown_word(word)
     end
   end
 
@@ -810,52 +823,52 @@ class EngTagger
   # classes of words handled by a simple unknown word classification
   # metric. Called by the clean_word method.
   def classify_unknown_word(word)
-
-
-
-
-
-
-
-
-
-
-
-
-
+    case word
+    when /[({\[]/ # Left brackets
+      "*LRB*"
+    when /[)}\]]/ # Right brackets
+      "*RRB*"
+    when /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ # Floating point number
+      "*NUM*"
+    when %r{\A\d+[\d/:-]+\d\z} # Other number constructs
+      "*NUM*"
+    when /\A-?\d+\w+\z/o # Ordinal number
+      "*ORD*"
+    when /\A[A-Z][A-Z.-]*\z/o # Abbreviation (all caps)
+      "-abr-"
+    when /\w-\w/o # Hyphenated word
       /-([^-]+)\z/ =~ word
       h_suffix = $1
-      if h_suffix
+      if h_suffix && (@@lexicon[h_suffix] && @@lexicon[h_suffix]["jj"])
         # last part of this is defined as an adjective
-
+        "-hyp-adj-"
       else
         # last part of this is not defined as an adjective
-
+        "-hyp-"
       end
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    when /\A\W+\z/o
+      "-sym-" # Symbol
+    when ucfirst(word)
+      "-cap-" # Capitalized word
+    when /ing\z/o
+      "-ing-" # Ends in "ing"
+    when /s\z/o
+      "-s-" # Ends in "s"
+    when /tion\z/o
+      "-tion-" # Ends in "tion"
+    when /ly\z/o
+      "-ly-" # Ends in "ly"
+    when /ed\z/o
+      "-ed-" # Ends in "ed"
     else
-
+      "-unknown-" # Completely unknown
     end
-    return classified
   end
 
   # This returns a compiled regexp for extracting maximal noun phrases
   # from a POS-tagged text.
   def get_max_noun_regex
-
+    /
     # optional number, gerund - adjective -participle
     (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
     # Followed by one or more nouns
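The case chain tries the most specific classes first, so the returned marker reflects the first matching pattern. Examples of the mapping, per the branches above (the hyphen branch additionally consults the lexicon):

    classify_unknown_word("12.75")    # => "*NUM*"
    classify_unknown_word("USA")      # => "-abr-"
    classify_unknown_word("running")  # => "-ing-"
    classify_unknown_word("%")        # => "-sym-"
    classify_unknown_word("qwzx")     # => "-unknown-"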
@@ -868,8 +881,7 @@ class EngTagger
     # one or more nouns
     (?:#{NN})+
     )*
-    /xo
-    return regex
+    /xo
   end
 
   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
@@ -878,12 +890,13 @@ class EngTagger
   # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
   def load_tags(lexicon, lexpath = DEFAULT_LEXPATH)
     path = File.join(lexpath, lexicon)
-    fh = File.open(path,
-    while line = fh.gets
+    fh = File.open(path, "r")
+    while (line = fh.gets)
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-      next unless $1
-
-
+      next unless $1 && $2
+
+      key = $1
+      data = $2
       items = data.split(/,\s+/)
       pairs = {}
       items.each do |i|
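The parsing regexp pulls the key and the one-line map body apart before splitting on commas. Given the example line from the comment above:

    line = '"det": { jj: 0.2, nn: 0.5, vb: 0.0002 }'
    /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
    key  = $1           # => "det"
    data = $2           # => "jj: 0.2, nn: 0.5, vb: 0.0002"
    data.split(/,\s+/)  # => ["jj: 0.2", "nn: 0.5", "vb: 0.0002"]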
@@ -901,12 +914,13 @@ class EngTagger
   # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
   def load_words(lexicon, lexpath = DEFAULT_LEXPATH)
     path = File.join(lexpath, lexicon)
-    fh = File.open(path,
-    while line = fh.gets
+    fh = File.open(path, "r")
+    while (line = fh.gets)
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-      next unless $1
-
-
+      next unless $1 && $2
+
+      key = $1
+      data = $2
       items = data.split(/,\s+/)
       pairs = {}
       items.each do |i|
@@ -918,7 +932,7 @@ class EngTagger
     fh.close
   end
 
-  #memoize the stem and assign_tag methods
+  # memoize the stem and assign_tag methods
   memoize("stem")
   memoize("assign_tag")
 end