engtagger 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +75 -0
- data/.solargraph.yml +22 -0
- data/Gemfile +6 -2
- data/README.md +33 -31
- data/Rakefile +9 -1
- data/engtagger.gemspec +13 -10
- data/lib/engtagger/porter.rb +38 -60
- data/lib/engtagger/version.rb +3 -1
- data/lib/engtagger.rb +220 -207
- metadata +9 -8
- data/test/test_engtagger.rb +0 -246
data/lib/engtagger.rb
CHANGED
@@ -1,21 +1,21 @@
 #!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
 
-…
-…
-require '…'
-require '…'
+# frozen_string_literal: true
+
+require "rubygems"
+require "lru_redux"
+require_relative "engtagger/porter"
 
 module BoundedSpaceMemoizable
-  def memoize(method, max_cache_size=100000)
+  def memoize(method, max_cache_size = 100_000)
     # alias_method is faster than define_method + old.bind(self).call
     alias_method "__memoized__#{method}", method
-    module_eval <<-…
+    module_eval <<-MODEV
       def #{method}(*a)
        @__memoized_#{method}_cache ||= LruRedux::Cache.new(#{max_cache_size})
        @__memoized_#{method}_cache[a] ||= __memoized__#{method}(*a)
      end
-    …
+    MODEV
   end
 end
 
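For context, a minimal sketch (not from the gem) of what the memoize hook above does once a class extends the module; the Example class and cache size are hypothetical, while LruRedux::Cache comes from the lru_redux dependency:

    require "lru_redux"

    class Example
      extend BoundedSpaceMemoizable

      def slow_square(n)
        sleep 0.1 # stand-in for expensive work
        n * n
      end
      memoize("slow_square", 2) # keep at most two cached argument lists

      # the generated wrapper stores results in an LruRedux::Cache,
      # so repeat calls with the same arguments skip the computation
    end

    e = Example.new
    e.slow_square(3) # => 9, computed
    e.slow_square(3) # => 9, served from the cache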
@@ -24,7 +24,7 @@ class EngTagger
   extend BoundedSpaceMemoizable
 
   # File paths
-  DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), 'engtagger')
+  DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), "engtagger")
   DEFAULT_WORDPATH = File.join(DEFAULT_LEXPATH, "pos_words.hash")
   DEFAULT_TAGPATH = File.join(DEFAULT_LEXPATH, "pos_tags.hash")
 
@@ -37,7 +37,7 @@ class EngTagger
   # @return [Hash] the probability data
   #
   def self.hmm
-    return @@hmm
+    @@hmm
   end
 
   # Return a class variable that holds lexical data.
@@ -45,46 +45,47 @@ class EngTagger
   # @return [Hash] the lexicon
   #
   def self.lexicon
-    return @@lexicon
+    @@lexicon
   end
 
   # Return a regexp from a string argument that matches an XML-style pos tag
   def self.get_ext(tag = nil)
     return nil unless tag
-    return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
+
+    Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
   end
 
   # Regexps to match XML-style part-of-speech tags
-  NUM = get_ext('cd')
-  GER = get_ext('vbg')
-  ADJ = get_ext('jj[rs]*')
-  NN = get_ext('nn[sp]*')
-  NNP = get_ext('nnp')
-  PREP = get_ext('in')
-  DET = get_ext('det')
-  PAREN = get_ext('[lr]rb')
-  QUOT = get_ext('ppr')
-  SEN = get_ext('pp')
-  WORD = get_ext('\w+')
-  VB = get_ext('vb')
-  VBG = get_ext('vbg')
-  VBD = get_ext('vbd')
-  PART = get_ext('vbn')
-  VBP = get_ext('vbp')
-  VBZ = get_ext('vbz')
-  JJ = get_ext('jj')
-  JJR = get_ext('jjr')
-  JJS = get_ext('jjs')
-  RB = get_ext('rb')
-  RBR = get_ext('rbr')
-  RBS = get_ext('rbs')
-  RP = get_ext('rp')
-  WRB = get_ext('wrb')
-  WDT = get_ext('wdt')
-  WP = get_ext('wp')
-  WPS = get_ext('wps')
-  CC = get_ext('cc')
-  IN = get_ext('in')
+  NUM = get_ext("cd")
+  GER = get_ext("vbg")
+  ADJ = get_ext("jj[rs]*")
+  NN = get_ext("nn[sp]*")
+  NNP = get_ext("nnp")
+  PREP = get_ext("in")
+  DET = get_ext("det")
+  PAREN = get_ext("[lr]rb")
+  QUOT = get_ext("ppr")
+  SEN = get_ext("pp")
+  WORD = get_ext("\w+")
+  VB = get_ext("vb")
+  VBG = get_ext("vbg")
+  VBD = get_ext("vbd")
+  PART = get_ext("vbn")
+  VBP = get_ext("vbp")
+  VBZ = get_ext("vbz")
+  JJ = get_ext("jj")
+  JJR = get_ext("jjr")
+  JJS = get_ext("jjs")
+  RB = get_ext("rb")
+  RBR = get_ext("rbr")
+  RBS = get_ext("rbs")
+  RP = get_ext("rp")
+  WRB = get_ext("wrb")
+  WDT = get_ext("wdt")
+  WP = get_ext("wp")
+  WPS = get_ext("wps")
+  CC = get_ext("cc")
+  IN = get_ext("in")
 
   # Convert a Treebank-style, abbreviated tag into verbose definitions
   #
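As a rough sketch of what these constants match (note that \s inside the double-quoted string above is Ruby's string escape for a literal space, not the regexp character class):

    EngTagger.get_ext("jj")               # => /<jj>[^<]+<\/jj> */ (roughly)
    EngTagger::NN =~ "<nns>dogs</nns> "   # => 0; nn[sp]* covers nn, nns, nnp, nnps
    EngTagger::VBD =~ "<vbd>ran</vbd>"    # => 0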
@@ -93,11 +94,7 @@ class EngTagger
   #
   def self.explain_tag(tag)
     tag = tag.to_s.downcase
-    if TAGS[tag]
-      return TAGS[tag]
-    else
-      return tag
-    end
+    TAGS[tag] || tag
   end
 
   # The folloging is to make a hash to convert a pos tag to its definition
@@ -148,8 +145,8 @@ class EngTagger
     "LRB", "Punctuation, left bracket",
     "RRB", "Punctuation, right bracket"
   ]
-  tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, "_")}
-  tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
+  tags = tags.collect { |t| t.downcase.gsub(/[.,'\-\s]+/, "_") }
+  tags = tags.collect { |t| t.gsub("&", "and").gsub("/", "or") }
   TAGS = Hash[*tags]
 
   # Hash storing config values:
@@ -191,30 +188,30 @@ class EngTagger
   # Take a hash of parameters that override default values.
   # See above for details.
   def initialize(params = {})
-    @conf = Hash.new
-    @conf[:unknown_word_tag] = ''
+    @conf = {}
+    @conf[:unknown_word_tag] = ""
     @conf[:stem] = false
     @conf[:weight_noun_phrases] = false
     @conf[:longest_noun_phrase] = 5
     @conf[:relax] = false
-    @conf[:tag_lex] = 'tags.yml'
-    @conf[:word_lex] = 'words.yml'
-    @conf[:unknown_lex] = 'unknown.yml'
+    @conf[:tag_lex] = "tags.yml"
+    @conf[:word_lex] = "words.yml"
+    @conf[:unknown_lex] = "unknown.yml"
     @conf[:word_path] = DEFAULT_WORDPATH
     @conf[:tag_path] = DEFAULT_TAGPATH
     @conf[:debug] = false
     # assuming that we start analyzing from the beginninga new sentence...
-    @conf[:current_tag] = 'pp'
+    @conf[:current_tag] = "pp"
     @conf.merge!(params) if params
-    unless File.exist?(@conf[:word_path]) and File.exist?(@conf[:tag_path])
+
+    if !File.exist?(@conf[:word_path]) || !File.exist?(@conf[:tag_path])
       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
-      @@hmm = Hash.new
-      @@lexicon = Hash.new
+      @@hmm = {}
+      @@lexicon = {}
     else
-      lexf = File.open(@conf[:word_path], 'r')
+      lexf = File.open(@conf[:word_path], "r")
       @@lexicon = Marshal.load(lexf)
       lexf.close
-      hmmf = File.open(@conf[:tag_path], 'r')
+      hmmf = File.open(@conf[:tag_path], "r")
       @@hmm = Marshal.load(hmmf)
       hmmf.close
     end
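Illustrative construction call (these are the real config keys set above; the values are arbitrary):

    tgr = EngTagger.new(longest_noun_phrase: 3, stem: true)
    # defaults are assigned first, then @conf.merge!(params) applies the overrides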
@@ -236,7 +233,7 @@ class EngTagger
     out = clean_text(text).map do |word|
       cleaned_word = clean_word word
       tag = assign_tag(@conf[:current_tag], cleaned_word)
-      @conf[:current_tag] = tag = …
+      @conf[:current_tag] = tag = tag && !tag.empty? ? tag : "nn"
       [word, tag.to_sym]
     end
 
@@ -255,18 +252,18 @@ class EngTagger
   # Examine the string provided and return it fully tagged in XML style
   def add_tags(text, verbose = false)
     return nil unless valid_text(text)
+
     tagged = []
     words = clean_text(text)
-    tags = Array.new
     words.each do |word|
       cleaned_word = clean_word(word)
       tag = assign_tag(@conf[:current_tag], cleaned_word)
-      @conf[:current_tag] = tag = …
+      @conf[:current_tag] = tag = tag && tag != "" ? tag : "nn"
       tag = EngTagger.explain_tag(tag) if verbose
-      tagged << …
+      tagged << "<#{tag}>#{word}</#{tag}>"
     end
     reset
-    return tagged.join(' ')
+    tagged.join(" ")
   end
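A sketch of the tagging entry point; the exact tags depend on the shipped lexicon:

    tgr = EngTagger.new
    tgr.add_tags("The dog runs")
    # => e.g. "<det>The</det> <nn>dog</nn> <vbz>runs</vbz>"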
@@ -278,11 +275,12 @@ class EngTagger
   #
   def get_words(text)
     return false unless valid_text(text)
+
     tagged = add_tags(text)
-    if …
-      return get_nouns(tagged)
+    if @conf[:longest_noun_phrase] <= 1
+      get_nouns(tagged)
     else
-      return get_noun_phrases(tagged)
+      get_noun_phrases(tagged)
     end
   end
 
@@ -290,29 +288,29 @@ class EngTagger
   # Applies add_tags and reformats to be easier to read.
   def get_readable(text, verbose = false)
     return nil unless valid_text(text)
+
     tagged = add_tags(text, verbose)
-    tagged …
-    …
-      $1 + '/' + $2.upcase
+    tagged.gsub(%r{<\w+>([^<]+|[<\w>]+)</(\w+)>}o) do
+      "#{$1}/#{$2.upcase}"
     end
   end
 
   # Return an array of sentences (without POS tags) from a text.
   def get_sentences(text)
     return nil unless valid_text(text)
+
     tagged = add_tags(text)
-    sentences = Array.new
-    tagged.split(/<\/pp>/).each do |line|
+    sentences = []
+    tagged.split(%r{</pp>}).each do |line|
       sentences << strip_tags(line)
     end
     sentences = sentences.map do |sentence|
-      sentence.gsub(Regexp.new(" ('s?) ")){$1 + " "}
-      sentence.gsub(Regexp.new(" (\W+) ")){$1 + " "}
-      sentence.gsub(Regexp.new(" (`+) ")){" " + $1}
-      sentence.gsub(Regexp.new(" (\W+)$")){$1}
-      sentence.gsub(Regexp.new("^(`+) ")){$1}
+      sentence.gsub(Regexp.new(" ('s?) ")) { $1 + " " }
+      sentence.gsub(Regexp.new(" (\W+) ")) { $1 + " " }
+      sentence.gsub(Regexp.new(" (`+) ")) { " " + $1 }
+      sentence.gsub(Regexp.new(" (\W+)$")) { $1 }
+      sentence.gsub(Regexp.new("^(`+) ")) { $1 }
     end
-    return sentences
   end
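Both reformatting helpers, sketched with README-style output (tags vary with the lexicon):

    tgr = EngTagger.new
    tgr.get_readable("The dog runs")
    # => e.g. "The/DET dog/NN runs/VBZ"
    tgr.get_sentences("Hello there. How are you?")
    # => an array with one plain-text string per sentence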
@@ -322,30 +320,31 @@ class EngTagger
   # proper nouns. This method does not stem the found words.
   def get_proper_nouns(tagged)
     return nil unless valid_text(tagged)
+
     tags = [NNP]
     nnp = build_matches_hash(build_trimmed(tagged, tags))
     # Now for some fancy resolution stuff...
-    nnp.keys.each do |key|
+    nnp.each_key do |key|
       words = key.split(/\s/)
       # Let's say this is an organization's name --
       # (and it's got at least three words)
       # is there a corresponding acronym in this hash?
-      if words.length > 2
-        # Make a (naive) acronym out of this name
-        acronym = words.map do |word|
-          /\A([a-z])[a-z]*\z/ =~ word
-          $1
-        end.join " "
-        # If that acronym has been seen,
-        # remove it and add the values to
-        # the full name
-        if nnp[acronym]
-          nnp[key] += nnp[acronym]
-          nnp.delete(acronym)
-        end
+      next if words.length <= 2
+
+      # Make a (naive) acronym out of this name
+      acronym = words.map do |word|
+        /\A([a-z])[a-z]*\z/ =~ word
+        $1
+      end.join " "
+      # If that acronym has been seen,
+      # remove it and add the values to
+      # the full name
+      if nnp[acronym]
+        nnp[key] += nnp[acronym]
+        nnp.delete(acronym)
       end
     end
-    return nnp
+    nnp
   end
 
   # Given a POS-tagged text, this method returns all nouns and their
@@ -356,6 +355,7 @@ class EngTagger
   #
   def get_nouns(tagged)
     return nil unless valid_text(tagged)
+
     tags = [NN]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -369,6 +369,7 @@ class EngTagger
   #
   def get_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VB, VBD, VBG, PART, VBP, VBZ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -380,6 +381,7 @@ class EngTagger
 
   def get_infinitive_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VB]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -390,6 +392,7 @@ class EngTagger
   #
   def get_past_tense_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBD]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -400,6 +403,7 @@ class EngTagger
   #
   def get_gerund_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBG]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -410,6 +414,7 @@ class EngTagger
   #
   def get_passive_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [PART]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -420,6 +425,7 @@ class EngTagger
   #
   def get_base_present_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBP]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -430,6 +436,7 @@ class EngTagger
   #
   def get_present_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBZ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -440,6 +447,7 @@ class EngTagger
   #
   def get_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -450,6 +458,7 @@ class EngTagger
   #
   def get_comparative_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJR]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -460,6 +469,7 @@ class EngTagger
   #
   def get_superlative_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJS]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -470,6 +480,7 @@ class EngTagger
   #
   def get_adverbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [RB, RBR, RBS, RP]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -480,13 +491,14 @@ class EngTagger
   #
   def get_interrogatives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [WRB, WDT, WP, WPS]
     build_matches_hash(build_trimmed(tagged, tags))
   end
 
   # To be consistent with documentation's naming of 'interrogative'
   # parts of speech as 'question'
-  alias_method :get_question_parts, :get_interrogatives
+  alias get_question_parts get_interrogatives
 
   # Returns all types of conjunctions and does not discriminate
   # between the various kinds. E.g. coordinating, subordinating,
@@ -497,6 +509,7 @@ class EngTagger
   #
   def get_conjunctions(tagged)
     return nil unless valid_text(tagged)
+
     tags = [CC, IN]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -509,14 +522,15 @@ class EngTagger
   #
   def get_max_noun_phrases(tagged)
     return nil unless valid_text(tagged)
+
     tags = [@@mnp]
     mn_phrases = build_trimmed(tagged, tags)
     ret = Hash.new(0)
     mn_phrases.each do |p|
-      p = stem(p) unless p =~ /\s/
+      p = stem(p) unless p =~ /\s/ # stem single words
       ret[p] += 1 unless p =~ /\A\s*\z/
     end
-    return ret
+    ret
   end
 
   # Similar to get_words, but requires a POS-tagged text as an argument.
@@ -526,9 +540,10 @@ class EngTagger
   #
   def get_noun_phrases(tagged)
     return nil unless valid_text(tagged)
+
     found = Hash.new(0)
     phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
-    …
+    scanned = tagged.scan(@@mnp)
     # Find MNPs in the text, one sentence at a time
     # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
     mn_phrases = []
@@ -541,14 +556,14 @@ class EngTagger
       # shortening the phrase by removing the word in the first position.
       # Record the phrase and any single nouns that are found
       words = mnp.split
-      words.length.times do |i|
-        found[words.join(' ')] += 1 if words.length > 1
+      words.length.times do
+        found[words.join(" ")] += 1 if words.length > 1
         w = words.shift
         found[w] += 1 if w =~ /#{NN}/
       end
     end
     ret = Hash.new(0)
-    found.keys.each do |f|
+    found.each_key do |f|
       k = strip_tags(f)
       v = found[f]
@@ -556,12 +571,13 @@ class EngTagger
       word_count = space_count.length + 1
       # Throttle MNPs if necessary
       next if word_count > @conf[:longest_noun_phrase]
-      k = stem(k) unless word_count > 1
+
+      k = stem(k) unless word_count > 1 # stem single words
       multiplier = 1
       multiplier = word_count if @conf[:weight_noun_phrases]
       ret[k] += multiplier * v
     end
-    return ret
+    ret
   end
 
   # Reads some included corpus data and saves it in a stored hash on the
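A worked example of the weighting step above, with hypothetical counts:

    # Suppose found holds a two-word phrase seen twice: v = 2, word_count = 2.
    # With @conf[:weight_noun_phrases] set, multiplier = word_count, so
    # ret["fat cat"] += 2 * 2   # => 4, instead of 2 in the unweighted case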
@@ -572,10 +588,10 @@ class EngTagger
     load_tags(@conf[:tag_lex])
     load_words(@conf[:word_lex])
     load_words(@conf[:unknown_lex])
-    File.open(@conf[:word_path], 'w') do |f|
+    File.open(@conf[:word_path], "w") do |f|
       Marshal.dump(@@lexicon, f)
     end
-    File.open(@conf[:tag_path], 'w') do |f|
+    File.open(@conf[:tag_path], "w") do |f|
       Marshal.dump(@@hmm, f)
     end
   end
@@ -597,6 +613,7 @@ class EngTagger
     trimmed.each do |n|
       n = stem(n)
       next unless n.length < 100 # sanity check on word length
+
       ret[n] += 1 unless n =~ /\A\s*\z/
     end
     ret
@@ -604,25 +621,24 @@ class EngTagger
 
   # Downcase the first letter of word
   def lcfirst(word)
-    word.split(//)[0].downcase + word.split(//)[1..-1].join
+    word.split(//)[0].downcase + word.split(//)[1..].join
   end
 
   # Upcase the first letter of word
   def ucfirst(word)
-    word.split(//)[0].upcase + word.split(//)[1..-1].join
+    word.split(//)[0].upcase + word.split(//)[1..].join
   end
 
   # Return the word stem as given by Stemmable module. This can be
   # turned off with the class parameter @conf[:stem] => false.
   def stem(word)
-    return word unless @conf[:stem]
-    return word.stem
+    @conf[:stem] ? word.stem : word
   end
 
   # This method will reset the preceeding tag to a sentence ender (PP).
   # This prepares the first word of a new sentence to be tagged correctly.
   def reset
-    @conf[:current_tag] = 'pp'
+    @conf[:current_tag] = "pp"
   end
 
   # Check whether the text is a valid string
@@ -630,41 +646,38 @@ class EngTagger
     if !text
       # there's nothing to parse
       "method call on uninitialized variable" if @conf[:debug]
-      return false
+      false
     elsif /\A\s*\z/ =~ text
       # text is an empty string, nothing to parse
-      return false
+      false
     else
       # $text is valid
-      return true
+      true
     end
   end
 
   # Return a text string with the part-of-speech tags removed
   def strip_tags(tagged, downcase = false)
     return nil unless valid_text(tagged)
+
     text = tagged.gsub(/<[^>]+>/m, "")
     text = text.gsub(/\s+/m, " ")
     text = text.gsub(/\A\s*/, "")
     text = text.gsub(/\s*\z/, "")
-    if downcase
-      return text.downcase
-    else
-      return text
-    end
+    downcase ? text.downcase : text
   end
 
   # Strip the provided text and separate off any punctuation in preparation for tagging
   def clean_text(text)
     return false unless valid_text(text)
-    cleaned_text = …
+
+    cleaned_text = text.encode("utf-8")
     tokenized = []
     # Tokenize the text (splitting on punctuation as you go)
     cleaned_text.split(/\s+/).each do |line|
       tokenized += split_punct(line)
     end
-    words = split_sentences(tokenized)
-    return words
+    split_sentences(tokenized)
   end
 
   # This handles all of the trailing periods, keeping those that
@@ -673,27 +686,26 @@ class EngTagger
   # about the use of capitalization in the incoming text
   def split_sentences(array)
     tokenized = array
-    people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
-                supt det mssrs rev)
-    army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
-    inst = %w(dept univ assn bros ph.d)
-    place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
-               hwy hway la pde pd plz pl rd st tce)
-    comp = %w(mfg inc ltd co corp)
-    state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+    people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+                supt det mssrs rev]
+    army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
+    inst = %w[dept univ assn bros ph.d]
+    place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+               hwy hway la pde pd plz pl rd st tce]
+    comp = %w[mfg inc ltd co corp]
+    state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
               ind ia kans kan ken ky la me md is mass mich minn miss mo mont
               neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
-              va wash wis wisc wy wyo usafa alta man ont que sask yuk)
-    month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
-    misc = %w(vs etc no esp)
-    abbr = Hash.new
+              va wash wis wisc wy wyo usafa alta man ont que sask yuk]
+    month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
+    misc = %w[vs etc no esp]
+    abbr = {}
     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
       abbr[i] = true
    end
-    words = Array.new
-    tokenized.each_with_index do |t, i|
-      if tokenized[i + 1] …
-        … tokenized[i] =~ /\A(.+)\.\z/
+    words = []
+    tokenized.each_with_index do |_, i|
+      if tokenized[i + 1] && tokenized[i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
        w = $1
       # Don't separate the period off words that
       # meet any of the following conditions:
@@ -701,21 +713,20 @@ class EngTagger
         # 1. It is defined in one of the lists above
         # 2. It is only one letter long: Alfred E. Sloan
         # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
-        unless abbr[w.downcase] ||
-            [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
+        unless abbr[w.downcase] || [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
           words << w
-          words << '.'
+          words << "."
           next
         end
       end
       words << tokenized[i]
     end
     # If the final word ends in a period..
-    if words[-1] =~ /\A(.*\w)\.\z/
+    if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
       words[-1] = $1
-      words.push '.'
+      words.push "."
     end
-    return words
+    words
   end
 
   # Separate punctuation from words, where appropriate. This leaves trailing
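A sketch of the period handling, assuming the helper is called with the token array produced by split_punct:

    split_sentences(["Dr.", "Smith", "arrived."])
    # => ["Dr.", "Smith", "arrived", "."]
    # "Dr." keeps its period because "dr" is in the abbreviation table;
    # the sentence-final period is split off by the words[-1] check above.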
@@ -723,39 +734,40 @@ class EngTagger
   def split_punct(text)
     # If there's no punctuation, return immediately
     return [text] if /\A\w+\z/ =~ text
+
     # Sanity checks
     text = text.gsub(/\W{10,}/o, " ")
 
     # Put quotes into a standard format
     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
     text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
-    text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+    text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? $1 + " ` " : " ` " } # Convert left quotes to `
     text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
-    text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
+    text = text.gsub(/(\w)'(?!')(?=\W|$)/o) { $1 + " ' " } # Separate right single quotes
 
     # Handle all other punctuation
     text = text.gsub(/--+/o, " - ") # Convert and separate dashes
     text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
     text = text.gsub(/:/o, " : ") # Shift semicolons off
-    text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
-    text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
-    text = text.gsub(/([\!\?#\$%;\~\|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
+    text = text.gsub(/(\.\.\.+)/o) { " " + $1 + " " } # Shift ellipses off
+    text = text.gsub(/([(\[{}\])])/o) { " " + $1 + " " } # Shift off brackets
+    text = text.gsub(/([!?#$%;~|])/o) { " " + $1 + " " } # Shift off other ``standard'' punctuation
 
     # English-specific contractions
-    text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2}
-    text = text.gsub(/n't\b/o, " n't")
-    text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1}
-    result = text.split(' ')
-    return result
+    text = text.gsub(/([A-Za-z])'([dms])\b/o) { $1 + " '" + $2 } # Separate off 'd 'm 's
+    text = text.gsub(/n't\b/o, " n't") # Separate off n't
+    text = text.gsub(/'(ve|ll|re)\b/o) { " '" + $1 } # Separate off 've, 'll, 're
+    text.split(" ")
   end
 
   # Given a preceding tag, assign a tag word. Called by the add_tags method.
   # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
   def assign_tag(prev_tag, word)
-    if word == "-unknown-"
+    case word
+    when "-unknown-"
       # classify unknown words accordingly
       return @conf[:unknown_word_tag]
-    elsif word == "-sym-"
+    when "-sym-"
       # If this is a symbol, tag it as a symbol
       return "sym"
     end
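A few illustrative inputs for split_punct, following the rules above:

    split_punct("isn't")       # => ["is", "n't"]
    split_punct("well-known,") # => ["well-known", ","]
    split_punct('"quote"')     # => ["``", "quote", "''"]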
@@ -766,13 +778,13 @@ class EngTagger
     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
     # which is used in most POS taggers
     best_tag = ""
-    t[prev_tag].keys.each do |tag|
+    t[prev_tag].each_key do |tag|
       # With @config[:relax] set, this method
       # will also include any `open classes' of POS tags
       pw = 0
       if w[tag]
         pw = w[tag]
-      elsif @conf[:relax] and tag =~ /\A(?:jj|nn|rb|vb)/
+      elsif @conf[:relax] && tag =~ /\A(?:jj|nn|rb|vb)/
         pw = 0
       else
         next
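For orientation, the two lookup tables driving this loop have the shapes documented for load_tags and load_words further down; the numbers here are the sample values from those comments:

    t = { "det" => { "jj" => 0.2, "nn" => 0.5, "vb" => 0.0002 } } # transition probabilities
    w = { "jj" => 103, "nn" => 34, "vb" => 1 }                    # per-word tag counts
    # the loop walks t[prev_tag] and keeps the best-scoring candidate in best_tag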
@@ -787,7 +799,7 @@ class EngTagger
         best_tag = tag
       end
     end
-    return best_tag
+    best_tag
   end
 
   # This method determines whether a word should be considered in its
@@ -797,13 +809,13 @@ class EngTagger
     lcf = lcfirst(word)
     # seen this word as it appears (lower or upper case)
     if @@lexicon[word]
-      return word
+      word
     elsif @@lexicon[lcf]
       # seen this word only as lower case
-      return lcf
+      lcf
     else
       # never seen this word. guess.
-      return classify_unknown_word(word)
+      classify_unknown_word(word)
     end
   end
 
@@ -811,52 +823,52 @@ class EngTagger
   # classes of words handled by a simple unknown word classification
   # metric. Called by the clean_word method.
   def classify_unknown_word(word)
-    if word =~ /[({\[]/ # Left brackets
-      classified = "*LRB*"
-    elsif word =~ /[)}\]]/ # Right brackets
-      classified = "*RRB*"
-    elsif word =~ /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ # Floating point number
-      classified = "*NUM*"
-    elsif word =~ /\A\d+[\d\/:-]+\d\z/ # Other number constructs
-      classified = "*NUM*"
-    elsif word =~ /\A-?\d+\w+\z/o # Ordinal number
-      classified = "*ORD*"
-    elsif word =~ /\A[A-Z][A-Z\.-]*\z/o # Abbreviation (all caps)
-      classified = "-abr-"
-    elsif word =~ /\w-\w/o # Hyphenated word
+    case word
+    when /[({\[]/ # Left brackets
+      "*LRB*"
+    when /[)}\]]/ # Right brackets
+      "*RRB*"
+    when /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ # Floating point number
+      "*NUM*"
+    when %r{\A\d+[\d/:-]+\d\z} # Other number constructs
+      "*NUM*"
+    when /\A-?\d+\w+\z/o # Ordinal number
+      "*ORD*"
+    when /\A[A-Z][A-Z.-]*\z/o # Abbreviation (all caps)
+      "-abr-"
+    when /\w-\w/o # Hyphenated word
       /-([^-]+)\z/ =~ word
       h_suffix = $1
-      if h_suffix and @@lexicon[h_suffix] and @@lexicon[h_suffix]["jj"]
+      if h_suffix && (@@lexicon[h_suffix] && @@lexicon[h_suffix]["jj"])
         # last part of this is defined as an adjective
-        classified = "-hyp-adj-"
+        "-hyp-adj-"
       else
         # last part of this is not defined as an adjective
-        classified = "-hyp-"
+        "-hyp-"
       end
-    elsif word =~ /\A\W+\z/o # Symbol
-      classified = "-sym-"
-    elsif word == ucfirst(word) # Capitalized word
-      classified = "-cap-"
-    elsif word =~ /ing\z/o # Ends in "ing"
-      classified = "-ing-"
-    elsif word =~ /s\z/o # Ends in "s"
-      classified = "-s-"
-    elsif word =~ /tion\z/o # Ends in "tion"
-      classified = "-tion-"
-    elsif word =~ /ly\z/o # Ends in "ly"
-      classified = "-ly-"
-    elsif word =~ /ed\z/o # Ends in "ed"
-      classified = "-ed-"
+    when /\A\W+\z/o
+      "-sym-" # Symbol
+    when ucfirst(word)
+      "-cap-" # Capitalized word
+    when /ing\z/o
+      "-ing-" # Ends in "ing"
+    when /s\z/o
+      "-s-" # Ends in "s"
+    when /tion\z/o
+      "-tion-" # Ends in "tion"
+    when /ly\z/o
+      "-ly-" # Ends in "ly"
+    when /ed\z/o
+      "-ed-" # Ends in "ed
     else
-      classified = "-unknown-" # Completely unknown
+      "-unknown-" # Completely unknown
     end
-    return classified
   end
 
   # This returns a compiled regexp for extracting maximal noun phrases
   # from a POS-tagged text.
   def get_max_noun_regex
-    regex = /
+    /
       # optional number, gerund - adjective -participle
       (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
       # Followed by one or more nouns
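Sample classifications under the case expression above:

    classify_unknown_word("42.5")    # => "*NUM*"
    classify_unknown_word("(")       # => "*LRB*"
    classify_unknown_word("IBM")     # => "-abr-"
    classify_unknown_word("quickly") # => "-ly-"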
@@ -869,8 +881,7 @@ class EngTagger
       # one or more nouns
       (?:#{NN})+
     )*
-      /xo
-    return regex
+    /xo
   end
 
   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
@@ -879,12 +890,13 @@ class EngTagger
   # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
   def load_tags(lexicon, lexpath = DEFAULT_LEXPATH)
     path = File.join(lexpath, lexicon)
-    fh = File.open(path, 'r')
-    while line = fh.gets
+    fh = File.open(path, "r")
+    while (line = fh.gets)
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-      next unless $1 and $2
-      key = $1
-      data = $2
+      next unless $1 && $2
+
+      key = $1
+      data = $2
       items = data.split(/,\s+/)
       pairs = {}
       items.each do |i|
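The line format the parser regex accepts, using the sample from the comment above:

    line = 'det: { jj: 0.2, nn: 0.5, vb: 0.0002 }'
    /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
    $1 # => "det"
    $2 # => "jj: 0.2, nn: 0.5, vb: 0.0002"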
@@ -902,12 +914,13 @@ class EngTagger
   # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
   def load_words(lexicon, lexpath = DEFAULT_LEXPATH)
     path = File.join(lexpath, lexicon)
-    fh = File.open(path, 'r')
-    while line = fh.gets
+    fh = File.open(path, "r")
+    while (line = fh.gets)
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-      next unless $1 and $2
-      key = $1
-      data = $2
+      next unless $1 && $2
+
+      key = $1
+      data = $2
       items = data.split(/,\s+/)
       pairs = {}
       items.each do |i|
@@ -919,7 +932,7 @@ class EngTagger
     fh.close
   end
 
-  #memoize the stem and assign_tag methods
+  # memoize the stem and assign_tag methods
   memoize("stem")
   memoize("assign_tag")
 end
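Putting it together, typical 0.4.0 usage per the gem's README (result hashes are illustrative):

    require "engtagger"

    tgr = EngTagger.new
    tagged = tgr.add_tags("Alice chased the big fat cat.")
    tgr.get_nouns(tagged)        # => e.g. {"cat" => 1}
    tgr.get_proper_nouns(tagged) # => e.g. {"Alice" => 1}
    tgr.get_adjectives(tagged)   # => e.g. {"big" => 1, "fat" => 1}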