engtagger 0.3.1 → 0.4.0

data/lib/engtagger.rb CHANGED
@@ -1,21 +1,21 @@
 #!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
 
-$LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
-require 'rubygems'
-require 'porter'
-require 'lru_redux'
+# frozen_string_literal: true
+
+require "rubygems"
+require "lru_redux"
+require_relative "engtagger/porter"
 
 module BoundedSpaceMemoizable
-  def memoize(method, max_cache_size=100000)
+  def memoize(method, max_cache_size = 100_000)
     # alias_method is faster than define_method + old.bind(self).call
     alias_method "__memoized__#{method}", method
-    module_eval <<-EOF
+    module_eval <<-MODEV
      def #{method}(*a)
        @__memoized_#{method}_cache ||= LruRedux::Cache.new(#{max_cache_size})
        @__memoized_#{method}_cache[a] ||= __memoized__#{method}(*a)
      end
-    EOF
+    MODEV
  end
 end
 
@@ -24,7 +24,7 @@ class EngTagger
   extend BoundedSpaceMemoizable
 
   # File paths
-  DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), 'engtagger')
+  DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), "engtagger")
   DEFAULT_WORDPATH = File.join(DEFAULT_LEXPATH, "pos_words.hash")
   DEFAULT_TAGPATH = File.join(DEFAULT_LEXPATH, "pos_tags.hash")
 
@@ -37,7 +37,7 @@ class EngTagger
   # @return [Hash] the probability data
   #
   def self.hmm
-    return @@hmm
+    @@hmm
   end
 
   # Return a class variable that holds lexical data.
@@ -45,46 +45,47 @@ class EngTagger
   # @return [Hash] the lexicon
   #
   def self.lexicon
-    return @@lexicon
+    @@lexicon
   end
 
   # Return a regexp from a string argument that matches an XML-style pos tag
   def self.get_ext(tag = nil)
     return nil unless tag
-    return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
+
+    Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
   end
 
   # Regexps to match XML-style part-of-speech tags
-  NUM = get_ext('cd')
-  GER = get_ext('vbg')
-  ADJ = get_ext('jj[rs]*')
-  NN = get_ext('nn[sp]*')
-  NNP = get_ext('nnp')
-  PREP = get_ext('in')
-  DET = get_ext('det')
-  PAREN = get_ext('[lr]rb')
-  QUOT = get_ext('ppr')
-  SEN = get_ext('pp')
-  WORD = get_ext('\w+')
-  VB = get_ext('vb')
-  VBG = get_ext('vbg')
-  VBD = get_ext('vbd')
-  PART = get_ext('vbn')
-  VBP = get_ext('vbp')
-  VBZ = get_ext('vbz')
-  JJ = get_ext('jj')
-  JJR = get_ext('jjr')
-  JJS = get_ext('jjs')
-  RB = get_ext('rb')
-  RBR = get_ext('rbr')
-  RBS = get_ext('rbs')
-  RP = get_ext('rp')
-  WRB = get_ext('wrb')
-  WDT = get_ext('wdt')
-  WP = get_ext('wp')
-  WPS = get_ext('wps')
-  CC = get_ext('cc')
-  IN = get_ext('in')
+  NUM = get_ext("cd")
+  GER = get_ext("vbg")
+  ADJ = get_ext("jj[rs]*")
+  NN = get_ext("nn[sp]*")
+  NNP = get_ext("nnp")
+  PREP = get_ext("in")
+  DET = get_ext("det")
+  PAREN = get_ext("[lr]rb")
+  QUOT = get_ext("ppr")
+  SEN = get_ext("pp")
+  WORD = get_ext("\w+")
+  VB = get_ext("vb")
+  VBG = get_ext("vbg")
+  VBD = get_ext("vbd")
+  PART = get_ext("vbn")
+  VBP = get_ext("vbp")
+  VBZ = get_ext("vbz")
+  JJ = get_ext("jj")
+  JJR = get_ext("jjr")
+  JJS = get_ext("jjs")
+  RB = get_ext("rb")
+  RBR = get_ext("rbr")
+  RBS = get_ext("rbs")
+  RP = get_ext("rp")
+  WRB = get_ext("wrb")
+  WDT = get_ext("wdt")
+  WP = get_ext("wp")
+  WPS = get_ext("wps")
+  CC = get_ext("cc")
+  IN = get_ext("in")
 
   # Convert a Treebank-style, abbreviated tag into verbose definitions
   #
@@ -93,11 +94,7 @@ class EngTagger
   #
   def self.explain_tag(tag)
     tag = tag.to_s.downcase
-    if TAGS[tag]
-      return TAGS[tag]
-    else
-      return tag
-    end
+    TAGS[tag] || tag
   end
 
   # The folloging is to make a hash to convert a pos tag to its definition
@@ -148,8 +145,8 @@ class EngTagger
     "LRB", "Punctuation, left bracket",
     "RRB", "Punctuation, right bracket"
   ]
-  tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
-  tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
+  tags = tags.collect { |t| t.downcase.gsub(/[.,'\-\s]+/, "_") }
+  tags = tags.collect { |t| t.gsub("&", "and").gsub("/", "or") }
   TAGS = Hash[*tags]
 
   # Hash storing config values:
@@ -191,30 +188,30 @@ class EngTagger
   # Take a hash of parameters that override default values.
   # See above for details.
   def initialize(params = {})
-    @conf = Hash.new
-    @conf[:unknown_word_tag] = ''
+    @conf = {}
+    @conf[:unknown_word_tag] = ""
     @conf[:stem] = false
     @conf[:weight_noun_phrases] = false
     @conf[:longest_noun_phrase] = 5
     @conf[:relax] = false
-    @conf[:tag_lex] = 'tags.yml'
-    @conf[:word_lex] = 'words.yml'
-    @conf[:unknown_lex] = 'unknown.yml'
+    @conf[:tag_lex] = "tags.yml"
+    @conf[:word_lex] = "words.yml"
+    @conf[:unknown_lex] = "unknown.yml"
     @conf[:word_path] = DEFAULT_WORDPATH
     @conf[:tag_path] = DEFAULT_TAGPATH
     @conf[:debug] = false
     # assuming that we start analyzing from the beginninga new sentence...
-    @conf[:current_tag] = 'pp'
+    @conf[:current_tag] = "pp"
     @conf.merge!(params) if params
-    unless File.exist?(@conf[:word_path]) and File.exist?(@conf[:tag_path])
+    if !File.exist?(@conf[:word_path]) || !File.exist?(@conf[:tag_path])
       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
-      @@hmm = Hash.new
-      @@lexicon = Hash.new
+      @@hmm = {}
+      @@lexicon = {}
     else
-      lexf = File.open(@conf[:word_path], 'r')
+      lexf = File.open(@conf[:word_path], "r")
       @@lexicon = Marshal.load(lexf)
       lexf.close
-      hmmf = File.open(@conf[:tag_path], 'r')
+      hmmf = File.open(@conf[:tag_path], "r")
       @@hmm = Marshal.load(hmmf)
       hmmf.close
     end
@@ -236,7 +233,7 @@ class EngTagger
     out = clean_text(text).map do |word|
       cleaned_word = clean_word word
       tag = assign_tag(@conf[:current_tag], cleaned_word)
-      @conf[:current_tag] = tag = (tag and !tag.empty?) ? tag : 'nn'
+      @conf[:current_tag] = tag = tag && !tag.empty? ? tag : "nn"
       [word, tag.to_sym]
     end
 
@@ -255,18 +252,18 @@ class EngTagger
   # Examine the string provided and return it fully tagged in XML style
   def add_tags(text, verbose = false)
     return nil unless valid_text(text)
+
     tagged = []
     words = clean_text(text)
-    tags = Array.new
     words.each do |word|
       cleaned_word = clean_word(word)
       tag = assign_tag(@conf[:current_tag], cleaned_word)
-      @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
+      @conf[:current_tag] = tag = tag && tag != "" ? tag : "nn"
       tag = EngTagger.explain_tag(tag) if verbose
-      tagged << '<' + tag + '>' + word + '</' + tag + '>'
+      tagged << "<#{tag}>#{word}</#{tag}>"
     end
     reset
-    return tagged.join(' ')
+    tagged.join(" ")
   end
 
   # Given a text string, return as many nouns and noun phrases as possible.
@@ -278,11 +275,12 @@ class EngTagger
   #
   def get_words(text)
     return false unless valid_text(text)
+
     tagged = add_tags(text)
-    if(@conf[:longest_noun_phrase] <= 1)
-      return get_nouns(tagged)
+    if @conf[:longest_noun_phrase] <= 1
+      get_nouns(tagged)
     else
-      return get_noun_phrases(tagged)
+      get_noun_phrases(tagged)
     end
   end
 
@@ -290,29 +288,29 @@ class EngTagger
   # Applies add_tags and reformats to be easier to read.
   def get_readable(text, verbose = false)
     return nil unless valid_text(text)
+
     tagged = add_tags(text, verbose)
-    tagged = tagged.gsub(/<\w+>([^<]+|[<\w>]+)<\/(\w+)>/o) do
-      #!!!# tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
-      $1 + '/' + $2.upcase
+    tagged.gsub(%r{<\w+>([^<]+|[<\w>]+)</(\w+)>}o) do
+      "#{$1}/#{$2.upcase}"
     end
   end
 
   # Return an array of sentences (without POS tags) from a text.
   def get_sentences(text)
     return nil unless valid_text(text)
+
     tagged = add_tags(text)
-    sentences = Array.new
-    tagged.split(/<\/pp>/).each do |line|
+    sentences = []
+    tagged.split(%r{</pp>}).each do |line|
       sentences << strip_tags(line)
     end
     sentences = sentences.map do |sentence|
-      sentence.gsub(Regexp.new(" ('s?) ")){$1 + ' '}
-      sentence.gsub(Regexp.new(" (\W+) ")){$1 + ' '}
-      sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
-      sentence.gsub(Regexp.new(" (\W+)$")){$1}
-      sentence.gsub(Regexp.new("^(`+) ")){$1}
+      sentence.gsub(Regexp.new(" ('s?) ")) { $1 + " " }
+      sentence.gsub(Regexp.new(" (\W+) ")) { $1 + " " }
+      sentence.gsub(Regexp.new(" (`+) ")) { " " + $1 }
+      sentence.gsub(Regexp.new(" (\W+)$")) { $1 }
+      sentence.gsub(Regexp.new("^(`+) ")) { $1 }
     end
-    return sentences
   end
 
   # Given a POS-tagged text, this method returns a hash of all proper nouns
@@ -322,30 +320,31 @@ class EngTagger
   # proper nouns. This method does not stem the found words.
   def get_proper_nouns(tagged)
     return nil unless valid_text(tagged)
+
     tags = [NNP]
     nnp = build_matches_hash(build_trimmed(tagged, tags))
     # Now for some fancy resolution stuff...
-    nnp.keys.each do |key|
+    nnp.each_key do |key|
       words = key.split(/\s/)
       # Let's say this is an organization's name --
       # (and it's got at least three words)
       # is there a corresponding acronym in this hash?
-      if words.length > 2
-        # Make a (naive) acronym out of this name
-        acronym = words.map do |word|
-          /\A([a-z])[a-z]*\z/ =~ word
-          $1
-        end.join ''
-        # If that acronym has been seen,
-        # remove it and add the values to
-        # the full name
-        if nnp[acronym]
-          nnp[key] += nnp[acronym]
-          nnp.delete(acronym)
-        end
+      next if words.length <= 2
+
+      # Make a (naive) acronym out of this name
+      acronym = words.map do |word|
+        /\A([a-z])[a-z]*\z/ =~ word
+        $1
+      end.join " "
+      # If that acronym has been seen,
+      # remove it and add the values to
+      # the full name
+      if nnp[acronym]
+        nnp[key] += nnp[acronym]
+        nnp.delete(acronym)
       end
     end
-    return nnp
+    nnp
   end
 
   # Given a POS-tagged text, this method returns all nouns and their
@@ -356,6 +355,7 @@ class EngTagger
   #
   def get_nouns(tagged)
     return nil unless valid_text(tagged)
+
     tags = [NN]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -369,6 +369,7 @@ class EngTagger
   #
   def get_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VB, VBD, VBG, PART, VBP, VBZ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -380,6 +381,7 @@ class EngTagger
 
   def get_infinitive_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VB]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -390,6 +392,7 @@ class EngTagger
   #
   def get_past_tense_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBD]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -400,6 +403,7 @@ class EngTagger
   #
   def get_gerund_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBG]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -410,6 +414,7 @@ class EngTagger
   #
   def get_passive_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [PART]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -420,6 +425,7 @@ class EngTagger
   #
   def get_base_present_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBP]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -430,6 +436,7 @@ class EngTagger
   #
   def get_present_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBZ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -440,6 +447,7 @@ class EngTagger
   #
   def get_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -450,6 +458,7 @@ class EngTagger
   #
   def get_comparative_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJR]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -460,6 +469,7 @@ class EngTagger
   #
   def get_superlative_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJS]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -470,6 +480,7 @@ class EngTagger
   #
   def get_adverbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [RB, RBR, RBS, RP]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -480,13 +491,14 @@ class EngTagger
   #
   def get_interrogatives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [WRB, WDT, WP, WPS]
     build_matches_hash(build_trimmed(tagged, tags))
   end
 
   # To be consistent with documentation's naming of 'interrogative'
   # parts of speech as 'question'
-  alias_method :get_question_parts, :get_interrogatives
+  alias get_question_parts get_interrogatives
 
   # Returns all types of conjunctions and does not discriminate
   # between the various kinds. E.g. coordinating, subordinating,
@@ -497,6 +509,7 @@ class EngTagger
   #
   def get_conjunctions(tagged)
     return nil unless valid_text(tagged)
+
     tags = [CC, IN]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -509,14 +522,15 @@ class EngTagger
   #
   def get_max_noun_phrases(tagged)
     return nil unless valid_text(tagged)
+
     tags = [@@mnp]
     mn_phrases = build_trimmed(tagged, tags)
     ret = Hash.new(0)
     mn_phrases.each do |p|
-      p = stem(p) unless p =~ /\s/ # stem single words
+      p = stem(p) unless p =~ /\s/  # stem single words
       ret[p] += 1 unless p =~ /\A\s*\z/
     end
-    return ret
+    ret
   end
 
   # Similar to get_words, but requires a POS-tagged text as an argument.
@@ -526,9 +540,10 @@ class EngTagger
   #
   def get_noun_phrases(tagged)
     return nil unless valid_text(tagged)
+
     found = Hash.new(0)
     phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
-    scanned = tagged.scan(@@mnp)
+    scanned  = tagged.scan(@@mnp)
     # Find MNPs in the text, one sentence at a time
     # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
     mn_phrases = []
@@ -541,14 +556,14 @@ class EngTagger
       # shortening the phrase by removing the word in the first position.
       # Record the phrase and any single nouns that are found
       words = mnp.split
-      words.length.times do |i|
-        found[words.join(' ')] += 1 if words.length > 1
+      words.length.times do
+        found[words.join(" ")] += 1 if words.length > 1
         w = words.shift
         found[w] += 1 if w =~ /#{NN}/
       end
     end
     ret = Hash.new(0)
-    found.keys.each do |f|
+    found.each_key do |f|
       k = strip_tags(f)
       v = found[f]
       # We weight by the word count to favor long noun phrases
@@ -556,12 +571,13 @@ class EngTagger
       word_count = space_count.length + 1
       # Throttle MNPs if necessary
      next if word_count > @conf[:longest_noun_phrase]
-      k = stem(k) unless word_count > 1 # stem single words
+
+      k = stem(k) unless word_count > 1  # stem single words
       multiplier = 1
       multiplier = word_count if @conf[:weight_noun_phrases]
       ret[k] += multiplier * v
     end
-    return ret
+    ret
   end
 
   # Reads some included corpus data and saves it in a stored hash on the
  # Reads some included corpus data and saves it in a stored hash on the
@@ -572,10 +588,10 @@ class EngTagger
572
588
  load_tags(@conf[:tag_lex])
573
589
  load_words(@conf[:word_lex])
574
590
  load_words(@conf[:unknown_lex])
575
- File.open(@conf[:word_path], 'w') do |f|
591
+ File.open(@conf[:word_path], "w") do |f|
576
592
  Marshal.dump(@@lexicon, f)
577
593
  end
578
- File.open(@conf[:tag_path], 'w') do |f|
594
+ File.open(@conf[:tag_path], "w") do |f|
579
595
  Marshal.dump(@@hmm, f)
580
596
  end
581
597
  end
@@ -597,6 +613,7 @@ class EngTagger
     trimmed.each do |n|
       n = stem(n)
       next unless n.length < 100 # sanity check on word length
+
       ret[n] += 1 unless n =~ /\A\s*\z/
     end
     ret
@@ -604,25 +621,24 @@ class EngTagger
 
   # Downcase the first letter of word
   def lcfirst(word)
-    word.split(//)[0].downcase + word.split(//)[1..-1].join
+    word.split(//)[0].downcase + word.split(//)[1..].join
   end
 
   # Upcase the first letter of word
   def ucfirst(word)
-    word.split(//)[0].upcase + word.split(//)[1..-1].join
+    word.split(//)[0].upcase + word.split(//)[1..].join
   end
 
   # Return the word stem as given by Stemmable module. This can be
   # turned off with the class parameter @conf[:stem] => false.
   def stem(word)
-    return word unless @conf[:stem]
-    return word.stem
+    @conf[:stem] ? word.stem : word
   end
 
   # This method will reset the preceeding tag to a sentence ender (PP).
   # This prepares the first word of a new sentence to be tagged correctly.
   def reset
-    @conf[:current_tag] = 'pp'
+    @conf[:current_tag] = "pp"
   end
 
   # Check whether the text is a valid string
@@ -630,41 +646,38 @@ class EngTagger
     if !text
       # there's nothing to parse
       "method call on uninitialized variable" if @conf[:debug]
-      return false
+      false
     elsif /\A\s*\z/ =~ text
       # text is an empty string, nothing to parse
-      return false
+      false
     else
       # $text is valid
-      return true
+      true
     end
   end
 
   # Return a text string with the part-of-speech tags removed
   def strip_tags(tagged, downcase = false)
     return nil unless valid_text(tagged)
+
     text = tagged.gsub(/<[^>]+>/m, "")
     text = text.gsub(/\s+/m, " ")
     text = text.gsub(/\A\s*/, "")
     text = text.gsub(/\s*\z/, "")
-    if downcase
-      return text.downcase
-    else
-      return text
-    end
+    downcase ? text.downcase : text
   end
 
   # Strip the provided text and separate off any punctuation in preparation for tagging
   def clean_text(text)
     return false unless valid_text(text)
-    cleaned_text = text.encode('utf-8')
+
+    cleaned_text = text.encode("utf-8")
     tokenized = []
     # Tokenize the text (splitting on punctuation as you go)
     cleaned_text.split(/\s+/).each do |line|
       tokenized += split_punct(line)
     end
-    words = split_sentences(tokenized)
-    return words
+    split_sentences(tokenized)
   end
 
   # This handles all of the trailing periods, keeping those that
@@ -673,27 +686,26 @@ class EngTagger
   # about the use of capitalization in the incoming text
   def split_sentences(array)
     tokenized = array
-    people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
-                supt det mssrs rev)
-    army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
-    inst = %w(dept univ assn bros ph.d)
-    place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
-               hwy hway la pde pd plz pl rd st tce)
-    comp = %w(mfg inc ltd co corp)
-    state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+    people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+                supt det mssrs rev]
+    army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
+    inst = %w[dept univ assn bros ph.d]
+    place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+               hwy hway la pde pd plz pl rd st tce]
+    comp = %w[mfg inc ltd co corp]
+    state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
                ind ia kans kan ken ky la me md is mass mich minn miss mo mont
               neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
-               va wash wis wisc wy wyo usafa alta man ont que sask yuk)
-    month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
-    misc = %w(vs etc no esp)
-    abbr = Hash.new
+               va wash wis wisc wy wyo usafa alta man ont que sask yuk]
+    month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
+    misc = %w[vs etc no esp]
+    abbr = {}
     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
       abbr[i] = true
     end
-    words = Array.new
-    tokenized.each_with_index do |t, i|
-      if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and
-          tokenized[i] =~ /\A(.+)\.\z/
+    words = []
+    tokenized.each_with_index do |_, i|
+      if tokenized[i + 1] && tokenized [i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
        w = $1
        # Don't separate the period off words that
        # meet any of the following conditions:
@@ -701,21 +713,20 @@ class EngTagger
         # 1. It is defined in one of the lists above
         # 2. It is only one letter long: Alfred E. Sloan
         # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
-        unless abbr[w.downcase] or
-            [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
+        unless abbr[w.downcase] || [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
           words << w
-          words << '.'
+          words << "."
           next
         end
       end
       words << tokenized[i]
     end
     # If the final word ends in a period..
-    if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
+    if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
       words[-1] = $1
-      words.push '.'
+      words.push "."
     end
-    return words
+    words
   end
 
   # Separate punctuation from words, where appropriate. This leaves trailing
@@ -723,39 +734,40 @@ class EngTagger
   def split_punct(text)
     # If there's no punctuation, return immediately
     return [text] if /\A\w+\z/ =~ text
+
     # Sanity checks
     text = text.gsub(/\W{10,}/o, " ")
 
     # Put quotes into a standard format
     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
     text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
-    text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+    text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? $1 + " ` " : " ` " } # Convert left quotes to `
     text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
-    text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
+    text = text.gsub(/(\w)'(?!')(?=\W|$)/o) { $1 + " ' " } # Separate right single quotes
 
     # Handle all other punctuation
     text = text.gsub(/--+/o, " - ") # Convert and separate dashes
     text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
     text = text.gsub(/:/o, " : ") # Shift semicolons off
-    text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
-    text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
-    text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
+    text = text.gsub(/(\.\.\.+)/o) { " " + $1 + " " } # Shift ellipses off
+    text = text.gsub(/([(\[{}\])])/o) { " " + $1 + " " } # Shift off brackets
+    text = text.gsub(/([!?#$%;~|])/o) { " " + $1 + " " } # Shift off other ``standard'' punctuation
 
     # English-specific contractions
-    text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
-    text = text.gsub(/n't\b/o, " n't") # Separate off n't
-    text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
-    result = text.split(' ')
-    return result
+    text = text.gsub(/([A-Za-z])'([dms])\b/o) { $1 + " '" + $2 } # Separate off 'd 'm 's
+    text = text.gsub(/n't\b/o, " n't") # Separate off n't
+    text = text.gsub(/'(ve|ll|re)\b/o) { " '" + $1 } # Separate off 've, 'll, 're
+    text.split(" ")
   end
 
   # Given a preceding tag, assign a tag word. Called by the add_tags method.
   # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
   def assign_tag(prev_tag, word)
-    if word == "-unknown-"
+    case word
+    when "-unknown-"
       # classify unknown words accordingly
       return @conf[:unknown_word_tag]
-    elsif word == "-sym-"
+    when "-sym-"
       # If this is a symbol, tag it as a symbol
       return "sym"
     end
@@ -766,13 +778,13 @@ class EngTagger
     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
     # which is used in most POS taggers
     best_tag = ""
-    t[prev_tag].keys.each do |tag|
+    t[prev_tag].each_key do |tag|
       # With @config[:relax] set, this method
       # will also include any `open classes' of POS tags
       pw = 0
       if w[tag]
         pw = w[tag]
-      elsif @conf[:relax] and tag =~ /\A(?:jj|nn|rb|vb)/
+      elsif @conf[:relax] && tag =~ /\A(?:jj|nn|rb|vb)/
         pw = 0
       else
         next
@@ -787,7 +799,7 @@ class EngTagger
         best_tag = tag
       end
     end
-    return best_tag
+    best_tag
   end
 
   # This method determines whether a word should be considered in its
  # This method determines whether a word should be considered in its
@@ -797,13 +809,13 @@ class EngTagger
797
809
  lcf = lcfirst(word)
798
810
  # seen this word as it appears (lower or upper case)
799
811
  if @@lexicon[word]
800
- return word
812
+ word
801
813
  elsif @@lexicon[lcf]
802
814
  # seen this word only as lower case
803
- return lcf
815
+ lcf
804
816
  else
805
817
  # never seen this word. guess.
806
- return classify_unknown_word(word)
818
+ classify_unknown_word(word)
807
819
  end
808
820
  end
809
821
 
@@ -811,52 +823,52 @@ class EngTagger
   # classes of words handled by a simple unknown word classification
   # metric. Called by the clean_word method.
   def classify_unknown_word(word)
-    if /[\(\{\[]/ =~ word # Left brackets
-      classified = "*LRB*"
-    elsif /[\)\}\]]/ =~ word # Right brackets
-      classified = "*RRB*"
-    elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
-      classified = "*NUM*"
-    elsif /\A\d+[\d\/:-]+\d\z/ =~ word # Other number constructs
-      classified = "*NUM*"
-    elsif /\A-?\d+\w+\z/o =~ word # Ordinal number
-      classified = "*ORD*"
-    elsif /\A[A-Z][A-Z\.-]*\z/o =~ word # Abbreviation (all caps)
-      classified = "-abr-"
-    elsif /\w-\w/o =~ word # Hyphenated word
+    case word
+    when /[({\[]/ # Left brackets
+      "*LRB*"
+    when /[)}\]]/ # Right brackets
+      "*RRB*"
+    when /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ # Floating point number
+      "*NUM*"
+    when %r{\A\d+[\d/:-]+\d\z} # Other number constructs
+      "*NUM*"
+    when /\A-?\d+\w+\z/o # Ordinal number
+      "*ORD*"
+    when /\A[A-Z][A-Z.-]*\z/o # Abbreviation (all caps)
+      "-abr-"
+    when /\w-\w/o # Hyphenated word
       /-([^-]+)\z/ =~ word
       h_suffix = $1
-      if h_suffix and (@@lexicon[h_suffix] and @@lexicon[h_suffix]['jj'])
+      if h_suffix && (@@lexicon[h_suffix] && @@lexicon[h_suffix]["jj"])
         # last part of this is defined as an adjective
-        classified = "-hyp-adj-"
+        "-hyp-adj-"
       else
         # last part of this is not defined as an adjective
-        classified = "-hyp-"
+        "-hyp-"
       end
-    elsif /\A\W+\z/o =~ word
-      classified = "-sym-" # Symbol
-    elsif word == ucfirst(word)
-      classified = "-cap-" # Capitalized word
-    elsif /ing\z/o =~ word
-      classified = "-ing-" # Ends in 'ing'
-    elsif /s\z/o =~ word
-      classified = "-s-" # Ends in 's'
-    elsif /tion\z/o =~ word
-      classified = "-tion-" # Ends in 'tion'
-    elsif /ly\z/o =~ word
-      classified = "-ly-" # Ends in 'ly'
-    elsif /ed\z/o =~ word
-      classified = "-ed-" # Ends in 'ed
+    when /\A\W+\z/o
+      "-sym-" # Symbol
+    when ucfirst(word)
+      "-cap-" # Capitalized word
+    when /ing\z/o
+      "-ing-" # Ends in "ing"
+    when /s\z/o
+      "-s-" # Ends in "s"
+    when /tion\z/o
+      "-tion-" # Ends in "tion"
+    when /ly\z/o
+      "-ly-" # Ends in "ly"
+    when /ed\z/o
+      "-ed-" # Ends in "ed
     else
-      classified = "-unknown-" # Completely unknown
+      "-unknown-" # Completely unknown
     end
-    return classified
   end
 
   # This returns a compiled regexp for extracting maximal noun phrases
   # from a POS-tagged text.
   def get_max_noun_regex
-    regex = /
+    /
      # optional number, gerund - adjective -participle
      (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
      # Followed by one or more nouns
@@ -869,8 +881,7 @@ class EngTagger
       # one or more nouns
       (?:#{NN})+
     )*
-    /xo #/
-    return regex
+    /xo
   end
 
   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
@@ -879,12 +890,13 @@ class EngTagger
   # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
   def load_tags(lexicon, lexpath = DEFAULT_LEXPATH)
     path = File.join(lexpath, lexicon)
-    fh = File.open(path, 'r')
-    while line = fh.gets
+    fh = File.open(path, "r")
+    while (line = fh.gets)
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-      next unless $1 and $2
-      key, data = $1, $2
-      tags = Hash.new
+      next unless $1 && $2
+
+      key = $1
+      data = $2
       items = data.split(/,\s+/)
       pairs = {}
       items.each do |i|
@@ -902,12 +914,13 @@ class EngTagger
   # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
   def load_words(lexicon, lexpath = DEFAULT_LEXPATH)
     path = File.join(lexpath, lexicon)
-    fh = File.open(path, 'r')
-    while line = fh.gets
+    fh = File.open(path, "r")
+    while (line = fh.gets)
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-      next unless $1 and $2
-      key, data = $1, $2
-      tags = Hash.new
+      next unless $1 && $2
+
+      key = $1
+      data = $2
       items = data.split(/,\s+/)
       pairs = {}
       items.each do |i|
@@ -919,7 +932,7 @@ class EngTagger
     fh.close
   end
 
-  #memoize the stem and assign_tag methods
+  # memoize the stem and assign_tag methods
  memoize("stem")
  memoize("assign_tag")
 end
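
Usage sketch (not part of the diff): the changes above are almost entirely stylistic RuboCop-driven cleanups (double-quoted strings, && and || for and/or, implicit returns, each_key), so typical 0.3.1-era calling code runs unchanged on 0.4.0. The sample sentence and the inline result comments below are illustrative only, not verbatim output.

  require "engtagger"

  tgr = EngTagger.new
  text = "The quick brown fox jumps over the lazy dog"

  tagged = tgr.add_tags(text)        # XML-style tags, roughly "<det>The</det> <jj>quick</jj> ..."
  readable = tgr.get_readable(text)  # word/TAG pairs, roughly "The/DET quick/JJ ..."
  nouns = tgr.get_nouns(tagged)      # hash mapping each noun to its count
  proper = tgr.get_proper_nouns(tagged)

Two behavioral edges are worth noting. First, require_relative "engtagger/porter" replaces the old $LOAD_PATH push, so a downstream require "porter" that silently relied on this gem's lib/engtagger directory being on the load path will no longer resolve. Second, the acronym builder in get_proper_nouns now joins initials with " " instead of "", which changes how acronyms are matched back to multi-word proper names.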