engtagger 0.3.2 → 0.4.0

data/lib/engtagger.rb CHANGED
@@ -1,20 +1,21 @@
 #!/usr/bin/env ruby
-# -*- coding: utf-8 -*-
 
-require 'rubygems'
-require 'engtagger/porter'
-require 'lru_redux'
+# frozen_string_literal: true
+
+require "rubygems"
+require "lru_redux"
+require_relative "engtagger/porter"
 
 module BoundedSpaceMemoizable
-  def memoize(method, max_cache_size=100000)
+  def memoize(method, max_cache_size = 100_000)
     # alias_method is faster than define_method + old.bind(self).call
     alias_method "__memoized__#{method}", method
-    module_eval <<-EOF
+    module_eval <<-MODEV
      def #{method}(*a)
        @__memoized_#{method}_cache ||= LruRedux::Cache.new(#{max_cache_size})
        @__memoized_#{method}_cache[a] ||= __memoized__#{method}(*a)
      end
-    EOF
+    MODEV
   end
 end
 
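Note: BoundedSpaceMemoizable trades unbounded memoization for an LRU cache capped at max_cache_size argument lists, so a long-running tagger cannot grow without limit. A minimal sketch of the behavior, reusing the module exactly as defined above (the Slow class and its cache size of 2 are hypothetical, for illustration only):

    require "engtagger" # loads BoundedSpaceMemoizable and lru_redux

    class Slow
      extend BoundedSpaceMemoizable

      def square(n)
        sleep 0.1 # stand-in for an expensive computation
        n * n
      end
      memoize("square", 2) # keep only the 2 most recently used argument lists
    end

    s = Slow.new
    s.square(3)               # computed, then cached
    s.square(3)               # served from the LRU cache
    s.square(4); s.square(5)  # evicts the oldest entry, bounding memory
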
@@ -23,7 +24,7 @@ class EngTagger
   extend BoundedSpaceMemoizable
 
   # File paths
-  DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), 'engtagger')
+  DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), "engtagger")
   DEFAULT_WORDPATH = File.join(DEFAULT_LEXPATH, "pos_words.hash")
   DEFAULT_TAGPATH = File.join(DEFAULT_LEXPATH, "pos_tags.hash")
 
@@ -36,7 +37,7 @@ class EngTagger
   # @return [Hash] the probability data
   #
   def self.hmm
-    return @@hmm
+    @@hmm
   end
 
   # Return a class variable that holds lexical data.
@@ -44,46 +45,47 @@ class EngTagger
   # @return [Hash] the lexicon
   #
   def self.lexicon
-    return @@lexicon
+    @@lexicon
   end
 
   # Return a regexp from a string argument that matches an XML-style pos tag
   def self.get_ext(tag = nil)
     return nil unless tag
-    return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
+
+    Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
   end
 
   # Regexps to match XML-style part-of-speech tags
-  NUM = get_ext('cd')
-  GER = get_ext('vbg')
-  ADJ = get_ext('jj[rs]*')
-  NN = get_ext('nn[sp]*')
-  NNP = get_ext('nnp')
-  PREP = get_ext('in')
-  DET = get_ext('det')
-  PAREN = get_ext('[lr]rb')
-  QUOT = get_ext('ppr')
-  SEN = get_ext('pp')
-  WORD = get_ext('\w+')
-  VB = get_ext('vb')
-  VBG = get_ext('vbg')
-  VBD = get_ext('vbd')
-  PART = get_ext('vbn')
-  VBP = get_ext('vbp')
-  VBZ = get_ext('vbz')
-  JJ = get_ext('jj')
-  JJR = get_ext('jjr')
-  JJS = get_ext('jjs')
-  RB = get_ext('rb')
-  RBR = get_ext('rbr')
-  RBS = get_ext('rbs')
-  RP = get_ext('rp')
-  WRB = get_ext('wrb')
-  WDT = get_ext('wdt')
-  WP = get_ext('wp')
-  WPS = get_ext('wps')
-  CC = get_ext('cc')
-  IN = get_ext('in')
+  NUM = get_ext("cd")
+  GER = get_ext("vbg")
+  ADJ = get_ext("jj[rs]*")
+  NN = get_ext("nn[sp]*")
+  NNP = get_ext("nnp")
+  PREP = get_ext("in")
+  DET = get_ext("det")
+  PAREN = get_ext("[lr]rb")
+  QUOT = get_ext("ppr")
+  SEN = get_ext("pp")
+  WORD = get_ext("\w+")
+  VB = get_ext("vb")
+  VBG = get_ext("vbg")
+  VBD = get_ext("vbd")
+  PART = get_ext("vbn")
+  VBP = get_ext("vbp")
+  VBZ = get_ext("vbz")
+  JJ = get_ext("jj")
+  JJR = get_ext("jjr")
+  JJS = get_ext("jjs")
+  RB = get_ext("rb")
+  RBR = get_ext("rbr")
+  RBS = get_ext("rbs")
+  RP = get_ext("rp")
+  WRB = get_ext("wrb")
+  WDT = get_ext("wdt")
+  WP = get_ext("wp")
+  WPS = get_ext("wps")
+  CC = get_ext("cc")
+  IN = get_ext("in")
 
   # Convert a Treebank-style, abbreviated tag into verbose definitions
   #
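Note: the constant block above just precompiles get_ext for each Treebank tag; each constant matches one XML-style tagged span. A quick sketch, assuming the gem is installed (the pattern shown in the comment is approximate):

    require "engtagger"

    EngTagger.get_ext("vbz")  # => a Regexp along the lines of /<vbz>[^<]+<\/vbz> */
    EngTagger::NN =~ "<det>the</det> <nn>dog</nn>"  # matches the <nn>dog</nn> span
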
@@ -92,11 +94,7 @@ class EngTagger
   #
   def self.explain_tag(tag)
     tag = tag.to_s.downcase
-    if TAGS[tag]
-      return TAGS[tag]
-    else
-      return tag
-    end
+    TAGS[tag] || tag
   end
 
   # The folloging is to make a hash to convert a pos tag to its definition
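Note: explain_tag is now a single expression. Given the TAGS table assembled just below (tag names downcased, definitions snake_cased), for example:

    require "engtagger"

    EngTagger.explain_tag("lrb")    # => "punctuation_left_bracket" (from the "LRB" entry)
    EngTagger.explain_tag("bogus")  # => "bogus" -- unknown tags fall through unchanged
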
@@ -147,8 +145,8 @@ class EngTagger
     "LRB", "Punctuation, left bracket",
     "RRB", "Punctuation, right bracket"
   ]
-  tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
-  tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
+  tags = tags.collect { |t| t.downcase.gsub(/[.,'\-\s]+/, "_") }
+  tags = tags.collect { |t| t.gsub("&", "and").gsub("/", "or") }
   TAGS = Hash[*tags]
 
   # Hash storing config values:
@@ -190,30 +188,30 @@ class EngTagger
   # Take a hash of parameters that override default values.
   # See above for details.
   def initialize(params = {})
-    @conf = Hash.new
-    @conf[:unknown_word_tag] = ''
+    @conf = {}
+    @conf[:unknown_word_tag] = ""
     @conf[:stem] = false
     @conf[:weight_noun_phrases] = false
     @conf[:longest_noun_phrase] = 5
     @conf[:relax] = false
-    @conf[:tag_lex] = 'tags.yml'
-    @conf[:word_lex] = 'words.yml'
-    @conf[:unknown_lex] = 'unknown.yml'
+    @conf[:tag_lex] = "tags.yml"
+    @conf[:word_lex] = "words.yml"
+    @conf[:unknown_lex] = "unknown.yml"
     @conf[:word_path] = DEFAULT_WORDPATH
     @conf[:tag_path] = DEFAULT_TAGPATH
     @conf[:debug] = false
     # assuming that we start analyzing from the beginninga new sentence...
-    @conf[:current_tag] = 'pp'
+    @conf[:current_tag] = "pp"
     @conf.merge!(params) if params
-    unless File.exist?(@conf[:word_path]) and File.exist?(@conf[:tag_path])
+    if !File.exist?(@conf[:word_path]) || !File.exist?(@conf[:tag_path])
       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
-      @@hmm = Hash.new
-      @@lexicon = Hash.new
+      @@hmm = {}
+      @@lexicon = {}
     else
-      lexf = File.open(@conf[:word_path], 'r')
+      lexf = File.open(@conf[:word_path], "r")
       @@lexicon = Marshal.load(lexf)
       lexf.close
-      hmmf = File.open(@conf[:tag_path], 'r')
+      hmmf = File.open(@conf[:tag_path], "r")
       @@hmm = Marshal.load(hmmf)
       hmmf.close
     end
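Note: the constructor merges user-supplied params over these defaults, so any @conf key can be overridden at construction time. The values below are illustrative, not recommendations:

    require "engtagger"

    tgr = EngTagger.new(longest_noun_phrase: 3, weight_noun_phrases: true, relax: true)
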
@@ -235,7 +233,7 @@ class EngTagger
     out = clean_text(text).map do |word|
       cleaned_word = clean_word word
       tag = assign_tag(@conf[:current_tag], cleaned_word)
-      @conf[:current_tag] = tag = (tag and !tag.empty?) ? tag : 'nn'
+      @conf[:current_tag] = tag = tag && !tag.empty? ? tag : "nn"
       [word, tag.to_sym]
     end
 
@@ -254,18 +252,18 @@ class EngTagger
   # Examine the string provided and return it fully tagged in XML style
   def add_tags(text, verbose = false)
     return nil unless valid_text(text)
+
     tagged = []
     words = clean_text(text)
-    tags = Array.new
     words.each do |word|
       cleaned_word = clean_word(word)
       tag = assign_tag(@conf[:current_tag], cleaned_word)
-      @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
+      @conf[:current_tag] = tag = tag && tag != "" ? tag : "nn"
       tag = EngTagger.explain_tag(tag) if verbose
-      tagged << '<' + tag + '>' + word + '</' + tag + '>'
+      tagged << "<#{tag}>#{word}</#{tag}>"
     end
     reset
-    return tagged.join(' ')
+    tagged.join(" ")
   end
 
   # Given a text string, return as many nouns and noun phrases as possible.
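Note: add_tags wraps each token in an XML-style tag pair (now via string interpolation) and resets the tagger for the next call. Exact tags depend on the bundled lexicon, so the output here is indicative:

    require "engtagger"

    tgr = EngTagger.new
    tgr.add_tags("The dog runs")
    # => e.g. "<det>The</det> <nn>dog</nn> <vbz>runs</vbz>"
    tgr.add_tags("The dog runs", true)  # verbose form expands tags via explain_tag
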
@@ -277,11 +275,12 @@ class EngTagger
   #
   def get_words(text)
     return false unless valid_text(text)
+
     tagged = add_tags(text)
-    if(@conf[:longest_noun_phrase] <= 1)
-      return get_nouns(tagged)
+    if @conf[:longest_noun_phrase] <= 1
+      get_nouns(tagged)
     else
-      return get_noun_phrases(tagged)
+      get_noun_phrases(tagged)
     end
   end
 
@@ -289,29 +288,29 @@ class EngTagger
   # Applies add_tags and reformats to be easier to read.
   def get_readable(text, verbose = false)
     return nil unless valid_text(text)
+
     tagged = add_tags(text, verbose)
-    tagged = tagged.gsub(/<\w+>([^<]+|[<\w>]+)<\/(\w+)>/o) do
-      #!!!# tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
-      $1 + '/' + $2.upcase
+    tagged.gsub(%r{<\w+>([^<]+|[<\w>]+)</(\w+)>}o) do
+      "#{$1}/#{$2.upcase}"
     end
   end
 
   # Return an array of sentences (without POS tags) from a text.
   def get_sentences(text)
     return nil unless valid_text(text)
+
     tagged = add_tags(text)
-    sentences = Array.new
-    tagged.split(/<\/pp>/).each do |line|
+    sentences = []
+    tagged.split(%r{</pp>}).each do |line|
       sentences << strip_tags(line)
     end
     sentences = sentences.map do |sentence|
-      sentence.gsub(Regexp.new(" ('s?) ")){$1 + ' '}
-      sentence.gsub(Regexp.new(" (\W+) ")){$1 + ' '}
-      sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
-      sentence.gsub(Regexp.new(" (\W+)$")){$1}
-      sentence.gsub(Regexp.new("^(`+) ")){$1}
+      sentence.gsub(Regexp.new(" ('s?) ")) { $1 + " " }
+      sentence.gsub(Regexp.new(" (\W+) ")) { $1 + " " }
+      sentence.gsub(Regexp.new(" (`+) ")) { " " + $1 }
+      sentence.gsub(Regexp.new(" (\W+)$")) { $1 }
+      sentence.gsub(Regexp.new("^(`+) ")) { $1 }
     end
-    return sentences
   end
 
   # Given a POS-tagged text, this method returns a hash of all proper nouns
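Note: get_readable reshapes the XML-style output into word/TAG pairs, and get_sentences splits the tagged text on the sentence-final </pp> tag before stripping tags. Output shapes are indicative:

    require "engtagger"

    tgr = EngTagger.new
    tgr.get_readable("The dog runs quickly.")
    # => e.g. "The/DET dog/NN runs/VBZ quickly/RB ./PP"
    tgr.get_sentences("It rained. We stayed inside.")
    # => e.g. ["It rained .", "We stayed inside ."]
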
@@ -321,30 +320,31 @@ class EngTagger
   # proper nouns. This method does not stem the found words.
   def get_proper_nouns(tagged)
     return nil unless valid_text(tagged)
+
     tags = [NNP]
     nnp = build_matches_hash(build_trimmed(tagged, tags))
     # Now for some fancy resolution stuff...
-    nnp.keys.each do |key|
+    nnp.each_key do |key|
       words = key.split(/\s/)
       # Let's say this is an organization's name --
       # (and it's got at least three words)
       # is there a corresponding acronym in this hash?
-      if words.length > 2
-        # Make a (naive) acronym out of this name
-        acronym = words.map do |word|
-          /\A([a-z])[a-z]*\z/ =~ word
-          $1
-        end.join ''
-        # If that acronym has been seen,
-        # remove it and add the values to
-        # the full name
-        if nnp[acronym]
-          nnp[key] += nnp[acronym]
-          nnp.delete(acronym)
-        end
+      next if words.length <= 2
+
+      # Make a (naive) acronym out of this name
+      acronym = words.map do |word|
+        /\A([a-z])[a-z]*\z/ =~ word
+        $1
+      end.join " "
+      # If that acronym has been seen,
+      # remove it and add the values to
+      # the full name
+      if nnp[acronym]
+        nnp[key] += nnp[acronym]
+        nnp.delete(acronym)
       end
     end
-    return nnp
+    nnp
   end
 
   # Given a POS-tagged text, this method returns all nouns and their
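Note: get_proper_nouns expects already-tagged input, counts the <nnp> spans, and (for names of three or more words) folds a matching acronym's count into the full name. A hedged usage sketch:

    require "engtagger"

    tgr = EngTagger.new
    tagged = tgr.add_tags("Alice met Bob in London")
    tgr.get_proper_nouns(tagged)
    # => e.g. {"Alice" => 1, "Bob" => 1, "London" => 1}
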
@@ -355,6 +355,7 @@ class EngTagger
   #
   def get_nouns(tagged)
     return nil unless valid_text(tagged)
+
     tags = [NN]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -368,6 +369,7 @@ class EngTagger
   #
   def get_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VB, VBD, VBG, PART, VBP, VBZ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -379,6 +381,7 @@ class EngTagger
 
   def get_infinitive_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VB]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -389,6 +392,7 @@ class EngTagger
   #
   def get_past_tense_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBD]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -399,6 +403,7 @@ class EngTagger
   #
   def get_gerund_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBG]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -409,6 +414,7 @@ class EngTagger
   #
   def get_passive_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [PART]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -419,6 +425,7 @@ class EngTagger
   #
   def get_base_present_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBP]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -429,6 +436,7 @@ class EngTagger
   #
   def get_present_verbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [VBZ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -439,6 +447,7 @@ class EngTagger
   #
   def get_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJ]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -449,6 +458,7 @@ class EngTagger
   #
   def get_comparative_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJR]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -459,6 +469,7 @@ class EngTagger
   #
   def get_superlative_adjectives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [JJS]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -469,6 +480,7 @@ class EngTagger
   #
   def get_adverbs(tagged)
     return nil unless valid_text(tagged)
+
     tags = [RB, RBR, RBS, RP]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -479,13 +491,14 @@ class EngTagger
   #
   def get_interrogatives(tagged)
     return nil unless valid_text(tagged)
+
     tags = [WRB, WDT, WP, WPS]
     build_matches_hash(build_trimmed(tagged, tags))
   end
 
   # To be consistent with documentation's naming of 'interrogative'
   # parts of speech as 'question'
-  alias_method :get_question_parts, :get_interrogatives
+  alias get_question_parts get_interrogatives
 
   # Returns all types of conjunctions and does not discriminate
   # between the various kinds. E.g. coordinating, subordinating,
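Note: every get_* extractor above follows the same template -- validate, select a list of tag regexps, and build a frequency hash -- and the Ruby-style alias keeps get_question_parts working. Indicative usage:

    require "engtagger"

    tgr = EngTagger.new
    tagged = tgr.add_tags("Why did the quick brown fox jump so quickly?")
    tgr.get_adjectives(tagged)      # => e.g. {"quick" => 1, "brown" => 1}
    tgr.get_interrogatives(tagged)  # => e.g. {"Why" => 1}
    tgr.get_question_parts(tagged)  # same result through the alias
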
@@ -496,6 +509,7 @@ class EngTagger
   #
   def get_conjunctions(tagged)
     return nil unless valid_text(tagged)
+
     tags = [CC, IN]
     build_matches_hash(build_trimmed(tagged, tags))
   end
@@ -508,14 +522,15 @@ class EngTagger
   #
   def get_max_noun_phrases(tagged)
     return nil unless valid_text(tagged)
+
     tags = [@@mnp]
     mn_phrases = build_trimmed(tagged, tags)
     ret = Hash.new(0)
     mn_phrases.each do |p|
-      p = stem(p) unless p =~ /\s/ # stem single words
+      p = stem(p) unless p =~ /\s/ # stem single words
       ret[p] += 1 unless p =~ /\A\s*\z/
     end
-    return ret
+    ret
   end
 
   # Similar to get_words, but requires a POS-tagged text as an argument.
@@ -525,9 +540,10 @@ class EngTagger
   #
   def get_noun_phrases(tagged)
     return nil unless valid_text(tagged)
+
     found = Hash.new(0)
     phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
-    scanned = tagged.scan(@@mnp)
+    scanned = tagged.scan(@@mnp)
     # Find MNPs in the text, one sentence at a time
     # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
     mn_phrases = []
@@ -540,14 +556,14 @@ class EngTagger
       # shortening the phrase by removing the word in the first position.
       # Record the phrase and any single nouns that are found
       words = mnp.split
-      words.length.times do |i|
-        found[words.join(' ')] += 1 if words.length > 1
+      words.length.times do
+        found[words.join(" ")] += 1 if words.length > 1
         w = words.shift
         found[w] += 1 if w =~ /#{NN}/
       end
     end
     ret = Hash.new(0)
-    found.keys.each do |f|
+    found.each_key do |f|
       k = strip_tags(f)
       v = found[f]
       # We weight by the word count to favor long noun phrases
@@ -555,12 +571,13 @@ class EngTagger
       word_count = space_count.length + 1
       # Throttle MNPs if necessary
      next if word_count > @conf[:longest_noun_phrase]
-      k = stem(k) unless word_count > 1 # stem single words
+
+      k = stem(k) unless word_count > 1 # stem single words
      multiplier = 1
      multiplier = word_count if @conf[:weight_noun_phrases]
      ret[k] += multiplier * v
    end
-    return ret
+    ret
  end
 
  # Reads some included corpus data and saves it in a stored hash on the
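Note: get_noun_phrases counts every sub-phrase of each maximal noun phrase; with :weight_noun_phrases set it multiplies each count by the phrase's word count, and :longest_noun_phrase throttles phrase length. Counts below are indicative only:

    require "engtagger"

    tgr = EngTagger.new(weight_noun_phrases: true, longest_noun_phrase: 5)
    tagged = tgr.add_tags("The fat cat sat on the fat mat")
    tgr.get_noun_phrases(tagged)
    # => e.g. {"fat cat" => 2, "cat" => 1, "fat mat" => 2, "mat" => 1}
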
@@ -571,10 +588,10 @@ class EngTagger
     load_tags(@conf[:tag_lex])
     load_words(@conf[:word_lex])
     load_words(@conf[:unknown_lex])
-    File.open(@conf[:word_path], 'w') do |f|
+    File.open(@conf[:word_path], "w") do |f|
       Marshal.dump(@@lexicon, f)
     end
-    File.open(@conf[:tag_path], 'w') do |f|
+    File.open(@conf[:tag_path], "w") do |f|
       Marshal.dump(@@hmm, f)
     end
   end
@@ -596,6 +613,7 @@ class EngTagger
     trimmed.each do |n|
       n = stem(n)
       next unless n.length < 100 # sanity check on word length
+
       ret[n] += 1 unless n =~ /\A\s*\z/
     end
     ret
@@ -603,25 +621,24 @@ class EngTagger
 
   # Downcase the first letter of word
   def lcfirst(word)
-    word.split(//)[0].downcase + word.split(//)[1..-1].join
+    word.split(//)[0].downcase + word.split(//)[1..].join
   end
 
   # Upcase the first letter of word
   def ucfirst(word)
-    word.split(//)[0].upcase + word.split(//)[1..-1].join
+    word.split(//)[0].upcase + word.split(//)[1..].join
   end
 
   # Return the word stem as given by Stemmable module. This can be
   # turned off with the class parameter @conf[:stem] => false.
   def stem(word)
-    return word unless @conf[:stem]
-    return word.stem
+    @conf[:stem] ? word.stem : word
   end
 
   # This method will reset the preceeding tag to a sentence ender (PP).
   # This prepares the first word of a new sentence to be tagged correctly.
   def reset
-    @conf[:current_tag] = 'pp'
+    @conf[:current_tag] = "pp"
   end
 
   # Check whether the text is a valid string
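Note: stem collapses to one conditional -- a no-op unless the tagger was built with stem: true, in which case it delegates to the bundled Porter stemmer's String#stem:

    require "engtagger"

    EngTagger.new.stem("running")              # => "running" (stemming is off by default)
    EngTagger.new(stem: true).stem("running")  # => "run", per the Porter stemmer
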
@@ -629,41 +646,38 @@ class EngTagger
     if !text
       # there's nothing to parse
       "method call on uninitialized variable" if @conf[:debug]
-      return false
+      false
     elsif /\A\s*\z/ =~ text
       # text is an empty string, nothing to parse
-      return false
+      false
     else
       # $text is valid
-      return true
+      true
     end
   end
 
   # Return a text string with the part-of-speech tags removed
   def strip_tags(tagged, downcase = false)
     return nil unless valid_text(tagged)
+
     text = tagged.gsub(/<[^>]+>/m, "")
     text = text.gsub(/\s+/m, " ")
     text = text.gsub(/\A\s*/, "")
     text = text.gsub(/\s*\z/, "")
-    if downcase
-      return text.downcase
-    else
-      return text
-    end
+    downcase ? text.downcase : text
   end
 
   # Strip the provided text and separate off any punctuation in preparation for tagging
   def clean_text(text)
     return false unless valid_text(text)
-    cleaned_text = text.encode('utf-8')
+
+    cleaned_text = text.encode("utf-8")
     tokenized = []
     # Tokenize the text (splitting on punctuation as you go)
     cleaned_text.split(/\s+/).each do |line|
       tokenized += split_punct(line)
     end
-    words = split_sentences(tokenized)
-    return words
+    split_sentences(tokenized)
   end
 
   # This handles all of the trailing periods, keeping those that
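Note: strip_tags deletes the XML-style tags and squeezes whitespace, with an optional downcase; this behavior is directly readable off the method above:

    require "engtagger"

    tgr = EngTagger.new
    tgr.strip_tags("<det>The</det>  <nn>dog</nn>")        # => "The dog"
    tgr.strip_tags("<det>The</det>  <nn>dog</nn>", true)  # => "the dog"
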
@@ -672,27 +686,26 @@ class EngTagger
   # about the use of capitalization in the incoming text
   def split_sentences(array)
     tokenized = array
-    people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
-                supt det mssrs rev)
-    army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
-    inst = %w(dept univ assn bros ph.d)
-    place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
-               hwy hway la pde pd plz pl rd st tce)
-    comp = %w(mfg inc ltd co corp)
-    state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+    people = %w[jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+                supt det mssrs rev]
+    army = %w[col gen lt cmdr adm capt sgt cpl maj brig]
+    inst = %w[dept univ assn bros ph.d]
+    place = %w[arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+               hwy hway la pde pd plz pl rd st tce]
+    comp = %w[mfg inc ltd co corp]
+    state = %w[ala ariz ark cal calif colo col conn del fed fla ga ida id ill
                ind ia kans kan ken ky la me md is mass mich minn miss mo mont
                neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
-               va wash wis wisc wy wyo usafa alta man ont que sask yuk)
-    month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
-    misc = %w(vs etc no esp)
-    abbr = Hash.new
+               va wash wis wisc wy wyo usafa alta man ont que sask yuk]
+    month = %w[jan feb mar apr may jun jul aug sep sept oct nov dec]
+    misc = %w[vs etc no esp]
+    abbr = {}
     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
       abbr[i] = true
     end
-    words = Array.new
-    tokenized.each_with_index do |t, i|
-      if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and
-          tokenized[i] =~ /\A(.+)\.\z/
+    words = []
+    tokenized.each_with_index do |_, i|
+      if tokenized[i + 1] && tokenized [i + 1] =~ /[A-Z\W]/ && tokenized[i] =~ /\A(.+)\.\z/
        w = $1
        # Don't separate the period off words that
        # meet any of the following conditions:
@@ -700,21 +713,20 @@ class EngTagger
        # 1. It is defined in one of the lists above
        # 2. It is only one letter long: Alfred E. Sloan
        # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
-        unless abbr[w.downcase] or
-            [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
+        unless abbr[w.downcase] || [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
          words << w
-          words << '.'
+          words << "."
          next
        end
      end
      words << tokenized[i]
    end
    # If the final word ends in a period..
-    if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
+    if words[-1] && words[-1] =~ /\A(.*\w)\.\z/
      words[-1] = $1
-      words.push '.'
+      words.push "."
    end
-    return words
+    words
  end
 
  # Separate punctuation from words, where appropriate. This leaves trailing
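Note: split_sentences keeps the trailing period on known abbreviations (titles, states, months, single initials, letter-dot runs like U.S.A.) and splits it off everywhere else. Via the clean_text entry point, indicatively:

    require "engtagger"

    tgr = EngTagger.new
    tgr.clean_text("Dr. Smith arrived.")
    # => e.g. ["Dr.", "Smith", "arrived", "."]
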
@@ -722,39 +734,40 @@ class EngTagger
   def split_punct(text)
     # If there's no punctuation, return immediately
     return [text] if /\A\w+\z/ =~ text
+
     # Sanity checks
     text = text.gsub(/\W{10,}/o, " ")
 
     # Put quotes into a standard format
     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
     text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
-    text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+    text = text.gsub(/(\W|^)'(?=.*\w)/o) { $1 ? $1 + " ` " : " ` " } # Convert left quotes to `
     text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
-    text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
+    text = text.gsub(/(\w)'(?!')(?=\W|$)/o) { $1 + " ' " } # Separate right single quotes
 
     # Handle all other punctuation
     text = text.gsub(/--+/o, " - ") # Convert and separate dashes
     text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
     text = text.gsub(/:/o, " : ") # Shift semicolons off
-    text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
-    text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
-    text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
+    text = text.gsub(/(\.\.\.+)/o) { " " + $1 + " " } # Shift ellipses off
+    text = text.gsub(/([(\[{}\])])/o) { " " + $1 + " " } # Shift off brackets
+    text = text.gsub(/([!?#$%;~|])/o) { " " + $1 + " " } # Shift off other ``standard'' punctuation
 
     # English-specific contractions
-    text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
-    text = text.gsub(/n't\b/o, " n't") # Separate off n't
-    text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
-    result = text.split(' ')
-    return result
+    text = text.gsub(/([A-Za-z])'([dms])\b/o) { $1 + " '" + $2 } # Separate off 'd 'm 's
+    text = text.gsub(/n't\b/o, " n't") # Separate off n't
+    text = text.gsub(/'(ve|ll|re)\b/o) { " '" + $1 } # Separate off 've, 'll, 're
+    text.split(" ")
   end
 
   # Given a preceding tag, assign a tag word. Called by the add_tags method.
   # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
   def assign_tag(prev_tag, word)
-    if word == "-unknown-"
+    case word
+    when "-unknown-"
       # classify unknown words accordingly
       return @conf[:unknown_word_tag]
-    elsif word == "-sym-"
+    when "-sym-"
       # If this is a symbol, tag it as a symbol
       return "sym"
     end
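Note: split_punct normalizes quotes to ` / '' and shifts punctuation and contractions off into their own tokens before sentence handling. An indicative round trip:

    require "engtagger"

    tgr = EngTagger.new
    tgr.clean_text(%q{She isn't "done", is she?})
    # => e.g. ["She", "is", "n't", "``", "done", "''", ",", "is", "she", "?"]
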
@@ -765,13 +778,13 @@ class EngTagger
     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
     # which is used in most POS taggers
     best_tag = ""
-    t[prev_tag].keys.each do |tag|
+    t[prev_tag].each_key do |tag|
       # With @config[:relax] set, this method
       # will also include any `open classes' of POS tags
       pw = 0
       if w[tag]
         pw = w[tag]
-      elsif @conf[:relax] and tag =~ /\A(?:jj|nn|rb|vb)/
+      elsif @conf[:relax] && tag =~ /\A(?:jj|nn|rb|vb)/
         pw = 0
       else
         next
@@ -786,7 +799,7 @@ class EngTagger
         best_tag = tag
       end
     end
-    return best_tag
+    best_tag
   end
 
   # This method determines whether a word should be considered in its
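Note: the loop above is the heart of the modified Viterbi step -- each candidate tag that can follow prev_tag is scored from the bigram (transition) table and the word's emission weight, keeping the argmax. A toy rendering of the idea; both tables here are made up, not the gem's real data:

    transitions = { "pp" => { "det" => 0.4, "nn" => 0.2 } } # P(tag | prev_tag)
    emissions   = { "det" => 0.9, "nn" => 0.05 }            # word-given-tag weight

    best_tag, = transitions["pp"].max_by { |tag, p| p * emissions.fetch(tag, 0.0) }
    best_tag  # => "det"
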
@@ -796,13 +809,13 @@ class EngTagger
     lcf = lcfirst(word)
     # seen this word as it appears (lower or upper case)
     if @@lexicon[word]
-      return word
+      word
     elsif @@lexicon[lcf]
       # seen this word only as lower case
-      return lcf
+      lcf
     else
       # never seen this word. guess.
-      return classify_unknown_word(word)
+      classify_unknown_word(word)
     end
   end
 
@@ -810,52 +823,52 @@ class EngTagger
   # classes of words handled by a simple unknown word classification
   # metric. Called by the clean_word method.
   def classify_unknown_word(word)
-    if /[\(\{\[]/ =~ word # Left brackets
-      classified = "*LRB*"
-    elsif /[\)\}\]]/ =~ word # Right brackets
-      classified = "*RRB*"
-    elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
-      classified = "*NUM*"
-    elsif /\A\d+[\d\/:-]+\d\z/ =~ word # Other number constructs
-      classified = "*NUM*"
-    elsif /\A-?\d+\w+\z/o =~ word # Ordinal number
-      classified = "*ORD*"
-    elsif /\A[A-Z][A-Z\.-]*\z/o =~ word # Abbreviation (all caps)
-      classified = "-abr-"
-    elsif /\w-\w/o =~ word # Hyphenated word
+    case word
+    when /[({\[]/ # Left brackets
+      "*LRB*"
+    when /[)}\]]/ # Right brackets
+      "*RRB*"
+    when /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ # Floating point number
+      "*NUM*"
+    when %r{\A\d+[\d/:-]+\d\z} # Other number constructs
+      "*NUM*"
+    when /\A-?\d+\w+\z/o # Ordinal number
+      "*ORD*"
+    when /\A[A-Z][A-Z.-]*\z/o # Abbreviation (all caps)
+      "-abr-"
+    when /\w-\w/o # Hyphenated word
       /-([^-]+)\z/ =~ word
       h_suffix = $1
-      if h_suffix and (@@lexicon[h_suffix] and @@lexicon[h_suffix]['jj'])
+      if h_suffix && (@@lexicon[h_suffix] && @@lexicon[h_suffix]["jj"])
        # last part of this is defined as an adjective
-        classified = "-hyp-adj-"
+        "-hyp-adj-"
      else
        # last part of this is not defined as an adjective
-        classified = "-hyp-"
+        "-hyp-"
      end
-    elsif /\A\W+\z/o =~ word
-      classified = "-sym-" # Symbol
-    elsif word == ucfirst(word)
-      classified = "-cap-" # Capitalized word
-    elsif /ing\z/o =~ word
-      classified = "-ing-" # Ends in 'ing'
-    elsif /s\z/o =~ word
-      classified = "-s-" # Ends in 's'
-    elsif /tion\z/o =~ word
-      classified = "-tion-" # Ends in 'tion'
-    elsif /ly\z/o =~ word
-      classified = "-ly-" # Ends in 'ly'
-    elsif /ed\z/o =~ word
-      classified = "-ed-" # Ends in 'ed
+    when /\A\W+\z/o
+      "-sym-" # Symbol
+    when ucfirst(word)
+      "-cap-" # Capitalized word
+    when /ing\z/o
+      "-ing-" # Ends in "ing"
+    when /s\z/o
+      "-s-" # Ends in "s"
+    when /tion\z/o
+      "-tion-" # Ends in "tion"
+    when /ly\z/o
+      "-ly-" # Ends in "ly"
+    when /ed\z/o
+      "-ed-" # Ends in "ed
     else
-      classified = "-unknown-" # Completely unknown
+      "-unknown-" # Completely unknown
     end
-    return classified
   end
 
   # This returns a compiled regexp for extracting maximal noun phrases
   # from a POS-tagged text.
   def get_max_noun_regex
-    regex = /
+    /
      # optional number, gerund - adjective -participle
      (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
      # Followed by one or more nouns
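Note: the rewritten classify_unknown_word keeps the same fall-through order, so the first matching class wins (the "-ing-"/"-s-"/"-ed-" suffix buckets only apply after the capitalization check). A quick probe, using send since the method is internal to the tagger:

    require "engtagger"

    tgr = EngTagger.new
    %w[12.5 3rd NASA Paris running].map { |w| tgr.send(:classify_unknown_word, w) }
    # => ["*NUM*", "*ORD*", "-abr-", "-cap-", "-ing-"]
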
@@ -868,8 +881,7 @@ class EngTagger
      # one or more nouns
      (?:#{NN})+
      )*
-    /xo #/
-    return regex
+    /xo
   end
 
   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
@@ -878,12 +890,13 @@ class EngTagger
   # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
   def load_tags(lexicon, lexpath = DEFAULT_LEXPATH)
     path = File.join(lexpath, lexicon)
-    fh = File.open(path, 'r')
-    while line = fh.gets
+    fh = File.open(path, "r")
+    while (line = fh.gets)
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-      next unless $1 and $2
-      key, data = $1, $2
-      tags = Hash.new
+      next unless $1 && $2
+
+      key = $1
+      data = $2
       items = data.split(/,\s+/)
       pairs = {}
       items.each do |i|
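Note: the parsing regexp above expects one map per line, exactly as the comment shows. Its behavior on a sample line is directly reproducible in irb:

    line = "det: { jj: 0.2, nn: 0.5, vb: 0.0002 }"
    /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
    $1  # => "det"
    $2  # => "jj: 0.2, nn: 0.5, vb: 0.0002"
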
@@ -901,12 +914,13 @@ class EngTagger
   # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
   def load_words(lexicon, lexpath = DEFAULT_LEXPATH)
     path = File.join(lexpath, lexicon)
-    fh = File.open(path, 'r')
-    while line = fh.gets
+    fh = File.open(path, "r")
+    while (line = fh.gets)
       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-      next unless $1 and $2
-      key, data = $1, $2
-      tags = Hash.new
+      next unless $1 && $2
+
+      key = $1
+      data = $2
       items = data.split(/,\s+/)
       pairs = {}
       items.each do |i|
@@ -918,7 +932,7 @@ class EngTagger
     fh.close
   end
 
-  #memoize the stem and assign_tag methods
+  # memoize the stem and assign_tag methods
   memoize("stem")
   memoize("assign_tag")
 end
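Note: none of the refactoring above changes the public API; typical 0.4.0 usage is the same as in 0.3.2:

    require "engtagger"

    tgr = EngTagger.new
    text = "Alice chased the big fat cat."
    tagged = tgr.add_tags(text)  # XML-style tagged string
    tgr.get_words(text)          # nouns / noun phrases with counts
    tgr.get_readable(text)       # "word/TAG" rendering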