engtagger 0.2.0 → 0.3.0

data/lib/engtagger.rb CHANGED
@@ -3,30 +3,17 @@
 
  $LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
  require 'rubygems'
- require 'kconv'
  require 'porter'
+ require 'lru_redux'
 
- # use hpricot for extracting English text from docs with XML like tags
- begin
- require 'hpricot'
- rescue LoadError
- $no_hpricot = true
- end
-
- # File paths
- $lexpath = File.join(File.dirname(__FILE__), 'engtagger')
- $word_path = File.join($lexpath, "pos_words.hash")
- $tag_path = File.join($lexpath, "pos_tags.hash")
-
- # for memoization (code snipet from http://eigenclass.org/hiki/bounded-space-memoization)
- class Module
- def memoize(method)
+ module BoundedSpaceMemoizable
+ def memoize(method, max_cache_size=100000)
  # alias_method is faster than define_method + old.bind(self).call
  alias_method "__memoized__#{method}", method
  module_eval <<-EOF
- def #{method}(*a, &b)
- # assumes the block won't change the result if the args are the same
- (@__memoized_#{method}_cache ||= {})[a] ||= __memoized__#{method}(*a, &b)
+ def #{method}(*a)
+ @__memoized_#{method}_cache ||= LruRedux::Cache.new(#{max_cache_size})
+ @__memoized_#{method}_cache[a] ||= __memoized__#{method}(*a)
 
  end
  EOF
  end
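This memoization change is the heart of the release: the unbounded hash cache from the old eigenclass snippet (which also monkey-patched Module globally) becomes a mixin backed by an LRU cache from the lru_redux gem, so a long-running tagger no longer grows without limit. A minimal standalone sketch of the same pattern; the Fib class and the cache size of 1000 are illustrative only, and lru_redux must be installed:

    require 'lru_redux'

    module BoundedSpaceMemoizable
      def memoize(method, max_cache_size = 100_000)
        # alias_method is faster than define_method + old.bind(self).call
        alias_method "__memoized__#{method}", method
        module_eval <<-EOF
          def #{method}(*a)
            @__memoized_#{method}_cache ||= LruRedux::Cache.new(#{max_cache_size})
            @__memoized_#{method}_cache[a] ||= __memoized__#{method}(*a)
          end
        EOF
      end
    end

    class Fib
      extend BoundedSpaceMemoizable

      def fib(n)
        n < 2 ? n : fib(n - 1) + fib(n - 2)
      end
      # keep at most 1000 argument lists; older entries are evicted LRU-first
      memoize("fib", 1000)
    end

    puts Fib.new.fib(80)  # returns immediately; the unmemoized version would not

Note that the cache key is the full argument array, and the new wrapper drops block support, so memoized methods must be deterministic in their arguments alone.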
@@ -34,27 +21,39 @@ end
 
  # English part-of-speech tagger class
  class EngTagger
+ extend BoundedSpaceMemoizable
+
+ # File paths
+ DEFAULT_LEXPATH = File.join(File.dirname(__FILE__), 'engtagger')
+ DEFAULT_WORDPATH = File.join(DEFAULT_LEXPATH, "pos_words.hash")
+ DEFAULT_TAGPATH = File.join(DEFAULT_LEXPATH, "pos_tags.hash")
 
  #################
  # Class methods #
  #################
-
- # Return a class variable that holds probability data
+
+ # Return a class variable that holds probability data.
+ #
+ # @return [Hash] the probability data
+ #
  def self.hmm
  return @@hmm
  end
 
- # Return a class variable that holds lexical data
+ # Return a class variable that holds lexical data.
+ #
+ # @return [Hash] the lexicon
+ #
  def self.lexicon
  return @@lexicon
  end
-
- # Return a regexp from a string argument that matches an XML-style pos tag
+
+ # Return a regexp from a string argument that matches an XML-style pos tag
  def self.get_ext(tag = nil)
  return nil unless tag
  return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
  end
-
+
  # Regexps to match XML-style part-of-speech tags
  NUM = get_ext('cd')
  GER = get_ext('vbg')
@@ -70,22 +69,37 @@ class EngTagger
  VB = get_ext('vb')
  VBG = get_ext('vbg')
  VBD = get_ext('vbd')
- PART = get_ext('vbn')
+ PART = get_ext('vbn')
  VBP = get_ext('vbp')
  VBZ = get_ext('vbz')
  JJ = get_ext('jj')
  JJR = get_ext('jjr')
  JJS = get_ext('jjs')
+ RB = get_ext('rb')
+ RBR = get_ext('rbr')
+ RBS = get_ext('rbs')
+ RP = get_ext('rp')
+ WRB = get_ext('wrb')
+ WDT = get_ext('wdt')
+ WP = get_ext('wp')
+ WPS = get_ext('wps')
+ CC = get_ext('cc')
+ IN = get_ext('in')
 
- # Convert a Treebank-style, abbreviated tag into verbose definitions
+ # Convert a Treebank-style, abbreviated tag into verbose definitions
+ #
+ # @param tag [#to_s] the tag in question
+ # @return [String] the definition, if available
+ #
  def self.explain_tag(tag)
+ tag = tag.to_s.downcase
  if TAGS[tag]
  return TAGS[tag]
  else
  return tag
  end
- end
-
+ end
+
  # The folloging is to make a hash to convert a pos tag to its definition
  # used by the explain_tag method
  tags = [
@@ -132,35 +146,35 @@ class EngTagger
  "PPR", "Punctuation, quotation mark right",
  "PPS", "Punctuation, colon, semicolon, elipsis",
  "LRB", "Punctuation, left bracket",
- "RRB", "Punctuation, right bracket"
- ]
+ "RRB", "Punctuation, right bracket"
+ ]
  tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
  tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
  TAGS = Hash[*tags]
-
+
  # Hash storing config values:
  #
  # * :unknown_word_tag
  # => (String) Tag to assign to unknown words
- # * :stem
+ # * :stem
  # => (Boolean) Stem single words using Porter module
  # * :weight_noun_phrases
- # => (Boolean) When returning occurrence counts for a noun phrase, multiply
+ # => (Boolean) When returning occurrence counts for a noun phrase, multiply
  # the valuethe number of words in the NP.
- # * :longest_noun_phrase
- # => (Integer) Will ignore noun phrases longer than this threshold. This
+ # * :longest_noun_phrase
+ # => (Integer) Will ignore noun phrases longer than this threshold. This
  # affects only the get_words() and get_nouns() methods.
- # * :relax
- # => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
+ # * :relax
+ # => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
  # uncommon words, particularly words used polysemously
  # * :tag_lex
- # => (String) Name of the YAML file containing a hash of adjacent part of
+ # => (String) Name of the YAML file containing a hash of adjacent part of
  # speech tags and the probability of each
  # * :word_lex
- # => (String) Name of the YAML file containing a hash of words and corresponding
+ # => (String) Name of the YAML file containing a hash of words and corresponding
  # parts of speech
  # * :unknown_lex
- # => (String) Name of the YAML file containing a hash of tags for unknown
+ # => (String) Name of the YAML file containing a hash of tags for unknown
  # words and corresponding parts of speech
  # * :tag_path
  # => (String) Directory path of tag_lex
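With the new `tag = tag.to_s.downcase` line, explain_tag now resolves symbol or uppercase arguments; in 0.2.0 a failed lookup simply returned the argument unchanged. For example, given the punctuation entries shown above and the downcase/underscore transform applied to the tags array:

    EngTagger.explain_tag("lrb")  # => "punctuation_left_bracket"
    EngTagger.explain_tag(:LRB)   # same result in 0.3.0; in 0.2.0 this returned :LRB untouched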
@@ -169,12 +183,12 @@ class EngTagger
  # * :debug
  # => (Boolean) Print debug messages
  attr_accessor :conf
-
+
  ###############
  # Constructor #
  ###############
-
- # Take a hash of parameters that override default values.
+
+ # Take a hash of parameters that override default values.
  # See above for details.
  def initialize(params = {})
  @conf = Hash.new
@@ -186,13 +200,13 @@ class EngTagger
  @conf[:tag_lex] = 'tags.yml'
  @conf[:word_lex] = 'words.yml'
  @conf[:unknown_lex] = 'unknown.yml'
- @conf[:word_path] = $word_path
- @conf[:tag_path] = $tag_path
+ @conf[:word_path] = DEFAULT_WORDPATH
+ @conf[:tag_path] = DEFAULT_TAGPATH
  @conf[:debug] = false
  # assuming that we start analyzing from the beginninga new sentence...
- @conf[:current_tag] = 'pp'
- @conf.merge!(params)
- unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
+ @conf[:current_tag] = 'pp'
+ @conf.merge!(params) if params
+ unless File.exist?(@conf[:word_path]) and File.exist?(@conf[:tag_path])
  print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
  @@hmm = Hash.new
  @@lexicon = Hash.new
@@ -206,11 +220,38 @@ class EngTagger
  end
  @@mnp = get_max_noun_regex
  end
-
+
  ##################
  # Public methods #
  ##################
-
+
+ # Return an array of pairs of the form `["word", :tag]`.
+ #
+ # @param text [String] the input text
+ # @return [Array] the tagged words
+ #
+ def tag_pairs(text)
+ return [] unless valid_text(text)
+
+ out = clean_text(text).map do |word|
+ cleaned_word = clean_word word
+ tag = assign_tag(@conf[:current_tag], cleaned_word)
+ @conf[:current_tag] = tag = (tag and !tag.empty?) ? tag : 'nn'
+ [word, tag.to_sym]
+ end
+
+ # reset the tagger state
+ reset
+
+ out
+ end
+
+ # Examine the string provided and return it fully tagged in XML style.
+ #
+ # @param text [String] the input text
+ # @param verbose [false, true] whether to use verbose tags
+ # @return [String] the marked-up string
+ #
  # Examine the string provided and return it fully tagged in XML style
  def add_tags(text, verbose = false)
  return nil unless valid_text(text)
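tag_pairs is the main new public API in 0.3.0, returning word/symbol pairs instead of an XML string. A usage sketch; the exact tags depend on the bundled lexicon, so the output shown is illustrative:

    require 'engtagger'

    tgr = EngTagger.new
    tgr.tag_pairs("The dog runs quickly")
    # => [["The", :det], ["dog", :nn], ["runs", :vbz], ["quickly", :rb]]  (illustrative)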
@@ -222,15 +263,15 @@
  tag = assign_tag(@conf[:current_tag], cleaned_word)
  @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
  tag = EngTagger.explain_tag(tag) if verbose
- tagged << '<' + tag + '>' + word + '</' + tag + '>'
+ tagged << '<' + tag + '>' + word + '</' + tag + '>'
  end
  reset
  return tagged.join(' ')
  end
-
- # Given a text string, return as many nouns and noun phrases as possible.
+
+ # Given a text string, return as many nouns and noun phrases as possible.
  # Applies add_tags and involves three stages:
- #
+ #
  # * Tag the text
  # * Extract all the maximal noun phrases
  # * Recursively extract all noun phrases from the MNPs
@@ -244,19 +285,19 @@
  return get_noun_phrases(tagged)
  end
  end
-
- # Return an easy-on-the-eyes tagged version of a text string.
+
+ # Return an easy-on-the-eyes tagged version of a text string.
  # Applies add_tags and reformats to be easier to read.
  def get_readable(text, verbose = false)
  return nil unless valid_text(text)
  tagged = add_tags(text, verbose)
- tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
+ tagged = tagged.gsub(/<\w+>([^<]+|[<\w>]+)<\/(\w+)>/o) do
+ #!!!# tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
  $1 + '/' + $2.upcase
  end
- return tagged
  end
-
- # Return an array of sentences (without POS tags) from a text.
+
+ # Return an array of sentences (without POS tags) from a text.
  def get_sentences(text)
  return nil unless valid_text(text)
  tagged = add_tags(text)
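For comparison, the two older entry points in these hunks wrap the same tagging loop. Dropping the explicit `return tagged` in get_readable is safe because the gsub assignment is the method's final expression. A usage sketch with illustrative output:

    tgr = EngTagger.new
    tgr.add_tags("The dog runs")
    # => "<det>The</det> <nn>dog</nn> <vbz>runs</vbz>"   (illustrative)
    tgr.get_readable("The dog runs")
    # => "The/DET dog/NN runs/VBZ"                       (illustrative)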
@@ -270,25 +311,19 @@
  sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
  sentence.gsub(Regexp.new(" (\W+)$")){$1}
  sentence.gsub(Regexp.new("^(`+) ")){$1}
- end
+ end
  return sentences
  end
-
+
  # Given a POS-tagged text, this method returns a hash of all proper nouns
  # and their occurrence frequencies. The method is greedy and will
  # return multi-word phrases, if possible, so it would find ``Linguistic
- # Data Consortium'' as a single unit, rather than as three individual
- # proper nouns. This method does not stem the found words.
+ # Data Consortium'' as a single unit, rather than as three individual
+ # proper nouns. This method does not stem the found words.
  def get_proper_nouns(tagged)
  return nil unless valid_text(tagged)
- trimmed = tagged.scan(NNP).map do |n|
- strip_tags(n)
- end
- nnp = Hash.new(0)
- trimmed.each do |n|
- next unless n.length < 100 # sanity check on word length
- nnp[n] += 1 unless n =~ /\A\s*\z/
- end
+ tags = [NNP]
+ nnp = build_matches_hash(build_trimmed(tagged, tags))
  # Now for some fancy resolution stuff...
  nnp.keys.each do |key|
  words = key.split(/\s/)
@@ -301,7 +336,7 @@
  /\A([a-z])[a-z]*\z/ =~ word
  $1
  end.join ''
- # If that acronym has been seen,
+ # If that acronym has been seen,
  # remove it and add the values to
  # the full name
  if nnp[acronym]
@@ -312,167 +347,170 @@
  end
  return nnp
  end
-
- # Given a POS-tagged text, this method returns all nouns and their
- # occurrence frequencies.
+
+ # Given a POS-tagged text, this method returns all nouns and their
+ # occurrence frequencies.
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_nouns(tagged)
  return nil unless valid_text(tagged)
- NN
- trimmed = tagged.scan(NN).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [NN]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
+
+ # Returns all types of verbs and does not descriminate between the
+ # various kinds. Combines all other verb methods listed in this
+ # class.
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
+ def get_verbs(tagged)
+ return nil unless valid_text(tagged)
+ tags = [VB, VBD, VBG, PART, VBP, VBZ]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
+
  def get_infinitive_verbs(tagged)
  return nil unless valid_text(tagged)
- VB
- trimmed = tagged.scan(VB).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VB]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_past_tense_verbs(tagged)
  return nil unless valid_text(tagged)
- VBD
- trimmed = tagged.scan(VBD).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VBD]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_gerund_verbs(tagged)
  return nil unless valid_text(tagged)
- VBG
- trimmed = tagged.scan(VB).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VBG]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_passive_verbs(tagged)
  return nil unless valid_text(tagged)
- PART
- trimmed = tagged.scan(PART).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [PART]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
-
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_base_present_verbs(tagged)
  return nil unless valid_text(tagged)
- VBP
- trimmed = tagged.scan(VBP).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VBP]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_present_verbs(tagged)
  return nil unless valid_text(tagged)
- VBZ
- trimmed = tagged.scan(VBZ).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VBZ]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_adjectives(tagged)
  return nil unless valid_text(tagged)
- JJ
- trimmed = tagged.scan(JJ).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [JJ]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_comparative_adjectives(tagged)
  return nil unless valid_text(tagged)
- JJR
- trimmed = tagged.scan(JJR).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
- end
+ tags = [JJR]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
 
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_superlative_adjectives(tagged)
  return nil unless valid_text(tagged)
- JJS
- trimmed = tagged.scan(JJS).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [JJS]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
+
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
+ def get_adverbs(tagged)
+ return nil unless valid_text(tagged)
+ tags = [RB, RBR, RBS, RP]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
+
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
+ def get_interrogatives(tagged)
+ return nil unless valid_text(tagged)
+ tags = [WRB, WDT, WP, WPS]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
+
+ # To be consistent with documentation's naming of 'interrogative'
+ # parts of speech as 'question'
+ alias_method :get_question_parts, :get_interrogatives
+
+ # Returns all types of conjunctions and does not discriminate
+ # between the various kinds. E.g. coordinating, subordinating,
+ # correlative...
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
+ def get_conjunctions(tagged)
+ return nil unless valid_text(tagged)
+ tags = [CC, IN]
+ build_matches_hash(build_trimmed(tagged, tags))
  end
 
  # Given a POS-tagged text, this method returns only the maximal noun phrases.
- # May be called directly, but is also used by get_noun_phrases
+ # May be called directly, but is also used by `get_noun_phrases`.
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_max_noun_phrases(tagged)
- return unless valid_text(tagged)
- mn_phrases = tagged.scan(@@mnp).map do |m|
- strip_tags(m)
- end
+ return nil unless valid_text(tagged)
+ tags = [@@mnp]
+ mn_phrases = build_trimmed(tagged, tags)
  ret = Hash.new(0)
  mn_phrases.each do |p|
  p = stem(p) unless p =~ /\s/ # stem single words
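All of the repetitive scan/stem/count bodies above collapse into the two private helpers introduced in a later hunk, and 0.3.0 gains adverb, interrogative, and conjunction extraction plus an all-verbs aggregate. A usage sketch; counts are keyed by Porter-stemmed forms, so every key and count below is illustrative:

    tgr = EngTagger.new
    tagged = tgr.add_tags("Where is the dog quietly sleeping and dreaming?")
    tgr.get_verbs(tagged)           # e.g. {"is"=>1, "sleep"=>1, "dream"=>1}
    tgr.get_adverbs(tagged)         # e.g. {"quietli"=>1}  (Porter stem of "quietly")
    tgr.get_question_parts(tagged)  # alias of get_interrogatives, e.g. {"where"=>1}
    tgr.get_conjunctions(tagged)    # e.g. {"and"=>1}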
@@ -482,11 +520,15 @@ class EngTagger
  end
 
  # Similar to get_words, but requires a POS-tagged text as an argument.
+ #
+ # @param tagged [String] the tagged text
+ # @return [Hash] the hash of matches
+ #
  def get_noun_phrases(tagged)
  return nil unless valid_text(tagged)
  found = Hash.new(0)
  phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
- scanned = tagged.scan(@@mnp)
+ scanned = tagged.scan(@@mnp)
  # Find MNPs in the text, one sentence at a time
  # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
  mn_phrases = []
@@ -495,9 +537,9 @@
  mn_phrases += m.split(phrase_ext)
  end
  mn_phrases.each do |mnp|
- # Split the phrase into an array of words, and create a loop for each word,
- # shortening the phrase by removing the word in the first position.
- # Record the phrase and any single nouns that are found
+ # Split the phrase into an array of words, and create a loop for each word,
+ # shortening the phrase by removing the word in the first position.
+ # Record the phrase and any single nouns that are found
  words = mnp.split
  words.length.times do |i|
  found[words.join(' ')] += 1 if words.length > 1
@@ -519,12 +561,12 @@
  multiplier = word_count if @conf[:weight_noun_phrases]
  ret[k] += multiplier * v
  end
- return ret
+ return ret
  end
-
- # Reads some included corpus data and saves it in a stored hash on the
- # local file system. This is called automatically if the tagger can't
- # find the stored lexicon.
+
+ # Reads some included corpus data and saves it in a stored hash on the
+ # local file system. This is called automatically if the tagger can't
+ # find the stored lexicon.
  def install
  puts "Creating part-of-speech lexicon" if @conf[:debug]
  load_tags(@conf[:tag_lex])
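The `ret[k] += multiplier * v` accumulation above deserves a worked example: with :weight_noun_phrases enabled, a phrase's occurrence count is scaled by its word count. Illustrative numbers:

    word_count = 3          # e.g. "linguistic data consortium"
    v          = 4          # the phrase occurred four times
    multiplier = word_count # with @conf[:weight_noun_phrases] = true
    multiplier * v          # => 12 added to ret[k]; only 4 when weighting is off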
@@ -542,7 +584,23 @@ class EngTagger
  # Private methods #
  ###################
 
- :private
+ private
+
+ def build_trimmed(tagged, tags)
+ tags.map { |tag| tagged.scan(tag) }.flatten.map do |n|
+ strip_tags(n)
+ end
+ end
+
+ def build_matches_hash(trimmed)
+ ret = Hash.new(0)
+ trimmed.each do |n|
+ n = stem(n)
+ next unless n.length < 100 # sanity check on word length
+ ret[n] += 1 unless n =~ /\A\s*\z/
+ end
+ ret
+ end
 
  # Downcase the first letter of word
  def lcfirst(word)
@@ -552,8 +610,8 @@ class EngTagger
  # Upcase the first letter of word
  def ucfirst(word)
  word.split(//)[0].upcase + word.split(//)[1..-1].join
- end
-
+ end
+
  # Return the word stem as given by Stemmable module. This can be
  # turned off with the class parameter @conf[:stem] => false.
  def stem(word)
@@ -561,8 +619,8 @@ class EngTagger
  return word.stem
  end
 
- # This method will reset the preceeding tag to a sentence ender (PP).
- # This prepares the first word of a new sentence to be tagged correctly.
+ # This method will reset the preceeding tag to a sentence ender (PP).
+ # This prepares the first word of a new sentence to be tagged correctly.
  def reset
  @conf[:current_tag] = 'pp'
  end
@@ -581,7 +639,7 @@ class EngTagger
  return true
  end
  end
-
+
  # Return a text string with the part-of-speech tags removed
  def strip_tags(tagged, downcase = false)
  return nil unless valid_text(tagged)
@@ -595,18 +653,11 @@ class EngTagger
  return text
  end
  end
-
- # Strip the provided text of HTML-style tags and separate off any punctuation
- # in preparation for tagging
+
+ # Strip the provided text and separate off any punctuation in preparation for tagging
  def clean_text(text)
  return false unless valid_text(text)
- text = text.toutf8
- unless $no_hpricot
- # Strip out any markup and convert entities to their proper form
- cleaned_text = Hpricot(text).inner_text
- else
- cleaned_text = text
- end
+ cleaned_text = text.encode('utf-8')
  tokenized = []
  # Tokenize the text (splitting on punctuation as you go)
  cleaned_text.split(/\s+/).each do |line|
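Dropping kconv and hpricot means clean_text no longer guesses encodings or strips markup: String#encode transcodes only from the string's declared encoding. A hedged sketch of the behavioral difference:

    "café".encode('utf-8')   # no-op for a string already in UTF-8
    latin1 = "caf\xE9".force_encoding('ISO-8859-1')
    latin1.encode('utf-8')   # => "café"; unlike Kconv#toutf8, a wrongly declared
                             # encoding raises an error instead of being guessed around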
@@ -615,41 +666,43 @@ class EngTagger
  words = split_sentences(tokenized)
  return words
  end
-
- # This handles all of the trailing periods, keeping those that
+
+ # This handles all of the trailing periods, keeping those that
  # belong on abbreviations and removing those that seem to be
  # at the end of sentences. This method makes some assumptions
  # about the use of capitalization in the incoming text
  def split_sentences(array)
  tokenized = array
- people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+ people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
  supt det mssrs rev)
  army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
  inst = %w(dept univ assn bros ph.d)
- place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+ place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
  hwy hway la pde pd plz pl rd st tce)
  comp = %w(mfg inc ltd co corp)
- state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
- ind ia kans kan ken ky la me md is mass mich minn miss mo mont
- neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
+ state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+ ind ia kans kan ken ky la me md is mass mich minn miss mo mont
+ neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
  va wash wis wisc wy wyo usafa alta man ont que sask yuk)
  month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
  misc = %w(vs etc no esp)
- abbr = Hash.new
+ abbr = Hash.new
  [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
  abbr[i] = true
  end
  words = Array.new
  tokenized.each_with_index do |t, i|
- if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
+ if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and
+ tokenized[i] =~ /\A(.+)\.\z/
  w = $1
- # Don't separate the period off words that
+ # Don't separate the period off words that
  # meet any of the following conditions:
  #
  # 1. It is defined in one of the lists above
- # 2. It is only one letter long: Alfred E. Sloan
+ # 2. It is only one letter long: Alfred E. Sloan
  # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
- unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
+ unless abbr[w.downcase] or
+ [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
  words << w
  words << '.'
  next
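The refactored condition replaces two chained `=~` checks with `Regexp#match?` over an array, which avoids setting the global match variables and requires Ruby 2.4+. The same three escape hatches apply; for example:

    w = "U.S.A"   # token "U.S.A." with its final period captured off
    [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
    # => true (repeating letter-dot), so the period stays attached

    w = "ran"
    [/\A[a-z]\z/i, /[a-z](?:\.[a-z])+\z/i].any? { |r| r.match? w }
    # => false; if "ran" is also absent from the abbreviation lists,
    #    the period is split off as a sentence boundary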
@@ -664,8 +717,8 @@ class EngTagger
  end
  return words
  end
-
- # Separate punctuation from words, where appropriate. This leaves trailing
+
+ # Separate punctuation from words, where appropriate. This leaves trailing
  # periods in place to be dealt with later. Called by the clean_text method.
  def split_punct(text)
  # If there's no punctuation, return immediately
@@ -675,27 +728,27 @@ class EngTagger
 
  # Put quotes into a standard format
  text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
- text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
- text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+ text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
+ text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
  text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
  text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
-
+
  # Handle all other punctuation
  text = text.gsub(/--+/o, " - ") # Convert and separate dashes
  text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
- text = text.gsub(/:/o, " :") # Shift semicolons off
- text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
+ text = text.gsub(/:/o, " : ") # Shift semicolons off
+ text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
  text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
  text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
 
  # English-specific contractions
  text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
- text = text.gsub(/n't\b/o, " n't") # Separate off n't
+ text = text.gsub(/n't\b/o, " n't") # Separate off n't
  text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
  result = text.split(' ')
  return result
- end
-
+ end
+
  # Given a preceding tag, assign a tag word. Called by the add_tags method.
  # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
  def assign_tag(prev_tag, word)
@@ -709,7 +762,7 @@ class EngTagger
  best_so_far = 0
  w = @@lexicon[word]
  t = @@hmm
-
+
  # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
  # which is used in most POS taggers
  best_tag = ""
@@ -724,9 +777,9 @@
  else
  next
  end
-
- # Bayesian logic:
- # P = P( tag | prev_tag ) * P( tag | word )
+
+ # Bayesian logic:
+ # P = P( tag | prev_tag ) * P( tag | word )
  probability = t[prev_tag][tag] * (pw + 1)
  # Set the tag with maximal probability
  if probability > best_so_far
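The scoring line multiplies the tag-transition probability by an add-one-smoothed word emission count, so unseen tag/word combinations still receive a nonzero score. Illustrative arithmetic with hypothetical numbers:

    t  = { "det" => { "nn" => 0.475, "jj" => 0.250 } }  # hypothetical transition probs
    pw = { "nn" => 30, "jj" => 5 }                      # hypothetical counts for this word
    t["det"]["nn"] * (pw["nn"] + 1)   # => 14.725, so "nn" wins
    t["det"]["jj"] * (pw["jj"] + 1)   # => 1.5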
@@ -735,18 +788,18 @@
  end
  end
  return best_tag
- end
-
- # This method determines whether a word should be considered in its
+ end
+
+ # This method determines whether a word should be considered in its
  # lower or upper case form. This is useful in considering proper nouns
- # and words that begin sentences. Called by add_tags.
+ # and words that begin sentences. Called by add_tags.
  def clean_word(word)
  lcf = lcfirst(word)
  # seen this word as it appears (lower or upper case)
  if @@lexicon[word]
  return word
  elsif @@lexicon[lcf]
- # seen this word only as lower case
+ # seen this word only as lower case
  return lcf
  else
  # never seen this word. guess.
@@ -754,14 +807,13 @@
  end
  end
 
- # This changes any word not appearing in the lexicon to identifiable
- # classes of words handled by a simple unknown word classification
+ # This changes any word not appearing in the lexicon to identifiable
+ # classes of words handled by a simple unknown word classification
  # metric. Called by the clean_word method.
  def classify_unknown_word(word)
  if /[\(\{\[]/ =~ word # Left brackets
  classified = "*LRB*"
- elsif
- /[\)\}\]]/ =~ word # Right brackets
+ elsif /[\)\}\]]/ =~ word # Right brackets
  classified = "*RRB*"
  elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
  classified = "*NUM*"
@@ -800,33 +852,33 @@
  end
  return classified
  end
-
- # This returns a compiled regexp for extracting maximal noun phrases
+
+ # This returns a compiled regexp for extracting maximal noun phrases
  # from a POS-tagged text.
  def get_max_noun_regex
  regex = /
- # optional number, gerund - adjective -participle
- (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
- # Followed by one or more nouns
- (?:#{NN})+
- (?:
- # Optional preposition, determinant, cardinal
- (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
- # Optional gerund-adjective -participle
- (?:#{GER}|#{ADJ}|#{PART})*
- # one or more nouns
- (?:#{NN})+
- )*
- /xo #/
- return regex
- end
-
- # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
- # YAML data parser. It will load a YAML document with a collection of key:
- # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
- # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
- def load_tags(lexicon)
- path = File.join($lexpath, lexicon)
+ # optional number, gerund - adjective -participle
+ (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
+ # Followed by one or more nouns
+ (?:#{NN})+
+ (?:
+ # Optional preposition, determinant, cardinal
+ (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
+ # Optional gerund-adjective -participle
+ (?:#{GER}|#{ADJ}|#{PART})*
+ # one or more nouns
+ (?:#{NN})+
+ )*
+ /xo #/
+ return regex
+ end
+
+ # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+ # YAML data parser. It will load a YAML document with a collection of key:
+ # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
+ # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
+ def load_tags(lexicon, lexpath = DEFAULT_LEXPATH)
+ path = File.join(lexpath, lexicon)
  fh = File.open(path, 'r')
  while line = fh.gets
  /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
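The naive parser's regexes are easiest to read against a sample line. The step that splits the second capture into items is elided from these hunks, so the comma split below is an assumption made for illustration:

    line = 'det: { jj: 0.2, nn: 0.5, vb: 0.0002 }'
    /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
    key, body = $1, $2             # => "det", "jj: 0.2, nn: 0.5, vb: 0.0002"
    pairs = {}
    body.split(', ').each do |i|   # assumed split; not shown in the diff
      /([^:]+):\s*(.+)/ =~ i
      pairs[$1] = $2.to_f
    end
    pairs                          # => {"jj"=>0.2, "nn"=>0.5, "vb"=>0.0002}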
@@ -837,19 +889,19 @@
  pairs = {}
  items.each do |i|
  /([^:]+):\s*(.+)/ =~ i
- pairs[$1] = $2.to_f
+ pairs[$1] = $2.to_f
  end
  @@hmm[key] = pairs
  end
  fh.close
  end
 
- # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
- # YAML data parser. It will load a YAML document with a collection of key:
- # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
- # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
- def load_words(lexicon)
- path = File.join($lexpath, lexicon)
+ # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+ # YAML data parser. It will load a YAML document with a collection of key:
+ # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
+ # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
+ def load_words(lexicon, lexpath = DEFAULT_LEXPATH)
+ path = File.join(lexpath, lexicon)
  fh = File.open(path, 'r')
  while line = fh.gets
  /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
@@ -860,15 +912,14 @@
  pairs = {}
  items.each do |i|
  /([^:]+):\s*(.+)/ =~ i
- pairs[$1] = $2.to_f
+ pairs[$1] = $2.to_f
  end
  @@lexicon[key] = pairs
  end
  fh.close
  end
-
- #memoize the stem and assign_tag methods
+
+ #memoize the stem and assign_tag methods
  memoize("stem")
- memoize("assign_tag")
+ memoize("assign_tag")
  end
-