engtagger 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/engtagger.rb ADDED
@@ -0,0 +1,729 @@
+ #! /local/ruby/bin/ruby
+
+ $LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
+ require 'rubygems'
+ require 'kconv'
+ require 'porter'
+ # use hpricot for extracting English text from docs with XML-like tags
+ begin
+   require 'hpricot'
+ rescue LoadError
+   $no_hpricot = true
+ end
+
+ # File paths
+ $lexpath = File.join(File.dirname(__FILE__), 'engtagger')
+ $word_path = File.join($lexpath, "pos_words.hash")
+ $tag_path = File.join($lexpath, "pos_tags.hash")
+
+ # for memoization (code snippet from http://eigenclass.org/hiki/bounded-space-memoization)
+ class Module
+   def memoize(method)
+     # alias_method is faster than define_method + old.bind(self).call
+     alias_method "__memoized__#{method}", method
+     module_eval <<-EOF
+       def #{method}(*a, &b)
+         # assumes the block won't change the result if the args are the same
+         (@__memoized_#{method}_cache ||= {})[a] ||= __memoized__#{method}(*a, &b)
+       end
+     EOF
+   end
+ end
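+
+ # A minimal sketch of how memoize is used (the Fib class below is
+ # illustrative, not part of this library): wrapping a method caches its
+ # results per argument list, so repeat calls skip recomputation.
+ #
+ #   class Fib
+ #     def fib(n)
+ #       n < 2 ? n : fib(n - 1) + fib(n - 2)
+ #     end
+ #     memoize("fib") # replaces fib with a caching wrapper
+ #   end
+ #
+ # EngTagger applies this to its stem and assign_tag methods at the bottom
+ # of this file.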
+
+ # English part-of-speech tagger class
+ class EngTagger
+   VERSION = '0.1.0'
+
+   #################
+   # Class methods #
+   #################
+
+   # Return a class variable that holds probability data
+   def self.hmm
+     return @@hmm
+   end
+
+   # Return a class variable that holds lexical data
+   def self.lexicon
+     return @@lexicon
+   end
+
+   # Return a regexp from a string argument that matches an XML-style pos tag
+   def self.get_ext(tag = nil)
+     return nil unless tag
+     # note the escaped backslash: "\s" in a double-quoted string is a plain
+     # space, whereas "\\s" reaches the regexp as the whitespace class
+     return Regexp.new("<#{tag}>[^<]+</#{tag}>\\s*")
+   end
+
+   # Regexps to match XML-style part-of-speech tags
+   NUM = get_ext('cd')
+   GER = get_ext('vbg')
+   ADJ = get_ext('jj[rs]*')
+   PART = get_ext('vbn')
+   NN = get_ext('nn[sp]*')
+   NNP = get_ext('nnp')
+   PREP = get_ext('in')
+   DET = get_ext('det')
+   PAREN = get_ext('[lr]rb')
+   QUOT = get_ext('ppr')
+   SEN = get_ext('pp')
+   WORD = get_ext('\w+')
+
+   # Convert a Treebank-style, abbreviated tag into verbose definitions
+   def self.explain_tag(tag)
+     if TAGS[tag]
+       return TAGS[tag]
+     else
+       return tag
+     end
+   end
+
+   # The following is used to make a hash that converts a pos tag to its
+   # definition, used by the explain_tag method
+   tags = [
+     "CC", "Conjunction, coordinating",
+     "CD", "Adjective, cardinal number",
+     "DET", "Determiner",
+     "EX", "Pronoun, existential there",
+     "FW", "Foreign words",
+     "IN", "Preposition / Conjunction",
+     "JJ", "Adjective",
+     "JJR", "Adjective, comparative",
+     "JJS", "Adjective, superlative",
+     "LS", "Symbol, list item",
+     "MD", "Verb, modal",
+     "NN", "Noun",
+     "NNP", "Noun, proper",
+     "NNPS", "Noun, proper, plural",
+     "NNS", "Noun, plural",
+     "PDT", "Determiner, prequalifier",
+     "POS", "Possessive",
+     "PRP", "Determiner, possessive second",
+     "PRPS", "Determiner, possessive",
+     "RB", "Adverb",
+     "RBR", "Adverb, comparative",
+     "RBS", "Adverb, superlative",
+     "RP", "Adverb, particle",
+     "SYM", "Symbol",
+     "TO", "Preposition",
+     "UH", "Interjection",
+     "VB", "Verb, infinitive",
+     "VBD", "Verb, past tense",
+     "VBG", "Verb, gerund",
+     "VBN", "Verb, past/passive participle",
+     "VBP", "Verb, base present form",
+     "VBZ", "Verb, present 3SG -s form",
+     "WDT", "Determiner, question",
+     "WP", "Pronoun, question",
+     "WPS", "Determiner, possessive & question",
+     "WRB", "Adverb, question",
+     "PP", "Punctuation, sentence ender",
+     "PPC", "Punctuation, comma",
+     "PPD", "Punctuation, dollar sign",
+     "PPL", "Punctuation, quotation mark left",
+     "PPR", "Punctuation, quotation mark right",
+     "PPS", "Punctuation, colon, semicolon, ellipsis",
+     "LRB", "Punctuation, left bracket",
+     "RRB", "Punctuation, right bracket"
+   ]
+   tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
+   tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
+   TAGS = Hash[*tags]
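+
+   # For instance (keys and values have been normalized to lowercase and
+   # underscores by the collect calls above):
+   #
+   #   EngTagger.explain_tag("nnp")  # => "noun_proper"
+   #   EngTagger.explain_tag("vbg")  # => "verb_gerund"
+   #   EngTagger.explain_tag("xyz")  # => "xyz"  (unknown tags pass through)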
+
+   # Hash storing config values:
+   #
+   # * :unknown_word_tag
+   #     => (String) Tag to assign to unknown words
+   # * :stem
+   #     => (Boolean) Stem single words using Porter module
+   # * :weight_noun_phrases
+   #     => (Boolean) When returning occurrence counts for a noun phrase, multiply
+   #        the value by the number of words in the NP.
+   # * :longest_noun_phrase
+   #     => (Integer) Will ignore noun phrases longer than this threshold. This
+   #        affects only the get_words() and get_nouns() methods.
+   # * :relax
+   #     => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
+   #        uncommon words, particularly words used polysemously
+   # * :tag_lex
+   #     => (String) Name of the YAML file containing a hash of adjacent part of
+   #        speech tags and the probability of each
+   # * :word_lex
+   #     => (String) Name of the YAML file containing a hash of words and
+   #        corresponding parts of speech
+   # * :unknown_lex
+   #     => (String) Name of the YAML file containing a hash of tags for unknown
+   #        words and corresponding parts of speech
+   # * :tag_path
+   #     => (String) Directory path of tag_lex
+   # * :word_path
+   #     => (String) Directory path of word_lex and unknown_lex
+   # * :debug
+   #     => (Boolean) Print debug messages
+   attr_accessor :conf
+
+   ###############
+   # Constructor #
+   ###############
+
+   # Take a hash of parameters that override default values.
+   # See above for details.
+   def initialize(params = {})
+     @conf = Hash.new
+     @conf[:unknown_word_tag] = ''
+     @conf[:stem] = false
+     @conf[:weight_noun_phrases] = false
+     @conf[:longest_noun_phrase] = 5
+     @conf[:relax] = false
+     @conf[:tag_lex] = 'tags.yml'
+     @conf[:word_lex] = 'words.yml'
+     @conf[:unknown_lex] = 'unknown.yml'
+     @conf[:word_path] = $word_path
+     @conf[:tag_path] = $tag_path
+     @conf[:debug] = false
+     # assuming that we start analyzing from the beginning of a new sentence...
+     @conf[:current_tag] = 'pp'
+     # merge! (not merge, which returns a new hash) so user params take effect
+     @conf.merge!(params) if params
+     unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
+       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
+       @@hmm = Hash.new
+       @@lexicon = Hash.new
+     else
+       lexf = File.open(@conf[:word_path], 'r')
+       @@lexicon = Marshal.load(lexf)
+       lexf.close
+       hmmf = File.open(@conf[:tag_path], 'r')
+       @@hmm = Marshal.load(hmmf)
+       hmmf.close
+     end
+     @@mnp = get_max_noun_regex
+   end
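+
+   # A construction sketch (purely illustrative): any of the defaults above
+   # can be overridden by the params hash, e.g.
+   #
+   #   tgr = EngTagger.new(:longest_noun_phrase => 3, :stem => true)
+   #   tgr.conf[:longest_noun_phrase]  # => 3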
+
+   ##################
+   # Public methods #
+   ##################
+
+   # Examine the string provided and return it fully tagged in XML style
+   def add_tags(text, verbose = false)
+     return nil unless valid_text(text)
+     tagged = []
+     words = clean_text(text)
+     words.each do |word|
+       cleaned_word = clean_word(word)
+       tag = assign_tag(@conf[:current_tag], cleaned_word)
+       @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
+       tag = explain_tag(tag) if verbose
+       tagged << '<' + tag + '>' + word + '</' + tag + '>'
+     end
+     reset
+     return tagged.join(' ')
+   end
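+
+   # A usage sketch (the exact tags depend on the shipped lexicon, so treat
+   # the output below as approximate):
+   #
+   #   tgr = EngTagger.new
+   #   tgr.add_tags("The dog runs.")
+   #   # => "<det>The</det> <nn>dog</nn> <vbz>runs</vbz> <pp>.</pp>"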
+
+   # Given a text string, return as many nouns and noun phrases as possible.
+   # Applies add_tags and involves three stages:
+   #
+   # * Tag the text
+   # * Extract all the maximal noun phrases
+   # * Recursively extract all noun phrases from the MNPs
+   #
+   def get_words(text)
+     return false unless valid_text(text)
+     tagged = add_tags(text)
+     if (@conf[:longest_noun_phrase] <= 1)
+       return get_nouns(tagged)
+     else
+       return get_noun_phrases(tagged)
+     end
+   end
+
+   # Return an easy-on-the-eyes tagged version of a text string.
+   # Applies add_tags and reformats to be easier to read.
+   def get_readable(text, verbose = false)
+     return nil unless valid_text(text)
+     tagged = add_tags(text, verbose)
+     tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
+       $1 + '/' + $2.upcase
+     end
+     return tagged
+   end
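+
+   # For example (output approximate, depending on the lexicon):
+   #
+   #   tgr = EngTagger.new
+   #   tgr.get_readable("The dog runs.")
+   #   # => "The/DET dog/NN runs/VBZ ./PP"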
+
+   # Return an array of sentences (without POS tags) from a text.
+   def get_sentences(text)
+     return nil unless valid_text(text)
+     tagged = add_tags(text)
+     sentences = Array.new
+     tagged.split(/<\/pp>/).each do |line|
+       sentences << strip_tags(line)
+     end
+     sentences = sentences.map do |sentence|
+       # each gsub result must be reassigned, or it is silently discarded
+       sentence = sentence.gsub(Regexp.new(" ('s?) ")){$1 + ' '}
+       sentence = sentence.gsub(Regexp.new(" (\W+) ")){$1 + ' '}
+       sentence = sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
+       sentence = sentence.gsub(Regexp.new(" (\W+)$")){$1}
+       sentence = sentence.gsub(Regexp.new("^(`+) ")){$1}
+       sentence
+     end
+     return sentences
+   end
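+
+   # A sketch of the intended result (exact spacing depends on the tagger
+   # and the re-attachment rules above):
+   #
+   #   tgr.get_sentences("Hello there. How are you?")
+   #   # => ["Hello there.", "How are you?"]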
+
+   # Given a POS-tagged text, this method returns a hash of all proper nouns
+   # and their occurrence frequencies. The method is greedy and will
+   # return multi-word phrases, if possible, so it would find ``Linguistic
+   # Data Consortium'' as a single unit, rather than as three individual
+   # proper nouns. This method does not stem the found words.
+   def get_proper_nouns(tagged)
+     return nil unless valid_text(tagged)
+     trimmed = tagged.scan(NNP).map do |n|
+       strip_tags(n)
+     end
+     nnp = Hash.new(0)
+     trimmed.each do |n|
+       next unless n.length < 100 # sanity check on word length
+       nnp[n] += 1 unless n =~ /\A\s*\z/
+     end
+     # Now for some fancy resolution stuff...
+     nnp.keys.each do |key|
+       words = key.split(/\s/)
+       # Let's say this is an organization's name --
+       # (and it's got at least three words)
+       # is there a corresponding acronym in this hash?
+       if words.length > 2
+         # Make a (naive) acronym out of this name
+         # (case-insensitive, so capitalized words contribute their initials)
+         acronym = words.map do |word|
+           /\A([a-z])[a-z]*\z/i =~ word
+           $1
+         end.join ''
+         # If that acronym has been seen,
+         # remove it and add the values to
+         # the full name
+         if nnp.key?(acronym)
+           nnp[key] += nnp[acronym]
+           nnp.delete(acronym)
+         end
+       end
+     end
+     return nnp
+   end
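+
+   # A hypothetical illustration of the acronym folding above: if the full
+   # name appears once and its acronym twice, the counts are merged, e.g.
+   #
+   #   tagged = tgr.add_tags("The Linguistic Data Consortium (LDC)... LDC...")
+   #   tgr.get_proper_nouns(tagged)
+   #   # => {"Linguistic Data Consortium" => 3}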
+
+   # Given a POS-tagged text, this method returns all nouns and their
+   # occurrence frequencies.
+   def get_nouns(tagged)
+     return nil unless valid_text(tagged)
+     trimmed = tagged.scan(NN).map do |n|
+       strip_tags(n)
+     end
+     ret = Hash.new(0)
+     trimmed.each do |n|
+       n = stem(n)
+       next unless n.length < 100 # sanity check on word length
+       ret[n] += 1 unless n =~ /\A\s*\z/
+     end
+     return ret
+   end
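+
+   # e.g. (illustrative; stemming is off by default):
+   #
+   #   tgr.get_nouns(tgr.add_tags("The cat sat on the cat mat."))
+   #   # => {"cat" => 2, "mat" => 1}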
+
+   # Given a POS-tagged text, this method returns only the maximal noun phrases.
+   # May be called directly, but is also used by get_noun_phrases
+   def get_max_noun_phrases(tagged)
+     return unless valid_text(tagged)
+     mn_phrases = tagged.scan(@@mnp).map do |m|
+       strip_tags(m)
+     end
+     ret = Hash.new(0)
+     mn_phrases.each do |p|
+       p = stem(p) unless p =~ /\s/ # stem single words
+       ret[p] += 1 unless p =~ /\A\s*\z/
+     end
+     return ret
+   end
+
+   # Similar to get_words, but requires a POS-tagged text as an argument.
+   def get_noun_phrases(tagged)
+     return nil unless valid_text(tagged)
+     found = Hash.new(0)
+     phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
+     scanned = tagged.scan(@@mnp)
+     # Find MNPs in the text, one sentence at a time
+     # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
+     mn_phrases = []
+     scanned.each do |m|
+       found[m] += 1 if phrase_ext =~ m
+       mn_phrases += m.split(phrase_ext)
+     end
+     mn_phrases.each do |mnp|
+       # Split the phrase into an array of words, and create a loop for each word,
+       # shortening the phrase by removing the word in the first position.
+       # Record the phrase and any single nouns that are found
+       words = mnp.split
+       words.length.times do |i|
+         found[words.join(' ')] += 1 if words.length > 1
+         w = words.shift
+         found[w] += 1 if w =~ /#{NN}/
+       end
+     end
+     ret = Hash.new(0)
+     found.keys.each do |f|
+       k = strip_tags(f)
+       v = found[f]
+       # We weight by the word count to favor long noun phrases
+       space_count = k.scan(/\s+/)
+       word_count = space_count.length + 1
+       # Throttle MNPs if necessary
+       next if word_count > @conf[:longest_noun_phrase]
+       k = stem(k) unless word_count > 1 # stem single words
+       multiplier = 1
+       multiplier = word_count if @conf[:weight_noun_phrases]
+       ret[k] += multiplier * v
+     end
+     return ret
+   end
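+
+   # Illustrative call (the exact counts depend on how the text gets tagged):
+   #
+   #   tgr.get_noun_phrases(tgr.add_tags("The fat cat sat on the mat."))
+   #   # => e.g. {"fat cat" => 1, "cat" => 1, "mat" => 1}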
+
+   # Reads some included corpus data and saves it in a stored hash on the
+   # local file system. This is called automatically if the tagger can't
+   # find the stored lexicon.
+   def install
+     puts "Creating part-of-speech lexicon" if @conf[:debug]
+     load_tags(@conf[:tag_lex])
+     load_words(@conf[:word_lex])
+     load_words(@conf[:unknown_lex])
+     File.open(@conf[:word_path], 'w') do |f|
+       Marshal.dump(@@lexicon, f)
+     end
+     File.open(@conf[:tag_path], 'w') do |f|
+       Marshal.dump(@@hmm, f)
+     end
+   end
+
+   ###################
+   # Private methods #
+   ###################
+
+   private
+
+   # Downcase the first letter of word
+   def lcfirst(word)
+     word.split(//)[0].downcase + word.split(//)[1..-1].join
+   end
+
+   # Upcase the first letter of word
+   def ucfirst(word)
+     word.split(//)[0].upcase + word.split(//)[1..-1].join
+   end
+
+   # Return the word stem as given by the Stemmable module. This can be
+   # turned off by setting @conf[:stem] to false.
+   def stem(word)
+     return word unless @conf[:stem]
+     return word.stem
+   end
+
+   # This method will reset the preceding tag to a sentence ender (PP).
+   # This prepares the first word of a new sentence to be tagged correctly.
+   def reset
+     @conf[:current_tag] = 'pp'
+   end
+
+   # Check whether the text is a valid string
+   def valid_text(text)
+     if !text
+       # there's nothing to parse
+       print "method call on uninitialized variable\n" if @conf[:debug]
+       return false
+     elsif /\A\s*\z/ =~ text
+       # text is an empty string, nothing to parse
+       return false
+     else
+       # text is valid
+       return true
+     end
+   end
+
+   # Return a text string with the part-of-speech tags removed
+   def strip_tags(tagged, downcase = false)
+     return nil unless valid_text(tagged)
+     text = tagged.gsub(/<[^>]+>/m, "")
+     text = text.gsub(/\s+/m, " ")
+     text = text.gsub(/\A\s*/, "")
+     text = text.gsub(/\s*\z/, "")
+     if downcase
+       return text.downcase
+     else
+       return text
+     end
+   end
+
+   # Strip the provided text of HTML-style tags and separate off any punctuation
+   # in preparation for tagging
+   def clean_text(text)
+     return false unless valid_text(text)
+     text = text.toutf8
+     unless $no_hpricot
+       # Strip out any markup and convert entities to their proper form
+       cleaned_text = Hpricot(text).inner_text
+     else
+       cleaned_text = text
+     end
+     tokenized = []
+     # Tokenize the text (splitting on punctuation as you go)
+     cleaned_text.split(/\s+/).each do |line|
+       tokenized += split_punct(line)
+     end
+     words = split_sentences(tokenized)
+     return words
+   end
+
+   # This handles all of the trailing periods, keeping those that
+   # belong on abbreviations and removing those that seem to be
+   # at the end of sentences. This method makes some assumptions
+   # about the use of capitalization in the incoming text
+   def split_sentences(array)
+     tokenized = array
+     people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys
+                 supt det mssrs rev)
+     army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
+     inst = %w(dept univ assn bros ph.d)
+     place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+                hwy hway la pde pd plz pl rd st tce)
+     comp = %w(mfg inc ltd co corp)
+     state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+                ind ia kans kan ken ky la me md is mass mich minn miss mo mont
+                neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
+                va wash wis wisc wy wyo usafa alta man ont que sask yuk)
+     month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
+     misc = %w(vs etc no esp)
+     abbr = Hash.new
+     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
+       abbr[i] = true
+     end
+     words = Array.new
+     tokenized.each_with_index do |t, i|
+       if tokenized[i + 1] and tokenized[i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
+         w = $1
+         # Don't separate the period off words that
+         # meet any of the following conditions:
+         #
+         # 1. It is defined in one of the lists above
+         # 2. It is only one letter long: Alfred E. Sloan
+         # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
+         unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
+           words << w
+           words << '.'
+           next
+         end
+       end
+       words << tokenized[i]
+     end
+     # If the final word ends in a period...
+     if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
+       words[-1] = $1
+       words.push '.'
+     end
+     return words
+   end
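+
+   # For instance (illustrative): "Dr." keeps its period because "dr" is in
+   # the abbreviation list, while the sentence-final period is split off:
+   #
+   #   split_sentences(["Dr.", "Smith", "arrived."])
+   #   # => ["Dr.", "Smith", "arrived", "."]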
+
+   # Separate punctuation from words, where appropriate. This leaves trailing
+   # periods in place to be dealt with later. Called by the clean_text method.
+   def split_punct(text)
+     # If there's no punctuation, return immediately
+     return [text] if /\A\w+\z/ =~ text
+     # Sanity checks
+     text = text.gsub(/\W{10,}/o, " ")
+
+     # Put quotes into a standard format
+     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ")              # Shift left quotes off text
+     text = text.gsub(/"(?=.*\w)/o, " `` ")                 # Convert left quotes to ``
+     text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+     text = text.gsub(/"/, " '' ")                          # Convert (remaining) quotes to ''
+     text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "}    # Separate right single quotes
+
+     # Handle all other punctuation
+     text = text.gsub(/--+/o, " - ")                        # Convert and separate dashes
+     text = text.gsub(/,(?!\d)/o, " , ")                    # Shift commas off everything but numbers
+     text = text.gsub(/:/o, " :")                           # Shift colons off
+     text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "}         # Shift ellipses off
+     text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "}  # Shift off brackets
+     text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "}   # Shift off other ``standard'' punctuation
+
+     # English-specific contractions
+     text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
+     text = text.gsub(/n't\b/o, " n't")                     # Separate off n't
+     text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1}          # Separate off 've, 'll, 're
+     result = text.split(' ')
+     return result
+   end
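+
+   # e.g. (illustrative):
+   #
+   #   split_punct("isn't")    # => ["is", "n't"]
+   #   split_punct('"Wait,"')  # => ["``", "Wait", ",", "''"]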
+
+   # Given a preceding tag, assign a tag to a word. Called by the add_tags method.
+   # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
+   def assign_tag(prev_tag, word)
+     if word == "-unknown-"
+       # classify unknown words accordingly
+       return @conf[:unknown_word_tag]
+     elsif word == "-sym-"
+       # If this is a symbol, tag it as a symbol
+       return "sym"
+     end
+     best_so_far = 0
+     w = @@lexicon[word]
+     t = @@hmm
+
+     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
+     # which is used in most POS taggers
+     best_tag = ""
+     t[prev_tag].keys.each do |tag|
+       # With @conf[:relax] set, this method
+       # will also include any `open classes' of POS tags
+       pw = 0
+       if w[tag]
+         pw = w[tag]
+       elsif @conf[:relax] and tag =~ /\A(?:jj|nn|rb|vb)/
+         pw = 0
+       else
+         next
+       end
+
+       # Bayesian logic:
+       # P = P( tag | prev_tag ) * P( tag | word )
+       probability = t[prev_tag][tag] * (pw + 1)
+       # Set the tag with maximal probability
+       if probability > best_so_far
+         best_so_far = probability
+         best_tag = tag
+       end
+     end
+     return best_tag
+   end
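+
+   # In effect, each candidate tag is scored as the transition probability
+   # from the previous tag times an add-one-smoothed lexical weight.
+   # A toy example with made-up numbers:
+   #
+   #   t["det"]["nn"] = 0.4, t["det"]["vb"] = 0.1, w = { "nn" => 12, "vb" => 2 }
+   #   score(nn) = 0.4 * (12 + 1) = 5.2
+   #   score(vb) = 0.1 * (2 + 1)  = 0.3   # so "nn" wins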
+
+   # This method determines whether a word should be considered in its
+   # lower or upper case form. This is useful in considering proper nouns
+   # and words that begin sentences. Called by add_tags.
+   def clean_word(word)
+     lcf = lcfirst(word)
+     # seen this word as it appears (lower or upper case)
+     if @@lexicon[word]
+       return word
+     elsif @@lexicon[lcf]
+       # seen this word only as lower case
+       return lcf
+     else
+       # never seen this word. guess.
+       return classify_unknown_word(word)
+     end
+   end
+
+   # This changes any word not appearing in the lexicon to identifiable
+   # classes of words handled by a simple unknown word classification
+   # metric. Called by the clean_word method.
+   def classify_unknown_word(word)
+     if /[\(\{\[]/ =~ word                         # Left brackets
+       classified = "*LRB*"
+     elsif /[\)\}\]]/ =~ word                      # Right brackets
+       classified = "*RRB*"
+     elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word   # Floating point number
+       classified = "*NUM*"
+     elsif /\A\d+[\d\/:-]+\d\z/ =~ word            # Other number constructs
+       classified = "*NUM*"
+     elsif /\A-?\d+\w+\z/o =~ word                 # Ordinal number
+       classified = "*ORD*"
+     elsif /\A[A-Z][A-Z\.-]*\z/o =~ word           # Abbreviation (all caps)
+       classified = "-abr-"
+     elsif /\w-\w/o =~ word                        # Hyphenated word
+       /-([^-]+)\z/ =~ word
+       h_suffix = $1
+       if h_suffix and (@@lexicon[h_suffix] and @@lexicon[h_suffix]['jj'])
+         # last part of this is defined as an adjective
+         classified = "-hyp-adj-"
+       else
+         # last part of this is not defined as an adjective
+         classified = "-hyp-"
+       end
+     elsif /\A\W+\z/o =~ word
+       classified = "-sym-"                        # Symbol
+     elsif word == ucfirst(word)
+       classified = "-cap-"                        # Capitalized word
+     elsif /ing\z/o =~ word
+       classified = "-ing-"                        # Ends in 'ing'
+     elsif /s\z/o =~ word
+       classified = "-s-"                          # Ends in 's'
+     elsif /tion\z/o =~ word
+       classified = "-tion-"                       # Ends in 'tion'
+     elsif /ly\z/o =~ word
+       classified = "-ly-"                         # Ends in 'ly'
+     elsif /ed\z/o =~ word
+       classified = "-ed-"                         # Ends in 'ed'
+     else
+       classified = "-unknown-"                    # Completely unknown
+     end
+     return classified
+   end
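+
+   # e.g. (illustrative; "well-read" assumes "read" is listed as jj in the lexicon):
+   #
+   #   classify_unknown_word("12.5")       # => "*NUM*"
+   #   classify_unknown_word("IBM")        # => "-abr-"
+   #   classify_unknown_word("well-read")  # => "-hyp-adj-"
+   #   classify_unknown_word("blending")   # => "-ing-"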
+
+   # This returns a compiled regexp for extracting maximal noun phrases
+   # from a POS-tagged text.
+   def get_max_noun_regex
+     regex = /
+       # Optional number, followed by gerunds, adjectives, or participles
+       (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
+       # Followed by one or more nouns
+       (?:#{NN})+
+       (?:
+         # Optional preposition, determiner, cardinal number
+         (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
+         # Optional gerunds, adjectives, or participles
+         (?:#{GER}|#{ADJ}|#{PART})*
+         # One or more nouns
+         (?:#{NN})+
+       )*
+     /xo #/
+     return regex
+   end
+
+   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+   # YAML data parser. It will load a YAML document with a collection of key:
+   # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
+   # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
+   def load_tags(lexicon)
+     path = File.join($lexpath, lexicon)
+     fh = File.open(path, 'r')
+     while line = fh.gets
+       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
+       next unless $1 and $2
+       key, data = $1, $2
+       items = data.split(/,\s+/)
+       pairs = {}
+       items.each do |i|
+         /([^:]+):\s*(.+)/ =~ i
+         pairs[$1] = $2.to_f
+       end
+       @@hmm[key] = pairs
+     end
+     fh.close
+   end
+
+   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+   # YAML data parser. It will load a YAML document with a collection of key:
+   # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
+   # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
+   def load_words(lexicon)
+     path = File.join($lexpath, lexicon)
+     fh = File.open(path, 'r')
+     while line = fh.gets
+       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
+       next unless $1 and $2
+       key, data = $1, $2
+       items = data.split(/,\s+/)
+       pairs = {}
+       items.each do |i|
+         /([^:]+):\s*(.+)/ =~ i
+         pairs[$1] = $2.to_f
+       end
+       @@lexicon[key] = pairs
+     end
+     fh.close
+   end
723
+ end
724
+
725
+ #memoize the stem and assign_tag methods
726
+ memoize("stem")
727
+ memoize("assign_tag")
728
+ end
729
+