engtagger 0.1.0

data/lib/engtagger.rb ADDED
@@ -0,0 +1,729 @@
+ #! /local/ruby/bin/ruby
+
+ $LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
+ require 'rubygems'
+ require 'kconv'
+ require 'porter'
+ # use hpricot for extracting English text from docs with XML-like tags
+ begin
+   require 'hpricot'
+ rescue LoadError
+   $no_hpricot = true
+ end
+
+ # File paths
+ $lexpath = File.join(File.dirname(__FILE__), 'engtagger')
+ $word_path = File.join($lexpath, "pos_words.hash")
+ $tag_path = File.join($lexpath, "pos_tags.hash")
+
+ # for memoization (code snippet from http://eigenclass.org/hiki/bounded-space-memoization)
+ class Module
+   def memoize(method)
+     # alias_method is faster than define_method + old.bind(self).call
+     alias_method "__memoized__#{method}", method
+     module_eval <<-EOF
+       def #{method}(*a, &b)
+         # assumes the block won't change the result if the args are the same
+         (@__memoized_#{method}_cache ||= {})[a] ||= __memoized__#{method}(*a, &b)
+       end
+     EOF
+   end
+ end
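+ # A minimal sketch of what Module#memoize buys us (the Fib class below is
+ # hypothetical, for illustration only): after memoize, repeated calls with
+ # the same arguments are answered from a cache instead of recomputed.
+ #
+ #   class Fib
+ #     def fib(n)
+ #       n < 2 ? n : fib(n - 1) + fib(n - 2)
+ #     end
+ #     memoize("fib")
+ #   end
+ #
+ #   Fib.new.fib(30)  # each distinct argument list is computed only once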
+
+ # English part-of-speech tagger class
+ class EngTagger
+   VERSION = '0.1.0'
+
+   #################
+   # Class methods #
+   #################
+
+   # Return a class variable that holds probability data
+   def self.hmm
+     return @@hmm
+   end
+
+   # Return a class variable that holds lexical data
+   def self.lexicon
+     return @@lexicon
+   end
+
+   # Return a regexp from a string argument that matches an XML-style pos tag
+   def self.get_ext(tag = nil)
+     return nil unless tag
+     # "\\s*" keeps the backslash, so the compiled regexp sees \s*
+     # (in a double-quoted string a bare "\s" is just a space)
+     return Regexp.new("<#{tag}>[^<]+</#{tag}>\\s*")
+   end
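+   # For illustration: EngTagger.get_ext('nn') compiles to a pattern
+   # equivalent to /<nn>[^<]+<\/nn>\s*/, which matches a single tagged token
+   # such as "<nn>dog</nn> ", including any trailing whitespace.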
+
+   # Regexps to match XML-style part-of-speech tags
+   NUM = get_ext('cd')
+   GER = get_ext('vbg')
+   ADJ = get_ext('jj[rs]*')
+   PART = get_ext('vbn')
+   NN = get_ext('nn[sp]*')
+   NNP = get_ext('nnp')
+   PREP = get_ext('in')
+   DET = get_ext('det')
+   PAREN = get_ext('[lr]rb')
+   QUOT = get_ext('ppr')
+   SEN = get_ext('pp')
+   WORD = get_ext('\w+')
+
+   # Convert a Treebank-style, abbreviated tag into a verbose definition
+   def self.explain_tag(tag)
+     if TAGS[tag]
+       return TAGS[tag]
+     else
+       return tag
+     end
+   end
+
+   # The following makes a hash to convert a pos tag to its definition;
+   # used by the explain_tag method
+   tags = [
+     "CC", "Conjunction, coordinating",
+     "CD", "Adjective, cardinal number",
+     "DET", "Determiner",
+     "EX", "Pronoun, existential there",
+     "FW", "Foreign words",
+     "IN", "Preposition / Conjunction",
+     "JJ", "Adjective",
+     "JJR", "Adjective, comparative",
+     "JJS", "Adjective, superlative",
+     "LS", "Symbol, list item",
+     "MD", "Verb, modal",
+     "NN", "Noun",
+     "NNP", "Noun, proper",
+     "NNPS", "Noun, proper, plural",
+     "NNS", "Noun, plural",
+     "PDT", "Determiner, prequalifier",
+     "POS", "Possessive",
+     "PRP", "Determiner, possessive second",
+     "PRPS", "Determiner, possessive",
+     "RB", "Adverb",
+     "RBR", "Adverb, comparative",
+     "RBS", "Adverb, superlative",
+     "RP", "Adverb, particle",
+     "SYM", "Symbol",
+     "TO", "Preposition",
+     "UH", "Interjection",
+     "VB", "Verb, infinitive",
+     "VBD", "Verb, past tense",
+     "VBG", "Verb, gerund",
+     "VBN", "Verb, past/passive participle",
+     "VBP", "Verb, base present form",
+     "VBZ", "Verb, present 3SG -s form",
+     "WDT", "Determiner, question",
+     "WP", "Pronoun, question",
+     "WPS", "Determiner, possessive & question",
+     "WRB", "Adverb, question",
+     "PP", "Punctuation, sentence ender",
+     "PPC", "Punctuation, comma",
+     "PPD", "Punctuation, dollar sign",
+     "PPL", "Punctuation, quotation mark left",
+     "PPR", "Punctuation, quotation mark right",
+     "PPS", "Punctuation, colon, semicolon, ellipsis",
+     "LRB", "Punctuation, left bracket",
+     "RRB", "Punctuation, right bracket"
+   ]
+   tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
+   tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
+   TAGS = Hash[*tags]
+
+   # Hash storing config values:
+   #
+   # * :unknown_word_tag
+   #   => (String) Tag to assign to unknown words
+   # * :stem
+   #   => (Boolean) Stem single words using the Porter module
+   # * :weight_noun_phrases
+   #   => (Boolean) When returning occurrence counts for a noun phrase, multiply
+   #      the value by the number of words in the NP
+   # * :longest_noun_phrase
+   #   => (Integer) Ignore noun phrases longer than this threshold. This
+   #      affects only the get_words() and get_nouns() methods
+   # * :relax
+   #   => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
+   #      uncommon words, particularly words used polysemously
+   # * :tag_lex
+   #   => (String) Name of the YAML file containing a hash of adjacent part-of-speech
+   #      tags and the probability of each
+   # * :word_lex
+   #   => (String) Name of the YAML file containing a hash of words and corresponding
+   #      parts of speech
+   # * :unknown_lex
+   #   => (String) Name of the YAML file containing a hash of tags for unknown
+   #      words and corresponding parts of speech
+   # * :tag_path
+   #   => (String) Directory path of tag_lex
+   # * :word_path
+   #   => (String) Directory path of word_lex and unknown_lex
+   # * :debug
+   #   => (Boolean) Print debug messages
+   attr_accessor :conf
+
+   ###############
+   # Constructor #
+   ###############
+
+   # Take a hash of parameters that override default values.
+   # See above for details.
+   def initialize(params = {})
+     @conf = Hash.new
+     @conf[:unknown_word_tag] = ''
+     @conf[:stem] = false
+     @conf[:weight_noun_phrases] = false
+     @conf[:longest_noun_phrase] = 5
+     @conf[:relax] = false
+     @conf[:tag_lex] = 'tags.yml'
+     @conf[:word_lex] = 'words.yml'
+     @conf[:unknown_lex] = 'unknown.yml'
+     @conf[:word_path] = $word_path
+     @conf[:tag_path] = $tag_path
+     @conf[:debug] = false
+     # assuming that we start analyzing from the beginning of a new sentence...
+     @conf[:current_tag] = 'pp'
+     # merge! mutates @conf in place, so user-supplied params take effect
+     @conf.merge!(params) if params
+     unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
+       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
+       @@hmm = Hash.new
+       @@lexicon = Hash.new
+     else
+       lexf = File.open(@conf[:word_path], 'r')
+       @@lexicon = Marshal.load(lexf)
+       lexf.close
+       hmmf = File.open(@conf[:tag_path], 'r')
+       @@hmm = Marshal.load(hmmf)
+       hmmf.close
+     end
+     @@mnp = get_max_noun_regex
+   end
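+   # A quick usage sketch (assumes the bundled lexicon files are installed;
+   # the option values are arbitrary):
+   #
+   #   require 'engtagger'
+   #   tgr = EngTagger.new(:longest_noun_phrase => 3, :stem => true)
+   #   tgr.conf[:debug]  # => false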
+
+   ##################
+   # Public methods #
+   ##################
+
+   # Examine the string provided and return it fully tagged in XML style
+   def add_tags(text, verbose = false)
+     return nil unless valid_text(text)
+     tagged = []
+     words = clean_text(text)
+     words.each do |word|
+       cleaned_word = clean_word(word)
+       tag = assign_tag(@conf[:current_tag], cleaned_word)
+       @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
+       # explain_tag is a class method, so call it on the class
+       tag = EngTagger.explain_tag(tag) if verbose
+       tagged << '<' + tag + '>' + word + '</' + tag + '>'
+     end
+     reset
+     return tagged.join(' ')
+   end
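+   # For illustration (tags depend on the bundled lexicon, so the output
+   # shown here is indicative rather than guaranteed):
+   #
+   #   tgr = EngTagger.new
+   #   tgr.add_tags('The dog barks.')
+   #   # => "<det>The</det> <nn>dog</nn> <vbz>barks</vbz> <pp>.</pp>"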
+
+   # Given a text string, return as many nouns and noun phrases as possible.
+   # Applies add_tags and involves three stages:
+   #
+   # * Tag the text
+   # * Extract all the maximal noun phrases
+   # * Recursively extract all noun phrases from the MNPs
+   #
+   def get_words(text)
+     return false unless valid_text(text)
+     tagged = add_tags(text)
+     if @conf[:longest_noun_phrase] <= 1
+       return get_nouns(tagged)
+     else
+       return get_noun_phrases(tagged)
+     end
+   end
+
+   # Return an easy-on-the-eyes tagged version of a text string.
+   # Applies add_tags and reformats to be easier to read.
+   def get_readable(text, verbose = false)
+     return nil unless valid_text(text)
+     tagged = add_tags(text, verbose)
+     tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
+       $1 + '/' + $2.upcase
+     end
+     return tagged
+   end
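+   # For illustration (again, exact tags depend on the lexicon):
+   #
+   #   tgr = EngTagger.new
+   #   tgr.get_readable('The dog barks.')
+   #   # => "The/DET dog/NN barks/VBZ ./PP"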
+
+   # Return an array of sentences (without POS tags) from a text.
+   def get_sentences(text)
+     return nil unless valid_text(text)
+     tagged = add_tags(text)
+     sentences = Array.new
+     tagged.split(/<\/pp>/).each do |line|
+       sentences << strip_tags(line)
+     end
+     sentences = sentences.map do |sentence|
+       # each gsub result is reassigned (a bare gsub here would be discarded);
+       # the single-quoted patterns keep \W as regexp syntax
+       sentence = sentence.gsub(Regexp.new(" ('s?) ")){$1 + ' '}
+       sentence = sentence.gsub(Regexp.new(' (\W+) ')){$1 + ' '}
+       sentence = sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
+       sentence = sentence.gsub(Regexp.new(' (\W+)$')){$1}
+       sentence.gsub(Regexp.new("^(`+) ")){$1}
+     end
+     return sentences
+   end
+
+   # Given a POS-tagged text, this method returns a hash of all proper nouns
+   # and their occurrence frequencies. The method is greedy and will
+   # return multi-word phrases, if possible, so it would find ``Linguistic
+   # Data Consortium'' as a single unit, rather than as three individual
+   # proper nouns. This method does not stem the found words.
+   def get_proper_nouns(tagged)
+     return nil unless valid_text(tagged)
+     trimmed = tagged.scan(NNP).map do |n|
+       strip_tags(n)
+     end
+     nnp = Hash.new(0)
+     trimmed.each do |n|
+       next unless n.length < 100 # sanity check on word length
+       nnp[n] += 1 unless n =~ /\A\s*\z/
+     end
+     # Now for some fancy resolution stuff...
+     nnp.keys.each do |key|
+       words = key.split(/\s/)
+       # Let's say this is an organization's name --
+       # (and it's got at least three words)
+       # is there a corresponding acronym in this hash?
+       if words.length > 2
+         # Make a (naive) acronym out of this name; the match is
+         # case-insensitive, since proper nouns are usually capitalized
+         acronym = words.map do |word|
+           /\A([a-z])[a-z]*\z/i =~ word
+           $1
+         end.join ''
+         # If that acronym has been seen,
+         # remove it and add the values to
+         # the full name (key? is needed here:
+         # the hash defaults to 0, which is truthy)
+         if nnp.key?(acronym)
+           nnp[key] += nnp[acronym]
+           nnp.delete(acronym)
+         end
+       end
+     end
+     return nnp
+   end
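+   # For illustration: given text mentioning both "Linguistic Data Consortium"
+   # and "LDC", the resolution step above folds the LDC count into the full
+   # name, e.g. {"Linguistic Data Consortium" => 3} rather than two entries.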
+
+   # Given a POS-tagged text, this method returns all nouns and their
+   # occurrence frequencies.
+   def get_nouns(tagged)
+     return nil unless valid_text(tagged)
+     trimmed = tagged.scan(NN).map do |n|
+       strip_tags(n)
+     end
+     ret = Hash.new(0)
+     trimmed.each do |n|
+       n = stem(n)
+       next unless n.length < 100 # sanity check on word length
+       ret[n] += 1 unless n =~ /\A\s*\z/
+     end
+     return ret
+   end
+
+   # Given a POS-tagged text, this method returns only the maximal noun phrases.
+   # May be called directly, but is also used by get_noun_phrases
+   def get_max_noun_phrases(tagged)
+     return unless valid_text(tagged)
+     mn_phrases = tagged.scan(@@mnp).map do |m|
+       strip_tags(m)
+     end
+     ret = Hash.new(0)
+     mn_phrases.each do |p|
+       p = stem(p) unless p =~ /\s/ # stem single words
+       ret[p] += 1 unless p =~ /\A\s*\z/
+     end
+     return ret
+   end
+
+   # Similar to get_words, but requires a POS-tagged text as an argument.
+   def get_noun_phrases(tagged)
+     return nil unless valid_text(tagged)
+     found = Hash.new(0)
+     phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
+     scanned = tagged.scan(@@mnp)
+     # Find MNPs in the text, one sentence at a time
+     # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
+     mn_phrases = []
+     scanned.each do |m|
+       found[m] += 1 if phrase_ext =~ m
+       mn_phrases += m.split(phrase_ext)
+     end
+     mn_phrases.each do |mnp|
+       # Split the phrase into an array of words, and create a loop for each word,
+       # shortening the phrase by removing the word in the first position.
+       # Record the phrase and any single nouns that are found
+       words = mnp.split
+       words.length.times do |i|
+         found[words.join(' ')] += 1 if words.length > 1
+         w = words.shift
+         found[w] += 1 if w =~ /#{NN}/
+       end
+     end
+     ret = Hash.new(0)
+     found.keys.each do |f|
+       k = strip_tags(f)
+       v = found[f]
+       # We weight by the word count to favor long noun phrases
+       space_count = k.scan(/\s+/)
+       word_count = space_count.length + 1
+       # Throttle MNPs if necessary
+       next if word_count > @conf[:longest_noun_phrase]
+       k = stem(k) unless word_count > 1 # stem single words
+       multiplier = 1
+       multiplier = word_count if @conf[:weight_noun_phrases]
+       ret[k] += multiplier * v
+     end
+     return ret
+   end
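+   # Worked example of the weighting (numbers illustrative): with
+   # :weight_noun_phrases set and "noun phrase extraction" seen twice,
+   # word_count is 3, so the returned count is 3 * 2 = 6; with the option
+   # off it stays 2.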
+
+   # Reads some included corpus data and saves it in a stored hash on the
+   # local file system. This is called automatically if the tagger can't
+   # find the stored lexicon.
+   def install
+     puts "Creating part-of-speech lexicon" if @conf[:debug]
+     load_tags(@conf[:tag_lex])
+     load_words(@conf[:word_lex])
+     load_words(@conf[:unknown_lex])
+     File.open(@conf[:word_path], 'w') do |f|
+       Marshal.dump(@@lexicon, f)
+     end
+     File.open(@conf[:tag_path], 'w') do |f|
+       Marshal.dump(@@hmm, f)
+     end
+   end
+
+   ###################
+   # Private methods #
+   ###################
+
+   private
+
+   # Downcase the first letter of word
+   def lcfirst(word)
+     word.split(//)[0].downcase + word.split(//)[1..-1].join
+   end
+
+   # Upcase the first letter of word
+   def ucfirst(word)
+     word.split(//)[0].upcase + word.split(//)[1..-1].join
+   end
+
+   # Return the word stem as given by the Stemmable module. This can be
+   # turned off with the class parameter @conf[:stem] => false.
+   def stem(word)
+     return word unless @conf[:stem]
+     return word.stem
+   end
+
+   # This method will reset the preceding tag to a sentence ender (PP).
+   # This prepares the first word of a new sentence to be tagged correctly.
+   def reset
+     @conf[:current_tag] = 'pp'
+   end
+
+   # Check whether the text is a valid string
+   def valid_text(text)
+     if !text
+       # there's nothing to parse
+       puts "method call on uninitialized variable" if @conf[:debug]
+       return false
+     elsif /\A\s*\z/ =~ text
+       # text is an empty string, nothing to parse
+       return false
+     else
+       # text is valid
+       return true
+     end
+   end
+
+   # Return a text string with the part-of-speech tags removed
+   def strip_tags(tagged, downcase = false)
+     return nil unless valid_text(tagged)
+     text = tagged.gsub(/<[^>]+>/m, "")
+     text = text.gsub(/\s+/m, " ")
+     text = text.gsub(/\A\s*/, "")
+     text = text.gsub(/\s*\z/, "")
+     if downcase
+       return text.downcase
+     else
+       return text
+     end
+   end
+
+   # Strip the provided text of HTML-style tags and separate off any punctuation
+   # in preparation for tagging
+   def clean_text(text)
+     return false unless valid_text(text)
+     text = text.toutf8
+     unless $no_hpricot
+       # Strip out any markup and convert entities to their proper form
+       cleaned_text = Hpricot(text).inner_text
+     else
+       cleaned_text = text
+     end
+     tokenized = []
+     # Tokenize the text (splitting on punctuation as you go)
+     cleaned_text.split(/\s+/).each do |line|
+       tokenized += split_punct(line)
+     end
+     words = split_sentences(tokenized)
+     return words
+   end
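+   # For illustration: clean_text('He said, "go."') yields a token array
+   # along the lines of ["He", "said", ",", "``", "go", ".", "''"] --
+   # punctuation is split off and quotes are normalized before tagging.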
+
+   # This handles all of the trailing periods, keeping those that
+   # belong on abbreviations and removing those that seem to be
+   # at the end of sentences. This method makes some assumptions
+   # about the use of capitalization in the incoming text
+   def split_sentences(array)
+     tokenized = array
+     people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys
+                 supt det mssrs rev)
+     army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
+     inst = %w(dept univ assn bros ph.d)
+     place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+                hwy hway la pde pd plz pl rd st tce)
+     comp = %w(mfg inc ltd co corp)
+     state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+                ind ia kans kan ken ky la me md is mass mich minn miss mo mont
+                neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
+                va wash wis wisc wy wyo usafa alta man ont que sask yuk)
+     month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
+     misc = %w(vs etc no esp)
+     abbr = Hash.new
+     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
+       abbr[i] = true
+     end
+     words = Array.new
+     tokenized.each_with_index do |t, i|
+       if tokenized[i + 1] and tokenized[i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
+         w = $1
+         # Don't separate the period off words that
+         # meet any of the following conditions:
+         #
+         # 1. It is defined in one of the lists above
+         # 2. It is only one letter long: Alfred E. Sloan
+         # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
+         unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
+           words << w
+           words << '.'
+           next
+         end
+       end
+       words << tokenized[i]
+     end
+     # If the final word ends in a period...
+     if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
+       words[-1] = $1
+       words.push '.'
+     end
+     return words
+   end
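+   # For illustration: ["Dr.", "Smith", "arrived."] keeps the period on
+   # "Dr." (it is in the people list) but splits it off "arrived", giving
+   # ["Dr.", "Smith", "arrived", "."].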
+
+   # Separate punctuation from words, where appropriate. This leaves trailing
+   # periods in place to be dealt with later. Called by the clean_text method.
+   def split_punct(text)
+     # If there's no punctuation, return immediately
+     return [text] if /\A\w+\z/ =~ text
+     # Sanity checks
+     text = text.gsub(/\W{10,}/o, " ")
+
+     # Put quotes into a standard format
+     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ")               # Shift left quotes off text
+     text = text.gsub(/"(?=.*\w)/o, " `` ")                  # Convert left quotes to ``
+     text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+     text = text.gsub(/"/, " '' ")                           # Convert (remaining) quotes to ''
+     text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "}     # Separate right single quotes
+
+     # Handle all other punctuation
+     text = text.gsub(/--+/o, " - ")                         # Convert and separate dashes
+     text = text.gsub(/,(?!\d)/o, " , ")                     # Shift commas off everything but numbers
+     text = text.gsub(/:/o, " :")                            # Shift colons off
+     text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "}          # Shift ellipses off
+     text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "}   # Shift off brackets
+     text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "}    # Shift off other ``standard'' punctuation
+
+     # English-specific contractions
+     text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
+     text = text.gsub(/n't\b/o, " n't")                      # Separate off n't
+     text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1}           # Separate off 've, 'll, 're
+     result = text.split(' ')
+     return result
+   end
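+   # For illustration: split_punct("can't") returns ["ca", "n't"], and
+   # split_punct("(hello)") returns ["(", "hello", ")"].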
+
+   # Given a preceding tag, assign a tag to the given word. Called by the add_tags
+   # method. This is a modified version of the Viterbi algorithm for part-of-speech tagging.
+   def assign_tag(prev_tag, word)
+     if word == "-unknown-"
+       # classify unknown words accordingly
+       return @conf[:unknown_word_tag]
+     elsif word == "-sym-"
+       # If this is a symbol, tag it as a symbol
+       return "sym"
+     end
+     best_so_far = 0
+     w = @@lexicon[word]
+     t = @@hmm
+
+     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
+     # which is used in most POS taggers
+     best_tag = ""
+     t[prev_tag].keys.each do |tag|
+       # With @conf[:relax] set, this method
+       # will also include any `open classes' of POS tags
+       pw = 0
+       if w[tag]
+         pw = w[tag]
+       elsif @conf[:relax] and tag =~ /\A(?:jj|nn|rb|vb)/
+         pw = 0
+       else
+         next
+       end
+
+       # Bayesian logic:
+       # P = P( tag | prev_tag ) * P( tag | word )
+       probability = t[prev_tag][tag] * (pw + 1)
+       # Set the tag with maximal probability
+       if probability > best_so_far
+         best_so_far = probability
+         best_tag = tag
+       end
+     end
+     return best_tag
+   end
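+   # Worked example (numbers invented for illustration): if prev_tag is 'det'
+   # and the word has lexicon counts { 'nn' => 3, 'vb' => 1 }, then with
+   # transition probabilities t['det'] = { 'nn' => 0.5, 'vb' => 0.1 } the
+   # scores are 0.5 * (3 + 1) = 2.0 for 'nn' and 0.1 * (1 + 1) = 0.2 for
+   # 'vb', so 'nn' wins.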
+
+   # This method determines whether a word should be considered in its
+   # lower or upper case form. This is useful in considering proper nouns
+   # and words that begin sentences. Called by add_tags.
+   def clean_word(word)
+     lcf = lcfirst(word)
+     # seen this word as it appears (lower or upper case)
+     if @@lexicon[word]
+       return word
+     elsif @@lexicon[lcf]
+       # seen this word only as lower case
+       return lcf
+     else
+       # never seen this word. guess.
+       return classify_unknown_word(word)
+     end
+   end
+
+   # This changes any word not appearing in the lexicon to identifiable
+   # classes of words handled by a simple unknown word classification
+   # metric. Called by the clean_word method.
+   def classify_unknown_word(word)
+     if /[\(\{\[]/ =~ word                         # Left brackets
+       classified = "*LRB*"
+     elsif /[\)\}\]]/ =~ word                      # Right brackets
+       classified = "*RRB*"
+     elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word   # Floating point number
+       classified = "*NUM*"
+     elsif /\A\d+[\d\/:-]+\d\z/ =~ word            # Other number constructs
+       classified = "*NUM*"
+     elsif /\A-?\d+\w+\z/o =~ word                 # Ordinal number
+       classified = "*ORD*"
+     elsif /\A[A-Z][A-Z\.-]*\z/o =~ word           # Abbreviation (all caps)
+       classified = "-abr-"
+     elsif /\w-\w/o =~ word                        # Hyphenated word
+       /-([^-]+)\z/ =~ word
+       h_suffix = $1
+       if h_suffix and (@@lexicon[h_suffix] and @@lexicon[h_suffix]['jj'])
+         # last part of this is defined as an adjective
+         classified = "-hyp-adj-"
+       else
+         # last part of this is not defined as an adjective
+         classified = "-hyp-"
+       end
+     elsif /\A\W+\z/o =~ word                      # Symbol
+       classified = "-sym-"
+     elsif word == ucfirst(word)                   # Capitalized word
+       classified = "-cap-"
+     elsif /ing\z/o =~ word                        # Ends in 'ing'
+       classified = "-ing-"
+     elsif /s\z/o =~ word                          # Ends in 's'
+       classified = "-s-"
+     elsif /tion\z/o =~ word                       # Ends in 'tion'
+       classified = "-tion-"
+     elsif /ly\z/o =~ word                         # Ends in 'ly'
+       classified = "-ly-"
+     elsif /ed\z/o =~ word                         # Ends in 'ed'
+       classified = "-ed-"
+     else
+       classified = "-unknown-"                    # Completely unknown
+     end
+     return classified
+   end
+
+   # This returns a compiled regexp for extracting maximal noun phrases
+   # from a POS-tagged text.
+   def get_max_noun_regex
+     regex = /
+       # optional number, followed by any gerunds, adjectives, or participles
+       (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
+       # followed by one or more nouns
+       (?:#{NN})+
+       (?:
+         # optional preposition, determinant, cardinal
+         (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
+         # optional gerunds, adjectives, or participles
+         (?:#{GER}|#{ADJ}|#{PART})*
+         # one or more nouns
+         (?:#{NN})+
+       )*
+     /xo
+     return regex
+   end
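+   # For illustration: against the tagged string
+   # "<cd>two</cd> <jj>big</jj> <nns>dogs</nns>" the pattern matches the
+   # whole span as a single maximal noun phrase.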
+
+   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+   # YAML data parser. It will load a YAML document with a collection of key:
+   # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
+   # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
+   def load_tags(lexicon)
+     path = File.join($lexpath, lexicon)
+     fh = File.open(path, 'r')
+     while line = fh.gets
+       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
+       next unless $1 and $2
+       key, data = $1, $2
+       items = data.split(/,\s+/)
+       pairs = {}
+       items.each do |i|
+         /([^:]+):\s*(.+)/ =~ i
+         pairs[$1] = $2.to_f
+       end
+       @@hmm[key] = pairs
+     end
+     fh.close
+   end
+
+   # Load the word lexicon into a hash from YAML data: This is a naive (but fast)
+   # YAML data parser. It will load a YAML document with a collection of key:
+   # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
+   # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
+   def load_words(lexicon)
+     path = File.join($lexpath, lexicon)
+     fh = File.open(path, 'r')
+     while line = fh.gets
+       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
+       next unless $1 and $2
+       key, data = $1, $2
+       items = data.split(/,\s+/)
+       pairs = {}
+       items.each do |i|
+         /([^:]+):\s*(.+)/ =~ i
+         pairs[$1] = $2.to_f
+       end
+       @@lexicon[key] = pairs
+     end
+     fh.close
+   end
+
+   # Memoize the stem and assign_tag methods
+   memoize("stem")
+   memoize("assign_tag")
+ end
+