engtagger 0.1.1 → 0.2.2

data/lib/engtagger.rb CHANGED
@@ -1,729 +1,831 @@
- #! /local/ruby/bin/ruby
-
- $LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
- require 'rubygems'
- require 'kconv'
- require 'porter'
- # use hpricot for extracting English text from docs with XML like tags
- begin
-   require 'hpricot'
- rescue LoadError
-   $no_hpricot = true
- end
-
- # File paths
- $lexpath = File.join(File.dirname(__FILE__), 'engtagger')
- $word_path = File.join($lexpath, "pos_words.hash")
- $tag_path = File.join($lexpath, "pos_tags.hash")
-
- # for memoization (code snippet from http://eigenclass.org/hiki/bounded-space-memoization)
- class Module
-   def memoize(method)
-     # alias_method is faster than define_method + old.bind(self).call
-     alias_method "__memoized__#{method}", method
-     module_eval <<-EOF
-       def #{method}(*a, &b)
-         # assumes the block won't change the result if the args are the same
-         (@__memoized_#{method}_cache ||= {})[a] ||= __memoized__#{method}(*a, &b)
-       end
-     EOF
-   end
- end
-
- # English part-of-speech tagger class
- class EngTagger
-   VERSION = '0.1.1'
-
-   #################
-   # Class methods #
-   #################
-
-   # Return a class variable that holds probability data
-   def self.hmm
-     return @@hmm
-   end
-
-   # Return a class variable that holds lexical data
-   def self.lexicon
-     return @@lexicon
-   end
-
-   # Return a regexp from a string argument that matches an XML-style pos tag
-   def self.get_ext(tag = nil)
-     return nil unless tag
-     return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
-   end
-
-   # Regexps to match XML-style part-of-speech tags
-   NUM = get_ext('cd')
-   GER = get_ext('vbg')
-   ADJ = get_ext('jj[rs]*')
-   PART = get_ext('vbn')
-   NN = get_ext('nn[sp]*')
-   NNP = get_ext('nnp')
-   PREP = get_ext('in')
-   DET = get_ext('det')
-   PAREN = get_ext('[lr]rb')
-   QUOT = get_ext('ppr')
-   SEN = get_ext('pp')
-   WORD = get_ext('\w+')
-
-   # Convert a Treebank-style, abbreviated tag into verbose definitions
-   def self.explain_tag(tag)
-     if TAGS[tag]
-       return TAGS[tag]
-     else
-       return tag
-     end
-   end
-
-   # The following is to make a hash to convert a pos tag to its definition,
-   # used by the explain_tag method
-   tags = [
-     "CC", "Conjunction, coordinating",
-     "CD", "Adjective, cardinal number",
-     "DET", "Determiner",
-     "EX", "Pronoun, existential there",
-     "FW", "Foreign words",
-     "IN", "Preposition / Conjunction",
-     "JJ", "Adjective",
-     "JJR", "Adjective, comparative",
-     "JJS", "Adjective, superlative",
-     "LS", "Symbol, list item",
-     "MD", "Verb, modal",
-     "NN", "Noun",
-     "NNP", "Noun, proper",
-     "NNPS", "Noun, proper, plural",
-     "NNS", "Noun, plural",
-     "PDT", "Determiner, prequalifier",
-     "POS", "Possessive",
-     "PRP", "Determiner, possessive second",
-     "PRPS", "Determiner, possessive",
-     "RB", "Adverb",
-     "RBR", "Adverb, comparative",
-     "RBS", "Adverb, superlative",
-     "RP", "Adverb, particle",
-     "SYM", "Symbol",
-     "TO", "Preposition",
-     "UH", "Interjection",
-     "VB", "Verb, infinitive",
-     "VBD", "Verb, past tense",
-     "VBG", "Verb, gerund",
-     "VBN", "Verb, past/passive participle",
-     "VBP", "Verb, base present form",
-     "VBZ", "Verb, present 3SG -s form",
-     "WDT", "Determiner, question",
-     "WP", "Pronoun, question",
-     "WPS", "Determiner, possessive & question",
-     "WRB", "Adverb, question",
-     "PP", "Punctuation, sentence ender",
-     "PPC", "Punctuation, comma",
-     "PPD", "Punctuation, dollar sign",
-     "PPL", "Punctuation, quotation mark left",
-     "PPR", "Punctuation, quotation mark right",
-     "PPS", "Punctuation, colon, semicolon, ellipsis",
-     "LRB", "Punctuation, left bracket",
-     "RRB", "Punctuation, right bracket"
-   ]
-   tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
-   tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
-   TAGS = Hash[*tags]
-
-   # Hash storing config values:
-   #
-   # * :unknown_word_tag
-   #   => (String) Tag to assign to unknown words
-   # * :stem
-   #   => (Boolean) Stem single words using Porter module
-   # * :weight_noun_phrases
-   #   => (Boolean) When returning occurrence counts for a noun phrase, multiply
-   #      the value by the number of words in the NP.
-   # * :longest_noun_phrase
-   #   => (Integer) Will ignore noun phrases longer than this threshold. This
-   #      affects only the get_words() and get_nouns() methods.
-   # * :relax
-   #   => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
-   #      uncommon words, particularly words used polysemously
-   # * :tag_lex
-   #   => (String) Name of the YAML file containing a hash of adjacent part of
-   #      speech tags and the probability of each
-   # * :word_lex
-   #   => (String) Name of the YAML file containing a hash of words and corresponding
-   #      parts of speech
-   # * :unknown_lex
-   #   => (String) Name of the YAML file containing a hash of tags for unknown
-   #      words and corresponding parts of speech
-   # * :tag_path
-   #   => (String) Directory path of tag_lex
-   # * :word_path
-   #   => (String) Directory path of word_lex and unknown_lex
-   # * :debug
-   #   => (Boolean) Print debug messages
-   attr_accessor :conf
-
-   ###############
-   # Constructor #
-   ###############
-
-   # Take a hash of parameters that override default values.
-   # See above for details.
-   def initialize(params = {})
-     @conf = Hash.new
-     @conf[:unknown_word_tag] = ''
-     @conf[:stem] = false
-     @conf[:weight_noun_phrases] = false
-     @conf[:longest_noun_phrase] = 5
-     @conf[:relax] = false
-     @conf[:tag_lex] = 'tags.yml'
-     @conf[:word_lex] = 'words.yml'
-     @conf[:unknown_lex] = 'unknown.yml'
-     @conf[:word_path] = $word_path
-     @conf[:tag_path] = $tag_path
-     @conf[:debug] = false
-     # assuming that we start analyzing from the beginning of a new sentence...
-     @conf[:current_tag] = 'pp'
-     @conf.merge(params) if params
-     unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
-       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
-       @@hmm = Hash.new
-       @@lexicon = Hash.new
-     else
-       lexf = File.open(@conf[:word_path], 'r')
-       @@lexicon = Marshal.load(lexf)
-       lexf.close
-       hmmf = File.open(@conf[:tag_path], 'r')
-       @@hmm = Marshal.load(hmmf)
-       hmmf.close
-     end
-     @@mnp = get_max_noun_regex
-   end
-
-   ##################
-   # Public methods #
-   ##################
-
-   # Examine the string provided and return it fully tagged in XML style
-   def add_tags(text, verbose = false)
-     return nil unless valid_text(text)
-     tagged = []
-     words = clean_text(text)
-     tags = Array.new
-     words.each do |word|
-       cleaned_word = clean_word(word)
-       tag = assign_tag(@conf[:current_tag], cleaned_word)
-       @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
-       tag = EngTagger.explain_tag(tag) if verbose
-       tagged << '<' + tag + '>' + word + '</' + tag + '>'
-     end
-     reset
-     return tagged.join(' ')
-   end
-
-   # Given a text string, return as many nouns and noun phrases as possible.
-   # Applies add_tags and involves three stages:
-   #
-   # * Tag the text
-   # * Extract all the maximal noun phrases
-   # * Recursively extract all noun phrases from the MNPs
-   #
-   def get_words(text)
-     return false unless valid_text(text)
-     tagged = add_tags(text)
-     if(@conf[:longest_noun_phrase] <= 1)
-       return get_nouns(tagged)
-     else
-       return get_noun_phrases(tagged)
-     end
-   end
-
-   # Return an easy-on-the-eyes tagged version of a text string.
-   # Applies add_tags and reformats to be easier to read.
-   def get_readable(text, verbose = false)
-     return nil unless valid_text(text)
-     tagged = add_tags(text, verbose)
-     tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
-       $1 + '/' + $2.upcase
-     end
-     return tagged
-   end
-
-   # Return an array of sentences (without POS tags) from a text.
-   def get_sentences(text)
-     return nil unless valid_text(text)
-     tagged = add_tags(text)
-     sentences = Array.new
-     tagged.split(/<\/pp>/).each do |line|
-       sentences << strip_tags(line)
-     end
-     sentences = sentences.map do |sentence|
-       sentence.gsub(Regexp.new(" ('s?) ")){$1 + ' '}
-       sentence.gsub(Regexp.new(" (\W+) ")){$1 + ' '}
-       sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
-       sentence.gsub(Regexp.new(" (\W+)$")){$1}
-       sentence.gsub(Regexp.new("^(`+) ")){$1}
-     end
-     return sentences
-   end
-
-   # Given a POS-tagged text, this method returns a hash of all proper nouns
-   # and their occurrence frequencies. The method is greedy and will
-   # return multi-word phrases, if possible, so it would find ``Linguistic
-   # Data Consortium'' as a single unit, rather than as three individual
-   # proper nouns. This method does not stem the found words.
-   def get_proper_nouns(tagged)
-     return nil unless valid_text(tagged)
-     trimmed = tagged.scan(NNP).map do |n|
-       strip_tags(n)
-     end
-     nnp = Hash.new(0)
-     trimmed.each do |n|
-       next unless n.length < 100 # sanity check on word length
-       nnp[n] += 1 unless n =~ /\A\s*\z/
-     end
-     # Now for some fancy resolution stuff...
-     nnp.keys.each do |key|
-       words = key.split(/\s/)
-       # Let's say this is an organization's name --
-       # (and it's got at least three words)
-       # is there a corresponding acronym in this hash?
-       if words.length > 2
-         # Make a (naive) acronym out of this name
-         acronym = words.map do |word|
-           /\A([a-z])[a-z]*\z/ =~ word
-           $1
-         end.join ''
-         # If that acronym has been seen,
-         # remove it and add the values to
-         # the full name
-         if nnp[acronym]
-           nnp[key] += nnp[acronym]
-           nnp.delete(acronym)
-         end
-       end
-     end
-     return nnp
-   end
-
-   # Given a POS-tagged text, this method returns all nouns and their
-   # occurrence frequencies.
-   def get_nouns(tagged)
-     return nil unless valid_text(tagged)
-     NN
-     trimmed = tagged.scan(NN).map do |n|
-       strip_tags(n)
-     end
-     ret = Hash.new(0)
-     trimmed.each do |n|
-       n = stem(n)
-       next unless n.length < 100 # sanity check on word length
-       ret[n] += 1 unless n =~ /\A\s*\z/
-     end
-     return ret
-   end
-
-   # Given a POS-tagged text, this method returns only the maximal noun phrases.
-   # May be called directly, but is also used by get_noun_phrases
-   def get_max_noun_phrases(tagged)
-     return unless valid_text(tagged)
-     mn_phrases = tagged.scan(@@mnp).map do |m|
-       strip_tags(m)
-     end
-     ret = Hash.new(0)
-     mn_phrases.each do |p|
-       p = stem(p) unless p =~ /\s/ # stem single words
-       ret[p] += 1 unless p =~ /\A\s*\z/
-     end
-     return ret
-   end
-
-   # Similar to get_words, but requires a POS-tagged text as an argument.
-   def get_noun_phrases(tagged)
-     return nil unless valid_text(tagged)
-     found = Hash.new(0)
-     phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
-     scanned = tagged.scan(@@mnp)
-     # Find MNPs in the text, one sentence at a time
-     # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
-     mn_phrases = []
-     scanned.each do |m|
-       found[m] += 1 if phrase_ext =~ m
-       mn_phrases += m.split(phrase_ext)
-     end
-     mn_phrases.each do |mnp|
-       # Split the phrase into an array of words, and create a loop for each word,
-       # shortening the phrase by removing the word in the first position.
-       # Record the phrase and any single nouns that are found
-       words = mnp.split
-       words.length.times do |i|
-         found[words.join(' ')] += 1 if words.length > 1
-         w = words.shift
-         found[w] += 1 if w =~ /#{NN}/
-       end
-     end
-     ret = Hash.new(0)
-     found.keys.each do |f|
-       k = strip_tags(f)
-       v = found[f]
-       # We weight by the word count to favor long noun phrases
-       space_count = k.scan(/\s+/)
-       word_count = space_count.length + 1
-       # Throttle MNPs if necessary
-       next if word_count > @conf[:longest_noun_phrase]
-       k = stem(k) unless word_count > 1 # stem single words
-       multiplier = 1
-       multiplier = word_count if @conf[:weight_noun_phrases]
-       ret[k] += multiplier * v
-     end
-     return ret
-   end
-
-   # Reads some included corpus data and saves it in a stored hash on the
-   # local file system. This is called automatically if the tagger can't
-   # find the stored lexicon.
-   def install
-     puts "Creating part-of-speech lexicon" if @conf[:debug]
-     load_tags(@conf[:tag_lex])
-     load_words(@conf[:word_lex])
-     load_words(@conf[:unknown_lex])
-     File.open(@conf[:word_path], 'w') do |f|
-       Marshal.dump(@@lexicon, f)
-     end
-     File.open(@conf[:tag_path], 'w') do |f|
-       Marshal.dump(@@hmm, f)
-     end
-   end
-
-   ###################
-   # Private methods #
-   ###################
-
-   :private
-
-   # Downcase the first letter of word
-   def lcfirst(word)
-     word.split(//)[0].downcase + word.split(//)[1..-1].join
-   end
-
-   # Upcase the first letter of word
-   def ucfirst(word)
-     word.split(//)[0].upcase + word.split(//)[1..-1].join
-   end
-
-   # Return the word stem as given by Stemmable module. This can be
-   # turned off with the class parameter @conf[:stem] => false.
-   def stem(word)
-     return word unless @conf[:stem]
-     return word.stem
-   end
-
-   # This method will reset the preceding tag to a sentence ender (PP).
-   # This prepares the first word of a new sentence to be tagged correctly.
-   def reset
-     @conf[:current_tag] = 'pp'
-   end
-
-   # Check whether the text is a valid string
-   def valid_text(text)
-     if !text
-       # there's nothing to parse
-       "method call on uninitialized variable" if @conf[:debug]
-       return false
-     elsif /\A\s*\z/ =~ text
-       # text is an empty string, nothing to parse
-       return false
-     else
-       # $text is valid
-       return true
-     end
-   end
-
-   # Return a text string with the part-of-speech tags removed
-   def strip_tags(tagged, downcase = false)
-     return nil unless valid_text(tagged)
-     text = tagged.gsub(/<[^>]+>/m, "")
-     text = text.gsub(/\s+/m, " ")
-     text = text.gsub(/\A\s*/, "")
-     text = text.gsub(/\s*\z/, "")
-     if downcase
-       return text.downcase
-     else
-       return text
-     end
-   end
-
-   # Strip the provided text of HTML-style tags and separate off any punctuation
-   # in preparation for tagging
-   def clean_text(text)
-     return false unless valid_text(text)
-     text = text.toutf8
-     unless $no_hpricot
-       # Strip out any markup and convert entities to their proper form
-       cleaned_text = Hpricot(text).inner_text
-     else
-       cleaned_text = text
-     end
-     tokenized = []
-     # Tokenize the text (splitting on punctuation as you go)
-     cleaned_text.split(/\s+/).each do |line|
-       tokenized += split_punct(line)
-     end
-     words = split_sentences(tokenized)
-     return words
-   end
-
-   # This handles all of the trailing periods, keeping those that
-   # belong on abbreviations and removing those that seem to be
-   # at the end of sentences. This method makes some assumptions
-   # about the use of capitalization in the incoming text
-   def split_sentences(array)
-     tokenized = array
-     people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
-                 supt det mssrs rev)
-     army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
-     inst = %w(dept univ assn bros ph.d)
-     place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
-                hwy hway la pde pd plz pl rd st tce)
-     comp = %w(mfg inc ltd co corp)
-     state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
-                ind ia kans kan ken ky la me md is mass mich minn miss mo mont
-                neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
-                va wash wis wisc wy wyo usafa alta man ont que sask yuk)
-     month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
-     misc = %w(vs etc no esp)
-     abbr = Hash.new
-     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
-       abbr[i] = true
-     end
-     words = Array.new
-     tokenized.each_with_index do |t, i|
-       if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
-         w = $1
-         # Don't separate the period off words that
-         # meet any of the following conditions:
-         #
-         # 1. It is defined in one of the lists above
-         # 2. It is only one letter long: Alfred E. Sloan
-         # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
-         unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
-           words << w
-           words << '.'
-           next
-         end
-       end
-       words << tokenized[i]
-     end
-     # If the final word ends in a period...
-     if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
-       words[-1] = $1
-       words.push '.'
-     end
-     return words
-   end
-
-   # Separate punctuation from words, where appropriate. This leaves trailing
-   # periods in place to be dealt with later. Called by the clean_text method.
-   def split_punct(text)
-     # If there's no punctuation, return immediately
-     return [text] if /\A\w+\z/ =~ text
-     # Sanity checks
-     text = text.gsub(/\W{10,}/o, " ")
-
-     # Put quotes into a standard format
-     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
-     text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
-     text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
-     text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
-     text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
-
-     # Handle all other punctuation
-     text = text.gsub(/--+/o, " - ") # Convert and separate dashes
-     text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
-     text = text.gsub(/:/o, " :") # Shift colons off
-     text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
-     text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
-     text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
-
-     # English-specific contractions
-     text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
-     text = text.gsub(/n't\b/o, " n't") # Separate off n't
-     text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
-     result = text.split(' ')
-     return result
-   end
-
-   # Given a preceding tag, assign a tag to a word. Called by the add_tags method.
-   # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
-   def assign_tag(prev_tag, word)
-     if word == "-unknown-"
-       # classify unknown words accordingly
-       return @conf[:unknown_word_tag]
-     elsif word == "-sym-"
-       # If this is a symbol, tag it as a symbol
-       return "sym"
-     end
-     best_so_far = 0
-     w = @@lexicon[word]
-     t = @@hmm
-
-     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
-     # which is used in most POS taggers
-     best_tag = ""
-     t[prev_tag].keys.each do |tag|
-       # With @config[:relax] set, this method
-       # will also include any `open classes' of POS tags
-       pw = 0
-       if w[tag]
-         pw = w[tag]
-       elsif @conf[:relax] and tag =~ /\A(?:jj|nn|rb|vb)/
-         pw = 0
-       else
-         next
-       end
-
-       # Bayesian logic:
-       # P = P( tag | prev_tag ) * P( tag | word )
-       probability = t[prev_tag][tag] * (pw + 1)
-       # Set the tag with maximal probability
-       if probability > best_so_far
-         best_so_far = probability
-         best_tag = tag
-       end
-     end
-     return best_tag
-   end
-
-   # This method determines whether a word should be considered in its
-   # lower or upper case form. This is useful in considering proper nouns
-   # and words that begin sentences. Called by add_tags.
-   def clean_word(word)
-     lcf = lcfirst(word)
-     # seen this word as it appears (lower or upper case)
-     if @@lexicon[word]
-       return word
-     elsif @@lexicon[lcf]
-       # seen this word only as lower case
-       return lcf
-     else
-       # never seen this word. guess.
-       return classify_unknown_word(word)
-     end
-   end
-
-   # This changes any word not appearing in the lexicon to identifiable
-   # classes of words handled by a simple unknown word classification
-   # metric. Called by the clean_word method.
-   def classify_unknown_word(word)
-     if /[\(\{\[]/ =~ word # Left brackets
-       classified = "*LRB*"
-     elsif
-       /[\)\}\]]/ =~ word # Right brackets
-       classified = "*RRB*"
-     elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
-       classified = "*NUM*"
-     elsif /\A\d+[\d\/:-]+\d\z/ =~ word # Other number constructs
-       classified = "*NUM*"
-     elsif /\A-?\d+\w+\z/o =~ word # Ordinal number
-       classified = "*ORD*"
-     elsif /\A[A-Z][A-Z\.-]*\z/o =~ word # Abbreviation (all caps)
-       classified = "-abr-"
-     elsif /\w-\w/o =~ word # Hyphenated word
-       /-([^-]+)\z/ =~ word
-       h_suffix = $1
-       if h_suffix and (@@lexicon[h_suffix] and @@lexicon[h_suffix]['jj'])
-         # last part of this is defined as an adjective
-         classified = "-hyp-adj-"
-       else
-         # last part of this is not defined as an adjective
-         classified = "-hyp-"
-       end
-     elsif /\A\W+\z/o =~ word
-       classified = "-sym-" # Symbol
-     elsif word == ucfirst(word)
-       classified = "-cap-" # Capitalized word
-     elsif /ing\z/o =~ word
-       classified = "-ing-" # Ends in 'ing'
-     elsif /s\z/o =~ word
-       classified = "-s-" # Ends in 's'
-     elsif /tion\z/o =~ word
-       classified = "-tion-" # Ends in 'tion'
-     elsif /ly\z/o =~ word
-       classified = "-ly-" # Ends in 'ly'
-     elsif /ed\z/o =~ word
-       classified = "-ed-" # Ends in 'ed'
-     else
-       classified = "-unknown-" # Completely unknown
-     end
-     return classified
-   end
-
-   # This returns a compiled regexp for extracting maximal noun phrases
-   # from a POS-tagged text.
-   def get_max_noun_regex
-     regex = /
-       # optional number, gerund - adjective - participle
-       (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
-       # Followed by one or more nouns
-       (?:#{NN})+
-       (?:
-         # Optional preposition, determinant, cardinal
-         (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
-         # Optional gerund - adjective - participle
-         (?:#{GER}|#{ADJ}|#{PART})*
-         # one or more nouns
-         (?:#{NN})+
-       )*
-     /xo #/
-     return regex
-   end
-
-   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
-   # YAML data parser. It will load a YAML document with a collection of key:
-   # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
-   # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
-   def load_tags(lexicon)
-     path = File.join($lexpath, lexicon)
-     fh = File.open(path, 'r')
-     while line = fh.gets
-       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-       next unless $1 and $2
-       key, data = $1, $2
-       tags = Hash.new
-       items = data.split(/,\s+/)
-       pairs = {}
-       items.each do |i|
-         /([^:]+):\s*(.+)/ =~ i
-         pairs[$1] = $2.to_f
-       end
-       @@hmm[key] = pairs
-     end
-     fh.close
-   end
-
-   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
-   # YAML data parser. It will load a YAML document with a collection of key:
-   # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
-   # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
-   def load_words(lexicon)
-     path = File.join($lexpath, lexicon)
-     fh = File.open(path, 'r')
-     while line = fh.gets
-       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
-       next unless $1 and $2
-       key, data = $1, $2
-       tags = Hash.new
-       items = data.split(/,\s+/)
-       pairs = {}
-       items.each do |i|
-         /([^:]+):\s*(.+)/ =~ i
-         pairs[$1] = $2.to_f
-       end
-       @@lexicon[key] = pairs
-     end
-     fh.close
-   end
-
-   # memoize the stem and assign_tag methods
-   memoize("stem")
-   memoize("assign_tag")
- end
-
+ #!/usr/bin/env ruby
+ # -*- coding: utf-8 -*-
+
+ $LOAD_PATH << File.join(File.dirname(__FILE__), 'engtagger')
+ require 'rubygems'
+ require 'kconv'
+ require 'porter'
+
+ # use hpricot for extracting English text from docs with XML like tags
+ begin
+   require 'hpricot'
+ rescue LoadError
+   $no_hpricot = true
+ end
+
+ # File paths
+ $lexpath = File.join(File.dirname(__FILE__), 'engtagger')
+ $word_path = File.join($lexpath, "pos_words.hash")
+ $tag_path = File.join($lexpath, "pos_tags.hash")
+
+ # for memoization (code snippet from http://eigenclass.org/hiki/bounded-space-memoization)
+ class Module
+   def memoize(method)
+     # alias_method is faster than define_method + old.bind(self).call
+     alias_method "__memoized__#{method}", method
+     module_eval <<-EOF
+       def #{method}(*a, &b)
+         # assumes the block won't change the result if the args are the same
+         (@__memoized_#{method}_cache ||= {})[a] ||= __memoized__#{method}(*a, &b)
+       end
+     EOF
+   end
+ end
+
+ # English part-of-speech tagger class
+ class EngTagger
+
+   #################
+   # Class methods #
+   #################
+
+   # Return a class variable that holds probability data
+   def self.hmm
+     return @@hmm
+   end
+
+   # Return a class variable that holds lexical data
+   def self.lexicon
+     return @@lexicon
+   end
+
+   # Return a regexp from a string argument that matches an XML-style pos tag
+   def self.get_ext(tag = nil)
+     return nil unless tag
+     return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
+   end
+
+   # Regexps to match XML-style part-of-speech tags
+   NUM = get_ext('cd')
+   GER = get_ext('vbg')
+   ADJ = get_ext('jj[rs]*')
+   NN = get_ext('nn[sp]*')
+   NNP = get_ext('nnp')
+   PREP = get_ext('in')
+   DET = get_ext('det')
+   PAREN = get_ext('[lr]rb')
+   QUOT = get_ext('ppr')
+   SEN = get_ext('pp')
+   WORD = get_ext('\w+')
+   VB = get_ext('vb')
+   VBG = get_ext('vbg')
+   VBD = get_ext('vbd')
+   PART = get_ext('vbn')
+   VBP = get_ext('vbp')
+   VBZ = get_ext('vbz')
+   JJ = get_ext('jj')
+   JJR = get_ext('jjr')
+   JJS = get_ext('jjs')
+   RB = get_ext('rb')
+   RBR = get_ext('rbr')
+   RBS = get_ext('rbs')
+   RP = get_ext('rp')
+   WRB = get_ext('wrb')
+   WDT = get_ext('wdt')
+   WP = get_ext('wp')
+   WPS = get_ext('wps')
+   CC = get_ext('cc')
+   IN = get_ext('in')
+
+   # Convert a Treebank-style, abbreviated tag into verbose definitions
+   def self.explain_tag(tag)
+     if TAGS[tag]
+       return TAGS[tag]
+     else
+       return tag
+     end
+   end
+
+   # The following is to make a hash to convert a pos tag to its definition,
+   # used by the explain_tag method
+   tags = [
+     "CC", "Conjunction, coordinating",
+     "CD", "Adjective, cardinal number",
+     "DET", "Determiner",
+     "EX", "Pronoun, existential there",
+     "FW", "Foreign words",
+     "IN", "Preposition / Conjunction",
+     "JJ", "Adjective",
+     "JJR", "Adjective, comparative",
+     "JJS", "Adjective, superlative",
+     "LS", "Symbol, list item",
+     "MD", "Verb, modal",
+     "NN", "Noun",
+     "NNP", "Noun, proper",
+     "NNPS", "Noun, proper, plural",
+     "NNS", "Noun, plural",
+     "PDT", "Determiner, prequalifier",
+     "POS", "Possessive",
+     "PRP", "Determiner, possessive second",
+     "PRPS", "Determiner, possessive",
+     "RB", "Adverb",
+     "RBR", "Adverb, comparative",
+     "RBS", "Adverb, superlative",
+     "RP", "Adverb, particle",
+     "SYM", "Symbol",
+     "TO", "Preposition",
+     "UH", "Interjection",
+     "VB", "Verb, infinitive",
+     "VBD", "Verb, past tense",
+     "VBG", "Verb, gerund",
+     "VBN", "Verb, past/passive participle",
+     "VBP", "Verb, base present form",
+     "VBZ", "Verb, present 3SG -s form",
+     "WDT", "Determiner, question",
+     "WP", "Pronoun, question",
+     "WPS", "Determiner, possessive & question",
+     "WRB", "Adverb, question",
+     "PP", "Punctuation, sentence ender",
+     "PPC", "Punctuation, comma",
+     "PPD", "Punctuation, dollar sign",
+     "PPL", "Punctuation, quotation mark left",
+     "PPR", "Punctuation, quotation mark right",
+     "PPS", "Punctuation, colon, semicolon, ellipsis",
+     "LRB", "Punctuation, left bracket",
+     "RRB", "Punctuation, right bracket"
+   ]
+   tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
+   tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
+   TAGS = Hash[*tags]
+
+   # Hash storing config values:
+   #
+   # * :unknown_word_tag
+   #   => (String) Tag to assign to unknown words
+   # * :stem
+   #   => (Boolean) Stem single words using Porter module
+   # * :weight_noun_phrases
+   #   => (Boolean) When returning occurrence counts for a noun phrase, multiply
+   #      the value by the number of words in the NP.
+   # * :longest_noun_phrase
+   #   => (Integer) Will ignore noun phrases longer than this threshold. This
+   #      affects only the get_words() and get_nouns() methods.
+   # * :relax
+   #   => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
+   #      uncommon words, particularly words used polysemously
+   # * :tag_lex
+   #   => (String) Name of the YAML file containing a hash of adjacent part of
+   #      speech tags and the probability of each
+   # * :word_lex
+   #   => (String) Name of the YAML file containing a hash of words and corresponding
+   #      parts of speech
+   # * :unknown_lex
+   #   => (String) Name of the YAML file containing a hash of tags for unknown
+   #      words and corresponding parts of speech
+   # * :tag_path
+   #   => (String) Directory path of tag_lex
+   # * :word_path
+   #   => (String) Directory path of word_lex and unknown_lex
+   # * :debug
+   #   => (Boolean) Print debug messages
+   attr_accessor :conf
+
+   ###############
+   # Constructor #
+   ###############
+
+   # Take a hash of parameters that override default values.
+   # See above for details.
+   def initialize(params = {})
+     @conf = Hash.new
+     @conf[:unknown_word_tag] = ''
+     @conf[:stem] = false
+     @conf[:weight_noun_phrases] = false
+     @conf[:longest_noun_phrase] = 5
+     @conf[:relax] = false
+     @conf[:tag_lex] = 'tags.yml'
+     @conf[:word_lex] = 'words.yml'
+     @conf[:unknown_lex] = 'unknown.yml'
+     @conf[:word_path] = $word_path
+     @conf[:tag_path] = $tag_path
+     @conf[:debug] = false
+     # assuming that we start analyzing from the beginning of a new sentence...
+     @conf[:current_tag] = 'pp'
+     @conf.merge!(params)
+     unless File.exist?(@conf[:word_path]) and File.exist?(@conf[:tag_path])
+       print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
+       @@hmm = Hash.new
+       @@lexicon = Hash.new
+     else
+       lexf = File.open(@conf[:word_path], 'r')
+       @@lexicon = Marshal.load(lexf)
+       lexf.close
+       hmmf = File.open(@conf[:tag_path], 'r')
+       @@hmm = Marshal.load(hmmf)
+       hmmf.close
+     end
+     @@mnp = get_max_noun_regex
+   end
+
+   ##################
+   # Public methods #
+   ##################
+
+   # Examine the string provided and return it fully tagged in XML style
+   def add_tags(text, verbose = false)
+     return nil unless valid_text(text)
+     tagged = []
+     words = clean_text(text)
+     tags = Array.new
+     words.each do |word|
+       cleaned_word = clean_word(word)
+       tag = assign_tag(@conf[:current_tag], cleaned_word)
+       @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
+       tag = EngTagger.explain_tag(tag) if verbose
+       tagged << '<' + tag + '>' + word + '</' + tag + '>'
+     end
+     reset
+     return tagged.join(' ')
+   end
+
+   # Given a text string, return as many nouns and noun phrases as possible.
+   # Applies add_tags and involves three stages:
+   #
+   # * Tag the text
+   # * Extract all the maximal noun phrases
+   # * Recursively extract all noun phrases from the MNPs
+   #
+   def get_words(text)
+     return false unless valid_text(text)
+     tagged = add_tags(text)
+     if(@conf[:longest_noun_phrase] <= 1)
+       return get_nouns(tagged)
+     else
+       return get_noun_phrases(tagged)
+     end
+   end
+
+   # Return an easy-on-the-eyes tagged version of a text string.
+   # Applies add_tags and reformats to be easier to read.
+   def get_readable(text, verbose = false)
+     return nil unless valid_text(text)
+     tagged = add_tags(text, verbose)
+     tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
+       $1 + '/' + $2.upcase
+     end
+     return tagged
+   end
+
+   # Return an array of sentences (without POS tags) from a text.
+   def get_sentences(text)
+     return nil unless valid_text(text)
+     tagged = add_tags(text)
+     sentences = Array.new
+     tagged.split(/<\/pp>/).each do |line|
+       sentences << strip_tags(line)
+     end
+     sentences = sentences.map do |sentence|
+       sentence.gsub(Regexp.new(" ('s?) ")){$1 + ' '}
+       sentence.gsub(Regexp.new(" (\W+) ")){$1 + ' '}
+       sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
+       sentence.gsub(Regexp.new(" (\W+)$")){$1}
+       sentence.gsub(Regexp.new("^(`+) ")){$1}
+     end
+     return sentences
+   end
+
+   # Given a POS-tagged text, this method returns a hash of all proper nouns
+   # and their occurrence frequencies. The method is greedy and will
+   # return multi-word phrases, if possible, so it would find ``Linguistic
+   # Data Consortium'' as a single unit, rather than as three individual
+   # proper nouns. This method does not stem the found words.
+   def get_proper_nouns(tagged)
+     return nil unless valid_text(tagged)
+     tags = [NNP]
+     nnp = build_matches_hash(build_trimmed(tagged, tags))
+     # Now for some fancy resolution stuff...
+     nnp.keys.each do |key|
+       words = key.split(/\s/)
+       # Let's say this is an organization's name --
+       # (and it's got at least three words)
+       # is there a corresponding acronym in this hash?
+       if words.length > 2
+         # Make a (naive) acronym out of this name
+         acronym = words.map do |word|
+           /\A([a-z])[a-z]*\z/ =~ word
+           $1
+         end.join ''
+         # If that acronym has been seen,
+         # remove it and add the values to
+         # the full name
+         if nnp[acronym]
+           nnp[key] += nnp[acronym]
+           nnp.delete(acronym)
+         end
+       end
+     end
+     return nnp
+   end
+
+   # Given a POS-tagged text, this method returns all nouns and their
+   # occurrence frequencies.
+   def get_nouns(tagged)
+     return nil unless valid_text(tagged)
+     tags = [NN]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   # Returns all types of verbs and does not discriminate between the various kinds.
+   # Is the combination of all other verb methods listed in this class.
+   def get_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [VB, VBD, VBG, PART, VBP, VBZ]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_infinitive_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [VB]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_past_tense_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [VBD]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_gerund_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [VBG]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_passive_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [PART]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_base_present_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [VBP]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_present_verbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [VBZ]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_adjectives(tagged)
+     return nil unless valid_text(tagged)
+     tags = [JJ]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_comparative_adjectives(tagged)
+     return nil unless valid_text(tagged)
+     tags = [JJR]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_superlative_adjectives(tagged)
+     return nil unless valid_text(tagged)
+     tags = [JJS]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_adverbs(tagged)
+     return nil unless valid_text(tagged)
+     tags = [RB, RBR, RBS, RP]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   def get_interrogatives(tagged)
+     return nil unless valid_text(tagged)
+     tags = [WRB, WDT, WP, WPS]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+   # To be consistent with the documentation's naming of 'interrogative' parts of speech as 'question'
+   alias_method :get_question_parts, :get_interrogatives
+
+   # Returns all types of conjunctions and does not discriminate between the various kinds.
+   # E.g. coordinating, subordinating, correlative...
+   def get_conjunctions(tagged)
+     return nil unless valid_text(tagged)
+     tags = [CC, IN]
+     build_matches_hash(build_trimmed(tagged, tags))
+   end
+
+   # Given a POS-tagged text, this method returns only the maximal noun phrases.
+   # May be called directly, but is also used by get_noun_phrases
+   def get_max_noun_phrases(tagged)
+     return nil unless valid_text(tagged)
+     tags = [@@mnp]
+     mn_phrases = build_trimmed(tagged, tags)
+     ret = Hash.new(0)
+     mn_phrases.each do |p|
+       p = stem(p) unless p =~ /\s/ # stem single words
+       ret[p] += 1 unless p =~ /\A\s*\z/
+     end
+     return ret
+   end
+
+   # Similar to get_words, but requires a POS-tagged text as an argument.
+   def get_noun_phrases(tagged)
+     return nil unless valid_text(tagged)
+     found = Hash.new(0)
+     phrase_ext = /(?:#{PREP}|#{DET}|#{NUM})+/xo
+     scanned = tagged.scan(@@mnp)
+     # Find MNPs in the text, one sentence at a time
+     # Record and split if the phrase is extended by a (?:PREP|DET|NUM)
+     mn_phrases = []
+     scanned.each do |m|
+       found[m] += 1 if phrase_ext =~ m
+       mn_phrases += m.split(phrase_ext)
+     end
+     mn_phrases.each do |mnp|
+       # Split the phrase into an array of words, and create a loop for each word,
+       # shortening the phrase by removing the word in the first position.
+       # Record the phrase and any single nouns that are found
+       words = mnp.split
+       words.length.times do |i|
+         found[words.join(' ')] += 1 if words.length > 1
+         w = words.shift
+         found[w] += 1 if w =~ /#{NN}/
+       end
+     end
+     ret = Hash.new(0)
+     found.keys.each do |f|
+       k = strip_tags(f)
+       v = found[f]
+       # We weight by the word count to favor long noun phrases
+       space_count = k.scan(/\s+/)
+       word_count = space_count.length + 1
+       # Throttle MNPs if necessary
+       next if word_count > @conf[:longest_noun_phrase]
+       k = stem(k) unless word_count > 1 # stem single words
+       multiplier = 1
+       multiplier = word_count if @conf[:weight_noun_phrases]
+       ret[k] += multiplier * v
+     end
+     return ret
+   end
+
+   # Reads some included corpus data and saves it in a stored hash on the
+   # local file system. This is called automatically if the tagger can't
+   # find the stored lexicon.
+   def install
+     puts "Creating part-of-speech lexicon" if @conf[:debug]
+     load_tags(@conf[:tag_lex])
+     load_words(@conf[:word_lex])
+     load_words(@conf[:unknown_lex])
+     File.open(@conf[:word_path], 'w') do |f|
+       Marshal.dump(@@lexicon, f)
+     end
+     File.open(@conf[:tag_path], 'w') do |f|
+       Marshal.dump(@@hmm, f)
+     end
+   end
+
+   ###################
+   # Private methods #
+   ###################
+
+   :private
+
+   def build_trimmed(tagged, tags)
+     tags.map { |tag| tagged.scan(tag) }.flatten.map do |n|
+       strip_tags(n)
+     end
+   end
+
+   def build_matches_hash(trimmed)
+     ret = Hash.new(0)
+     trimmed.each do |n|
+       n = stem(n)
+       next unless n.length < 100 # sanity check on word length
+       ret[n] += 1 unless n =~ /\A\s*\z/
+     end
+     ret
+   end
+
+   # Downcase the first letter of word
+   def lcfirst(word)
+     word.split(//)[0].downcase + word.split(//)[1..-1].join
+   end
+
+   # Upcase the first letter of word
+   def ucfirst(word)
+     word.split(//)[0].upcase + word.split(//)[1..-1].join
+   end
+
+   # Return the word stem as given by Stemmable module. This can be
+   # turned off with the class parameter @conf[:stem] => false.
+   def stem(word)
+     return word unless @conf[:stem]
+     return word.stem
+   end
+
+   # This method will reset the preceding tag to a sentence ender (PP).
+   # This prepares the first word of a new sentence to be tagged correctly.
+   def reset
+     @conf[:current_tag] = 'pp'
+   end
+
+   # Check whether the text is a valid string
+   def valid_text(text)
+     if !text
+       # there's nothing to parse
+       "method call on uninitialized variable" if @conf[:debug]
+       return false
+     elsif /\A\s*\z/ =~ text
+       # text is an empty string, nothing to parse
+       return false
+     else
+       # $text is valid
+       return true
+     end
+   end
+
+   # Return a text string with the part-of-speech tags removed
+   def strip_tags(tagged, downcase = false)
+     return nil unless valid_text(tagged)
+     text = tagged.gsub(/<[^>]+>/m, "")
+     text = text.gsub(/\s+/m, " ")
+     text = text.gsub(/\A\s*/, "")
+     text = text.gsub(/\s*\z/, "")
+     if downcase
+       return text.downcase
+     else
+       return text
+     end
+   end
+
+   # Strip the provided text of HTML-style tags and separate off any punctuation
+   # in preparation for tagging
+   def clean_text(text)
+     return false unless valid_text(text)
+     text = text.toutf8
+     unless $no_hpricot
+       # Strip out any markup and convert entities to their proper form
+       cleaned_text = Hpricot(text).inner_text
+     else
+       cleaned_text = text
+     end
+     tokenized = []
+     # Tokenize the text (splitting on punctuation as you go)
+     cleaned_text.split(/\s+/).each do |line|
+       tokenized += split_punct(line)
+     end
+     words = split_sentences(tokenized)
+     return words
+   end
+
+   # This handles all of the trailing periods, keeping those that
+   # belong on abbreviations and removing those that seem to be
+   # at the end of sentences. This method makes some assumptions
+   # about the use of capitalization in the incoming text
+   def split_sentences(array)
+     tokenized = array
+     people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+                 supt det mssrs rev)
+     army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
+     inst = %w(dept univ assn bros ph.d)
+     place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+                hwy hway la pde pd plz pl rd st tce)
+     comp = %w(mfg inc ltd co corp)
+     state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+                ind ia kans kan ken ky la me md is mass mich minn miss mo mont
+                neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
+                va wash wis wisc wy wyo usafa alta man ont que sask yuk)
+     month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
+     misc = %w(vs etc no esp)
+     abbr = Hash.new
+     [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
+       abbr[i] = true
+     end
+     words = Array.new
+     tokenized.each_with_index do |t, i|
+       if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
+         w = $1
+         # Don't separate the period off words that
+         # meet any of the following conditions:
+         #
+         # 1. It is defined in one of the lists above
+         # 2. It is only one letter long: Alfred E. Sloan
+         # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
+         unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
+           words << w
+           words << '.'
+           next
+         end
+       end
+       words << tokenized[i]
+     end
+     # If the final word ends in a period...
+     if words[-1] and words[-1] =~ /\A(.*\w)\.\z/
+       words[-1] = $1
+       words.push '.'
+     end
+     return words
+   end
+
+   # Separate punctuation from words, where appropriate. This leaves trailing
+   # periods in place to be dealt with later. Called by the clean_text method.
+   def split_punct(text)
+     # If there's no punctuation, return immediately
+     return [text] if /\A\w+\z/ =~ text
+     # Sanity checks
+     text = text.gsub(/\W{10,}/o, " ")
+
+     # Put quotes into a standard format
+     text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
+     text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
+     text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+     text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
+     text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
+
+     # Handle all other punctuation
+     text = text.gsub(/--+/o, " - ") # Convert and separate dashes
+     text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
+     text = text.gsub(/:/o, " :") # Shift colons off
+     text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
+     text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
+     text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation
+
+     # English-specific contractions
+     text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
+     text = text.gsub(/n't\b/o, " n't") # Separate off n't
+     text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
+     result = text.split(' ')
+     return result
+   end
+
+   # Given a preceding tag, assign a tag to a word. Called by the add_tags method.
+   # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
+   def assign_tag(prev_tag, word)
+     if word == "-unknown-"
+       # classify unknown words accordingly
+       return @conf[:unknown_word_tag]
+     elsif word == "-sym-"
+       # If this is a symbol, tag it as a symbol
+       return "sym"
+     end
+     best_so_far = 0
+     w = @@lexicon[word]
+     t = @@hmm
+
+     # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
+     # which is used in most POS taggers
+     best_tag = ""
+     t[prev_tag].keys.each do |tag|
+       # With @config[:relax] set, this method
+       # will also include any `open classes' of POS tags
+       pw = 0
+       if w[tag]
+         pw = w[tag]
+       elsif @conf[:relax] and tag =~ /\A(?:jj|nn|rb|vb)/
+         pw = 0
+       else
+         next
+       end
+
+       # Bayesian logic:
+       # P = P( tag | prev_tag ) * P( tag | word )
+       probability = t[prev_tag][tag] * (pw + 1)
+       # Set the tag with maximal probability
+       if probability > best_so_far
+         best_so_far = probability
+         best_tag = tag
+       end
+     end
+     return best_tag
+   end
+
+   # This method determines whether a word should be considered in its
+   # lower or upper case form. This is useful in considering proper nouns
+   # and words that begin sentences. Called by add_tags.
+   def clean_word(word)
+     lcf = lcfirst(word)
+     # seen this word as it appears (lower or upper case)
+     if @@lexicon[word]
+       return word
+     elsif @@lexicon[lcf]
+       # seen this word only as lower case
+       return lcf
+     else
+       # never seen this word. guess.
+       return classify_unknown_word(word)
+     end
+   end
+
+   # This changes any word not appearing in the lexicon to identifiable
+   # classes of words handled by a simple unknown word classification
+   # metric. Called by the clean_word method.
+   def classify_unknown_word(word)
+     if /[\(\{\[]/ =~ word # Left brackets
+       classified = "*LRB*"
+     elsif
+       /[\)\}\]]/ =~ word # Right brackets
+       classified = "*RRB*"
+     elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
+       classified = "*NUM*"
+     elsif /\A\d+[\d\/:-]+\d\z/ =~ word # Other number constructs
+       classified = "*NUM*"
+     elsif /\A-?\d+\w+\z/o =~ word # Ordinal number
+       classified = "*ORD*"
+     elsif /\A[A-Z][A-Z\.-]*\z/o =~ word # Abbreviation (all caps)
+       classified = "-abr-"
+     elsif /\w-\w/o =~ word # Hyphenated word
+       /-([^-]+)\z/ =~ word
+       h_suffix = $1
+       if h_suffix and (@@lexicon[h_suffix] and @@lexicon[h_suffix]['jj'])
+         # last part of this is defined as an adjective
+         classified = "-hyp-adj-"
+       else
+         # last part of this is not defined as an adjective
+         classified = "-hyp-"
+       end
+     elsif /\A\W+\z/o =~ word
+       classified = "-sym-" # Symbol
+     elsif word == ucfirst(word)
+       classified = "-cap-" # Capitalized word
+     elsif /ing\z/o =~ word
+       classified = "-ing-" # Ends in 'ing'
+     elsif /s\z/o =~ word
+       classified = "-s-" # Ends in 's'
+     elsif /tion\z/o =~ word
+       classified = "-tion-" # Ends in 'tion'
+     elsif /ly\z/o =~ word
+       classified = "-ly-" # Ends in 'ly'
+     elsif /ed\z/o =~ word
+       classified = "-ed-" # Ends in 'ed'
+     else
+       classified = "-unknown-" # Completely unknown
+     end
+     return classified
+   end
+
+   # This returns a compiled regexp for extracting maximal noun phrases
+   # from a POS-tagged text.
+   def get_max_noun_regex
+     regex = /
+       # optional number, gerund - adjective - participle
+       (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
+       # Followed by one or more nouns
+       (?:#{NN})+
+       (?:
+         # Optional preposition, determinant, cardinal
+         (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
+         # Optional gerund - adjective - participle
+         (?:#{GER}|#{ADJ}|#{PART})*
+         # one or more nouns
+         (?:#{NN})+
+       )*
+     /xo #/
+     return regex
+   end
+
+   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+   # YAML data parser. It will load a YAML document with a collection of key:
+   # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
+   # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
+   def load_tags(lexicon)
+     path = File.join($lexpath, lexicon)
+     fh = File.open(path, 'r')
+     while line = fh.gets
+       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
+       next unless $1 and $2
+       key, data = $1, $2
+       tags = Hash.new
+       items = data.split(/,\s+/)
+       pairs = {}
+       items.each do |i|
+         /([^:]+):\s*(.+)/ =~ i
+         pairs[$1] = $2.to_f
+       end
+       @@hmm[key] = pairs
+     end
+     fh.close
+   end
+
+   # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+   # YAML data parser. It will load a YAML document with a collection of key:
+   # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
+   # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
+   def load_words(lexicon)
+     path = File.join($lexpath, lexicon)
+     fh = File.open(path, 'r')
+     while line = fh.gets
+       /\A"?([^{"]+)"?: \{ (.*) \}/ =~ line
+       next unless $1 and $2
+       key, data = $1, $2
+       tags = Hash.new
+       items = data.split(/,\s+/)
+       pairs = {}
+       items.each do |i|
+         /([^:]+):\s*(.+)/ =~ i
+         pairs[$1] = $2.to_f
+       end
+       @@lexicon[key] = pairs
+     end
+     fh.close
+   end
+
+   # memoize the stem and assign_tag methods
+   memoize("stem")
+   memoize("assign_tag")
+ end
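
For orientation, here is a minimal usage sketch of the 0.2.2 surface, built only from methods visible in the diff above (the sample sentence is hypothetical, and the exact tags and counts returned depend on the lexicon bundled with the gem):

require 'engtagger'

tgr = EngTagger.new
text = "The fat cat quickly ran to catch the slow mouse."
tagged = tgr.add_tags(text) # XML-style string of <tag>word</tag> tokens

# New in 0.2.2: per-part-of-speech helpers; each returns a hash of
# token => occurrence count (built via build_trimmed/build_matches_hash)
tgr.get_verbs(tagged)            # all verb forms combined (vb, vbd, vbg, vbn, vbp, vbz)
tgr.get_past_tense_verbs(tagged) # vbd only
tgr.get_adjectives(tagged)       # jj only
tgr.get_adverbs(tagged)          # rb, rbr, rbs, rp
tgr.get_question_parts(tagged)   # alias of get_interrogatives (wrb, wdt, wp, wps)
tgr.get_conjunctions(tagged)     # cc and in

Note that these helpers expect already-tagged text rather than raw text: each one validates its input with valid_text and then scans for its tag regexps, so pass the result of add_tags, not the original string.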