engtagger 0.2.0 → 0.2.1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 47684458b8965c2c1f52d0d29ca2cad340b0ed3f
- data.tar.gz: aa6dbd0473409e9b6b2987a42c126a6b10dce096
+ metadata.gz: 8a32003a9025611e547589fcc5e8d9dd906f6540
+ data.tar.gz: dbdf70f16444fe60178b1013b72a5155a2ae66fd
  SHA512:
- metadata.gz: cddd67eab940146a2426032714aedd8e5195192ead3133ade5f76c594c5f0667f0747bba8b019f63df91dd2eb0610da19288a9e06a2323eb7bde91fec025b028
- data.tar.gz: a76ca3422b9a3a1a813263b6e4ab5e69ca74ac998afcc259be55a6441b6b46b21ac9de6eb241327956a6fda1c2d3dbd033ba1678df86506f153089a3ef99d46d
+ metadata.gz: 00da48be968ddb0b7d314df05a5ba6bc2c33376ae8d1c5fda5b60c6cd72b9b8d44b73a5c553d13b1972e199f0d0d7b4a8de957ee92a03c39216f4a87c86df8a2
+ data.tar.gz: '08842a7e26fd7579fada8c63d1af4e2176015c470a297f1147183bbeeb91e4d4c00b8062873e53d3c815ee4d19abb45ad11104f02974a1a9cbd17e2f9c25ab5c'
data/README.md CHANGED
@@ -139,6 +139,7 @@ of this Ruby library

  * Carlos Ramirez III
  * Phil London
+ * Bazay (Baron Bloomer)

  ### Acknowledgement

data/lib/engtagger.rb CHANGED
@@ -18,7 +18,7 @@ $lexpath = File.join(File.dirname(__FILE__), 'engtagger')
  $word_path = File.join($lexpath, "pos_words.hash")
  $tag_path = File.join($lexpath, "pos_tags.hash")

- # for memoization (code snipet from http://eigenclass.org/hiki/bounded-space-memoization)
+ # for memoization (code snipet from http://eigenclass.org/hiki/bounded-space-memoization)
  class Module
  def memoize(method)
  # alias_method is faster than define_method + old.bind(self).call
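For orientation: the Module#memoize helper being touched here rewrites a named method so that results are cached per argument list, the bounded-space trick from the eigenclass snippet the comment cites. A minimal usage sketch, assuming memoize(name) redefines the method with an argument-keyed cache as that snippet does; the Fibonacci class is illustrative and not part of the gem:

class Fibonacci
  def fib(n)
    n < 2 ? n : fib(n - 1) + fib(n - 2)
  end
  memoize("fib") # assumed: replaces fib with a wrapper that caches by argument
end

calc = Fibonacci.new
calc.fib(30) # computed once; the recursion and repeat calls hit the cache

EngTagger applies it to its own stem and assign_tag methods, in the memoize("stem") hunk near the end of this diff.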
@@ -38,7 +38,7 @@ class EngTagger
  #################
  # Class methods #
  #################
-
+
  # Return a class variable that holds probability data
  def self.hmm
  return @@hmm
@@ -48,13 +48,13 @@ class EngTagger
  def self.lexicon
  return @@lexicon
  end
-
- # Return a regexp from a string argument that matches an XML-style pos tag
+
+ # Return a regexp from a string argument that matches an XML-style pos tag
  def self.get_ext(tag = nil)
  return nil unless tag
  return Regexp.new("<#{tag}>[^<]+</#{tag}>\s*")
  end
-
+
  # Regexps to match XML-style part-of-speech tags
  NUM = get_ext('cd')
  GER = get_ext('vbg')
@@ -70,22 +70,32 @@ class EngTagger
  VB = get_ext('vb')
  VBG = get_ext('vbg')
  VBD = get_ext('vbd')
- PART = get_ext('vbn')
+ PART = get_ext('vbn')
  VBP = get_ext('vbp')
  VBZ = get_ext('vbz')
  JJ = get_ext('jj')
  JJR = get_ext('jjr')
  JJS = get_ext('jjs')
+ RB = get_ext('rb')
+ RBR = get_ext('rbr')
+ RBS = get_ext('rbs')
+ RP = get_ext('rp')
+ WRB = get_ext('wrb')
+ WDT = get_ext('wdt')
+ WP = get_ext('wp')
+ WPS = get_ext('wps')
+ CC = get_ext('cc')
+ IN = get_ext('in')

- # Convert a Treebank-style, abbreviated tag into verbose definitions
+ # Convert a Treebank-style, abbreviated tag into verbose definitions
  def self.explain_tag(tag)
  if TAGS[tag]
  return TAGS[tag]
  else
  return tag
  end
- end
-
+ end
+
  # The folloging is to make a hash to convert a pos tag to its definition
  # used by the explain_tag method
  tags = [
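The ten new constants reuse the existing get_ext factory: each is a regexp matching one XML-style tagged token plus any trailing whitespace. A quick sketch of what they match; the sample string is illustrative:

# get_ext('rb') builds roughly /<rb>[^<]+<\/rb>\s*/ per the definition above
tagged = "<rb>quickly</rb> <vbd>ran</vbd>"
tagged.scan(EngTagger::RB)   # => ["<rb>quickly</rb> "]  (token plus trailing space)
tagged.scan(EngTagger::VBD)  # => ["<vbd>ran</vbd>"]

These constants feed the new getters (get_adverbs, get_interrogatives, get_conjunctions) added later in this diff.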
@@ -132,35 +142,35 @@ class EngTagger
  "PPR", "Punctuation, quotation mark right",
  "PPS", "Punctuation, colon, semicolon, elipsis",
  "LRB", "Punctuation, left bracket",
- "RRB", "Punctuation, right bracket"
+ "RRB", "Punctuation, right bracket"
  ]
  tags = tags.collect{|t| t.downcase.gsub(/[\.\,\'\-\s]+/, '_')}
  tags = tags.collect{|t| t.gsub(/\&/, "and").gsub(/\//, "or")}
  TAGS = Hash[*tags]
-
+
  # Hash storing config values:
  #
  # * :unknown_word_tag
  # => (String) Tag to assign to unknown words
- # * :stem
+ # * :stem
  # => (Boolean) Stem single words using Porter module
  # * :weight_noun_phrases
- # => (Boolean) When returning occurrence counts for a noun phrase, multiply
+ # => (Boolean) When returning occurrence counts for a noun phrase, multiply
  # the valuethe number of words in the NP.
- # * :longest_noun_phrase
- # => (Integer) Will ignore noun phrases longer than this threshold. This
+ # * :longest_noun_phrase
+ # => (Integer) Will ignore noun phrases longer than this threshold. This
  # affects only the get_words() and get_nouns() methods.
- # * :relax
- # => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
+ # * :relax
+ # => (Boolean) Relax the Hidden Markov Model: this may improve accuracy for
  # uncommon words, particularly words used polysemously
  # * :tag_lex
- # => (String) Name of the YAML file containing a hash of adjacent part of
+ # => (String) Name of the YAML file containing a hash of adjacent part of
  # speech tags and the probability of each
  # * :word_lex
- # => (String) Name of the YAML file containing a hash of words and corresponding
+ # => (String) Name of the YAML file containing a hash of words and corresponding
  # parts of speech
  # * :unknown_lex
- # => (String) Name of the YAML file containing a hash of tags for unknown
+ # => (String) Name of the YAML file containing a hash of tags for unknown
  # words and corresponding parts of speech
  # * :tag_path
  # => (String) Directory path of tag_lex
@@ -169,12 +179,12 @@ class EngTagger
  # * :debug
  # => (Boolean) Print debug messages
  attr_accessor :conf
-
+
  ###############
  # Constructor #
  ###############
-
- # Take a hash of parameters that override default values.
+
+ # Take a hash of parameters that override default values.
  # See above for details.
  def initialize(params = {})
  @conf = Hash.new
@@ -187,10 +197,10 @@ class EngTagger
  @conf[:word_lex] = 'words.yml'
  @conf[:unknown_lex] = 'unknown.yml'
  @conf[:word_path] = $word_path
- @conf[:tag_path] = $tag_path
+ @conf[:tag_path] = $tag_path
  @conf[:debug] = false
  # assuming that we start analyzing from the beginninga new sentence...
- @conf[:current_tag] = 'pp'
+ @conf[:current_tag] = 'pp'
  @conf.merge!(params)
  unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
  print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
@@ -206,11 +216,11 @@ class EngTagger
  end
  @@mnp = get_max_noun_regex
  end
-
+
  ##################
  # Public methods #
  ##################
-
+
  # Examine the string provided and return it fully tagged in XML style
  def add_tags(text, verbose = false)
  return nil unless valid_text(text)
@@ -222,15 +232,15 @@ class EngTagger
  tag = assign_tag(@conf[:current_tag], cleaned_word)
  @conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
  tag = EngTagger.explain_tag(tag) if verbose
- tagged << '<' + tag + '>' + word + '</' + tag + '>'
+ tagged << '<' + tag + '>' + word + '</' + tag + '>'
  end
  reset
  return tagged.join(' ')
  end
-
- # Given a text string, return as many nouns and noun phrases as possible.
+
+ # Given a text string, return as many nouns and noun phrases as possible.
  # Applies add_tags and involves three stages:
- #
+ #
  # * Tag the text
  # * Extract all the maximal noun phrases
  # * Recursively extract all noun phrases from the MNPs
@@ -244,19 +254,19 @@ class EngTagger
  return get_noun_phrases(tagged)
  end
  end
-
- # Return an easy-on-the-eyes tagged version of a text string.
+
+ # Return an easy-on-the-eyes tagged version of a text string.
  # Applies add_tags and reformats to be easier to read.
  def get_readable(text, verbose = false)
  return nil unless valid_text(text)
  tagged = add_tags(text, verbose)
- tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
+ tagged = tagged.gsub(/<\w+>([^<]+)<\/(\w+)>/o) do
  $1 + '/' + $2.upcase
  end
  return tagged
  end
-
- # Return an array of sentences (without POS tags) from a text.
+
+ # Return an array of sentences (without POS tags) from a text.
  def get_sentences(text)
  return nil unless valid_text(text)
  tagged = add_tags(text)
@@ -270,25 +280,19 @@ class EngTagger
  sentence.gsub(Regexp.new(" (`+) ")){' ' + $1}
  sentence.gsub(Regexp.new(" (\W+)$")){$1}
  sentence.gsub(Regexp.new("^(`+) ")){$1}
- end
+ end
  return sentences
  end
-
+
  # Given a POS-tagged text, this method returns a hash of all proper nouns
  # and their occurrence frequencies. The method is greedy and will
  # return multi-word phrases, if possible, so it would find ``Linguistic
- # Data Consortium'' as a single unit, rather than as three individual
- # proper nouns. This method does not stem the found words.
+ # Data Consortium'' as a single unit, rather than as three individual
+ # proper nouns. This method does not stem the found words.
  def get_proper_nouns(tagged)
  return nil unless valid_text(tagged)
- trimmed = tagged.scan(NNP).map do |n|
- strip_tags(n)
- end
- nnp = Hash.new(0)
- trimmed.each do |n|
- next unless n.length < 100 # sanity check on word length
- nnp[n] += 1 unless n =~ /\A\s*\z/
- end
+ tags = [NNP]
+ nnp = build_matches_hash(build_trimmed(tagged, tags))
  # Now for some fancy resolution stuff...
  nnp.keys.each do |key|
  words = key.split(/\s/)
@@ -301,7 +305,7 @@ class EngTagger
  /\A([a-z])[a-z]*\z/ =~ word
  $1
  end.join ''
- # If that acronym has been seen,
+ # If that acronym has been seen,
  # remove it and add the values to
  # the full name
  if nnp[acronym]
@@ -312,167 +316,105 @@ class EngTagger
  end
  return nnp
  end
-
- # Given a POS-tagged text, this method returns all nouns and their
- # occurrence frequencies.
+
+ # Given a POS-tagged text, this method returns all nouns and their
+ # occurrence frequencies.
  def get_nouns(tagged)
  return nil unless valid_text(tagged)
- NN
- trimmed = tagged.scan(NN).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [NN]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
+
+ # Returns all types of verbs and does not descriminate between the various kinds.
+ # Is the combination of all other verb methods listed in this class.
+ def get_verbs(tagged)
+ return nil unless valid_text(tagged)
+ tags = [VB, VBD, VBG, PART, VBP, VBZ]
+ build_matches_hash(build_trimmed(tagged, tags))
  end

  def get_infinitive_verbs(tagged)
  return nil unless valid_text(tagged)
- VB
- trimmed = tagged.scan(VB).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VB]
+ build_matches_hash(build_trimmed(tagged, tags))
  end

  def get_past_tense_verbs(tagged)
  return nil unless valid_text(tagged)
- VBD
- trimmed = tagged.scan(VBD).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VBD]
+ build_matches_hash(build_trimmed(tagged, tags))
  end

  def get_gerund_verbs(tagged)
  return nil unless valid_text(tagged)
- VBG
- trimmed = tagged.scan(VB).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VBG]
+ build_matches_hash(build_trimmed(tagged, tags))
  end

  def get_passive_verbs(tagged)
  return nil unless valid_text(tagged)
- PART
- trimmed = tagged.scan(PART).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [PART]
+ build_matches_hash(build_trimmed(tagged, tags))
  end

-
  def get_base_present_verbs(tagged)
  return nil unless valid_text(tagged)
- VBP
- trimmed = tagged.scan(VBP).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VBP]
+ build_matches_hash(build_trimmed(tagged, tags))
  end

  def get_present_verbs(tagged)
  return nil unless valid_text(tagged)
- VBZ
- trimmed = tagged.scan(VBZ).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [VBZ]
+ build_matches_hash(build_trimmed(tagged, tags))
  end

  def get_adjectives(tagged)
  return nil unless valid_text(tagged)
- JJ
- trimmed = tagged.scan(JJ).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [JJ]
+ build_matches_hash(build_trimmed(tagged, tags))
  end

  def get_comparative_adjectives(tagged)
  return nil unless valid_text(tagged)
- JJR
- trimmed = tagged.scan(JJR).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
- end
+ tags = [JJR]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end

  def get_superlative_adjectives(tagged)
  return nil unless valid_text(tagged)
- JJS
- trimmed = tagged.scan(JJS).map do |n|
- strip_tags(n)
- end
- ret = Hash.new(0)
- trimmed.each do |n|
- n = stem(n)
- next unless n.length < 100 # sanity check on word length
- ret[n] += 1 unless n =~ /\A\s*\z/
- end
- return ret
+ tags = [JJS]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
+
+ def get_adverbs(tagged)
+ return nil unless valid_text(tagged)
+ tags = [RB, RBR, RBS, RP]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
+
+ def get_interrogatives(tagged)
+ return nil unless valid_text(tagged)
+ tags = [WRB, WDT, WP, WPS]
+ build_matches_hash(build_trimmed(tagged, tags))
+ end
+ # To be consistent with documentation's naming of 'interrogative' parts of speech as 'question'
+ alias_method :get_question_parts, :get_interrogatives
+
+ # Returns all types of conjunctions and does not discriminate between the various kinds.
+ # E.g. coordinating, subordinating, correlative...
+ def get_conjunctions(tagged)
+ return nil unless valid_text(tagged)
+ tags = [CC, IN]
+ build_matches_hash(build_trimmed(tagged, tags))
  end

  # Given a POS-tagged text, this method returns only the maximal noun phrases.
  # May be called directly, but is also used by get_noun_phrases
  def get_max_noun_phrases(tagged)
- return unless valid_text(tagged)
- mn_phrases = tagged.scan(@@mnp).map do |m|
- strip_tags(m)
- end
+ return nil unless valid_text(tagged)
+ tags = [@@mnp]
+ mn_phrases = build_trimmed(tagged, tags)
  ret = Hash.new(0)
  mn_phrases.each do |p|
  p = stem(p) unless p =~ /\s/ # stem single words
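Taken together, this hunk replaces the copy-pasted scan/strip/count bodies with the two shared private helpers and adds adverb, interrogative, and conjunction getters on top of them. A usage sketch of the resulting 0.2.1 public API; the sentence is illustrative, and the exact hash keys depend on the Porter stemmer, since build_matches_hash stems every match:

require 'engtagger'

tgr = EngTagger.new
tagged = tgr.add_tags("The dog quickly ran home because it was hungry")

verbs        = tgr.get_verbs(tagged)          # Hash of stemmed verb => count
adverbs      = tgr.get_adverbs(tagged)        # rb/rbr/rbs/rp matches
conjunctions = tgr.get_conjunctions(tagged)   # cc and in matches
questions    = tgr.get_question_parts(tagged) # alias of get_interrogatives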
@@ -495,9 +437,9 @@ class EngTagger
  mn_phrases += m.split(phrase_ext)
  end
  mn_phrases.each do |mnp|
- # Split the phrase into an array of words, and create a loop for each word,
+ # Split the phrase into an array of words, and create a loop for each word,
  # shortening the phrase by removing the word in the first position.
- # Record the phrase and any single nouns that are found
+ # Record the phrase and any single nouns that are found
  words = mnp.split
  words.length.times do |i|
  found[words.join(' ')] += 1 if words.length > 1
@@ -519,12 +461,12 @@ class EngTagger
  multiplier = word_count if @conf[:weight_noun_phrases]
  ret[k] += multiplier * v
  end
- return ret
+ return ret
  end
-
- # Reads some included corpus data and saves it in a stored hash on the
- # local file system. This is called automatically if the tagger can't
- # find the stored lexicon.
+
+ # Reads some included corpus data and saves it in a stored hash on the
+ # local file system. This is called automatically if the tagger can't
+ # find the stored lexicon.
  def install
  puts "Creating part-of-speech lexicon" if @conf[:debug]
  load_tags(@conf[:tag_lex])
@@ -542,7 +484,23 @@ class EngTagger
  # Private methods #
  ###################

- :private
+ :private
+
+ def build_trimmed(tagged, tags)
+ tags.map { |tag| tagged.scan(tag) }.flatten.map do |n|
+ strip_tags(n)
+ end
+ end
+
+ def build_matches_hash(trimmed)
+ ret = Hash.new(0)
+ trimmed.each do |n|
+ n = stem(n)
+ next unless n.length < 100 # sanity check on word length
+ ret[n] += 1 unless n =~ /\A\s*\z/
+ end
+ ret
+ end

  # Downcase the first letter of word
  def lcfirst(word)
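The two new private helpers are the heart of the refactor: build_trimmed collects every match of each tag regexp and strips the tags, and build_matches_hash stems, length-checks, and tallies the survivors. A standalone re-statement of that counting logic in plain Ruby, with stemming left out for brevity; the method name and regexp here are illustrative, not the gem's:

def count_matches(tagged, patterns)
  trimmed = patterns.flat_map { |re| tagged.scan(re) }
                    .map { |tok| tok.gsub(%r{</?[^>]+>}, '').strip } # strip_tags stand-in
  trimmed.each_with_object(Hash.new(0)) do |word, counts|
    next if word.length >= 100          # same sanity check on word length
    counts[word] += 1 unless word =~ /\A\s*\z/
  end
end

count_matches("<rb>quickly</rb> <rb>quickly</rb>", [/<rb>[^<]+<\/rb>\s*/])
# => {"quickly"=>2}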
@@ -552,8 +510,8 @@ class EngTagger
  # Upcase the first letter of word
  def ucfirst(word)
  word.split(//)[0].upcase + word.split(//)[1..-1].join
- end
-
+ end
+
  # Return the word stem as given by Stemmable module. This can be
  # turned off with the class parameter @conf[:stem] => false.
  def stem(word)
@@ -561,8 +519,8 @@ class EngTagger
  return word.stem
  end

- # This method will reset the preceeding tag to a sentence ender (PP).
- # This prepares the first word of a new sentence to be tagged correctly.
+ # This method will reset the preceeding tag to a sentence ender (PP).
+ # This prepares the first word of a new sentence to be tagged correctly.
  def reset
  @conf[:current_tag] = 'pp'
  end
@@ -581,7 +539,7 @@ class EngTagger
  return true
  end
  end
-
+
  # Return a text string with the part-of-speech tags removed
  def strip_tags(tagged, downcase = false)
  return nil unless valid_text(tagged)
@@ -595,8 +553,8 @@ class EngTagger
  return text
  end
  end
-
- # Strip the provided text of HTML-style tags and separate off any punctuation
+
+ # Strip the provided text of HTML-style tags and separate off any punctuation
  # in preparation for tagging
  def clean_text(text)
  return false unless valid_text(text)
@@ -615,27 +573,27 @@ class EngTagger
  words = split_sentences(tokenized)
  return words
  end
-
- # This handles all of the trailing periods, keeping those that
+
+ # This handles all of the trailing periods, keeping those that
  # belong on abbreviations and removing those that seem to be
  # at the end of sentences. This method makes some assumptions
  # about the use of capitalization in the incoming text
  def split_sentences(array)
  tokenized = array
- people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
+ people = %w(jr mr ms mrs dr prof esq sr sen sens rep reps gov attys attys
  supt det mssrs rev)
  army = %w(col gen lt cmdr adm capt sgt cpl maj brig)
  inst = %w(dept univ assn bros ph.d)
- place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
+ place = %w(arc al ave blvd bld cl ct cres exp expy dist mt mtn ft fy fwy
  hwy hway la pde pd plz pl rd st tce)
  comp = %w(mfg inc ltd co corp)
- state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
- ind ia kans kan ken ky la me md is mass mich minn miss mo mont
- neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
+ state = %w(ala ariz ark cal calif colo col conn del fed fla ga ida id ill
+ ind ia kans kan ken ky la me md is mass mich minn miss mo mont
+ neb nebr nev mex okla ok ore penna penn pa dak tenn tex ut vt
  va wash wis wisc wy wyo usafa alta man ont que sask yuk)
  month = %w(jan feb mar apr may jun jul aug sep sept oct nov dec)
  misc = %w(vs etc no esp)
- abbr = Hash.new
+ abbr = Hash.new
  [people, army, inst, place, comp, state, month, misc].flatten.each do |i|
  abbr[i] = true
  end
@@ -643,11 +601,11 @@ class EngTagger
  tokenized.each_with_index do |t, i|
  if tokenized[i + 1] and tokenized [i + 1] =~ /[A-Z\W]/ and tokenized[i] =~ /\A(.+)\.\z/
  w = $1
- # Don't separate the period off words that
+ # Don't separate the period off words that
  # meet any of the following conditions:
  #
  # 1. It is defined in one of the lists above
- # 2. It is only one letter long: Alfred E. Sloan
+ # 2. It is only one letter long: Alfred E. Sloan
  # 3. It has a repeating letter-dot: U.S.A. or J.C. Penney
  unless abbr[w.downcase] or w =~ /\A[a-z]\z/i or w =~ /[a-z](?:\.[a-z])+\z/i
  words << w
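The three numbered conditions above decide whether a trailing period stays attached to its token (w is the token with its final period already stripped by the surrounding match, so "U.S.A." arrives as "U.S.A"). Restated as a small predicate for clarity; illustrative only, since in the method the same tests appear inline in the unless:

def keep_trailing_period?(w, abbr)
  abbr[w.downcase] ||              # 1. known abbreviation (dr, mr, supt, ...)
    w =~ /\A[a-z]\z/i ||           # 2. a single letter, as in "Alfred E. Sloan"
    w =~ /[a-z](?:\.[a-z])+\z/i    # 3. repeating letter-dot, as in "U.S.A" or "J.C"
end

abbr = { "dr" => true }
keep_trailing_period?("Dr", abbr)    # => true  ("Dr." keeps its period)
keep_trailing_period?("naive", abbr) # falsy    (the period is split off)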
@@ -664,8 +622,8 @@ class EngTagger
  end
  return words
  end
-
- # Separate punctuation from words, where appropriate. This leaves trailing
+
+ # Separate punctuation from words, where appropriate. This leaves trailing
  # periods in place to be dealt with later. Called by the clean_text method.
  def split_punct(text)
  # If there's no punctuation, return immediately
@@ -675,27 +633,27 @@ class EngTagger

  # Put quotes into a standard format
  text = text.gsub(/`(?!`)(?=.*\w)/o, "` ") # Shift left quotes off text
- text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
- text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
+ text = text.gsub(/"(?=.*\w)/o, " `` ") # Convert left quotes to ``
+ text = text.gsub(/(\W|^)'(?=.*\w)/o){$1 ? $1 + " ` " : " ` "} # Convert left quotes to `
  text = text.gsub(/"/, " '' ") # Convert (remaining) quotes to ''
  text = text.gsub(/(\w)'(?!')(?=\W|$)/o){$1 + " ' "} # Separate right single quotes
-
+
  # Handle all other punctuation
  text = text.gsub(/--+/o, " - ") # Convert and separate dashes
  text = text.gsub(/,(?!\d)/o, " , ") # Shift commas off everything but numbers
  text = text.gsub(/:/o, " :") # Shift semicolons off
- text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
+ text = text.gsub(/(\.\.\.+)/o){" " + $1 + " "} # Shift ellipses off
  text = text.gsub(/([\(\[\{\}\]\)])/o){" " + $1 + " "} # Shift off brackets
  text = text.gsub(/([\!\?#\$%;~|])/o){" " + $1 + " "} # Shift off other ``standard'' punctuation

  # English-specific contractions
  text = text.gsub(/([A-Za-z])'([dms])\b/o){$1 + " '" + $2} # Separate off 'd 'm 's
- text = text.gsub(/n't\b/o, " n't") # Separate off n't
+ text = text.gsub(/n't\b/o, " n't") # Separate off n't
  text = text.gsub(/'(ve|ll|re)\b/o){" '" + $1} # Separate off 've, 'll, 're
  result = text.split(' ')
  return result
- end
-
+ end
+
  # Given a preceding tag, assign a tag word. Called by the add_tags method.
  # This method is a modified version of the Viterbi algorithm for part-of-speech tagging
  def assign_tag(prev_tag, word)
@@ -709,7 +667,7 @@ class EngTagger
  best_so_far = 0
  w = @@lexicon[word]
  t = @@hmm
-
+
  # TAG THE TEXT: What follows is a modified version of the Viterbi algorithm
  # which is used in most POS taggers
  best_tag = ""
@@ -724,9 +682,9 @@ class EngTagger
  else
  next
  end
-
- # Bayesian logic:
- # P = P( tag | prev_tag ) * P( tag | word )
+
+ # Bayesian logic:
+ # P = P( tag | prev_tag ) * P( tag | word )
  probability = t[prev_tag][tag] * (pw + 1)
  # Set the tag with maximal probability
  if probability > best_so_far
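The comment spells out the scoring rule used in this loop: each candidate tag's score is the transition probability from the previous tag times an add-one-smoothed word/tag count, and the highest score wins. A worked sketch with invented numbers:

# Hypothetical probabilities and counts, for illustration only
t = { "det" => { "nn" => 0.5, "vb" => 0.1 } }  # @@hmm: P(tag | prev_tag)
w = { "nn" => 3, "vb" => 0 }                   # @@lexicon[word]: tag counts
prev_tag = "det"

best_tag, best_so_far = "", 0
w.each do |tag, pw|
  probability = t[prev_tag][tag] * (pw + 1)  # "nn": 0.5 * 4 = 2.0; "vb": 0.1 * 1 = 0.1
  best_tag, best_so_far = tag, probability if probability > best_so_far
end
best_tag # => "nn"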
@@ -735,18 +693,18 @@ class EngTagger
  end
  end
  return best_tag
- end
-
- # This method determines whether a word should be considered in its
+ end
+
+ # This method determines whether a word should be considered in its
  # lower or upper case form. This is useful in considering proper nouns
- # and words that begin sentences. Called by add_tags.
+ # and words that begin sentences. Called by add_tags.
  def clean_word(word)
  lcf = lcfirst(word)
  # seen this word as it appears (lower or upper case)
  if @@lexicon[word]
  return word
  elsif @@lexicon[lcf]
- # seen this word only as lower case
+ # seen this word only as lower case
  return lcf
  else
  # never seen this word. guess.
@@ -754,13 +712,13 @@ class EngTagger
  end
  end

- # This changes any word not appearing in the lexicon to identifiable
- # classes of words handled by a simple unknown word classification
+ # This changes any word not appearing in the lexicon to identifiable
+ # classes of words handled by a simple unknown word classification
  # metric. Called by the clean_word method.
  def classify_unknown_word(word)
  if /[\(\{\[]/ =~ word # Left brackets
  classified = "*LRB*"
- elsif
+ elsif
  /[\)\}\]]/ =~ word # Right brackets
  classified = "*RRB*"
  elsif /-?(?:\d+(?:\.\d*)?|\.\d+)\z/ =~ word # Floating point number
@@ -800,31 +758,31 @@ class EngTagger
  end
  return classified
  end
-
- # This returns a compiled regexp for extracting maximal noun phrases
+
+ # This returns a compiled regexp for extracting maximal noun phrases
  # from a POS-tagged text.
  def get_max_noun_regex
  regex = /
  # optional number, gerund - adjective -participle
  (?:#{NUM})?(?:#{GER}|#{ADJ}|#{PART})*
  # Followed by one or more nouns
- (?:#{NN})+
+ (?:#{NN})+
  (?:
  # Optional preposition, determinant, cardinal
- (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
+ (?:#{PREP})*(?:#{DET})?(?:#{NUM})?
  # Optional gerund-adjective -participle
- (?:#{GER}|#{ADJ}|#{PART})*
+ (?:#{GER}|#{ADJ}|#{PART})*
  # one or more nouns
- (?:#{NN})+
+ (?:#{NN})+
  )*
  /xo #/
  return regex
- end
-
- # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
- # YAML data parser. It will load a YAML document with a collection of key:
- # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
- # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
+ end
+
+ # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+ # YAML data parser. It will load a YAML document with a collection of key:
+ # value entries ( {pos tag}: {probability} ) mapped onto single keys ( {tag} ).
+ # Each map is expected to be on a single line; i.e., det: { jj: 0.2, nn: 0.5, vb: 0.0002 }
  def load_tags(lexicon)
  path = File.join($lexpath, lexicon)
  fh = File.open(path, 'r')
@@ -837,17 +795,17 @@ class EngTagger
  pairs = {}
  items.each do |i|
  /([^:]+):\s*(.+)/ =~ i
- pairs[$1] = $2.to_f
+ pairs[$1] = $2.to_f
  end
  @@hmm[key] = pairs
  end
  fh.close
  end

- # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
- # YAML data parser. It will load a YAML document with a collection of key:
- # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
- # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
+ # Load the 2-grams into a hash from YAML data: This is a naive (but fast)
+ # YAML data parser. It will load a YAML document with a collection of key:
+ # value entries ( {pos tag}: {count} ) mapped onto single keys ( {a word} ).
+ # Each map is expected to be on a single line; i.e., key: { jj: 103, nn: 34, vb: 1 }
  def load_words(lexicon)
  path = File.join($lexpath, lexicon)
  fh = File.open(path, 'r')
@@ -860,15 +818,14 @@ class EngTagger
  pairs = {}
  items.each do |i|
  /([^:]+):\s*(.+)/ =~ i
- pairs[$1] = $2.to_f
+ pairs[$1] = $2.to_f
  end
  @@lexicon[key] = pairs
  end
  fh.close
  end
-
- #memoize the stem and assign_tag methods
+
+ #memoize the stem and assign_tag methods
  memoize("stem")
- memoize("assign_tag")
+ memoize("assign_tag")
  end
-
data/lib/engtagger/version.rb CHANGED
@@ -1,3 +1,3 @@
  module EngTagger
- VERSION = "0.2.0"
+ VERSION = "0.2.1"
  end
data/test/test_engtagger.rb CHANGED
@@ -69,7 +69,7 @@ EOD
  def test_clean_text
  test = "<html><body>I am <b>100% sure</b> that Dr. Watson is too naive. I'm sorry.</body></html>"
  model = ["I","am","100","%","sure","that","Dr.","Watson","is","too","naive",".","I","'m","sorry","."]
- assert_equal(model, @tagger.clean_text(test))
+ assert_equal(model, @tagger.clean_text(test)) unless $no_hpricot
  end

  def test_clean_word
@@ -102,6 +102,38 @@ EOD
  assert_instance_of(Hash, result)
  end

+ def test_get_verbs
+ expected_result = { "have" => 1, "ruled" => 1, "contends" => 1 }
+ result = @tagger.get_verbs(@@tagged)
+ assert_equal(expected_result, result)
+ end
+
+ def test_get_adverbs
+ expected_result = { "otherwise" => 1 }
+ result = @tagger.get_adverbs(@@tagged)
+ assert_equal(expected_result, result)
+ end
+
+ def test_get_interrogatives
+ tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
+ expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
+ result = @tagger.get_interrogatives(tagged)
+ assert_equal(expected_result, result)
+ end
+
+ def test_get_question_parts
+ tagged = "<wdt>Which</wdt> <ppc>,</ppc> <wdt>whatever</wdt> <ppc>,</ppc> <wp>who</wp> <ppc>,</ppc> <wp>whoever</wp> <ppc>,</ppc> <wrb>when</wrb> <cc>and</cc> <wrb>how</wrb> <vbp>are</vbp> <det>all</det> <nns>examples</nns> <in>of</in> <nns>interrogatives</nns>"
+ expected_result = {"when"=>1, "how"=>1, "Which"=>1, "whatever"=>1, "who"=>1, "whoever"=>1}
+ result = @tagger.get_question_parts(tagged)
+ assert_equal(expected_result, result)
+ end
+
+ def test_get_conjunctions
+ expected_result = { "and" => 2, "of" => 2, "for" => 1, "that" => 1, "in" => 1 }
+ result = @tagger.get_conjunctions(@@tagged)
+ assert_equal(expected_result, result)
+ end
+
  def test_get_proper_nouns
  test = "<nnp>BBC</nnp> <vbz>means</vbz> <nnp>British Broadcasting Corporation</nnp> <pp>.</pp>"
  result = @tagger.get_proper_nouns(test)
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: engtagger
  version: !ruby/object:Gem::Version
- version: 0.2.0
+ version: 0.2.1
  platform: ruby
  authors:
  - Yoichiro Hasebe
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-04-20 00:00:00.000000000 Z
+ date: 2016-10-12 00:00:00.000000000 Z
  dependencies: []
  description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
  tagger that assigns POS tags to English text based on a lookup dictionary and a
@@ -53,7 +53,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.2.2
+ rubygems_version: 2.5.1
  signing_key:
  specification_version: 4
  summary: A probability based, corpus-trained English POS tagger