engtagger 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 47684458b8965c2c1f52d0d29ca2cad340b0ed3f
4
+ data.tar.gz: aa6dbd0473409e9b6b2987a42c126a6b10dce096
5
+ SHA512:
6
+ metadata.gz: cddd67eab940146a2426032714aedd8e5195192ead3133ade5f76c594c5f0667f0747bba8b019f63df91dd2eb0610da19288a9e06a2323eb7bde91fec025b028
7
+ data.tar.gz: a76ca3422b9a3a1a813263b6e4ab5e69ca74ac998afcc259be55a6441b6b46b21ac9de6eb241327956a6fda1c2d3dbd033ba1678df86506f153089a3ef99d46d
data/README.md CHANGED
@@ -54,7 +54,16 @@ of regular expressions.
54
54
  proper = tgr.get_proper_nouns(tagged)
55
55
 
56
56
  #=> {"Alice"=>1}
57
-
57
+
58
+ # Get all past tense verbs
59
+ pt_verbs = tgr.get_past_tense_verbs(tagged)
60
+
61
+ #=> {"chased"=>1}
62
+
63
+ # Get all the adjectives
64
+ adj = tgr.get_adjectives(tagged)
65
+
66
+ #=> {"big"=>1, "fat"=>1}
58
67
 
59
68
  # Get all noun phrases of any syntactic level
60
69
  # (same as word_list but take a tagged input)
@@ -126,6 +135,11 @@ of this Ruby library
126
135
 
127
136
  * Yoichiro Hasebe (yohasebe [at] gmail.com)
128
137
 
138
+ ### Contributors
139
+
140
+ * Carlos Ramirez III
141
+ * Phil London
142
+
129
143
  ### Acknowledgement
130
144
 
131
145
  This Ruby library is a direct port of Lingua::EN::Tagger available at CPAN.
@@ -59,7 +59,6 @@ class EngTagger
59
59
  NUM = get_ext('cd')
60
60
  GER = get_ext('vbg')
61
61
  ADJ = get_ext('jj[rs]*')
62
- PART = get_ext('vbn')
63
62
  NN = get_ext('nn[sp]*')
64
63
  NNP = get_ext('nnp')
65
64
  PREP = get_ext('in')
@@ -68,6 +67,15 @@ class EngTagger
68
67
  QUOT = get_ext('ppr')
69
68
  SEN = get_ext('pp')
70
69
  WORD = get_ext('\w+')
70
+ VB = get_ext('vb')
71
+ VBG = get_ext('vbg')
72
+ VBD = get_ext('vbd')
73
+ PART = get_ext('vbn')
74
+ VBP = get_ext('vbp')
75
+ VBZ = get_ext('vbz')
76
+ JJ = get_ext('jj')
77
+ JJR = get_ext('jjr')
78
+ JJS = get_ext('jjs')
71
79
 
72
80
  # Convert a Treebank-style, abbreviated tag into verbose definitions
73
81
  def self.explain_tag(tag)
@@ -183,7 +191,7 @@ class EngTagger
183
191
  @conf[:debug] = false
184
192
  # assuming that we start analyzing from the beginning of a new sentence...
185
193
  @conf[:current_tag] = 'pp'
186
- @conf.merge(params) if params
194
+ @conf.merge!(params)
187
195
  unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
188
196
  print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
189
197
  @@hmm = Hash.new
@@ -321,7 +329,143 @@ class EngTagger
321
329
  end
322
330
  return ret
323
331
  end
332
+
333
# Given a POS-tagged text, returns the infinitive verbs it contains, i.e.
# every span matched by the VB pattern (built from the 'vb' tag).
#
# Each match is stripped of its tags with strip_tags and then passed through
# stem. Results that are 100 characters or longer, or that consist only of
# whitespace, are ignored.
#
# Returns nil when valid_text rejects the input; otherwise a Hash (default
# value 0) mapping each word to the number of times it occurred.
def get_infinitive_verbs(tagged)
  return nil unless valid_text(tagged)

  counts = Hash.new(0)
  tagged.scan(VB).each do |match|
    word = stem(strip_tags(match))
    next if word.length >= 100 # sanity check on word length

    counts[word] += 1 unless word =~ /\A\s*\z/
  end
  counts
end
347
+
348
# Given a POS-tagged text, returns the past tense verbs it contains, i.e.
# every span matched by the VBD pattern (built from the 'vbd' tag).
#
# Each match is stripped of its tags with strip_tags and then passed through
# stem. Results that are 100 characters or longer, or that consist only of
# whitespace, are ignored.
#
# Returns nil when valid_text rejects the input; otherwise a Hash (default
# value 0) mapping each word to the number of times it occurred.
def get_past_tense_verbs(tagged)
  return nil unless valid_text(tagged)

  counts = Hash.new(0)
  tagged.scan(VBD).each do |match|
    word = stem(strip_tags(match))
    next if word.length >= 100 # sanity check on word length

    counts[word] += 1 unless word =~ /\A\s*\z/
  end
  counts
end
362
+
363
# Given a POS-tagged text, returns the gerund verbs it contains, i.e. every
# span matched by the VBG pattern (built from the 'vbg' tag).
#
# Each match is stripped of its tags with strip_tags and then passed through
# stem. Results that are 100 characters or longer, or that consist only of
# whitespace, are ignored.
#
# Returns nil when valid_text rejects the input; otherwise a Hash (default
# value 0) mapping each word to the number of times it occurred.
def get_gerund_verbs(tagged)
  return nil unless valid_text(tagged)

  counts = Hash.new(0)
  # Scan with VBG: the base-form VB pattern would collect infinitives here
  # instead of gerunds.
  tagged.scan(VBG).each do |match|
    word = stem(strip_tags(match))
    next if word.length >= 100 # sanity check on word length

    counts[word] += 1 unless word =~ /\A\s*\z/
  end
  counts
end
377
+
378
# Given a POS-tagged text, returns the passive (past participle) verbs it
# contains, i.e. every span matched by the PART pattern (built from the 'vbn'
# tag).
#
# Each match is stripped of its tags with strip_tags and then passed through
# stem. Results that are 100 characters or longer, or that consist only of
# whitespace, are ignored.
#
# Returns nil when valid_text rejects the input; otherwise a Hash (default
# value 0) mapping each word to the number of times it occurred.
def get_passive_verbs(tagged)
  return nil unless valid_text(tagged)

  counts = Hash.new(0)
  tagged.scan(PART).each do |match|
    word = stem(strip_tags(match))
    next if word.length >= 100 # sanity check on word length

    counts[word] += 1 unless word =~ /\A\s*\z/
  end
  counts
end
392
+
324
393
 
394
# Given a POS-tagged text, returns the base present verbs it contains, i.e.
# every span matched by the VBP pattern (built from the 'vbp' tag).
#
# Each match is stripped of its tags with strip_tags and then passed through
# stem. Results that are 100 characters or longer, or that consist only of
# whitespace, are ignored.
#
# Returns nil when valid_text rejects the input; otherwise a Hash (default
# value 0) mapping each word to the number of times it occurred.
def get_base_present_verbs(tagged)
  return nil unless valid_text(tagged)

  counts = Hash.new(0)
  tagged.scan(VBP).each do |match|
    word = stem(strip_tags(match))
    next if word.length >= 100 # sanity check on word length

    counts[word] += 1 unless word =~ /\A\s*\z/
  end
  counts
end
408
+
409
# Given a POS-tagged text, returns the present tense verbs it contains, i.e.
# every span matched by the VBZ pattern (built from the 'vbz' tag).
#
# Each match is stripped of its tags with strip_tags and then passed through
# stem. Results that are 100 characters or longer, or that consist only of
# whitespace, are ignored.
#
# Returns nil when valid_text rejects the input; otherwise a Hash (default
# value 0) mapping each word to the number of times it occurred.
def get_present_verbs(tagged)
  return nil unless valid_text(tagged)

  counts = Hash.new(0)
  tagged.scan(VBZ).each do |match|
    word = stem(strip_tags(match))
    next if word.length >= 100 # sanity check on word length

    counts[word] += 1 unless word =~ /\A\s*\z/
  end
  counts
end
423
+
424
# Given a POS-tagged text, returns the adjectives it contains, i.e. every
# span matched by the JJ pattern (built from the 'jj' tag).
#
# Each match is stripped of its tags with strip_tags and then passed through
# stem. Results that are 100 characters or longer, or that consist only of
# whitespace, are ignored.
#
# Returns nil when valid_text rejects the input; otherwise a Hash (default
# value 0) mapping each word to the number of times it occurred.
def get_adjectives(tagged)
  return nil unless valid_text(tagged)

  counts = Hash.new(0)
  tagged.scan(JJ).each do |match|
    word = stem(strip_tags(match))
    next if word.length >= 100 # sanity check on word length

    counts[word] += 1 unless word =~ /\A\s*\z/
  end
  counts
end
438
+
439
# Given a POS-tagged text, returns the comparative adjectives it contains,
# i.e. every span matched by the JJR pattern (built from the 'jjr' tag).
#
# Each match is stripped of its tags with strip_tags and then passed through
# stem. Results that are 100 characters or longer, or that consist only of
# whitespace, are ignored.
#
# Returns nil when valid_text rejects the input; otherwise a Hash (default
# value 0) mapping each word to the number of times it occurred.
def get_comparative_adjectives(tagged)
  return nil unless valid_text(tagged)

  counts = Hash.new(0)
  tagged.scan(JJR).each do |match|
    word = stem(strip_tags(match))
    next if word.length >= 100 # sanity check on word length

    counts[word] += 1 unless word =~ /\A\s*\z/
  end
  counts
end
453
+
454
# Given a POS-tagged text, returns the superlative adjectives it contains,
# i.e. every span matched by the JJS pattern (built from the 'jjs' tag).
#
# Each match is stripped of its tags with strip_tags and then passed through
# stem. Results that are 100 characters or longer, or that consist only of
# whitespace, are ignored.
#
# Returns nil when valid_text rejects the input; otherwise a Hash (default
# value 0) mapping each word to the number of times it occurred.
def get_superlative_adjectives(tagged)
  return nil unless valid_text(tagged)

  counts = Hash.new(0)
  tagged.scan(JJS).each do |match|
    word = stem(strip_tags(match))
    next if word.length >= 100 # sanity check on word length

    counts[word] += 1 unless word =~ /\A\s*\z/
  end
  counts
end
468
+
325
469
  # Given a POS-tagged text, this method returns only the maximal noun phrases.
326
470
  # May be called directly, but is also used by get_noun_phrases
327
471
  def get_max_noun_phrases(tagged)
Binary file
Binary file
@@ -1,3 +1,3 @@
1
1
  module EngTagger
2
- VERSION = "0.1.2"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -191,6 +191,11 @@ EOD
191
191
  text = ""
192
192
  assert(!@tagger.valid_text(text))
193
193
  end
194
+
195
# A value passed to the constructor for :longest_noun_phrase must be the one
# reported by the tagger's configuration afterwards.
def test_override_default_params
  overridden_length = 3
  @tagger = EngTagger.new(:longest_noun_phrase => overridden_length)
  assert_equal(overridden_length, @tagger.conf[:longest_noun_phrase])
end
194
199
  end
195
200
 
196
201
  # Number of errors detected: 24
metadata CHANGED
@@ -1,15 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: engtagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
5
- prerelease:
4
+ version: 0.2.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Yoichiro Hasebe
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-06-05 00:00:00.000000000 Z
11
+ date: 2014-04-20 00:00:00.000000000 Z
13
12
  dependencies: []
14
13
  description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
15
14
  tagger that assigns POS tags to English text based on a lookup dictionary and a
@@ -20,7 +19,7 @@ executables: []
20
19
  extensions: []
21
20
  extra_rdoc_files: []
22
21
  files:
23
- - .gitignore
22
+ - ".gitignore"
24
23
  - Gemfile
25
24
  - LICENSE
26
25
  - README.md
@@ -37,27 +36,26 @@ files:
37
36
  - test/test_engtagger.rb
38
37
  homepage: http://github.com/yohasebe/engtagger
39
38
  licenses: []
39
+ metadata: {}
40
40
  post_install_message:
41
41
  rdoc_options: []
42
42
  require_paths:
43
43
  - lib
44
44
  required_ruby_version: !ruby/object:Gem::Requirement
45
- none: false
46
45
  requirements:
47
- - - ! '>='
46
+ - - ">="
48
47
  - !ruby/object:Gem::Version
49
48
  version: '0'
50
49
  required_rubygems_version: !ruby/object:Gem::Requirement
51
- none: false
52
50
  requirements:
53
- - - ! '>='
51
+ - - ">="
54
52
  - !ruby/object:Gem::Version
55
53
  version: '0'
56
54
  requirements: []
57
55
  rubyforge_project:
58
- rubygems_version: 1.8.24
56
+ rubygems_version: 2.2.2
59
57
  signing_key:
60
- specification_version: 3
58
+ specification_version: 4
61
59
  summary: A probability based, corpus-trained English POS tagger
62
60
  test_files:
63
61
  - test/test_engtagger.rb