engtagger 0.1.2 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 47684458b8965c2c1f52d0d29ca2cad340b0ed3f
4
+ data.tar.gz: aa6dbd0473409e9b6b2987a42c126a6b10dce096
5
+ SHA512:
6
+ metadata.gz: cddd67eab940146a2426032714aedd8e5195192ead3133ade5f76c594c5f0667f0747bba8b019f63df91dd2eb0610da19288a9e06a2323eb7bde91fec025b028
7
+ data.tar.gz: a76ca3422b9a3a1a813263b6e4ab5e69ca74ac998afcc259be55a6441b6b46b21ac9de6eb241327956a6fda1c2d3dbd033ba1678df86506f153089a3ef99d46d
data/README.md CHANGED
@@ -54,7 +54,16 @@ of regular expressions.
54
54
  proper = tgr.get_proper_nouns(tagged)
55
55
 
56
56
  #=> {"Alice"=>1}
57
-
57
+
58
+ # Get all past tense verbs
59
+ pt_verbs = tgr.get_past_tense_verbs(tagged)
60
+
61
+ #=> {"chased"=>1}
62
+
63
+ # Get all the adjectives
64
+ adj = tgr.get_adjectives(tagged)
65
+
66
+ #=> {"big"=>1, "fat"=>1}
58
67
 
59
68
  # Get all noun phrases of any syntactic level
60
69
  # (same as word_list but take a tagged input)
@@ -126,6 +135,11 @@ of this Ruby library
126
135
 
127
136
  * Yoichiro Hasebe (yohasebe [at] gmail.com)
128
137
 
138
+ ### Contributors
139
+
140
+ * Carlos Ramirez III
141
+ * Phil London
142
+
129
143
  ### Acknowledgement
130
144
 
131
145
  This Ruby library is a direct port of Lingua::EN::Tagger available at CPAN.
@@ -59,7 +59,6 @@ class EngTagger
59
59
  NUM = get_ext('cd')
60
60
  GER = get_ext('vbg')
61
61
  ADJ = get_ext('jj[rs]*')
62
- PART = get_ext('vbn')
63
62
  NN = get_ext('nn[sp]*')
64
63
  NNP = get_ext('nnp')
65
64
  PREP = get_ext('in')
@@ -68,6 +67,15 @@ class EngTagger
68
67
  QUOT = get_ext('ppr')
69
68
  SEN = get_ext('pp')
70
69
  WORD = get_ext('\w+')
70
+ VB = get_ext('vb')
71
+ VBG = get_ext('vbg')
72
+ VBD = get_ext('vbd')
73
+ PART = get_ext('vbn')
74
+ VBP = get_ext('vbp')
75
+ VBZ = get_ext('vbz')
76
+ JJ = get_ext('jj')
77
+ JJR = get_ext('jjr')
78
+ JJS = get_ext('jjs')
71
79
 
72
80
  # Convert a Treebank-style, abbreviated tag into verbose definitions
73
81
  def self.explain_tag(tag)
@@ -183,7 +191,7 @@ class EngTagger
183
191
  @conf[:debug] = false
184
192
  # assuming that we start analyzing from the beginning of a new sentence...
185
193
  @conf[:current_tag] = 'pp'
186
- @conf.merge(params) if params
194
+ @conf.merge!(params)
187
195
  unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
188
196
  print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
189
197
  @@hmm = Hash.new
@@ -321,7 +329,143 @@ class EngTagger
321
329
  end
322
330
  return ret
323
331
  end
332
+
333
# Given a POS-tagged text, count the words matched by the VB pattern
# (built from the 'vb' tag, i.e. base-form verbs).
#
# Returns nil when valid_text rejects the input. Otherwise returns a Hash
# whose default value is 0, mapping each matched word (with tags stripped and
# passed through stem) to the number of times it occurs.
def get_infinitive_verbs(tagged)
  return nil unless valid_text(tagged)

  tagged.scan(VB).each_with_object(Hash.new(0)) do |match, counts|
    word = stem(strip_tags(match))
    next unless word.length < 100 # sanity check on word length

    # Words that are empty or whitespace-only after stripping are not counted.
    counts[word] += 1 unless word =~ /\A\s*\z/
  end
end
347
+
348
# Given a POS-tagged text, count the words matched by the VBD pattern
# (built from the 'vbd' tag, i.e. past tense verbs).
#
# Returns nil when valid_text rejects the input. Otherwise returns a Hash
# whose default value is 0, mapping each matched word (with tags stripped and
# passed through stem) to the number of times it occurs.
def get_past_tense_verbs(tagged)
  return nil unless valid_text(tagged)

  tagged.scan(VBD).each_with_object(Hash.new(0)) do |match, counts|
    word = stem(strip_tags(match))
    next unless word.length < 100 # sanity check on word length

    # Words that are empty or whitespace-only after stripping are not counted.
    counts[word] += 1 unless word =~ /\A\s*\z/
  end
end
362
+
363
# Given a POS-tagged text, count the words matched by the VBG pattern
# (built from the 'vbg' tag, i.e. gerund verbs).
#
# Returns nil when valid_text rejects the input. Otherwise returns a Hash
# whose default value is 0, mapping each matched word (with tags stripped and
# passed through stem) to the number of times it occurs.
def get_gerund_verbs(tagged)
  return nil unless valid_text(tagged)

  # Scan with VBG: scanning with VB would return base-form verbs instead of
  # gerunds.
  tagged.scan(VBG).each_with_object(Hash.new(0)) do |match, counts|
    word = stem(strip_tags(match))
    next unless word.length < 100 # sanity check on word length

    # Words that are empty or whitespace-only after stripping are not counted.
    counts[word] += 1 unless word =~ /\A\s*\z/
  end
end
377
+
378
# Given a POS-tagged text, count the words matched by the PART pattern
# (built from the 'vbn' tag, i.e. past participles).
#
# Returns nil when valid_text rejects the input. Otherwise returns a Hash
# whose default value is 0, mapping each matched word (with tags stripped and
# passed through stem) to the number of times it occurs.
def get_passive_verbs(tagged)
  return nil unless valid_text(tagged)

  tagged.scan(PART).each_with_object(Hash.new(0)) do |match, counts|
    word = stem(strip_tags(match))
    next unless word.length < 100 # sanity check on word length

    # Words that are empty or whitespace-only after stripping are not counted.
    counts[word] += 1 unless word =~ /\A\s*\z/
  end
end
392
+
324
393
 
394
# Given a POS-tagged text, count the words matched by the VBP pattern
# (built from the 'vbp' tag).
#
# Returns nil when valid_text rejects the input. Otherwise returns a Hash
# whose default value is 0, mapping each matched word (with tags stripped and
# passed through stem) to the number of times it occurs.
def get_base_present_verbs(tagged)
  return nil unless valid_text(tagged)

  tagged.scan(VBP).each_with_object(Hash.new(0)) do |match, counts|
    word = stem(strip_tags(match))
    next unless word.length < 100 # sanity check on word length

    # Words that are empty or whitespace-only after stripping are not counted.
    counts[word] += 1 unless word =~ /\A\s*\z/
  end
end
408
+
409
# Given a POS-tagged text, count the words matched by the VBZ pattern
# (built from the 'vbz' tag).
#
# Returns nil when valid_text rejects the input. Otherwise returns a Hash
# whose default value is 0, mapping each matched word (with tags stripped and
# passed through stem) to the number of times it occurs.
def get_present_verbs(tagged)
  return nil unless valid_text(tagged)

  tagged.scan(VBZ).each_with_object(Hash.new(0)) do |match, counts|
    word = stem(strip_tags(match))
    next unless word.length < 100 # sanity check on word length

    # Words that are empty or whitespace-only after stripping are not counted.
    counts[word] += 1 unless word =~ /\A\s*\z/
  end
end
423
+
424
# Given a POS-tagged text, count the words matched by the JJ pattern
# (built from the 'jj' tag, i.e. adjectives).
#
# Returns nil when valid_text rejects the input. Otherwise returns a Hash
# whose default value is 0, mapping each matched word (with tags stripped and
# passed through stem) to the number of times it occurs.
def get_adjectives(tagged)
  return nil unless valid_text(tagged)

  tagged.scan(JJ).each_with_object(Hash.new(0)) do |match, counts|
    word = stem(strip_tags(match))
    next unless word.length < 100 # sanity check on word length

    # Words that are empty or whitespace-only after stripping are not counted.
    counts[word] += 1 unless word =~ /\A\s*\z/
  end
end
438
+
439
# Given a POS-tagged text, count the words matched by the JJR pattern
# (built from the 'jjr' tag, i.e. comparative adjectives).
#
# Returns nil when valid_text rejects the input. Otherwise returns a Hash
# whose default value is 0, mapping each matched word (with tags stripped and
# passed through stem) to the number of times it occurs.
def get_comparative_adjectives(tagged)
  return nil unless valid_text(tagged)

  tagged.scan(JJR).each_with_object(Hash.new(0)) do |match, counts|
    word = stem(strip_tags(match))
    next unless word.length < 100 # sanity check on word length

    # Words that are empty or whitespace-only after stripping are not counted.
    counts[word] += 1 unless word =~ /\A\s*\z/
  end
end
453
+
454
# Given a POS-tagged text, count the words matched by the JJS pattern
# (built from the 'jjs' tag, i.e. superlative adjectives).
#
# Returns nil when valid_text rejects the input. Otherwise returns a Hash
# whose default value is 0, mapping each matched word (with tags stripped and
# passed through stem) to the number of times it occurs.
def get_superlative_adjectives(tagged)
  return nil unless valid_text(tagged)

  tagged.scan(JJS).each_with_object(Hash.new(0)) do |match, counts|
    word = stem(strip_tags(match))
    next unless word.length < 100 # sanity check on word length

    # Words that are empty or whitespace-only after stripping are not counted.
    counts[word] += 1 unless word =~ /\A\s*\z/
  end
end
468
+
325
469
  # Given a POS-tagged text, this method returns only the maximal noun phrases.
326
470
  # May be called directly, but is also used by get_noun_phrases
327
471
  def get_max_noun_phrases(tagged)
Binary file
Binary file
@@ -1,3 +1,3 @@
1
1
  module EngTagger
2
- VERSION = "0.1.2"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -191,6 +191,11 @@ EOD
191
191
  text = ""
192
192
  assert(!@tagger.valid_text(text))
193
193
  end
194
+
195
# Params passed to EngTagger.new must show up in the tagger's conf.
def test_override_default_params
  @tagger = EngTagger.new(longest_noun_phrase: 3)
  assert_equal(3, @tagger.conf[:longest_noun_phrase])
end
194
199
  end
195
200
 
196
201
  # Number of errors detected: 24
metadata CHANGED
@@ -1,15 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: engtagger
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
5
- prerelease:
4
+ version: 0.2.0
6
5
  platform: ruby
7
6
  authors:
8
7
  - Yoichiro Hasebe
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2012-06-05 00:00:00.000000000 Z
11
+ date: 2014-04-20 00:00:00.000000000 Z
13
12
  dependencies: []
14
13
  description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
15
14
  tagger that assigns POS tags to English text based on a lookup dictionary and a
@@ -20,7 +19,7 @@ executables: []
20
19
  extensions: []
21
20
  extra_rdoc_files: []
22
21
  files:
23
- - .gitignore
22
+ - ".gitignore"
24
23
  - Gemfile
25
24
  - LICENSE
26
25
  - README.md
@@ -37,27 +36,26 @@ files:
37
36
  - test/test_engtagger.rb
38
37
  homepage: http://github.com/yohasebe/engtagger
39
38
  licenses: []
39
+ metadata: {}
40
40
  post_install_message:
41
41
  rdoc_options: []
42
42
  require_paths:
43
43
  - lib
44
44
  required_ruby_version: !ruby/object:Gem::Requirement
45
- none: false
46
45
  requirements:
47
- - - ! '>='
46
+ - - ">="
48
47
  - !ruby/object:Gem::Version
49
48
  version: '0'
50
49
  required_rubygems_version: !ruby/object:Gem::Requirement
51
- none: false
52
50
  requirements:
53
- - - ! '>='
51
+ - - ">="
54
52
  - !ruby/object:Gem::Version
55
53
  version: '0'
56
54
  requirements: []
57
55
  rubyforge_project:
58
- rubygems_version: 1.8.24
56
+ rubygems_version: 2.2.2
59
57
  signing_key:
60
- specification_version: 3
58
+ specification_version: 4
61
59
  summary: A probability based, corpus-trained English POS tagger
62
60
  test_files:
63
61
  - test/test_engtagger.rb