engtagger 0.1.2 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +15 -1
- data/lib/engtagger.rb +146 -2
- data/lib/engtagger/pos_tags.hash +0 -0
- data/lib/engtagger/pos_words.hash +0 -0
- data/lib/engtagger/version.rb +1 -1
- data/test/test_engtagger.rb +5 -0
- metadata +8 -10
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 47684458b8965c2c1f52d0d29ca2cad340b0ed3f
|
4
|
+
data.tar.gz: aa6dbd0473409e9b6b2987a42c126a6b10dce096
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: cddd67eab940146a2426032714aedd8e5195192ead3133ade5f76c594c5f0667f0747bba8b019f63df91dd2eb0610da19288a9e06a2323eb7bde91fec025b028
|
7
|
+
data.tar.gz: a76ca3422b9a3a1a813263b6e4ab5e69ca74ac998afcc259be55a6441b6b46b21ac9de6eb241327956a6fda1c2d3dbd033ba1678df86506f153089a3ef99d46d
|
data/README.md
CHANGED
@@ -54,7 +54,16 @@ of regular expressions.
|
|
54
54
|
proper = tgr.get_proper_nouns(tagged)
|
55
55
|
|
56
56
|
#=> {"Alice"=>1}
|
57
|
-
|
57
|
+
|
58
|
+
# Get all past tense verbs
|
59
|
+
pt_verbs = tgr.get_past_tense_verbs(tagged)
|
60
|
+
|
61
|
+
#=> {"chased"=>1}
|
62
|
+
|
63
|
+
# Get all the adjectives
|
64
|
+
adj = tgr.get_adjectives(tagged)
|
65
|
+
|
66
|
+
#=> {"big"=>1, "fat"=>1}
|
58
67
|
|
59
68
|
# Get all noun phrases of any syntactic level
|
60
69
|
# (same as word_list but take a tagged input)
|
@@ -126,6 +135,11 @@ of this Ruby library
|
|
126
135
|
|
127
136
|
* Yoichiro Hasebe (yohasebe [at] gmail.com)
|
128
137
|
|
138
|
+
### Contributors
|
139
|
+
|
140
|
+
* Carlos Ramirez III
|
141
|
+
* Phil London
|
142
|
+
|
129
143
|
### Acknowledgement
|
130
144
|
|
131
145
|
This Ruby library is a direct port of Lingua::EN::Tagger available at CPAN.
|
data/lib/engtagger.rb
CHANGED
@@ -59,7 +59,6 @@ class EngTagger
|
|
59
59
|
NUM = get_ext('cd')
|
60
60
|
GER = get_ext('vbg')
|
61
61
|
ADJ = get_ext('jj[rs]*')
|
62
|
-
PART = get_ext('vbn')
|
63
62
|
NN = get_ext('nn[sp]*')
|
64
63
|
NNP = get_ext('nnp')
|
65
64
|
PREP = get_ext('in')
|
@@ -68,6 +67,15 @@ class EngTagger
|
|
68
67
|
QUOT = get_ext('ppr')
|
69
68
|
SEN = get_ext('pp')
|
70
69
|
WORD = get_ext('\w+')
|
70
|
+
VB = get_ext('vb')
|
71
|
+
VBG = get_ext('vbg')
|
72
|
+
VBD = get_ext('vbd')
|
73
|
+
PART = get_ext('vbn')
|
74
|
+
VBP = get_ext('vbp')
|
75
|
+
VBZ = get_ext('vbz')
|
76
|
+
JJ = get_ext('jj')
|
77
|
+
JJR = get_ext('jjr')
|
78
|
+
JJS = get_ext('jjs')
|
71
79
|
|
72
80
|
# Convert a Treebank-style, abbreviated tag into verbose definitions
|
73
81
|
def self.explain_tag(tag)
|
@@ -183,7 +191,7 @@ class EngTagger
|
|
183
191
|
@conf[:debug] = false
|
184
192
|
# assuming that we start analyzing from the beginning of a new sentence...
|
185
193
|
@conf[:current_tag] = 'pp'
|
186
|
-
@conf.merge(params)
|
194
|
+
@conf.merge!(params)
|
187
195
|
unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
|
188
196
|
print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
|
189
197
|
@@hmm = Hash.new
|
@@ -321,7 +329,143 @@ class EngTagger
|
|
321
329
|
end
|
322
330
|
return ret
|
323
331
|
end
|
332
|
+
|
333
|
+
# Given a POS-tagged text, this method returns a hash of all the words
# matched by the VB pattern (tag 'vb') and their occurrence counts.
# Returns nil when the tagged text does not pass valid_text.
def get_infinitive_verbs(tagged)
  return nil unless valid_text(tagged)

  # Strip the markup from every match first, then count the results.
  words = tagged.scan(VB).map { |match| strip_tags(match) }
  words.each_with_object(Hash.new(0)) do |word, counts|
    stemmed = stem(word)
    next unless stemmed.length < 100 # sanity check on word length
    # Ignore entries that are empty or whitespace-only.
    counts[stemmed] += 1 unless stemmed =~ /\A\s*\z/
  end
end
|
347
|
+
|
348
|
+
# Given a POS-tagged text, this method returns a hash of all the words
# matched by the VBD pattern (tag 'vbd') and their occurrence counts.
# Returns nil when the tagged text does not pass valid_text.
def get_past_tense_verbs(tagged)
  return nil unless valid_text(tagged)

  # Strip the markup from every match first, then count the results.
  words = tagged.scan(VBD).map { |match| strip_tags(match) }
  words.each_with_object(Hash.new(0)) do |word, counts|
    stemmed = stem(word)
    next unless stemmed.length < 100 # sanity check on word length
    # Ignore entries that are empty or whitespace-only.
    counts[stemmed] += 1 unless stemmed =~ /\A\s*\z/
  end
end
|
362
|
+
|
363
|
+
# Given a POS-tagged text, this method returns a hash of all the words
# matched by the VBG pattern (tag 'vbg') and their occurrence counts.
# Returns nil when the tagged text does not pass valid_text.
def get_gerund_verbs(tagged)
  return nil unless valid_text(tagged)

  # Scan with VBG, not VB: the previous code scanned with the base-form
  # pattern and therefore never returned gerunds.
  words = tagged.scan(VBG).map { |match| strip_tags(match) }
  words.each_with_object(Hash.new(0)) do |word, counts|
    stemmed = stem(word)
    next unless stemmed.length < 100 # sanity check on word length
    # Ignore entries that are empty or whitespace-only.
    counts[stemmed] += 1 unless stemmed =~ /\A\s*\z/
  end
end
|
377
|
+
|
378
|
+
# Given a POS-tagged text, this method returns a hash of all the words
# matched by the PART pattern (tag 'vbn') and their occurrence counts.
# Returns nil when the tagged text does not pass valid_text.
def get_passive_verbs(tagged)
  return nil unless valid_text(tagged)

  # Strip the markup from every match first, then count the results.
  words = tagged.scan(PART).map { |match| strip_tags(match) }
  words.each_with_object(Hash.new(0)) do |word, counts|
    stemmed = stem(word)
    next unless stemmed.length < 100 # sanity check on word length
    # Ignore entries that are empty or whitespace-only.
    counts[stemmed] += 1 unless stemmed =~ /\A\s*\z/
  end
end
|
392
|
+
|
324
393
|
|
394
|
+
# Given a POS-tagged text, this method returns a hash of all the words
# matched by the VBP pattern (tag 'vbp') and their occurrence counts.
# Returns nil when the tagged text does not pass valid_text.
def get_base_present_verbs(tagged)
  return nil unless valid_text(tagged)

  # Strip the markup from every match first, then count the results.
  words = tagged.scan(VBP).map { |match| strip_tags(match) }
  words.each_with_object(Hash.new(0)) do |word, counts|
    stemmed = stem(word)
    next unless stemmed.length < 100 # sanity check on word length
    # Ignore entries that are empty or whitespace-only.
    counts[stemmed] += 1 unless stemmed =~ /\A\s*\z/
  end
end
|
408
|
+
|
409
|
+
# Given a POS-tagged text, this method returns a hash of all the words
# matched by the VBZ pattern (tag 'vbz') and their occurrence counts.
# Returns nil when the tagged text does not pass valid_text.
def get_present_verbs(tagged)
  return nil unless valid_text(tagged)

  # Strip the markup from every match first, then count the results.
  words = tagged.scan(VBZ).map { |match| strip_tags(match) }
  words.each_with_object(Hash.new(0)) do |word, counts|
    stemmed = stem(word)
    next unless stemmed.length < 100 # sanity check on word length
    # Ignore entries that are empty or whitespace-only.
    counts[stemmed] += 1 unless stemmed =~ /\A\s*\z/
  end
end
|
423
|
+
|
424
|
+
# Given a POS-tagged text, this method returns a hash of all the words
# matched by the JJ pattern (tag 'jj') and their occurrence counts.
# Returns nil when the tagged text does not pass valid_text.
def get_adjectives(tagged)
  return nil unless valid_text(tagged)

  # Strip the markup from every match first, then count the results.
  words = tagged.scan(JJ).map { |match| strip_tags(match) }
  words.each_with_object(Hash.new(0)) do |word, counts|
    stemmed = stem(word)
    next unless stemmed.length < 100 # sanity check on word length
    # Ignore entries that are empty or whitespace-only.
    counts[stemmed] += 1 unless stemmed =~ /\A\s*\z/
  end
end
|
438
|
+
|
439
|
+
# Given a POS-tagged text, this method returns a hash of all the words
# matched by the JJR pattern (tag 'jjr') and their occurrence counts.
# Returns nil when the tagged text does not pass valid_text.
def get_comparative_adjectives(tagged)
  return nil unless valid_text(tagged)

  # Strip the markup from every match first, then count the results.
  words = tagged.scan(JJR).map { |match| strip_tags(match) }
  words.each_with_object(Hash.new(0)) do |word, counts|
    stemmed = stem(word)
    next unless stemmed.length < 100 # sanity check on word length
    # Ignore entries that are empty or whitespace-only.
    counts[stemmed] += 1 unless stemmed =~ /\A\s*\z/
  end
end
|
453
|
+
|
454
|
+
# Given a POS-tagged text, this method returns a hash of all the words
# matched by the JJS pattern (tag 'jjs') and their occurrence counts.
# Returns nil when the tagged text does not pass valid_text.
def get_superlative_adjectives(tagged)
  return nil unless valid_text(tagged)

  # Strip the markup from every match first, then count the results.
  words = tagged.scan(JJS).map { |match| strip_tags(match) }
  words.each_with_object(Hash.new(0)) do |word, counts|
    stemmed = stem(word)
    next unless stemmed.length < 100 # sanity check on word length
    # Ignore entries that are empty or whitespace-only.
    counts[stemmed] += 1 unless stemmed =~ /\A\s*\z/
  end
end
|
468
|
+
|
325
469
|
# Given a POS-tagged text, this method returns only the maximal noun phrases.
|
326
470
|
# May be called directly, but is also used by get_noun_phrases
|
327
471
|
def get_max_noun_phrases(tagged)
|
data/lib/engtagger/pos_tags.hash
CHANGED
Binary file
|
Binary file
|
data/lib/engtagger/version.rb
CHANGED
data/test/test_engtagger.rb
CHANGED
@@ -191,6 +191,11 @@ EOD
|
|
191
191
|
text = ""
|
192
192
|
assert(!@tagger.valid_text(text))
|
193
193
|
end
|
194
|
+
|
195
|
+
# Checks that an option handed to EngTagger.new is reflected in the
# tagger's conf hash.
def test_override_default_params
  overrides = { :longest_noun_phrase => 3 }
  @tagger = EngTagger.new(overrides)
  assert_equal(3, @tagger.conf[:longest_noun_phrase])
end
|
194
199
|
end
|
195
200
|
|
196
201
|
# Number of errors detected: 24
|
metadata
CHANGED
@@ -1,15 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: engtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.2.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Yoichiro Hasebe
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-04-20 00:00:00.000000000 Z
|
13
12
|
dependencies: []
|
14
13
|
description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
|
15
14
|
tagger that assigns POS tags to English text based on a lookup dictionary and a
|
@@ -20,7 +19,7 @@ executables: []
|
|
20
19
|
extensions: []
|
21
20
|
extra_rdoc_files: []
|
22
21
|
files:
|
23
|
-
- .gitignore
|
22
|
+
- ".gitignore"
|
24
23
|
- Gemfile
|
25
24
|
- LICENSE
|
26
25
|
- README.md
|
@@ -37,27 +36,26 @@ files:
|
|
37
36
|
- test/test_engtagger.rb
|
38
37
|
homepage: http://github.com/yohasebe/engtagger
|
39
38
|
licenses: []
|
39
|
+
metadata: {}
|
40
40
|
post_install_message:
|
41
41
|
rdoc_options: []
|
42
42
|
require_paths:
|
43
43
|
- lib
|
44
44
|
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
-
none: false
|
46
45
|
requirements:
|
47
|
-
- -
|
46
|
+
- - ">="
|
48
47
|
- !ruby/object:Gem::Version
|
49
48
|
version: '0'
|
50
49
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
-
none: false
|
52
50
|
requirements:
|
53
|
-
- -
|
51
|
+
- - ">="
|
54
52
|
- !ruby/object:Gem::Version
|
55
53
|
version: '0'
|
56
54
|
requirements: []
|
57
55
|
rubyforge_project:
|
58
|
-
rubygems_version:
|
56
|
+
rubygems_version: 2.2.2
|
59
57
|
signing_key:
|
60
|
-
specification_version:
|
58
|
+
specification_version: 4
|
61
59
|
summary: A probability based, corpus-trained English POS tagger
|
62
60
|
test_files:
|
63
61
|
- test/test_engtagger.rb
|