engtagger 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +15 -1
- data/lib/engtagger.rb +146 -2
- data/lib/engtagger/pos_tags.hash +0 -0
- data/lib/engtagger/pos_words.hash +0 -0
- data/lib/engtagger/version.rb +1 -1
- data/test/test_engtagger.rb +5 -0
- metadata +8 -10
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 47684458b8965c2c1f52d0d29ca2cad340b0ed3f
|
4
|
+
data.tar.gz: aa6dbd0473409e9b6b2987a42c126a6b10dce096
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: cddd67eab940146a2426032714aedd8e5195192ead3133ade5f76c594c5f0667f0747bba8b019f63df91dd2eb0610da19288a9e06a2323eb7bde91fec025b028
|
7
|
+
data.tar.gz: a76ca3422b9a3a1a813263b6e4ab5e69ca74ac998afcc259be55a6441b6b46b21ac9de6eb241327956a6fda1c2d3dbd033ba1678df86506f153089a3ef99d46d
|
data/README.md
CHANGED
@@ -54,7 +54,16 @@ of regular expressions.
|
|
54
54
|
proper = tgr.get_proper_nouns(tagged)
|
55
55
|
|
56
56
|
#=> {"Alice"=>1}
|
57
|
-
|
57
|
+
|
58
|
+
# Get all past tense verbs
|
59
|
+
pt_verbs = tgr.get_past_tense_verbs(tagged)
|
60
|
+
|
61
|
+
#=> {"chased"=>1}
|
62
|
+
|
63
|
+
# Get all the adjectives
|
64
|
+
adj = tgr.get_adjectives(tagged)
|
65
|
+
|
66
|
+
#=> {"big"=>1, "fat"=>1}
|
58
67
|
|
59
68
|
# Get all noun phrases of any syntactic level
|
60
69
|
# (same as word_list but take a tagged input)
|
@@ -126,6 +135,11 @@ of this Ruby library
|
|
126
135
|
|
127
136
|
* Yoichiro Hasebe (yohasebe [at] gmail.com)
|
128
137
|
|
138
|
+
### Contributors
|
139
|
+
|
140
|
+
* Carlos Ramirez III
|
141
|
+
* Phil London
|
142
|
+
|
129
143
|
### Acknowledgement
|
130
144
|
|
131
145
|
This Ruby library is a direct port of Lingua::EN::Tagger available at CPAN.
|
data/lib/engtagger.rb
CHANGED
@@ -59,7 +59,6 @@ class EngTagger
|
|
59
59
|
NUM = get_ext('cd')
|
60
60
|
GER = get_ext('vbg')
|
61
61
|
ADJ = get_ext('jj[rs]*')
|
62
|
-
PART = get_ext('vbn')
|
63
62
|
NN = get_ext('nn[sp]*')
|
64
63
|
NNP = get_ext('nnp')
|
65
64
|
PREP = get_ext('in')
|
@@ -68,6 +67,15 @@ class EngTagger
|
|
68
67
|
QUOT = get_ext('ppr')
|
69
68
|
SEN = get_ext('pp')
|
70
69
|
WORD = get_ext('\w+')
|
70
|
+
VB = get_ext('vb')
|
71
|
+
VBG = get_ext('vbg')
|
72
|
+
VBD = get_ext('vbd')
|
73
|
+
PART = get_ext('vbn')
|
74
|
+
VBP = get_ext('vbp')
|
75
|
+
VBZ = get_ext('vbz')
|
76
|
+
JJ = get_ext('jj')
|
77
|
+
JJR = get_ext('jjr')
|
78
|
+
JJS = get_ext('jjs')
|
71
79
|
|
72
80
|
# Convert a Treebank-style, abbreviated tag into verbose definitions
|
73
81
|
def self.explain_tag(tag)
|
@@ -183,7 +191,7 @@ class EngTagger
|
|
183
191
|
@conf[:debug] = false
|
184
192
|
# assuming that we start analyzing from the beginninga new sentence...
|
185
193
|
@conf[:current_tag] = 'pp'
|
186
|
-
@conf.merge(params)
|
194
|
+
@conf.merge!(params)
|
187
195
|
unless File.exists?(@conf[:word_path]) and File.exists?(@conf[:tag_path])
|
188
196
|
print "Couldn't locate POS lexicon, creating a new one" if @conf[:debug]
|
189
197
|
@@hmm = Hash.new
|
@@ -321,7 +329,143 @@ class EngTagger
|
|
321
329
|
end
|
322
330
|
return ret
|
323
331
|
end
|
332
|
+
|
333
|
+
def get_infinitive_verbs(tagged)
|
334
|
+
return nil unless valid_text(tagged)
|
335
|
+
VB
|
336
|
+
trimmed = tagged.scan(VB).map do |n|
|
337
|
+
strip_tags(n)
|
338
|
+
end
|
339
|
+
ret = Hash.new(0)
|
340
|
+
trimmed.each do |n|
|
341
|
+
n = stem(n)
|
342
|
+
next unless n.length < 100 # sanity check on word length
|
343
|
+
ret[n] += 1 unless n =~ /\A\s*\z/
|
344
|
+
end
|
345
|
+
return ret
|
346
|
+
end
|
347
|
+
|
348
|
+
def get_past_tense_verbs(tagged)
|
349
|
+
return nil unless valid_text(tagged)
|
350
|
+
VBD
|
351
|
+
trimmed = tagged.scan(VBD).map do |n|
|
352
|
+
strip_tags(n)
|
353
|
+
end
|
354
|
+
ret = Hash.new(0)
|
355
|
+
trimmed.each do |n|
|
356
|
+
n = stem(n)
|
357
|
+
next unless n.length < 100 # sanity check on word length
|
358
|
+
ret[n] += 1 unless n =~ /\A\s*\z/
|
359
|
+
end
|
360
|
+
return ret
|
361
|
+
end
|
362
|
+
|
363
|
+
def get_gerund_verbs(tagged)
|
364
|
+
return nil unless valid_text(tagged)
|
365
|
+
VBG
|
366
|
+
trimmed = tagged.scan(VB).map do |n|
|
367
|
+
strip_tags(n)
|
368
|
+
end
|
369
|
+
ret = Hash.new(0)
|
370
|
+
trimmed.each do |n|
|
371
|
+
n = stem(n)
|
372
|
+
next unless n.length < 100 # sanity check on word length
|
373
|
+
ret[n] += 1 unless n =~ /\A\s*\z/
|
374
|
+
end
|
375
|
+
return ret
|
376
|
+
end
|
377
|
+
|
378
|
+
def get_passive_verbs(tagged)
|
379
|
+
return nil unless valid_text(tagged)
|
380
|
+
PART
|
381
|
+
trimmed = tagged.scan(PART).map do |n|
|
382
|
+
strip_tags(n)
|
383
|
+
end
|
384
|
+
ret = Hash.new(0)
|
385
|
+
trimmed.each do |n|
|
386
|
+
n = stem(n)
|
387
|
+
next unless n.length < 100 # sanity check on word length
|
388
|
+
ret[n] += 1 unless n =~ /\A\s*\z/
|
389
|
+
end
|
390
|
+
return ret
|
391
|
+
end
|
392
|
+
|
324
393
|
|
394
|
+
def get_base_present_verbs(tagged)
|
395
|
+
return nil unless valid_text(tagged)
|
396
|
+
VBP
|
397
|
+
trimmed = tagged.scan(VBP).map do |n|
|
398
|
+
strip_tags(n)
|
399
|
+
end
|
400
|
+
ret = Hash.new(0)
|
401
|
+
trimmed.each do |n|
|
402
|
+
n = stem(n)
|
403
|
+
next unless n.length < 100 # sanity check on word length
|
404
|
+
ret[n] += 1 unless n =~ /\A\s*\z/
|
405
|
+
end
|
406
|
+
return ret
|
407
|
+
end
|
408
|
+
|
409
|
+
def get_present_verbs(tagged)
|
410
|
+
return nil unless valid_text(tagged)
|
411
|
+
VBZ
|
412
|
+
trimmed = tagged.scan(VBZ).map do |n|
|
413
|
+
strip_tags(n)
|
414
|
+
end
|
415
|
+
ret = Hash.new(0)
|
416
|
+
trimmed.each do |n|
|
417
|
+
n = stem(n)
|
418
|
+
next unless n.length < 100 # sanity check on word length
|
419
|
+
ret[n] += 1 unless n =~ /\A\s*\z/
|
420
|
+
end
|
421
|
+
return ret
|
422
|
+
end
|
423
|
+
|
424
|
+
def get_adjectives(tagged)
|
425
|
+
return nil unless valid_text(tagged)
|
426
|
+
JJ
|
427
|
+
trimmed = tagged.scan(JJ).map do |n|
|
428
|
+
strip_tags(n)
|
429
|
+
end
|
430
|
+
ret = Hash.new(0)
|
431
|
+
trimmed.each do |n|
|
432
|
+
n = stem(n)
|
433
|
+
next unless n.length < 100 # sanity check on word length
|
434
|
+
ret[n] += 1 unless n =~ /\A\s*\z/
|
435
|
+
end
|
436
|
+
return ret
|
437
|
+
end
|
438
|
+
|
439
|
+
def get_comparative_adjectives(tagged)
|
440
|
+
return nil unless valid_text(tagged)
|
441
|
+
JJR
|
442
|
+
trimmed = tagged.scan(JJR).map do |n|
|
443
|
+
strip_tags(n)
|
444
|
+
end
|
445
|
+
ret = Hash.new(0)
|
446
|
+
trimmed.each do |n|
|
447
|
+
n = stem(n)
|
448
|
+
next unless n.length < 100 # sanity check on word length
|
449
|
+
ret[n] += 1 unless n =~ /\A\s*\z/
|
450
|
+
end
|
451
|
+
return ret
|
452
|
+
end
|
453
|
+
|
454
|
+
def get_superlative_adjectives(tagged)
|
455
|
+
return nil unless valid_text(tagged)
|
456
|
+
JJS
|
457
|
+
trimmed = tagged.scan(JJS).map do |n|
|
458
|
+
strip_tags(n)
|
459
|
+
end
|
460
|
+
ret = Hash.new(0)
|
461
|
+
trimmed.each do |n|
|
462
|
+
n = stem(n)
|
463
|
+
next unless n.length < 100 # sanity check on word length
|
464
|
+
ret[n] += 1 unless n =~ /\A\s*\z/
|
465
|
+
end
|
466
|
+
return ret
|
467
|
+
end
|
468
|
+
|
325
469
|
# Given a POS-tagged text, this method returns only the maximal noun phrases.
|
326
470
|
# May be called directly, but is also used by get_noun_phrases
|
327
471
|
def get_max_noun_phrases(tagged)
|
data/lib/engtagger/pos_tags.hash
CHANGED
Binary file
|
Binary file
|
data/lib/engtagger/version.rb
CHANGED
data/test/test_engtagger.rb
CHANGED
@@ -191,6 +191,11 @@ EOD
|
|
191
191
|
text = ""
|
192
192
|
assert(!@tagger.valid_text(text))
|
193
193
|
end
|
194
|
+
|
195
|
+
def test_override_default_params
|
196
|
+
@tagger = EngTagger.new(:longest_noun_phrase => 3)
|
197
|
+
assert_equal 3, @tagger.conf[:longest_noun_phrase]
|
198
|
+
end
|
194
199
|
end
|
195
200
|
|
196
201
|
# Number of errors detected: 24
|
metadata
CHANGED
@@ -1,15 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: engtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
5
|
-
prerelease:
|
4
|
+
version: 0.2.0
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Yoichiro Hasebe
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2014-04-20 00:00:00.000000000 Z
|
13
12
|
dependencies: []
|
14
13
|
description: A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
|
15
14
|
tagger that assigns POS tags to English text based on a lookup dictionary and a
|
@@ -20,7 +19,7 @@ executables: []
|
|
20
19
|
extensions: []
|
21
20
|
extra_rdoc_files: []
|
22
21
|
files:
|
23
|
-
- .gitignore
|
22
|
+
- ".gitignore"
|
24
23
|
- Gemfile
|
25
24
|
- LICENSE
|
26
25
|
- README.md
|
@@ -37,27 +36,26 @@ files:
|
|
37
36
|
- test/test_engtagger.rb
|
38
37
|
homepage: http://github.com/yohasebe/engtagger
|
39
38
|
licenses: []
|
39
|
+
metadata: {}
|
40
40
|
post_install_message:
|
41
41
|
rdoc_options: []
|
42
42
|
require_paths:
|
43
43
|
- lib
|
44
44
|
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
-
none: false
|
46
45
|
requirements:
|
47
|
-
- -
|
46
|
+
- - ">="
|
48
47
|
- !ruby/object:Gem::Version
|
49
48
|
version: '0'
|
50
49
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
51
|
-
none: false
|
52
50
|
requirements:
|
53
|
-
- -
|
51
|
+
- - ">="
|
54
52
|
- !ruby/object:Gem::Version
|
55
53
|
version: '0'
|
56
54
|
requirements: []
|
57
55
|
rubyforge_project:
|
58
|
-
rubygems_version:
|
56
|
+
rubygems_version: 2.2.2
|
59
57
|
signing_key:
|
60
|
-
specification_version:
|
58
|
+
specification_version: 4
|
61
59
|
summary: A probability based, corpus-trained English POS tagger
|
62
60
|
test_files:
|
63
61
|
- test/test_engtagger.rb
|