engtagger 0.1.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +17 -0
- data/Gemfile +4 -0
- data/{LICENSE.txt → LICENSE} +22 -0
- data/README.md +152 -0
- data/Rakefile +2 -24
- data/engtagger.gemspec +17 -0
- data/lib/engtagger/porter.rb +2 -6
- data/lib/engtagger/pos_tags.hash +0 -0
- data/lib/engtagger/pos_words.hash +0 -0
- data/lib/engtagger/version.rb +3 -0
- data/lib/engtagger.rb +831 -729
- data/test/test_engtagger.rb +39 -2
- metadata +39 -65
- data/History.txt +0 -10
- data/Manifest.txt +0 -13
- data/README.txt +0 -140
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 65db6f78c5abff2e601841262e2aabcf936d80df76a3fecab33f4e9d730e02f5
|
4
|
+
data.tar.gz: '04896cc7bfeb84c9f720d8493f08d4b5537dcb9a177f1584d621dad6fbd1184b'
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 04c8169ba6706cffc2afc02f93c8d1916afdb08e8f99d7e5705c645efcf52fcba08edd593079fc267091582f779cf48db92ab3c9852cbb06e494f754c5f18b94
|
7
|
+
data.tar.gz: 70ad3a969bb095f72804917bf1cf906d81466b9b25dba8a7c3616ab12f8500f0a490e08af8ba4bcbbff58cdb9f04fd3cc06b2317dfbe57643bb11adaeb94f946
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/{LICENSE.txt → LICENSE}
RENAMED
@@ -338,3 +338,25 @@ proprietary programs. If your program is a subroutine library, you may
|
|
338
338
|
consider it more useful to permit linking proprietary applications with the
|
339
339
|
library. If this is what you want to do, use the GNU Library General
|
340
340
|
Public License instead of this License.
|
341
|
+
Copyright (c) 2012 Yoichiro Hasebe
|
342
|
+
|
343
|
+
MIT License
|
344
|
+
|
345
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
346
|
+
a copy of this software and associated documentation files (the
|
347
|
+
"Software"), to deal in the Software without restriction, including
|
348
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
349
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
350
|
+
permit persons to whom the Software is furnished to do so, subject to
|
351
|
+
the following conditions:
|
352
|
+
|
353
|
+
The above copyright notice and this permission notice shall be
|
354
|
+
included in all copies or substantial portions of the Software.
|
355
|
+
|
356
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
357
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
358
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
359
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
360
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
361
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
362
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,152 @@
|
|
1
|
+
# EngTagger
|
2
|
+
|
3
|
+
English Part-of-Speech Tagger Library; a Ruby port of Lingua::EN::Tagger
|
4
|
+
|
5
|
+
### Description
|
6
|
+
|
7
|
+
A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained
|
8
|
+
tagger that assigns POS tags to English text based on a lookup dictionary and
|
9
|
+
a set of probability values. The tagger assigns appropriate tags based on
|
10
|
+
conditional probabilities--it examines the preceding tag to determine the
|
11
|
+
appropriate tag for the current word. Unknown words are classified according to
|
12
|
+
word morphology or can be set to be treated as nouns or other parts of speech.
|
13
|
+
The tagger also extracts as many nouns and noun phrases as it can, using a set
|
14
|
+
of regular expressions.
|
15
|
+
|
16
|
+
### Features
|
17
|
+
|
18
|
+
* Assigns POS tags to English text
|
19
|
+
* Extracts noun phrases from tagged text
|
20
|
+
* etc.
|
21
|
+
|
22
|
+
### Synopsis:
|
23
|
+
|
24
|
+
require 'rubygems'
|
25
|
+
require 'engtagger'
|
26
|
+
|
27
|
+
# Create a parser object
|
28
|
+
tgr = EngTagger.new
|
29
|
+
|
30
|
+
# Sample text
|
31
|
+
text = "Alice chased the big fat cat."
|
32
|
+
|
33
|
+
# Add part-of-speech tags to text
|
34
|
+
tagged = tgr.add_tags(text)
|
35
|
+
|
36
|
+
#=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj> <nn>cat</nn> <pp>.</pp>"
|
37
|
+
|
38
|
+
# Get a list of all nouns and noun phrases with occurrence counts
|
39
|
+
word_list = tgr.get_words(text)
|
40
|
+
|
41
|
+
#=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
|
42
|
+
|
43
|
+
# Get a readable version of the tagged text
|
44
|
+
readable = tgr.get_readable(text)
|
45
|
+
|
46
|
+
#=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
|
47
|
+
|
48
|
+
# Get all nouns from a tagged output
|
49
|
+
nouns = tgr.get_nouns(tagged)
|
50
|
+
|
51
|
+
#=> {"cat"=>1, "Alice"=>1}
|
52
|
+
|
53
|
+
# Get all proper nouns
|
54
|
+
proper = tgr.get_proper_nouns(tagged)
|
55
|
+
|
56
|
+
#=> {"Alice"=>1}
|
57
|
+
|
58
|
+
# Get all past tense verbs
|
59
|
+
pt_verbs = tgr.get_past_tense_verbs(tagged)
|
60
|
+
|
61
|
+
#=> {"chased"=>1}
|
62
|
+
|
63
|
+
# Get all the adjectives
|
64
|
+
adj = tgr.get_adjectives(tagged)
|
65
|
+
|
66
|
+
#=> {"big"=>1, "fat"=>1}
|
67
|
+
|
68
|
+
# Get all noun phrases of any syntactic level
|
69
|
+
# (same as word_list, but takes tagged input)
|
70
|
+
nps = tgr.get_noun_phrases(tagged)
|
71
|
+
|
72
|
+
#=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
|
73
|
+
|
74
|
+
### Tag Set
|
75
|
+
|
76
|
+
The set of POS tags used here is a modified version of the Penn Treebank tagset. Tags with non-letter characters have been redefined to work better in our data structures. Also, the "Determiner" tag (DET) has been changed from 'DT', in order to avoid confusion with the HTML tag, `<DT>`.
|
77
|
+
|
78
|
+
CC Conjunction, coordinating and, or
|
79
|
+
CD Adjective, cardinal number 3, fifteen
|
80
|
+
DET Determiner this, each, some
|
81
|
+
EX Pronoun, existential there there
|
82
|
+
FW Foreign words
|
83
|
+
IN Preposition / Conjunction for, of, although, that
|
84
|
+
JJ Adjective happy, bad
|
85
|
+
JJR Adjective, comparative happier, worse
|
86
|
+
JJS Adjective, superlative happiest, worst
|
87
|
+
LS Symbol, list item A, A.
|
88
|
+
MD Verb, modal can, could, 'll
|
89
|
+
NN Noun aircraft, data
|
90
|
+
NNP Noun, proper London, Michael
|
91
|
+
NNPS Noun, proper, plural Australians, Methodists
|
92
|
+
NNS Noun, plural women, books
|
93
|
+
PDT Determiner, prequalifier quite, all, half
|
94
|
+
POS Possessive 's, '
|
95
|
+
PRP Determiner, possessive second mine, yours
|
96
|
+
PRPS Determiner, possessive their, your
|
97
|
+
RB Adverb often, not, very, here
|
98
|
+
RBR Adverb, comparative faster
|
99
|
+
RBS Adverb, superlative fastest
|
100
|
+
RP Adverb, particle up, off, out
|
101
|
+
SYM Symbol *
|
102
|
+
TO Preposition to
|
103
|
+
UH Interjection oh, yes, mmm
|
104
|
+
VB Verb, infinitive take, live
|
105
|
+
VBD Verb, past tense took, lived
|
106
|
+
VBG Verb, gerund taking, living
|
107
|
+
VBN Verb, past/passive participle taken, lived
|
108
|
+
VBP Verb, base present form take, live
|
109
|
+
VBZ Verb, present 3SG -s form takes, lives
|
110
|
+
WDT Determiner, question which, whatever
|
111
|
+
WP Pronoun, question who, whoever
|
112
|
+
WPS Determiner, possessive & question whose
|
113
|
+
WRB Adverb, question when, how, however
|
114
|
+
|
115
|
+
PP Punctuation, sentence ender ., !, ?
|
116
|
+
PPC Punctuation, comma ,
|
117
|
+
PPD Punctuation, dollar sign $
|
118
|
+
PPL Punctuation, quotation mark left ``
|
119
|
+
PPR Punctuation, quotation mark right ''
|
120
|
+
PPS Punctuation, colon, semicolon, ellipsis :, ..., -
|
121
|
+
LRB Punctuation, left bracket (, {, [
|
122
|
+
RRB Punctuation, right bracket ), }, ]
|
123
|
+
|
124
|
+
### Requirements
|
125
|
+
|
126
|
+
* [Hpricot](http://code.whytheluckystiff.net/hpricot/) (optional)
|
127
|
+
|
128
|
+
### Install
|
129
|
+
|
130
|
+
(sudo) gem install engtagger
|
131
|
+
|
132
|
+
### Author
|
133
|
+
|
134
|
+
of this Ruby library
|
135
|
+
|
136
|
+
* Yoichiro Hasebe (yohasebe [at] gmail.com)
|
137
|
+
|
138
|
+
### Contributors
|
139
|
+
|
140
|
+
* Carlos Ramirez III
|
141
|
+
* Phil London
|
142
|
+
* Bazay (Baron Bloomer)
|
143
|
+
|
144
|
+
### Acknowledgement
|
145
|
+
|
146
|
+
This Ruby library is a direct port of Lingua::EN::Tagger available at CPAN.
|
147
|
+
The credit for the crucial part of its algorithm/design therefore goes to
|
148
|
+
Aaron Coburn, the author of the original Perl version.
|
149
|
+
|
150
|
+
### License
|
151
|
+
|
152
|
+
This library is distributed under the GPL. Please see the LICENSE file.
|
data/Rakefile
CHANGED
@@ -1,24 +1,2 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
require 'rubygems'
|
4
|
-
require 'hoe'
|
5
|
-
require './lib/engtagger.rb'
|
6
|
-
|
7
|
-
Hoe.new('EngTagger', EngTagger::VERSION) do |p|
|
8
|
-
p.name = "engtagger"
|
9
|
-
p.author = "Yoichiro Hasebe"
|
10
|
-
p.description = p.paragraphs_of('README.txt', 3).join("\n\n")
|
11
|
-
p.email = 'yohasebe@gmail.com'
|
12
|
-
p.summary = p.paragraphs_of('README.txt', 1).join("\n\n")
|
13
|
-
p.url = "http://engtagger.rubyforge.org"
|
14
|
-
p.remote_rdoc_dir = '' # Release to root
|
15
|
-
p.changes = p.paragraphs_of('History.txt', 0..1).join("\n\n")
|
16
|
-
p.extra_deps << ['hpricot']
|
17
|
-
p.rdoc_pattern = /^(.+\.rb|.+\.txt|.+\.yaml|[^\.]+)$/
|
18
|
-
p.need_zip = true
|
19
|
-
end
|
20
|
-
|
21
|
-
desc "Release and publish documentation"
|
22
|
-
task :repubdoc => [:release, :publish_docs]
|
23
|
-
|
24
|
-
# vim: syntax=Ruby
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
data/engtagger.gemspec
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/engtagger/version', __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |gem|
|
5
|
+
gem.authors = ["Yoichiro Hasebe"]
|
6
|
+
gem.email = ["yohasebe@gmail.com"]
|
7
|
+
gem.summary = %q{A probability based, corpus-trained English POS tagger}
|
8
|
+
gem.description = %q{A Ruby port of Perl Lingua::EN::Tagger, a probability based, corpus-trained tagger that assigns POS tags to English text based on a lookup dictionary and a set of probability values.}
|
9
|
+
gem.homepage = "http://github.com/yohasebe/engtagger"
|
10
|
+
|
11
|
+
gem.files = `git ls-files`.split($\)
|
12
|
+
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
13
|
+
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
14
|
+
gem.name = "engtagger"
|
15
|
+
gem.require_paths = ["lib"]
|
16
|
+
gem.version = EngTagger::VERSION
|
17
|
+
end
|
data/lib/engtagger/porter.rb
CHANGED
data/lib/engtagger/pos_tags.hash
CHANGED
Binary file
|
Binary file
|