engtagger 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/README.txt +75 -5
- data/lib/engtagger.rb +3 -3
- metadata +3 -3
data/History.txt
CHANGED
data/README.txt
CHANGED
@@ -21,27 +21,97 @@ of regular expressions.
|
|
21
21
|
|
22
22
|
=== Synopsis:
|
23
23
|
|
24
|
+
require 'rubygems'
|
25
|
+
require 'engtagger'
|
26
|
+
|
24
27
|
# Create a parser object
|
25
|
-
tgr =
|
28
|
+
tgr = EngTagger.new
|
26
29
|
|
30
|
+
# Sample text
|
31
|
+
text = "Alice chased the big fat cat."
|
32
|
+
|
27
33
|
# Add part-of-speech tags to text
|
28
34
|
tagged = tgr.add_tags(text)
|
35
|
+
|
36
|
+
#=> "<nnp>Alice</nnp> <vbd>chased</vbd> <det>the</det> <jj>big</jj> <jj>fat</jj> <nn>cat</nn> <pp>.</pp>"
|
29
37
|
|
30
38
|
# Get a list of all nouns and noun phrases with occurrence counts
|
31
39
|
word_list = tgr.get_words(text)
|
40
|
+
|
41
|
+
#=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
|
32
42
|
|
33
43
|
# Get a readable version of the tagged text
|
34
|
-
|
44
|
+
readable = tgr.get_readable(text)
|
35
45
|
|
46
|
+
#=> "Alice/NNP chased/VBD the/DET big/JJ fat/JJ cat/NN ./PP"
|
47
|
+
|
36
48
|
# Get all nouns from a tagged output
|
37
|
-
|
49
|
+
nouns = tgr.get_nouns(tagged)
|
50
|
+
|
51
|
+
#=> {"cat"=>1, "Alice"=>1}
|
38
52
|
|
39
53
|
# Get all proper nouns
|
40
|
-
|
54
|
+
proper = tgr.get_proper_nouns(tagged)
|
55
|
+
|
56
|
+
#=> {"Alice"=>1}
|
41
57
|
|
58
|
+
|
42
59
|
# Get all noun phrases of any syntactic level
|
60
|
+
# (same as word_list, but takes tagged input)
|
43
61
|
nps = tgr.get_noun_phrases(tagged)
|
44
62
|
|
63
|
+
#=> {"Alice"=>1, "cat"=>1, "fat cat"=>1, "big fat cat"=>1}
|
64
|
+
|
65
|
+
=== Tag Set
|
66
|
+
|
67
|
+
The set of POS tags used here is a modified version of the Penn Treebank tagset. Tags with non-letter characters have been redefined to work better in our data structures. Also, the "Determiner" tag (DET) has been changed from 'DT', in order to avoid confusion with the HTML tag, <DT>.
|
68
|
+
|
69
|
+
CC Conjunction, coordinating and, or
|
70
|
+
CD Adjective, cardinal number 3, fifteen
|
71
|
+
DET Determiner this, each, some
|
72
|
+
EX Pronoun, existential there there
|
73
|
+
FW Foreign words
|
74
|
+
IN Preposition / Conjunction for, of, although, that
|
75
|
+
JJ Adjective happy, bad
|
76
|
+
JJR Adjective, comparative happier, worse
|
77
|
+
JJS Adjective, superlative happiest, worst
|
78
|
+
LS Symbol, list item A, A.
|
79
|
+
MD Verb, modal can, could, 'll
|
80
|
+
NN Noun aircraft, data
|
81
|
+
NNP Noun, proper London, Michael
|
82
|
+
NNPS Noun, proper, plural Australians, Methodists
|
83
|
+
NNS Noun, plural women, books
|
84
|
+
PDT Determiner, prequalifier quite, all, half
|
85
|
+
POS Possessive 's, '
|
86
|
+
PRP Determiner, possessive second mine, yours
|
87
|
+
PRPS Determiner, possessive their, your
|
88
|
+
RB Adverb often, not, very, here
|
89
|
+
RBR Adverb, comparative faster
|
90
|
+
RBS Adverb, superlative fastest
|
91
|
+
RP Adverb, particle up, off, out
|
92
|
+
SYM Symbol *
|
93
|
+
TO Preposition to
|
94
|
+
UH Interjection oh, yes, mmm
|
95
|
+
VB Verb, infinitive take, live
|
96
|
+
VBD Verb, past tense took, lived
|
97
|
+
VBG Verb, gerund taking, living
|
98
|
+
VBN Verb, past/passive participle taken, lived
|
99
|
+
VBP Verb, base present form take, live
|
100
|
+
VBZ Verb, present 3SG -s form takes, lives
|
101
|
+
WDT Determiner, question which, whatever
|
102
|
+
WP Pronoun, question who, whoever
|
103
|
+
WPS Determiner, possessive & question whose
|
104
|
+
WRB Adverb, question when, how, however
|
105
|
+
|
106
|
+
PP Punctuation, sentence ender ., !, ?
|
107
|
+
PPC Punctuation, comma ,
|
108
|
+
PPD Punctuation, dollar sign $
|
109
|
+
PPL Punctuation, quotation mark left ``
|
110
|
+
PPR Punctuation, quotation mark right ''
|
111
|
+
PPS Punctuation, colon, semicolon, ellipsis :, ..., -
|
112
|
+
LRB Punctuation, left bracket (, {, [
|
113
|
+
RRB Punctuation, right bracket ), }, ]
|
114
|
+
|
45
115
|
=== Requirements
|
46
116
|
|
47
117
|
* Ruby 1.8.6
|
@@ -51,7 +121,7 @@ of regular expressions.
|
|
51
121
|
|
52
122
|
(sudo) gem install engtagger
|
53
123
|
|
54
|
-
===
|
124
|
+
=== Author
|
55
125
|
|
56
126
|
of this Ruby library
|
57
127
|
* Yoichiro Hasebe (yohasebe [at] gmail.com)
|
data/lib/engtagger.rb
CHANGED
@@ -32,7 +32,7 @@ end
|
|
32
32
|
|
33
33
|
# English part-of-speech tagger class
|
34
34
|
class EngTagger
|
35
|
-
VERSION = '0.1.
|
35
|
+
VERSION = '0.1.1'
|
36
36
|
|
37
37
|
#################
|
38
38
|
# Class methods #
|
@@ -116,7 +116,7 @@ class EngTagger
|
|
116
116
|
"WP", "Pronoun, question",
|
117
117
|
"WPS", "Determiner, possessive & question",
|
118
118
|
"WRB", "Adverb, question",
|
119
|
-
"PP",
|
119
|
+
"PP", "Punctuation, sentence ender",
|
120
120
|
"PPC", "Punctuation, comma",
|
121
121
|
"PPD", "Punctuation, dollar sign",
|
122
122
|
"PPL", "Punctuation, quotation mark left",
|
@@ -212,7 +212,7 @@ class EngTagger
|
|
212
212
|
cleaned_word = clean_word(word)
|
213
213
|
tag = assign_tag(@conf[:current_tag], cleaned_word)
|
214
214
|
@conf[:current_tag] = tag = (tag and tag != "") ? tag : 'nn'
|
215
|
-
tag = explain_tag(tag) if verbose
|
215
|
+
tag = EngTagger.explain_tag(tag) if verbose
|
216
216
|
tagged << '<' + tag + '>' + word + '</' + tag + '>'
|
217
217
|
end
|
218
218
|
reset
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: engtagger
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yoichiro Hasebe
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-05-
|
12
|
+
date: 2008-05-15 00:00:00 +09:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -78,7 +78,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
78
78
|
requirements: []
|
79
79
|
|
80
80
|
rubyforge_project: engtagger
|
81
|
-
rubygems_version: 1.1.
|
81
|
+
rubygems_version: 1.1.1
|
82
82
|
signing_key:
|
83
83
|
specification_version: 2
|
84
84
|
summary: English Part-of-Speech Tagger Library; a Ruby port of Lingua::EN::Tagger
|