part_of_speech 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +28 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/corpus/lexicon.txt +93696 -0
- data/lib/part_of_speech.rb +131 -0
- data/part_of_speech.gemspec +56 -0
- data/spec/part_of_speech_spec.rb +10 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- metadata +77 -0
@@ -0,0 +1,131 @@
|
|
1
|
+
# Rule-based part-of-speech tagger.
#
# Each word is first given the first tag the lexicon lists for it ("NN" when
# the word is unknown). Nine transformation rules are then applied, in order,
# to every position; they rewrite a tag based on the word's suffix or on the
# word/tag immediately to its left.
class PartOfSpeech
  # Characters on which input text is split into words
  # (whitespace . , : ; and the apostrophe).
  TOKEN_SEPARATOR = /[\s.,:;']/

  # Transformation rules, in the order they are applied at each position.
  RULES = [
    :rule_one, :rule_two, :rule_three, :rule_four, :rule_five,
    :rule_six, :rule_seven, :rule_eight, :rule_nine
  ].freeze

  class << self
    # Convenience entry point: tags +text+ with a freshly built tagger.
    # Note that a new tagger re-reads the lexicon file on every call.
    #
    # Returns an Array of [word, tag] pairs (see #tag).
    def analyze(text)
      new.tag(text)
    end
  end

  # Loads the lexicon into memory as { word => [tag, ...] }.
  #
  # Each lexicon line is whitespace-separated: the first field is the word,
  # the remaining fields are its tags.
  def initialize
    @lexicons = {}
    # File.foreach closes the file when iteration ends; the previous
    # File.open(...).each left the handle open until garbage collection.
    File.foreach(corpus_path) do |line|
      word, *tags = line.split
      next if word.nil? # blank line: nothing to store

      @lexicons[word] = tags
    end
  end

  # Tags every word of +text+.
  #
  # text - String to analyse.
  #
  # Returns an Array of [word, tag] pairs, one per word, in input order.
  def tag(text)
    # Adjacent separators (", ", a leading space, ...) make String#split emit
    # empty strings; drop them so they are not reported as words.
    @text = text.split(TOKEN_SEPARATOR).reject { |word| word.empty? }

    # Initial guess: first lexicon tag for the word, looked up exactly and
    # then lower-cased; unknown words default to "NN".
    @pos = @text.map do |word|
      tags = lexicon_tags(word)
      (tags && tags.first) || "NN"
    end

    # Apply transformational rules
    @pos.each_index do |index|
      RULES.each { |rule| send(rule, index) }
    end

    # Organize [word, pos]
    @text.zip(@pos)
  end

  private

  # Returns the tag list for +word+, trying the exact spelling first and the
  # lower-cased spelling second, or nil when neither is in the lexicon.
  def lexicon_tags(word)
    @lexicons[word] || @lexicons[word.downcase]
  end

  ## rule 1: DT, {VBD | VBP | VB} --> DT, NN
  def rule_one(index)
    return unless index > 0

    if @pos[index - 1] == "DT" && %w[VBD VBP VB].include?(@pos[index])
      @pos[index] = "NN"
    end
  end

  ## rule 2: convert a noun to a number (CD) if "." appears in the word
  # NOTE(review): #tag splits on ".", so a token can never contain one and
  # this rule cannot currently fire; kept for parity with the other rules.
  def rule_two(index)
    if @pos[index] =~ /^N/ && @text[index] =~ /\./
      @pos[index] = "CD"
    end
  end

  ## rule 3: convert a noun to a past participle if the word ends with "ed"
  def rule_three(index)
    if @pos[index] =~ /^N/ && @text[index] =~ /ed$/
      @pos[index] = "VBN"
    end
  end

  ## rule 4: convert any type to adverb if the word ends in "ly"
  def rule_four(index)
    @pos[index] = "RB" if @text[index] =~ /ly$/
  end

  ## rule 5: convert a common noun (NN or NNS) to an adjective if it ends with "al"
  def rule_five(index)
    if @pos[index] =~ /^NN/ && @text[index] =~ /al$/
      @pos[index] = "JJ"
    end
  end

  ## rule 6: convert a noun to a verb if the preceding word is "would"
  def rule_six(index)
    return unless index > 0

    if @pos[index] =~ /^NN/ && @text[index - 1].downcase == "would"
      @pos[index] = "VB"
    end
  end

  ## rule 7: if a word has been categorized as a common noun and it ends
  ## with "s", then set its type to plural common noun (NNS)
  def rule_seven(index)
    if @pos[index] == "NN" && @text[index] =~ /s$/
      @pos[index] = "NNS"
    end
  end

  ## rule 8: convert a common noun to a present participle verb (i.e., a gerund)
  def rule_eight(index)
    if @pos[index] =~ /^NN/ && @text[index] =~ /ing$/
      @pos[index] = "VBG"
    end
  end

  ## rule 9: <noun> <noun 2> --> <noun> <verb> if <noun 2> can also be a verb
  def rule_nine(index)
    return unless index > 0
    return unless @pos[index - 1] =~ /^NN/ && @pos[index] =~ /^NN/

    # Words tagged "NN" only by default have no lexicon entry to consult.
    tags = lexicon_tags(@text[index])
    return unless tags

    # VBZ is checked last, so it wins when the word lists both tags.
    @pos[index] = "VBN" if tags.include?("VBN")
    @pos[index] = "VBZ" if tags.include?("VBZ")
  end

  # Absolute path of the lexicon shipped next to this file.
  def corpus_path
    File.expand_path('corpus/lexicon.txt', File.dirname(__FILE__))
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# Gem specification for part_of_speech 0.0.0.
# This file is generated by jeweler; a lasting change belongs in
# Jeweler::Tasks in the Rakefile.
Gem::Specification.new do |s|
  s.name = %q{part_of_speech}
  s.version = "0.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["reddavis"]
  s.date = %q{2010-03-01}
  s.description = %q{Part of speech tagger based off Mark Watsons code}
  s.email = %q{reddavis@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/corpus/lexicon.txt",
    "lib/part_of_speech.rb",
    "part_of_speech.gemspec",
    "spec/part_of_speech_spec.rb",
    "spec/spec.opts",
    "spec/spec_helper.rb"
  ]
  s.homepage = %q{http://github.com/reddavis/Part-Of-Speech}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{Part of speech tagger based off Mark Watsons code}
  s.test_files = [
    "spec/part_of_speech_spec.rb",
    "spec/spec_helper.rb"
  ]

  if s.respond_to? :specification_version
    s.specification_version = 3

    # Gem::RubyGemsVersion is not defined by current RubyGems releases, so
    # referencing it unconditionally raises NameError when the gemspec is
    # loaded. Use Gem::VERSION when available and fall back to the old
    # constant only on installations that lack it.
    rubygems_version = defined?(Gem::VERSION) ? Gem::VERSION : Gem::RubyGemsVersion

    # rspec is only needed to run the specs; RubyGems older than 1.2.0 has no
    # development dependencies, so it is declared as a regular one there.
    if Gem::Version.new(rubygems_version) >= Gem::Version.new('1.2.0')
      s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
    else
      s.add_dependency(%q<rspec>, [">= 1.2.9"])
    end
  else
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
  end
end
|
56
|
+
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
# Smoke test: the three words of 'the fast fox' must come back tagged
# DT, RB and NN, in that order.
describe "PartOfSpeech" do
  it "should properly tag 'the fast fox'" do
    tagged = PartOfSpeech.analyze('the fast fox')

    # Each result entry is a [word, tag] pair; only the tag is checked.
    %w[DT RB NN].each_with_index do |expected_tag, position|
      tagged[position][1].should == expected_tag
    end
  end
end
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: part_of_speech
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- reddavis
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-03-01 00:00:00 +00:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rspec
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.2.9
|
24
|
+
version:
|
25
|
+
description: Part of speech tagger based off Mark Watsons code
|
26
|
+
email: reddavis@gmail.com
|
27
|
+
executables: []
|
28
|
+
|
29
|
+
extensions: []
|
30
|
+
|
31
|
+
extra_rdoc_files:
|
32
|
+
- LICENSE
|
33
|
+
- README.rdoc
|
34
|
+
files:
|
35
|
+
- .document
|
36
|
+
- .gitignore
|
37
|
+
- LICENSE
|
38
|
+
- README.rdoc
|
39
|
+
- Rakefile
|
40
|
+
- VERSION
|
41
|
+
- lib/corpus/lexicon.txt
|
42
|
+
- lib/part_of_speech.rb
|
43
|
+
- part_of_speech.gemspec
|
44
|
+
- spec/part_of_speech_spec.rb
|
45
|
+
- spec/spec.opts
|
46
|
+
- spec/spec_helper.rb
|
47
|
+
has_rdoc: true
|
48
|
+
homepage: http://github.com/reddavis/Part-Of-Speech
|
49
|
+
licenses: []
|
50
|
+
|
51
|
+
post_install_message:
|
52
|
+
rdoc_options:
|
53
|
+
- --charset=UTF-8
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: "0"
|
61
|
+
version:
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: "0"
|
67
|
+
version:
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project:
|
71
|
+
rubygems_version: 1.3.5
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: Part of speech tagger based off Mark Watsons code
|
75
|
+
test_files:
|
76
|
+
- spec/part_of_speech_spec.rb
|
77
|
+
- spec/spec_helper.rb
|