part_of_speech 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.gitignore +21 -0
- data/LICENSE +20 -0
- data/README.rdoc +28 -0
- data/Rakefile +45 -0
- data/VERSION +1 -0
- data/lib/corpus/lexicon.txt +93696 -0
- data/lib/part_of_speech.rb +131 -0
- data/part_of_speech.gemspec +56 -0
- data/spec/part_of_speech_spec.rb +10 -0
- data/spec/spec.opts +1 -0
- data/spec/spec_helper.rb +9 -0
- metadata +77 -0
@@ -0,0 +1,131 @@
|
|
1
|
+
# Rule-based part-of-speech tagger.
#
# Each token first receives the first tag listed for it in the bundled
# lexicon (tokens missing from the lexicon default to "NN"). Nine
# transformational rules are then applied to every token, in order, to
# refine that initial guess.
class PartOfSpeech

  class << self
    # Convenience wrapper: builds a tagger and tags +text+ in one call.
    #
    # text - String to analyse.
    #
    # Returns an Array of [word, tag] pairs (see #tag).
    def analyze(text)
      new.tag(text)
    end
  end

  # Place corpus into memory.
  #
  # Each lexicon line is "<word> <tag> <tag> ..."; the word becomes the hash
  # key and the remaining fields its list of candidate tags.
  def initialize
    @lexicons = {}
    # File.foreach closes the file once iteration finishes, unlike a bare
    # File.open(...).each which leaves the handle open.
    File.foreach(corpus_path) do |line|
      word, *tags = line.split
      next if word.nil? # blank line
      @lexicons[word] = tags
    end
  end

  # Tags every token of +text+.
  #
  # text - String to analyse. It is split on whitespace and on the
  #        characters . , : ; and '
  #
  # Returns an Array of [word, tag] pairs, one per non-empty token, in the
  # order the tokens appear in +text+.
  def tag(text)
    # Consecutive delimiters (e.g. ", ") yield empty strings; drop them so
    # they are not reported as "NN" tokens.
    @text = text.split(/\s|\.|,|\:|\;|\'/).reject { |word| word.empty? }

    # Initial guess: first tag known for the word (exact match first, then
    # lower-cased), or "NN" when the word has no usable lexicon entry.
    @pos = @text.map do |word|
      tags = lexicon_tags(word)
      tags && tags.first ? tags.first : "NN"
    end

    # Apply Transformational rules
    @pos.each_index do |index|
      rule_one(index)
      rule_two(index)
      rule_three(index)
      rule_four(index)
      rule_five(index)
      rule_six(index)
      rule_seven(index)
      rule_eight(index)
      rule_nine(index)
    end

    # Organize [word, pos]
    @text.zip(@pos)
  end

  private

  # Candidate tags for +word+: the exact-case entry when present, otherwise
  # the lower-cased entry. Returns nil when neither exists.
  def lexicon_tags(word)
    @lexicons[word] || @lexicons[word.downcase]
  end

  def rule_one(index)
    ## rule 1: DT, {VBD | VBP | VB} --> DT, NN
    return unless index > 0
    return unless @pos[index - 1] == "DT"

    @pos[index] = "NN" if ["VBD", "VBP", "VB"].include?(@pos[index])
  end

  def rule_two(index)
    ## rule 2: convert a noun to a number (CD) if "." appears in the word
    # NOTE: #tag splits on ".", so tokens produced by #tag never contain one.
    if @pos[index] =~ /^N/ && @text[index] =~ /\./
      @pos[index] = "CD"
    end
  end

  def rule_three(index)
    ## rule 3: convert a noun to a past participle if the word ends with "ed"
    if @pos[index] =~ /^N/ && @text[index] =~ /ed$/
      @pos[index] = "VBN"
    end
  end

  def rule_four(index)
    ## rule 4: convert any type to adverb if it ends in "ly"
    @pos[index] = "RB" if @text[index] =~ /ly$/
  end

  def rule_five(index)
    ## rule 5: convert a common noun (NN or NNS) to an adjective if it ends with "al"
    if @pos[index] =~ /^NN/ && @text[index] =~ /al$/
      @pos[index] = "JJ"
    end
  end

  def rule_six(index)
    ## rule 6: convert a noun to a verb if the preceding word is "would"
    return unless index > 0

    if @pos[index] =~ /^NN/ && @text[index - 1].downcase == "would"
      @pos[index] = "VB"
    end
  end

  def rule_seven(index)
    # rule 7: if a word has been categorized as a common noun and
    # it ends with "s", then set its type to plural common noun (NNS)
    if @pos[index] == "NN" && @text[index] =~ /s$/
      @pos[index] = "NNS"
    end
  end

  def rule_eight(index)
    ## rule 8: convert a common noun to a present participle verb (i.e., a gerund)
    if @pos[index] =~ /^NN/ && @text[index] =~ /ing$/
      @pos[index] = "VBG"
    end
  end

  def rule_nine(index)
    ## rule 9: <noun> <noun 2> --> <noun> <verb> if <noun 2> can also be a verb
    return unless index > 0
    return unless @pos[index - 1] =~ /^NN/ && @pos[index] =~ /^NN/

    # Words tagged "NN" by default have no lexicon entry at all.
    tags = lexicon_tags(@text[index])
    return if tags.nil?

    # VBZ is checked last so it wins when both tags are listed.
    @pos[index] = "VBN" if tags.include?("VBN")
    @pos[index] = "VBZ" if tags.include?("VBZ")
  end

  # Absolute path of the lexicon file shipped next to this source file.
  def corpus_path
    File.expand_path(File.dirname(__FILE__) + '/corpus/lexicon.txt')
  end

end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
# Gem specification for part_of_speech: package identity, the files shipped
# in the gem, and its single (development-only) dependency on rspec.
Gem::Specification.new do |s|
  s.name = %q{part_of_speech}
  s.version = "0.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["reddavis"]
  s.date = %q{2010-03-01}
  s.description = %q{Part of speech tagger based off Mark Watsons code}
  s.email = %q{reddavis@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/corpus/lexicon.txt",
    "lib/part_of_speech.rb",
    "part_of_speech.gemspec",
    "spec/part_of_speech_spec.rb",
    "spec/spec.opts",
    "spec/spec_helper.rb"
  ]
  s.homepage = %q{http://github.com/reddavis/Part-Of-Speech}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{Part of speech tagger based off Mark Watsons code}
  s.test_files = [
    "spec/part_of_speech_spec.rb",
    "spec/spec_helper.rb"
  ]

  if s.respond_to? :specification_version then
    s.specification_version = 3

    # Prefer Gem::VERSION; fall back to the legacy Gem::RubyGemsVersion
    # constant only where Gem::VERSION is not defined, so that the legacy
    # name is never referenced on RubyGems releases that no longer have it.
    installed_rubygems = defined?(Gem::VERSION) ? Gem::VERSION : Gem::RubyGemsVersion

    if Gem::Version.new(installed_rubygems) >= Gem::Version.new('1.2.0') then
      s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
    else
      s.add_dependency(%q<rspec>, [">= 1.2.9"])
    end
  else
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
  end
end
|
56
|
+
|
@@ -0,0 +1,10 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
2
|
+
|
3
|
+
# Smoke test: tags a three-word phrase using the bundled lexicon and checks
# the tag reported for each position.
describe "PartOfSpeech" do
  it "should properly tag 'the fast fox'" do
    tagged = PartOfSpeech.analyze('the fast fox')

    # Each entry is a [word, tag] pair; only the tag is checked.
    determiner = tagged[0][1]
    adverb     = tagged[1][1]
    noun       = tagged[2][1]

    determiner.should == "DT"
    adverb.should == "RB"
    noun.should == "NN"
  end
end
|
data/spec/spec.opts
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,77 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: part_of_speech
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- reddavis
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-03-01 00:00:00 +00:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rspec
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 1.2.9
|
24
|
+
version:
|
25
|
+
description: Part of speech tagger based off Mark Watsons code
|
26
|
+
email: reddavis@gmail.com
|
27
|
+
executables: []
|
28
|
+
|
29
|
+
extensions: []
|
30
|
+
|
31
|
+
extra_rdoc_files:
|
32
|
+
- LICENSE
|
33
|
+
- README.rdoc
|
34
|
+
files:
|
35
|
+
- .document
|
36
|
+
- .gitignore
|
37
|
+
- LICENSE
|
38
|
+
- README.rdoc
|
39
|
+
- Rakefile
|
40
|
+
- VERSION
|
41
|
+
- lib/corpus/lexicon.txt
|
42
|
+
- lib/part_of_speech.rb
|
43
|
+
- part_of_speech.gemspec
|
44
|
+
- spec/part_of_speech_spec.rb
|
45
|
+
- spec/spec.opts
|
46
|
+
- spec/spec_helper.rb
|
47
|
+
has_rdoc: true
|
48
|
+
homepage: http://github.com/reddavis/Part-Of-Speech
|
49
|
+
licenses: []
|
50
|
+
|
51
|
+
post_install_message:
|
52
|
+
rdoc_options:
|
53
|
+
- --charset=UTF-8
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: "0"
|
61
|
+
version:
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
requirements:
|
64
|
+
- - ">="
|
65
|
+
- !ruby/object:Gem::Version
|
66
|
+
version: "0"
|
67
|
+
version:
|
68
|
+
requirements: []
|
69
|
+
|
70
|
+
rubyforge_project:
|
71
|
+
rubygems_version: 1.3.5
|
72
|
+
signing_key:
|
73
|
+
specification_version: 3
|
74
|
+
summary: Part of speech tagger based off Mark Watsons code
|
75
|
+
test_files:
|
76
|
+
- spec/part_of_speech_spec.rb
|
77
|
+
- spec/spec_helper.rb
|