part_of_speech 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,131 @@
1
# Rule-based part-of-speech tagger.
#
# Every word is first given the first tag listed for it in the lexicon
# (or "NN" when the word is unknown), then a fixed sequence of
# transformational rules is applied to each position from left to right.
class PartOfSpeech
  class << self
    # Convenience entry point: builds a tagger (which loads the corpus)
    # and tags +text+ with it.
    #
    # text - String to tag.
    #
    # Returns an Array of [word, tag] pairs in input order.
    def analyze(text)
      new.tag(text)
    end
  end

  # Loads the corpus into memory as { word => [tag, ...] }.
  #
  # Each corpus line is split on whitespace: the first field is the word,
  # the remaining fields are its tags.
  def initialize
    @lexicons = {}
    # File.foreach closes the file once iteration finishes.
    File.foreach(corpus_path) do |line|
      word, *tags = line.split
      next if word.nil? # blank line: nothing to record

      @lexicons[word] = tags
    end
  end

  # Tags every word in +text+.
  #
  # text - String to tag. It is split on whitespace and on the characters
  #        . , : ; ' -- empty tokens produced by adjacent delimiters
  #        (for example ", ") are discarded.
  #
  # Returns an Array of [word, tag] pairs in input order.
  def tag(text)
    @text = text.split(/\s|\.|,|\:|\;|\'/).reject { |word| word.empty? }

    # Initial guess: first lexicon tag for the word, or "NN" when the word
    # is unknown (or is listed without any tag).
    @pos = @text.map do |word|
      entry = lexicon_entry(word)
      (entry && entry.first) || "NN"
    end

    # Apply transformational rules. Rules that look at the previous word
    # see that word's tag after its own rules have already run.
    @pos.each_index do |index|
      rule_one(index)
      rule_two(index)
      rule_three(index)
      rule_four(index)
      rule_five(index)
      rule_six(index)
      rule_seven(index)
      rule_eight(index)
      rule_nine(index)
    end

    # Organize as [word, pos]
    @text.zip(@pos)
  end

  private

  # Looks +word+ up in the lexicon, trying the exact spelling first and
  # then the downcased spelling.
  #
  # Returns the Array of tags for the word, or nil when it is unknown.
  def lexicon_entry(word)
    @lexicons[word] || @lexicons[word.downcase]
  end

  # rule 1: DT, {VBD | VBP | VB} --> DT, NN
  def rule_one(index)
    return unless index > 0

    if @pos[index - 1] == "DT" && ["VBD", "VBP", "VB"].include?(@pos[index])
      @pos[index] = "NN"
    end
  end

  # rule 2: convert a noun to a number (CD) if "." appears in the word.
  # NOTE: #tag splits the text on ".", so no token contains one and this
  # rule cannot currently fire; it is kept for parity with the rule list.
  def rule_two(index)
    if @pos[index] =~ /^N/ && @text[index] =~ /\./
      @pos[index] = "CD"
    end
  end

  # rule 3: convert a noun to a past participle if the word ends with "ed"
  def rule_three(index)
    if @pos[index] =~ /^N/ && @text[index] =~ /ed$/
      @pos[index] = "VBN"
    end
  end

  # rule 4: convert any type to an adverb if the word ends in "ly"
  def rule_four(index)
    @pos[index] = "RB" if @text[index] =~ /ly$/
  end

  # rule 5: convert a common noun (NN or NNS) to an adjective if the word
  # ends with "al"
  def rule_five(index)
    if @pos[index] =~ /^NN/ && @text[index] =~ /al$/
      @pos[index] = "JJ"
    end
  end

  # rule 6: convert a noun to a verb if the preceding word is "would"
  def rule_six(index)
    return unless index > 0

    if @pos[index] =~ /^NN/ && @text[index - 1].downcase == "would"
      @pos[index] = "VB"
    end
  end

  # rule 7: if a word has been categorized as a common noun and it ends
  # with "s", then set its type to plural common noun (NNS)
  def rule_seven(index)
    if @pos[index] == "NN" && @text[index] =~ /s$/
      @pos[index] = "NNS"
    end
  end

  # rule 8: convert a common noun to a present participle verb
  # (i.e., a gerund) if the word ends with "ing"
  def rule_eight(index)
    if @pos[index] =~ /^NN/ && @text[index] =~ /ing$/
      @pos[index] = "VBG"
    end
  end

  # rule 9: <noun> <noun 2> --> <noun> <verb> if <noun 2> can also be a verb
  def rule_nine(index)
    return unless index > 0
    return unless @pos[index - 1] =~ /^NN/ && @pos[index] =~ /^NN/

    # Unknown words have no lexicon entry and therefore no alternative tags.
    alternatives = lexicon_entry(@text[index]) || []
    @pos[index] = "VBN" if alternatives.include?("VBN")
    # Checked last so that VBZ wins when the word can be both.
    @pos[index] = "VBZ" if alternatives.include?("VBZ")
  end

  # Absolute path of the lexicon file shipped next to this source file.
  def corpus_path
    File.expand_path(File.dirname(__FILE__) + '/corpus/lexicon.txt')
  end
end
@@ -0,0 +1,56 @@
1
+ # Generated by jeweler
2
+ # DO NOT EDIT THIS FILE DIRECTLY
3
+ # Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
4
+ # -*- encoding: utf-8 -*-
5
+
6
# Gem specification for part_of_speech.
#
# NOTE(review): the file header says this gemspec is generated by jeweler;
# any change made here presumably has to be mirrored in the generator
# configuration so it is not overwritten -- confirm against the Rakefile.
Gem::Specification.new do |s|
  s.name = %q{part_of_speech}
  s.version = "0.0.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["reddavis"]
  s.date = %q{2010-03-01}
  s.description = %q{Part of speech tagger based off Mark Watsons code}
  s.email = %q{reddavis@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/corpus/lexicon.txt",
    "lib/part_of_speech.rb",
    "part_of_speech.gemspec",
    "spec/part_of_speech_spec.rb",
    "spec/spec.opts",
    "spec/spec_helper.rb"
  ]
  s.homepage = %q{http://github.com/reddavis/Part-Of-Speech}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{Part of speech tagger based off Mark Watsons code}
  s.test_files = [
    "spec/part_of_speech_spec.rb",
    "spec/spec_helper.rb"
  ]

  s.specification_version = 3 if s.respond_to? :specification_version

  # rspec is only needed to run the specs. Probe for the capability rather
  # than comparing against Gem::RubyGemsVersion, a constant that newer
  # RubyGems releases no longer define.
  if s.respond_to? :add_development_dependency
    s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
  else
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
  end
end
56
+
@@ -0,0 +1,10 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2
+
3
describe "PartOfSpeech" do
  # Checks the tag (second element of each [word, tag] pair) produced for
  # every word of the phrase, in order.
  it "should properly tag 'the fast fox'" do
    tagged = PartOfSpeech.analyze('the fast fox')

    ["DT", "RB", "NN"].each_with_index do |expected_tag, position|
      tagged[position][1].should == expected_tag
    end
  end
end
data/spec/spec.opts ADDED
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,9 @@
1
# Spec bootstrap: puts the gem's lib directory and this spec directory at
# the front of the load path (lib first), then loads the library under
# test and the spec runner.
spec_dir = File.dirname(__FILE__)
$LOAD_PATH.unshift(File.join(spec_dir, '..', 'lib'), spec_dir)

['part_of_speech', 'spec', 'spec/autorun'].each { |feature| require feature }

# No custom runner configuration yet.
Spec::Runner.configure do |config|
end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: part_of_speech
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.0
5
+ platform: ruby
6
+ authors:
7
+ - reddavis
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-03-01 00:00:00 +00:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rspec
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ">="
22
+ - !ruby/object:Gem::Version
23
+ version: 1.2.9
24
+ version:
25
+ description: Part of speech tagger based off Mark Watsons code
26
+ email: reddavis@gmail.com
27
+ executables: []
28
+
29
+ extensions: []
30
+
31
+ extra_rdoc_files:
32
+ - LICENSE
33
+ - README.rdoc
34
+ files:
35
+ - .document
36
+ - .gitignore
37
+ - LICENSE
38
+ - README.rdoc
39
+ - Rakefile
40
+ - VERSION
41
+ - lib/corpus/lexicon.txt
42
+ - lib/part_of_speech.rb
43
+ - part_of_speech.gemspec
44
+ - spec/part_of_speech_spec.rb
45
+ - spec/spec.opts
46
+ - spec/spec_helper.rb
47
+ has_rdoc: true
48
+ homepage: http://github.com/reddavis/Part-Of-Speech
49
+ licenses: []
50
+
51
+ post_install_message:
52
+ rdoc_options:
53
+ - --charset=UTF-8
54
+ require_paths:
55
+ - lib
56
+ required_ruby_version: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: "0"
61
+ version:
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ requirements:
64
+ - - ">="
65
+ - !ruby/object:Gem::Version
66
+ version: "0"
67
+ version:
68
+ requirements: []
69
+
70
+ rubyforge_project:
71
+ rubygems_version: 1.3.5
72
+ signing_key:
73
+ specification_version: 3
74
+ summary: Part of speech tagger based off Mark Watsons code
75
+ test_files:
76
+ - spec/part_of_speech_spec.rb
77
+ - spec/spec_helper.rb