part_of_speech 0.0.0 → 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +3 -1
- data/VERSION +1 -1
- data/lib/part_of_speech.rb +9 -9
- data/part_of_speech.gemspec +3 -3
- data/{lib/corpus → spec/files}/lexicon.txt +0 -0
- data/spec/part_of_speech_spec.rb +2 -2
- data/spec/spec_helper.rb +9 -0
- metadata +3 -3
data/README.rdoc
CHANGED
@@ -9,6 +9,8 @@ I've just cleaned it up a little bit and packaged it into a gem.
|
|
9
9
|
gem sources -a http://gemcutter.org
|
10
10
|
sudo gem install part_of_speech
|
11
11
|
|
12
|
+
You can download a lexicon list from http://github.com/downloads/reddavis/Part-Of-Speech/lexicon.txt
|
13
|
+
|
12
14
|
== How To Use
|
13
15
|
|
14
16
|
require 'rubygems'
|
@@ -16,7 +18,7 @@ I've just cleaned it up a little bit and packaged it into a gem.
|
|
16
18
|
|
17
19
|
text = "This is some text that I want analyzing"
|
18
20
|
|
19
|
-
PartOfSpeechTagger.analyze(text)
|
21
|
+
PartOfSpeech.analyze(lexicon_path, text)
|
20
22
|
#=> [["This", "DT"], ["is", "VBZ"], ["some", "DT"], ["text", "NN"], ["that", "IN"], ["I", "PRP"], ["want", "VBP"], ["analyzing", "VBG"]]
|
21
23
|
|
22
24
|
== What Do These Letters Mean?
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.0.
|
1
|
+
0.0.1
|
data/lib/part_of_speech.rb
CHANGED
@@ -1,22 +1,22 @@
|
|
1
1
|
class PartOfSpeech
|
2
2
|
|
3
3
|
class << self
|
4
|
-
def analyze(text)
|
5
|
-
new.tag(text)
|
4
|
+
def analyze(lexicon_path, text)
|
5
|
+
new(lexicon_path).tag(text)
|
6
6
|
end
|
7
7
|
end
|
8
8
|
|
9
9
|
# Place corpus into memory
|
10
|
-
def initialize
|
11
|
-
@lexicons = {}
|
12
|
-
File.open(
|
10
|
+
def initialize(lexicon_path)
|
11
|
+
@lexicons = Hash.new {|hash, k| hash[k] = []}
|
12
|
+
File.open(lexicon_path, 'r').each do |line|
|
13
13
|
line = line.split
|
14
14
|
@lexicons[line.shift] = line
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
18
18
|
def tag(text)
|
19
|
-
@text = text.split(
|
19
|
+
@text = text.split(/ |,|\.|\:|\;|\'/)
|
20
20
|
|
21
21
|
@pos = []
|
22
22
|
@text.each do |word|
|
@@ -114,11 +114,11 @@ class PartOfSpeech
|
|
114
114
|
## rule 9: <noun> <noun 2> --> <noun> <verb> if <noun 2> can also be a verb
|
115
115
|
return unless index > 0
|
116
116
|
|
117
|
-
if @pos[index-1] =~ /^NN/ && @pos[index] =~ /^NN/
|
118
|
-
if @
|
117
|
+
if @pos[index-1] =~ /^NN/ && @pos[index] =~ /^NN/
|
118
|
+
if @lexicons[@text[index]].include?("VBN")
|
119
119
|
@pos[index] = "VBN"
|
120
120
|
end
|
121
|
-
if @
|
121
|
+
if @lexicons[@text[index]].include?("VBZ")
|
122
122
|
@pos[index] = "VBZ"
|
123
123
|
end
|
124
124
|
end
|
data/part_of_speech.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = %q{part_of_speech}
|
8
|
-
s.version = "0.0.
|
8
|
+
s.version = "0.0.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["reddavis"]
|
12
|
-
s.date = %q{2010-03-
|
12
|
+
s.date = %q{2010-03-02}
|
13
13
|
s.description = %q{Part of speech tagger based off Mark Watsons code}
|
14
14
|
s.email = %q{reddavis@gmail.com}
|
15
15
|
s.extra_rdoc_files = [
|
@@ -23,9 +23,9 @@ Gem::Specification.new do |s|
|
|
23
23
|
"README.rdoc",
|
24
24
|
"Rakefile",
|
25
25
|
"VERSION",
|
26
|
-
"lib/corpus/lexicon.txt",
|
27
26
|
"lib/part_of_speech.rb",
|
28
27
|
"part_of_speech.gemspec",
|
28
|
+
"spec/files/lexicon.txt",
|
29
29
|
"spec/part_of_speech_spec.rb",
|
30
30
|
"spec/spec.opts",
|
31
31
|
"spec/spec_helper.rb"
|
File without changes
|
data/spec/part_of_speech_spec.rb
CHANGED
@@ -2,9 +2,9 @@ require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
|
|
2
2
|
|
3
3
|
describe "PartOfSpeech" do
|
4
4
|
it "should properly tag 'the fast fox'" do
|
5
|
-
a = PartOfSpeech.analyze(
|
5
|
+
a = PartOfSpeech.analyze(lexicon_path, text_sample)
|
6
6
|
a[0][1].should == "DT"
|
7
|
-
a[1][1].should == "
|
7
|
+
a[1][1].should == "JJ"
|
8
8
|
a[2][1].should == "NN"
|
9
9
|
end
|
10
10
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -4,6 +4,15 @@ require 'part_of_speech'
|
|
4
4
|
require 'spec'
|
5
5
|
require 'spec/autorun'
|
6
6
|
|
7
|
+
def text_sample
|
8
|
+
"This regular expression is read in the following manner: Zero or more adjectives or nouns, followed by an option group of a noun and a preposition, followed again by zero or more adjectives or nouns, followed by a single noun. A sequence of tags matching this pattern ensures that the corresponding words make up a noun phrase.
|
9
|
+
In addition to simply pulling out the phrases, it is common to do some simple post processing to link variants together (For example, unpluralizing plural variants)."
|
10
|
+
end
|
11
|
+
|
12
|
+
def lexicon_path
|
13
|
+
File.expand_path(File.dirname(__FILE__) + '/files/lexicon.txt')
|
14
|
+
end
|
15
|
+
|
7
16
|
Spec::Runner.configure do |config|
|
8
17
|
|
9
18
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: part_of_speech
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- reddavis
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-03-
|
12
|
+
date: 2010-03-02 00:00:00 +00:00
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -38,9 +38,9 @@ files:
|
|
38
38
|
- README.rdoc
|
39
39
|
- Rakefile
|
40
40
|
- VERSION
|
41
|
-
- lib/corpus/lexicon.txt
|
42
41
|
- lib/part_of_speech.rb
|
43
42
|
- part_of_speech.gemspec
|
43
|
+
- spec/files/lexicon.txt
|
44
44
|
- spec/part_of_speech_spec.rb
|
45
45
|
- spec/spec.opts
|
46
46
|
- spec/spec_helper.rb
|