RubyGems - part_of_speech - Versions diffs - 0.0.0 - Mend

part_of_speech 0.0.0

Files changed (13) hide show

data/lib/part_of_speech.rb ADDED Viewed

@@ -0,0 +1,131 @@
# Rule-based part-of-speech tagger.
#
# Every token is first given the first tag listed for it in the lexicon
# (or "NN" when the word is unknown); a fixed sequence of transformational
# rules then corrects each tag using the word's spelling and its neighbours.
class PartOfSpeech
  class << self
    # Convenience entry point: builds a tagger (loading the corpus) and
    # tags +text+.
    #
    # @param text [String] the text to tag
    # @return [Array<Array(String, String)>] [word, tag] pairs in input order
    def analyze(text)
      new.tag(text)
    end
  end

  # Single characters that separate tokens: whitespace . , : ; '
  TOKEN_DELIMITERS = /[\s.,:;']/

  # Tag given to words that are missing from the lexicon.
  DEFAULT_TAG = "NN"

  # Transformational rules, applied to every token in this order.
  TRANSFORMATION_RULES = [
    :rule_one, :rule_two, :rule_three, :rule_four, :rule_five,
    :rule_six, :rule_seven, :rule_eight, :rule_nine
  ].freeze

  # Place corpus into memory.
  #
  # Each corpus line is whitespace separated: the word followed by its
  # candidate tags. Blank lines are skipped.
  def initialize
    @lexicons = {}
    # File.foreach closes the file when the block finishes.
    File.foreach(corpus_path) do |line|
      word, *tags = line.split
      next if word.nil?
      @lexicons[word] = tags
    end
  end

  # Tags every word of +text+.
  #
  # @param text [String] the text to tag
  # @return [Array<Array(String, String)>] [word, tag] pairs in input order
  def tag(text)
    # Consecutive delimiters (", " for example) yield empty strings; drop
    # them so they are not reported as nouns.
    @text = text.split(TOKEN_DELIMITERS).reject { |token| token.empty? }
    @pos = @text.map { |word| lexicon_tags(word).first || DEFAULT_TAG }

    # Apply transformational rules. Rules that look at the previous token
    # see its already-transformed tag.
    @pos.each_index do |index|
      TRANSFORMATION_RULES.each { |rule| send(rule, index) }
    end

    # Organize [word, pos]
    @text.zip(@pos)
  end

  private

  # Candidate tags for +word+: exact match first, then the downcased form,
  # and an empty list when the word is unknown.
  def lexicon_tags(word)
    @lexicons[word] || @lexicons[word.downcase] || []
  end

  # rule 1: DT, {VBD | VBP | VB} --> DT, NN
  def rule_one(index)
    return unless index > 0
    if @pos[index - 1] == "DT" && %w[VBD VBP VB].include?(@pos[index])
      @pos[index] = "NN"
    end
  end

  # rule 2: convert a noun to a number (CD) if "." appears in the word.
  # NOTE(review): #tag splits on ".", so tokens produced there never
  # contain a period and this rule cannot fire on them.
  def rule_two(index)
    if @pos[index] =~ /^N/ && @text[index] =~ /\./
      @pos[index] = "CD"
    end
  end

  # rule 3: convert a noun to a past participle if the word ends with "ed"
  def rule_three(index)
    if @pos[index] =~ /^N/ && @text[index] =~ /ed$/
      @pos[index] = "VBN"
    end
  end

  # rule 4: convert any type to adverb if the word ends in "ly"
  def rule_four(index)
    if @text[index] =~ /ly$/
      @pos[index] = "RB"
    end
  end

  # rule 5: convert a common noun (NN or NNS) to an adjective if the word
  # ends with "al"
  def rule_five(index)
    if @pos[index] =~ /^NN/ && @text[index] =~ /al$/
      @pos[index] = "JJ"
    end
  end

  # rule 6: convert a noun to a verb if the preceding word is "would"
  def rule_six(index)
    return unless index > 0
    if @pos[index] =~ /^NN/ && @text[index - 1].downcase == "would"
      @pos[index] = "VB"
    end
  end

  # rule 7: if a word has been categorized as a common noun and it ends
  # with "s", then set its type to plural common noun (NNS)
  def rule_seven(index)
    if @pos[index] == "NN" && @text[index] =~ /s$/
      @pos[index] = "NNS"
    end
  end

  # rule 8: convert a common noun to a present participle verb
  # (i.e., a gerund) if the word ends with "ing"
  def rule_eight(index)
    if @pos[index] =~ /^NN/ && @text[index] =~ /ing$/
      @pos[index] = "VBG"
    end
  end

  # rule 9: <noun> <noun 2> --> <noun> <verb> if <noun 2> can also be a verb.
  # VBZ is checked last, so it wins when the lexicon lists both VBN and VBZ.
  def rule_nine(index)
    return unless index > 0
    return unless @pos[index - 1] =~ /^NN/ && @pos[index] =~ /^NN/

    candidates = lexicon_tags(@text[index])
    @pos[index] = "VBN" if candidates.include?("VBN")
    @pos[index] = "VBZ" if candidates.include?("VBZ")
  end

  # Absolute path of the lexicon file shipped next to this source file.
  def corpus_path
    File.expand_path(File.dirname(__FILE__) + '/corpus/lexicon.txt')
  end
end

data/part_of_speech.gemspec ADDED Viewed

@@ -0,0 +1,56 @@
+# Generated by jeweler
+# DO NOT EDIT THIS FILE DIRECTLY
+# Instead, edit Jeweler::Tasks in Rakefile, and run the gemspec command
+# -*- encoding: utf-8 -*-
# Gem specification for part_of_speech.
#
# NOTE(review): the file header says this gemspec is generated by jeweler;
# if it is regenerated, the version check below needs to be carried over.
Gem::Specification.new do |s|
  s.name = %q{part_of_speech}
  s.version = "0.0.0"
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
  s.authors = ["reddavis"]
  s.date = %q{2010-03-01}
  s.description = %q{Part of speech tagger based off Mark Watsons code}
  s.email = %q{reddavis@gmail.com}
  s.extra_rdoc_files = [
    "LICENSE",
    "README.rdoc"
  ]
  s.files = [
    ".document",
    ".gitignore",
    "LICENSE",
    "README.rdoc",
    "Rakefile",
    "VERSION",
    "lib/corpus/lexicon.txt",
    "lib/part_of_speech.rb",
    "part_of_speech.gemspec",
    "spec/part_of_speech_spec.rb",
    "spec/spec.opts",
    "spec/spec_helper.rb"
  ]
  s.homepage = %q{http://github.com/reddavis/Part-Of-Speech}
  s.rdoc_options = ["--charset=UTF-8"]
  s.require_paths = ["lib"]
  s.rubygems_version = %q{1.3.5}
  s.summary = %q{Part of speech tagger based off Mark Watsons code}
  s.test_files = [
    "spec/part_of_speech_spec.rb",
    "spec/spec_helper.rb"
  ]

  # rspec is only needed to run the specs. RubyGems older than 1.2.0 has no
  # development dependencies, so it is declared as a regular dependency there.
  if s.respond_to? :specification_version
    s.specification_version = 3

    # Gem::RubyGemsVersion is the legacy name of the running RubyGems
    # version; prefer Gem::VERSION when it is defined so that evaluating
    # this file does not raise NameError where the legacy constant is gone.
    running_rubygems = defined?(Gem::VERSION) ? Gem::VERSION : Gem::RubyGemsVersion

    if Gem::Version.new(running_rubygems) >= Gem::Version.new('1.2.0')
      s.add_development_dependency(%q<rspec>, [">= 1.2.9"])
    else
      s.add_dependency(%q<rspec>, [">= 1.2.9"])
    end
  else
    s.add_dependency(%q<rspec>, [">= 1.2.9"])
  end
end

data/spec/part_of_speech_spec.rb ADDED Viewed

@@ -0,0 +1,10 @@
require File.expand_path('spec_helper', File.dirname(__FILE__))

# Smoke test for the tagger: checks the tag assigned to each word of a
# three-word phrase, in order.
describe "PartOfSpeech" do
  it "should properly tag 'the fast fox'" do
    tagged = PartOfSpeech.analyze('the fast fox')
    expected_tags = ["DT", "RB", "NN"]

    # Each result entry is a [word, tag] pair; only the tag is checked.
    expected_tags.each_with_index do |expected, position|
      tagged[position][1].should == expected
    end
  end
end

data/spec/spec.opts ADDED Viewed

	@@ -0,0 +1 @@
1	+ --color

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,9 @@
# Spec bootstrap: puts the gem's lib directory and this spec directory at
# the front of the load path (lib first), then loads the library and rspec.
spec_dir = File.dirname(__FILE__)
lib_dir = File.join(spec_dir, '..', 'lib')
$LOAD_PATH.unshift(lib_dir, spec_dir)

require 'part_of_speech'
require 'spec'
require 'spec/autorun'

# No custom runner configuration yet.
Spec::Runner.configure { |config| }

metadata ADDED Viewed

@@ -0,0 +1,77 @@
+--- !ruby/object:Gem::Specification
+name: part_of_speech
+version: !ruby/object:Gem::Version
+  version: 0.0.0
+platform: ruby
+authors:
+- reddavis
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2010-03-01 00:00:00 +00:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rspec
+  type: :development
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.2.9
+    version:
+description: Part of speech tagger based off Mark Watsons code
+email: reddavis@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files:
+- LICENSE
+- README.rdoc
+files:
+- .document
+- .gitignore
+- LICENSE
+- README.rdoc
+- Rakefile
+- VERSION
+- lib/corpus/lexicon.txt
+- lib/part_of_speech.rb
+- part_of_speech.gemspec
+- spec/part_of_speech_spec.rb
+- spec/spec.opts
+- spec/spec_helper.rb
+has_rdoc: true
+homepage: http://github.com/reddavis/Part-Of-Speech
+licenses: []
+post_install_message:
+rdoc_options:
+- --charset=UTF-8
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Part of speech tagger based off Mark Watsons code
+test_files:
+- spec/part_of_speech_spec.rb
+- spec/spec_helper.rb