RubyGems - stanfordparser - Versions diffs - 1.0.0 - Mend

stanfordparser 1.0.0

Files changed (4) hide show

data/README +55 -0
data/lib/stanfordparser.rb +249 -0
data/test/test_stanfordparser.rb +103 -0
metadata +53 -0

data/README ADDED Viewed

@@ -0,0 +1,55 @@
+= Stanford Natural Language Parser
+This module is a wrapper for the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
+The Stanford Natural Language Parser is a Java implementation of a probabilistic context-free grammar (PCFG) and dependency parser for English, German, Chinese, and Arabic.  This module provides a thin wrapper around the Java code to make it accessible from Ruby.
+= Installation
+To run this module you must install the following additional software:
+* The {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml]
+* The {Ruby Java Bridge}[http://rjb.rubyforge.org/] gem.
+This module expects the parser to be installed in the <tt>/usr/local/stanford-parser/current</tt> directory.  This is the directory that contains the <tt>stanford-parser.jar</tt> file.  An alternate directory can be specified with a <tt>/etc/ruby-stanford-parser.yaml</tt> configuration file.  This file is in the Ruby YAML[http://www.ruby-doc.org/core/classes/YAML.html] format, and contains a single <tt>root</tt> value, for example:
+	root: /usr/local/stanford-parser/other/location
+= Usage
+Use the StanfordParser::LexicalizedParser class to parse sentences.
+	irb(main):001:0> require 'stanfordparser'
+	=> true
+	irb(main):002:0> parser = StanfordParser::LexicalizedParser.new
+	Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [5.5 sec].
+	=> edu.stanford.nlp.parser.lexparser.LexicalizedParser
+	irb(main):003:0> puts parser.apply("This is a sentence.")
+	(ROOT
+	  (S [24.917]
+	    (NP [6.139] (DT [2.300] This))
+	    (VP [17.636] (VBZ [0.144] is)
+	      (NP [12.299] (DT [1.419] a) (NN [8.897] sentence)))
+	    (. [0.002] .)))
+Use the StanfordParser::DocumentPreprocessor class to tokenize text and files into words or sentences.
+	irb(main):004:0> preproc = StanfordParser::DocumentPreprocessor.new
+	irb(main):008:0> puts preproc.getSentencesFromString("This is a sentence.  So is this.")
+	This is a sentence .
+	So is this .
+For complete details about the use of these classes, see the documentation on the Stanford Natural Language Parser website.
+= Copyright
+Copyright 2007, William Patrick McNeill
+This program is distributed under the GNU General Public License.
+= Author
+W.P. McNeill mailto:billmcn@gmail.com

data/lib/stanfordparser.rb ADDED Viewed

@@ -0,0 +1,249 @@
+# Copyright 2007 William Patrick McNeill
+#
+# This file is part of the Stanford Parser Ruby Wrapper.
+#
+# The Stanford Parser Ruby Wrapper is free software; you can redistribute it
+# and/or modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the License,
+# or (at your option) any later version.
+#
+# The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# the Stanford Parser Ruby Wrapper; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+require "pathname"
+require "rjb"
+require "set"
+require "yaml"
+# Extensions to the {Ruby-Java Bridge}[http://rjb.rubyforge.org/] module that
+# add a generic Java object wrapper class.
+module Rjb
+  # A generic wrapper for a Java object loaded via the Ruby Java Bridge.  The
+  # wrapper class handles initialization and stringification, and passes other
+  # method calls down to the underlying Java object.  Objects returned by the
+  # underlying Java object are converted to the appropriate Ruby object.
+  #
+  # This object is enumerable, yielding items in the order defined by the Java
+  # object's iterator.
+  class JavaObjectWrapper
+    include Enumerable
+    # The underlying Java object.
+    attr_reader :java_object
+    # Initialize with a Java object <em>obj</em>.  If <em>obj</em> is a
+    # String, assume it is a Java class name and instantiate it.  Otherwise,
+    # treat <em>obj</em> as an instance of a Java object.
+    def initialize(obj, *args)
+      @java_object = obj.class == String ?
+      Rjb::import(obj).send(:new, *args) : obj
+    end
+    # Enumerate all the items in the object using its iterator.  If the object
+    # has no iterator, this function yields nothing.
+    def each
+      if @java_object.getClass.getMethods.any? {|m| m.getName == "iterator"}
+        i = @java_object.iterator
+        while i.hasNext
+          yield wrap_java_object(i.next)
+        end
+      end
+    end # each
+    # Reflect unhandled method calls to the underlying Java object.
+    def method_missing(m, *args)
+      wrap_java_object(@java_object.send(m, *args))
+    end
+    # Convert a value returned by a call to the underlying Java object to the
+    # appropriate Ruby object as follows:
+    # * RJB objects are placed inside a generic JavaObjectWrapper wrapper.
+    # * <tt>java.util.ArrayList</tt> objects are converted to Ruby Arrays.
+    # * <tt>java.util.HashSet</tt> objects are converted to Ruby Sets
+    # * Other objects are left unchanged.
+    #
+    # This function is applied recursively to items in collection objects such
+    # as set and arrays.
+    def wrap_java_object(object)
+      if object.kind_of?(Array)
+        object.collect {|item| wrap_java_object(item)}
+      # Ruby-Java Bridge Java objects all have a _classname member which tells
+      # the name of their Java class.
+      elsif object.respond_to?(:_classname)
+        case object._classname
+        when /java\.util\.ArrayList/
+          # Convert java.util.ArrayList objects to Ruby arrays.
+          array_list = []
+          object.size.times do
+            |i| array_list << wrap_java_object(object.get(i))
+          end
+          array_list
+        when /java\.util\.HashSet/
+          # Convert java.util.HashSet objects to Ruby sets.
+          set = Set.new
+          i = object.iterator
+          while i.hasNext
+            set << wrap_java_object(i.next)
+          end
+          set
+        else
+          # Passs other RJB objects off to a handler.
+          wrap_rjb_object(object)
+        end # case
+      else
+        # Return non-RJB objects unchanged.
+        object
+      end # if
+    end # wrap_java_object
+    # By default, all RJB classes other than <tt>java.util.ArrayList</tt> go
+    # in a generic wrapper.  Derived classes may change this behavior.
+    def wrap_rjb_object(object)
+      JavaObjectWrapper.new(object)
+    end
+    # Show the classname of the underlying Java object.
+    def inspect
+      "<#{@java_object._classname}>"
+    end
+    # Use the underlying Java object's stringification.
+    def to_s
+      toString
+    end
+    protected :wrap_java_object, :wrap_rjb_object
+  end # JavaObjectWrapper
+end # Rjb
+# Wrapper for the {Stanford Natural Language
+# Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
+module StanfordParser
+  VERSION = "1.0.0"
+  # This function is executed once when the module is loaded.  It adds the
+  # Stanford parser jarfile to the JVM classpath and return the root of the
+  # parser installation.  The root of the installation may be written in a
+  # YAML file in <tt>/etc/ruby_stanford_parser.yaml</tt>.  If this file is not
+  # present, the default root <tt>/usr/local/stanford-parser/current</tt> is
+  # used.
+  def StanfordParser.initialize_on_load
+    root = Pathname.new("/usr/local/stanford-parser/current")
+    config = Pathname.new("/etc/ruby-stanford-parser.yaml")
+    if config.file?
+      configuration = open(config) {|f| YAML.load(f)}
+      if configuration.key?("root")
+        root = Pathname.new(configuration["root"])
+      end
+    end
+    Rjb::load(classpath = (root + "stanford-parser.jar").to_s)
+    root
+  end
+  # The root directory of the Stanford parser installation.
+  ROOT = StanfordParser.initialize_on_load
+  # Extension of the generic Ruby-Java Bridge wrapper object for the
+  # StanfordParser module.
+  class JavaObjectWrapper < Rjb::JavaObjectWrapper
+    # Wrap a return value with a specialized wrapper class in the
+    # StanfordParser module in the appropriate class.
+    def wrap_rjb_object(object)
+      case object._classname
+      when /^edu\.stanford\.nlp\.trees\.
+        (Tree|LabeledScoredTreeLeaf|
+        LabeledScoredTreeNode|
+        SimpleTree|TreeGraphNode)$/x
+        # Tree objects go inside a Tree wrapper.
+        Tree.new(object)
+      else
+        super(object)
+      end # case
+    end # wrap_rjb_object
+  end # JavaObjectWrapper
+  # Lexicalized probabilistic parser.
+  #
+  # This is a wrapper for the
+  # <tt>edu.stanford.nlp.parser.lexparser.LexicalizedParser</tt> object.
+  class LexicalizedParser < JavaObjectWrapper
+    # The grammar used by the parser
+    attr_reader :grammar
+    # Create the parser given a grammar and options.  The <em>grammar</em>
+    # argument is a path to a grammar file.  This path may contain the string
+    # <tt>$(ROOT)</tt>, which will be replaced with the root directory of the
+    # Stanford Parser. By default, an English grammar is loaded.
+    #
+    # The <em>options</em> argument is a list of string arguments as they
+    # would appear on a command line.  See the documentaion of
+    # <tt>edu.stanford.nlp.parser.lexparser.Options.setOptions</tt> for more
+    # details.
+    def initialize(grammar = "$(ROOT)/englishPCFG.ser.gz", options = [])
+      @grammar = Pathname.new(grammar.gsub(/\$\(ROOT\)/, ROOT))
+      super("edu.stanford.nlp.parser.lexparser.LexicalizedParser", @grammar.to_s)
+      @java_object.setOptionFlags(options)
+    end
+    def to_s
+      "LexicalizedParser(#{grammar.basename})"
+    end
+  end # LexicalizedParser
+  # A parse tree that supports preorder enumeration via the Enumerable mixin.
+  #
+  # This is a wrapper for the
+  # <tt>edu.stanford.nlp.trees.Tree</tt> objects.
+  class Tree < JavaObjectWrapper
+    include Enumerable
+    def initialize(obj = "edu.stanford.nlp.trees.Tree")
+      super(obj)
+    end
+    # Return the label along with the score if there is one.
+    def inspect
+      s = "#{label}" + (score.nan? ? "" : " [#{sprintf '%.2f', score}]")
+      "(#{s})"
+    end
+    # The Penn treebank representation.  This prints with indenting instead of
+    # putting everything on one line.
+    def to_s
+      "#{pennString}"
+    end
+  end # Tree
+  # Tokenizes documents into words and sentences.
+  #
+  # This is a wrapper for the
+  # <tt>edu.stanford.nlp.process.DocumentPreprocessor</tt> object.
+  class DocumentPreprocessor < JavaObjectWrapper
+    def initialize(suppressEscaping = false)
+      super("edu.stanford.nlp.process.DocumentPreprocessor", suppressEscaping)
+    end
+    # Returns a list of sentences in a string.
+    def getSentencesFromString(s)
+      s = Rjb::JavaObjectWrapper.new("java.io.StringReader", s)
+      _invoke(:getSentencesFromText, "Ljava.io.Reader;", s.java_object)
+    end
+  end # DocumentPreprocessor
+end # StanfordParser

data/test/test_stanfordparser.rb ADDED Viewed

@@ -0,0 +1,103 @@
+#!/bin/env ruby
+#--
+# Copyright 2007 William Patrick McNeill
+#
+# This file is part of the Stanford Parser Ruby Wrapper.
+#
+# The Stanford Parser Ruby Wrapper is free software; you can redistribute it
+# and/or modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2 of the License,
+# or (at your option) any later version.
+#
+# The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# the Stanford Parser Ruby Wrapper; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+#
+#++
+# Test cases for the Stanford Parser module
+require "test/unit"
+require "set"
+require "singleton"
+require "stanfordparser"
+# Make the Lexicalized Parser a singleton for the tests because it takes
+# several seconds to load; the test cases obtain it through .instance.
+class StanfordParser::LexicalizedParser
+  include Singleton
+end
+class LexicalizedParserTestCase < Test::Unit::TestCase
+  def test_root_path
+    assert_equal StanfordParser::ROOT.class, Pathname
+  end
+  def setup
+    @parser = StanfordParser::LexicalizedParser.instance
+    @tree = @parser.apply("This is a sentence.")
+  end
+  def test_parser
+    assert_equal @parser.grammar, StanfordParser::ROOT + "englishPCFG.ser.gz"
+    assert_equal @tree.class, StanfordParser::Tree
+  end
+  def test_localTrees
+    l = @tree.localTrees
+    assert_equal l.size, 5
+    assert_equal Set.new(l.collect {|t| "#{t.label}"}),
+                 Set.new(["S", "NP", "VP", "ROOT", "NP"])
+  end
+  def test_enumerable
+    # StanfordParser::LexicalizedParser is not an enumerable object.
+    assert_equal @parser.map, []
+  end
+end # LexicalizedParserTestCase
+class TreeTestCase < Test::Unit::TestCase
+  def setup
+    @parser = StanfordParser::LexicalizedParser.instance
+    @tree = @parser.apply("This is a sentence.")
+  end
+  def test_enumerable
+    assert @tree.all? {|n| n.class == StanfordParser::Tree}
+    assert @tree.all? {|n|
+      n._classname == "edu.stanford.nlp.trees.LabeledScoredTreeNode" or
+      n._classname == "edu.stanford.nlp.trees.LabeledScoredTreeLeaf"
+    }
+    assert_equal @tree.map {|n| "#{n.label}"},
+      ["ROOT", "S", "NP", "DT", "This", "VP", "VBZ", "is", "NP", "DT", "a", \
+       "NN", "sentence", ".", "."]
+  end
+end # TreeTestCase
+class DocumentPreprocessorTestCase < Test::Unit::TestCase
+  def setup
+    @preproc = StanfordParser::DocumentPreprocessor.new
+  end
+  def test_get_sentences_from_string
+    s = @preproc.getSentencesFromString("This is a sentence.  So is this.")
+    assert_equal "#{s[0]}", "This is a sentence ."
+    assert_equal "#{s[1]}", "So is this ."
+  end
+  def test_enumerable
+    # StanfordParser::DocumentPreprocessor is not an enumerable object.
+    assert_equal @preproc.map, []
+  end
+end # DocumentPreprocessorTestCase

metadata ADDED Viewed

@@ -0,0 +1,53 @@
+--- !ruby/object:Gem::Specification
+rubygems_version: 0.9.4
+specification_version: 1
+name: stanfordparser
+version: !ruby/object:Gem::Version
+  version: 1.0.0
+date: 2007-11-04 00:00:00 -07:00
+summary: Ruby wrapper for the Stanford Natural Language Parser
+require_paths:
+- lib
+email: billmcn@gmail.com
+homepage: http://stanfordparser.rubyforge.org/
+rubyforge_project: stanfordparser
+description: This module is a Ruby wrapper for the Stanford Natural Language Parser.
+autorequire:
+default_executable:
+bindir: bin
+has_rdoc: true
+required_ruby_version: !ruby/object:Gem::Version::Requirement
+  requirements:
+  - - ">"
+    - !ruby/object:Gem::Version
+      version: 0.0.0
+  version:
+platform: ruby
+signing_key:
+cert_chain:
+post_install_message:
+authors:
+- W.P. McNeill
+files:
+- test/test_stanfordparser.rb
+- lib/stanfordparser.rb
+- README
+test_files:
+- test/test_stanfordparser.rb
+rdoc_options:
+- - --title
+  - StanfordParser -- Stanford Parser
+  - --main
+  - README
+  - --line-numbers
+  - --inline-source
+extra_rdoc_files:
+- README
+executables: []
+extensions: []
+requirements: []
+dependencies: []