stanfordparser 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/README ADDED
@@ -0,0 +1,55 @@
1
+ = Stanford Natural Language Parser
2
+
3
+ This module is a wrapper for the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
4
+
5
+ The Stanford Natural Language Parser is a Java implementation of a probabilistic context-free grammar (PCFG) and dependency parser for English, German, Chinese, and Arabic. This module provides a thin wrapper around the Java code to make it accessible from Ruby.
6
+
7
+ = Installation
8
+
9
+ To run this module you must install the following additional software:
10
+
11
+ * The {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml]
12
+ * The {Ruby Java Bridge}[http://rjb.rubyforge.org/] gem.
13
+
14
+ This module expects the parser to be installed in the <tt>/usr/local/stanford-parser/current</tt> directory. This is the directory that contains the <tt>stanford-parser.jar</tt> file. An alternate directory can be specified with a <tt>/etc/ruby-stanford-parser.yaml</tt> configuration file. This file is in the Ruby YAML[http://www.ruby-doc.org/core/classes/YAML.html] format, and contains a single <tt>root</tt> value, for example:
15
+
16
+ root: /usr/local/stanford-parser/other/location
17
+
18
+
19
+ = Usage
20
+
21
+ Use the StanfordParser::LexicalizedParser class to parse sentences.
22
+
23
+ irb(main):001:0> require 'stanfordparser'
24
+ => true
25
+ irb(main):002:0> parser = StanfordParser::LexicalizedParser.new
26
+ Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [5.5 sec].
27
+ => edu.stanford.nlp.parser.lexparser.LexicalizedParser
28
+ irb(main):003:0> puts parser.apply("This is a sentence.")
29
+ (ROOT
30
+ (S [24.917]
31
+ (NP [6.139] (DT [2.300] This))
32
+ (VP [17.636] (VBZ [0.144] is)
33
+ (NP [12.299] (DT [1.419] a) (NN [8.897] sentence)))
34
+ (. [0.002] .)))
35
+
36
+ Use the StanfordParser::DocumentPreprocessor class to tokenize text and files into words or sentences.
37
+
38
+ irb(main):004:0> preproc = StanfordParser::DocumentPreprocessor.new
39
+ irb(main):005:0> puts preproc.getSentencesFromString("This is a sentence. So is this.")
40
+ This is a sentence .
41
+ So is this .
42
+
43
+ For complete details about the use of these classes, see the documentation on the Stanford Natural Language Parser website.
44
+
45
+
46
+ = Copyright
47
+
48
+ Copyright 2007, William Patrick McNeill
49
+
50
+ This program is distributed under the GNU General Public License.
51
+
52
+
53
+ = Author
54
+
55
+ W.P. McNeill mailto:billmcn@gmail.com
@@ -0,0 +1,249 @@
1
+ # Copyright 2007 William Patrick McNeill
2
+ #
3
+ # This file is part of the Stanford Parser Ruby Wrapper.
4
+ #
5
+ # The Stanford Parser Ruby Wrapper is free software; you can redistribute it
6
+ # and/or modify it under the terms of the GNU General Public License as
7
+ # published by the Free Software Foundation; either version 2 of the License,
8
+ # or (at your option) any later version.
9
+ #
10
+ # The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
11
+ # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
13
+ # Public License for more details.
14
+ #
15
+ # You should have received a copy of the GNU General Public License along with
16
+ # the Stanford Parser Ruby Wrapper; if not, write to the Free Software Foundation, Inc., 51 Franklin
17
+ # St, Fifth Floor, Boston, MA 02110-1301 USA
18
+
19
+
20
+ require "pathname"
21
+ require "rjb"
22
+ require "set"
23
+ require "yaml"
24
+
25
# Extensions to the {Ruby-Java Bridge}[http://rjb.rubyforge.org/] module that
# add a generic Java object wrapper class.
module Rjb

  # A generic wrapper for a Java object loaded via the Ruby Java Bridge.  The
  # wrapper class handles initialization and stringification, and passes other
  # method calls down to the underlying Java object.  Objects returned by the
  # underlying Java object are converted to the appropriate Ruby object.
  #
  # This object is enumerable, yielding items in the order defined by the Java
  # object's iterator.
  class JavaObjectWrapper
    include Enumerable

    # The underlying Java object.
    attr_reader :java_object

    # Initialize with a Java object <em>obj</em>.  If <em>obj</em> is a
    # String, assume it is a Java class name and instantiate it, passing
    # <em>args</em> to the Java constructor.  Otherwise, treat <em>obj</em>
    # as an instance of a Java object and ignore <em>args</em>.
    def initialize(obj, *args)
      @java_object = obj.is_a?(String) ? Rjb::import(obj).new(*args) : obj
    end

    # Enumerate all the items in the object using its iterator.  If the object
    # has no iterator, this function yields nothing.
    #
    # When called without a block an Enumerator is returned, following the
    # usual Enumerable convention, instead of raising LocalJumpError.
    def each
      return to_enum(:each) unless block_given?
      # Look the method up by reflection because the Java object may not
      # expose an iterator at all.
      return unless @java_object.getClass.getMethods.any? { |m| m.getName == "iterator" }

      iterator = @java_object.iterator
      yield wrap_java_object(iterator.next) while iterator.hasNext
    end # each

    # Reflect unhandled method calls to the underlying Java object and wrap
    # whatever it returns.
    def method_missing(m, *args)
      wrap_java_object(@java_object.send(m, *args))
    end

    # Convert a value returned by a call to the underlying Java object to the
    # appropriate Ruby object as follows:
    # * RJB objects are placed inside a generic JavaObjectWrapper wrapper.
    # * <tt>java.util.ArrayList</tt> objects are converted to Ruby Arrays.
    # * <tt>java.util.HashSet</tt> objects are converted to Ruby Sets.
    # * Other objects are left unchanged.
    #
    # This function is applied recursively to items in collection objects such
    # as sets and arrays.
    def wrap_java_object(object)
      return object.map { |item| wrap_java_object(item) } if object.kind_of?(Array)
      # Ruby-Java Bridge Java objects all have a _classname member which tells
      # the name of their Java class.  Return non-RJB objects unchanged.
      return object unless object.respond_to?(:_classname)

      case object._classname
      when /java\.util\.ArrayList/
        # Convert java.util.ArrayList objects to Ruby arrays.
        Array.new(object.size) { |i| wrap_java_object(object.get(i)) }
      when /java\.util\.HashSet/
        # Convert java.util.HashSet objects to Ruby sets.
        set = Set.new
        iterator = object.iterator
        set << wrap_java_object(iterator.next) while iterator.hasNext
        set
      else
        # Pass other RJB objects off to a handler.
        wrap_rjb_object(object)
      end # case
    end # wrap_java_object

    # By default, all RJB classes other than <tt>java.util.ArrayList</tt> and
    # <tt>java.util.HashSet</tt> go in a generic wrapper.  Derived classes may
    # change this behavior.
    def wrap_rjb_object(object)
      JavaObjectWrapper.new(object)
    end

    # Show the classname of the underlying Java object.
    def inspect
      "<#{@java_object._classname}>"
    end

    # Use the underlying Java object's stringification.  The +toString+ call
    # is routed to the Java object through method_missing.
    def to_s
      toString
    end

    protected :wrap_java_object, :wrap_rjb_object

  end # JavaObjectWrapper

end # Rjb
128
+
129
+
130
# Wrapper for the {Stanford Natural Language
# Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
module StanfordParser

  VERSION = "1.0.0"

  # This function is executed once when the module is loaded.  It adds the
  # Stanford parser jarfile to the JVM classpath and returns the root of the
  # parser installation.  The root of the installation may be written in a
  # YAML file in <tt>/etc/ruby-stanford-parser.yaml</tt> as a <tt>root</tt>
  # value.  If this file is not present, or does not contain a mapping with a
  # <tt>root</tt> key, the default root
  # <tt>/usr/local/stanford-parser/current</tt> is used.
  def StanfordParser.initialize_on_load
    root = Pathname.new("/usr/local/stanford-parser/current")
    config = Pathname.new("/etc/ruby-stanford-parser.yaml")
    if config.file?
      configuration = config.open { |f| YAML.load(f) }
      # An empty or non-mapping YAML document does not load as a Hash, so
      # check before asking for the key.
      if configuration.is_a?(Hash) && configuration.key?("root")
        root = Pathname.new(configuration["root"])
      end
    end
    Rjb::load((root + "stanford-parser.jar").to_s)
    root
  end

  # The root directory of the Stanford parser installation.
  ROOT = StanfordParser.initialize_on_load


  # Extension of the generic Ruby-Java Bridge wrapper object for the
  # StanfordParser module.
  class JavaObjectWrapper < Rjb::JavaObjectWrapper
    # Wrap a return value in the appropriate specialized wrapper class from
    # the StanfordParser module.  Objects with no specialized wrapper fall
    # back to the generic Rjb::JavaObjectWrapper behavior.
    def wrap_rjb_object(object)
      case object._classname
      when /^edu\.stanford\.nlp\.trees\.
            (Tree|LabeledScoredTreeLeaf|
             LabeledScoredTreeNode|
             SimpleTree|TreeGraphNode)$/x
        # Tree objects go inside a Tree wrapper.
        Tree.new(object)
      else
        super(object)
      end # case
    end # wrap_rjb_object
  end # JavaObjectWrapper


  # Lexicalized probabilistic parser.
  #
  # This is a wrapper for the
  # <tt>edu.stanford.nlp.parser.lexparser.LexicalizedParser</tt> object.
  class LexicalizedParser < JavaObjectWrapper
    # The grammar used by the parser, as a Pathname.
    attr_reader :grammar

    # Create the parser given a grammar and options.  The <em>grammar</em>
    # argument is a path to a grammar file.  This path may contain the string
    # <tt>$(ROOT)</tt>, which will be replaced with the root directory of the
    # Stanford Parser.  By default, an English grammar is loaded.
    #
    # The <em>options</em> argument is a list of string arguments as they
    # would appear on a command line.  See the documentation of
    # <tt>edu.stanford.nlp.parser.lexparser.Options.setOptions</tt> for more
    # details.
    def initialize(grammar = "$(ROOT)/englishPCFG.ser.gz", options = [])
      # ROOT is a Pathname, which gsub does not accept as a replacement
      # string.  The block form also inserts the path verbatim, so any
      # backslashes in it are not treated as back-references.
      @grammar = Pathname.new(grammar.gsub(/\$\(ROOT\)/) { ROOT.to_s })
      super("edu.stanford.nlp.parser.lexparser.LexicalizedParser", @grammar.to_s)
      @java_object.setOptionFlags(options)
    end

    # Identify the parser by the file name of its grammar.
    def to_s
      "LexicalizedParser(#{grammar.basename})"
    end
  end # LexicalizedParser


  # A parse tree that supports preorder enumeration via the Enumerable mixin.
  #
  # This is a wrapper for the
  # <tt>edu.stanford.nlp.trees.Tree</tt> objects.
  class Tree < JavaObjectWrapper
    include Enumerable

    # Wrap the Java tree object <em>obj</em>, or instantiate the Java class
    # named by <em>obj</em> when it is a String.
    def initialize(obj = "edu.stanford.nlp.trees.Tree")
      super(obj)
    end

    # Return the label along with the score if there is one.
    def inspect
      text = "#{label}"
      # A NaN score means the node is unscored, so show the label alone.
      text += " [#{sprintf '%.2f', score}]" unless score.nan?
      "(#{text})"
    end

    # The Penn treebank representation.  This prints with indenting instead of
    # putting everything on one line.
    def to_s
      "#{pennString}"
    end
  end # Tree


  # Tokenizes documents into words and sentences.
  #
  # This is a wrapper for the
  # <tt>edu.stanford.nlp.process.DocumentPreprocessor</tt> object.
  class DocumentPreprocessor < JavaObjectWrapper
    # Create the preprocessor, passing <em>suppressEscaping</em> to the Java
    # constructor.
    def initialize(suppressEscaping = false)
      super("edu.stanford.nlp.process.DocumentPreprocessor", suppressEscaping)
    end

    # Returns a list of sentences in a string.
    def getSentencesFromString(s)
      reader = Rjb::JavaObjectWrapper.new("java.io.StringReader", s)
      # Name the java.io.Reader signature explicitly when invoking
      # getSentencesFromText.
      _invoke(:getSentencesFromText, "Ljava.io.Reader;", reader.java_object)
    end
  end # DocumentPreprocessor

end # StanfordParser
@@ -0,0 +1,103 @@
1
+ #!/bin/env ruby
2
+
3
+ #--
4
+
5
+ # Copyright 2007 William Patrick McNeill
6
+ #
7
+ # This file is part of the Stanford Parser Ruby Wrapper.
8
+ #
9
+ # The Stanford Parser Ruby Wrapper is free software; you can redistribute it
10
+ # and/or modify it under the terms of the GNU General Public License as
11
+ # published by the Free Software Foundation; either version 2 of the License,
12
+ # or (at your option) any later version.
13
+ #
14
+ # The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
15
+ # useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
16
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
17
+ # Public License for more details.
18
+ #
19
+ # You should have received a copy of the GNU General Public License along with
20
+ # the Stanford Parser Ruby Wrapper; if not, write to the Free Software Foundation, Inc., 51 Franklin
21
+ # St, Fifth Floor, Boston, MA 02110-1301 USA
22
+ #
23
+ #++
24
+
25
+ # Test cases for the Stanford Parser module
26
+
27
+ require "test/unit"
28
+ require "set"
29
+ require "singleton"
30
+ require "stanfordparser"
31
+
32
+
33
# Loading the lexicalized parser takes several seconds, so the tests share a
# single instance of it by mixing Singleton into the class.
StanfordParser::LexicalizedParser.class_eval do
  include Singleton
end
38
+
39
+
40
# Tests for StanfordParser::LexicalizedParser and the module-level ROOT.
class LexicalizedParserTestCase < Test::Unit::TestCase
  # Parse one fixed sentence with the shared parser before every test.
  def setup
    @parser = StanfordParser::LexicalizedParser.instance
    @tree = @parser.apply("This is a sentence.")
  end

  # assert_equal takes the expected value first, then the actual value.
  def test_root_path
    assert_equal Pathname, StanfordParser::ROOT.class
  end

  def test_parser
    assert_equal StanfordParser::ROOT + "englishPCFG.ser.gz", @parser.grammar
    assert_equal StanfordParser::Tree, @tree.class
  end

  def test_localTrees
    l = @tree.localTrees
    assert_equal 5, l.size
    assert_equal Set.new(["S", "NP", "VP", "ROOT", "NP"]),
                 Set.new(l.collect { |t| "#{t.label}" })
  end

  def test_enumerable
    # StanfordParser::LexicalizedParser is not an enumerable object, so
    # enumerating it yields nothing.  to_a is used because map without a
    # block returns an Enumerator rather than an Array.
    assert_equal [], @parser.to_a
  end
end # LexicalizedParserTestCase
67
+
68
+
69
# Tests for the Enumerable behavior of StanfordParser::Tree.
class TreeTestCase < Test::Unit::TestCase
  # Parse one fixed sentence with the shared parser before every test.
  def setup
    @parser = StanfordParser::LexicalizedParser.instance
    @tree = @parser.apply("This is a sentence.")
  end

  def test_enumerable
    # Every enumerated node is wrapped as a StanfordParser::Tree...
    assert(@tree.all? { |n| n.class == StanfordParser::Tree })
    # ...around one of the two labeled, scored Java tree classes.
    assert(@tree.all? { |n|
      n._classname == "edu.stanford.nlp.trees.LabeledScoredTreeNode" ||
        n._classname == "edu.stanford.nlp.trees.LabeledScoredTreeLeaf"
    })
    # Nodes come out in preorder.  Expected value first, then actual.
    assert_equal ["ROOT", "S", "NP", "DT", "This", "VP", "VBZ", "is", "NP",
                  "DT", "a", "NN", "sentence", ".", "."],
                 @tree.map { |n| "#{n.label}" }
  end
end # TreeTestCase
86
+
87
+
88
# Tests for StanfordParser::DocumentPreprocessor.
class DocumentPreprocessorTestCase < Test::Unit::TestCase
  # Create a fresh preprocessor before every test.
  def setup
    @preproc = StanfordParser::DocumentPreprocessor.new
  end

  # assert_equal takes the expected value first, then the actual value.
  def test_get_sentences_from_string
    s = @preproc.getSentencesFromString("This is a sentence. So is this.")
    assert_equal "This is a sentence .", "#{s[0]}"
    assert_equal "So is this .", "#{s[1]}"
  end

  def test_enumerable
    # StanfordParser::DocumentPreprocessor is not an enumerable object, so
    # enumerating it yields nothing.  to_a is used because map without a
    # block returns an Enumerator rather than an Array.
    assert_equal [], @preproc.to_a
  end
end # DocumentPreprocessorTestCase
metadata ADDED
@@ -0,0 +1,53 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.4
3
+ specification_version: 1
4
+ name: stanfordparser
5
+ version: !ruby/object:Gem::Version
6
+ version: 1.0.0
7
+ date: 2007-11-04 00:00:00 -07:00
8
+ summary: Ruby wrapper for the Stanford Natural Language Parser
9
+ require_paths:
10
+ - lib
11
+ email: billmcn@gmail.com
12
+ homepage: http://stanfordparser.rubyforge.org/
13
+ rubyforge_project: stanfordparser
14
+ description: This module is a Ruby wrapper for the Stanford Natural Language Parser.
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - W.P. McNeill
31
+ files:
32
+ - test/test_stanfordparser.rb
33
+ - lib/stanfordparser.rb
34
+ - README
35
+ test_files:
36
+ - test/test_stanfordparser.rb
37
+ rdoc_options:
38
+ - - --title
39
+ - StanfordParser -- Stanford Parser
40
+ - --main
41
+ - README
42
+ - --line-numbers
43
+ - --inline-source
44
+ extra_rdoc_files:
45
+ - README
46
+ executables: []
47
+
48
+ extensions: []
49
+
50
+ requirements: []
51
+
52
+ dependencies: []
53
+