stanfordparser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +55 -0
- data/lib/stanfordparser.rb +249 -0
- data/test/test_stanfordparser.rb +103 -0
- metadata +53 -0
data/README
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
= Stanford Natural Language Parser
|
2
|
+
|
3
|
+
This module is a wrapper for the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
4
|
+
|
5
|
+
The Stanford Natural Language Parser is a Java implementation of a probabilistic PCFG and dependency parser for English, German, Chinese, and Arabic. This module provides a thin wrapper around the Java code to make it accessible from Ruby.
|
6
|
+
|
7
|
+
= Installation
|
8
|
+
|
9
|
+
To run this module you must install the following additional software:
|
10
|
+
|
11
|
+
* The {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml]
|
12
|
+
* The {Ruby Java Bridge}[http://rjb.rubyforge.org/] gem.
|
13
|
+
|
14
|
+
This module expects the parser to be installed in the <tt>/usr/local/stanford-parser/current</tt> directory. This is the directory that contains the <tt>stanford-parser.jar</tt> file. An alternate directory can be specified with a <tt>/etc/ruby-stanford-parser.yaml</tt> configuration file. This file is in the Ruby YAML[http://www.ruby-doc.org/core/classes/YAML.html] format, and contains a single <tt>root</tt> value, for example:
|
15
|
+
|
16
|
+
root: /usr/local/stanford-parser/other/location
|
17
|
+
|
18
|
+
|
19
|
+
= Usage
|
20
|
+
|
21
|
+
Use the StanfordParser::LexicalizedParser class to parse sentences.
|
22
|
+
|
23
|
+
irb(main):001:0> require 'stanfordparser'
|
24
|
+
=> true
|
25
|
+
irb(main):002:0> parser = StanfordParser::LexicalizedParser.new
|
26
|
+
Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [5.5 sec].
|
27
|
+
=> edu.stanford.nlp.parser.lexparser.LexicalizedParser
|
28
|
+
irb(main):003:0> puts parser.apply("This is a sentence.")
|
29
|
+
(ROOT
|
30
|
+
(S [24.917]
|
31
|
+
(NP [6.139] (DT [2.300] This))
|
32
|
+
(VP [17.636] (VBZ [0.144] is)
|
33
|
+
(NP [12.299] (DT [1.419] a) (NN [8.897] sentence)))
|
34
|
+
(. [0.002] .)))
|
35
|
+
|
36
|
+
Use the StanfordParser::DocumentPreprocessor class to tokenize text and files into words or sentences.
|
37
|
+
|
38
|
+
irb(main):004:0> preproc = StanfordParser::DocumentPreprocessor.new
|
39
|
+
irb(main):008:0> puts preproc.getSentencesFromString("This is a sentence. So is this.")
|
40
|
+
This is a sentence .
|
41
|
+
So is this .
|
42
|
+
|
43
|
+
For complete details about the use of these classes, see the documentation on the Stanford Natural Language Parser website.
|
44
|
+
|
45
|
+
|
46
|
+
= Copyright
|
47
|
+
|
48
|
+
Copyright 2007, William Patrick McNeill
|
49
|
+
|
50
|
+
This program is distributed under the GNU General Public License.
|
51
|
+
|
52
|
+
|
53
|
+
= Author
|
54
|
+
|
55
|
+
W.P. McNeill mailto:billmcn@gmail.com
|
@@ -0,0 +1,249 @@
|
|
1
|
+
# Copyright 2007 William Patrick McNeill
|
2
|
+
#
|
3
|
+
# This file is part of the Stanford Parser Ruby Wrapper.
|
4
|
+
#
|
5
|
+
# The Stanford Parser Ruby Wrapper is free software; you can redistribute it
|
6
|
+
# and/or modify it under the terms of the GNU General Public License as
|
7
|
+
# published by the Free Software Foundation; either version 2 of the License,
|
8
|
+
# or (at your option) any later version.
|
9
|
+
#
|
10
|
+
# The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
|
11
|
+
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
13
|
+
# Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License along with
|
16
|
+
# the Stanford Parser Ruby Wrapper; if not, write to the Free Software Foundation, Inc., 51 Franklin
|
17
|
+
# St, Fifth Floor, Boston, MA 02110-1301 USA
|
18
|
+
|
19
|
+
|
20
|
+
require "pathname"
|
21
|
+
require "rjb"
|
22
|
+
require "set"
|
23
|
+
require "yaml"
|
24
|
+
|
25
|
+
# Extension to the {Ruby-Java Bridge}[http://rjb.rubyforge.org/] module that
|
26
|
+
# adds a generic Java object wrapper class.
|
27
|
+
module Rjb
|
28
|
+
|
29
|
+
# A generic wrapper for a Java object loaded via the Ruby Java Bridge. The
|
30
|
+
# wrapper class handles initialization and stringification, and passes other
|
31
|
+
# method calls down to the underlying Java object. Objects returned by the
|
32
|
+
# underlying Java object are converted to the appropriate Ruby object.
|
33
|
+
#
|
34
|
+
# This object is enumerable, yielding items in the order defined by the Java
|
35
|
+
# object's iterator.
|
36
|
+
class JavaObjectWrapper
  include Enumerable

  # The underlying Java object.
  attr_reader :java_object

  # Initialize with a Java object <em>obj</em>. If <em>obj</em> is a String
  # (or an instance of a String subclass), assume it is a Java class name and
  # instantiate that class, passing <em>args</em> to its constructor.
  # Otherwise, treat <em>obj</em> as an instance of a Java object;
  # <em>args</em> is not used in that case.
  def initialize(obj, *args)
    @java_object = obj.is_a?(String) ? Rjb::import(obj).new(*args) : obj
  end

  # Enumerate all the items in the object using its iterator, converting
  # each item with wrap_java_object. If the object has no +iterator+ method,
  # this function yields nothing.
  #
  # When called without a block an Enumerator is returned, following the
  # usual Enumerable#each convention, rather than failing with a
  # LocalJumpError on the first item.
  def each
    return enum_for(:each) unless block_given?

    # Look the method up through Java reflection on the wrapped object.
    has_iterator =
      @java_object.getClass.getMethods.any? { |m| m.getName == "iterator" }
    return unless has_iterator

    i = @java_object.iterator
    yield wrap_java_object(i.next) while i.hasNext
  end # each

  # Reflect unhandled method calls to the underlying Java object and convert
  # the value that comes back with wrap_java_object.
  def method_missing(m, *args)
    wrap_java_object(@java_object.send(m, *args))
  end

  # Convert a value returned by a call to the underlying Java object to the
  # appropriate Ruby object as follows:
  # * Ruby Arrays are converted item by item.
  # * <tt>java.util.ArrayList</tt> objects are converted to Ruby Arrays.
  # * <tt>java.util.HashSet</tt> objects are converted to Ruby Sets.
  # * Other RJB objects are handed to wrap_rjb_object.
  # * Other objects are left unchanged.
  #
  # This function is applied recursively to items in collection objects such
  # as sets and arrays.
  def wrap_java_object(object)
    if object.is_a?(Array)
      object.map { |item| wrap_java_object(item) }
    # Ruby-Java Bridge Java objects all have a _classname member which tells
    # the name of their Java class.
    elsif object.respond_to?(:_classname)
      case object._classname
      when /java\.util\.ArrayList/
        # Convert java.util.ArrayList objects to Ruby arrays.
        Array.new(object.size) { |i| wrap_java_object(object.get(i)) }
      when /java\.util\.HashSet/
        # Convert java.util.HashSet objects to Ruby sets.
        set = Set.new
        i = object.iterator
        set << wrap_java_object(i.next) while i.hasNext
        set
      else
        # Pass other RJB objects off to a handler.
        wrap_rjb_object(object)
      end # case
    else
      # Return non-RJB objects unchanged.
      object
    end # if
  end # wrap_java_object

  # By default, all RJB classes other than <tt>java.util.ArrayList</tt> and
  # <tt>java.util.HashSet</tt> go in a generic wrapper. Derived classes may
  # change this behavior.
  def wrap_rjb_object(object)
    JavaObjectWrapper.new(object)
  end

  # Show the classname of the underlying Java object.
  def inspect
    "<#{@java_object._classname}>"
  end

  # Use the underlying Java object's stringification. +toString+ is not
  # defined on the wrapper, so the call is reflected to the Java object by
  # method_missing.
  def to_s
    toString
  end

  protected :wrap_java_object, :wrap_rjb_object

end # JavaObjectWrapper
|
126
|
+
|
127
|
+
end # Rjb
|
128
|
+
|
129
|
+
|
130
|
+
# Wrapper for the {Stanford Natural Language
|
131
|
+
# Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
132
|
+
module StanfordParser
|
133
|
+
|
134
|
+
VERSION = "1.0.0"
|
135
|
+
|
136
|
+
# This function is executed once when the module is loaded. It adds the
|
137
|
+
# Stanford parser jarfile to the JVM classpath and returns the root of the
|
138
|
+
# parser installation. The root of the installation may be written in a
|
139
|
+
# YAML file in <tt>/etc/ruby-stanford-parser.yaml</tt>. If this file is not
|
140
|
+
# present, the default root <tt>/usr/local/stanford-parser/current</tt> is
|
141
|
+
# used.
|
142
|
+
def StanfordParser.initialize_on_load
  # Default installation root, used unless the configuration file overrides it.
  root = Pathname.new("/usr/local/stanford-parser/current")
  config = Pathname.new("/etc/ruby-stanford-parser.yaml")
  if config.file?
    configuration = config.open { |f| YAML.load(f) }
    # An empty or malformed configuration file does not load as a mapping;
    # in that case (or when "root" has no value) keep the default root
    # instead of failing while the module is being loaded.
    configured_root = configuration.is_a?(Hash) ? configuration["root"] : nil
    root = Pathname.new(configured_root.to_s) unless configured_root.nil?
  end
  # Make the parser jarfile visible to the JVM started by the Ruby-Java Bridge.
  Rjb::load((root + "stanford-parser.jar").to_s)
  root
end
|
154
|
+
|
155
|
+
# The root directory of the Stanford parser installation.
|
156
|
+
ROOT = StanfordParser.initialize_on_load
|
157
|
+
|
158
|
+
|
159
|
+
# Extension of the generic Ruby-Java Bridge wrapper object for the
|
160
|
+
# StanfordParser module.
|
161
|
+
class JavaObjectWrapper < Rjb::JavaObjectWrapper
  # Pick the wrapper for an RJB object returned by a Java call. Objects whose
  # Java class is one of the <tt>edu.stanford.nlp.trees</tt> tree classes
  # listed below are placed inside a Tree wrapper; everything else is handed
  # to the superclass implementation.
  def wrap_rjb_object(object)
    tree_class = /^edu\.stanford\.nlp\.trees\.
                  (Tree|LabeledScoredTreeLeaf|
                  LabeledScoredTreeNode|
                  SimpleTree|TreeGraphNode)$/x
    case object._classname
    when tree_class then Tree.new(object)
    else super(object)
    end
  end # wrap_rjb_object
end # JavaObjectWrapper
|
177
|
+
|
178
|
+
|
179
|
+
# Lexicalized probabilistic parser.
|
180
|
+
#
|
181
|
+
# This is a wrapper for the
|
182
|
+
# <tt>edu.stanford.nlp.parser.lexparser.LexicalizedParser</tt> object.
|
183
|
+
class LexicalizedParser < JavaObjectWrapper
  # The grammar used by the parser, as a Pathname.
  attr_reader :grammar

  # Create the parser given a grammar and options. The <em>grammar</em>
  # argument is a path to a grammar file. This path may contain the string
  # <tt>$(ROOT)</tt>, which will be replaced with the root directory of the
  # Stanford Parser. By default, an English grammar is loaded.
  #
  # The <em>options</em> argument is a list of string arguments as they
  # would appear on a command line. See the documentation of
  # <tt>edu.stanford.nlp.parser.lexparser.Options.setOptions</tt> for more
  # details.
  def initialize(grammar = "$(ROOT)/englishPCFG.ser.gz", options = [])
    # ROOT is a Pathname, which String#gsub does not accept as a replacement
    # argument on Ruby 1.9 and later. The block form is used so that the
    # root is inserted literally, without backslash sequences in the path
    # being interpreted as back-references.
    @grammar = Pathname.new(grammar.to_s.gsub(/\$\(ROOT\)/) { ROOT.to_s })
    super("edu.stanford.nlp.parser.lexparser.LexicalizedParser", @grammar.to_s)
    @java_object.setOptionFlags(options)
  end

  # Identify the parser by the file name of its grammar.
  def to_s
    "LexicalizedParser(#{grammar.basename})"
  end
end # LexicalizedParser
|
206
|
+
|
207
|
+
|
208
|
+
# A parse tree that supports preorder enumeration via the Enumerable mixin.
|
209
|
+
#
|
210
|
+
# This is a wrapper for the
|
211
|
+
# <tt>edu.stanford.nlp.trees.Tree</tt> objects.
|
212
|
+
class Tree < JavaObjectWrapper
  include Enumerable

  # Wrap <em>obj</em>, which is either a Java tree object or the name of a
  # Java class to instantiate (<tt>edu.stanford.nlp.trees.Tree</tt> when
  # omitted).
  def initialize(obj = "edu.stanford.nlp.trees.Tree")
    super
  end

  # Return the label in parentheses, followed by the score formatted to two
  # decimal places unless the score is NaN.
  def inspect
    text = label.to_s
    text += " [#{sprintf '%.2f', score}]" unless score.nan?
    "(#{text})"
  end

  # The Penn treebank representation. This prints with indenting instead of
  # putting everything on one line.
  def to_s
    pennString.to_s
  end
end # Tree
|
231
|
+
|
232
|
+
|
233
|
+
# Tokenizes documents into words and sentences.
|
234
|
+
#
|
235
|
+
# This is a wrapper for the
|
236
|
+
# <tt>edu.stanford.nlp.process.DocumentPreprocessor</tt> object.
|
237
|
+
class DocumentPreprocessor < JavaObjectWrapper
  # Instantiate the Java
  # <tt>edu.stanford.nlp.process.DocumentPreprocessor</tt> class, passing
  # <em>suppressEscaping</em> to its constructor.
  def initialize(suppressEscaping = false)
    super("edu.stanford.nlp.process.DocumentPreprocessor", suppressEscaping)
  end

  # Returns a list of sentences in a string.
  #
  # The string is handed to Java as a <tt>java.io.StringReader</tt>. The
  # explicit signature passed to +_invoke+ presumably selects the
  # <tt>java.io.Reader</tt> overload of +getSentencesFromText+ -- confirm
  # against the Ruby-Java Bridge documentation.
  def getSentencesFromString(s)
    reader = Rjb::JavaObjectWrapper.new("java.io.StringReader", s)
    _invoke(:getSentencesFromText, "Ljava.io.Reader;", reader.java_object)
  end
end # DocumentPreprocessor
|
248
|
+
|
249
|
+
end # StanfordParser
|
@@ -0,0 +1,103 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
|
5
|
+
# Copyright 2007 William Patrick McNeill
|
6
|
+
#
|
7
|
+
# This file is part of the Stanford Parser Ruby Wrapper.
|
8
|
+
#
|
9
|
+
# The Stanford Parser Ruby Wrapper is free software; you can redistribute it
|
10
|
+
# and/or modify it under the terms of the GNU General Public License as
|
11
|
+
# published by the Free Software Foundation; either version 2 of the License,
|
12
|
+
# or (at your option) any later version.
|
13
|
+
#
|
14
|
+
# The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
|
15
|
+
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
17
|
+
# Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License along with
|
20
|
+
# the Stanford Parser Ruby Wrapper; if not, write to the Free Software Foundation, Inc., 51 Franklin
|
21
|
+
# St, Fifth Floor, Boston, MA 02110-1301 USA
|
22
|
+
#
|
23
|
+
#++
|
24
|
+
|
25
|
+
# Test cases for the Stanford Parser module
|
26
|
+
|
27
|
+
require "test/unit"
|
28
|
+
require "set"
|
29
|
+
require "singleton"
|
30
|
+
require "stanfordparser"
|
31
|
+
|
32
|
+
|
33
|
+
# Make the Lexicalized Parser a singleton for the tests because it takes
|
34
|
+
# several seconds to load.
|
35
|
+
class StanfordParser::LexicalizedParser
  # Reopen the library class for the duration of the tests so that every
  # test case shares the one parser returned by LexicalizedParser.instance.
  include Singleton
end
|
38
|
+
|
39
|
+
|
40
|
+
# Tests for StanfordParser::LexicalizedParser and the module-level ROOT.
class LexicalizedParserTestCase < Test::Unit::TestCase
  # Parse one sentence with the shared parser before each test.
  def setup
    @parser = StanfordParser::LexicalizedParser.instance
    @tree = @parser.apply("This is a sentence.")
  end

  # ROOT is exposed as a Pathname.
  def test_root_path
    assert_instance_of Pathname, StanfordParser::ROOT
  end

  # The default grammar lives under ROOT, and parsing returns a wrapped Tree.
  def test_parser
    assert_equal StanfordParser::ROOT + "englishPCFG.ser.gz", @parser.grammar
    assert_instance_of StanfordParser::Tree, @tree
  end

  # There are five local trees; NP occurs twice, so there are four distinct
  # labels.
  def test_localTrees
    l = @tree.localTrees
    assert_equal 5, l.size
    assert_equal Set.new(["S", "NP", "VP", "ROOT"]),
                 Set.new(l.map { |t| t.label.to_s })
  end

  def test_enumerable
    # StanfordParser::LexicalizedParser is not an enumerable object.
    # to_a is used rather than a block-less map, which returns an Enumerator
    # instead of an Array on Ruby 1.9 and later.
    assert_equal [], @parser.to_a
  end
end # LexicalizedParserTestCase
|
67
|
+
|
68
|
+
|
69
|
+
# Tests for enumeration of StanfordParser::Tree objects.
class TreeTestCase < Test::Unit::TestCase
  # Parse one sentence with the shared parser before each test.
  def setup
    @parser = StanfordParser::LexicalizedParser.instance
    @tree = @parser.apply("This is a sentence.")
  end

  # Every enumerated node is a Tree wrapper around a labeled scored tree
  # node or leaf, and the labels come out in the expected order.
  def test_enumerable
    assert(@tree.all? { |n| n.class == StanfordParser::Tree })
    assert(@tree.all? { |n|
      n._classname == "edu.stanford.nlp.trees.LabeledScoredTreeNode" ||
        n._classname == "edu.stanford.nlp.trees.LabeledScoredTreeLeaf"
    })
    assert_equal ["ROOT", "S", "NP", "DT", "This", "VP", "VBZ", "is", "NP",
                  "DT", "a", "NN", "sentence", ".", "."],
                 @tree.map { |n| n.label.to_s }
  end
end # TreeTestCase
|
86
|
+
|
87
|
+
|
88
|
+
# Tests for StanfordParser::DocumentPreprocessor.
class DocumentPreprocessorTestCase < Test::Unit::TestCase
  def setup
    @preproc = StanfordParser::DocumentPreprocessor.new
  end

  # Two sentences in one string come back as two tokenized sentences.
  def test_get_sentences_from_string
    s = @preproc.getSentencesFromString("This is a sentence. So is this.")
    assert_equal "This is a sentence .", s[0].to_s
    assert_equal "So is this .", s[1].to_s
  end

  def test_enumerable
    # StanfordParser::DocumentPreprocessor is not an enumerable object.
    # to_a is used rather than a block-less map, which returns an Enumerator
    # instead of an Array on Ruby 1.9 and later.
    assert_equal [], @preproc.to_a
  end
end # DocumentPreprocessorTestCase
|
metadata
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.4
|
3
|
+
specification_version: 1
|
4
|
+
name: stanfordparser
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 1.0.0
|
7
|
+
date: 2007-11-04 00:00:00 -07:00
|
8
|
+
summary: Ruby wrapper for the Stanford Natural Language Parser
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: billmcn@gmail.com
|
12
|
+
homepage: http://stanfordparser.rubyforge.org/
|
13
|
+
rubyforge_project: stanfordparser
|
14
|
+
description: This module is a Ruby wrapper for the Stanford Natural Language Parser.
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- W.P. McNeill
|
31
|
+
files:
|
32
|
+
- test/test_stanfordparser.rb
|
33
|
+
- lib/stanfordparser.rb
|
34
|
+
- README
|
35
|
+
test_files:
|
36
|
+
- test/test_stanfordparser.rb
|
37
|
+
rdoc_options:
|
38
|
+
- - --title
|
39
|
+
- StanfordParser -- Stanford Parser
|
40
|
+
- --main
|
41
|
+
- README
|
42
|
+
- --line-numbers
|
43
|
+
- --inline-source
|
44
|
+
extra_rdoc_files:
|
45
|
+
- README
|
46
|
+
executables: []
|
47
|
+
|
48
|
+
extensions: []
|
49
|
+
|
50
|
+
requirements: []
|
51
|
+
|
52
|
+
dependencies: []
|
53
|
+
|