stanfordparser 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +55 -0
- data/lib/stanfordparser.rb +249 -0
- data/test/test_stanfordparser.rb +103 -0
- metadata +53 -0
data/README
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
= Stanford Natural Language Parser
|
2
|
+
|
3
|
+
This module is a wrapper for the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
4
|
+
|
5
|
+
The Stanford Natural Language Parser is a Java implementation of a probabilistic context-free grammar (PCFG) and dependency parser for English, German, Chinese, and Arabic. This module provides a thin wrapper around the Java code to make it accessible from Ruby.
|
6
|
+
|
7
|
+
= Installation
|
8
|
+
|
9
|
+
To run this module you must install the following additional software:
|
10
|
+
|
11
|
+
* The {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml]
|
12
|
+
* The {Ruby Java Bridge}[http://rjb.rubyforge.org/] gem.
|
13
|
+
|
14
|
+
This module expects the parser to be installed in the <tt>/usr/local/stanford-parser/current</tt> directory. This is the directory that contains the <tt>stanford-parser.jar</tt> file. An alternate directory can be specified with a <tt>/etc/ruby-stanford-parser.yaml</tt> configuration file. This file is in the Ruby YAML[http://www.ruby-doc.org/core/classes/YAML.html] format, and contains a single <tt>root</tt> value, for example:
|
15
|
+
|
16
|
+
root: /usr/local/stanford-parser/other/location
|
17
|
+
|
18
|
+
|
19
|
+
= Usage
|
20
|
+
|
21
|
+
Use the StanfordParser::LexicalizedParser class to parse sentences.
|
22
|
+
|
23
|
+
irb(main):001:0> require 'stanfordparser'
|
24
|
+
=> true
|
25
|
+
irb(main):002:0> parser = StanfordParser::LexicalizedParser.new
|
26
|
+
Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [5.5 sec].
|
27
|
+
=> edu.stanford.nlp.parser.lexparser.LexicalizedParser
|
28
|
+
irb(main):003:0> puts parser.apply("This is a sentence.")
|
29
|
+
(ROOT
|
30
|
+
(S [24.917]
|
31
|
+
(NP [6.139] (DT [2.300] This))
|
32
|
+
(VP [17.636] (VBZ [0.144] is)
|
33
|
+
(NP [12.299] (DT [1.419] a) (NN [8.897] sentence)))
|
34
|
+
(. [0.002] .)))
|
35
|
+
|
36
|
+
Use the StanfordParser::DocumentPreprocessor class to tokenize text and files into words or sentences.
|
37
|
+
|
38
|
+
irb(main):004:0> preproc = StanfordParser::DocumentPreprocessor.new
|
39
|
+
irb(main):008:0> puts preproc.getSentencesFromString("This is a sentence. So is this.")
|
40
|
+
This is a sentence .
|
41
|
+
So is this .
|
42
|
+
|
43
|
+
For complete details about the use of these classes, see the documentation on the Stanford Natural Language Parser website.
|
44
|
+
|
45
|
+
|
46
|
+
= Copyright
|
47
|
+
|
48
|
+
Copyright 2007, William Patrick McNeill
|
49
|
+
|
50
|
+
This program is distributed under the GNU General Public License.
|
51
|
+
|
52
|
+
|
53
|
+
= Author
|
54
|
+
|
55
|
+
W.P. McNeill mailto:billmcn@gmail.com
|
@@ -0,0 +1,249 @@
|
|
1
|
+
# Copyright 2007 William Patrick McNeill
|
2
|
+
#
|
3
|
+
# This file is part of the Stanford Parser Ruby Wrapper.
|
4
|
+
#
|
5
|
+
# The Stanford Parser Ruby Wrapper is free software; you can redistribute it
|
6
|
+
# and/or modify it under the terms of the GNU General Public License as
|
7
|
+
# published by the Free Software Foundation; either version 2 of the License,
|
8
|
+
# or (at your option) any later version.
|
9
|
+
#
|
10
|
+
# The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
|
11
|
+
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
13
|
+
# Public License for more details.
|
14
|
+
#
|
15
|
+
# You should have received a copy of the GNU General Public License along with
|
16
|
+
# the Stanford Parser Ruby Wrapper; if not, write to the Free Software Foundation, Inc., 51 Franklin
|
17
|
+
# St, Fifth Floor, Boston, MA 02110-1301 USA
|
18
|
+
|
19
|
+
|
20
|
+
require "pathname"
|
21
|
+
require "rjb"
|
22
|
+
require "set"
|
23
|
+
require "yaml"
|
24
|
+
|
25
|
+
# Extension to the {Ruby-Java Bridge}[http://rjb.rubyforge.org/] module that
|
26
|
+
# adds a generic Java object wrapper class.
|
27
|
+
module Rjb
|
28
|
+
|
29
|
+
# A generic wrapper for a Java object loaded via the Ruby Java Bridge. The
|
30
|
+
# wrapper class handles initialization and stringification, and passes other
|
31
|
+
# method calls down to the underlying Java object. Objects returned by the
|
32
|
+
# underlying Java object are converted to the appropriate Ruby object.
|
33
|
+
#
|
34
|
+
# This object is enumerable, yielding items in the order defined by the Java
|
35
|
+
# object's iterator.
|
36
|
+
class JavaObjectWrapper
  include Enumerable

  # The underlying Java object.
  attr_reader :java_object

  # Initialize with a Java object <em>obj</em>. If <em>obj</em> is a String
  # (or an instance of a String subclass), treat it as a Java class name:
  # import that class and instantiate it with <em>args</em>. Otherwise,
  # treat <em>obj</em> as an already-constructed Java object and wrap it
  # unchanged; <em>args</em> are ignored in that case.
  def initialize(obj, *args)
    @java_object = obj.is_a?(String) ? Rjb::import(obj).new(*args) : obj
  end

  # Enumerate all the items in the object using its iterator, yielding each
  # item after conversion by wrap_java_object. If the object has no
  # <tt>iterator</tt> method, this function yields nothing. When called
  # without a block, an Enumerator over the same items is returned.
  def each
    return enum_for(:each) unless block_given?

    # Ask the Java class itself whether it declares an iterator() method.
    iterable = @java_object.getClass.getMethods.any? { |m| m.getName == "iterator" }
    return unless iterable

    i = @java_object.iterator
    yield wrap_java_object(i.next) while i.hasNext
  end # each

  # Reflect unhandled method calls (and any block) to the underlying Java
  # object, converting the result with wrap_java_object.
  def method_missing(m, *args, &block)
    wrap_java_object(@java_object.send(m, *args, &block))
  end

  # Keep respond_to? in step with method_missing: a method counts as
  # handled when the underlying Java object says it responds to it. Without
  # this, respond_to? answers false for every delegated method.
  def respond_to_missing?(m, include_private = false)
    @java_object.respond_to?(m, include_private) || super
  end

  # Convert a value returned by a call to the underlying Java object to the
  # appropriate Ruby object as follows:
  # * Ruby Arrays have this conversion applied to each of their items.
  # * <tt>java.util.ArrayList</tt> objects are converted to Ruby Arrays.
  # * <tt>java.util.HashSet</tt> objects are converted to Ruby Sets.
  # * Other RJB objects are handed to wrap_rjb_object.
  # * Other objects are left unchanged.
  #
  # This function is applied recursively to items in collection objects such
  # as sets and arrays.
  def wrap_java_object(object)
    if object.kind_of?(Array)
      object.map { |item| wrap_java_object(item) }
    # Ruby-Java Bridge Java objects all have a _classname member which tells
    # the name of their Java class.
    elsif object.respond_to?(:_classname)
      case object._classname
      when /java\.util\.ArrayList/
        # Convert java.util.ArrayList objects to Ruby arrays.
        Array.new(object.size) { |i| wrap_java_object(object.get(i)) }
      when /java\.util\.HashSet/
        # Convert java.util.HashSet objects to Ruby sets.
        set = Set.new
        i = object.iterator
        set << wrap_java_object(i.next) while i.hasNext
        set
      else
        # Pass other RJB objects off to a handler.
        wrap_rjb_object(object)
      end # case
    else
      # Return non-RJB objects unchanged.
      object
    end # if
  end # wrap_java_object

  # By default, every RJB object that wrap_java_object does not convert
  # itself (that is, anything other than <tt>java.util.ArrayList</tt> and
  # <tt>java.util.HashSet</tt>) goes in a generic wrapper. Derived classes
  # may change this behavior.
  def wrap_rjb_object(object)
    JavaObjectWrapper.new(object)
  end

  # Show the classname of the underlying Java object.
  def inspect
    "<#{@java_object._classname}>"
  end

  # Use the underlying Java object's stringification. <tt>toString</tt> is
  # not defined here, so the call is dispatched through method_missing.
  def to_s
    toString
  end

  protected :wrap_java_object, :wrap_rjb_object

end # JavaObjectWrapper
|
126
|
+
|
127
|
+
end # Rjb
|
128
|
+
|
129
|
+
|
130
|
+
# Wrapper for the {Stanford Natural Language
|
131
|
+
# Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
132
|
+
module StanfordParser
|
133
|
+
|
134
|
+
VERSION = "1.0.0"
|
135
|
+
|
136
|
+
# This function is executed once when the module is loaded. It adds the
|
137
|
+
# Stanford parser jarfile to the JVM classpath and returns the root of the
|
138
|
+
# parser installation. The root of the installation may be written in a
|
139
|
+
# YAML file in <tt>/etc/ruby-stanford-parser.yaml</tt>. If this file is not
|
140
|
+
# present, the default root <tt>/usr/local/stanford-parser/current</tt> is
|
141
|
+
# used.
|
142
|
+
def StanfordParser.initialize_on_load
  # Default installation root; may be overridden by the configuration file.
  root = Pathname.new("/usr/local/stanford-parser/current")
  config = Pathname.new("/etc/ruby-stanford-parser.yaml")
  if config.file?
    configuration = config.open { |f| YAML.load(f) }
    # An empty file, or one that is not a mapping, does not parse to a Hash;
    # in that case (or when "root" has no value) keep the default root
    # rather than raising.
    custom_root = configuration.is_a?(Hash) ? configuration["root"] : nil
    root = Pathname.new(custom_root) if custom_root
  end
  # Hand the parser jarfile to the Ruby-Java Bridge as the JVM classpath.
  Rjb::load((root + "stanford-parser.jar").to_s)
  root
end
|
154
|
+
|
155
|
+
# The root directory of the Stanford parser installation.
|
156
|
+
ROOT = StanfordParser.initialize_on_load
|
157
|
+
|
158
|
+
|
159
|
+
# Extension of the generic Ruby-Java Bridge wrapper object for the
|
160
|
+
# StanfordParser module.
|
161
|
+
class JavaObjectWrapper < Rjb::JavaObjectWrapper
  # Choose the wrapper for an RJB object from its Java class name: the
  # <tt>edu.stanford.nlp.trees</tt> classes listed in the pattern below go
  # inside a Tree wrapper; every other object is left to the superclass.
  def wrap_rjb_object(object)
    tree_classname = /^edu\.stanford\.nlp\.trees\.
                      (Tree|LabeledScoredTreeLeaf|
                       LabeledScoredTreeNode|
                       SimpleTree|TreeGraphNode)$/x
    case object._classname
    when tree_classname then Tree.new(object)
    else super
    end
  end # wrap_rjb_object
end # JavaObjectWrapper
|
177
|
+
|
178
|
+
|
179
|
+
# Lexicalized probabilistic parser.
|
180
|
+
#
|
181
|
+
# This is a wrapper for the
|
182
|
+
# <tt>edu.stanford.nlp.parser.lexparser.LexicalizedParser</tt> object.
|
183
|
+
class LexicalizedParser < JavaObjectWrapper
  # The grammar used by the parser, as a Pathname.
  attr_reader :grammar

  # Create the parser given a grammar and options. The <em>grammar</em>
  # argument is a path to a grammar file. This path may contain the string
  # <tt>$(ROOT)</tt>, which will be replaced with the root directory of the
  # Stanford Parser. By default, an English grammar is loaded.
  #
  # The <em>options</em> argument is a list of string arguments as they
  # would appear on a command line. See the documentation of
  # <tt>edu.stanford.nlp.parser.lexparser.Options.setOptions</tt> for more
  # details.
  def initialize(grammar = "$(ROOT)/englishPCFG.ser.gz", options = [])
    # ROOT is a Pathname, which String#gsub does not accept as a replacement
    # argument on Ruby 1.9 and later, so convert it explicitly. The block
    # form also inserts the root verbatim, with no backslash processing.
    @grammar = Pathname.new(grammar.gsub(/\$\(ROOT\)/) { ROOT.to_s })
    super("edu.stanford.nlp.parser.lexparser.LexicalizedParser", @grammar.to_s)
    @java_object.setOptionFlags(options)
  end

  # Identify the parser by the file name of its grammar.
  def to_s
    "LexicalizedParser(#{grammar.basename})"
  end
end # LexicalizedParser
|
206
|
+
|
207
|
+
|
208
|
+
# A parse tree that supports preorder enumeration via the Enumerable mixin.
|
209
|
+
#
|
210
|
+
# This is a wrapper for the
|
211
|
+
# <tt>edu.stanford.nlp.trees.Tree</tt> objects.
|
212
|
+
class Tree < JavaObjectWrapper
  include Enumerable

  # Wrap <em>obj</em>; when omitted, the Java class name
  # <tt>edu.stanford.nlp.trees.Tree</tt> is passed to the superclass.
  def initialize(obj = "edu.stanford.nlp.trees.Tree")
    super(obj)
  end

  # Return the label in parentheses, followed by the score to two decimal
  # places unless the score is NaN.
  def inspect
    text = "#{label}"
    text += " [#{format('%.2f', score)}]" unless score.nan?
    "(#{text})"
  end

  # The Penn treebank representation. This prints with indenting instead of
  # putting everything on one line.
  def to_s
    pennString.to_s
  end
end # Tree
|
231
|
+
|
232
|
+
|
233
|
+
# Tokenizes documents into words and sentences.
|
234
|
+
#
|
235
|
+
# This is a wrapper for the
|
236
|
+
# <tt>edu.stanford.nlp.process.DocumentPreprocessor</tt> object.
|
237
|
+
class DocumentPreprocessor < JavaObjectWrapper
  # Instantiate the Java preprocessor, passing <em>suppressEscaping</em>
  # through as its only constructor argument.
  def initialize(suppressEscaping = false)
    super("edu.stanford.nlp.process.DocumentPreprocessor", suppressEscaping)
  end

  # Returns a list of sentences in a string.
  #
  # The string is wrapped in a <tt>java.io.StringReader</tt> and handed to
  # the Java object's <tt>getSentencesFromText</tt>. The call goes through
  # <tt>_invoke</tt> with an explicit <tt>java.io.Reader</tt> signature,
  # presumably to select the Reader overload (confirm against the Rjb
  # documentation).
  def getSentencesFromString(s)
    reader = Rjb::JavaObjectWrapper.new("java.io.StringReader", s)
    _invoke(:getSentencesFromText, "Ljava.io.Reader;", reader.java_object)
  end
end # DocumentPreprocessor
|
248
|
+
|
249
|
+
end # StanfordParser
|
@@ -0,0 +1,103 @@
|
|
1
|
+
#!/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
|
5
|
+
# Copyright 2007 William Patrick McNeill
|
6
|
+
#
|
7
|
+
# This file is part of the Stanford Parser Ruby Wrapper.
|
8
|
+
#
|
9
|
+
# The Stanford Parser Ruby Wrapper is free software; you can redistribute it
|
10
|
+
# and/or modify it under the terms of the GNU General Public License as
|
11
|
+
# published by the Free Software Foundation; either version 2 of the License,
|
12
|
+
# or (at your option) any later version.
|
13
|
+
#
|
14
|
+
# The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
|
15
|
+
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
17
|
+
# Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License along with
|
20
|
+
# the Stanford Parser Ruby Wrapper; if not, write to the Free Software Foundation, Inc., 51 Franklin
|
21
|
+
# St, Fifth Floor, Boston, MA 02110-1301 USA
|
22
|
+
#
|
23
|
+
#++
|
24
|
+
|
25
|
+
# Test cases for the Stanford Parser module
|
26
|
+
|
27
|
+
require "test/unit"
|
28
|
+
require "set"
|
29
|
+
require "singleton"
|
30
|
+
require "stanfordparser"
|
31
|
+
|
32
|
+
|
33
|
+
# Make the Lexicalized Parser a singleton for the tests because it takes
|
34
|
+
# several seconds to load.
|
35
|
+
module StanfordParser
  # Reopened for the tests only: mix in Singleton so every test case shares
  # one parser instance.
  class LexicalizedParser
    include Singleton
  end
end
|
38
|
+
|
39
|
+
|
40
|
+
# Tests for StanfordParser::LexicalizedParser. Assertions are written in
# Test::Unit's (expected, actual) order so failure messages read correctly.
class LexicalizedParserTestCase < Test::Unit::TestCase
  # Parse one short sentence before each test, reusing the singleton parser.
  def setup
    @parser = StanfordParser::LexicalizedParser.instance
    @tree = @parser.apply("This is a sentence.")
  end

  # The installation root is exposed as a Pathname.
  def test_root_path
    assert_instance_of Pathname, StanfordParser::ROOT
  end

  # The default grammar lives under ROOT, and parsing yields a Tree wrapper.
  def test_parser
    assert_equal StanfordParser::ROOT + "englishPCFG.ser.gz", @parser.grammar
    assert_instance_of StanfordParser::Tree, @tree
  end

  # The parse of the sample sentence has five local trees with these labels.
  def test_localTrees
    l = @tree.localTrees
    assert_equal 5, l.size
    assert_equal Set.new(["S", "NP", "VP", "ROOT", "NP"]),
                 Set.new(l.map { |t| "#{t.label}" })
  end

  def test_enumerable
    # StanfordParser::LexicalizedParser is not an enumerable object, so
    # enumerating it yields nothing. to_a is used because a block-less map
    # returns an Enumerator (not an Array) on Ruby 1.8.7 and later.
    assert_equal [], @parser.to_a
  end
end # LexicalizedParserTestCase
|
67
|
+
|
68
|
+
|
69
|
+
# Tests for StanfordParser::Tree enumeration. Assertions are written in
# Test::Unit's (expected, actual) order so failure messages read correctly.
class TreeTestCase < Test::Unit::TestCase
  # Parse one short sentence before each test, reusing the singleton parser.
  def setup
    @parser = StanfordParser::LexicalizedParser.instance
    @tree = @parser.apply("This is a sentence.")
  end

  # Enumerating a tree yields Tree wrappers around labeled scored nodes and
  # leaves, in the order listed below.
  def test_enumerable
    assert(@tree.all? { |n| n.class == StanfordParser::Tree })
    assert(@tree.all? { |n|
      n._classname == "edu.stanford.nlp.trees.LabeledScoredTreeNode" ||
        n._classname == "edu.stanford.nlp.trees.LabeledScoredTreeLeaf"
    })
    assert_equal ["ROOT", "S", "NP", "DT", "This", "VP", "VBZ", "is", "NP", "DT", "a",
                  "NN", "sentence", ".", "."],
                 @tree.map { |n| "#{n.label}" }
  end
end # TreeTestCase
|
86
|
+
|
87
|
+
|
88
|
+
# Tests for StanfordParser::DocumentPreprocessor. Assertions are written in
# Test::Unit's (expected, actual) order so failure messages read correctly.
class DocumentPreprocessorTestCase < Test::Unit::TestCase
  def setup
    @preproc = StanfordParser::DocumentPreprocessor.new
  end

  # Two sentences in one string come back as two tokenized sentences.
  def test_get_sentences_from_string
    s = @preproc.getSentencesFromString("This is a sentence. So is this.")
    assert_equal "This is a sentence .", s[0].to_s
    assert_equal "So is this .", s[1].to_s
  end

  def test_enumerable
    # StanfordParser::DocumentPreprocessor is not an enumerable object, so
    # enumerating it yields nothing. to_a is used because a block-less map
    # returns an Enumerator (not an Array) on Ruby 1.8.7 and later.
    assert_equal [], @preproc.to_a
  end
end # DocumentPreprocessorTestCase
|
metadata
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.4
|
3
|
+
specification_version: 1
|
4
|
+
name: stanfordparser
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 1.0.0
|
7
|
+
date: 2007-11-04 00:00:00 -07:00
|
8
|
+
summary: Ruby wrapper for the Stanford Natural Language Parser
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: billmcn@gmail.com
|
12
|
+
homepage: http://stanfordparser.rubyforge.org/
|
13
|
+
rubyforge_project: stanfordparser
|
14
|
+
description: This module is a Ruby wrapper for the Stanford Natural Language Parser.
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- W.P. McNeill
|
31
|
+
files:
|
32
|
+
- test/test_stanfordparser.rb
|
33
|
+
- lib/stanfordparser.rb
|
34
|
+
- README
|
35
|
+
test_files:
|
36
|
+
- test/test_stanfordparser.rb
|
37
|
+
rdoc_options:
|
38
|
+
- - --title
|
39
|
+
- StanfordParser -- Stanford Parser
|
40
|
+
- --main
|
41
|
+
- README
|
42
|
+
- --line-numbers
|
43
|
+
- --inline-source
|
44
|
+
extra_rdoc_files:
|
45
|
+
- README
|
46
|
+
executables: []
|
47
|
+
|
48
|
+
extensions: []
|
49
|
+
|
50
|
+
requirements: []
|
51
|
+
|
52
|
+
dependencies: []
|
53
|
+
|