stanfordparser 1.1.0 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README +12 -7
- data/lib/stanfordparser.rb +50 -18
- data/test/test_stanfordparser.rb +11 -0
- metadata +2 -2
data/README
CHANGED
@@ -4,17 +4,21 @@ This module is a wrapper for the {Stanford Natural Language Parser}[http://nlp.s
|
|
4
4
|
|
5
5
|
The Stanford Natural Language Parser is a Java implementation of a probabilistic context-free grammar (PCFG) parser and a dependency parser for English, German, Chinese, and Arabic. This module provides a thin wrapper around the Java code to make it accessible from Ruby.
|
6
6
|
|
7
|
-
= Installation
|
7
|
+
= Installation and Configuration
|
8
8
|
|
9
9
|
To run this module you must install the following additional software:
|
10
10
|
|
11
11
|
* The {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml]
|
12
12
|
* The {Ruby Java Bridge}[http://rjb.rubyforge.org/] gem.
|
13
13
|
|
14
|
-
|
14
|
+
Note that the Stanford Parser is not a Ruby application and is therefore not available as a Ruby gem; it must be installed manually.
|
15
15
|
|
16
|
-
|
16
|
+
This module expects the parser to be installed in the <tt>/usr/local/stanford-parser/current</tt> directory. This is the directory that contains the <tt>stanford-parser.jar</tt> file. When the module is loaded, it adds this directory to the Java classpath and launches the Java VM with the arguments <tt>-server -Xmx150m</tt>.
|
17
|
+
|
18
|
+
These defaults can be overridden by creating a configuration file at <tt>/etc/ruby-stanford-parser.yaml</tt>. This file is in the Ruby YAML[http://www.ruby-doc.org/core/classes/YAML.html] format, and may contain two values: <tt>root</tt> and <tt>jvmargs</tt>. For example, the file might look like the following:
|
17
19
|
|
20
|
+
root: /usr/local/stanford-parser/other/location
|
21
|
+
jvmargs: -Xmx100m -verbose
|
18
22
|
|
19
23
|
=Usage
|
20
24
|
|
@@ -36,9 +40,9 @@ Use the StanfordParser::LexicalizedParser class to parse sentences.
|
|
36
40
|
Use the StanfordParser::DocumentPreprocessor class to tokenize text and files into words or sentences.
|
37
41
|
|
38
42
|
irb(main):004:0> preproc = StanfordParser::DocumentPreprocessor.new
|
39
|
-
irb(main):008:0> puts preproc.getSentencesFromString("This is a sentence. So is this.")
|
40
|
-
This is a sentence .
|
41
|
-
So is this .
|
43
|
+
irb(main):008:0> puts preproc.getSentencesFromString("This is a sentence. So is this.")
|
44
|
+
This is a sentence .
|
45
|
+
So is this .
|
42
46
|
|
43
47
|
For complete details about the use of these classes, see the documentation on the Stanford Natural Language Parser website.
|
44
48
|
|
@@ -46,7 +50,8 @@ For complete details about the use of these classes, see the documentation on th
|
|
46
50
|
= History
|
47
51
|
|
48
52
|
1.0.0:: Initial release
|
49
|
-
1.1.0:: Make module initialization function private
|
53
|
+
1.1.0:: Make module initialization function private. Add example code.
|
54
|
+
1.2.0:: Read Java VM arguments from the configuration file. Add Word class.
|
50
55
|
|
51
56
|
|
52
57
|
= Copyright
|
data/lib/stanfordparser.rb
CHANGED
@@ -104,8 +104,9 @@ module Rjb
|
|
104
104
|
end # if
|
105
105
|
end # wrap_java_object
|
106
106
|
|
107
|
-
# By default, all RJB classes other than <tt>java.util.ArrayList</tt>
|
108
|
-
# in a generic wrapper. Derived classes may
|
107
|
+
# By default, all RJB classes other than <tt>java.util.ArrayList</tt> and
|
108
|
+
# <tt>java.util.HashSet</tt> go in a generic wrapper. Derived classes may
|
109
|
+
# change this behavior.
|
109
110
|
def wrap_rjb_object(object)
|
110
111
|
JavaObjectWrapper.new(object)
|
111
112
|
end
|
@@ -131,24 +132,36 @@ end # Rjb
|
|
131
132
|
# Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
132
133
|
module StanfordParser
|
133
134
|
|
134
|
-
VERSION = "1.
|
135
|
-
|
136
|
-
#
|
137
|
-
#
|
138
|
-
#
|
139
|
-
|
140
|
-
|
141
|
-
#
|
135
|
+
VERSION = "1.2.0"
|
136
|
+
|
137
|
+
# Path to an English PCFG model that comes with the Stanford Parser. The
|
138
|
+
# location is relative to the parser root directory. This is a valid value
|
139
|
+
# for the <em>grammar</em> parameter of the LexicalizedParser constructor.
|
140
|
+
ENGLISH_PCFG_MODEL = "$(ROOT)/englishPCFG.ser.gz"
|
141
|
+
|
142
|
+
# This function is executed once when the module is loaded. It initializes
|
143
|
+
# the Java virtual machine in which the Stanford parser will run. By
|
144
|
+
# default, it adds the parser installation root
|
145
|
+
# <tt>/usr/local/stanford-parser/current</tt> to the Java classpath and
|
146
|
+
# launches the VM with the arguments <tt>-server -Xmx150m</tt>. Different
|
147
|
+
# values may be specified with the <tt>/etc/ruby-stanford-parser.yaml</tt>
|
148
|
+
# configuration file.
|
149
|
+
#
|
150
|
+
# This function returns the path of the parser installation root.
|
142
151
|
def StanfordParser.initialize_on_load
|
143
152
|
root = Pathname.new("/usr/local/stanford-parser/current")
|
153
|
+
jvmargs = ["-server", "-Xmx150m"]
|
144
154
|
config = Pathname.new("/etc/ruby-stanford-parser.yaml")
|
145
155
|
if config.file?
|
146
156
|
configuration = open(config) {|f| YAML.load(f)}
|
147
|
-
if configuration.key?("root")
|
157
|
+
if configuration.key?("root") and not configuration["root"].nil?
|
148
158
|
root = Pathname.new(configuration["root"])
|
149
159
|
end
|
160
|
+
if configuration.key?("jvmargs") and not configuration["jvmargs"].nil?
|
161
|
+
jvmargs = configuration["jvmargs"].split
|
162
|
+
end
|
150
163
|
end
|
151
|
-
Rjb::load(classpath = (root + "stanford-parser.jar").to_s)
|
164
|
+
Rjb::load(classpath = (root + "stanford-parser.jar").to_s, jvmargs)
|
152
165
|
root
|
153
166
|
end
|
154
167
|
|
@@ -189,13 +202,13 @@ module StanfordParser
|
|
189
202
|
# Create the parser given a grammar and options. The <em>grammar</em>
|
190
203
|
# argument is a path to a grammar file. This path may contain the string
|
191
204
|
# <tt>$(ROOT)</tt>, which will be replaced with the root directory of the
|
192
|
-
# Stanford Parser. By default, an English grammar is loaded.
|
205
|
+
# Stanford Parser. By default, an English PCFG grammar is loaded.
|
193
206
|
#
|
194
207
|
# The <em>options</em> argument is a list of string arguments as they
|
195
208
|
# would appear on a command line. See the documentation of
|
196
209
|
# <tt>edu.stanford.nlp.parser.lexparser.Options.setOptions</tt> for more
|
197
210
|
# details.
|
198
|
-
def initialize(grammar =
|
211
|
+
def initialize(grammar = ENGLISH_PCFG_MODEL, options = [])
|
199
212
|
@grammar = Pathname.new(grammar.gsub(/\$\(ROOT\)/, ROOT))
|
200
213
|
super("edu.stanford.nlp.parser.lexparser.LexicalizedParser", @grammar.to_s)
|
201
214
|
@java_object.setOptionFlags(options)
|
@@ -207,10 +220,9 @@ module StanfordParser
|
|
207
220
|
end # LexicalizedParser
|
208
221
|
|
209
222
|
|
210
|
-
#
|
211
|
-
#
|
212
|
-
#
|
213
|
-
# <tt>edu.stanford.nlp.trees.Tree</tt> objects.
|
223
|
+
# This is a wrapper for
|
224
|
+
# <tt>edu.stanford.nlp.trees.Tree</tt> objects. It customizes
|
225
|
+
# stringification.
|
214
226
|
class Tree < JavaObjectWrapper
|
215
227
|
def initialize(obj = "edu.stanford.nlp.trees.Tree")
|
216
228
|
super(obj)
|
@@ -230,6 +242,26 @@ module StanfordParser
|
|
230
242
|
end # Tree
|
231
243
|
|
232
244
|
|
245
|
+
# This is a wrapper for
|
246
|
+
# <tt>edu.stanford.nlp.ling.Word</tt> objects. It customizes
|
247
|
+
# stringification and adds an equivalence operator.
|
248
|
+
class Word < JavaObjectWrapper
|
249
|
+
def initialize(obj = "edu.stanford.nlp.ling.Word", *args)
|
250
|
+
super(obj, *args)
|
251
|
+
end
|
252
|
+
|
253
|
+
# Show the word value when inspecting.
|
254
|
+
def inspect
|
255
|
+
to_s
|
256
|
+
end
|
257
|
+
|
258
|
+
# Equivalence is defined relative to the word value.
|
259
|
+
def ==(other)
|
260
|
+
word == other
|
261
|
+
end
|
262
|
+
end # Word
|
263
|
+
|
264
|
+
|
233
265
|
# Tokenizes documents into words and sentences.
|
234
266
|
#
|
235
267
|
# This is a wrapper for the
|
data/test/test_stanfordparser.rb
CHANGED
@@ -101,3 +101,14 @@ class DocumentPreprocessorTestCase < Test::Unit::TestCase
|
|
101
101
|
assert_equal @preproc.map, []
|
102
102
|
end
|
103
103
|
end # DocumentPreprocessorTestCase
|
104
|
+
|
105
|
+
|
106
|
+
class MiscPreprocessorTestCase < Test::Unit::TestCase
|
107
|
+
def test_model_location
|
108
|
+
assert_equal "$(ROOT)/englishPCFG.ser.gz", StanfordParser::ENGLISH_PCFG_MODEL
|
109
|
+
end
|
110
|
+
|
111
|
+
def test_word
|
112
|
+
assert StanfordParser::Word.new("edu.stanford.nlp.ling.Word", "dog") == "dog"
|
113
|
+
end
|
114
|
+
end # MiscPreprocessorTestCase
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
3
3
|
specification_version: 1
|
4
4
|
name: stanfordparser
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.
|
7
|
-
date: 2007-
|
6
|
+
version: 1.2.0
|
7
|
+
date: 2007-12-18 00:00:00 -08:00
|
8
8
|
summary: Ruby wrapper for the Stanford Natural Language Parser
|
9
9
|
require_paths:
|
10
10
|
- lib
|