stanfordparser 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +12 -7
- data/lib/stanfordparser.rb +50 -18
- data/test/test_stanfordparser.rb +11 -0
- metadata +2 -2
data/README
CHANGED
@@ -4,17 +4,21 @@ This module is a wrapper for the {Stanford Natural Language Parser}[http://nlp.s
|
|
4
4
|
|
5
5
|
The Stanford Natural Language Parser is a Java implementation of a probabilistic context-free grammar (PCFG) parser and a dependency parser for English, German, Chinese, and Arabic. This module provides a thin wrapper around the Java code to make it accessible from Ruby.
|
6
6
|
|
7
|
-
= Installation
|
7
|
+
= Installation and Configuration
|
8
8
|
|
9
9
|
To run this module you must install the following additional software
|
10
10
|
|
11
11
|
* The {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml]
|
12
12
|
* The {Ruby Java Bridge}[http://rjb.rubyforge.org/] gem.
|
13
13
|
|
14
|
-
|
14
|
+
Note that the Stanford Parser is a Java application, not a Ruby gem, so it must be installed manually.
|
15
15
|
|
16
|
-
|
16
|
+
This module expects the parser to be installed in the <tt>/usr/local/stanford-parser/current</tt> directory. This is the directory that contains the <tt>stanford-parser.jar</tt> file. When the module is loaded, it adds this directory to the Java classpath and launches the Java VM with the arguments <tt>-server -Xmx150m</tt>.
|
17
|
+
|
18
|
+
These defaults can be overridden by creating the configuration file <tt>/etc/ruby-stanford-parser.yaml</tt>. This file is in the Ruby YAML[http://www.ruby-doc.org/core/classes/YAML.html] format, and may contain two values: <tt>root</tt> and <tt>jvmargs</tt>. For example, the file might look like the following:
|
17
19
|
|
20
|
+
root: /usr/local/stanford-parser/other/location
|
21
|
+
jvmargs: -Xmx100m -verbose
|
18
22
|
|
19
23
|
= Usage
|
20
24
|
|
@@ -36,9 +40,9 @@ Use the StanfordParser::LexicalizedParser class to parse sentences.
|
|
36
40
|
Use the StanfordParser::DocumentPreprocessor class to tokenize text and files into words or sentences.
|
37
41
|
|
38
42
|
irb(main):004:0> preproc = StanfordParser::DocumentPreprocessor.new
|
39
|
-
irb(main):008:0> puts preproc.getSentencesFromString("This is a sentence. So is this.")
|
40
|
-
This is a sentence .
|
41
|
-
So is this .
|
43
|
+
irb(main):008:0> puts preproc.getSentencesFromString("This is a sentence. So is this.")
|
44
|
+
This is a sentence .
|
45
|
+
So is this .
|
42
46
|
|
43
47
|
For complete details about the use of these classes, see the documentation on the Stanford Natural Language Parser website.
|
44
48
|
|
@@ -46,7 +50,8 @@ For complete details about the use of these classes, see the documentation on th
|
|
46
50
|
= History
|
47
51
|
|
48
52
|
1.0.0:: Initial release
|
49
|
-
1.1.0:: Make module initialization function private
|
53
|
+
1.1.0:: Make module initialization function private. Add example code.
|
54
|
+
1.2.0:: Read Java VM arguments from the configuration file. Add Word class.
|
50
55
|
|
51
56
|
|
52
57
|
= Copyright
|
data/lib/stanfordparser.rb
CHANGED
@@ -104,8 +104,9 @@ module Rjb
|
|
104
104
|
end # if
|
105
105
|
end # wrap_java_object
|
106
106
|
|
107
|
-
# By default, all RJB classes other than <tt>java.util.ArrayList</tt>
|
108
|
-
# in a generic wrapper. Derived classes may
|
107
|
+
# By default, all RJB classes other than <tt>java.util.ArrayList</tt> and
|
108
|
+
# <tt>java.util.HashSet</tt> go in a generic wrapper. Derived classes may
|
109
|
+
# change this behavior.
|
109
110
|
def wrap_rjb_object(object)
|
110
111
|
JavaObjectWrapper.new(object)
|
111
112
|
end
|
@@ -131,24 +132,36 @@ end # Rjb
|
|
131
132
|
# Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
132
133
|
module StanfordParser
|
133
134
|
|
134
|
-
VERSION = "1.
|
135
|
-
|
136
|
-
#
|
137
|
-
#
|
138
|
-
#
|
139
|
-
|
140
|
-
|
141
|
-
#
|
135
|
+
VERSION = "1.2.0"
|
136
|
+
|
137
|
+
# Path to an English PCFG model that comes with the Stanford Parser. The
|
138
|
+
# location is relative to the parser root directory. This is a valid value
|
139
|
+
# for the <em>grammar</em> parameter of the LexicalizedParser constructor.
|
140
|
+
ENGLISH_PCFG_MODEL = "$(ROOT)/englishPCFG.ser.gz"
|
141
|
+
|
142
|
+
# This function is executed once when the module is loaded. It initializes
|
143
|
+
# the Java virtual machine in which the Stanford parser will run. By
|
144
|
+
# default, it adds the parser installation root
|
145
|
+
# <tt>/usr/local/stanford-parser/current</tt> to the Java classpath and
|
146
|
+
# launches the VM with the arguments <tt>-server -Xmx150m</tt>. Different
|
147
|
+
# values may be specified with the <tt>/etc/ruby-stanford-parser.yaml</tt>
|
148
|
+
# configuration file.
|
149
|
+
#
|
150
|
+
# This function returns the path of the parser installation root.
|
142
151
|
def StanfordParser.initialize_on_load
|
143
152
|
root = Pathname.new("/usr/local/stanford-parser/current")
|
153
|
+
jvmargs = ["-server", "-Xmx150m"]
|
144
154
|
config = Pathname.new("/etc/ruby-stanford-parser.yaml")
|
145
155
|
if config.file?
|
146
156
|
configuration = open(config) {|f| YAML.load(f)}
|
147
|
-
if configuration.key?("root")
|
157
|
+
if configuration.key?("root") and not configuration["root"].nil?
|
148
158
|
root = Pathname.new(configuration["root"])
|
149
159
|
end
|
160
|
+
if configuration.key?("jvmargs") and not configuration["jvmargs"].nil?
|
161
|
+
jvmargs = configuration["jvmargs"].split
|
162
|
+
end
|
150
163
|
end
|
151
|
-
Rjb::load(classpath = (root + "stanford-parser.jar").to_s)
|
164
|
+
Rjb::load(classpath = (root + "stanford-parser.jar").to_s, jvmargs)
|
152
165
|
root
|
153
166
|
end
|
154
167
|
|
@@ -189,13 +202,13 @@ module StanfordParser
|
|
189
202
|
# Create the parser given a grammar and options. The <em>grammar</em>
|
190
203
|
# argument is a path to a grammar file. This path may contain the string
|
191
204
|
# <tt>$(ROOT)</tt>, which will be replaced with the root directory of the
|
192
|
-
# Stanford Parser. By default, an English grammar is loaded.
|
205
|
+
# Stanford Parser. By default, an English PCFG grammar is loaded.
|
193
206
|
#
|
194
207
|
# The <em>options</em> argument is a list of string arguments as they
|
195
208
|
# would appear on a command line. See the documentation of
|
196
209
|
# <tt>edu.stanford.nlp.parser.lexparser.Options.setOptions</tt> for more
|
197
210
|
# details.
|
198
|
-
def initialize(grammar =
|
211
|
+
def initialize(grammar = ENGLISH_PCFG_MODEL, options = [])
|
199
212
|
@grammar = Pathname.new(grammar.gsub(/\$\(ROOT\)/, ROOT))
|
200
213
|
super("edu.stanford.nlp.parser.lexparser.LexicalizedParser", @grammar.to_s)
|
201
214
|
@java_object.setOptionFlags(options)
|
@@ -207,10 +220,9 @@ module StanfordParser
|
|
207
220
|
end # LexicalizedParser
|
208
221
|
|
209
222
|
|
210
|
-
#
|
211
|
-
#
|
212
|
-
#
|
213
|
-
# <tt>edu.stanford.nlp.trees.Tree</tt> objects.
|
223
|
+
# This is a wrapper for
|
224
|
+
# <tt>edu.stanford.nlp.trees.Tree</tt> objects. It customizes
|
225
|
+
# stringification.
|
214
226
|
class Tree < JavaObjectWrapper
|
215
227
|
def initialize(obj = "edu.stanford.nlp.trees.Tree")
|
216
228
|
super(obj)
|
@@ -230,6 +242,26 @@ module StanfordParser
|
|
230
242
|
end # Tree
|
231
243
|
|
232
244
|
|
245
|
+
# This is a wrapper for
|
246
|
+
# <tt>edu.stanford.nlp.ling.Word</tt> objects. It customizes
|
247
|
+
# stringification and adds an equivalence operator.
|
248
|
+
class Word < JavaObjectWrapper
|
249
|
+
def initialize(obj = "edu.stanford.nlp.ling.Word", *args)
|
250
|
+
super(obj, *args)
|
251
|
+
end
|
252
|
+
|
253
|
+
# Inspecting a word yields the same text as its string form (to_s).
|
254
|
+
def inspect
|
255
|
+
to_s
|
256
|
+
end
|
257
|
+
|
258
|
+
# Equivalence is defined relative to the word value.
|
259
|
+
def ==(other)
|
260
|
+
word == other
|
261
|
+
end
|
262
|
+
end # Word
|
263
|
+
|
264
|
+
|
233
265
|
# Tokenizes documents into words and sentences.
|
234
266
|
#
|
235
267
|
# This is a wrapper for the
|
data/test/test_stanfordparser.rb
CHANGED
@@ -101,3 +101,14 @@ class DocumentPreprocessorTestCase < Test::Unit::TestCase
|
|
101
101
|
assert_equal @preproc.map, []
|
102
102
|
end
|
103
103
|
end # DocumentPreprocessorTestCase
|
104
|
+
|
105
|
+
|
106
|
+
class MiscPreprocessorTestCase < Test::Unit::TestCase
|
107
|
+
def test_model_location
|
108
|
+
assert_equal "$(ROOT)/englishPCFG.ser.gz", StanfordParser::ENGLISH_PCFG_MODEL
|
109
|
+
end
|
110
|
+
|
111
|
+
def test_word
|
112
|
+
assert StanfordParser::Word.new("edu.stanford.nlp.ling.Word", "dog") == "dog"
|
113
|
+
end
|
114
|
+
end # MiscPreprocessorTestCase
|
metadata
CHANGED
@@ -3,8 +3,8 @@ rubygems_version: 0.9.2
|
|
3
3
|
specification_version: 1
|
4
4
|
name: stanfordparser
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.
|
7
|
-
date: 2007-
|
6
|
+
version: 1.2.0
|
7
|
+
date: 2007-12-18 00:00:00 -08:00
|
8
8
|
summary: Ruby wrapper for the Stanford Natural Language Parser
|
9
9
|
require_paths:
|
10
10
|
- lib
|