stanfordparser-infochimps 2.2.1.s
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +5 -0
- data/Gemfile +6 -0
- data/LICENSE +18 -0
- data/README.orig.rdoc +123 -0
- data/README.rdoc +37 -0
- data/Rakefile +60 -0
- data/TESTS_STATUS.rdoc +2 -0
- data/VERSION.yml +5 -0
- data/examples/stanford-sentence-parser.rb +46 -0
- data/lib/stanfordparser.rb +453 -0
- data/lib/stanfordparser/java_object.rb +129 -0
- data/stanfordparser.gemspec +69 -0
- data/test/test_stanfordparser.rb +224 -0
- metadata +207 -0
data/.document
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
Copyright 2007-2008 William Patrick McNeill
|
2
|
+
Copyright (c) 2010 John Wilkinson
|
3
|
+
|
4
|
+
This file is part of the Stanford Parser Ruby Wrapper.
|
5
|
+
|
6
|
+
The Stanford Parser Ruby Wrapper is free software; you can redistribute it
|
7
|
+
and/or modify it under the terms of the GNU General Public License as
|
8
|
+
published by the Free Software Foundation; either version 2 of the License,
|
9
|
+
or (at your option) any later version.
|
10
|
+
|
11
|
+
The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
|
12
|
+
useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
14
|
+
Public License for more details.
|
15
|
+
|
16
|
+
You should have received a copy of the GNU General Public License along with
|
17
|
+
editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
|
18
|
+
St, Fifth Floor, Boston, MA 02110-1301 USA
|
data/README.orig.rdoc
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
= Stanford Natural Language Parser Wrapper
|
2
|
+
|
3
|
+
This module is a wrapper for the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
4
|
+
|
5
|
+
The Stanford Natural Language Parser is a Java implementation of a probabilistic PCFG and dependency parser for English, German, Chinese, and Arabic. This module provides a thin wrapper around the Java code to make it accessible from Ruby along with pure Ruby objects that enable standoff parsing.
|
6
|
+
|
7
|
+
|
8
|
+
= Installation and Configuration
|
9
|
+
|
10
|
+
In addition to the Ruby gems it requires, to run this module you must manually install the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
11
|
+
|
12
|
+
This module expects the parser to be installed in the <tt>/usr/local/stanford-parser/current</tt> directory on UNIX platforms and in the <tt>C:\stanford-parser\current</tt> directory on Windows platforms. This is the directory that contains the <tt>stanford-parser.jar</tt> file. When the module is loaded, it adds this directory to the Java classpath and launches the Java VM with the arguments <tt>-server -Xmx150m</tt>.
|
13
|
+
|
14
|
+
These defaults can be overridden by creating the configuration file <tt>/etc/ruby-stanford-parser.yaml</tt> on UNIX platforms and <tt>C:\stanford-parser\ruby-stanford-parser.yaml</tt> on Windows platforms. This file is in the Ruby YAML[http://ruby-doc.org/stdlib/libdoc/yaml/rdoc/index.html] format, and may contain two values: <tt>root</tt> and <tt>jvmargs</tt>. For example, the file might look like the following:
|
15
|
+
|
16
|
+
root: /usr/local/stanford-parser/other/location
|
17
|
+
jvmargs: -Xmx100m -verbose
|
18
|
+
|
19
|
+
|
20
|
+
=Tokenization and Parsing
|
21
|
+
|
22
|
+
Use the StanfordParser::DocumentPreprocessor class to tokenize text and files into sentences and words.
|
23
|
+
|
24
|
+
>> require "stanfordparser"
|
25
|
+
=> true
|
26
|
+
>> preproc = StanfordParser::DocumentPreprocessor.new
|
27
|
+
=> <DocumentPreprocessor>
|
28
|
+
>> puts preproc.getSentencesFromString("This is a sentence. So is this.")
|
29
|
+
This is a sentence .
|
30
|
+
So is this .
|
31
|
+
|
32
|
+
Use the StanfordParser::LexicalizedParser class to parse sentences.
|
33
|
+
|
34
|
+
>> parser = StanfordParser::LexicalizedParser.new
|
35
|
+
Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [5.5 sec].
|
36
|
+
=> edu.stanford.nlp.parser.lexparser.LexicalizedParser
|
37
|
+
>> puts parser.apply("This is a sentence.")
|
38
|
+
(ROOT
|
39
|
+
(S [24.917]
|
40
|
+
(NP [6.139] (DT [2.300] This))
|
41
|
+
(VP [17.636] (VBZ [0.144] is)
|
42
|
+
(NP [12.299] (DT [1.419] a) (NN [8.897] sentence)))
|
43
|
+
(. [0.002] .)))
|
44
|
+
|
45
|
+
For complete details about the use of these classes, see the documentation on the Stanford Natural Language Parser website.
|
46
|
+
|
47
|
+
|
48
|
+
=Standoff Tokenization and Parsing
|
49
|
+
|
50
|
+
This module also contains support for standoff tokenization and parsing, in which the terminal nodes of parse trees contain information about the text that was used to generate them.
|
51
|
+
|
52
|
+
Use the StanfordParser::StandoffDocumentPreprocessor class to tokenize text and files into sentences and words.
|
53
|
+
|
54
|
+
>> preproc = StanfordParser::StandoffDocumentPreprocessor.new
|
55
|
+
=> <StandoffDocumentPreprocessor>
|
56
|
+
>> s = preproc.getSentencesFromString("This is a sentence. So is this.")
|
57
|
+
=> [This is a sentence., So is this.]
|
58
|
+
|
59
|
+
The standoff preprocessor returns StanfordParser::StandoffToken objects, which contain character offsets into the original text along with information about spacing characters that came before and after the token.
|
60
|
+
|
61
|
+
>> puts s
|
62
|
+
This [0,4]
|
63
|
+
is [5,7]
|
64
|
+
a [8,9]
|
65
|
+
sentence [10,18]
|
66
|
+
. [18,19]
|
67
|
+
So [21,23]
|
68
|
+
is [24,26]
|
69
|
+
this [27,31]
|
70
|
+
. [31,32]
|
71
|
+
>> "This is a sentence. So is this."[27..31]
|
72
|
+
=> "this."
|
73
|
+
|
74
|
+
This is the same information contained in the <tt>edu.stanford.nlp.ling.FeatureLabel</tt> class in the Stanford Parser Java implementation.
|
75
|
+
|
76
|
+
Similarly, use the StanfordParser::StandoffParsedText object to parse a block of text into StanfordParser::StandoffNode parse trees whose terminal nodes are StanfordParser::StandoffToken objects.
|
77
|
+
|
78
|
+
>> t = StanfordParser::StandoffParsedText.new("This is a sentence. So is this.")
|
79
|
+
Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [4.9 sec].
|
80
|
+
=> <StanfordParser::StandoffParsedText, 2 sentences>
|
81
|
+
>> puts t.first
|
82
|
+
(ROOT
|
83
|
+
(S
|
84
|
+
(NP (DT This [0,4]))
|
85
|
+
(VP (VBZ is [5,7])
|
86
|
+
(NP (DT a [8,9]) (NN sentence [10,18])))
|
87
|
+
(. . [18,19])))
|
88
|
+
|
89
|
+
Standoff parse trees can reproduce the text from which they were generated verbatim.
|
90
|
+
|
91
|
+
>> t.first.to_original_string
|
92
|
+
=> "This is a sentence. "
|
93
|
+
|
94
|
+
They can also reproduce the original text with brackets inserted around the yields of specified parse nodes.
|
95
|
+
|
96
|
+
>> t.first.to_bracketed_string([[0,0,0], [0,1,1]])
|
97
|
+
=> "[This] is [a sentence]. "
|
98
|
+
|
99
|
+
The format of the coordinates used to specify individual nodes is described in the documentation for the Ruby Treebank[http://rubyforge.org/projects/treebank/] gem.
|
100
|
+
|
101
|
+
See the documentation of the individual classes in this module for more details.
|
102
|
+
|
103
|
+
Unlike their parents StanfordParser::DocumentPreprocessor and StanfordParser::LexicalizedParser, which produce Ruby wrappers around Java objects, StanfordParser::StandoffDocumentPreprocessor and StanfordParser::StandoffParsedText produce pure Ruby objects. This is to facilitate serialization of these objects using tools like the Marshal module, which cannot serialize Java objects.
|
104
|
+
|
105
|
+
= History
|
106
|
+
|
107
|
+
1.0.0:: Initial release
|
108
|
+
1.1.0:: Make module initialization function private. Add example code.
|
109
|
+
1.2.0:: Read Java VM arguments from the configuration file. Add Word class.
|
110
|
+
2.0.0:: Add support for standoff parsing. Change the way Rjb::JavaObjectWrapper wraps returned values: see wrap_java_object for details. Rjb::JavaObjectWrapper supports static members. Minor changes to stanford-sentence-parser script.
|
111
|
+
2.1.0:: Different default paths for Windows machines; Minor changes to StandoffToken definition
|
112
|
+
2.2.0:: Add parent information to StandoffNode
|
113
|
+
|
114
|
+
= Copyright
|
115
|
+
|
116
|
+
Copyright 2007-2008, William Patrick McNeill
|
117
|
+
|
118
|
+
This program is distributed under the GNU General Public License.
|
119
|
+
|
120
|
+
|
121
|
+
= Author
|
122
|
+
|
123
|
+
W.P. McNeill mailto:billmcn@gmail.com
|
data/README.rdoc
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
= stanfordparser
|
2
|
+
|
3
|
+
This is an upload/extension of Bill McNeill's stanfordparser rubyforge gem. Check it out at its homepage (which seems to be partially in French)
|
4
|
+
|
5
|
+
http://rubyforge.org/projects/stanfordparser/
|
6
|
+
|
7
|
+
or its rdocs
|
8
|
+
|
9
|
+
http://stanfordparser.rubyforge.org/
|
10
|
+
|
11
|
+
I've been having issues trying to use this gem so I decided to upload it to github and try to organize it to be a little more gem-friendly, especially using jeweler.
|
12
|
+
|
13
|
+
AFAIK there aren't other copies of this on github, please correct me if I'm mistaken. The only similar one I can see is http://github.com/tiendung/ruby-nlp which has much less code and I can only assume to be something else.
|
14
|
+
|
15
|
+
It seems like using version 1.6.1 of the java StanfordParser package is your best bet for compatibility.
|
16
|
+
|
17
|
+
See README.orig.rdoc for Bill's readme, which includes dependencies, installation, and usage.
|
18
|
+
|
19
|
+
== Branches
|
20
|
+
|
21
|
+
* master - Jeweler and Bundler integrated along with slight reorganization of files to be more gem-standard. This is the branch you should use if you want to source the gem straight from github. I will leave this branch alone for the most part unless I find/come up with stable and useful additions. All changes will be backwards compatible.
|
22
|
+
* stock - Almost untouched from Bill's version, except for the README. Use this branch if that's what you're looking for.
|
23
|
+
* fixing_tests - The tests are currently broken; this branch is trying to address that. Once the tests are fixed it will be merged back into master. Help appreciated! I'll keep a TESTS_STATUS.rdoc keeping track of progress.
|
24
|
+
* experimental - I'll be putting in some code as examples and testing out some ideas. Do not use this branch as a gem. You are very encouraged, however, to fork it and add some code/make my code better. I'll try to integrate all the pull requests I get, if not in that branch into another.
|
25
|
+
|
26
|
+
== Note on Patches/Pull Requests
|
27
|
+
|
28
|
+
* Fork the project.
|
29
|
+
* Make your feature addition or bug fix.
|
30
|
+
* Add tests for it. I would prefer rSpec, but TestUnit is acceptable as well since there are some of those from the original author.
|
31
|
+
* Commit.
|
32
|
+
* Send me a pull request. Bonus points for topic branches.
|
33
|
+
|
34
|
+
== Copyright
|
35
|
+
|
36
|
+
Copyright (c) 2010 John Wilkinson. See LICENSE for details.
|
37
|
+
Copyright 2007-2008, William Patrick McNeill. See README.orig for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
$LOAD_PATH.unshift('lib')
|
5
|
+
|
6
|
+
begin
|
7
|
+
require 'jeweler'
|
8
|
+
Jeweler::Tasks.new do |gem|
|
9
|
+
gem.name = "stanfordparser-infochimps"
|
10
|
+
gem.summary = "GitHub upload/extension of Bill McNeal's stanfordparser rubygem"
|
11
|
+
gem.description = "Ruby wrapper of the Stanford Parser, a NLP parser built in Java."
|
12
|
+
gem.email = "jcwilk@gmail.com"
|
13
|
+
gem.homepage = "http://github.com/jcwilk/stanfordparser"
|
14
|
+
gem.authors = ["John Wilkinson","Bill McNeal"]
|
15
|
+
|
16
|
+
gem.add_dependency "rjb", ">= 1.2.5"
|
17
|
+
gem.add_dependency "treebank", ">= 3.0.0"
|
18
|
+
gem.add_development_dependency "rspec", ">= 1.2.9"
|
19
|
+
end
|
20
|
+
Jeweler::GemcutterTasks.new
|
21
|
+
rescue LoadError
|
22
|
+
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
23
|
+
end
|
24
|
+
|
25
|
+
require 'rake/testtask'
|
26
|
+
Rake::TestTask.new(:test) do |test|
|
27
|
+
test.test_files = FileList.new('test/**/test_*.rb') do |list|
|
28
|
+
list.exclude 'test/test_helper.rb'
|
29
|
+
end
|
30
|
+
test.libs << 'test'
|
31
|
+
test.verbose = true
|
32
|
+
end
|
33
|
+
|
34
|
+
# require 'spec/rake/spectask'
|
35
|
+
# Spec::Rake::SpecTask.new(:spec) do |spec|
|
36
|
+
# spec.libs << 'lib' << 'spec'
|
37
|
+
# spec.spec_files = FileList['spec/**/*_spec.rb']
|
38
|
+
# end
|
39
|
+
#
|
40
|
+
# Spec::Rake::SpecTask.new(:rcov) do |spec|
|
41
|
+
# spec.libs << 'lib' << 'spec'
|
42
|
+
# spec.pattern = 'spec/**/*_spec.rb'
|
43
|
+
# spec.rcov = true
|
44
|
+
# end
|
45
|
+
#
|
46
|
+
# task :test => :check_dependencies
|
47
|
+
#
|
48
|
+
# task :spec => :check_dependencies
|
49
|
+
#
|
50
|
+
# task :default => :test
|
51
|
+
|
52
|
+
require 'rake/rdoctask'
|
53
|
+
Rake::RDocTask.new do |rdoc|
|
54
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
55
|
+
|
56
|
+
rdoc.rdoc_dir = 'rdoc'
|
57
|
+
rdoc.title = "stanfordparser #{version}"
|
58
|
+
rdoc.rdoc_files.include('README*')
|
59
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
60
|
+
end
|
data/TESTS_STATUS.rdoc
ADDED
data/VERSION.yml
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
|
5
|
+
# Copyright 2007-2008 William Patrick McNeill
|
6
|
+
#
|
7
|
+
# This file is part of the Stanford Parser Ruby Wrapper.
|
8
|
+
#
|
9
|
+
# The Stanford Parser Ruby Wrapper is free software; you can redistribute it
|
10
|
+
# and/or modify it under the terms of the GNU General Public License as
|
11
|
+
# published by the Free Software Foundation; either version 2 of the License,
|
12
|
+
# or (at your option) any later version.
|
13
|
+
#
|
14
|
+
# The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
|
15
|
+
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
17
|
+
# Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License along with
|
20
|
+
# editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
|
21
|
+
# St, Fifth Floor, Boston, MA 02110-1301 USA
|
22
|
+
#
|
23
|
+
#++
|
24
|
+
|
25
|
+
# == Synopsis
|
26
|
+
#
|
27
|
+
# Parse a sentence passed in on the command line.
|
28
|
+
#
|
29
|
+
# == Usage
|
30
|
+
#
|
31
|
+
# stanford-sentence-parser.rb [options] sentence
|
32
|
+
#
|
33
|
+
# options::
|
34
|
+
# See the Java Stanford Parser documentation for details
|
35
|
+
#
|
36
|
+
# sentence::
|
37
|
+
# A sentence to parse. This must appear after all the options and be quoted.
|
38
|
+
|
39
|
+
require 'rubygems'
|
40
|
+
require "stanfordparser"
|
41
|
+
|
42
|
+
# The last argument is the sentence. The rest of the command line is passed
|
43
|
+
# along to the parser object.
|
44
|
+
sentence = ARGV.pop
|
45
|
+
parser = StanfordParser::LexicalizedParser.new(StanfordParser::ENGLISH_PCFG_MODEL, ARGV)
|
46
|
+
puts parser.apply(sentence)
|
@@ -0,0 +1,453 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
require "pathname"
|
4
|
+
require "rjb"
|
5
|
+
require "singleton"
|
6
|
+
begin
|
7
|
+
require "treebank"
|
8
|
+
gem "treebank", ">= 3.0.0"
|
9
|
+
rescue LoadError
|
10
|
+
require "treebank"
|
11
|
+
end
|
12
|
+
require "yaml"
|
13
|
+
|
14
|
+
# Wrapper for the {Stanford Natural Language
|
15
|
+
# Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
16
|
+
module StanfordParser
|
17
|
+
|
18
|
+
require "stanfordparser/java_object"
|
19
|
+
|
20
|
+
VERSION = "2.2.1"
|
21
|
+
|
22
|
+
# The default sentence segmenter and tokenizer. This is an English-language
|
23
|
+
# tokenizer with support for Penn Treebank markup.
|
24
|
+
EN_PENN_TREEBANK_TOKENIZER = "edu.stanford.nlp.process.PTBTokenizer"
|
25
|
+
|
26
|
+
# Path to an English PCFG model that comes with the Stanford Parser. The
|
27
|
+
# location is relative to the parser root directory. This is a valid value
|
28
|
+
# for the <em>grammar</em> parameter of the LexicalizedParser constructor.
|
29
|
+
ENGLISH_PCFG_MODEL = "$(ROOT)/englishPCFG.ser.gz"
|
30
|
+
|
31
|
+
# This function is executed once when the module is loaded. It initializes
|
32
|
+
# the Java virtual machine in which the Stanford parser will run. By
|
33
|
+
# default, it adds the parser installation root to the Java classpath and
|
34
|
+
# launches the VM with the arguments <tt>-server -Xmx150m</tt>. Different
|
35
|
+
# values may be specified with the <tt>ruby-stanford-parser.yaml</tt>
|
36
|
+
# configuration file.
|
37
|
+
#
|
38
|
+
# This function determines which operating system we are running on and sets
|
39
|
+
# default pathnames accordingly:
|
40
|
+
#
|
41
|
+
# UNIX:: /usr/local/stanford-parser/current, /etc/ruby-stanford-parser.yaml
|
42
|
+
# Windows:: C:\stanford-parser\current,
|
43
|
+
# C:\stanford-parser\ruby-stanford-parser.yaml
|
44
|
+
#
|
45
|
+
# This function returns the path of the parser installation root.
|
46
|
+
def StanfordParser.initialize_on_load
|
47
|
+
if RUBY_PLATFORM =~ /(win|w)32$/
|
48
|
+
root = Pathname.new("C:\\stanford-parser\\current") # default Windows install root; no trailing space, so root + "stanford-parser.jar" stays inside this directory
|
49
|
+
config = Pathname.new("C:\\stanford-parser\\ruby-stanford-parser.yaml")
|
50
|
+
else
|
51
|
+
root = Pathname.new("/usr/local/stanford-parser/current")
|
52
|
+
config = Pathname.new("/etc/ruby-stanford-parser.yaml")
|
53
|
+
end
|
54
|
+
jvmargs = ["-server", "-Xmx150m"]
|
55
|
+
if config.file?
|
56
|
+
configuration = open(config) {|f| YAML.load(f)}
|
57
|
+
if configuration.key?("root") and not configuration["root"].nil?
|
58
|
+
root = Pathname.new(configuration["root"])
|
59
|
+
end
|
60
|
+
if configuration.key?("jvmargs") and not configuration["jvmargs"].nil?
|
61
|
+
jvmargs = configuration["jvmargs"].split
|
62
|
+
end
|
63
|
+
end
|
64
|
+
Rjb::load(classpath = (root + "stanford-parser.jar").to_s, jvmargs)
|
65
|
+
root
|
66
|
+
end
|
67
|
+
|
68
|
+
private_class_method :initialize_on_load
|
69
|
+
|
70
|
+
# The root directory of the Stanford parser installation.
|
71
|
+
ROOT = initialize_on_load
|
72
|
+
|
73
|
+
#--
|
74
|
+
# The documentation below is for the original Rjb::JavaObjectWrapper object.
|
75
|
+
# It is reproduced here because rdoc only takes the last document block
|
76
|
+
# defined. If Rjb is moved into its own gem, this documentation should go
|
77
|
+
# with it, and the following should be written as documentation for this
|
78
|
+
# class:
|
79
|
+
#
|
80
|
+
# Extension of the generic Ruby-Java Bridge wrapper object for the
|
81
|
+
# StanfordParser module.
|
82
|
+
#++
|
83
|
+
# A generic wrapper for a Java object loaded via the {Ruby-Java
|
84
|
+
# Bridge}[http://rjb.rubyforge.org/]. The wrapper class handles
|
85
|
+
# initialization and stringification, and passes other method calls down to
|
86
|
+
# the underlying Java object. Objects returned by the underlying Java
|
87
|
+
# object are converted to the appropriate Ruby object.
|
88
|
+
#
|
89
|
+
# Other modules may extend the list of Java objects that are converted by
|
90
|
+
# adding their own converter functions. See wrap_java_object for details.
|
91
|
+
#
|
92
|
+
# This object is enumerable, yielding items in the order defined by the
|
93
|
+
# underlying Java object's iterator.
|
94
|
+
class Rjb::JavaObjectWrapper
|
95
|
+
# FeatureLabel objects go inside a FeatureLabel wrapper.
|
96
|
+
def wrap_edu_stanford_nlp_ling_FeatureLabel(object)
|
97
|
+
StanfordParser::FeatureLabel.new(object)
|
98
|
+
end
|
99
|
+
|
100
|
+
# Tree objects go inside a Tree wrapper. Various tree types are aliased
|
101
|
+
# to this function.
|
102
|
+
def wrap_edu_stanford_nlp_trees_Tree(object)
|
103
|
+
Tree.new(object)
|
104
|
+
end
|
105
|
+
|
106
|
+
alias :wrap_edu_stanford_nlp_trees_LabeledScoredTreeLeaf :wrap_edu_stanford_nlp_trees_Tree
|
107
|
+
alias :wrap_edu_stanford_nlp_trees_LabeledScoredTreeNode :wrap_edu_stanford_nlp_trees_Tree
|
108
|
+
alias :wrap_edu_stanford_nlp_trees_SimpleTree :wrap_edu_stanford_nlp_trees_Tree
|
109
|
+
alias :wrap_edu_stanford_nlp_trees_TreeGraphNode :wrap_edu_stanford_nlp_trees_Tree
|
110
|
+
|
111
|
+
protected :wrap_edu_stanford_nlp_trees_Tree, :wrap_edu_stanford_nlp_ling_FeatureLabel
|
112
|
+
end # Rjb::JavaObjectWrapper
|
113
|
+
|
114
|
+
|
115
|
+
# Lexicalized probabilistic parser.
|
116
|
+
#
|
117
|
+
# This is a wrapper for the
|
118
|
+
# <tt>edu.stanford.nlp.parser.lexparser.LexicalizedParser</tt> object.
|
119
|
+
class LexicalizedParser < Rjb::JavaObjectWrapper
|
120
|
+
# The grammar used by the parser
|
121
|
+
attr_reader :grammar
|
122
|
+
|
123
|
+
# Create the parser given a grammar and options. The <em>grammar</em>
|
124
|
+
# argument is a path to a grammar file. This path may contain the string
|
125
|
+
# <tt>$(ROOT)</tt>, which will be replaced with the root directory of the
|
126
|
+
# Stanford Parser. By default, an English PCFG grammar is loaded.
|
127
|
+
#
|
128
|
+
# The <em>options</em> argument is a list of string arguments as they
|
129
|
+
# would appear on a command line. See the documentation of
|
130
|
+
# <tt>edu.stanford.nlp.parser.lexparser.Options.setOptions</tt> for more
|
131
|
+
# details.
|
132
|
+
def initialize(grammar = ENGLISH_PCFG_MODEL, options = [])
|
133
|
+
@grammar = Pathname.new(grammar.gsub(/\$\(ROOT\)/) { ROOT.to_s }) # ROOT is a Pathname; the block form substitutes its string verbatim (no backslash escapes interpreted)
|
134
|
+
super("edu.stanford.nlp.parser.lexparser.LexicalizedParser", @grammar.to_s)
|
135
|
+
@java_object.setOptionFlags(options)
|
136
|
+
end
|
137
|
+
|
138
|
+
def to_s
|
139
|
+
"LexicalizedParser(#{grammar.basename})"
|
140
|
+
end
|
141
|
+
end # LexicalizedParser
|
142
|
+
|
143
|
+
|
144
|
+
# A singleton instance of the default Stanford Natural Language parser. A
|
145
|
+
# singleton is used because the parser can take a few seconds to load.
|
146
|
+
class DefaultParser < StanfordParser::LexicalizedParser
|
147
|
+
include Singleton
|
148
|
+
end
|
149
|
+
|
150
|
+
|
151
|
+
# This is a wrapper for
|
152
|
+
# <tt>edu.stanford.nlp.trees.Tree</tt> objects. It customizes
|
153
|
+
# stringification.
|
154
|
+
class Tree < Rjb::JavaObjectWrapper
|
155
|
+
def initialize(obj = "edu.stanford.nlp.trees.Tree")
|
156
|
+
super(obj)
|
157
|
+
end
|
158
|
+
|
159
|
+
# Return the label along with the score if there is one.
|
160
|
+
def inspect
|
161
|
+
s = "#{label}" + (score.nan? ? "" : " [#{sprintf '%.2f', score}]")
|
162
|
+
"(#{s})"
|
163
|
+
end
|
164
|
+
|
165
|
+
# The Penn treebank representation. This prints with indenting instead of
|
166
|
+
# putting everything on one line.
|
167
|
+
def to_s
|
168
|
+
"#{pennString}"
|
169
|
+
end
|
170
|
+
end # Tree
|
171
|
+
|
172
|
+
|
173
|
+
# This is a wrapper for
|
174
|
+
# <tt>edu.stanford.nlp.ling.Word</tt> objects. It customizes
|
175
|
+
# stringification and adds an equivalence operator.
|
176
|
+
class Word < Rjb::JavaObjectWrapper
|
177
|
+
def initialize(obj = "edu.stanford.nlp.ling.Word", *args)
|
178
|
+
super(obj, *args)
|
179
|
+
end
|
180
|
+
|
181
|
+
# See the word values.
|
182
|
+
def inspect
|
183
|
+
to_s
|
184
|
+
end
|
185
|
+
|
186
|
+
# Equivalence is defined relative to the word value.
|
187
|
+
def ==(other)
|
188
|
+
word == other
|
189
|
+
end
|
190
|
+
end # Word
|
191
|
+
|
192
|
+
|
193
|
+
# This is a wrapper for <tt>edu.stanford.nlp.ling.FeatureLabel</tt> objects.
|
194
|
+
# It customizes stringification.
|
195
|
+
class FeatureLabel < Rjb::JavaObjectWrapper
|
196
|
+
def initialize(obj = "edu.stanford.nlp.ling.FeatureLabel")
|
197
|
+
super
|
198
|
+
end
|
199
|
+
|
200
|
+
# Stringify with just the token and its begin and end position.
|
201
|
+
def to_s
|
202
|
+
# BUGBUG The position values come back as java.lang.Integer though I
|
203
|
+
# would expect Rjb to convert them to Ruby integers.
|
204
|
+
begin_position = get(self.BEGIN_POSITION_KEY)
|
205
|
+
end_position = get(self.END_POSITION_KEY)
|
206
|
+
"#{current} [#{begin_position},#{end_position}]"
|
207
|
+
end
|
208
|
+
|
209
|
+
# More verbose stringification with all the fields and their values.
|
210
|
+
def inspect
|
211
|
+
toString
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
|
216
|
+
# Tokenizes documents into words and sentences.
|
217
|
+
#
|
218
|
+
# This is a wrapper for the
|
219
|
+
# <tt>edu.stanford.nlp.process.DocumentPreprocessor</tt> object.
|
220
|
+
class DocumentPreprocessor < Rjb::JavaObjectWrapper
|
221
|
+
def initialize(suppressEscaping = false)
|
222
|
+
super("edu.stanford.nlp.process.DocumentPreprocessor", suppressEscaping)
|
223
|
+
end
|
224
|
+
|
225
|
+
# Returns a list of sentences in a string.
|
226
|
+
def getSentencesFromString(s)
|
227
|
+
s = Rjb::JavaObjectWrapper.new("java.io.StringReader", s)
|
228
|
+
_invoke(:getSentencesFromText, "Ljava.io.Reader;", s.java_object)
|
229
|
+
end
|
230
|
+
|
231
|
+
def inspect
|
232
|
+
"<#{self.class.to_s.split('::').last}>"
|
233
|
+
end
|
234
|
+
|
235
|
+
def to_s
|
236
|
+
inspect
|
237
|
+
end
|
238
|
+
end # DocumentPreprocessor
|
239
|
+
|
240
|
+
# A text token that contains raw and normalized token identity (e.g. "(" and
|
241
|
+
# "-LRB-"), an offset span, and the characters immediately preceding and
|
242
|
+
# following the token. Given a list of these objects it is possible to
|
243
|
+
# recreate the text from which they came verbatim.
|
244
|
+
class StandoffToken < Struct.new(:current, :word, :before, :after,
|
245
|
+
:begin_position, :end_position)
|
246
|
+
def to_s
|
247
|
+
"#{current} [#{begin_position},#{end_position}]"
|
248
|
+
end
|
249
|
+
end
|
250
|
+
|
251
|
+
|
252
|
+
# A preprocessor that segments text into sentences and tokens that contain
|
253
|
+
# character offset and token context information that can be used for
|
254
|
+
# standoff annotation.
|
255
|
+
class StandoffDocumentPreprocessor < DocumentPreprocessor
|
256
|
+
def initialize(tokenizer = EN_PENN_TREEBANK_TOKENIZER)
|
257
|
+
# PTBTokenizer.factory is a static function, so use RJB to call it
|
258
|
+
# directly instead of going through a JavaObjectWrapper. We do it this
|
259
|
+
# way because the Stanford parser Java code does not provide a
|
260
|
+
# constructor that allows you to specify the second parameter,
|
261
|
+
# invertible, to true, and we need this to write character offset
|
262
|
+
# information into the tokens.
|
263
|
+
ptb_tokenizer_class = Rjb::import(tokenizer)
|
264
|
+
# See the documentation for
|
265
|
+
# <tt>edu.stanford.nlp.process.DocumentPreprocessor</tt> for a
|
266
|
+
# description of these parameters.
|
267
|
+
ptb_tokenizer_factory = ptb_tokenizer_class.factory(false, true, false)
|
268
|
+
super(ptb_tokenizer_factory)
|
269
|
+
end
|
270
|
+
|
271
|
+
# Returns a list of sentences in a string. This wraps the returned
|
272
|
+
# sentences in a StandoffSentence object.
|
273
|
+
def getSentencesFromString(s)
|
274
|
+
super(s).map!{|s| StandoffSentence.new(s)}
|
275
|
+
end
|
276
|
+
end
|
277
|
+
|
278
|
+
|
279
|
+
# A sentence is an array of StandoffToken objects.
|
280
|
+
class StandoffSentence < Array
|
281
|
+
# Construct an array of StandoffToken objects from a Java list sentence
|
282
|
+
# object returned by the preprocessor.
|
283
|
+
def initialize(stanford_parser_sentence)
|
284
|
+
# Convert FeatureLabel wrappers to StandoffToken objects.
|
285
|
+
s = stanford_parser_sentence.to_a.collect do |fs|
|
286
|
+
current = fs.current
|
287
|
+
word = fs.word
|
288
|
+
before = fs.before
|
289
|
+
after = fs.after
|
290
|
+
# The to_s.to_i is necessary because the get function returns
|
291
|
+
# java.lang.Integer objects instead of Ruby integers.
|
292
|
+
begin_position = fs.get(fs.BEGIN_POSITION_KEY).to_s.to_i
|
293
|
+
end_position = fs.get(fs.END_POSITION_KEY).to_s.to_i
|
294
|
+
StandoffToken.new(current, word, before, after,
|
295
|
+
begin_position, end_position)
|
296
|
+
end
|
297
|
+
super(s)
|
298
|
+
end
|
299
|
+
|
300
|
+
# Return the original string verbatim.
|
301
|
+
def to_s
|
302
|
+
self[0..-2].inject(""){|s, word| s + word.current + word.after} + last.current
|
303
|
+
end
|
304
|
+
|
305
|
+
# Return the original string verbatim.
|
306
|
+
def inspect
|
307
|
+
to_s
|
308
|
+
end
|
309
|
+
end
|
310
|
+
|
311
|
+
|
312
|
+
# Standoff syntactic annotation of natural language text which may contain
|
313
|
+
# multiple sentences.
|
314
|
+
#
|
315
|
+
# This is an Array of StandoffNode objects, one for each sentence in the
|
316
|
+
# text.
|
317
|
+
class StandoffParsedText < Array
|
318
|
+
# Parse the text and create the standoff annotation.
|
319
|
+
#
|
320
|
+
# The default parser is a singleton instance of the English language
|
321
|
+
# Stanford Natural Language parser. There may be a delay of a few
|
322
|
+
# seconds for it to load the first time it is created.
|
323
|
+
def initialize(text, nodetype = StandoffNode,
|
324
|
+
tokenizer = EN_PENN_TREEBANK_TOKENIZER,
|
325
|
+
parser = DefaultParser.instance)
|
326
|
+
preprocessor = StandoffDocumentPreprocessor.new(tokenizer)
|
327
|
+
# Segment the text into sentences. Parse each sentence, writing
|
328
|
+
# standoff annotation information into the terminal nodes.
|
329
|
+
preprocessor.getSentencesFromString(text).map do |sentence|
|
330
|
+
parse = parser.apply(sentence.to_s)
|
331
|
+
push(nodetype.new(parse, sentence))
|
332
|
+
end
|
333
|
+
end
|
334
|
+
|
335
|
+
# Print class name and number of sentences.
|
336
|
+
def inspect
|
337
|
+
"<#{self.class.name}, #{length} sentences>"
|
338
|
+
end
|
339
|
+
|
340
|
+
# Print parses.
|
341
|
+
def to_s
|
342
|
+
flatten.join(" ")
|
343
|
+
end
|
344
|
+
end
|
345
|
+
|
346
|
+
|
347
|
+
# Standoff syntactic tree annotation of text. Terminal nodes are labeled
|
348
|
+
# with the appropriate StandoffToken objects. Standoff parses can reproduce
|
349
|
+
# the original string from which they were generated verbatim, optionally
|
350
|
+
# with brackets around the yields of specified non-terminal nodes.
|
351
|
+
class StandoffNode < Treebank::ParentedNode
|
352
|
+
# Create the standoff tree from a tree returned by the Stanford parser.
|
353
|
+
# For non-terminal nodes, the <em>tokens</em> argument will be a
|
354
|
+
# StandoffSentence containing the StandoffToken objects representing all
|
355
|
+
# the tokens beneath and after this node. For terminal nodes, the
|
356
|
+
# <em>tokens</em> argument will be a StandoffToken.
|
357
|
+
def initialize(stanford_parser_node, tokens)
|
358
|
+
# Annotate this node with a non-terminal label or a StandoffToken as
|
359
|
+
# appropriate.
|
360
|
+
super(tokens.instance_of?(StandoffSentence) ?
|
361
|
+
stanford_parser_node.value : tokens)
|
362
|
+
# Enumerate the children depth-first. Tokens are removed from the list
|
363
|
+
# left-to-right as terminal nodes are added to the tree.
|
364
|
+
stanford_parser_node.children.each do |child|
|
365
|
+
subtree = self.class.new(child, child.leaf? ? tokens.shift : tokens)
|
366
|
+
attach_child!(subtree)
|
367
|
+
end
|
368
|
+
end
|
369
|
+
|
370
|
+
# Return the original text string dominated by this node.
|
371
|
+
def to_original_string
|
372
|
+
leaves.inject("") do |s, leaf|
|
373
|
+
s += leaf.label.current + leaf.label.after
|
374
|
+
end
|
375
|
+
end
|
376
|
+
|
377
|
+
# Print the original string with brackets around word spans dominated by
|
378
|
+
# the specified constituents.
|
379
|
+
#
|
380
|
+
# The constituents to bracket are specified by passing a list of node
|
381
|
+
# coordinates, which are arrays of integers of the form returned by the
|
382
|
+
# tree enumerators of Treebank::Node objects.
|
383
|
+
#
|
384
|
+
# _coords_:: the coordinates of the nodes around which to place brackets
|
385
|
+
# _open_:: the open bracket symbol
|
386
|
+
# _close_:: the close bracket symbol
|
387
|
+
def to_bracketed_string(coords, open = "[", close = "]")
|
388
|
+
# Get a list of all the leaf nodes and their coordinates.
|
389
|
+
items = depth_first_enumerator(true).find_all {|n| n.first.leaf?}
|
390
|
+
# Enumerate over all the matching constituents inserting open and close
|
391
|
+
# brackets around their yields in the items list.
|
392
|
+
coords.each do |matching|
|
393
|
+
# Insert using a simple state machine with three states: :start,
|
394
|
+
# :open, and :close.
|
395
|
+
state = :start
|
396
|
+
# Enumerate over the items list looking for nodes that are the
|
397
|
+
# children of the matching constituent.
|
398
|
+
items.each_with_index do |item, index|
|
399
|
+
# Skip inserted bracket characters.
|
400
|
+
next if item.is_a? String
|
401
|
+
# Handle terminal node items with the state machine.
|
402
|
+
node, terminal_coordinate = item
|
403
|
+
if state == :start
|
404
|
+
next if not in_yield?(matching, terminal_coordinate)
|
405
|
+
items.insert(index, open)
|
406
|
+
state = :open
|
407
|
+
else # state == :open
|
408
|
+
next if in_yield?(matching, terminal_coordinate)
|
409
|
+
items.insert(index, close)
|
410
|
+
state = :close
|
411
|
+
break
|
412
|
+
end
|
413
|
+
end # items.each_with_index
|
414
|
+
# Handle the case where a matching constituent is flush with the end
|
415
|
+
# of the sentence.
|
416
|
+
items << close if state == :open
|
417
|
+
end # each
|
418
|
+
# Replace terminal nodes with their string representations. Insert
|
419
|
+
# spacing characters in the list.
|
420
|
+
items.each_with_index do |item, index|
|
421
|
+
next if item.is_a? String
|
422
|
+
text = item.first.label.current
|
423
|
+
spacing = item.first.label.after
|
424
|
+
# Replace the terminal node with its text.
|
425
|
+
items[index] = text
|
426
|
+
# Insert the spacing that comes after this text before the first
|
427
|
+
# non-close bracket character.
|
428
|
+
close_pos = find_index(items[index+1..-1]) {|item| not item == close}
|
429
|
+
items.insert(index + close_pos + 1, spacing)
|
430
|
+
end
|
431
|
+
items.join
|
432
|
+
end # to_bracketed_string
|
433
|
+
|
434
|
+
# Find the index of the first item in _list_ for which _block_ is true.
|
435
|
+
# Return 0 if no items are found.
|
436
|
+
def find_index(list, &block)
|
437
|
+
list.each_with_index do |item, index|
|
438
|
+
return index if block.call(item)
|
439
|
+
end
|
440
|
+
0
|
441
|
+
end
|
442
|
+
|
443
|
+
# Is the node at _terminal_ in the yield of the node at _node_?
|
444
|
+
def in_yield?(node, terminal)
|
445
|
+
# If node A's coordinates match the prefix of node B's coordinates, node
|
446
|
+
# B is in the yield of node A.
|
447
|
+
terminal.first(node.length) == node
|
448
|
+
end
|
449
|
+
|
450
|
+
private :in_yield?, :find_index
|
451
|
+
end # StandoffNode
|
452
|
+
|
453
|
+
end # StanfordParser
|