stanfordparser-infochimps 2.2.1.s
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/Gemfile +6 -0
- data/LICENSE +18 -0
- data/README.orig.rdoc +123 -0
- data/README.rdoc +37 -0
- data/Rakefile +60 -0
- data/TESTS_STATUS.rdoc +2 -0
- data/VERSION.yml +5 -0
- data/examples/stanford-sentence-parser.rb +46 -0
- data/lib/stanfordparser.rb +453 -0
- data/lib/stanfordparser/java_object.rb +129 -0
- data/stanfordparser.gemspec +69 -0
- data/test/test_stanfordparser.rb +224 -0
- metadata +207 -0
data/.document
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
Copyright 2007-2008 William Patrick McNeill
|
2
|
+
Copyright (c) 2010 John Wilkinson
|
3
|
+
|
4
|
+
This file is part of the Stanford Parser Ruby Wrapper.
|
5
|
+
|
6
|
+
The Stanford Parser Ruby Wrapper is free software; you can redistribute it
|
7
|
+
and/or modify it under the terms of the GNU General Public License as
|
8
|
+
published by the Free Software Foundation; either version 2 of the License,
|
9
|
+
or (at your option) any later version.
|
10
|
+
|
11
|
+
The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
|
12
|
+
useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
|
13
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
14
|
+
Public License for more details.
|
15
|
+
|
16
|
+
You should have received a copy of the GNU General Public License along with
|
17
|
+
editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
|
18
|
+
St, Fifth Floor, Boston, MA 02110-1301 USA
|
data/README.orig.rdoc
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
= Stanford Natural Language Parser Wrapper
|
2
|
+
|
3
|
+
This module is a wrapper for the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
4
|
+
|
5
|
+
The Stanford Natural Language Parser is a Java implementation of a probabilistic PCFG and dependency parser for English, German, Chinese, and Arabic. This module provides a thin wrapper around the Java code to make it accessible from Ruby along with pure Ruby objects that enable standoff parsing.
|
6
|
+
|
7
|
+
|
8
|
+
= Installation and Configuration
|
9
|
+
|
10
|
+
In addition to the Ruby gems it requires, to run this module you must manually install the {Stanford Natural Language Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
11
|
+
|
12
|
+
This module expects the parser to be installed in the <tt>/usr/local/stanford-parser/current</tt> directory on UNIX platforms and in the <tt>C:\stanford-parser\current</tt> directory on Windows platforms. This is the directory that contains the <tt>stanford-parser.jar</tt> file. When the module is loaded, it adds this directory to the Java classpath and launches the Java VM with the arguments <tt>-server -Xmx150m</tt>.
|
13
|
+
|
14
|
+
These defaults can be overridden by creating the configuration file <tt>/etc/ruby-stanford-parser.yaml</tt> on UNIX platforms and <tt>C:\stanford-parser\ruby-stanford-parser.yaml</tt> on Windows platforms. This file is in the Ruby YAML[http://ruby-doc.org/stdlib/libdoc/yaml/rdoc/index.html] format, and may contain two values: <tt>root</tt> and <tt>jvmargs</tt>. For example, the file might look like the following:
|
15
|
+
|
16
|
+
root: /usr/local/stanford-parser/other/location
|
17
|
+
jvmargs: -Xmx100m -verbose
|
18
|
+
|
19
|
+
|
20
|
+
= Tokenization and Parsing
|
21
|
+
|
22
|
+
Use the StanfordParser::DocumentPreprocessor class to tokenize text and files into sentences and words.
|
23
|
+
|
24
|
+
>> require "stanfordparser"
|
25
|
+
=> true
|
26
|
+
>> preproc = StanfordParser::DocumentPreprocessor.new
|
27
|
+
=> <DocumentPreprocessor>
|
28
|
+
>> puts preproc.getSentencesFromString("This is a sentence. So is this.")
|
29
|
+
This is a sentence .
|
30
|
+
So is this .
|
31
|
+
|
32
|
+
Use the StanfordParser::LexicalizedParser class to parse sentences.
|
33
|
+
|
34
|
+
>> parser = StanfordParser::LexicalizedParser.new
|
35
|
+
Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [5.5 sec].
|
36
|
+
=> edu.stanford.nlp.parser.lexparser.LexicalizedParser
|
37
|
+
>> puts parser.apply("This is a sentence.")
|
38
|
+
(ROOT
|
39
|
+
(S [24.917]
|
40
|
+
(NP [6.139] (DT [2.300] This))
|
41
|
+
(VP [17.636] (VBZ [0.144] is)
|
42
|
+
(NP [12.299] (DT [1.419] a) (NN [8.897] sentence)))
|
43
|
+
(. [0.002] .)))
|
44
|
+
|
45
|
+
For complete details about the use of these classes, see the documentation on the Stanford Natural Language Parser website.
|
46
|
+
|
47
|
+
|
48
|
+
= Standoff Tokenization and Parsing
|
49
|
+
|
50
|
+
This module also contains support for standoff tokenization and parsing, in which the terminal nodes of parse trees contain information about the text that was used to generate them.
|
51
|
+
|
52
|
+
Use StanfordParser::StandoffDocumentPreprocessor class to tokenize text and files into sentences and words.
|
53
|
+
|
54
|
+
>> preproc = StanfordParser::StandoffDocumentPreprocessor.new
|
55
|
+
=> <StandoffDocumentPreprocessor>
|
56
|
+
>> s = preproc.getSentencesFromString("This is a sentence. So is this.")
|
57
|
+
=> [This is a sentence., So is this.]
|
58
|
+
|
59
|
+
The standoff preprocessor returns StanfordParser::StandoffToken objects, which contain character offsets into the original text along with information about spacing characters that came before and after the token.
|
60
|
+
|
61
|
+
>> puts s
|
62
|
+
This [0,4]
|
63
|
+
is [5,7]
|
64
|
+
a [8,9]
|
65
|
+
sentence [10,18]
|
66
|
+
. [18,19]
|
67
|
+
So [21,23]
|
68
|
+
is [24,26]
|
69
|
+
this [27,31]
|
70
|
+
. [31,32]
|
71
|
+
>> "This is a sentence. So is this."[27..31]
|
72
|
+
=> "this."
|
73
|
+
|
74
|
+
This is the same information contained in the <tt>edu.stanford.nlp.ling.FeatureLabel</tt> class in the Stanford Parser Java implementation.
|
75
|
+
|
76
|
+
Similarly, use the StanfordParser::StandoffParsedText object to parse a block of text into StanfordParser::StandoffNode parse trees whose terminal nodes are StanfordParser::StandoffToken objects.
|
77
|
+
|
78
|
+
>> t = StanfordParser::StandoffParsedText.new("This is a sentence. So is this.")
|
79
|
+
Loading parser from serialized file /usr/local/stanford-parser/current/englishPCFG.ser.gz ... done [4.9 sec].
|
80
|
+
=> <StanfordParser::StandoffParsedText, 2 sentences>
|
81
|
+
>> puts t.first
|
82
|
+
(ROOT
|
83
|
+
(S
|
84
|
+
(NP (DT This [0,4]))
|
85
|
+
(VP (VBZ is [5,7])
|
86
|
+
(NP (DT a [8,9]) (NN sentence [10,18])))
|
87
|
+
(. . [18,19])))
|
88
|
+
|
89
|
+
Standoff parse trees can reproduce the text from which they were generated verbatim.
|
90
|
+
|
91
|
+
>> t.first.to_original_string
|
92
|
+
=> "This is a sentence. "
|
93
|
+
|
94
|
+
They can also reproduce the original text with brackets inserted around the yields of specified parse nodes.
|
95
|
+
|
96
|
+
>> t.first.to_bracketed_string([[0,0,0], [0,1,1]])
|
97
|
+
=> "[This] is [a sentence]. "
|
98
|
+
|
99
|
+
The format of the coordinates used to specify individual nodes is described in the documentation for the Ruby Treebank[http://rubyforge.org/projects/treebank/] gem.
|
100
|
+
|
101
|
+
See the documentation of the individual classes in this module for more details.
|
102
|
+
|
103
|
+
Unlike their parents StanfordParser::DocumentPreprocessor and StanfordParser::LexicalizedParser, which produce Ruby wrappers around Java objects, StanfordParser::StandoffDocumentPreprocessor and StanfordParser::StandoffParsedText produce pure Ruby objects. This is to facilitate serialization of these objects using tools like the Marshal module, which cannot serialize Java objects.
|
104
|
+
|
105
|
+
= History
|
106
|
+
|
107
|
+
1.0.0:: Initial release
|
108
|
+
1.1.0:: Make module initialization function private. Add example code.
|
109
|
+
1.2.0:: Read Java VM arguments from the configuration file. Add Word class.
|
110
|
+
2.0.0:: Add support for standoff parsing. Change the way Rjb::JavaObjectWrapper wraps returned values: see wrap_java_object for details. Rjb::JavaObjectWrapper supports static members. Minor changes to stanford-sentence-parser script.
|
111
|
+
2.1.0:: Different default paths for Windows machines; Minor changes to StandoffToken definition
|
112
|
+
2.2.0:: Add parent information to StandoffNode
|
113
|
+
|
114
|
+
= Copyright
|
115
|
+
|
116
|
+
Copyright 2007-2008, William Patrick McNeill
|
117
|
+
|
118
|
+
This program is distributed under the GNU General Public License.
|
119
|
+
|
120
|
+
|
121
|
+
= Author
|
122
|
+
|
123
|
+
W.P. McNeill mailto:billmcn@gmail.com
|
data/README.rdoc
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
= stanfordparser
|
2
|
+
|
3
|
+
This is an upload/extension of Bill McNeill's stanfordparser rubyforge gem; check it out at its homepage (seems to be partially in French)
|
4
|
+
|
5
|
+
http://rubyforge.org/projects/stanfordparser/
|
6
|
+
|
7
|
+
or its rdocs
|
8
|
+
|
9
|
+
http://stanfordparser.rubyforge.org/
|
10
|
+
|
11
|
+
I've been having issues trying to use this gem so I decided to upload it to github and try to organize it to be a little more gem-friendly, especially using jeweler.
|
12
|
+
|
13
|
+
AFAIK there aren't other copies of this on github, please correct me if I'm mistaken. The only similar one I can see is http://github.com/tiendung/ruby-nlp which has much less code and I can only assume to be something else.
|
14
|
+
|
15
|
+
It seems like using version 1.6.1 of the java StanfordParser package is your best bet for compatibility.
|
16
|
+
|
17
|
+
See README.orig.rdoc for Bill's readme, which includes dependencies, installation, and usage.
|
18
|
+
|
19
|
+
== Branches
|
20
|
+
|
21
|
+
* master - Jeweler and Bundler integrated along with slight reorganization of files to be more gem-standard. This is the branch you should use if you want to source the gem straight from github. I will leave this branch alone for the most part unless I find/come up with stable and useful additions. All changes will be backwards compatible.
|
22
|
+
* stock - Almost untouched from Bill's version, except for the README. Use this branch if that's what you're looking for.
|
23
|
+
* fixing_tests - The tests are currently broken, this branch is trying to address that. Once the tests are fixed it will be merged back into master. Help appreciated! I'll keep a TESTS_STATUS.rdoc keeping track of progress.
|
24
|
+
* experimental - I'll be putting in some code as examples and testing out some ideas. Do not use this branch as a gem. You are very encouraged, however, to fork it and add some code/make my code better. I'll try to integrate all the pull requests I get, if not in that branch into another.
|
25
|
+
|
26
|
+
== Note on Patches/Pull Requests
|
27
|
+
|
28
|
+
* Fork the project.
|
29
|
+
* Make your feature addition or bug fix.
|
30
|
+
* Add tests for it. I would prefer rSpec, but TestUnit is acceptable as well since there are some of those from the original author.
|
31
|
+
* Commit.
|
32
|
+
* Send me a pull request. Bonus points for topic branches.
|
33
|
+
|
34
|
+
== Copyright
|
35
|
+
|
36
|
+
Copyright (c) 2010 John Wilkinson. See LICENSE for details.
|
37
|
+
Copyright 2007-2008, William Patrick McNeill. See README.orig for details.
|
data/Rakefile
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
|
4
|
+
$LOAD_PATH.unshift('lib')
|
5
|
+
|
6
|
+
begin
|
7
|
+
require 'jeweler'
|
8
|
+
Jeweler::Tasks.new do |gem|
|
9
|
+
gem.name = "stanfordparser-infochimps"
|
10
|
+
gem.summary = "GitHub upload/extension of Bill McNeal's stanfordparser rubygem"
|
11
|
+
gem.description = "Ruby wrapper of the Stanford Parser, a NLP parser built in Java."
|
12
|
+
gem.email = "jcwilk@gmail.com"
|
13
|
+
gem.homepage = "http://github.com/jcwilk/stanfordparser"
|
14
|
+
gem.authors = ["John Wilkinson","Bill McNeal"]
|
15
|
+
|
16
|
+
gem.add_dependency "rjb", ">= 1.2.5"
|
17
|
+
gem.add_dependency "treebank", ">= 3.0.0"
|
18
|
+
gem.add_development_dependency "rspec", ">= 1.2.9"
|
19
|
+
end
|
20
|
+
Jeweler::GemcutterTasks.new
|
21
|
+
rescue LoadError
|
22
|
+
puts "Jeweler (or a dependency) not available. Install it with: gem install jeweler"
|
23
|
+
end
|
24
|
+
|
25
|
+
require 'rake/testtask'
|
26
|
+
Rake::TestTask.new(:test) do |test|
|
27
|
+
test.test_files = FileList.new('test/**/test_*.rb') do |list|
|
28
|
+
list.exclude 'test/test_helper.rb'
|
29
|
+
end
|
30
|
+
test.libs << 'test'
|
31
|
+
test.verbose = true
|
32
|
+
end
|
33
|
+
|
34
|
+
# require 'spec/rake/spectask'
|
35
|
+
# Spec::Rake::SpecTask.new(:spec) do |spec|
|
36
|
+
# spec.libs << 'lib' << 'spec'
|
37
|
+
# spec.spec_files = FileList['spec/**/*_spec.rb']
|
38
|
+
# end
|
39
|
+
#
|
40
|
+
# Spec::Rake::SpecTask.new(:rcov) do |spec|
|
41
|
+
# spec.libs << 'lib' << 'spec'
|
42
|
+
# spec.pattern = 'spec/**/*_spec.rb'
|
43
|
+
# spec.rcov = true
|
44
|
+
# end
|
45
|
+
#
|
46
|
+
# task :test => :check_dependencies
|
47
|
+
#
|
48
|
+
# task :spec => :check_dependencies
|
49
|
+
#
|
50
|
+
# task :default => :test
|
51
|
+
|
52
|
+
require 'rake/rdoctask'
|
53
|
+
Rake::RDocTask.new do |rdoc|
|
54
|
+
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
55
|
+
|
56
|
+
rdoc.rdoc_dir = 'rdoc'
|
57
|
+
rdoc.title = "stanfordparser #{version}"
|
58
|
+
rdoc.rdoc_files.include('README*')
|
59
|
+
rdoc.rdoc_files.include('lib/**/*.rb')
|
60
|
+
end
|
data/TESTS_STATUS.rdoc
ADDED
data/VERSION.yml
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
#--
|
4
|
+
|
5
|
+
# Copyright 2007-2008 William Patrick McNeill
|
6
|
+
#
|
7
|
+
# This file is part of the Stanford Parser Ruby Wrapper.
|
8
|
+
#
|
9
|
+
# The Stanford Parser Ruby Wrapper is free software; you can redistribute it
|
10
|
+
# and/or modify it under the terms of the GNU General Public License as
|
11
|
+
# published by the Free Software Foundation; either version 2 of the License,
|
12
|
+
# or (at your option) any later version.
|
13
|
+
#
|
14
|
+
# The Stanford Parser Ruby Wrapper is distributed in the hope that it will be
|
15
|
+
# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
|
16
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
|
17
|
+
# Public License for more details.
|
18
|
+
#
|
19
|
+
# You should have received a copy of the GNU General Public License along with
|
20
|
+
# editalign; if not, write to the Free Software Foundation, Inc., 51 Franklin
|
21
|
+
# St, Fifth Floor, Boston, MA 02110-1301 USA
|
22
|
+
#
|
23
|
+
#++
|
24
|
+
|
25
|
+
# == Synopsis
|
26
|
+
#
|
27
|
+
# Parse a sentence passed in on the command line.
|
28
|
+
#
|
29
|
+
# == Usage
|
30
|
+
#
|
31
|
+
# stanford-sentence-parser.rb [options] sentence
|
32
|
+
#
|
33
|
+
# options::
|
34
|
+
# See the Java Stanford Parser documentation for details
|
35
|
+
#
|
36
|
+
# sentence::
|
37
|
+
# A sentence to parse. This must appear after all the options and be quoted.
|
38
|
+
|
39
|
+
require 'rubygems'
|
40
|
+
require "stanfordparser"
|
41
|
+
|
42
|
+
# The last argument is the sentence. The rest of the command line is passed
|
43
|
+
# along to the parser object.
|
44
|
+
sentence = ARGV.pop
|
45
|
+
parser = StanfordParser::LexicalizedParser.new(StanfordParser::ENGLISH_PCFG_MODEL, ARGV)
|
46
|
+
puts parser.apply(sentence)
|
@@ -0,0 +1,453 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
|
3
|
+
require "pathname"
|
4
|
+
require "rjb"
|
5
|
+
require "singleton"
|
6
|
+
begin
|
7
|
+
require "treebank"
|
8
|
+
gem "treebank", ">= 3.0.0"
|
9
|
+
rescue LoadError
|
10
|
+
require "treebank"
|
11
|
+
end
|
12
|
+
require "yaml"
|
13
|
+
|
14
|
+
# Wrapper for the {Stanford Natural Language
|
15
|
+
# Parser}[http://nlp.stanford.edu/downloads/lex-parser.shtml].
|
16
|
+
module StanfordParser
|
17
|
+
|
18
|
+
require "stanfordparser/java_object"
|
19
|
+
|
20
|
+
VERSION = "2.2.1"
|
21
|
+
|
22
|
+
# The default sentence segmenter and tokenizer. This is an English-language
|
23
|
+
# tokenizer with support for Penn Treebank markup.
|
24
|
+
EN_PENN_TREEBANK_TOKENIZER = "edu.stanford.nlp.process.PTBTokenizer"
|
25
|
+
|
26
|
+
# Path to an English PCFG model that comes with the Stanford Parser. The
|
27
|
+
# location is relative to the parser root directory. This is a valid value
|
28
|
+
# for the <em>grammar</em> parameter of the LexicalizedParser constructor.
|
29
|
+
ENGLISH_PCFG_MODEL = "$(ROOT)/englishPCFG.ser.gz"
|
30
|
+
|
31
|
+
# This function is executed once when the module is loaded. It initializes
|
32
|
+
# the Java virtual machine in which the Stanford parser will run. By
|
33
|
+
# default, it adds the parser installation root to the Java classpath and
|
34
|
+
# launches the VM with the arguments <tt>-server -Xmx150m</tt>. Different
|
35
|
+
# values may be specified with the <tt>ruby-stanford-parser.yaml</tt>
|
36
|
+
# configuration file.
|
37
|
+
#
|
38
|
+
# This function determines which operating system we are running on and sets
|
39
|
+
# default pathnames accordingly:
|
40
|
+
#
|
41
|
+
# UNIX:: /usr/local/stanford-parser/current, /etc/ruby-stanford-parser.yaml
|
42
|
+
# Windows:: C:\stanford-parser\current,
|
43
|
+
# C:\stanford-parser\ruby-stanford-parser.yaml
|
44
|
+
#
|
45
|
+
# This function returns the path of the parser installation root.
|
46
|
+
def StanfordParser.initialize_on_load
  # Pick platform defaults for the parser root and the configuration file.
  if RUBY_PLATFORM =~ /(win|w)32$/
    # No trailing whitespace here: the jar path is built by joining a
    # filename onto this directory.
    root = Pathname.new("C:\\stanford-parser\\current")
    config = Pathname.new("C:\\stanford-parser\\ruby-stanford-parser.yaml")
  else
    root = Pathname.new("/usr/local/stanford-parser/current")
    config = Pathname.new("/etc/ruby-stanford-parser.yaml")
  end
  jvmargs = ["-server", "-Xmx150m"]
  if config.file?
    configuration = config.open { |f| YAML.load(f) }
    # An empty or malformed file does not load as a mapping; ignore it and
    # keep the defaults instead of failing on a non-Hash value.
    if configuration.is_a?(Hash)
      # "root" overrides the installation directory.
      root = Pathname.new(configuration["root"]) unless configuration["root"].nil?
      # "jvmargs" is a whitespace-separated string of JVM arguments.
      jvmargs = configuration["jvmargs"].split unless configuration["jvmargs"].nil?
    end
  end
  # Start the JVM with the parser jar on the classpath.
  Rjb::load((root + "stanford-parser.jar").to_s, jvmargs)
  root
end
|
67
|
+
|
68
|
+
private_class_method :initialize_on_load
|
69
|
+
|
70
|
+
# The root directory of the Stanford parser installation.
|
71
|
+
ROOT = initialize_on_load
|
72
|
+
|
73
|
+
#--
|
74
|
+
# The documentation below is for the original Rjb::JavaObjectWrapper object.
|
75
|
+
# It is reproduced here because rdoc only takes the last document block
|
76
|
+
# defined. If Rjb is moved into its own gem, this documentation should go
|
77
|
+
# with it, and the following should be written as documentation for this
|
78
|
+
# class:
|
79
|
+
#
|
80
|
+
# Extension of the generic Ruby-Java Bridge wrapper object for the
|
81
|
+
# StanfordParser module.
|
82
|
+
#++
|
83
|
+
# A generic wrapper for a Java object loaded via the {Ruby-Java
|
84
|
+
# Bridge}[http://rjb.rubyforge.org/]. The wrapper class handles
|
85
|
+
# initialization and stringification, and passes other method calls down to
|
86
|
+
# the underlying Java object. Objects returned by the underlying Java
|
87
|
+
# object are converted to the appropriate Ruby object.
|
88
|
+
#
|
89
|
+
# Other modules may extend the list of Java objects that are converted by
|
90
|
+
# adding their own converter functions. See wrap_java_object for details.
|
91
|
+
#
|
92
|
+
# This object is enumerable, yielding items in the order defined by the
|
93
|
+
# underlying Java object's iterator.
|
94
|
+
class Rjb::JavaObjectWrapper
  # Converter for FeatureLabel objects: put them inside a
  # StanfordParser::FeatureLabel wrapper.
  def wrap_edu_stanford_nlp_ling_FeatureLabel(object)
    StanfordParser::FeatureLabel.new(object)
  end

  # Converter for Tree objects: put them inside a Tree wrapper.  The
  # concrete tree types listed below are aliased to this function.
  def wrap_edu_stanford_nlp_trees_Tree(object)
    Tree.new(object)
  end

  %w[LabeledScoredTreeLeaf LabeledScoredTreeNode SimpleTree TreeGraphNode].each do |tree_type|
    alias_method :"wrap_edu_stanford_nlp_trees_#{tree_type}", :wrap_edu_stanford_nlp_trees_Tree
  end

  # Only the two base converters are made protected; this runs after the
  # aliases have been created.
  protected :wrap_edu_stanford_nlp_trees_Tree, :wrap_edu_stanford_nlp_ling_FeatureLabel
end # Rjb::JavaObjectWrapper
|
113
|
+
|
114
|
+
|
115
|
+
# Lexicalized probabilistic parser.
|
116
|
+
#
|
117
|
+
# This is a wrapper for the
|
118
|
+
# <tt>edu.stanford.nlp.parser.lexparser.LexicalizedParser</tt> object.
|
119
|
+
class LexicalizedParser < Rjb::JavaObjectWrapper
  # The grammar used by the parser, as a Pathname.
  attr_reader :grammar

  # Create the parser given a grammar and options.  The <em>grammar</em>
  # argument is a path to a grammar file.  This path may contain the string
  # <tt>$(ROOT)</tt>, which will be replaced with the root directory of the
  # Stanford Parser.  By default, an English PCFG grammar is loaded.
  #
  # The <em>options</em> argument is a list of string arguments as they
  # would appear on a command line.  See the documentation of
  # <tt>edu.stanford.nlp.parser.lexparser.Options.setOptions</tt> for more
  # details.
  def initialize(grammar = ENGLISH_PCFG_MODEL, options = [])
    # ROOT is a Pathname, which String#gsub will not accept as a replacement
    # string, so convert it explicitly.  The block form inserts the text
    # literally, so backslashes in a Windows root are never treated as
    # back-references.
    @grammar = Pathname.new(grammar.gsub(/\$\(ROOT\)/) { ROOT.to_s })
    super("edu.stanford.nlp.parser.lexparser.LexicalizedParser", @grammar.to_s)
    @java_object.setOptionFlags(options)
  end

  # Class name plus the grammar file's base name.
  def to_s
    "LexicalizedParser(#{grammar.basename})"
  end
end # LexicalizedParser
|
142
|
+
|
143
|
+
|
144
|
+
# A singleton instance of the default Stanford Natural Language parser. A
|
145
|
+
# singleton is used because the parser can take a few seconds to load.
|
146
|
+
class DefaultParser < StanfordParser::LexicalizedParser
|
147
|
+
include Singleton
|
148
|
+
end
|
149
|
+
|
150
|
+
|
151
|
+
# This is a wrapper for
|
152
|
+
# <tt>edu.stanford.nlp.trees.Tree</tt> objects. It customizes
|
153
|
+
# stringification.
|
154
|
+
class Tree < Rjb::JavaObjectWrapper
  # Wrap the given object; defaults to the Java Tree class name.
  def initialize(obj = "edu.stanford.nlp.trees.Tree")
    super(obj)
  end

  # Return the label along with the score if there is one (a NaN score is
  # omitted).
  def inspect
    text = "#{label}"
    text += " [#{sprintf '%.2f', score}]" unless score.nan?
    "(#{text})"
  end

  # The Penn treebank representation.  This prints with indenting instead of
  # putting everything on one line.
  def to_s
    "#{pennString}"
  end
end # Tree
|
171
|
+
|
172
|
+
|
173
|
+
# This is a wrapper for
|
174
|
+
# <tt>edu.stanford.nlp.ling.Word</tt> objects. It customizes
|
175
|
+
# stringification and adds an equivalence operator.
|
176
|
+
class Word < Rjb::JavaObjectWrapper
|
177
|
+
def initialize(obj = "edu.stanford.nlp.ling.Word", *args)
|
178
|
+
super(obj, *args)
|
179
|
+
end
|
180
|
+
|
181
|
+
# See the word values.
|
182
|
+
def inspect
|
183
|
+
to_s
|
184
|
+
end
|
185
|
+
|
186
|
+
# Equivalence is defined relative to the word value.
|
187
|
+
def ==(other)
|
188
|
+
word == other
|
189
|
+
end
|
190
|
+
end # Word
|
191
|
+
|
192
|
+
|
193
|
+
# This is a wrapper for <tt>edu.stanford.nlp.ling.FeatureLabel</tt> objects.
|
194
|
+
# It customizes stringification.
|
195
|
+
class FeatureLabel < Rjb::JavaObjectWrapper
|
196
|
+
def initialize(obj = "edu.stanford.nlp.ling.FeatureLabel")
|
197
|
+
super
|
198
|
+
end
|
199
|
+
|
200
|
+
# Stringify with just the token and its begin and end position.
|
201
|
+
def to_s
|
202
|
+
# BUGBUG The position values come back as java.lang.Integer though I
|
203
|
+
# would expect Rjb to convert them to Ruby integers.
|
204
|
+
begin_position = get(self.BEGIN_POSITION_KEY)
|
205
|
+
end_position = get(self.END_POSITION_KEY)
|
206
|
+
"#{current} [#{begin_position},#{end_position}]"
|
207
|
+
end
|
208
|
+
|
209
|
+
# More verbose stringification with all the fields and their values.
|
210
|
+
def inspect
|
211
|
+
toString
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
|
216
|
+
# Tokenizes documents into words and sentences.
|
217
|
+
#
|
218
|
+
# This is a wrapper for the
|
219
|
+
# <tt>edu.stanford.nlp.process.DocumentPreprocessor</tt> object.
|
220
|
+
class DocumentPreprocessor < Rjb::JavaObjectWrapper
  # Build the underlying Java DocumentPreprocessor, passing
  # <em>suppressEscaping</em> through as its constructor argument.
  def initialize(suppressEscaping = false)
    super("edu.stanford.nlp.process.DocumentPreprocessor", suppressEscaping)
  end

  # Returns a list of sentences in a string.  The string is handed to the
  # Java side through a java.io.StringReader.
  def getSentencesFromString(s)
    reader = Rjb::JavaObjectWrapper.new("java.io.StringReader", s)
    _invoke(:getSentencesFromText, "Ljava.io.Reader;", reader.java_object)
  end

  # The class name without its module prefix, in angle brackets.
  def inspect
    short_name = self.class.to_s.split('::').last
    "<#{short_name}>"
  end

  # Same text as inspect.
  def to_s
    inspect
  end
end # DocumentPreprocessor
|
239
|
+
|
240
|
+
# A text token that contains raw and normalized token identity (e.g. "(" and
|
241
|
+
# "-LRB-"), an offset span, and the characters immediately preceding and
|
242
|
+
# following the token. Given a list of these objects it is possible to
|
243
|
+
# recreate the text from which they came verbatim.
|
244
|
+
class StandoffToken < Struct.new(:current, :word, :before, :after,
                                 :begin_position, :end_position)
  # The token text followed by its begin and end positions in brackets.
  def to_s
    span = "[#{begin_position},#{end_position}]"
    "#{current} #{span}"
  end
end
|
250
|
+
|
251
|
+
|
252
|
+
# A preprocessor that segments text into sentences and tokens that contain
|
253
|
+
# character offset and token context information that can be used for
|
254
|
+
# standoff annotation.
|
255
|
+
class StandoffDocumentPreprocessor < DocumentPreprocessor
  def initialize(tokenizer = EN_PENN_TREEBANK_TOKENIZER)
    # The tokenizer's factory is a static function, so use RJB to call it
    # directly instead of going through a JavaObjectWrapper.  We do it this
    # way because the Stanford parser Java code does not provide a
    # constructor that allows you to set the second parameter, invertible,
    # to true, and we need this to write character offset information into
    # the tokens.
    #
    # See the documentation for
    # <tt>edu.stanford.nlp.process.DocumentPreprocessor</tt> for a
    # description of the factory parameters.
    factory = Rjb::import(tokenizer).factory(false, true, false)
    super(factory)
  end

  # Returns a list of sentences in a string.  Each returned sentence is
  # replaced in place by a StandoffSentence built from it.
  def getSentencesFromString(s)
    super(s).map! { |sentence| StandoffSentence.new(sentence) }
  end
end
|
277
|
+
|
278
|
+
|
279
|
+
# A sentence is an array of StandoffToken objects.
|
280
|
+
class StandoffSentence < Array
  # Construct an array of StandoffToken objects from a Java list sentence
  # object returned by the preprocessor.
  #
  # <em>stanford_parser_sentence</em> must respond to +to_a+; each element
  # must provide +current+, +word+, +before+, +after+, +get+ and the
  # +BEGIN_POSITION_KEY+ / +END_POSITION_KEY+ accessors (presumably
  # FeatureLabel wrappers).
  def initialize(stanford_parser_sentence)
    tokens = stanford_parser_sentence.to_a.collect do |fs|
      # The to_s.to_i is necessary because the get function returns
      # java.lang.Integer objects instead of Ruby integers.
      begin_position = fs.get(fs.BEGIN_POSITION_KEY).to_s.to_i
      end_position = fs.get(fs.END_POSITION_KEY).to_s.to_i
      StandoffToken.new(fs.current, fs.word, fs.before, fs.after,
                        begin_position, end_position)
    end
    super(tokens)
  end

  # Return the original string verbatim.  The spacing after the final token
  # is not included.  An empty sentence yields an empty string.
  def to_s
    return "" if empty?
    self[0..-2].inject("") { |text, token| text + token.current + token.after } + last.current
  end

  # Return the original string verbatim.
  def inspect
    to_s
  end
end
|
310
|
+
|
311
|
+
|
312
|
+
# Standoff syntactic annotation of natural language text which may contain
|
313
|
+
# multiple sentences.
|
314
|
+
#
|
315
|
+
# This is an Array of StandoffNode objects, one for each sentence in the
|
316
|
+
# text.
|
317
|
+
class StandoffParsedText < Array
  # Parse the text and create the standoff annotation.
  #
  # _text_:: the text to segment and parse
  # _nodetype_:: class used to build each sentence's tree; it is called as
  #              <tt>nodetype.new(parse, sentence)</tt>
  # _tokenizer_:: tokenizer name handed to StandoffDocumentPreprocessor
  # _parser_:: object whose +apply+ method parses a sentence string
  #
  # The default parser is a singleton instance of the English language
  # Stanford Natural Language parser.  There may be a delay of a few
  # seconds for it to load the first time it is created.
  def initialize(text, nodetype = StandoffNode,
                 tokenizer = EN_PENN_TREEBANK_TOKENIZER,
                 parser = DefaultParser.instance)
    preprocessor = StandoffDocumentPreprocessor.new(tokenizer)
    # Segment the text into sentences.  Parse each sentence, writing
    # standoff annotation information into the terminal nodes.
    preprocessor.getSentencesFromString(text).each do |sentence|
      parse = parser.apply(sentence.to_s)
      push(nodetype.new(parse, sentence))
    end
  end

  # Print class name and number of sentences.
  def inspect
    "<#{self.class.name}, #{length} sentences>"
  end

  # Print parses.
  def to_s
    flatten.join(" ")
  end
end
|
345
|
+
|
346
|
+
|
347
|
+
# Standoff syntactic tree annotation of text. Terminal nodes are labeled
|
348
|
+
# with the appropriate StandoffToken objects. Standoff parses can reproduce
|
349
|
+
# the original string from which they were generated verbatim, optionally
|
350
|
+
# with brackets around the yields of specified non-terminal nodes.
|
351
|
+
class StandoffNode < Treebank::ParentedNode
  # Create the standoff tree from a tree returned by the Stanford parser.
  # For non-terminal nodes, the <em>tokens</em> argument will be a
  # StandoffSentence containing the StandoffToken objects representing all
  # the tokens beneath and after this node.  For terminal nodes, the
  # <em>tokens</em> argument will be a StandoffToken.
  def initialize(stanford_parser_node, tokens)
    # Annotate this node with a non-terminal label or a StandoffToken as
    # appropriate.
    super(tokens.instance_of?(StandoffSentence) ?
            stanford_parser_node.value : tokens)
    # Enumerate the children depth-first.  Tokens are removed from the list
    # left-to-right as terminal nodes are added to the tree.
    stanford_parser_node.children.each do |child|
      subtree = self.class.new(child, child.leaf? ? tokens.shift : tokens)
      attach_child!(subtree)
    end
  end

  # Return the original text string dominated by this node, including the
  # spacing that follows each token.
  def to_original_string
    leaves.inject("") do |text, leaf|
      text + leaf.label.current + leaf.label.after
    end
  end

  # Print the original string with brackets around word spans dominated by
  # the specified constituents.
  #
  # The constituents to bracket are specified by passing a list of node
  # coordinates, which are arrays of integers of the form returned by the
  # tree enumerators of Treebank::Node objects.
  #
  # _coords_:: the coordinates of the nodes around which to place brackets
  # _open_:: the open bracket symbol
  # _close_:: the close bracket symbol
  def to_bracketed_string(coords, open = "[", close = "]")
    # Get a list of all the leaf nodes and their coordinates.
    items = depth_first_enumerator(true).find_all {|n| n.first.leaf?}
    # Enumerate over all the matching constituents inserting open and close
    # brackets around their yields in the items list.
    coords.each do |matching|
      # Insert using a simple state machine with three states: :start,
      # :open, and :close.
      state = :start
      # Enumerate over the items list looking for nodes that are the
      # children of the matching constituent.  Note that the list is
      # modified while it is being enumerated.
      items.each_with_index do |item, index|
        # Skip inserted bracket characters.
        next if item.is_a? String
        # Handle terminal node items with the state machine.
        _node, terminal_coordinate = item
        if state == :start
          next if not in_yield?(matching, terminal_coordinate)
          items.insert(index, open)
          state = :open
        else # state == :open
          next if in_yield?(matching, terminal_coordinate)
          items.insert(index, close)
          state = :close
          break
        end
      end # items.each_with_index
      # Handle the case where a matching constituent is flush with the end
      # of the sentence.
      items << close if state == :open
    end # each
    # Replace terminal nodes with their string representations.  Insert
    # spacing characters in the list.
    items.each_with_index do |item, index|
      next if item.is_a? String
      text = item.first.label.current
      spacing = item.first.label.after
      # Replace the terminal node with its text.
      items[index] = text
      # Insert the spacing that comes after this text before the first
      # non-close bracket character.  The block variable has its own name so
      # that it does not shadow the item being processed.
      close_pos = find_index(items[index+1..-1]) {|candidate| not candidate == close}
      items.insert(index + close_pos + 1, spacing)
    end
    items.join
  end # to_bracketed_string

  # Find the index of the first item in _list_ for which _block_ is true.
  # Return 0 if no items are found.
  def find_index(list, &block)
    list.each_with_index do |item, index|
      return index if block.call(item)
    end
    0
  end

  # Is the node at _terminal_ in the yield of the node at _node_?
  def in_yield?(node, terminal)
    # If node A's coordinates match the prefix of node B's coordinates, node
    # B is in the yield of node A.
    terminal.first(node.length) == node
  end

  private :in_yield?, :find_index
end # StandoffNode
|
452
|
+
|
453
|
+
end # StanfordParser
|