stanford-core-nlp 0.1.7 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.markdown +13 -5
- data/lib/stanford-core-nlp.rb +84 -44
- data/lib/stanford-core-nlp/java_wrapper.rb +5 -4
- metadata +4 -4
data/README.markdown
CHANGED
@@ -1,16 +1,22 @@
|
|
1
|
+
[![Build Status](https://secure.travis-ci.org/louismullie/stanford-core-nlp.png)](http://travis-ci.org/louismullie/stanford-core-nlp)
|
2
|
+
|
1
3
|
**About**
|
2
4
|
|
3
|
-
This gem provides high-level Ruby bindings to the [Stanford Core NLP package](http://nlp.stanford.edu/software/corenlp.shtml), a set natural language processing tools that provides tokenization, part-of-speech tagging, lemmatization, and parsing for
|
5
|
+
This gem provides high-level Ruby bindings to the [Stanford Core NLP package](http://nlp.stanford.edu/software/corenlp.shtml), a set of natural language processing tools that provides tokenization, part-of-speech tagging, lemmatization, and parsing for several languages, as well as named entity recognition and coreference resolution for English. This gem is compatible with Ruby 1.9.2 and above.
|
4
6
|
|
5
7
|
**Installing**
|
6
8
|
|
7
|
-
|
9
|
+
First, install the gem: `gem install stanford-core-nlp`. Then, download the Stanford Core NLP JAR and model files. Three different packages are available:
|
10
|
+
|
11
|
+
* A [minimal package for English](http://louismullie.com/stanford-core-nlp-minimal.zip) with one tagger model and one parser model for English.
|
12
|
+
* A [full package for English](http://louismullie.com/stanford-core-nlp-english.zip), with all tagger and parser models for English, plus the coreference resolution and named entity recognition models.
|
13
|
+
* A [full package for all languages](http://louismullie.com/stanford-core-nlp-all.zip), including tagger and parser models for English, French, German, Arabic and Chinese.
|
8
14
|
|
9
|
-
|
15
|
+
Place the contents of the extracted archive inside the /bin/ folder of the stanford-core-nlp gem (e.g. /usr/local/lib/ruby/gems/1.X.x/gems/stanford-core-nlp-0.x/bin/).
|
10
16
|
|
11
17
|
**Configuration**
|
12
18
|
|
13
|
-
After installing and requiring the gem (`require 'stanford-core-nlp'`), you may want to set some configuration options
|
19
|
+
After installing and requiring the gem (`require 'stanford-core-nlp'`), you may want to set some configuration options. Here are some examples:
|
14
20
|
|
15
21
|
```ruby
|
16
22
|
# Set an alternative path to look for the JAR files
|
@@ -19,7 +25,7 @@ StanfordCoreNLP.jar_path = '/path_to_jars/'
|
|
19
25
|
|
20
26
|
# Set an alternative path to look for the model files
|
21
27
|
# Default is gem's bin folder.
|
22
|
-
StanfordCoreNLP.
|
28
|
+
StanfordCoreNLP.model_path = '/path_to_models/'
|
23
29
|
|
24
30
|
# Pass some alternative arguments to the Java VM.
|
25
31
|
# Default is ['-Xms512M', '-Xmx1024M'] (be prepared
|
@@ -48,6 +54,8 @@ text = StanfordCoreNLP::Text.new(text)
|
|
48
54
|
pipeline.annotate(text)
|
49
55
|
|
50
56
|
text.get(:sentences).each do |sentence|
|
57
|
+
# Syntactic dependencies
|
58
|
+
puts sentence.get(:basic_dependencies).to_s
|
51
59
|
sentence.get(:tokens).each do |token|
|
52
60
|
# Default annotations for all tokens
|
53
61
|
puts token.get(:value).to_s
|
data/lib/stanford-core-nlp.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module StanfordCoreNLP
|
2
2
|
|
3
|
-
VERSION = '0.
|
4
|
-
|
3
|
+
VERSION = '0.2.0'
|
4
|
+
|
5
5
|
require 'stanford-core-nlp/jar_loader'
|
6
6
|
require 'stanford-core-nlp/java_wrapper'
|
7
7
|
require 'stanford-core-nlp/config'
|
@@ -11,31 +11,31 @@ module StanfordCoreNLP
|
|
11
11
|
# with a trailing slash.
|
12
12
|
#
|
13
13
|
# The structure of the JAR folder must be as follows:
|
14
|
-
#
|
14
|
+
#
|
15
15
|
# Files:
|
16
|
-
#
|
16
|
+
#
|
17
17
|
# /stanford-core-nlp.jar
|
18
18
|
# /joda-time.jar
|
19
|
-
# /xom.jar
|
19
|
+
# /xom.jar
|
20
20
|
# /bridge.jar*
|
21
|
-
#
|
21
|
+
#
|
22
22
|
# Folders:
|
23
23
|
#
|
24
24
|
# /classifiers # Models for the NER system.
|
25
25
|
# /dcoref # Models for the coreference resolver.
|
26
26
|
# /taggers # Models for the POS tagger.
|
27
27
|
# /grammar # Models for the parser.
|
28
|
-
#
|
28
|
+
#
|
29
29
|
# *The file bridge.jar is a thin JAVA wrapper over the
|
30
|
-
# Stanford Core NLP get() function, which allows to
|
30
|
+
# Stanford Core NLP get() function, which allows to
|
31
31
|
# retrieve annotations using static classes as names.
|
32
32
|
# This works around one of the lacunae of Rjb.
|
33
33
|
attr_accessor :jar_path
|
34
|
-
# The path to the main folder containing the folders
|
34
|
+
# The path to the main folder containing the folders
|
35
35
|
# with the individual models inside. By default, this
|
36
36
|
# is the same as the JAR path.
|
37
37
|
attr_accessor :model_path
|
38
|
-
# The flags for starting the JVM machine. The parser
|
38
|
+
# The flags for starting the JVM machine. The parser
|
39
39
|
# and named entity recognizer are very memory consuming.
|
40
40
|
attr_accessor :jvm_args
|
41
41
|
# A file to redirect JVM output to.
|
@@ -54,8 +54,8 @@ module StanfordCoreNLP
|
|
54
54
|
# Turn logging off by default.
|
55
55
|
self.log_file = nil
|
56
56
|
|
57
|
-
# Use models for a given language. Language can be
|
58
|
-
# supplied as full-length, or ISO-639 2 or 3 letter
|
57
|
+
# Use models for a given language. Language can be
|
58
|
+
# supplied as full-length, or ISO-639 2 or 3 letter
|
59
59
|
# code (e.g. :english, :eng or :en will work).
|
60
60
|
def self.use(language)
|
61
61
|
lang = nil
|
@@ -70,19 +70,19 @@ module StanfordCoreNLP
|
|
70
70
|
n = n.to_s
|
71
71
|
n += '.model' if n == 'ner'
|
72
72
|
models.each do |m, file|
|
73
|
-
self.model_files["#{n}.#{m}"] =
|
73
|
+
self.model_files["#{n}.#{m}"] =
|
74
74
|
folder + file
|
75
75
|
end
|
76
76
|
elsif models.is_a?(String)
|
77
|
-
self.model_files["#{n}.model"] =
|
77
|
+
self.model_files["#{n}.model"] =
|
78
78
|
folder + models
|
79
79
|
end
|
80
80
|
end
|
81
81
|
end
|
82
|
-
|
82
|
+
|
83
83
|
# Use english by default.
|
84
|
-
self.use(:english)
|
85
|
-
|
84
|
+
self.use(:english)
|
85
|
+
|
86
86
|
# Set a model file. Here are the default models for English:
|
87
87
|
#
|
88
88
|
# 'pos.model' => 'english-left3words-distsim.tagger',
|
@@ -103,32 +103,40 @@ module StanfordCoreNLP
|
|
103
103
|
#
|
104
104
|
def self.set_model(name, file)
|
105
105
|
n = name.split('.')[0].intern
|
106
|
-
self.model_files[name] =
|
106
|
+
self.model_files[name] =
|
107
107
|
Config::ModelFolders[n] + file
|
108
108
|
end
|
109
109
|
|
110
110
|
# Whether the classes are initialized or not.
|
111
111
|
@@initialized = false
|
112
|
-
# Whether the JAR files are loaded or not.
|
113
|
-
@@loaded = false
|
114
112
|
|
115
113
|
# Load the JARs, create the classes.
|
116
114
|
def self.init
|
117
|
-
|
118
|
-
|
115
|
+
unless @@initialized
|
116
|
+
self.load_jars
|
117
|
+
self.load_default_classes
|
118
|
+
end
|
119
119
|
@@initialized = true
|
120
120
|
end
|
121
121
|
|
122
|
-
# Load a StanfordCoreNLP pipeline with the
|
123
|
-
# specified JVM flags and StanfordCoreNLP
|
122
|
+
# Load a StanfordCoreNLP pipeline with the
|
123
|
+
# specified JVM flags and StanfordCoreNLP
|
124
124
|
# properties.
|
125
125
|
def self.load(*annotators)
|
126
|
-
|
126
|
+
|
127
127
|
self.init unless @@initialized
|
128
|
+
|
128
129
|
# Prepend the JAR path to the model files.
|
129
130
|
properties = {}
|
130
|
-
self.model_files.each do |k,v|
|
131
|
-
|
131
|
+
self.model_files.each do |k,v|
|
132
|
+
found = false
|
133
|
+
annotators.each do |annotator|
|
134
|
+
found = true if k.index(annotator.to_s)
|
135
|
+
break if found
|
136
|
+
end
|
137
|
+
next unless found
|
138
|
+
f = self.model_path + v
|
139
|
+
puts f
|
132
140
|
unless File.readable?(f)
|
133
141
|
raise "Model file #{f} could not be found. " +
|
134
142
|
"You may need to download this file manually "+
|
@@ -137,14 +145,15 @@ module StanfordCoreNLP
|
|
137
145
|
properties[k] = f
|
138
146
|
end
|
139
147
|
end
|
148
|
+
|
140
149
|
properties['annotators'] =
|
141
150
|
annotators.map { |x| x.to_s }.join(', ')
|
142
151
|
CoreNLP.new(get_properties(properties))
|
143
152
|
end
|
144
|
-
|
145
|
-
# Once it loads a specific annotator model once,
|
146
|
-
# the program always loads the same models when
|
147
|
-
# you make new pipelines and request the annotator
|
153
|
+
|
154
|
+
# Once it loads a specific annotator model once,
|
155
|
+
# the program always loads the same models when
|
156
|
+
# you make new pipelines and request the annotator
|
148
157
|
# again, ignoring the changes in models.
|
149
158
|
#
|
150
159
|
# This function kills the JVM and reloads everything
|
@@ -153,26 +162,40 @@ module StanfordCoreNLP
|
|
153
162
|
#def self.reload
|
154
163
|
# raise 'Not implemented.'
|
155
164
|
#end
|
156
|
-
|
165
|
+
|
157
166
|
# Load the jars.
|
158
167
|
def self.load_jars
|
168
|
+
JarLoader.log(self.log_file)
|
159
169
|
JarLoader.jvm_args = self.jvm_args
|
160
170
|
JarLoader.jar_path = self.jar_path
|
161
171
|
JarLoader.load('joda-time.jar')
|
162
172
|
JarLoader.load('xom.jar')
|
163
173
|
JarLoader.load('stanford-corenlp.jar')
|
164
174
|
JarLoader.load('bridge.jar')
|
165
|
-
@@loaded = true
|
166
175
|
end
|
167
176
|
|
168
177
|
# Create the Ruby classes corresponding to the StanfordNLP
|
169
178
|
# core classes.
|
170
|
-
def self.
|
171
|
-
|
172
|
-
const_set(:
|
173
|
-
|
174
|
-
|
175
|
-
|
179
|
+
def self.load_default_classes
|
180
|
+
|
181
|
+
const_set(:CoreNLP,
|
182
|
+
Rjb::import('edu.stanford.nlp.pipeline.StanfordCoreNLP')
|
183
|
+
)
|
184
|
+
|
185
|
+
self.load_klass 'Annotation'
|
186
|
+
self.load_klass 'Word', 'edu.stanford.nlp.ling'
|
187
|
+
|
188
|
+
self.load_klass 'MaxentTagger', 'edu.stanford.nlp.tagger.maxent'
|
189
|
+
|
190
|
+
self.load_klass 'CRFClassifier', 'edu.stanford.nlp.ie.crf'
|
191
|
+
|
192
|
+
self.load_klass 'Properties', 'java.util'
|
193
|
+
self.load_klass 'ArrayList', 'java.util'
|
194
|
+
|
195
|
+
self.load_klass 'AnnotationBridge', ''
|
196
|
+
|
197
|
+
const_set(:Text, Annotation)
|
198
|
+
|
176
199
|
end
|
177
200
|
|
178
201
|
# Load a class (e.g. PTBTokenizerAnnotator) in a specific
|
@@ -198,12 +221,10 @@ module StanfordCoreNLP
|
|
198
221
|
# - DeterministicCorefAnnotator - implements anaphora resolution using a deterministic model (newer model, use this!).
|
199
222
|
# - NFLAnnotator - implements entity and relation mention extraction for the NFL domain.
|
200
223
|
def self.load_class(klass, base = 'edu.stanford.nlp.pipeline')
|
201
|
-
self.
|
202
|
-
|
224
|
+
self.init unless @@initialized
|
225
|
+
self.load_klass(klass, base)
|
203
226
|
end
|
204
227
|
|
205
|
-
# Private helper functions.
|
206
|
-
private
|
207
228
|
# Create a java.util.Properties object from a hash.
|
208
229
|
def self.get_properties(properties)
|
209
230
|
props = Properties.new
|
@@ -213,9 +234,28 @@ module StanfordCoreNLP
|
|
213
234
|
props
|
214
235
|
end
|
215
236
|
|
237
|
+
# Get a Java ArrayList binding to pass lists
|
238
|
+
# of tokens to the Stanford Core NLP process.
|
239
|
+
def self.get_list(tokens)
|
240
|
+
list = StanfordCoreNLP::ArrayList.new
|
241
|
+
tokens.each do |t|
|
242
|
+
list.add(StanfordCoreNLP::Word.new(t.to_s))
|
243
|
+
end
|
244
|
+
list
|
245
|
+
end
|
246
|
+
|
216
247
|
# Under_case -> CamelCase.
|
217
248
|
def self.camel_case(text)
|
218
|
-
text.to_s.gsub(/^[a-z]|_[a-z]/)
|
249
|
+
text.to_s.gsub(/^[a-z]|_[a-z]/) do |a|
|
250
|
+
a.upcase
|
251
|
+
end.gsub('_', '')
|
252
|
+
end
|
253
|
+
|
254
|
+
private
|
255
|
+
def self.load_klass(klass, base = 'edu.stanford.nlp.pipeline')
|
256
|
+
base += '.' unless base == ''
|
257
|
+
const_set(klass.intern,
|
258
|
+
Rjb::import("#{base}#{klass}"))
|
219
259
|
end
|
220
260
|
|
221
261
|
end
|
@@ -22,14 +22,14 @@ module StanfordCoreNLP
|
|
22
22
|
# Get an annotation using the annotation bridge.
|
23
23
|
def get(annotation, anno_base = nil)
|
24
24
|
if !java_methods.include?('get(Ljava.lang.Class;)')
|
25
|
-
raise'No annotation can be retrieved on this object.'
|
25
|
+
raise 'No annotation can be retrieved on this object.'
|
26
26
|
else
|
27
27
|
anno_class = "#{StanfordCoreNLP.camel_case(annotation)}Annotation"
|
28
28
|
if anno_base
|
29
|
-
raise "The path #{anno_base} doesn't exist." unless Annotations[anno_base]
|
29
|
+
raise "The path #{anno_base} doesn't exist." unless StanfordNLP::Config::Annotations[anno_base]
|
30
30
|
anno_bases = [anno_base]
|
31
31
|
else
|
32
|
-
anno_bases = Config::AnnotationsByName[anno_class]
|
32
|
+
anno_bases = StanfordCoreNLP::Config::AnnotationsByName[anno_class]
|
33
33
|
raise "The annotation #{anno_class} doesn't exist." unless anno_bases
|
34
34
|
end
|
35
35
|
if anno_bases.size > 1
|
@@ -41,9 +41,10 @@ module StanfordCoreNLP
|
|
41
41
|
base_class = anno_bases[0]
|
42
42
|
end
|
43
43
|
url = "edu.stanford.#{base_class}$#{anno_class}"
|
44
|
-
AnnotationBridge.getAnnotation(self, url)
|
44
|
+
StanfordCoreNLP::AnnotationBridge.getAnnotation(self, url)
|
45
45
|
end
|
46
46
|
end
|
47
47
|
|
48
48
|
end
|
49
|
+
|
49
50
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stanford-core-nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-03-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rjb
|
16
|
-
requirement: &
|
16
|
+
requirement: &70252071542960 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70252071542960
|
25
25
|
description: ! " High-level Ruby bindings to the Stanford CoreNLP package, a set natural
|
26
26
|
language processing \ntools that provides tokenization, part-of-speech tagging and
|
27
27
|
parsing for several languages, as well as named entity \nrecognition and coreference
|