stanford-core-nlp 0.1.7 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.markdown +13 -5
- data/lib/stanford-core-nlp.rb +84 -44
- data/lib/stanford-core-nlp/java_wrapper.rb +5 -4
- metadata +4 -4
data/README.markdown
CHANGED
@@ -1,16 +1,22 @@
|
|
1
|
+
[](http://travis-ci.org/louismullie/stanford-core-nlp)
|
2
|
+
|
1
3
|
**About**
|
2
4
|
|
3
|
-
This gem provides high-level Ruby bindings to the [Stanford Core NLP package](http://nlp.stanford.edu/software/corenlp.shtml), a set natural language processing tools that provides tokenization, part-of-speech tagging, lemmatization, and parsing for
|
5
|
+
This gem provides high-level Ruby bindings to the [Stanford Core NLP package](http://nlp.stanford.edu/software/corenlp.shtml), a set of natural language processing tools that provide tokenization, part-of-speech tagging, lemmatization, and parsing for several languages, as well as named entity recognition and coreference resolution for English. This gem is compatible with Ruby 1.9.2 and above.
|
4
6
|
|
5
7
|
**Installing**
|
6
8
|
|
7
|
-
|
9
|
+
First, install the gem: `gem install stanford-core-nlp`. Then, download the Stanford Core NLP JAR and model files. Three different packages are available:
|
10
|
+
|
11
|
+
* A [minimal package for English](http://louismullie.com/stanford-core-nlp-minimal.zip) with one tagger model and one parser model for English.
|
12
|
+
* A [full package for English](http://louismullie.com/stanford-core-nlp-english.zip), with all tagger and parser models for English, plus the coreference resolution and named entity recognition models.
|
13
|
+
* A [full package for all languages](http://louismullie.com/stanford-core-nlp-all.zip), including tagger and parser models for English, French, German, Arabic and Chinese.
|
8
14
|
|
9
|
-
|
15
|
+
Place the contents of the extracted archive inside the /bin/ folder of the stanford-core-nlp gem (e.g. /usr/local/lib/ruby/gems/1.X.x/gems/stanford-core-nlp-0.x/bin/).
|
10
16
|
|
11
17
|
**Configuration**
|
12
18
|
|
13
|
-
After installing and requiring the gem (`require 'stanford-core-nlp'`), you may want to set some configuration options
|
19
|
+
After installing and requiring the gem (`require 'stanford-core-nlp'`), you may want to set some optional configuration options. Here are some examples:
|
14
20
|
|
15
21
|
```ruby
|
16
22
|
# Set an alternative path to look for the JAR files
|
@@ -19,7 +25,7 @@ StanfordCoreNLP.jar_path = '/path_to_jars/'
|
|
19
25
|
|
20
26
|
# Set an alternative path to look for the model files
|
21
27
|
# Default is gem's bin folder.
|
22
|
-
StanfordCoreNLP.
|
28
|
+
StanfordCoreNLP.model_path = '/path_to_models/'
|
23
29
|
|
24
30
|
# Pass some alternative arguments to the Java VM.
|
25
31
|
# Default is ['-Xms512M', '-Xmx1024M'] (be prepared
|
@@ -48,6 +54,8 @@ text = StanfordCoreNLP::Text.new(text)
|
|
48
54
|
pipeline.annotate(text)
|
49
55
|
|
50
56
|
text.get(:sentences).each do |sentence|
|
57
|
+
# Syntactic dependencies
|
58
|
+
puts sentence.get(:basic_dependencies).to_s
|
51
59
|
sentence.get(:tokens).each do |token|
|
52
60
|
# Default annotations for all tokens
|
53
61
|
puts token.get(:value).to_s
|
data/lib/stanford-core-nlp.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module StanfordCoreNLP
|
2
2
|
|
3
|
-
VERSION = '0.
|
4
|
-
|
3
|
+
VERSION = '0.2.0'
|
4
|
+
|
5
5
|
require 'stanford-core-nlp/jar_loader'
|
6
6
|
require 'stanford-core-nlp/java_wrapper'
|
7
7
|
require 'stanford-core-nlp/config'
|
@@ -11,31 +11,31 @@ module StanfordCoreNLP
|
|
11
11
|
# with a trailing slash.
|
12
12
|
#
|
13
13
|
# The structure of the JAR folder must be as follows:
|
14
|
-
#
|
14
|
+
#
|
15
15
|
# Files:
|
16
|
-
#
|
16
|
+
#
|
17
17
|
# /stanford-corenlp.jar
|
18
18
|
# /joda-time.jar
|
19
|
-
# /xom.jar
|
19
|
+
# /xom.jar
|
20
20
|
# /bridge.jar*
|
21
|
-
#
|
21
|
+
#
|
22
22
|
# Folders:
|
23
23
|
#
|
24
24
|
# /classifiers # Models for the NER system.
|
25
25
|
# /dcoref # Models for the coreference resolver.
|
26
26
|
# /taggers # Models for the POS tagger.
|
27
27
|
# /grammar # Models for the parser.
|
28
|
-
#
|
28
|
+
#
|
29
29
|
# *The file bridge.jar is a thin JAVA wrapper over the
|
30
|
-
# Stanford Core NLP get() function, which allows to
|
30
|
+
# Stanford Core NLP get() function, which allows to
|
31
31
|
# retrieve annotations using static classes as names.
|
32
32
|
# This works around one of the lacunae of Rjb.
|
33
33
|
attr_accessor :jar_path
|
34
|
-
# The path to the main folder containing the folders
|
34
|
+
# The path to the main folder containing the folders
|
35
35
|
# with the individual models inside. By default, this
|
36
36
|
# is the same as the JAR path.
|
37
37
|
attr_accessor :model_path
|
38
|
-
# The flags for starting the JVM machine. The parser
|
38
|
+
# The flags for starting the JVM machine. The parser
|
39
39
|
# and named entity recognizer are very memory consuming.
|
40
40
|
attr_accessor :jvm_args
|
41
41
|
# A file to redirect JVM output to.
|
@@ -54,8 +54,8 @@ module StanfordCoreNLP
|
|
54
54
|
# Turn logging off by default.
|
55
55
|
self.log_file = nil
|
56
56
|
|
57
|
-
# Use models for a given language. Language can be
|
58
|
-
# supplied as full-length, or ISO-639 2 or 3 letter
|
57
|
+
# Use models for a given language. Language can be
|
58
|
+
# supplied as full-length, or ISO-639 2 or 3 letter
|
59
59
|
# code (e.g. :english, :eng or :en will work).
|
60
60
|
def self.use(language)
|
61
61
|
lang = nil
|
@@ -70,19 +70,19 @@ module StanfordCoreNLP
|
|
70
70
|
n = n.to_s
|
71
71
|
n += '.model' if n == 'ner'
|
72
72
|
models.each do |m, file|
|
73
|
-
self.model_files["#{n}.#{m}"] =
|
73
|
+
self.model_files["#{n}.#{m}"] =
|
74
74
|
folder + file
|
75
75
|
end
|
76
76
|
elsif models.is_a?(String)
|
77
|
-
self.model_files["#{n}.model"] =
|
77
|
+
self.model_files["#{n}.model"] =
|
78
78
|
folder + models
|
79
79
|
end
|
80
80
|
end
|
81
81
|
end
|
82
|
-
|
82
|
+
|
83
83
|
# Use english by default.
|
84
|
-
self.use(:english)
|
85
|
-
|
84
|
+
self.use(:english)
|
85
|
+
|
86
86
|
# Set a model file. Here are the default models for English:
|
87
87
|
#
|
88
88
|
# 'pos.model' => 'english-left3words-distsim.tagger',
|
@@ -103,32 +103,40 @@ module StanfordCoreNLP
|
|
103
103
|
#
|
104
104
|
def self.set_model(name, file)
|
105
105
|
n = name.split('.')[0].intern
|
106
|
-
self.model_files[name] =
|
106
|
+
self.model_files[name] =
|
107
107
|
Config::ModelFolders[n] + file
|
108
108
|
end
|
109
109
|
|
110
110
|
# Whether the classes are initialized or not.
|
111
111
|
@@initialized = false
|
112
|
-
# Whether the JAR files are loaded or not.
|
113
|
-
@@loaded = false
|
114
112
|
|
115
113
|
# Load the JARs, create the classes.
|
116
114
|
def self.init
|
117
|
-
|
118
|
-
|
115
|
+
unless @@initialized
|
116
|
+
self.load_jars
|
117
|
+
self.load_default_classes
|
118
|
+
end
|
119
119
|
@@initialized = true
|
120
120
|
end
|
121
121
|
|
122
|
-
# Load a StanfordCoreNLP pipeline with the
|
123
|
-
# specified JVM flags and StanfordCoreNLP
|
122
|
+
# Load a StanfordCoreNLP pipeline with the
|
123
|
+
# specified JVM flags and StanfordCoreNLP
|
124
124
|
# properties.
|
125
125
|
def self.load(*annotators)
|
126
|
-
|
126
|
+
|
127
127
|
self.init unless @@initialized
|
128
|
+
|
128
129
|
# Prepend the JAR path to the model files.
|
129
130
|
properties = {}
|
130
|
-
self.model_files.each do |k,v|
|
131
|
-
|
131
|
+
self.model_files.each do |k,v|
|
132
|
+
found = false
|
133
|
+
annotators.each do |annotator|
|
134
|
+
found = true if k.index(annotator.to_s)
|
135
|
+
break if found
|
136
|
+
end
|
137
|
+
next unless found
|
138
|
+
f = self.model_path + v
|
139
|
+
puts f
|
132
140
|
unless File.readable?(f)
|
133
141
|
raise "Model file #{f} could not be found. " +
|
134
142
|
"You may need to download this file manually "+
|
@@ -137,14 +145,15 @@ module StanfordCoreNLP
|
|
137
145
|
properties[k] = f
|
138
146
|
end
|
139
147
|
end
|
148
|
+
|
140
149
|
properties['annotators'] =
|
141
150
|
annotators.map { |x| x.to_s }.join(', ')
|
142
151
|
CoreNLP.new(get_properties(properties))
|
143
152
|
end
|
144
|
-
|
145
|
-
# Once it loads a specific annotator model once,
|
146
|
-
# the program always loads the same models when
|
147
|
-
# you make new pipelines and request the annotator
|
153
|
+
|
154
|
+
# Once it has loaded a specific annotator model,
|
155
|
+
# the program always loads the same models when
|
156
|
+
# you make new pipelines and request the annotator
|
148
157
|
# again, ignoring the changes in models.
|
149
158
|
#
|
150
159
|
# This function kills the JVM and reloads everything
|
@@ -153,26 +162,40 @@ module StanfordCoreNLP
|
|
153
162
|
#def self.reload
|
154
163
|
# raise 'Not implemented.'
|
155
164
|
#end
|
156
|
-
|
165
|
+
|
157
166
|
# Load the jars.
|
158
167
|
def self.load_jars
|
168
|
+
JarLoader.log(self.log_file)
|
159
169
|
JarLoader.jvm_args = self.jvm_args
|
160
170
|
JarLoader.jar_path = self.jar_path
|
161
171
|
JarLoader.load('joda-time.jar')
|
162
172
|
JarLoader.load('xom.jar')
|
163
173
|
JarLoader.load('stanford-corenlp.jar')
|
164
174
|
JarLoader.load('bridge.jar')
|
165
|
-
@@loaded = true
|
166
175
|
end
|
167
176
|
|
168
177
|
# Create the Ruby classes corresponding to the StanfordNLP
|
169
178
|
# core classes.
|
170
|
-
def self.
|
171
|
-
|
172
|
-
const_set(:
|
173
|
-
|
174
|
-
|
175
|
-
|
179
|
+
def self.load_default_classes
|
180
|
+
|
181
|
+
const_set(:CoreNLP,
|
182
|
+
Rjb::import('edu.stanford.nlp.pipeline.StanfordCoreNLP')
|
183
|
+
)
|
184
|
+
|
185
|
+
self.load_klass 'Annotation'
|
186
|
+
self.load_klass 'Word', 'edu.stanford.nlp.ling'
|
187
|
+
|
188
|
+
self.load_klass 'MaxentTagger', 'edu.stanford.nlp.tagger.maxent'
|
189
|
+
|
190
|
+
self.load_klass 'CRFClassifier', 'edu.stanford.nlp.ie.crf'
|
191
|
+
|
192
|
+
self.load_klass 'Properties', 'java.util'
|
193
|
+
self.load_klass 'ArrayList', 'java.util'
|
194
|
+
|
195
|
+
self.load_klass 'AnnotationBridge', ''
|
196
|
+
|
197
|
+
const_set(:Text, Annotation)
|
198
|
+
|
176
199
|
end
|
177
200
|
|
178
201
|
# Load a class (e.g. PTBTokenizerAnnotator) in a specific
|
@@ -198,12 +221,10 @@ module StanfordCoreNLP
|
|
198
221
|
# - DeterministicCorefAnnotator - implements anaphora resolution using a deterministic model (newer model, use this!).
|
199
222
|
# - NFLAnnotator - implements entity and relation mention extraction for the NFL domain.
|
200
223
|
def self.load_class(klass, base = 'edu.stanford.nlp.pipeline')
|
201
|
-
self.
|
202
|
-
|
224
|
+
self.init unless @@initialized
|
225
|
+
self.load_klass(klass, base)
|
203
226
|
end
|
204
227
|
|
205
|
-
# Private helper functions.
|
206
|
-
private
|
207
228
|
# Create a java.util.Properties object from a hash.
|
208
229
|
def self.get_properties(properties)
|
209
230
|
props = Properties.new
|
@@ -213,9 +234,28 @@ module StanfordCoreNLP
|
|
213
234
|
props
|
214
235
|
end
|
215
236
|
|
237
|
+
# Get a Java ArrayList binding to pass lists
|
238
|
+
# of tokens to the Stanford Core NLP process.
|
239
|
+
def self.get_list(tokens)
|
240
|
+
list = StanfordCoreNLP::ArrayList.new
|
241
|
+
tokens.each do |t|
|
242
|
+
list.add(StanfordCoreNLP::Word.new(t.to_s))
|
243
|
+
end
|
244
|
+
list
|
245
|
+
end
|
246
|
+
|
216
247
|
# Under_case -> CamelCase.
|
217
248
|
def self.camel_case(text)
|
218
|
-
text.to_s.gsub(/^[a-z]|_[a-z]/)
|
249
|
+
text.to_s.gsub(/^[a-z]|_[a-z]/) do |a|
|
250
|
+
a.upcase
|
251
|
+
end.gsub('_', '')
|
252
|
+
end
|
253
|
+
|
254
|
+
private
|
255
|
+
def self.load_klass(klass, base = 'edu.stanford.nlp.pipeline')
|
256
|
+
base += '.' unless base == ''
|
257
|
+
const_set(klass.intern,
|
258
|
+
Rjb::import("#{base}#{klass}"))
|
219
259
|
end
|
220
260
|
|
221
261
|
end
|
@@ -22,14 +22,14 @@ module StanfordCoreNLP
|
|
22
22
|
# Get an annotation using the annotation bridge.
|
23
23
|
def get(annotation, anno_base = nil)
|
24
24
|
if !java_methods.include?('get(Ljava.lang.Class;)')
|
25
|
-
raise'No annotation can be retrieved on this object.'
|
25
|
+
raise 'No annotation can be retrieved on this object.'
|
26
26
|
else
|
27
27
|
anno_class = "#{StanfordCoreNLP.camel_case(annotation)}Annotation"
|
28
28
|
if anno_base
|
29
|
-
raise "The path #{anno_base} doesn't exist." unless Annotations[anno_base]
|
29
|
+
raise "The path #{anno_base} doesn't exist." unless StanfordNLP::Config::Annotations[anno_base]
|
30
30
|
anno_bases = [anno_base]
|
31
31
|
else
|
32
|
-
anno_bases = Config::AnnotationsByName[anno_class]
|
32
|
+
anno_bases = StanfordCoreNLP::Config::AnnotationsByName[anno_class]
|
33
33
|
raise "The annotation #{anno_class} doesn't exist." unless anno_bases
|
34
34
|
end
|
35
35
|
if anno_bases.size > 1
|
@@ -41,9 +41,10 @@ module StanfordCoreNLP
|
|
41
41
|
base_class = anno_bases[0]
|
42
42
|
end
|
43
43
|
url = "edu.stanford.#{base_class}$#{anno_class}"
|
44
|
-
AnnotationBridge.getAnnotation(self, url)
|
44
|
+
StanfordCoreNLP::AnnotationBridge.getAnnotation(self, url)
|
45
45
|
end
|
46
46
|
end
|
47
47
|
|
48
48
|
end
|
49
|
+
|
49
50
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stanford-core-nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-03-04 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rjb
|
16
|
-
requirement: &
|
16
|
+
requirement: &70252071542960 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70252071542960
|
25
25
|
description: ! " High-level Ruby bindings to the Stanford CoreNLP package, a set natural
|
26
26
|
language processing \ntools that provides tokenization, part-of-speech tagging and
|
27
27
|
parsing for several languages, as well as named entity \nrecognition and coreference
|