stanford-core-nlp 0.1 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README +2 -2
- data/lib/stanford-core-nlp.rb +53 -62
- data/lib/stanford-core-nlp/jar-loader.rb +15 -21
- data/lib/stanford-core-nlp/java-wrapper.rb +37 -0
- metadata +8 -7
data/README
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
1
|
+
Ruby bindings for the Stanford CoreNLP package
|
2
2
|
|
3
|
-
See the wiki for more information at https://github.com/louismullie/
|
3
|
+
See the wiki for more information at https://github.com/louismullie/stanford-core-nlp/wiki/.
|
data/lib/stanford-core-nlp.rb
CHANGED
@@ -1,65 +1,82 @@
|
|
1
1
|
module StanfordCoreNLP
|
2
2
|
|
3
|
-
VERSION = '0.1'
|
3
|
+
VERSION = '0.1.1'
|
4
4
|
require 'stanford-core-nlp/jar-loader.rb'
|
5
|
-
|
5
|
+
require 'stanford-core-nlp/java-wrapper.rb'
|
6
|
+
|
6
7
|
class << self
|
7
8
|
# The path in which to look for the Stanford JAR files.
|
8
9
|
# This is passed to JarLoader.
|
9
10
|
attr_accessor :jar_path
|
10
11
|
# The flags for starting the JVM.
|
11
12
|
# The parser and named entity recognizer are very memory-intensive.
|
12
|
-
attr_accessor :
|
13
|
+
attr_accessor :jvm_args
|
14
|
+
# A file to redirect JVM output to.
|
15
|
+
attr_accessor :log_file
|
16
|
+
# The model files. Use #set_model to modify these.
|
17
|
+
attr_accessor :model_files
|
13
18
|
end
|
14
19
|
|
20
|
+
# The default JAR path is the gem's bin folder.
|
15
21
|
self.jar_path = File.dirname(__FILE__) + '/../bin/'
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
#
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
22
|
+
# Load the JVM with a minimum heap size of 512MB and a
|
23
|
+
# maximum heap size of 1024MB.
|
24
|
+
self.jvm_args = ['-Xms512M', '-Xmx1024M']
|
25
|
+
# Turn logging off by default.
|
26
|
+
self.log_file = nil
|
27
|
+
|
28
|
+
# Default model files.
|
29
|
+
self.model_files = {
|
30
|
+
'pos.model' => 'taggers/english-left3words-distsim.tagger',
|
31
|
+
'ner.model.3class' => 'classifiers/all.3class.distsim.crf.ser.gz',
|
32
|
+
'ner.model.7class' => 'classifiers/muc.7class.distsim.crf.ser.gz',
|
33
|
+
'ner.model.MISCclass' => 'classifiers/conll.4class.distsim.crf.ser.gz',
|
34
|
+
'parser.model' => 'grammar/englishPCFG.ser.gz',
|
35
|
+
'dcoref.demonym' => 'dcoref/demonyms.txt',
|
36
|
+
'dcoref.animate' => 'dcoref/animate.unigrams.txt',
|
37
|
+
'dcoref.female' => 'dcoref/female.unigrams.txt',
|
38
|
+
'dcoref.inanimate' => 'dcoref/inanimate.unigrams.txt',
|
39
|
+
'dcoref.male' => 'dcoref/male.unigrams.txt',
|
40
|
+
'dcoref.neutral' => 'dcoref/neutral.unigrams.txt',
|
41
|
+
'dcoref.plural' => 'dcoref/plural.unigrams.txt',
|
42
|
+
'dcoref.singular' => 'dcoref/singular.unigrams.txt',
|
43
|
+
'dcoref.states' => 'dcoref/state-abbreviations.txt',
|
44
|
+
'dcoref.countries' => 'dcoref/unknown.txt', # Fix - can somebody provide this file?
|
45
|
+
'dcoref.states.provinces' => 'dcoref/unknown.txt', # Fix - can somebody provide this file?
|
46
|
+
'dcoref.extra.gender' => 'dcoref/namegender.combine.txt'
|
47
|
+
}
|
48
|
+
|
49
|
+
# Set a model file.
|
50
|
+
def self.set_model(name, file)
|
51
|
+
self.model_files[name] = file
|
41
52
|
end
|
42
|
-
|
53
|
+
|
43
54
|
# Load a StanfordCoreNLP pipeline with the specified JVM flags and
|
44
55
|
# StanfordCoreNLP properties (hash of property => values).
|
45
|
-
def self.load(
|
46
|
-
self.load_jars(
|
56
|
+
def self.load(*annotators)
|
57
|
+
self.load_jars(self.jvm_args, self.jar_path, self.log_file)
|
47
58
|
self.create_classes
|
48
|
-
|
59
|
+
# Prepend the JAR path to the model files.
|
60
|
+
properties = {}
|
61
|
+
self.model_files.each { |k,v| properties[k] = self.jar_path + v }
|
62
|
+
properties['annotators'] =
|
63
|
+
annotators.map { |x| x.to_s }.join(', ')
|
49
64
|
CoreNLP.new(get_properties(properties))
|
50
65
|
end
|
51
66
|
|
52
67
|
# Load the jars.
|
53
|
-
def self.load_jars(
|
54
|
-
JarLoader.
|
68
|
+
def self.load_jars(jvm_args, jar_path, log_file)
|
69
|
+
JarLoader.jvm_args = jvm_args
|
55
70
|
JarLoader.jar_path = jar_path
|
71
|
+
JarLoader.log(log_file) if log_file
|
56
72
|
JarLoader.load('joda-time.jar')
|
57
73
|
JarLoader.load('xom.jar')
|
58
74
|
JarLoader.load('stanford-corenlp.jar')
|
59
75
|
JarLoader.load('bridge.jar')
|
60
76
|
end
|
61
77
|
|
62
|
-
# Create the Ruby classes
|
78
|
+
# Create the Ruby classes corresponding to the StanfordNLP
|
79
|
+
# core classes.
|
63
80
|
def self.create_classes
|
64
81
|
const_set(:CoreNLP, Rjb::import('edu.stanford.nlp.pipeline.StanfordCoreNLP'))
|
65
82
|
const_set(:Annotation, Rjb::import('edu.stanford.nlp.pipeline.Annotation'))
|
@@ -76,31 +93,5 @@ module StanfordCoreNLP
|
|
76
93
|
end
|
77
94
|
props
|
78
95
|
end
|
79
|
-
|
80
|
-
Rjb::Rjb_JavaProxy.class_eval do
|
81
|
-
|
82
|
-
# Get an annotation using the annotation bridge.
|
83
|
-
def get(annotation)
|
84
|
-
base_class = (annotation.to_s.split('_')[0] == 'coref') ?
|
85
|
-
'edu.stanford.nlp.dcoref.CorefCoreAnnotations$' :
|
86
|
-
'edu.stanford.nlp.ling.CoreAnnotations$'
|
87
|
-
anno_class = annotation.to_s.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
|
88
|
-
url = "#{base_class}#{anno_class}Annotation"
|
89
|
-
AnnotationBridge.getAnnotation(self, url)
|
90
|
-
end
|
91
|
-
|
92
|
-
# Shorthand for to_string defined by Java classes.
|
93
|
-
def to_s; to_string; end
|
94
|
-
|
95
|
-
# Provide Ruby-style iterators to wrap Java iterators.
|
96
|
-
def each
|
97
|
-
if !java_methods.include?('iterator()')
|
98
|
-
raise 'This object cannot be iterated.'
|
99
|
-
else
|
100
|
-
i = self.iterator
|
101
|
-
while i.has_next; yield i.next;end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
96
|
+
|
106
97
|
end
|
@@ -1,5 +1,4 @@
|
|
1
1
|
module StanfordCoreNLP
|
2
|
-
|
3
2
|
class JarLoader
|
4
3
|
|
5
4
|
require 'rjb'
|
@@ -7,28 +6,32 @@ module StanfordCoreNLP
|
|
7
6
|
# Configuration options.
|
8
7
|
class << self
|
9
8
|
# An array of flags to pass to the JVM.
|
10
|
-
attr_accessor :
|
9
|
+
attr_accessor :jvm_args
|
11
10
|
attr_accessor :jar_path
|
12
11
|
attr_accessor :log_file
|
13
12
|
end
|
14
13
|
|
15
14
|
# An array of string flags to supply to the JVM, e.g. ['-Xms512M', '-Xmx1024M']
|
16
|
-
self.
|
15
|
+
self.jvm_args = []
|
17
16
|
# The path in which to look for Jars.
|
18
17
|
self.jar_path = ''
|
19
|
-
#
|
20
|
-
# Setting this before the parser automatically calls self.redirect_to_log
|
18
|
+
# By default, disable logging.
|
21
19
|
self.log_file = nil
|
22
|
-
|
20
|
+
|
23
21
|
# Load Rjb and create Java VM.
|
24
22
|
def self.rjb_initialize
|
25
23
|
return if ::Rjb::loaded?
|
26
|
-
::Rjb::load(nil, self.
|
27
|
-
|
24
|
+
::Rjb::load(nil, self.jvm_args)
|
25
|
+
set_java_logging if self.log_file
|
28
26
|
end
|
29
|
-
|
30
|
-
#
|
31
|
-
def self.
|
27
|
+
|
28
|
+
# Enable logging.
|
29
|
+
def self.log(file = 'log.txt')
|
30
|
+
self.log_file = file
|
31
|
+
end
|
32
|
+
|
33
|
+
# Redirect the output of the JVM to the supplied log file.
|
34
|
+
def self.set_java_logging
|
32
35
|
const_set(:System, Rjb::import('java.lang.System'))
|
33
36
|
const_set(:PrintStream, Rjb::import('java.io.PrintStream'))
|
34
37
|
const_set(:File2, Rjb::import('java.io.File'))
|
@@ -37,7 +40,7 @@ module StanfordCoreNLP
|
|
37
40
|
System.setOut(ps)
|
38
41
|
System.setErr(ps)
|
39
42
|
end
|
40
|
-
|
43
|
+
|
41
44
|
# Load a jar.
|
42
45
|
def self.load(jar)
|
43
46
|
self.rjb_initialize
|
@@ -48,14 +51,5 @@ module StanfordCoreNLP
|
|
48
51
|
::Rjb::add_jar(jar)
|
49
52
|
end
|
50
53
|
|
51
|
-
# Silence output and log to file.
|
52
|
-
def self.log(file = 'log.txt')
|
53
|
-
@@log_file = file
|
54
|
-
end
|
55
|
-
|
56
|
-
# Whether the output is logged or not.
|
57
|
-
def self.log?; @@log_file; end
|
58
|
-
|
59
54
|
end
|
60
|
-
|
61
55
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module StanfordCoreNLP
|
2
|
+
|
3
|
+
# Modify the Rjb JavaProxy class to add our own methods to every Java object.
|
4
|
+
Rjb::Rjb_JavaProxy.class_eval do
|
5
|
+
|
6
|
+
# Dynamically defined on all proxied Java objects.
|
7
|
+
# Shorthand for to_string defined by Java classes.
|
8
|
+
def to_s; to_string; end
|
9
|
+
|
10
|
+
# Dynamically defined on all proxied Java iterators.
|
11
|
+
# Provide Ruby-style iterators to wrap Java iterators.
|
12
|
+
def each
|
13
|
+
if !java_methods.include?('iterator()')
|
14
|
+
raise 'This object cannot be iterated.'
|
15
|
+
else
|
16
|
+
i = self.iterator
|
17
|
+
while i.has_next; yield i.next; end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
# Dynamically defined on all proxied annotation classes.
|
22
|
+
# Get an annotation using the annotation bridge.
|
23
|
+
def get(annotation)
|
24
|
+
if !java_methods.include?('get(Ljava.lang.Class;)')
|
25
|
+
raise 'No annotation can be retrieved on this object.'
|
26
|
+
else
|
27
|
+
base_class = (annotation.to_s.split('_')[0] == 'coref') ?
|
28
|
+
'edu.stanford.nlp.dcoref.CorefCoreAnnotations$' :
|
29
|
+
'edu.stanford.nlp.ling.CoreAnnotations$'
|
30
|
+
anno_class = annotation.to_s.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
|
31
|
+
url = "#{base_class}#{anno_class}Annotation"
|
32
|
+
AnnotationBridge.getAnnotation(self, url)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stanford-core-nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-01-28 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rjb
|
16
|
-
requirement: &
|
16
|
+
requirement: &70258761325680 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
25
|
-
description: !
|
26
|
-
language processing
|
27
|
-
lemmatization, named entity recognition,
|
24
|
+
version_requirements: *70258761325680
|
25
|
+
description: ! " High-level Ruby bindings to the Stanford CoreNLP package, a set of natural
|
26
|
+
language processing \ntools for English, including tokenization, part-of-speech
|
27
|
+
tagging, lemmatization, named entity recognition, \nparsing, and coreference resolution. "
|
28
28
|
email:
|
29
29
|
- louis.mullie@gmail.com
|
30
30
|
executables: []
|
@@ -32,6 +32,7 @@ extensions: []
|
|
32
32
|
extra_rdoc_files: []
|
33
33
|
files:
|
34
34
|
- lib/stanford-core-nlp/jar-loader.rb
|
35
|
+
- lib/stanford-core-nlp/java-wrapper.rb
|
35
36
|
- lib/stanford-core-nlp.rb
|
36
37
|
- bin/bridge.jar
|
37
38
|
- bin/classifiers/all.3class.distsim.crf.ser.gz
|
@@ -86,5 +87,5 @@ rubyforge_project:
|
|
86
87
|
rubygems_version: 1.8.15
|
87
88
|
signing_key:
|
88
89
|
specification_version: 3
|
89
|
-
summary: Ruby bindings to the Stanford
|
90
|
+
summary: Ruby bindings to the Stanford Core NLP tools.
|
90
91
|
test_files: []
|