stanford-core-nlp 0.1 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +2 -2
- data/lib/stanford-core-nlp.rb +53 -62
- data/lib/stanford-core-nlp/jar-loader.rb +15 -21
- data/lib/stanford-core-nlp/java-wrapper.rb +37 -0
- metadata +8 -7
data/README
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
1
|
+
Ruby bindings for the Stanford CoreNLP package
|
2
2
|
|
3
|
-
See the wiki for more information at https://github.com/louismullie/
|
3
|
+
See the wiki for more information at https://github.com/louismullie/stanford-core-nlp/wiki/.
|
data/lib/stanford-core-nlp.rb
CHANGED
@@ -1,65 +1,82 @@
|
|
1
1
|
module StanfordCoreNLP
|
2
2
|
|
3
|
-
VERSION = '0.1'
|
3
|
+
VERSION = '0.1.1'
|
4
4
|
require 'stanford-core-nlp/jar-loader.rb'
|
5
|
-
|
5
|
+
require 'stanford-core-nlp/java-wrapper.rb'
|
6
|
+
|
6
7
|
class << self
|
7
8
|
# The path in which to look for the Stanford JAR files.
|
8
9
|
# This is passed to JarLoader.
|
9
10
|
attr_accessor :jar_path
|
10
11
|
# The flags for starting the JVM.
|
11
12
|
# Parser and named entity recognizer are very memory consuming.
|
12
|
-
attr_accessor :
|
13
|
+
attr_accessor :jvm_args
|
14
|
+
# A file to redirect JVM output to.
|
15
|
+
attr_accessor :log_file
|
16
|
+
# The model files. Use #set_model to modify these.
|
17
|
+
attr_accessor :model_files
|
13
18
|
end
|
14
19
|
|
20
|
+
# The default JAR path is the gem's bin folder.
|
15
21
|
self.jar_path = File.dirname(__FILE__) + '/../bin/'
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
#
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
22
|
+
# Load the JVM with a minimum heap size of 512MB and a
|
23
|
+
# maximum heap size of 1024MB.
|
24
|
+
self.jvm_args = ['-Xms512M', '-Xmx1024M']
|
25
|
+
# Turn logging off by default.
|
26
|
+
self.log_file = nil
|
27
|
+
|
28
|
+
# Default model files.
|
29
|
+
self.model_files = {
|
30
|
+
'pos.model' => 'taggers/english-left3words-distsim.tagger',
|
31
|
+
'ner.model.3class' => 'classifiers/all.3class.distsim.crf.ser.gz',
|
32
|
+
'ner.model.7class' => 'classifiers/muc.7class.distsim.crf.ser.gz',
|
33
|
+
'ner.model.MISCclass' => 'classifiers/conll.4class.distsim.crf.ser.gz',
|
34
|
+
'parser.model' => 'grammar/englishPCFG.ser.gz',
|
35
|
+
'dcoref.demonym' => 'dcoref/demonyms.txt',
|
36
|
+
'dcoref.animate' => 'dcoref/animate.unigrams.txt',
|
37
|
+
'dcoref.female' => 'dcoref/female.unigrams.txt',
|
38
|
+
'dcoref.inanimate' => 'dcoref/inanimate.unigrams.txt',
|
39
|
+
'dcoref.male' => 'dcoref/male.unigrams.txt',
|
40
|
+
'dcoref.neutral' => 'dcoref/neutral.unigrams.txt',
|
41
|
+
'dcoref.plural' => 'dcoref/plural.unigrams.txt',
|
42
|
+
'dcoref.singular' => 'dcoref/singular.unigrams.txt',
|
43
|
+
'dcoref.states' => 'dcoref/state-abbreviations.txt',
|
44
|
+
'dcoref.countries' => 'dcoref/unknown.txt', # Fix - can somebody provide this file?
|
45
|
+
'dcoref.states.provinces' => 'dcoref/unknown.txt', # Fix - can somebody provide this file?
|
46
|
+
'dcoref.extra.gender' => 'dcoref/namegender.combine.txt'
|
47
|
+
}
|
48
|
+
|
49
|
+
# Set a model file.
|
50
|
+
def self.set_model(name, file)
|
51
|
+
self.model_files[name] = file
|
41
52
|
end
|
42
|
-
|
53
|
+
|
43
54
|
# Load a StanfordCoreNLP pipeline with the specified JVM flags and
|
44
55
|
# StanfordCoreNLP properties (hash of property => values).
|
45
|
-
def self.load(
|
46
|
-
self.load_jars(
|
56
|
+
def self.load(*annotators)
|
57
|
+
self.load_jars(self.jvm_args, self.jar_path, self.log_file)
|
47
58
|
self.create_classes
|
48
|
-
|
59
|
+
# Prepend the JAR path to the model files.
|
60
|
+
properties = {}
|
61
|
+
self.model_files.each { |k,v| properties[k] = self.jar_path + v }
|
62
|
+
properties['annotators'] =
|
63
|
+
annotators.map { |x| x.to_s }.join(', ')
|
49
64
|
CoreNLP.new(get_properties(properties))
|
50
65
|
end
|
51
66
|
|
52
67
|
# Load the jars.
|
53
|
-
def self.load_jars(
|
54
|
-
JarLoader.
|
68
|
+
def self.load_jars(jvm_args, jar_path, log_file)
|
69
|
+
JarLoader.jvm_args = jvm_args
|
55
70
|
JarLoader.jar_path = jar_path
|
71
|
+
JarLoader.log(log_file) if log_file
|
56
72
|
JarLoader.load('joda-time.jar')
|
57
73
|
JarLoader.load('xom.jar')
|
58
74
|
JarLoader.load('stanford-corenlp.jar')
|
59
75
|
JarLoader.load('bridge.jar')
|
60
76
|
end
|
61
77
|
|
62
|
-
# Create the Ruby classes
|
78
|
+
# Create the Ruby classes corresponding to the StanfordNLP
|
79
|
+
# core classes.
|
63
80
|
def self.create_classes
|
64
81
|
const_set(:CoreNLP, Rjb::import('edu.stanford.nlp.pipeline.StanfordCoreNLP'))
|
65
82
|
const_set(:Annotation, Rjb::import('edu.stanford.nlp.pipeline.Annotation'))
|
@@ -76,31 +93,5 @@ module StanfordCoreNLP
|
|
76
93
|
end
|
77
94
|
props
|
78
95
|
end
|
79
|
-
|
80
|
-
Rjb::Rjb_JavaProxy.class_eval do
|
81
|
-
|
82
|
-
# Get an annotation using the annotation bridge.
|
83
|
-
def get(annotation)
|
84
|
-
base_class = (annotation.to_s.split('_')[0] == 'coref') ?
|
85
|
-
'edu.stanford.nlp.dcoref.CorefCoreAnnotations$' :
|
86
|
-
'edu.stanford.nlp.ling.CoreAnnotations$'
|
87
|
-
anno_class = annotation.to_s.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
|
88
|
-
url = "#{base_class}#{anno_class}Annotation"
|
89
|
-
AnnotationBridge.getAnnotation(self, url)
|
90
|
-
end
|
91
|
-
|
92
|
-
# Shorthand for to_string defined by Java classes.
|
93
|
-
def to_s; to_string; end
|
94
|
-
|
95
|
-
# Provide Ruby-style iterators to wrap Java iterators.
|
96
|
-
def each
|
97
|
-
if !java_methods.include?('iterator()')
|
98
|
-
raise 'This object cannot be iterated.'
|
99
|
-
else
|
100
|
-
i = self.iterator
|
101
|
-
while i.has_next; yield i.next;end
|
102
|
-
end
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
96
|
+
|
106
97
|
end
|
@@ -1,5 +1,4 @@
|
|
1
1
|
module StanfordCoreNLP
|
2
|
-
|
3
2
|
class JarLoader
|
4
3
|
|
5
4
|
require 'rjb'
|
@@ -7,28 +6,32 @@ module StanfordCoreNLP
|
|
7
6
|
# Configuration options.
|
8
7
|
class << self
|
9
8
|
# An array of flags to pass to the JVM.
|
10
|
-
attr_accessor :
|
9
|
+
attr_accessor :jvm_args
|
11
10
|
attr_accessor :jar_path
|
12
11
|
attr_accessor :log_file
|
13
12
|
end
|
14
13
|
|
15
14
|
# An array of string flags to supply to the JVM, e.g. ['-Xms512M', '-Xmx1024M']
|
16
|
-
self.
|
15
|
+
self.jvm_args = []
|
17
16
|
# The path in which to look for Jars.
|
18
17
|
self.jar_path = ''
|
19
|
-
#
|
20
|
-
# Setting this before the parser automatically calls self.redirect_to_log
|
18
|
+
# By default, disable logging.
|
21
19
|
self.log_file = nil
|
22
|
-
|
20
|
+
|
23
21
|
# Load Rjb and create Java VM.
|
24
22
|
def self.rjb_initialize
|
25
23
|
return if ::Rjb::loaded?
|
26
|
-
::Rjb::load(nil, self.
|
27
|
-
|
24
|
+
::Rjb::load(nil, self.jvm_args)
|
25
|
+
set_java_logging if self.log_file
|
28
26
|
end
|
29
|
-
|
30
|
-
#
|
31
|
-
def self.
|
27
|
+
|
28
|
+
# Enable logging.
|
29
|
+
def self.log(file = 'log.txt')
|
30
|
+
self.log_file = file
|
31
|
+
end
|
32
|
+
|
33
|
+
# Redirect the output of the JVM to supplied log file.
|
34
|
+
def self.set_java_logging
|
32
35
|
const_set(:System, Rjb::import('java.lang.System'))
|
33
36
|
const_set(:PrintStream, Rjb::import('java.io.PrintStream'))
|
34
37
|
const_set(:File2, Rjb::import('java.io.File'))
|
@@ -37,7 +40,7 @@ module StanfordCoreNLP
|
|
37
40
|
System.setOut(ps)
|
38
41
|
System.setErr(ps)
|
39
42
|
end
|
40
|
-
|
43
|
+
|
41
44
|
# Load a jar.
|
42
45
|
def self.load(jar)
|
43
46
|
self.rjb_initialize
|
@@ -48,14 +51,5 @@ module StanfordCoreNLP
|
|
48
51
|
::Rjb::add_jar(jar)
|
49
52
|
end
|
50
53
|
|
51
|
-
# Silence output and log to file.
|
52
|
-
def self.log(file = 'log.txt')
|
53
|
-
@@log_file = file
|
54
|
-
end
|
55
|
-
|
56
|
-
# Whether the output is logged or not.
|
57
|
-
def self.log?; @@log_file; end
|
58
|
-
|
59
54
|
end
|
60
|
-
|
61
55
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module StanfordCoreNLP
|
2
|
+
|
3
|
+
# Modify the Rjb JavaProxy class to add our own methods to every Java object.
|
4
|
+
Rjb::Rjb_JavaProxy.class_eval do
|
5
|
+
|
6
|
+
# Dynamically defined on all proxied Java objects.
|
7
|
+
# Shorthand for to_string defined by Java classes.
|
8
|
+
def to_s; to_string; end
|
9
|
+
|
10
|
+
# Dynamically defined on all proxied Java iterators.
|
11
|
+
# Provide Ruby-style iterators to wrap Java iterators.
|
12
|
+
def each
|
13
|
+
if !java_methods.include?('iterator()')
|
14
|
+
raise 'This object cannot be iterated.'
|
15
|
+
else
|
16
|
+
i = self.iterator
|
17
|
+
while i.has_next; yield i.next; end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
# Dynamically defined on all proxied annotation classes.
|
22
|
+
# Get an annotation using the annotation bridge.
|
23
|
+
def get(annotation)
|
24
|
+
if !java_methods.include?('get(Ljava.lang.Class;)')
|
25
|
+
raise 'No annotation can be retrieved on this object.'
|
26
|
+
else
|
27
|
+
base_class = (annotation.to_s.split('_')[0] == 'coref') ?
|
28
|
+
'edu.stanford.nlp.dcoref.CorefCoreAnnotations$' :
|
29
|
+
'edu.stanford.nlp.ling.CoreAnnotations$'
|
30
|
+
anno_class = annotation.to_s.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
|
31
|
+
url = "#{base_class}#{anno_class}Annotation"
|
32
|
+
AnnotationBridge.getAnnotation(self, url)
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: stanford-core-nlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 0.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-01-28 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rjb
|
16
|
-
requirement: &
|
16
|
+
requirement: &70258761325680 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
25
|
-
description: !
|
26
|
-
language processing
|
27
|
-
lemmatization, named entity recognition,
|
24
|
+
version_requirements: *70258761325680
|
25
|
+
description: ! " High-level Ruby bindings to the Stanford CoreNLP package, a set of natural
|
26
|
+
language processing \ntools for English, including tokenization, part-of-speech
|
27
|
+
tagging, lemmatization, named entity recognition, \nparsing, and coreference resolution. "
|
28
28
|
email:
|
29
29
|
- louis.mullie@gmail.com
|
30
30
|
executables: []
|
@@ -32,6 +32,7 @@ extensions: []
|
|
32
32
|
extra_rdoc_files: []
|
33
33
|
files:
|
34
34
|
- lib/stanford-core-nlp/jar-loader.rb
|
35
|
+
- lib/stanford-core-nlp/java-wrapper.rb
|
35
36
|
- lib/stanford-core-nlp.rb
|
36
37
|
- bin/bridge.jar
|
37
38
|
- bin/classifiers/all.3class.distsim.crf.ser.gz
|
@@ -86,5 +87,5 @@ rubyforge_project:
|
|
86
87
|
rubygems_version: 1.8.15
|
87
88
|
signing_key:
|
88
89
|
specification_version: 3
|
89
|
-
summary: Ruby bindings to the Stanford
|
90
|
+
summary: Ruby bindings to the Stanford Core NLP tools.
|
90
91
|
test_files: []
|