stanford-core-nlp 0.1 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README CHANGED
@@ -1,3 +1,3 @@
1
- Treat - Text Retrieval and Annotation Toolkit
1
+ Ruby bindings for the Stanford CoreNLP package
2
2
 
3
- See the wiki for more information at https://github.com/louismullie/treat/wiki/.
3
+ See the wiki for more information at https://github.com/louismullie/stanford-core-nlp/wiki/.
@@ -1,65 +1,82 @@
1
1
  module StanfordCoreNLP
2
2
 
3
- VERSION = '0.1'
3
+ VERSION = '0.1.1'
4
4
  require 'stanford-core-nlp/jar-loader.rb'
5
-
5
+ require 'stanford-core-nlp/java-wrapper.rb'
6
+
6
7
  class << self
7
8
  # The path in which to look for the Stanford JAR files.
8
9
  # This is passed to JarLoader.
9
10
  attr_accessor :jar_path
10
11
  # The flags for starting the JVM.
11
12
  # Parser and named entity recognizer are very memory consuming.
12
- attr_accessor :jvm_flags
13
+ attr_accessor :jvm_args
14
+ # A file to redirect JVM output to.
15
+ attr_accessor :log_file
16
+ # The model files. Use #set_model to modify these.
17
+ attr_accessor :model_files
13
18
  end
14
19
 
20
+ # The default JAR path is the gem's bin folder.
15
21
  self.jar_path = File.dirname(__FILE__) + '/../bin/'
16
- self.jvm_flags = ['-Xms512M', '-Xmx1024M']
17
-
18
- # Return the default properties (English models with
19
- # tokenizer and sentence splitter).
20
- def self.default_properties
21
- {
22
- 'annotators' => 'tokenize, ssplit',
23
- 'pos.model' => self.jar_path + 'taggers/english-left3words-distsim.tagger',
24
- 'ner.model.3class' => self.jar_path + 'classifiers/all.3class.distsim.crf.ser.gz',
25
- 'ner.model.7class' => self.jar_path + 'classifiers/muc.7class.distsim.crf.ser.gz',
26
- 'ner.model.MISCclass' => self.jar_path + 'classifiers/conll.4class.distsim.crf.ser.gz',
27
- 'parser.model' => self.jar_path + 'grammar/englishPCFG.ser.gz',
28
- 'dcoref.demonym' => self.jar_path + 'dcoref/demonyms.txt',
29
- 'dcoref.animate' => self.jar_path + 'dcoref/animate.unigrams.txt',
30
- 'dcoref.female' => self.jar_path + 'dcoref/female.unigrams.txt',
31
- 'dcoref.inanimate' => self.jar_path + 'dcoref/inanimate.unigrams.txt',
32
- 'dcoref.male' => self.jar_path + 'dcoref/male.unigrams.txt',
33
- 'dcoref.neutral' => self.jar_path + 'dcoref/neutral.unigrams.txt',
34
- 'dcoref.plural' => self.jar_path + 'dcoref/plural.unigrams.txt',
35
- 'dcoref.singular' => self.jar_path + 'dcoref/singular.unigrams.txt',
36
- 'dcoref.states' => self.jar_path + 'dcoref/state-abbreviations.txt',
37
- 'dcoref.countries' => self.jar_path + 'dcoref/unknown.txt', # Fix - can somebody provide this file?
38
- 'dcoref.states.provinces' => self.jar_path + 'dcoref/unknown.txt', # Fix - can somebody provide this file?
39
- 'dcoref.extra.gender' => self.jar_path + 'dcoref/namegender.combine.txt'
40
- }
22
+ # Load the JVM with a minimum heap size of 512MB and a
23
+ # maximum heap size of 1024MB.
24
+ self.jvm_args = ['-Xms512M', '-Xmx1024M']
25
+ # Turn logging off by default.
26
+ self.log_file = nil
27
+
28
+ # Default model files.
29
+ self.model_files = {
30
+ 'pos.model' => 'taggers/english-left3words-distsim.tagger',
31
+ 'ner.model.3class' => 'classifiers/all.3class.distsim.crf.ser.gz',
32
+ 'ner.model.7class' => 'classifiers/muc.7class.distsim.crf.ser.gz',
33
+ 'ner.model.MISCclass' => 'classifiers/conll.4class.distsim.crf.ser.gz',
34
+ 'parser.model' => 'grammar/englishPCFG.ser.gz',
35
+ 'dcoref.demonym' => 'dcoref/demonyms.txt',
36
+ 'dcoref.animate' => 'dcoref/animate.unigrams.txt',
37
+ 'dcoref.female' => 'dcoref/female.unigrams.txt',
38
+ 'dcoref.inanimate' => 'dcoref/inanimate.unigrams.txt',
39
+ 'dcoref.male' => 'dcoref/male.unigrams.txt',
40
+ 'dcoref.neutral' => 'dcoref/neutral.unigrams.txt',
41
+ 'dcoref.plural' => 'dcoref/plural.unigrams.txt',
42
+ 'dcoref.singular' => 'dcoref/singular.unigrams.txt',
43
+ 'dcoref.states' => 'dcoref/state-abbreviations.txt',
44
+ 'dcoref.countries' => 'dcoref/unknown.txt', # Fix - can somebody provide this file?
45
+ 'dcoref.states.provinces' => 'dcoref/unknown.txt', # Fix - can somebody provide this file?
46
+ 'dcoref.extra.gender' => 'dcoref/namegender.combine.txt'
47
+ }
48
+
49
+ # Set a model file.
50
+ def self.set_model(name, file)
51
+ self.model_files[name] = file
41
52
  end
42
-
53
+
43
54
  # Load a StanfordCoreNLP pipeline with the specified JVM flags and
44
55
  # StanfordCoreNLP properties (hash of property => values).
45
- def self.load(properties)
46
- self.load_jars(jvm_flags, self.jar_path)
56
+ def self.load(*annotators)
57
+ self.load_jars(self.jvm_args, self.jar_path, self.log_file)
47
58
  self.create_classes
48
- properties = default_properties.merge(properties)
59
+ # Prepend the JAR path to the model files.
60
+ properties = {}
61
+ self.model_files.each { |k,v| properties[k] = self.jar_path + v }
62
+ properties['annotators'] =
63
+ annotators.map { |x| x.to_s }.join(', ')
49
64
  CoreNLP.new(get_properties(properties))
50
65
  end
51
66
 
52
67
  # Load the jars.
53
- def self.load_jars(jvm_flags, jar_path)
54
- JarLoader.jvm_flags = jvm_flags
68
+ def self.load_jars(jvm_args, jar_path, log_file)
69
+ JarLoader.jvm_args = jvm_args
55
70
  JarLoader.jar_path = jar_path
71
+ JarLoader.log(log_file) if log_file
56
72
  JarLoader.load('joda-time.jar')
57
73
  JarLoader.load('xom.jar')
58
74
  JarLoader.load('stanford-corenlp.jar')
59
75
  JarLoader.load('bridge.jar')
60
76
  end
61
77
 
62
- # Create the Ruby classes for core classes.
78
+ # Create the Ruby classes corresponding to the StanfordNLP
79
+ # core classes.
63
80
  def self.create_classes
64
81
  const_set(:CoreNLP, Rjb::import('edu.stanford.nlp.pipeline.StanfordCoreNLP'))
65
82
  const_set(:Annotation, Rjb::import('edu.stanford.nlp.pipeline.Annotation'))
@@ -76,31 +93,5 @@ module StanfordCoreNLP
76
93
  end
77
94
  props
78
95
  end
79
-
80
- Rjb::Rjb_JavaProxy.class_eval do
81
-
82
- # Get an annotation using the annotation bridge.
83
- def get(annotation)
84
- base_class = (annotation.to_s.split('_')[0] == 'coref') ?
85
- 'edu.stanford.nlp.dcoref.CorefCoreAnnotations$' :
86
- 'edu.stanford.nlp.ling.CoreAnnotations$'
87
- anno_class = annotation.to_s.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
88
- url = "#{base_class}#{anno_class}Annotation"
89
- AnnotationBridge.getAnnotation(self, url)
90
- end
91
-
92
- # Shorthand for to_string defined by Java classes.
93
- def to_s; to_string; end
94
-
95
- # Provide Ruby-style iterators to wrap Java iterators.
96
- def each
97
- if !java_methods.include?('iterator()')
98
- raise 'This object cannot be iterated.'
99
- else
100
- i = self.iterator
101
- while i.has_next; yield i.next;end
102
- end
103
- end
104
- end
105
-
96
+
106
97
  end
@@ -1,5 +1,4 @@
1
1
  module StanfordCoreNLP
2
-
3
2
  class JarLoader
4
3
 
5
4
  require 'rjb'
@@ -7,28 +6,32 @@ module StanfordCoreNLP
7
6
  # Configuration options.
8
7
  class << self
9
8
  # An array of flags to pass to the JVM.
10
- attr_accessor :jvm_flags
9
+ attr_accessor :jvm_args
11
10
  attr_accessor :jar_path
12
11
  attr_accessor :log_file
13
12
  end
14
13
 
15
14
  # An array of string flags to supply to the JVM, e.g. ['-Xms512M', '-Xmx1024M']
16
- self.jvm_flags = []
15
+ self.jvm_args = []
17
16
  # The path in which to look for Jars.
18
17
  self.jar_path = ''
19
- # The name of the file to log to.
20
- # Setting this before the parser automatically calls self.redirect_to_log
18
+ # By default, disable logging.
21
19
  self.log_file = nil
22
-
20
+
23
21
  # Load Rjb and create Java VM.
24
22
  def self.rjb_initialize
25
23
  return if ::Rjb::loaded?
26
- ::Rjb::load(nil, self.jvm_flags)
27
- redirect_to_log if self.log_file
24
+ ::Rjb::load(nil, self.jvm_args)
25
+ set_java_logging if self.log_file
28
26
  end
29
-
30
- # Redirect the output of the JVM to self.log_file.
31
- def self.redirect_to_log
27
+
28
+ # Enable logging.
29
+ def self.log(file = 'log.txt')
30
+ self.log_file = file
31
+ end
32
+
33
+ # Redirect the output of the JVM to supplied log file.
34
+ def self.set_java_logging
32
35
  const_set(:System, Rjb::import('java.lang.System'))
33
36
  const_set(:PrintStream, Rjb::import('java.io.PrintStream'))
34
37
  const_set(:File2, Rjb::import('java.io.File'))
@@ -37,7 +40,7 @@ module StanfordCoreNLP
37
40
  System.setOut(ps)
38
41
  System.setErr(ps)
39
42
  end
40
-
43
+
41
44
  # Load a jar.
42
45
  def self.load(jar)
43
46
  self.rjb_initialize
@@ -48,14 +51,5 @@ module StanfordCoreNLP
48
51
  ::Rjb::add_jar(jar)
49
52
  end
50
53
 
51
- # Silence output and log to file.
52
- def self.log(file = 'log.txt')
53
- @@log_file = file
54
- end
55
-
56
- # Whether the output is logged or not.
57
- def self.log?; @@log_file; end
58
-
59
54
  end
60
-
61
55
  end
@@ -0,0 +1,37 @@
1
+ module StanfordCoreNLP
2
+
3
+ # Modify the Rjb JavaProxy class to add our own methods to every Java object.
4
+ Rjb::Rjb_JavaProxy.class_eval do
5
+
6
+ # Dynamically defined on all proxied Java objects.
7
+ # Shorthand for to_string defined by Java classes.
8
+ def to_s; to_string; end
9
+
10
+ # Dynamically defined on all proxied Java iterators.
11
+ # Provide Ruby-style iterators to wrap Java iterators.
12
+ def each
13
+ if !java_methods.include?('iterator()')
14
+ raise 'This object cannot be iterated.'
15
+ else
16
+ i = self.iterator
17
+ while i.has_next; yield i.next; end
18
+ end
19
+ end
20
+
21
+ # Dynamically defined on all proxied annotation classes.
22
+ # Get an annotation using the annotation bridge.
23
+ def get(annotation)
24
+ if !java_methods.include?('get(Ljava.lang.Class;)')
25
+ raise 'No annotation can be retrieved on this object.'
26
+ else
27
+ base_class = (annotation.to_s.split('_')[0] == 'coref') ?
28
+ 'edu.stanford.nlp.dcoref.CorefCoreAnnotations$' :
29
+ 'edu.stanford.nlp.ling.CoreAnnotations$'
30
+ anno_class = annotation.to_s.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
31
+ url = "#{base_class}#{anno_class}Annotation"
32
+ AnnotationBridge.getAnnotation(self, url)
33
+ end
34
+ end
35
+
36
+ end
37
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: stanford-core-nlp
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: 0.1.1
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-01-28 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rjb
16
- requirement: &70234870930100 !ruby/object:Gem::Requirement
16
+ requirement: &70258761325680 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70234870930100
25
- description: ! ' High-level Ruby bindings to the Stanford CoreNLP package, a set natural
26
- language processing tools for English, including tokenization, part-of-speech tagging,
27
- lemmatization, named entity recognition, parsing, and coreference resolution. '
24
+ version_requirements: *70258761325680
25
+ description: ! " High-level Ruby bindings to the Stanford CoreNLP package, a set natural
26
+ language processing \ntools for English, including tokenization, part-of-speech
27
+ tagging, lemmatization, named entity recognition, \nparsing, and coreference resolution. "
28
28
  email:
29
29
  - louis.mullie@gmail.com
30
30
  executables: []
@@ -32,6 +32,7 @@ extensions: []
32
32
  extra_rdoc_files: []
33
33
  files:
34
34
  - lib/stanford-core-nlp/jar-loader.rb
35
+ - lib/stanford-core-nlp/java-wrapper.rb
35
36
  - lib/stanford-core-nlp.rb
36
37
  - bin/bridge.jar
37
38
  - bin/classifiers/all.3class.distsim.crf.ser.gz
@@ -86,5 +87,5 @@ rubyforge_project:
86
87
  rubygems_version: 1.8.15
87
88
  signing_key:
88
89
  specification_version: 3
89
- summary: Ruby bindings to the Stanford CoreNLP tools.
90
+ summary: Ruby bindings to the Stanford Core NLP tools.
90
91
  test_files: []