RubyGems - stanford-core-nlp - Versions diffs - 0.1 → 0.1.1 - Mend

stanford-core-nlp 0.1 → 0.1.1

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

data/README +2 -2
data/lib/stanford-core-nlp.rb +53 -62
data/lib/stanford-core-nlp/jar-loader.rb +15 -21
data/lib/stanford-core-nlp/java-wrapper.rb +37 -0
metadata +8 -7

data/README CHANGED Viewed

@@ -1,3 +1,3 @@
-Treat - Text Retrieval and Annotation Toolkit
+Ruby bindings for the Stanford CoreNLP package
-See the wiki for more information at https://github.com/louismullie/treat/wiki/.
+See the wiki for more information at https://github.com/louismullie/stanford-core-nlp/wiki/.

data/lib/stanford-core-nlp.rb CHANGED Viewed

@@ -1,65 +1,82 @@
 module StanfordCoreNLP
-  VERSION = '0.1'
+  VERSION = '0.1.1'
   require 'stanford-core-nlp/jar-loader.rb'
+  require 'stanford-core-nlp/java-wrapper.rb'
   class << self
     # The path in which to look for the Stanford JAR files.
     # This is passed to JarLoader.
     attr_accessor :jar_path
     # The flags for starting the JVM machine.
     # Parser and named entity recognizer are very memory consuming.
-    attr_accessor :jvm_flags
+    attr_accessor :jvm_args
+    # A file to redirect JVM output to.
+    attr_accessor :log_file
+    # The model files. Use #set_model to modify these.
+    attr_accessor :model_files
   end
+  # The default JAR path is the gem's bin folder.
   self.jar_path = File.dirname(__FILE__) + '/../bin/'
-  self.jvm_flags = ['-Xms512M', '-Xmx1024M']
-  # Return the default properties (English models with
-  # tokenizer and sentence splitter).
-  def self.default_properties
-    {
-      'annotators' => 'tokenize, ssplit',
-      'pos.model' => self.jar_path + 'taggers/english-left3words-distsim.tagger',
-      'ner.model.3class' => self.jar_path + 'classifiers/all.3class.distsim.crf.ser.gz',
-      'ner.model.7class' => self.jar_path + 'classifiers/muc.7class.distsim.crf.ser.gz',
-      'ner.model.MISCclass' => self.jar_path + 'classifiers/conll.4class.distsim.crf.ser.gz',
-      'parser.model' => self.jar_path + 'grammar/englishPCFG.ser.gz',
-      'dcoref.demonym' => self.jar_path + 'dcoref/demonyms.txt',
-      'dcoref.animate' => self.jar_path + 'dcoref/animate.unigrams.txt',
-      'dcoref.female' => self.jar_path + 'dcoref/female.unigrams.txt',
-      'dcoref.inanimate' => self.jar_path + 'dcoref/inanimate.unigrams.txt',
-      'dcoref.male' => self.jar_path + 'dcoref/male.unigrams.txt',
-      'dcoref.neutral' => self.jar_path + 'dcoref/neutral.unigrams.txt',
-      'dcoref.plural' => self.jar_path + 'dcoref/plural.unigrams.txt',
-      'dcoref.singular' => self.jar_path + 'dcoref/singular.unigrams.txt',
-      'dcoref.states' => self.jar_path + 'dcoref/state-abbreviations.txt',
-      'dcoref.countries' => self.jar_path + 'dcoref/unknown.txt',     # Fix - can somebody provide this file?
-      'dcoref.states.provinces' => self.jar_path + 'dcoref/unknown.txt',   # Fix - can somebody provide this file?
-      'dcoref.extra.gender' => self.jar_path + 'dcoref/namegender.combine.txt'
-    }
+  # Load the JVM with a minimum heap size of 512MB and a
+  # maximum heap size of 1024MB.
+  self.jvm_args = ['-Xms512M', '-Xmx1024M']
+  # Turn logging off by default.
+  self.log_file = nil
+  # Default model files.
+  self.model_files = {
+    'pos.model' => 'taggers/english-left3words-distsim.tagger',
+    'ner.model.3class' => 'classifiers/all.3class.distsim.crf.ser.gz',
+    'ner.model.7class' => 'classifiers/muc.7class.distsim.crf.ser.gz',
+    'ner.model.MISCclass' => 'classifiers/conll.4class.distsim.crf.ser.gz',
+    'parser.model' => 'grammar/englishPCFG.ser.gz',
+    'dcoref.demonym' => 'dcoref/demonyms.txt',
+    'dcoref.animate' => 'dcoref/animate.unigrams.txt',
+    'dcoref.female' => 'dcoref/female.unigrams.txt',
+    'dcoref.inanimate' => 'dcoref/inanimate.unigrams.txt',
+    'dcoref.male' => 'dcoref/male.unigrams.txt',
+    'dcoref.neutral' => 'dcoref/neutral.unigrams.txt',
+    'dcoref.plural' => 'dcoref/plural.unigrams.txt',
+    'dcoref.singular' => 'dcoref/singular.unigrams.txt',
+    'dcoref.states' => 'dcoref/state-abbreviations.txt',
+    'dcoref.countries' => 'dcoref/unknown.txt',     # Fix - can somebody provide this file?
+    'dcoref.states.provinces' => 'dcoref/unknown.txt',   # Fix - can somebody provide this file?
+    'dcoref.extra.gender' => 'dcoref/namegender.combine.txt'
+  }
+  # Set a model file.
+  def self.set_model(name, file)
+    self.model_files[name] = file
   end
   # Load a StanfordCoreNLP pipeline with the specified JVM flags and
   # StanfordCoreNLP properties (hash of property => values).
-  def self.load(properties)
-    self.load_jars(jvm_flags, self.jar_path)
+  def self.load(*annotators)
+    self.load_jars(self.jvm_args, self.jar_path, self.log_file)
     self.create_classes
-    properties = default_properties.merge(properties)
+    # Prepend the JAR path to the model files.
+    properties = {}
+    self.model_files.each { |k,v| properties[k] = self.jar_path + v }
+    properties['annotators'] =
+    annotators.map { |x| x.to_s }.join(', ')
     CoreNLP.new(get_properties(properties))
   end
   # Load the jars.
-  def self.load_jars(jvm_flags, jar_path)
-    JarLoader.jvm_flags = jvm_flags
+  def self.load_jars(jvm_args, jar_path, log_file)
+    JarLoader.jvm_args = jvm_args
     JarLoader.jar_path = jar_path
+    JarLoader.log(log_file) if log_file
     JarLoader.load('joda-time.jar')
     JarLoader.load('xom.jar')
     JarLoader.load('stanford-corenlp.jar')
     JarLoader.load('bridge.jar')
   end
-  # Create the Ruby classes for core classes.
+  # Create the Ruby classes corresponding to the StanfordNLP
+  # core classes.
   def self.create_classes
     const_set(:CoreNLP, Rjb::import('edu.stanford.nlp.pipeline.StanfordCoreNLP'))
     const_set(:Annotation, Rjb::import('edu.stanford.nlp.pipeline.Annotation'))
@@ -76,31 +93,5 @@ module StanfordCoreNLP
     end
     props
   end
-  Rjb::Rjb_JavaProxy.class_eval do
-    # Get an annotation using the annotation bridge.
-    def get(annotation)
-      base_class = (annotation.to_s.split('_')[0] == 'coref') ?
-      'edu.stanford.nlp.dcoref.CorefCoreAnnotations$' :
-      'edu.stanford.nlp.ling.CoreAnnotations$'
-      anno_class = annotation.to_s.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
-      url = "#{base_class}#{anno_class}Annotation"
-      AnnotationBridge.getAnnotation(self, url)
-    end
-    # Shorthand for to_string defined by Java classes.
-    def to_s; to_string; end
-    # Provide Ruby-style iterators to wrap Java iterators.
-    def each
-      if !java_methods.include?('iterator()')
-        raise 'This object cannot be iterated.'
-      else
-        i = self.iterator
-        while i.has_next; yield i.next;end
-      end
-    end
-  end
 end

data/lib/stanford-core-nlp/jar-loader.rb CHANGED Viewed

@@ -1,5 +1,4 @@
 module StanfordCoreNLP
   class JarLoader
     require 'rjb'
@@ -7,28 +6,32 @@ module StanfordCoreNLP
     # Configuration options.
     class << self
       # An array of flags to pass to the JVM machine.
-      attr_accessor :jvm_flags
+      attr_accessor :jvm_args
       attr_accessor :jar_path
       attr_accessor :log_file
     end
     # An array of string flags to supply to the JVM, e.g. ['-Xms512M', '-Xmx1024M']
-    self.jvm_flags = []
+    self.jvm_args = []
     # The path in which to look for Jars.
     self.jar_path = ''
-    # The name of the file to log to.
-    # Setting this before the parser automatically calls self.redirect_to_log
+    # By default, disable logging.
     self.log_file = nil
     # Load Rjb and create Java VM.
     def self.rjb_initialize
       return if ::Rjb::loaded?
-      ::Rjb::load(nil, self.jvm_flags)
-      redirect_to_log if self.log_file
+      ::Rjb::load(nil, self.jvm_args)
+      set_java_logging if self.log_file
     end
-    # Redirect the output of the JVM to self.log_file.
-    def self.redirect_to_log
+    # Enable logging.
+    def self.log(file = 'log.txt')
+      self.log_file = file
+    end
+    # Redirect the output of the JVM to supplied log file.
+    def self.set_java_logging
       const_set(:System, Rjb::import('java.lang.System'))
       const_set(:PrintStream, Rjb::import('java.io.PrintStream'))
       const_set(:File2, Rjb::import('java.io.File'))
@@ -37,7 +40,7 @@ module StanfordCoreNLP
       System.setOut(ps)
       System.setErr(ps)
     end
     # Load a jar.
     def self.load(jar)
       self.rjb_initialize
@@ -48,14 +51,5 @@ module StanfordCoreNLP
       ::Rjb::add_jar(jar)
     end
-    # Silence output and log to file.
-    def self.log(file = 'log.txt')
-      @@log_file = file
-    end
-    # Whether the output is logged or not.
-    def self.log?; @@log_file; end
   end
 end

data/lib/stanford-core-nlp/java-wrapper.rb ADDED Viewed

@@ -0,0 +1,37 @@
+module StanfordCoreNLP
+  # Modify the Rjb JavaProxy class to add our own methods to every Java object.
+  Rjb::Rjb_JavaProxy.class_eval do
+    # Dynamically defined on all proxied Java objects.
+    # Shorthand for to_string defined by Java classes.
+    def to_s; to_string; end
+    # Dynamically defined on all proxied Java iterators.
+    # Provide Ruby-style iterators to wrap Java iterators.
+    def each
+      if !java_methods.include?('iterator()')
+        raise 'This object cannot be iterated.'
+      else
+        i = self.iterator
+        while i.has_next; yield i.next; end
+      end
+    end
+    # Dynamically defined on all proxied annotation classes.
+    # Get an annotation using the annotation bridge.
+    def get(annotation)
+      if !java_methods.include?('get(Ljava.lang.Class;)')
+        raise 'No annotation can be retrieved on this object.'
+      else
+        base_class = (annotation.to_s.split('_')[0] == 'coref') ?
+        'edu.stanford.nlp.dcoref.CorefCoreAnnotations$' :
+        'edu.stanford.nlp.ling.CoreAnnotations$'
+        anno_class = annotation.to_s.gsub(/^[a-z]|_[a-z]/) { |a| a.upcase }.gsub('_', '')
+        url = "#{base_class}#{anno_class}Annotation"
+        AnnotationBridge.getAnnotation(self, url)
+      end
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: stanford-core-nlp
 version: !ruby/object:Gem::Version
-  version: '0.1'
+  version: 0.1.1
   prerelease:
 platform: ruby
 authors:
@@ -13,7 +13,7 @@ date: 2012-01-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rjb
-  requirement: &70234870930100 !ruby/object:Gem::Requirement
+  requirement: &70258761325680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *70234870930100
-description: ! ' High-level Ruby bindings to the Stanford CoreNLP package, a set natural
-  language processing tools for English, including tokenization, part-of-speech tagging,
-  lemmatization, named entity recognition, parsing, and coreference resolution. '
+  version_requirements: *70258761325680
+description: ! " High-level Ruby bindings to the Stanford CoreNLP package, a set natural
+  language processing \ntools for English, including tokenization, part-of-speech
+  tagging, lemmatization, named entity recognition, à\nparsing, and coreference resolution. "
 email:
 - louis.mullie@gmail.com
 executables: []
@@ -32,6 +32,7 @@ extensions: []
 extra_rdoc_files: []
 files:
 - lib/stanford-core-nlp/jar-loader.rb
+- lib/stanford-core-nlp/java-wrapper.rb
 - lib/stanford-core-nlp.rb
 - bin/bridge.jar
 - bin/classifiers/all.3class.distsim.crf.ser.gz
@@ -86,5 +87,5 @@ rubyforge_project:
 rubygems_version: 1.8.15
 signing_key:
 specification_version: 3
-summary: Ruby bindings to the Stanford CoreNLP tools.
+summary: Ruby bindings to the Stanford Core NLP tools.
 test_files: []