corenlp 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/corenlp.rb +111 -0
- data/lib/corenlp/downloader.rb +5 -3
- data/lib/corenlp/version.rb +1 -1
- data/lib/tasks/downloader.rake +10 -0
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a88d19d7dc8eae9e7df59d4fe9b1e0c492aa194f
|
4
|
+
data.tar.gz: 5c6a32994f6720210b7a909c5b839503530793b3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b5f185cda3feb604e97e5682440a01f763773e80631bf5a68aa19a1a35e2874999770dd8707af2d74cb9baf630491219fdba498284f37a784813510d48c6549f
|
7
|
+
data.tar.gz: 99cfc0054a47e92c517b6ba9025e063d6fb316a2788acf9255079ab828a7f046ff82057fee9b2b1a154127f7825ce6abcef0eae52a08e59175c0d94bcb58233b
|
data/lib/corenlp.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require "bundler"
|
3
|
+
Bundler.require
|
4
|
+
|
5
|
+
module Corenlp
|
6
|
+
class Treebank
|
7
|
+
attr_accessor :raw_text, :filenames, :output_directory, :summary_file, :threads_to_use, :java_max_memory, :sentences
|
8
|
+
|
9
|
+
def initialize(attrs = {})
|
10
|
+
self.raw_text = attrs[:raw_text] || ""
|
11
|
+
self.filenames = []
|
12
|
+
self.output_directory = attrs[:output_directory] || "./tmp/language_processing"
|
13
|
+
self.summary_file = "#{output_directory}/summary_file_#{object_id}_#{Time.now.to_i}.txt"
|
14
|
+
self.filenames = []
|
15
|
+
self.threads_to_use = attrs[:threads_to_use] || 4
|
16
|
+
self.java_max_memory = attrs[:java_max_memory] || "-Xmx3g"
|
17
|
+
self.sentences = []
|
18
|
+
end
|
19
|
+
|
20
|
+
def write_output_file_and_summary_file
|
21
|
+
input_file = File.join(output_directory, "text_#{object_id}_#{Time.now.to_i}.txt")
|
22
|
+
filenames << input_file
|
23
|
+
File.open(input_file, "w"){|f| f.write(raw_text)}
|
24
|
+
File.open(summary_file, "w"){|f| f.write(filenames.join("\n"))}
|
25
|
+
end
|
26
|
+
|
27
|
+
def process_files_with_stanford_corenlp
|
28
|
+
deps = "./lib/ext" # dependencies directory: JARs, model files, taggers, etc.
|
29
|
+
classpath = "#{deps}/stanford-corenlp-3.4.jar:#{deps}/stanford-corenlp-3.4-models.jar:#{deps}/xom.jar:#{deps}/joda-time.jar:#{deps}/jollyday.jar:#{deps}/ejml-0.23.jar"
|
30
|
+
stanford_bin = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
|
31
|
+
annotators = "tokenize,ssplit,pos,lemma,parse,ner"
|
32
|
+
|
33
|
+
options = []
|
34
|
+
options << ["-cp", classpath]
|
35
|
+
options << [java_max_memory, stanford_bin]
|
36
|
+
options << ["-annotators", annotators]
|
37
|
+
options << ["-ner.useSUTime", 0] # turn this off
|
38
|
+
#options << ["-sutime.binders", 0]
|
39
|
+
options << ["-outputDirectory", output_directory]
|
40
|
+
options << ["-nthreads", threads_to_use]
|
41
|
+
options << ["-filelist", summary_file] # a file with one zone file per line
|
42
|
+
|
43
|
+
command = "java #{options.map{|x| x.join(" ")}.join(" ")}"
|
44
|
+
puts "Running command: \n\n#{command}\n\n"
|
45
|
+
`#{command}`
|
46
|
+
end
|
47
|
+
|
48
|
+
def build_treebank
|
49
|
+
filenames.each do |filename|
|
50
|
+
xml_file = "#{filename}.xml"
|
51
|
+
doc = Nokogiri.XML(File.open(xml_file))
|
52
|
+
doc.xpath("//sentences/sentence").each_with_index do |sentence_node, idx|
|
53
|
+
sentence = Sentence.new(index: idx)
|
54
|
+
self.sentences << sentence
|
55
|
+
sentence_node.xpath(".//token").each_with_index do |token_node, index|
|
56
|
+
text = token_node.children.at('word').text
|
57
|
+
text = Token.clean_stanford_text(text)
|
58
|
+
cleaned_stanford_lemma = Token.clean_stanford_text(token_node.children.at('lemma').text)
|
59
|
+
token_attrs = {
|
60
|
+
index: index,
|
61
|
+
text: text,
|
62
|
+
penn_treebank_tag: token_node.children.at('POS').text,
|
63
|
+
stanford_lemma: cleaned_stanford_lemma,
|
64
|
+
type: Token.token_subclass_from_text(text),
|
65
|
+
ner: token_node.children.at('NER').text
|
66
|
+
}
|
67
|
+
token = Token.token_subclass_from_text(text).new(token_attrs)
|
68
|
+
sentence.tokens << token
|
69
|
+
end
|
70
|
+
sentence_node.xpath(".//dependencies[@type='collapsed-dependencies']/dep").each do |dep_node|
|
71
|
+
dependent_index = dep_node.children.at('dependent').attr('idx').to_i - 1
|
72
|
+
governor_index = dep_node.children.at('governor').attr('idx').to_i - 1
|
73
|
+
if dependent_index >= 0 && governor_index >= 0
|
74
|
+
dependent = sentence.get_dependency_token_by_index(dependent_index),
|
75
|
+
governor = sentence.get_dependency_token_by_index(governor_index),
|
76
|
+
relation = dep_node.attr('type')
|
77
|
+
if dependent && governor && relation
|
78
|
+
token_dep = TokenDependency.new({
|
79
|
+
dependent: sentence.get_dependency_token_by_index(dependent_index),
|
80
|
+
governor: sentence.get_dependency_token_by_index(governor_index),
|
81
|
+
relation: dep_node.attr('type')
|
82
|
+
})
|
83
|
+
sentence.token_dependencies << token_dep
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
sentence_node.xpath(".//parse").each do |parse_node|
|
88
|
+
sentence.parse_tree_raw = parse_node.text
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
def parse
|
95
|
+
write_output_file_and_summary_file
|
96
|
+
process_files_with_stanford_corenlp
|
97
|
+
build_treebank
|
98
|
+
self
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
require "corenlp/version"
|
104
|
+
require "corenlp/sentence"
|
105
|
+
require "corenlp/token"
|
106
|
+
require "corenlp/token_dependency"
|
107
|
+
require "corenlp/enclitic"
|
108
|
+
require "corenlp/word"
|
109
|
+
require "corenlp/punctuation"
|
110
|
+
require "corenlp/number"
|
111
|
+
require "corenlp/downloader"
|
data/lib/corenlp/downloader.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
require 'bundler'
|
2
|
+
Bundler.require
|
1
3
|
require 'net/http'
|
2
|
-
require 'zip/zip'
|
3
4
|
require 'fileutils'
|
4
5
|
require 'uri'
|
6
|
+
require 'zip'
|
5
7
|
|
6
8
|
module Corenlp
|
7
9
|
class Downloader
|
@@ -14,7 +16,7 @@ module Corenlp
|
|
14
16
|
|
15
17
|
def extract
|
16
18
|
puts "extracting file..."
|
17
|
-
Zip::ZipFile.open(local_file) do |zip_file|
|
19
|
+
Zip::File.open(local_file) do |zip_file|
|
18
20
|
zip_file.each do |file|
|
19
21
|
file_path = File.join(destination, file.name)
|
20
22
|
zip_file.extract(file, file_path) unless File.exist?(file_path)
|
@@ -38,7 +40,7 @@ module Corenlp
|
|
38
40
|
|
39
41
|
def download
|
40
42
|
return unless url
|
41
|
-
puts "downloading zip file from url #{url} to #{destination}..."
|
43
|
+
puts "downloading zip file from url #{url}. Extracting files to #{destination}..."
|
42
44
|
self.local_file = File.basename(url)
|
43
45
|
uri = URI.parse(url)
|
44
46
|
if local_file && uri
|
data/lib/tasks/downloader.rake
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'corenlp'
|
2
|
+
|
3
|
+
desc "download Stanford CoreNLP dependencies files"
|
4
|
+
namespace :corenlp do
|
5
|
+
task :download_deps do
|
6
|
+
zip_file_url = ENV['CORENLP_DOWNLOAD_URL'] || "http://nlp.stanford.edu/software/stanford-corenlp-full-2014-06-16.zip"
|
7
|
+
destination = File.join(ENV['CORENLP_DEPS_DIR'] || './lib/ext/')
|
8
|
+
Corenlp::Downloader.new(zip_file_url, destination).download
|
9
|
+
end
|
10
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: corenlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lengio Corporation
|
@@ -102,6 +102,7 @@ executables: []
|
|
102
102
|
extensions: []
|
103
103
|
extra_rdoc_files: []
|
104
104
|
files:
|
105
|
+
- lib/corenlp.rb
|
105
106
|
- lib/corenlp/downloader.rb
|
106
107
|
- lib/corenlp/enclitic.rb
|
107
108
|
- lib/corenlp/number.rb
|
@@ -111,6 +112,7 @@ files:
|
|
111
112
|
- lib/corenlp/token_dependency.rb
|
112
113
|
- lib/corenlp/version.rb
|
113
114
|
- lib/corenlp/word.rb
|
115
|
+
- lib/tasks/downloader.rake
|
114
116
|
- test/downloader_test.rb
|
115
117
|
- test/enclitic_test.rb
|
116
118
|
- test/number_test.rb
|