corenlp 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 072b1b153bb4591c16e6242713e9b4431ba003da
4
- data.tar.gz: 0e71dd5289c128e0245f082ace874d29b51cd92f
3
+ metadata.gz: a88d19d7dc8eae9e7df59d4fe9b1e0c492aa194f
4
+ data.tar.gz: 5c6a32994f6720210b7a909c5b839503530793b3
5
5
  SHA512:
6
- metadata.gz: 7969ddc18c42ca6c832c06bf677df56212f4ec54bc35bebbd4d9e4925425804015ef2e333cbc0463a88af76204ed46f814c355c10c703586761c9a7db501442d
7
- data.tar.gz: 646eb3e03f42182e5a957fe6d52db3a7767cd52c9cf48bb4bd4bc36d0e07c03f1f4890c652addad58ccaf845f9a430c5d877143902b3824efc60c42e44757f31
6
+ metadata.gz: b5f185cda3feb604e97e5682440a01f763773e80631bf5a68aa19a1a35e2874999770dd8707af2d74cb9baf630491219fdba498284f37a784813510d48c6549f
7
+ data.tar.gz: 99cfc0054a47e92c517b6ba9025e063d6fb316a2788acf9255079ab828a7f046ff82057fee9b2b1a154127f7825ce6abcef0eae52a08e59175c0d94bcb58233b
@@ -0,0 +1,111 @@
1
+ require "nokogiri"
2
+ require "bundler"
3
+ Bundler.require
4
+
5
module Corenlp
  # Drives a Stanford CoreNLP run over raw text: writes the text to disk,
  # shells out to the CoreNLP pipeline, then parses the XML output into
  # Sentence / Token / TokenDependency objects.
  class Treebank
    attr_accessor :raw_text, :filenames, :output_directory, :summary_file, :threads_to_use, :java_max_memory, :sentences

    # attrs:
    #   :raw_text         - text to annotate (default "")
    #   :output_directory - where input and CoreNLP output files are written
    #   :threads_to_use   - value for CoreNLP's -nthreads option (default 4)
    #   :java_max_memory  - JVM heap flag passed to java (default "-Xmx3g")
    def initialize(attrs = {})
      self.raw_text = attrs[:raw_text] || ""
      self.filenames = []
      self.output_directory = attrs[:output_directory] || "./tmp/language_processing"
      self.summary_file = "#{output_directory}/summary_file_#{object_id}_#{Time.now.to_i}.txt"
      self.threads_to_use = attrs[:threads_to_use] || 4
      self.java_max_memory = attrs[:java_max_memory] || "-Xmx3g"
      self.sentences = []
    end

    # Writes raw_text to a uniquely named input file and records that path in
    # the summary file (consumed by CoreNLP's -filelist: one file per line).
    def write_output_file_and_summary_file
      input_file = File.join(output_directory, "text_#{object_id}_#{Time.now.to_i}.txt")
      filenames << input_file
      File.open(input_file, "w"){|f| f.write(raw_text)}
      File.open(summary_file, "w"){|f| f.write(filenames.join("\n"))}
    end

    # Shells out to the Stanford CoreNLP pipeline over every file listed in
    # the summary file. XML output lands in output_directory.
    def process_files_with_stanford_corenlp
      deps = "./lib/ext" # dependencies directory: JARs, model files, taggers, etc.
      classpath = "#{deps}/stanford-corenlp-3.4.jar:#{deps}/stanford-corenlp-3.4-models.jar:#{deps}/xom.jar:#{deps}/joda-time.jar:#{deps}/jollyday.jar:#{deps}/ejml-0.23.jar"
      stanford_bin = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
      annotators = "tokenize,ssplit,pos,lemma,parse,ner"

      options = []
      options << ["-cp", classpath]
      options << [java_max_memory, stanford_bin]
      options << ["-annotators", annotators]
      options << ["-ner.useSUTime", 0] # turn this off
      #options << ["-sutime.binders", 0]
      options << ["-outputDirectory", output_directory]
      options << ["-nthreads", threads_to_use]
      options << ["-filelist", summary_file] # a file with one zone file per line

      command = "java #{options.map{|x| x.join(" ")}.join(" ")}"
      puts "Running command: \n\n#{command}\n\n"
      `#{command}`
    end

    # Parses each CoreNLP XML output file (written next to the input file as
    # "<input filename>.xml") into Sentence, Token, and TokenDependency
    # objects, appending to self.sentences.
    def build_treebank
      filenames.each do |filename|
        xml_file = "#{filename}.xml"
        # Block form closes the file handle (the old Nokogiri.XML(File.open(..)) leaked it).
        doc = File.open(xml_file){|f| Nokogiri.XML(f)}
        doc.xpath("//sentences/sentence").each_with_index do |sentence_node, idx|
          sentence = Sentence.new(index: idx)
          self.sentences << sentence
          sentence_node.xpath(".//token").each_with_index do |token_node, index|
            text = Token.clean_stanford_text(token_node.children.at('word').text)
            cleaned_stanford_lemma = Token.clean_stanford_text(token_node.children.at('lemma').text)
            token_attrs = {
              index: index,
              text: text,
              penn_treebank_tag: token_node.children.at('POS').text,
              stanford_lemma: cleaned_stanford_lemma,
              type: Token.token_subclass_from_text(text),
              ner: token_node.children.at('NER').text
            }
            sentence.tokens << Token.token_subclass_from_text(text).new(token_attrs)
          end
          sentence_node.xpath(".//dependencies[@type='collapsed-dependencies']/dep").each do |dep_node|
            # CoreNLP token idx is 1-based; idx 0 is the artificial ROOT node.
            dependent_index = dep_node.children.at('dependent').attr('idx').to_i - 1
            governor_index = dep_node.children.at('governor').attr('idx').to_i - 1
            if dependent_index >= 0 && governor_index >= 0
              # Bug fix: these assignments previously ended in trailing commas,
              # which made the first a multiple-assignment collecting all three
              # into an array — and forced a second, redundant token lookup below.
              dependent = sentence.get_dependency_token_by_index(dependent_index)
              governor = sentence.get_dependency_token_by_index(governor_index)
              relation = dep_node.attr('type')
              if dependent && governor && relation
                sentence.token_dependencies << TokenDependency.new({
                  dependent: dependent,
                  governor: governor,
                  relation: relation
                })
              end
            end
          end
          sentence_node.xpath(".//parse").each do |parse_node|
            sentence.parse_tree_raw = parse_node.text
          end
        end
      end
    end

    # Full pipeline: write input files, run CoreNLP, parse the XML. Returns self.
    def parse
      write_output_file_and_summary_file
      process_files_with_stanford_corenlp
      build_treebank
      self
    end
  end
end
102
+
103
+ require "corenlp/version"
104
+ require "corenlp/sentence"
105
+ require "corenlp/token"
106
+ require "corenlp/token_dependency"
107
+ require "corenlp/enclitic"
108
+ require "corenlp/word"
109
+ require "corenlp/punctuation"
110
+ require "corenlp/number"
111
+ require "corenlp/downloader"
@@ -1,7 +1,9 @@
1
+ require 'bundler'
2
+ Bundler.require
1
3
  require 'net/http'
2
- require 'zip/zip'
3
4
  require 'fileutils'
4
5
  require 'uri'
6
+ require 'zip'
5
7
 
6
8
  module Corenlp
7
9
  class Downloader
@@ -14,7 +16,7 @@ module Corenlp
14
16
 
15
17
  def extract
16
18
  puts "extracting file..."
17
- Zip::ZipFile.open(local_file) do |zip_file|
19
+ Zip::File.open(local_file) do |zip_file|
18
20
  zip_file.each do |file|
19
21
  file_path = File.join(destination, file.name)
20
22
  zip_file.extract(file, file_path) unless File.exist?(file_path)
@@ -38,7 +40,7 @@ module Corenlp
38
40
 
39
41
  def download
40
42
  return unless url
41
- puts "downloading zip file from url #{url} to #{destination}..."
43
+ puts "downloading zip file from url #{url}. Extracting files to #{destination}..."
42
44
  self.local_file = File.basename(url)
43
45
  uri = URI.parse(url)
44
46
  if local_file && uri
@@ -1,3 +1,3 @@
1
1
  module Corenlp
2
- VERSION = "0.0.3"
2
+ VERSION = "0.0.4"
3
3
  end
@@ -0,0 +1,10 @@
1
require 'corenlp'

desc "download Stanford CoreNLP dependencies files"
namespace :corenlp do
  task :download_deps do
    # Both the archive URL and the target directory can be overridden via ENV.
    archive_url = ENV['CORENLP_DOWNLOAD_URL'] || "http://nlp.stanford.edu/software/stanford-corenlp-full-2014-06-16.zip"
    target_dir  = File.join(ENV['CORENLP_DEPS_DIR'] || './lib/ext/')
    Corenlp::Downloader.new(archive_url, target_dir).download
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: corenlp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lengio Corporation
@@ -102,6 +102,7 @@ executables: []
102
102
  extensions: []
103
103
  extra_rdoc_files: []
104
104
  files:
105
+ - lib/corenlp.rb
105
106
  - lib/corenlp/downloader.rb
106
107
  - lib/corenlp/enclitic.rb
107
108
  - lib/corenlp/number.rb
@@ -111,6 +112,7 @@ files:
111
112
  - lib/corenlp/token_dependency.rb
112
113
  - lib/corenlp/version.rb
113
114
  - lib/corenlp/word.rb
115
+ - lib/tasks/downloader.rake
114
116
  - test/downloader_test.rb
115
117
  - test/enclitic_test.rb
116
118
  - test/number_test.rb