corenlp 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 072b1b153bb4591c16e6242713e9b4431ba003da
4
- data.tar.gz: 0e71dd5289c128e0245f082ace874d29b51cd92f
3
+ metadata.gz: a88d19d7dc8eae9e7df59d4fe9b1e0c492aa194f
4
+ data.tar.gz: 5c6a32994f6720210b7a909c5b839503530793b3
5
5
  SHA512:
6
- metadata.gz: 7969ddc18c42ca6c832c06bf677df56212f4ec54bc35bebbd4d9e4925425804015ef2e333cbc0463a88af76204ed46f814c355c10c703586761c9a7db501442d
7
- data.tar.gz: 646eb3e03f42182e5a957fe6d52db3a7767cd52c9cf48bb4bd4bc36d0e07c03f1f4890c652addad58ccaf845f9a430c5d877143902b3824efc60c42e44757f31
6
+ metadata.gz: b5f185cda3feb604e97e5682440a01f763773e80631bf5a68aa19a1a35e2874999770dd8707af2d74cb9baf630491219fdba498284f37a784813510d48c6549f
7
+ data.tar.gz: 99cfc0054a47e92c517b6ba9025e063d6fb316a2788acf9255079ab828a7f046ff82057fee9b2b1a154127f7825ce6abcef0eae52a08e59175c0d94bcb58233b
@@ -0,0 +1,111 @@
1
+ require "nokogiri"
2
+ require "bundler"
3
+ Bundler.require
4
+
5
module Corenlp
  # Drives the Stanford CoreNLP pipeline end-to-end: writes the raw text to a
  # temp file, shells out to the CoreNLP Java process, then parses the XML it
  # produces into Sentence / Token / TokenDependency objects (see #parse).
  class Treebank
    attr_accessor :raw_text, :filenames, :output_directory, :summary_file, :threads_to_use, :java_max_memory, :sentences

    # attrs keys (all optional): :raw_text, :output_directory,
    # :threads_to_use, :java_max_memory (a JVM flag string, e.g. "-Xmx3g").
    def initialize(attrs = {})
      self.raw_text = attrs[:raw_text] || ""
      # FIX: the original assigned `self.filenames = []` twice; once is enough.
      self.filenames = []
      self.output_directory = attrs[:output_directory] || "./tmp/language_processing"
      # object_id + timestamp keeps concurrent Treebank instances from clobbering
      # each other's files in the shared output directory.
      self.summary_file = "#{output_directory}/summary_file_#{object_id}_#{Time.now.to_i}.txt"
      self.threads_to_use = attrs[:threads_to_use] || 4
      self.java_max_memory = attrs[:java_max_memory] || "-Xmx3g"
      self.sentences = []
    end

    # Writes raw_text to a uniquely-named input file and writes the summary
    # (file-list) file that CoreNLP's -filelist option consumes.
    def write_output_file_and_summary_file
      input_file = File.join(output_directory, "text_#{object_id}_#{Time.now.to_i}.txt")
      filenames << input_file
      File.open(input_file, "w") { |f| f.write(raw_text) }
      File.open(summary_file, "w") { |f| f.write(filenames.join("\n")) }
    end

    # Shells out to the Stanford CoreNLP pipeline, which writes one
    # "<input>.xml" file per input file into output_directory.
    def process_files_with_stanford_corenlp
      deps = "./lib/ext" # dependencies directory: JARs, model files, taggers, etc.
      classpath = "#{deps}/stanford-corenlp-3.4.jar:#{deps}/stanford-corenlp-3.4-models.jar:#{deps}/xom.jar:#{deps}/joda-time.jar:#{deps}/jollyday.jar:#{deps}/ejml-0.23.jar"
      stanford_bin = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
      annotators = "tokenize,ssplit,pos,lemma,parse,ner"

      options = []
      options << ["-cp", classpath]
      options << [java_max_memory, stanford_bin]
      options << ["-annotators", annotators]
      options << ["-ner.useSUTime", 0] # turn this off
      #options << ["-sutime.binders", 0]
      options << ["-outputDirectory", output_directory]
      options << ["-nthreads", threads_to_use]
      options << ["-filelist", summary_file] # a file with one zone file per line

      command = "java #{options.map{|x| x.join(" ")}.join(" ")}"
      puts "Running command: \n\n#{command}\n\n"
      `#{command}`
    end

    # Parses each CoreNLP XML output file into Sentence objects (with their
    # Tokens, collapsed TokenDependencies, and raw parse tree), appending them
    # to #sentences.
    def build_treebank
      filenames.each do |filename|
        # FIX: reconstructed from garbled source — CoreNLP names its output
        # after the input file, so the XML lives at "<input filename>.xml".
        xml_file = "#{filename}.xml"
        # FIX: File.read instead of an unclosed File.open (handle leak).
        doc = Nokogiri.XML(File.read(xml_file))
        doc.xpath("//sentences/sentence").each_with_index do |sentence_node, idx|
          sentence = Sentence.new(index: idx)
          self.sentences << sentence
          sentence_node.xpath(".//token").each_with_index do |token_node, index|
            text = Token.clean_stanford_text(token_node.children.at('word').text)
            cleaned_stanford_lemma = Token.clean_stanford_text(token_node.children.at('lemma').text)
            token_attrs = {
              index: index,
              text: text,
              penn_treebank_tag: token_node.children.at('POS').text,
              stanford_lemma: cleaned_stanford_lemma,
              type: Token.token_subclass_from_text(text),
              ner: token_node.children.at('NER').text
            }
            token = Token.token_subclass_from_text(text).new(token_attrs)
            sentence.tokens << token
          end
          sentence_node.xpath(".//dependencies[@type='collapsed-dependencies']/dep").each do |dep_node|
            # CoreNLP indices are 1-based; 0 is the artificial ROOT node, which
            # becomes -1 here and is skipped by the guard below.
            dependent_index = dep_node.children.at('dependent').attr('idx').to_i - 1
            governor_index = dep_node.children.at('governor').attr('idx').to_i - 1
            if dependent_index >= 0 && governor_index >= 0
              # FIX: the original had trailing commas after these assignments,
              # which collapsed them into a single array assignment (making
              # `dependent` an Array). Plain locals, reused below instead of
              # recomputing the lookups.
              dependent = sentence.get_dependency_token_by_index(dependent_index)
              governor = sentence.get_dependency_token_by_index(governor_index)
              relation = dep_node.attr('type')
              if dependent && governor && relation
                token_dep = TokenDependency.new({
                  dependent: dependent,
                  governor: governor,
                  relation: relation
                })
                sentence.token_dependencies << token_dep
              end
            end
          end
          sentence_node.xpath(".//parse").each do |parse_node|
            sentence.parse_tree_raw = parse_node.text
          end
        end
      end
    end

    # Convenience entry point: write inputs, run CoreNLP, build the treebank.
    # Returns self so callers can chain (e.g. Treebank.new(...).parse.sentences).
    def parse
      write_output_file_and_summary_file
      process_files_with_stanford_corenlp
      build_treebank
      self
    end
  end
end
102
+
103
+ require "corenlp/version"
104
+ require "corenlp/sentence"
105
+ require "corenlp/token"
106
+ require "corenlp/token_dependency"
107
+ require "corenlp/enclitic"
108
+ require "corenlp/word"
109
+ require "corenlp/punctuation"
110
+ require "corenlp/number"
111
+ require "corenlp/downloader"
@@ -1,7 +1,9 @@
1
+ require 'bundler'
2
+ Bundler.require
1
3
  require 'net/http'
2
- require 'zip/zip'
3
4
  require 'fileutils'
4
5
  require 'uri'
6
+ require 'zip'
5
7
 
6
8
  module Corenlp
7
9
  class Downloader
@@ -14,7 +16,7 @@ module Corenlp
14
16
 
15
17
  def extract
16
18
  puts "extracting file..."
17
- Zip::ZipFile.open(local_file) do |zip_file|
19
+ Zip::File.open(local_file) do |zip_file|
18
20
  zip_file.each do |file|
19
21
  file_path = File.join(destination, file.name)
20
22
  zip_file.extract(file, file_path) unless File.exist?(file_path)
@@ -38,7 +40,7 @@ module Corenlp
38
40
 
39
41
  def download
40
42
  return unless url
41
- puts "downloading zip file from url #{url} to #{destination}..."
43
+ puts "downloading zip file from url #{url}. Extracting files to #{destination}..."
42
44
  self.local_file = File.basename(url)
43
45
  uri = URI.parse(url)
44
46
  if local_file && uri
@@ -1,3 +1,3 @@
1
1
module Corenlp
  # Gem version (bumped 0.0.3 -> 0.0.4 in this release).
  VERSION = "0.0.4"
end
@@ -0,0 +1,10 @@
1
+ require 'corenlp'
2
+
3
# Rake task: fetch the Stanford CoreNLP zip (JARs, models, taggers) and
# unpack it into the gem's dependency directory. Both the source URL and the
# target directory can be overridden via environment variables.
desc "download Stanford CoreNLP dependencies files"
namespace :corenlp do
  task :download_deps do
    source_url = ENV['CORENLP_DOWNLOAD_URL'] || "http://nlp.stanford.edu/software/stanford-corenlp-full-2014-06-16.zip"
    target_dir = File.join(ENV['CORENLP_DEPS_DIR'] || './lib/ext/')
    Corenlp::Downloader.new(source_url, target_dir).download
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: corenlp
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Lengio Corporation
@@ -102,6 +102,7 @@ executables: []
102
102
  extensions: []
103
103
  extra_rdoc_files: []
104
104
  files:
105
+ - lib/corenlp.rb
105
106
  - lib/corenlp/downloader.rb
106
107
  - lib/corenlp/enclitic.rb
107
108
  - lib/corenlp/number.rb
@@ -111,6 +112,7 @@ files:
111
112
  - lib/corenlp/token_dependency.rb
112
113
  - lib/corenlp/version.rb
113
114
  - lib/corenlp/word.rb
115
+ - lib/tasks/downloader.rake
114
116
  - test/downloader_test.rb
115
117
  - test/enclitic_test.rb
116
118
  - test/number_test.rb