corenlp 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/corenlp.rb +111 -0
- data/lib/corenlp/downloader.rb +5 -3
- data/lib/corenlp/version.rb +1 -1
- data/lib/tasks/downloader.rake +10 -0
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a88d19d7dc8eae9e7df59d4fe9b1e0c492aa194f
|
4
|
+
data.tar.gz: 5c6a32994f6720210b7a909c5b839503530793b3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b5f185cda3feb604e97e5682440a01f763773e80631bf5a68aa19a1a35e2874999770dd8707af2d74cb9baf630491219fdba498284f37a784813510d48c6549f
|
7
|
+
data.tar.gz: 99cfc0054a47e92c517b6ba9025e063d6fb316a2788acf9255079ab828a7f046ff82057fee9b2b1a154127f7825ce6abcef0eae52a08e59175c0d94bcb58233b
|
data/lib/corenlp.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require "bundler"
|
3
|
+
Bundler.require
|
4
|
+
|
5
|
+
module Corenlp
|
6
|
+
class Treebank
|
7
|
+
attr_accessor :raw_text, :filenames, :output_directory, :summary_file, :threads_to_use, :java_max_memory, :sentences
|
8
|
+
|
9
|
+
def initialize(attrs = {})
|
10
|
+
self.raw_text = attrs[:raw_text] || ""
|
11
|
+
self.filenames = []
|
12
|
+
self.output_directory = attrs[:output_directory] || "./tmp/language_processing"
|
13
|
+
self.summary_file = "#{output_directory}/summary_file_#{object_id}_#{Time.now.to_i}.txt"
|
14
|
+
self.filenames = []
|
15
|
+
self.threads_to_use = attrs[:threads_to_use] || 4
|
16
|
+
self.java_max_memory = attrs[:java_max_memory] || "-Xmx3g"
|
17
|
+
self.sentences = []
|
18
|
+
end
|
19
|
+
|
20
|
+
def write_output_file_and_summary_file
|
21
|
+
input_file = File.join(output_directory, "text_#{object_id}_#{Time.now.to_i}.txt")
|
22
|
+
filenames << input_file
|
23
|
+
File.open(input_file, "w"){|f| f.write(raw_text)}
|
24
|
+
File.open(summary_file, "w"){|f| f.write(filenames.join("\n"))}
|
25
|
+
end
|
26
|
+
|
27
|
+
def process_files_with_stanford_corenlp
|
28
|
+
deps = "./lib/ext" # dependencies directory: JARs, model files, taggers, etc.
|
29
|
+
classpath = "#{deps}/stanford-corenlp-3.4.jar:#{deps}/stanford-corenlp-3.4-models.jar:#{deps}/xom.jar:#{deps}/joda-time.jar:#{deps}/jollyday.jar:#{deps}/ejml-0.23.jar"
|
30
|
+
stanford_bin = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
|
31
|
+
annotators = "tokenize,ssplit,pos,lemma,parse,ner"
|
32
|
+
|
33
|
+
options = []
|
34
|
+
options << ["-cp", classpath]
|
35
|
+
options << [java_max_memory, stanford_bin]
|
36
|
+
options << ["-annotators", annotators]
|
37
|
+
options << ["-ner.useSUTime", 0] # turn this off
|
38
|
+
#options << ["-sutime.binders", 0]
|
39
|
+
options << ["-outputDirectory", output_directory]
|
40
|
+
options << ["-nthreads", threads_to_use]
|
41
|
+
options << ["-filelist", summary_file] # a file with one zone file per line
|
42
|
+
|
43
|
+
command = "java #{options.map{|x| x.join(" ")}.join(" ")}"
|
44
|
+
puts "Running command: \n\n#{command}\n\n"
|
45
|
+
`#{command}`
|
46
|
+
end
|
47
|
+
|
48
|
+
def build_treebank
|
49
|
+
filenames.each do |filename|
|
50
|
+
xml_file = "#{filename}.xml"
|
51
|
+
doc = Nokogiri.XML(File.open(xml_file))
|
52
|
+
doc.xpath("//sentences/sentence").each_with_index do |sentence_node, idx|
|
53
|
+
sentence = Sentence.new(index: idx)
|
54
|
+
self.sentences << sentence
|
55
|
+
sentence_node.xpath(".//token").each_with_index do |token_node, index|
|
56
|
+
text = token_node.children.at('word').text
|
57
|
+
text = Token.clean_stanford_text(text)
|
58
|
+
cleaned_stanford_lemma = Token.clean_stanford_text(token_node.children.at('lemma').text)
|
59
|
+
token_attrs = {
|
60
|
+
index: index,
|
61
|
+
text: text,
|
62
|
+
penn_treebank_tag: token_node.children.at('POS').text,
|
63
|
+
stanford_lemma: cleaned_stanford_lemma,
|
64
|
+
type: Token.token_subclass_from_text(text),
|
65
|
+
ner: token_node.children.at('NER').text
|
66
|
+
}
|
67
|
+
token = Token.token_subclass_from_text(text).new(token_attrs)
|
68
|
+
sentence.tokens << token
|
69
|
+
end
|
70
|
+
sentence_node.xpath(".//dependencies[@type='collapsed-dependencies']/dep").each do |dep_node|
|
71
|
+
dependent_index = dep_node.children.at('dependent').attr('idx').to_i - 1
|
72
|
+
governor_index = dep_node.children.at('governor').attr('idx').to_i - 1
|
73
|
+
if dependent_index >= 0 && governor_index >= 0
|
74
|
+
dependent = sentence.get_dependency_token_by_index(dependent_index),
|
75
|
+
governor = sentence.get_dependency_token_by_index(governor_index),
|
76
|
+
relation = dep_node.attr('type')
|
77
|
+
if dependent && governor && relation
|
78
|
+
token_dep = TokenDependency.new({
|
79
|
+
dependent: sentence.get_dependency_token_by_index(dependent_index),
|
80
|
+
governor: sentence.get_dependency_token_by_index(governor_index),
|
81
|
+
relation: dep_node.attr('type')
|
82
|
+
})
|
83
|
+
sentence.token_dependencies << token_dep
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
sentence_node.xpath(".//parse").each do |parse_node|
|
88
|
+
sentence.parse_tree_raw = parse_node.text
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
def parse
|
95
|
+
write_output_file_and_summary_file
|
96
|
+
process_files_with_stanford_corenlp
|
97
|
+
build_treebank
|
98
|
+
self
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
require "corenlp/version"
|
104
|
+
require "corenlp/sentence"
|
105
|
+
require "corenlp/token"
|
106
|
+
require "corenlp/token_dependency"
|
107
|
+
require "corenlp/enclitic"
|
108
|
+
require "corenlp/word"
|
109
|
+
require "corenlp/punctuation"
|
110
|
+
require "corenlp/number"
|
111
|
+
require "corenlp/downloader"
|
data/lib/corenlp/downloader.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
require 'bundler'
|
2
|
+
Bundler.require
|
1
3
|
require 'net/http'
|
2
|
-
require 'zip/zip'
|
3
4
|
require 'fileutils'
|
4
5
|
require 'uri'
|
6
|
+
require 'zip'
|
5
7
|
|
6
8
|
module Corenlp
|
7
9
|
class Downloader
|
@@ -14,7 +16,7 @@ module Corenlp
|
|
14
16
|
|
15
17
|
def extract
|
16
18
|
puts "extracting file..."
|
17
|
-
Zip::ZipFile.open(local_file) do |zip_file|
|
19
|
+
Zip::File.open(local_file) do |zip_file|
|
18
20
|
zip_file.each do |file|
|
19
21
|
file_path = File.join(destination, file.name)
|
20
22
|
zip_file.extract(file, file_path) unless File.exist?(file_path)
|
@@ -38,7 +40,7 @@ module Corenlp
|
|
38
40
|
|
39
41
|
def download
|
40
42
|
return unless url
|
41
|
-
puts "downloading zip file from url #{url} to #{destination}..."
|
43
|
+
puts "downloading zip file from url #{url}. Extracting files to #{destination}..."
|
42
44
|
self.local_file = File.basename(url)
|
43
45
|
uri = URI.parse(url)
|
44
46
|
if local_file && uri
|
data/lib/tasks/downloader.rake
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'corenlp'
|
2
|
+
|
3
|
+
desc "download Stanford CoreNLP dependencies files"
|
4
|
+
namespace :corenlp do
|
5
|
+
task :download_deps do
|
6
|
+
zip_file_url = ENV['CORENLP_DOWNLOAD_URL'] || "http://nlp.stanford.edu/software/stanford-corenlp-full-2014-06-16.zip"
|
7
|
+
destination = File.join(ENV['CORENLP_DEPS_DIR'] || './lib/ext/')
|
8
|
+
Corenlp::Downloader.new(zip_file_url, destination).download
|
9
|
+
end
|
10
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: corenlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lengio Corporation
|
@@ -102,6 +102,7 @@ executables: []
|
|
102
102
|
extensions: []
|
103
103
|
extra_rdoc_files: []
|
104
104
|
files:
|
105
|
+
- lib/corenlp.rb
|
105
106
|
- lib/corenlp/downloader.rb
|
106
107
|
- lib/corenlp/enclitic.rb
|
107
108
|
- lib/corenlp/number.rb
|
@@ -111,6 +112,7 @@ files:
|
|
111
112
|
- lib/corenlp/token_dependency.rb
|
112
113
|
- lib/corenlp/version.rb
|
113
114
|
- lib/corenlp/word.rb
|
115
|
+
- lib/tasks/downloader.rake
|
114
116
|
- test/downloader_test.rb
|
115
117
|
- test/enclitic_test.rb
|
116
118
|
- test/number_test.rb
|