corenlp 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/corenlp.rb +111 -0
- data/lib/corenlp/downloader.rb +5 -3
- data/lib/corenlp/version.rb +1 -1
- data/lib/tasks/downloader.rake +10 -0
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a88d19d7dc8eae9e7df59d4fe9b1e0c492aa194f
|
4
|
+
data.tar.gz: 5c6a32994f6720210b7a909c5b839503530793b3
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b5f185cda3feb604e97e5682440a01f763773e80631bf5a68aa19a1a35e2874999770dd8707af2d74cb9baf630491219fdba498284f37a784813510d48c6549f
|
7
|
+
data.tar.gz: 99cfc0054a47e92c517b6ba9025e063d6fb316a2788acf9255079ab828a7f046ff82057fee9b2b1a154127f7825ce6abcef0eae52a08e59175c0d94bcb58233b
|
data/lib/corenlp.rb
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require "bundler"
|
3
|
+
Bundler.require
|
4
|
+
|
5
|
+
module Corenlp
|
6
|
+
class Treebank
|
7
|
+
attr_accessor :raw_text, :filenames, :output_directory, :summary_file, :threads_to_use, :java_max_memory, :sentences
|
8
|
+
|
9
|
+
def initialize(attrs = {})
|
10
|
+
self.raw_text = attrs[:raw_text] || ""
|
11
|
+
self.filenames = []
|
12
|
+
self.output_directory = attrs[:output_directory] || "./tmp/language_processing"
|
13
|
+
self.summary_file = "#{output_directory}/summary_file_#{object_id}_#{Time.now.to_i}.txt"
|
14
|
+
self.filenames = []
|
15
|
+
self.threads_to_use = attrs[:threads_to_use] || 4
|
16
|
+
self.java_max_memory = attrs[:java_max_memory] || "-Xmx3g"
|
17
|
+
self.sentences = []
|
18
|
+
end
|
19
|
+
|
20
|
+
def write_output_file_and_summary_file
|
21
|
+
input_file = File.join(output_directory, "text_#{object_id}_#{Time.now.to_i}.txt")
|
22
|
+
filenames << input_file
|
23
|
+
File.open(input_file, "w"){|f| f.write(raw_text)}
|
24
|
+
File.open(summary_file, "w"){|f| f.write(filenames.join("\n"))}
|
25
|
+
end
|
26
|
+
|
27
|
+
def process_files_with_stanford_corenlp
|
28
|
+
deps = "./lib/ext" # dependencies directory: JARs, model files, taggers, etc.
|
29
|
+
classpath = "#{deps}/stanford-corenlp-3.4.jar:#{deps}/stanford-corenlp-3.4-models.jar:#{deps}/xom.jar:#{deps}/joda-time.jar:#{deps}/jollyday.jar:#{deps}/ejml-0.23.jar"
|
30
|
+
stanford_bin = "edu.stanford.nlp.pipeline.StanfordCoreNLP"
|
31
|
+
annotators = "tokenize,ssplit,pos,lemma,parse,ner"
|
32
|
+
|
33
|
+
options = []
|
34
|
+
options << ["-cp", classpath]
|
35
|
+
options << [java_max_memory, stanford_bin]
|
36
|
+
options << ["-annotators", annotators]
|
37
|
+
options << ["-ner.useSUTime", 0] # turn this off
|
38
|
+
#options << ["-sutime.binders", 0]
|
39
|
+
options << ["-outputDirectory", output_directory]
|
40
|
+
options << ["-nthreads", threads_to_use]
|
41
|
+
options << ["-filelist", summary_file] # a file with one zone file per line
|
42
|
+
|
43
|
+
command = "java #{options.map{|x| x.join(" ")}.join(" ")}"
|
44
|
+
puts "Running command: \n\n#{command}\n\n"
|
45
|
+
`#{command}`
|
46
|
+
end
|
47
|
+
|
48
|
+
def build_treebank
|
49
|
+
filenames.each do |filename|
|
50
|
+
xml_file = "#{filename}.xml"
|
51
|
+
doc = Nokogiri.XML(File.open(xml_file))
|
52
|
+
doc.xpath("//sentences/sentence").each_with_index do |sentence_node, idx|
|
53
|
+
sentence = Sentence.new(index: idx)
|
54
|
+
self.sentences << sentence
|
55
|
+
sentence_node.xpath(".//token").each_with_index do |token_node, index|
|
56
|
+
text = token_node.children.at('word').text
|
57
|
+
text = Token.clean_stanford_text(text)
|
58
|
+
cleaned_stanford_lemma = Token.clean_stanford_text(token_node.children.at('lemma').text)
|
59
|
+
token_attrs = {
|
60
|
+
index: index,
|
61
|
+
text: text,
|
62
|
+
penn_treebank_tag: token_node.children.at('POS').text,
|
63
|
+
stanford_lemma: cleaned_stanford_lemma,
|
64
|
+
type: Token.token_subclass_from_text(text),
|
65
|
+
ner: token_node.children.at('NER').text
|
66
|
+
}
|
67
|
+
token = Token.token_subclass_from_text(text).new(token_attrs)
|
68
|
+
sentence.tokens << token
|
69
|
+
end
|
70
|
+
sentence_node.xpath(".//dependencies[@type='collapsed-dependencies']/dep").each do |dep_node|
|
71
|
+
dependent_index = dep_node.children.at('dependent').attr('idx').to_i - 1
|
72
|
+
governor_index = dep_node.children.at('governor').attr('idx').to_i - 1
|
73
|
+
if dependent_index >= 0 && governor_index >= 0
|
74
|
+
dependent = sentence.get_dependency_token_by_index(dependent_index),
|
75
|
+
governor = sentence.get_dependency_token_by_index(governor_index),
|
76
|
+
relation = dep_node.attr('type')
|
77
|
+
if dependent && governor && relation
|
78
|
+
token_dep = TokenDependency.new({
|
79
|
+
dependent: sentence.get_dependency_token_by_index(dependent_index),
|
80
|
+
governor: sentence.get_dependency_token_by_index(governor_index),
|
81
|
+
relation: dep_node.attr('type')
|
82
|
+
})
|
83
|
+
sentence.token_dependencies << token_dep
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
sentence_node.xpath(".//parse").each do |parse_node|
|
88
|
+
sentence.parse_tree_raw = parse_node.text
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
def parse
|
95
|
+
write_output_file_and_summary_file
|
96
|
+
process_files_with_stanford_corenlp
|
97
|
+
build_treebank
|
98
|
+
self
|
99
|
+
end
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
require "corenlp/version"
|
104
|
+
require "corenlp/sentence"
|
105
|
+
require "corenlp/token"
|
106
|
+
require "corenlp/token_dependency"
|
107
|
+
require "corenlp/enclitic"
|
108
|
+
require "corenlp/word"
|
109
|
+
require "corenlp/punctuation"
|
110
|
+
require "corenlp/number"
|
111
|
+
require "corenlp/downloader"
|
data/lib/corenlp/downloader.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
|
+
require 'bundler'
|
2
|
+
Bundler.require
|
1
3
|
require 'net/http'
|
2
|
-
require 'zip/zip'
|
3
4
|
require 'fileutils'
|
4
5
|
require 'uri'
|
6
|
+
require 'zip'
|
5
7
|
|
6
8
|
module Corenlp
|
7
9
|
class Downloader
|
@@ -14,7 +16,7 @@ module Corenlp
|
|
14
16
|
|
15
17
|
def extract
|
16
18
|
puts "extracting file..."
|
17
|
-
Zip::ZipFile.open(local_file) do |zip_file|
|
19
|
+
Zip::File.open(local_file) do |zip_file|
|
18
20
|
zip_file.each do |file|
|
19
21
|
file_path = File.join(destination, file.name)
|
20
22
|
zip_file.extract(file, file_path) unless File.exist?(file_path)
|
@@ -38,7 +40,7 @@ module Corenlp
|
|
38
40
|
|
39
41
|
def download
|
40
42
|
return unless url
|
41
|
-
puts "downloading zip file from url #{url} to #{destination}..."
|
43
|
+
puts "downloading zip file from url #{url}. Extracting files to #{destination}..."
|
42
44
|
self.local_file = File.basename(url)
|
43
45
|
uri = URI.parse(url)
|
44
46
|
if local_file && uri
|
data/lib/tasks/downloader.rake
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
require 'corenlp'
|
2
|
+
|
3
|
+
desc "download Stanford CoreNLP dependencies files"
|
4
|
+
namespace :corenlp do
|
5
|
+
task :download_deps do
|
6
|
+
zip_file_url = ENV['CORENLP_DOWNLOAD_URL'] || "http://nlp.stanford.edu/software/stanford-corenlp-full-2014-06-16.zip"
|
7
|
+
destination = File.join(ENV['CORENLP_DEPS_DIR'] || './lib/ext/')
|
8
|
+
Corenlp::Downloader.new(zip_file_url, destination).download
|
9
|
+
end
|
10
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: corenlp
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Lengio Corporation
|
@@ -102,6 +102,7 @@ executables: []
|
|
102
102
|
extensions: []
|
103
103
|
extra_rdoc_files: []
|
104
104
|
files:
|
105
|
+
- lib/corenlp.rb
|
105
106
|
- lib/corenlp/downloader.rb
|
106
107
|
- lib/corenlp/enclitic.rb
|
107
108
|
- lib/corenlp/number.rb
|
@@ -111,6 +112,7 @@ files:
|
|
111
112
|
- lib/corenlp/token_dependency.rb
|
112
113
|
- lib/corenlp/version.rb
|
113
114
|
- lib/corenlp/word.rb
|
115
|
+
- lib/tasks/downloader.rake
|
114
116
|
- test/downloader_test.rb
|
115
117
|
- test/enclitic_test.rb
|
116
118
|
- test/number_test.rb
|