nlp_toolz 1.1.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +3 -1
- data/Rakefile +3 -1
- data/bin/nlp_toolz +50 -3
- data/lib/nlp_toolz.rb +27 -10
- data/lib/nlp_toolz/home.rb +3 -0
- data/lib/nlp_toolz/language.rb +10 -11
- data/lib/nlp_toolz/load_jars.rb +3 -8
- data/lib/nlp_toolz/pos_tags.rb +6 -4
- data/lib/nlp_toolz/sentences.rb +5 -3
- data/lib/nlp_toolz/tokens.rb +6 -4
- data/lib/nlp_toolz/version.rb +1 -1
- data/nlp_toolz.gemspec +4 -0
- metadata +31 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2ce637d8ddb8a8ad0b62c3998d016d3552b0cf58
|
4
|
+
data.tar.gz: be30bf6df57f309a050ebd0dffa6b6e5bcf85d48
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 453750f9759a44a70939feca45f9bf85d1ec50cf44c9794da66614dcbf2e10f6f97f7e0b3d2709489547c409a92f67dc2bbaf88b4f494b0f459d1d69269531bb
|
7
|
+
data.tar.gz: ad7179da66ff954010aca8245180cedf688d49531d6963cfaae92bfa944bc4e240fef3cb41cee471757ab893d2dd1847d2ca6fa0a66c33e76541e10513c2845a
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -18,7 +18,9 @@ Or install it yourself as:
|
|
18
18
|
|
19
19
|
$ gem install nlp_toolz
|
20
20
|
|
21
|
-
Download jars and model files from [Dropbox](https://www.dropbox.com/sh/1layyjgf5h0wwi3/s2SHAnfVhs) and unzip it in gem folder
|
21
|
+
~~Download jars and model files from [Dropbox](https://www.dropbox.com/sh/1layyjgf5h0wwi3/s2SHAnfVhs) and unzip it in gem folder.~~
|
22
|
+
|
23
|
+
Simplified installation of `models` and `jars`: just run `nlp_toolz init` from the command line.
|
22
24
|
|
23
25
|
## Usage
|
24
26
|
|
data/Rakefile
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
#!/usr/bin/env rake
|
2
2
|
require "bundler/gem_tasks"
|
3
|
-
require "awesome_print"
|
4
3
|
require 'rspec/core'
|
5
4
|
require 'rspec/core/rake_task'
|
5
|
+
require "nlp_toolz"
|
6
|
+
|
6
7
|
RSpec::Core::RakeTask.new(:spec) do |spec|
|
8
|
+
NlpToolz.check_dependencies
|
7
9
|
spec.pattern = FileList['spec/**/*_spec.rb']
|
8
10
|
end
|
9
11
|
|
data/bin/nlp_toolz
CHANGED
@@ -14,8 +14,7 @@ include GLI::App
|
|
14
14
|
|
15
15
|
# helper methods
|
16
16
|
def get_out(this)
|
17
|
-
|
18
|
-
$stdout.puts this unless $stdout.tty?
|
17
|
+
$stdout.puts this
|
19
18
|
end
|
20
19
|
|
21
20
|
def get_in(input_arg)
|
@@ -32,6 +31,23 @@ def get_file(name)
|
|
32
31
|
file.force_encoding("utf-8") unless file.nil?
|
33
32
|
end
|
34
33
|
|
34
|
+
def unzip_file (file, destination)
|
35
|
+
Zip::ZipFile.open(file) { |zip_file|
|
36
|
+
zip_file.each { |f|
|
37
|
+
f_path=File.join(destination, f.name)
|
38
|
+
FileUtils.mkdir_p(File.dirname(f_path))
|
39
|
+
zip_file.extract(f, f_path) unless File.exist?(f_path)
|
40
|
+
}
|
41
|
+
}
|
42
|
+
end
|
43
|
+
|
44
|
+
def add_path_in_lanikernel
|
45
|
+
file = IO.read(File.join(NlpToolz::HOME,'models','language','lanikernel.ini'))
|
46
|
+
file.sub!('WordlistDir=models',"WordlistDir=#{NlpToolz::HOME}/models")
|
47
|
+
file.sub!('BlacklistFile=models',"BlacklistFile=#{NlpToolz::HOME}/models")
|
48
|
+
file.sub!('MappingFile=models',"MappingFile=#{NlpToolz::HOME}/models")
|
49
|
+
File.open(File.join(NlpToolz::HOME,'models','language','lanikernel.ini'), 'w') {|f| f.write(file) }
|
50
|
+
end
|
35
51
|
|
36
52
|
program_desc 'running basic NLP tasks'
|
37
53
|
|
@@ -40,7 +56,34 @@ version NlpToolz::VERSION
|
|
40
56
|
desc 'initial setup'
|
41
57
|
command :init do |c|
|
42
58
|
c.action do |global_options,options,args|
|
43
|
-
|
59
|
+
unless Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
|
60
|
+
gem_home = NlpToolz::HOME
|
61
|
+
$stdout.print "download and unzip to: ".green
|
62
|
+
$stdout.puts "#{gem_home}".blue
|
63
|
+
[
|
64
|
+
'https://dl.dropboxusercontent.com/sh/1layyjgf5h0wwi3/AACw8Y04KnFotOpBkzcfLxmwa/jars.zip',
|
65
|
+
'https://dl.dropboxusercontent.com/sh/1layyjgf5h0wwi3/AADUSMRMVg3n54Djdy9BWYVEa/models.zip',
|
66
|
+
].each do |link|
|
67
|
+
loaded_file = link.split('/').last.sub('?dl=0','')
|
68
|
+
$stdout.print "download: ".green
|
69
|
+
$stdout.puts "#{loaded_file}".blue
|
70
|
+
|
71
|
+
Schiphol.download(
|
72
|
+
link,
|
73
|
+
# Default values
|
74
|
+
:download_folder => "#{gem_home}",
|
75
|
+
:show_progress => true,
|
76
|
+
:max_tries => 3
|
77
|
+
)
|
78
|
+
$stdout.puts "extracting …".green
|
79
|
+
unzip_file(File.join(gem_home,loaded_file),gem_home)
|
80
|
+
FileUtils.rm(File.join(gem_home,loaded_file))
|
81
|
+
end
|
82
|
+
$stdout.puts "add abbsolute path to language config".green
|
83
|
+
add_path_in_lanikernel
|
84
|
+
else
|
85
|
+
$stdout.puts "files exist".green
|
86
|
+
end
|
44
87
|
end
|
45
88
|
end
|
46
89
|
|
@@ -51,6 +94,7 @@ command :sent do |c|
|
|
51
94
|
c.arg_name '<path/to/file>'
|
52
95
|
c.flag [:f,:file]
|
53
96
|
c.action do |global_options,options,args|
|
97
|
+
NlpToolz.check_dependencies
|
54
98
|
input = get_in(options[:f] || args.first)
|
55
99
|
get_out NlpToolz.get_sentences(input)
|
56
100
|
end
|
@@ -63,6 +107,7 @@ command :parse do |c|
|
|
63
107
|
c.arg_name '<path/to/file>'
|
64
108
|
c.flag [:f,:file]
|
65
109
|
c.action do |global_options,options,args|
|
110
|
+
NlpToolz.check_dependencies
|
66
111
|
input = get_in(options[:f] || args.first)
|
67
112
|
get_out NlpToolz.parse_text(input)
|
68
113
|
end
|
@@ -75,6 +120,7 @@ command :tag do |c|
|
|
75
120
|
c.arg_name '<path/to/file>'
|
76
121
|
c.flag [:f,:file]
|
77
122
|
c.action do |global_options,options,args|
|
123
|
+
NlpToolz.check_dependencies
|
78
124
|
input = get_in(options[:f] || args.first)
|
79
125
|
get_out NlpToolz.tag_text(input)
|
80
126
|
end
|
@@ -87,6 +133,7 @@ command :token do |c|
|
|
87
133
|
c.arg_name '<path/to/file>'
|
88
134
|
c.flag [:f,:file]
|
89
135
|
c.action do |global_options,options,args|
|
136
|
+
NlpToolz.check_dependencies
|
90
137
|
input = get_in(options[:f] || args.first)
|
91
138
|
get_out NlpToolz.tokenize_text(input)
|
92
139
|
end
|
data/lib/nlp_toolz.rb
CHANGED
@@ -9,6 +9,9 @@ require "rjb"
|
|
9
9
|
# external requirements
|
10
10
|
require "awesome_print"
|
11
11
|
require "multi_json"
|
12
|
+
# for downloading models and jars
|
13
|
+
require "schiphol"
|
14
|
+
require "zip/zip"
|
12
15
|
|
13
16
|
# internal requirements
|
14
17
|
require "nlp_toolz/version"
|
@@ -17,6 +20,7 @@ require "nlp_toolz/helpers/string_extended"
|
|
17
20
|
require "nlp_toolz/helpers/tmp_file"
|
18
21
|
|
19
22
|
# NLP Tools
|
23
|
+
require "nlp_toolz/home"
|
20
24
|
require "nlp_toolz/load_jars"
|
21
25
|
require "nlp_toolz/language"
|
22
26
|
require "nlp_toolz/sentences"
|
@@ -24,29 +28,41 @@ require "nlp_toolz/pos_tags"
|
|
24
28
|
require "nlp_toolz/tokens"
|
25
29
|
require "nlp_toolz/parser"
|
26
30
|
|
31
|
+
|
27
32
|
module NlpToolz
|
33
|
+
|
34
|
+
|
28
35
|
module_function
|
29
36
|
|
37
|
+
def check_dependencies
|
38
|
+
unless Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
|
39
|
+
$stdout.puts "\n--> models and jars not installed,"
|
40
|
+
$stdout.puts " install it by running:"
|
41
|
+
$stdout.puts "--> $ nlp_toolz init\n".green
|
42
|
+
exit
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
30
46
|
def get_lang(input)
|
31
47
|
NlpToolz::Language.get_language(input)
|
32
48
|
end
|
33
|
-
|
49
|
+
|
34
50
|
def get_sentences(input,lang = nil)
|
35
51
|
text = NlpToolz::Sentences.new(input,lang)
|
36
52
|
text.split_into_sentences if text.has_model?
|
37
53
|
end
|
38
|
-
|
54
|
+
|
39
55
|
def tokenize_sentence(input,lang = nil)
|
40
56
|
sentence = NlpToolz::Tokens.new(input,lang)
|
41
57
|
sentence.tokenize
|
42
58
|
end
|
43
|
-
|
59
|
+
|
44
60
|
def tokenize_text(input,lang = nil)
|
45
61
|
tokenized_text = []
|
46
62
|
get_sentences(input,lang).each do |sentence|
|
47
63
|
tokenized_text << tokenize_sentence(sentence,lang)
|
48
64
|
end
|
49
|
-
|
65
|
+
|
50
66
|
tokenized_text
|
51
67
|
end
|
52
68
|
|
@@ -54,29 +70,30 @@ module NlpToolz
|
|
54
70
|
sentence = NlpToolz::PosTags.new(input,lang)
|
55
71
|
sentence.get_pos_tags if sentence.has_model?
|
56
72
|
end
|
57
|
-
|
73
|
+
|
58
74
|
def tag_text(input,lang = nil)
|
59
75
|
tagged_text = []
|
60
76
|
get_sentences(input,lang).each do |sentence|
|
61
77
|
tagged_text << tag_sentence(sentence,lang)
|
62
78
|
end
|
63
|
-
|
79
|
+
|
64
80
|
tagged_text
|
65
81
|
end
|
66
|
-
|
82
|
+
|
67
83
|
def parse_sentence(input,lang = nil)
|
68
84
|
text = NlpToolz::Parser.new(input,lang)
|
69
85
|
text.parse_text
|
70
|
-
|
86
|
+
|
71
87
|
text.parse_hash
|
72
88
|
end
|
73
|
-
|
89
|
+
|
74
90
|
def parse_text(input,lang = nil)
|
75
91
|
parsed_text = []
|
76
92
|
get_sentences(input,lang).each do |sentence|
|
77
93
|
parsed_text << parse_sentence(sentence,lang)
|
78
94
|
end
|
79
|
-
|
95
|
+
|
80
96
|
parsed_text
|
81
97
|
end
|
98
|
+
|
82
99
|
end
|
data/lib/nlp_toolz/language.rb
CHANGED
@@ -7,17 +7,16 @@ module NlpToolz
|
|
7
7
|
|
8
8
|
class Language
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
# Hashtable = Rjb::import("java.util.Hashtable")
|
14
|
-
# Set = Rjb::import("java.util.Set")
|
10
|
+
if Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
|
11
|
+
# load java classes
|
12
|
+
HashSet = Rjb::import("java.util.HashSet")
|
15
13
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
14
|
+
DataSourceException = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.DataSourceException")
|
15
|
+
LanIKernel = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.LanIKernel")
|
16
|
+
Request = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.Request")
|
17
|
+
RequestException = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.RequestException")
|
18
|
+
Response = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.Response")
|
19
|
+
end
|
21
20
|
|
22
21
|
def self.get_language(text = nil)
|
23
22
|
return -1 if text.nil? || text.empty?
|
@@ -33,7 +32,7 @@ module NlpToolz
|
|
33
32
|
|
34
33
|
req = Request.new(text, languages, modus, reduce)
|
35
34
|
|
36
|
-
LanIKernel.propertyFile = File.join(MODELS,
|
35
|
+
LanIKernel.propertyFile = File.join(MODELS,'language','lanikernel')
|
37
36
|
kernel = LanIKernel.getInstance()
|
38
37
|
res = kernel.evaluate(req)
|
39
38
|
|
data/lib/nlp_toolz/load_jars.rb
CHANGED
@@ -1,12 +1,7 @@
|
|
1
1
|
module NlpToolz
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
# CLASS_PATH = [
|
6
|
-
# File.join(JARS, "jwnl-1.3.3.jar"),
|
7
|
-
# File.join(JARS, "opennlp-tools-1.5.3.jar"),
|
8
|
-
# File.join(JARS, "opennlp-maxent-3.0.3.jar")
|
9
|
-
# ].join(":")
|
2
|
+
CONFIG = File.join(File.dirname(__FILE__), '..', '..', 'config')
|
3
|
+
MODELS = File.join(File.dirname(__FILE__), '..', '..', 'models')
|
4
|
+
JARS = File.join(File.dirname(__FILE__), '..', '..', 'jars')
|
10
5
|
|
11
6
|
CLASS_PATH = Dir.glob(File.join(JARS,'*.jar')).join(':')
|
12
7
|
|
data/lib/nlp_toolz/pos_tags.rb
CHANGED
@@ -8,10 +8,12 @@ module NlpToolz
|
|
8
8
|
|
9
9
|
class PosTags
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
11
|
+
if Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
|
12
|
+
# load java classes
|
13
|
+
FileInputStream = Rjb::import('java.io.FileInputStream')
|
14
|
+
POSModel = Rjb::import('opennlp.tools.postag.POSModel')
|
15
|
+
POSTaggerME = Rjb::import('opennlp.tools.postag.POSTaggerME')
|
16
|
+
end
|
15
17
|
|
16
18
|
attr_accessor :input, :lang, :model, :model_name, :tokenized
|
17
19
|
|
data/lib/nlp_toolz/sentences.rb
CHANGED
@@ -8,10 +8,12 @@ module NlpToolz
|
|
8
8
|
|
9
9
|
class Sentences
|
10
10
|
|
11
|
+
if Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
|
11
12
|
# load java classes
|
12
|
-
|
13
|
-
|
14
|
-
|
13
|
+
FileInputStream = Rjb::import('java.io.FileInputStream')
|
14
|
+
SentenceDetectorME = Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')
|
15
|
+
SentenceModel = Rjb::import('opennlp.tools.sentdetect.SentenceModel')
|
16
|
+
end
|
15
17
|
|
16
18
|
attr_accessor :input, :lang, :model, :model_name, :sentences
|
17
19
|
|
data/lib/nlp_toolz/tokens.rb
CHANGED
@@ -7,10 +7,12 @@ module NlpToolz
|
|
7
7
|
|
8
8
|
class Tokens
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
if Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
|
11
|
+
# load java classes
|
12
|
+
FileInputStream = Rjb::import('java.io.FileInputStream')
|
13
|
+
TokenizerModel = Rjb::import('opennlp.tools.tokenize.TokenizerModel')
|
14
|
+
TokenizerME = Rjb::import('opennlp.tools.tokenize.TokenizerME')
|
15
|
+
end
|
14
16
|
|
15
17
|
attr_accessor :input, :lang, :model, :model_name, :tokens
|
16
18
|
|
data/lib/nlp_toolz/version.rb
CHANGED
data/nlp_toolz.gemspec
CHANGED
@@ -39,4 +39,8 @@ Gem::Specification.new do |gem|
|
|
39
39
|
gem.add_runtime_dependency "multi_json"
|
40
40
|
gem.add_runtime_dependency "gli"
|
41
41
|
gem.add_runtime_dependency "rake"
|
42
|
+
|
43
|
+
#for downloading models and jars
|
44
|
+
gem.add_runtime_dependency "schiphol"
|
45
|
+
gem.add_runtime_dependency "rubyzip"
|
42
46
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nlp_toolz
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- LeFnord
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-10-
|
11
|
+
date: 2014-10-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -192,6 +192,34 @@ dependencies:
|
|
192
192
|
- - ">="
|
193
193
|
- !ruby/object:Gem::Version
|
194
194
|
version: '0'
|
195
|
+
- !ruby/object:Gem::Dependency
|
196
|
+
name: schiphol
|
197
|
+
requirement: !ruby/object:Gem::Requirement
|
198
|
+
requirements:
|
199
|
+
- - ">="
|
200
|
+
- !ruby/object:Gem::Version
|
201
|
+
version: '0'
|
202
|
+
type: :runtime
|
203
|
+
prerelease: false
|
204
|
+
version_requirements: !ruby/object:Gem::Requirement
|
205
|
+
requirements:
|
206
|
+
- - ">="
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
version: '0'
|
209
|
+
- !ruby/object:Gem::Dependency
|
210
|
+
name: rubyzip
|
211
|
+
requirement: !ruby/object:Gem::Requirement
|
212
|
+
requirements:
|
213
|
+
- - ">="
|
214
|
+
- !ruby/object:Gem::Version
|
215
|
+
version: '0'
|
216
|
+
type: :runtime
|
217
|
+
prerelease: false
|
218
|
+
version_requirements: !ruby/object:Gem::Requirement
|
219
|
+
requirements:
|
220
|
+
- - ">="
|
221
|
+
- !ruby/object:Gem::Version
|
222
|
+
version: '0'
|
195
223
|
description: make NLP tools available, from OpenNLP and BerkeleyParser
|
196
224
|
email:
|
197
225
|
- pscholz.le@gmail.com
|
@@ -212,6 +240,7 @@ files:
|
|
212
240
|
- lib/nlp_toolz/helpers/string_extended.rb
|
213
241
|
- lib/nlp_toolz/helpers/tmp_file.rb
|
214
242
|
- lib/nlp_toolz/helpers/url_handler.rb
|
243
|
+
- lib/nlp_toolz/home.rb
|
215
244
|
- lib/nlp_toolz/language.rb
|
216
245
|
- lib/nlp_toolz/load_jars.rb
|
217
246
|
- lib/nlp_toolz/parser.rb
|