nlp_toolz 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/README.md +3 -1
- data/Rakefile +3 -1
- data/bin/nlp_toolz +50 -3
- data/lib/nlp_toolz.rb +27 -10
- data/lib/nlp_toolz/home.rb +3 -0
- data/lib/nlp_toolz/language.rb +10 -11
- data/lib/nlp_toolz/load_jars.rb +3 -8
- data/lib/nlp_toolz/pos_tags.rb +6 -4
- data/lib/nlp_toolz/sentences.rb +5 -3
- data/lib/nlp_toolz/tokens.rb +6 -4
- data/lib/nlp_toolz/version.rb +1 -1
- data/nlp_toolz.gemspec +4 -0
- metadata +31 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2ce637d8ddb8a8ad0b62c3998d016d3552b0cf58
|
4
|
+
data.tar.gz: be30bf6df57f309a050ebd0dffa6b6e5bcf85d48
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 453750f9759a44a70939feca45f9bf85d1ec50cf44c9794da66614dcbf2e10f6f97f7e0b3d2709489547c409a92f67dc2bbaf88b4f494b0f459d1d69269531bb
|
7
|
+
data.tar.gz: ad7179da66ff954010aca8245180cedf688d49531d6963cfaae92bfa944bc4e240fef3cb41cee471757ab893d2dd1847d2ca6fa0a66c33e76541e10513c2845a
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -18,7 +18,9 @@ Or install it yourself as:
|
|
18
18
|
|
19
19
|
$ gem install nlp_toolz
|
20
20
|
|
21
|
-
Download jars and model files from [Dropbox](https://www.dropbox.com/sh/1layyjgf5h0wwi3/s2SHAnfVhs) and unzip it in gem folder
|
21
|
+
~~Download jars and model files from [Dropbox](https://www.dropbox.com/sh/1layyjgf5h0wwi3/s2SHAnfVhs) and unzip it in gem folder.~~
|
22
|
+
|
23
|
+
Simplified installation of `models` and `jars`: just run `nlp_toolz init` from the command line.
|
22
24
|
|
23
25
|
## Usage
|
24
26
|
|
data/Rakefile
CHANGED
@@ -1,9 +1,11 @@
|
|
1
1
|
#!/usr/bin/env rake
|
2
2
|
require "bundler/gem_tasks"
|
3
|
-
require "awesome_print"
|
4
3
|
require 'rspec/core'
|
5
4
|
require 'rspec/core/rake_task'
|
5
|
+
require "nlp_toolz"
|
6
|
+
|
6
7
|
RSpec::Core::RakeTask.new(:spec) do |spec|
|
8
|
+
NlpToolz.check_dependencies
|
7
9
|
spec.pattern = FileList['spec/**/*_spec.rb']
|
8
10
|
end
|
9
11
|
|
data/bin/nlp_toolz
CHANGED
@@ -14,8 +14,7 @@ include GLI::App
|
|
14
14
|
|
15
15
|
# helper methods
|
16
16
|
def get_out(this)
|
17
|
-
|
18
|
-
$stdout.puts this unless $stdout.tty?
|
17
|
+
$stdout.puts this
|
19
18
|
end
|
20
19
|
|
21
20
|
def get_in(input_arg)
|
@@ -32,6 +31,23 @@ def get_file(name)
|
|
32
31
|
file.force_encoding("utf-8") unless file.nil?
|
33
32
|
end
|
34
33
|
|
34
|
+
def unzip_file (file, destination)
|
35
|
+
Zip::ZipFile.open(file) { |zip_file|
|
36
|
+
zip_file.each { |f|
|
37
|
+
f_path=File.join(destination, f.name)
|
38
|
+
FileUtils.mkdir_p(File.dirname(f_path))
|
39
|
+
zip_file.extract(f, f_path) unless File.exist?(f_path)
|
40
|
+
}
|
41
|
+
}
|
42
|
+
end
|
43
|
+
|
44
|
+
def add_path_in_lanikernel
|
45
|
+
file = IO.read(File.join(NlpToolz::HOME,'models','language','lanikernel.ini'))
|
46
|
+
file.sub!('WordlistDir=models',"WordlistDir=#{NlpToolz::HOME}/models")
|
47
|
+
file.sub!('BlacklistFile=models',"BlacklistFile=#{NlpToolz::HOME}/models")
|
48
|
+
file.sub!('MappingFile=models',"MappingFile=#{NlpToolz::HOME}/models")
|
49
|
+
File.open(File.join(NlpToolz::HOME,'models','language','lanikernel.ini'), 'w') {|f| f.write(file) }
|
50
|
+
end
|
35
51
|
|
36
52
|
program_desc 'running basic NLP tasks'
|
37
53
|
|
@@ -40,7 +56,34 @@ version NlpToolz::VERSION
|
|
40
56
|
desc 'initial setup'
|
41
57
|
command :init do |c|
|
42
58
|
c.action do |global_options,options,args|
|
43
|
-
|
59
|
+
unless Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
|
60
|
+
gem_home = NlpToolz::HOME
|
61
|
+
$stdout.print "download and unzip to: ".green
|
62
|
+
$stdout.puts "#{gem_home}".blue
|
63
|
+
[
|
64
|
+
'https://dl.dropboxusercontent.com/sh/1layyjgf5h0wwi3/AACw8Y04KnFotOpBkzcfLxmwa/jars.zip',
|
65
|
+
'https://dl.dropboxusercontent.com/sh/1layyjgf5h0wwi3/AADUSMRMVg3n54Djdy9BWYVEa/models.zip',
|
66
|
+
].each do |link|
|
67
|
+
loaded_file = link.split('/').last.sub('?dl=0','')
|
68
|
+
$stdout.print "download: ".green
|
69
|
+
$stdout.puts "#{loaded_file}".blue
|
70
|
+
|
71
|
+
Schiphol.download(
|
72
|
+
link,
|
73
|
+
# Default values
|
74
|
+
:download_folder => "#{gem_home}",
|
75
|
+
:show_progress => true,
|
76
|
+
:max_tries => 3
|
77
|
+
)
|
78
|
+
$stdout.puts "extracting …".green
|
79
|
+
unzip_file(File.join(gem_home,loaded_file),gem_home)
|
80
|
+
FileUtils.rm(File.join(gem_home,loaded_file))
|
81
|
+
end
|
82
|
+
$stdout.puts "add abbsolute path to language config".green
|
83
|
+
add_path_in_lanikernel
|
84
|
+
else
|
85
|
+
$stdout.puts "files exist".green
|
86
|
+
end
|
44
87
|
end
|
45
88
|
end
|
46
89
|
|
@@ -51,6 +94,7 @@ command :sent do |c|
|
|
51
94
|
c.arg_name '<path/to/file>'
|
52
95
|
c.flag [:f,:file]
|
53
96
|
c.action do |global_options,options,args|
|
97
|
+
NlpToolz.check_dependencies
|
54
98
|
input = get_in(options[:f] || args.first)
|
55
99
|
get_out NlpToolz.get_sentences(input)
|
56
100
|
end
|
@@ -63,6 +107,7 @@ command :parse do |c|
|
|
63
107
|
c.arg_name '<path/to/file>'
|
64
108
|
c.flag [:f,:file]
|
65
109
|
c.action do |global_options,options,args|
|
110
|
+
NlpToolz.check_dependencies
|
66
111
|
input = get_in(options[:f] || args.first)
|
67
112
|
get_out NlpToolz.parse_text(input)
|
68
113
|
end
|
@@ -75,6 +120,7 @@ command :tag do |c|
|
|
75
120
|
c.arg_name '<path/to/file>'
|
76
121
|
c.flag [:f,:file]
|
77
122
|
c.action do |global_options,options,args|
|
123
|
+
NlpToolz.check_dependencies
|
78
124
|
input = get_in(options[:f] || args.first)
|
79
125
|
get_out NlpToolz.tag_text(input)
|
80
126
|
end
|
@@ -87,6 +133,7 @@ command :token do |c|
|
|
87
133
|
c.arg_name '<path/to/file>'
|
88
134
|
c.flag [:f,:file]
|
89
135
|
c.action do |global_options,options,args|
|
136
|
+
NlpToolz.check_dependencies
|
90
137
|
input = get_in(options[:f] || args.first)
|
91
138
|
get_out NlpToolz.tokenize_text(input)
|
92
139
|
end
|
data/lib/nlp_toolz.rb
CHANGED
@@ -9,6 +9,9 @@ require "rjb"
|
|
9
9
|
# external requirements
|
10
10
|
require "awesome_print"
|
11
11
|
require "multi_json"
|
12
|
+
# for downloading models and jars
|
13
|
+
require "schiphol"
|
14
|
+
require "zip/zip"
|
12
15
|
|
13
16
|
# internal requirements
|
14
17
|
require "nlp_toolz/version"
|
@@ -17,6 +20,7 @@ require "nlp_toolz/helpers/string_extended"
|
|
17
20
|
require "nlp_toolz/helpers/tmp_file"
|
18
21
|
|
19
22
|
# NLP Tools
|
23
|
+
require "nlp_toolz/home"
|
20
24
|
require "nlp_toolz/load_jars"
|
21
25
|
require "nlp_toolz/language"
|
22
26
|
require "nlp_toolz/sentences"
|
@@ -24,29 +28,41 @@ require "nlp_toolz/pos_tags"
|
|
24
28
|
require "nlp_toolz/tokens"
|
25
29
|
require "nlp_toolz/parser"
|
26
30
|
|
31
|
+
|
27
32
|
module NlpToolz
|
33
|
+
|
34
|
+
|
28
35
|
module_function
|
29
36
|
|
37
|
+
def check_dependencies
|
38
|
+
unless Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
|
39
|
+
$stdout.puts "\n--> models and jars not installed,"
|
40
|
+
$stdout.puts " install it by running:"
|
41
|
+
$stdout.puts "--> $ nlp_toolz init\n".green
|
42
|
+
exit
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
30
46
|
def get_lang(input)
|
31
47
|
NlpToolz::Language.get_language(input)
|
32
48
|
end
|
33
|
-
|
49
|
+
|
34
50
|
def get_sentences(input,lang = nil)
|
35
51
|
text = NlpToolz::Sentences.new(input,lang)
|
36
52
|
text.split_into_sentences if text.has_model?
|
37
53
|
end
|
38
|
-
|
54
|
+
|
39
55
|
def tokenize_sentence(input,lang = nil)
|
40
56
|
sentence = NlpToolz::Tokens.new(input,lang)
|
41
57
|
sentence.tokenize
|
42
58
|
end
|
43
|
-
|
59
|
+
|
44
60
|
def tokenize_text(input,lang = nil)
|
45
61
|
tokenized_text = []
|
46
62
|
get_sentences(input,lang).each do |sentence|
|
47
63
|
tokenized_text << tokenize_sentence(sentence,lang)
|
48
64
|
end
|
49
|
-
|
65
|
+
|
50
66
|
tokenized_text
|
51
67
|
end
|
52
68
|
|
@@ -54,29 +70,30 @@ module NlpToolz
|
|
54
70
|
sentence = NlpToolz::PosTags.new(input,lang)
|
55
71
|
sentence.get_pos_tags if sentence.has_model?
|
56
72
|
end
|
57
|
-
|
73
|
+
|
58
74
|
def tag_text(input,lang = nil)
|
59
75
|
tagged_text = []
|
60
76
|
get_sentences(input,lang).each do |sentence|
|
61
77
|
tagged_text << tag_sentence(sentence,lang)
|
62
78
|
end
|
63
|
-
|
79
|
+
|
64
80
|
tagged_text
|
65
81
|
end
|
66
|
-
|
82
|
+
|
67
83
|
def parse_sentence(input,lang = nil)
|
68
84
|
text = NlpToolz::Parser.new(input,lang)
|
69
85
|
text.parse_text
|
70
|
-
|
86
|
+
|
71
87
|
text.parse_hash
|
72
88
|
end
|
73
|
-
|
89
|
+
|
74
90
|
def parse_text(input,lang = nil)
|
75
91
|
parsed_text = []
|
76
92
|
get_sentences(input,lang).each do |sentence|
|
77
93
|
parsed_text << parse_sentence(sentence,lang)
|
78
94
|
end
|
79
|
-
|
95
|
+
|
80
96
|
parsed_text
|
81
97
|
end
|
98
|
+
|
82
99
|
end
|
data/lib/nlp_toolz/language.rb
CHANGED
@@ -7,17 +7,16 @@ module NlpToolz
|
|
7
7
|
|
8
8
|
class Language
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
# Hashtable = Rjb::import("java.util.Hashtable")
|
14
|
-
# Set = Rjb::import("java.util.Set")
|
10
|
+
if Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
|
11
|
+
# load java classes
|
12
|
+
HashSet = Rjb::import("java.util.HashSet")
|
15
13
|
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
14
|
+
DataSourceException = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.DataSourceException")
|
15
|
+
LanIKernel = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.LanIKernel")
|
16
|
+
Request = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.Request")
|
17
|
+
RequestException = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.RequestException")
|
18
|
+
Response = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.Response")
|
19
|
+
end
|
21
20
|
|
22
21
|
def self.get_language(text = nil)
|
23
22
|
return -1 if text.nil? || text.empty?
|
@@ -33,7 +32,7 @@ module NlpToolz
|
|
33
32
|
|
34
33
|
req = Request.new(text, languages, modus, reduce)
|
35
34
|
|
36
|
-
LanIKernel.propertyFile = File.join(MODELS,
|
35
|
+
LanIKernel.propertyFile = File.join(MODELS,'language','lanikernel')
|
37
36
|
kernel = LanIKernel.getInstance()
|
38
37
|
res = kernel.evaluate(req)
|
39
38
|
|
data/lib/nlp_toolz/load_jars.rb
CHANGED
@@ -1,12 +1,7 @@
|
|
1
1
|
module NlpToolz
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
# CLASS_PATH = [
|
6
|
-
# File.join(JARS, "jwnl-1.3.3.jar"),
|
7
|
-
# File.join(JARS, "opennlp-tools-1.5.3.jar"),
|
8
|
-
# File.join(JARS, "opennlp-maxent-3.0.3.jar")
|
9
|
-
# ].join(":")
|
2
|
+
CONFIG = File.join(File.dirname(__FILE__), '..', '..', 'config')
|
3
|
+
MODELS = File.join(File.dirname(__FILE__), '..', '..', 'models')
|
4
|
+
JARS = File.join(File.dirname(__FILE__), '..', '..', 'jars')
|
10
5
|
|
11
6
|
CLASS_PATH = Dir.glob(File.join(JARS,'*.jar')).join(':')
|
12
7
|
|
data/lib/nlp_toolz/pos_tags.rb
CHANGED
@@ -8,10 +8,12 @@ module NlpToolz
|
|
8
8
|
|
9
9
|
class PosTags
|
10
10
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
11
|
+
if Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
|
12
|
+
# load java classes
|
13
|
+
FileInputStream = Rjb::import('java.io.FileInputStream')
|
14
|
+
POSModel = Rjb::import('opennlp.tools.postag.POSModel')
|
15
|
+
POSTaggerME = Rjb::import('opennlp.tools.postag.POSTaggerME')
|
16
|
+
end
|
15
17
|
|
16
18
|
attr_accessor :input, :lang, :model, :model_name, :tokenized
|
17
19
|
|
data/lib/nlp_toolz/sentences.rb
CHANGED
@@ -8,10 +8,12 @@ module NlpToolz
|
|
8
8
|
|
9
9
|
class Sentences
|
10
10
|
|
11
|
+
if Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
|
11
12
|
# load java classes
|
12
|
-
|
13
|
-
|
14
|
-
|
13
|
+
FileInputStream = Rjb::import('java.io.FileInputStream')
|
14
|
+
SentenceDetectorME = Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')
|
15
|
+
SentenceModel = Rjb::import('opennlp.tools.sentdetect.SentenceModel')
|
16
|
+
end
|
15
17
|
|
16
18
|
attr_accessor :input, :lang, :model, :model_name, :sentences
|
17
19
|
|
data/lib/nlp_toolz/tokens.rb
CHANGED
@@ -7,10 +7,12 @@ module NlpToolz
|
|
7
7
|
|
8
8
|
class Tokens
|
9
9
|
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
10
|
+
if Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
|
11
|
+
# load java classes
|
12
|
+
FileInputStream = Rjb::import('java.io.FileInputStream')
|
13
|
+
TokenizerModel = Rjb::import('opennlp.tools.tokenize.TokenizerModel')
|
14
|
+
TokenizerME = Rjb::import('opennlp.tools.tokenize.TokenizerME')
|
15
|
+
end
|
14
16
|
|
15
17
|
attr_accessor :input, :lang, :model, :model_name, :tokens
|
16
18
|
|
data/lib/nlp_toolz/version.rb
CHANGED
data/nlp_toolz.gemspec
CHANGED
@@ -39,4 +39,8 @@ Gem::Specification.new do |gem|
|
|
39
39
|
gem.add_runtime_dependency "multi_json"
|
40
40
|
gem.add_runtime_dependency "gli"
|
41
41
|
gem.add_runtime_dependency "rake"
|
42
|
+
|
43
|
+
#for downloading models and jars
|
44
|
+
gem.add_runtime_dependency "schiphol"
|
45
|
+
gem.add_runtime_dependency "rubyzip"
|
42
46
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nlp_toolz
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- LeFnord
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-10-
|
11
|
+
date: 2014-10-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -192,6 +192,34 @@ dependencies:
|
|
192
192
|
- - ">="
|
193
193
|
- !ruby/object:Gem::Version
|
194
194
|
version: '0'
|
195
|
+
- !ruby/object:Gem::Dependency
|
196
|
+
name: schiphol
|
197
|
+
requirement: !ruby/object:Gem::Requirement
|
198
|
+
requirements:
|
199
|
+
- - ">="
|
200
|
+
- !ruby/object:Gem::Version
|
201
|
+
version: '0'
|
202
|
+
type: :runtime
|
203
|
+
prerelease: false
|
204
|
+
version_requirements: !ruby/object:Gem::Requirement
|
205
|
+
requirements:
|
206
|
+
- - ">="
|
207
|
+
- !ruby/object:Gem::Version
|
208
|
+
version: '0'
|
209
|
+
- !ruby/object:Gem::Dependency
|
210
|
+
name: rubyzip
|
211
|
+
requirement: !ruby/object:Gem::Requirement
|
212
|
+
requirements:
|
213
|
+
- - ">="
|
214
|
+
- !ruby/object:Gem::Version
|
215
|
+
version: '0'
|
216
|
+
type: :runtime
|
217
|
+
prerelease: false
|
218
|
+
version_requirements: !ruby/object:Gem::Requirement
|
219
|
+
requirements:
|
220
|
+
- - ">="
|
221
|
+
- !ruby/object:Gem::Version
|
222
|
+
version: '0'
|
195
223
|
description: make NLP tools available, from OpenNLP and BerkeleyParser
|
196
224
|
email:
|
197
225
|
- pscholz.le@gmail.com
|
@@ -212,6 +240,7 @@ files:
|
|
212
240
|
- lib/nlp_toolz/helpers/string_extended.rb
|
213
241
|
- lib/nlp_toolz/helpers/tmp_file.rb
|
214
242
|
- lib/nlp_toolz/helpers/url_handler.rb
|
243
|
+
- lib/nlp_toolz/home.rb
|
215
244
|
- lib/nlp_toolz/language.rb
|
216
245
|
- lib/nlp_toolz/load_jars.rb
|
217
246
|
- lib/nlp_toolz/parser.rb
|