nlp_toolz 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ce5f4cad49039b0d8cb6d626facc67a4efa32ae4
4
- data.tar.gz: 0565742385f0a34aabe4e456cde014ba2673a589
3
+ metadata.gz: 2ce637d8ddb8a8ad0b62c3998d016d3552b0cf58
4
+ data.tar.gz: be30bf6df57f309a050ebd0dffa6b6e5bcf85d48
5
5
  SHA512:
6
- metadata.gz: 1ec11ec4b9b07437fb16f9ab0c181c9cee40a0cc900f90d02d2a6e4fc3bac7efaae890e8eda16bf7dcf8e3595bcb4010cf9d3893bee2a7a937b0fd527c40356f
7
- data.tar.gz: 06d53b1bfe11004d0abeba1db130a13f664a054e8ed56f5edb260ec3f8bf189b0f9cb64687a471d2241ffd0d612ae632b50853737282a8d1901ea0645be4426a
6
+ metadata.gz: 453750f9759a44a70939feca45f9bf85d1ec50cf44c9794da66614dcbf2e10f6f97f7e0b3d2709489547c409a92f67dc2bbaf88b4f494b0f459d1d69269531bb
7
+ data.tar.gz: ad7179da66ff954010aca8245180cedf688d49531d6963cfaae92bfa944bc4e240fef3cb41cee471757ab893d2dd1847d2ca6fa0a66c33e76541e10513c2845a
data/.gitignore CHANGED
@@ -26,3 +26,5 @@ teste.rb
26
26
  test-data/
27
27
  jars/*
28
28
  models/*
29
+ jars.zip
30
+ models.zip
data/README.md CHANGED
@@ -18,7 +18,9 @@ Or install it yourself as:
18
18
 
19
19
  $ gem install nlp_toolz
20
20
 
21
- Download jars and model files from [Dropbox](https://www.dropbox.com/sh/1layyjgf5h0wwi3/s2SHAnfVhs) and unzip it in gem folder.
21
+ ~~Download jars and model files from [Dropbox](https://www.dropbox.com/sh/1layyjgf5h0wwi3/s2SHAnfVhs) and unzip it in gem folder.~~
22
+
23
+ Simplified installation of `models` and `jars`: just run `nlp_toolz init` from the command line.
22
24
 
23
25
  ## Usage
24
26
 
data/Rakefile CHANGED
@@ -1,9 +1,11 @@
1
1
  #!/usr/bin/env rake
2
2
  require "bundler/gem_tasks"
3
- require "awesome_print"
4
3
  require 'rspec/core'
5
4
  require 'rspec/core/rake_task'
5
+ require "nlp_toolz"
6
+
6
7
  RSpec::Core::RakeTask.new(:spec) do |spec|
8
+ NlpToolz.check_dependencies
7
9
  spec.pattern = FileList['spec/**/*_spec.rb']
8
10
  end
9
11
 
@@ -14,8 +14,7 @@ include GLI::App
14
14
 
15
15
  # helper methods
16
16
  def get_out(this)
17
- ap this if $stdout.tty?
18
- $stdout.puts this unless $stdout.tty?
17
+ $stdout.puts this
19
18
  end
20
19
 
21
20
  def get_in(input_arg)
@@ -32,6 +31,23 @@ def get_file(name)
32
31
  file.force_encoding("utf-8") unless file.nil?
33
32
  end
34
33
 
34
+ def unzip_file (file, destination)
35
+ Zip::ZipFile.open(file) { |zip_file|
36
+ zip_file.each { |f|
37
+ f_path=File.join(destination, f.name)
38
+ FileUtils.mkdir_p(File.dirname(f_path))
39
+ zip_file.extract(f, f_path) unless File.exist?(f_path)
40
+ }
41
+ }
42
+ end
43
+
44
+ def add_path_in_lanikernel
45
+ file = IO.read(File.join(NlpToolz::HOME,'models','language','lanikernel.ini'))
46
+ file.sub!('WordlistDir=models',"WordlistDir=#{NlpToolz::HOME}/models")
47
+ file.sub!('BlacklistFile=models',"BlacklistFile=#{NlpToolz::HOME}/models")
48
+ file.sub!('MappingFile=models',"MappingFile=#{NlpToolz::HOME}/models")
49
+ File.open(File.join(NlpToolz::HOME,'models','language','lanikernel.ini'), 'w') {|f| f.write(file) }
50
+ end
35
51
 
36
52
  program_desc 'running basic NLP tasks'
37
53
 
@@ -40,7 +56,34 @@ version NlpToolz::VERSION
40
56
  desc 'initial setup'
41
57
  command :init do |c|
42
58
  c.action do |global_options,options,args|
43
- puts 'setting up app'
59
+ unless Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
60
+ gem_home = NlpToolz::HOME
61
+ $stdout.print "download and unzip to: ".green
62
+ $stdout.puts "#{gem_home}".blue
63
+ [
64
+ 'https://dl.dropboxusercontent.com/sh/1layyjgf5h0wwi3/AACw8Y04KnFotOpBkzcfLxmwa/jars.zip',
65
+ 'https://dl.dropboxusercontent.com/sh/1layyjgf5h0wwi3/AADUSMRMVg3n54Djdy9BWYVEa/models.zip',
66
+ ].each do |link|
67
+ loaded_file = link.split('/').last.sub('?dl=0','')
68
+ $stdout.print "download: ".green
69
+ $stdout.puts "#{loaded_file}".blue
70
+
71
+ Schiphol.download(
72
+ link,
73
+ # Default values
74
+ :download_folder => "#{gem_home}",
75
+ :show_progress => true,
76
+ :max_tries => 3
77
+ )
78
+ $stdout.puts "extracting …".green
79
+ unzip_file(File.join(gem_home,loaded_file),gem_home)
80
+ FileUtils.rm(File.join(gem_home,loaded_file))
81
+ end
82
+ $stdout.puts "add abbsolute path to language config".green
83
+ add_path_in_lanikernel
84
+ else
85
+ $stdout.puts "files exist".green
86
+ end
44
87
  end
45
88
  end
46
89
 
@@ -51,6 +94,7 @@ command :sent do |c|
51
94
  c.arg_name '<path/to/file>'
52
95
  c.flag [:f,:file]
53
96
  c.action do |global_options,options,args|
97
+ NlpToolz.check_dependencies
54
98
  input = get_in(options[:f] || args.first)
55
99
  get_out NlpToolz.get_sentences(input)
56
100
  end
@@ -63,6 +107,7 @@ command :parse do |c|
63
107
  c.arg_name '<path/to/file>'
64
108
  c.flag [:f,:file]
65
109
  c.action do |global_options,options,args|
110
+ NlpToolz.check_dependencies
66
111
  input = get_in(options[:f] || args.first)
67
112
  get_out NlpToolz.parse_text(input)
68
113
  end
@@ -75,6 +120,7 @@ command :tag do |c|
75
120
  c.arg_name '<path/to/file>'
76
121
  c.flag [:f,:file]
77
122
  c.action do |global_options,options,args|
123
+ NlpToolz.check_dependencies
78
124
  input = get_in(options[:f] || args.first)
79
125
  get_out NlpToolz.tag_text(input)
80
126
  end
@@ -87,6 +133,7 @@ command :token do |c|
87
133
  c.arg_name '<path/to/file>'
88
134
  c.flag [:f,:file]
89
135
  c.action do |global_options,options,args|
136
+ NlpToolz.check_dependencies
90
137
  input = get_in(options[:f] || args.first)
91
138
  get_out NlpToolz.tokenize_text(input)
92
139
  end
@@ -9,6 +9,9 @@ require "rjb"
9
9
  # external requirements
10
10
  require "awesome_print"
11
11
  require "multi_json"
12
+ # for downloading models and jars
13
+ require "schiphol"
14
+ require "zip/zip"
12
15
 
13
16
  # internal requirements
14
17
  require "nlp_toolz/version"
@@ -17,6 +20,7 @@ require "nlp_toolz/helpers/string_extended"
17
20
  require "nlp_toolz/helpers/tmp_file"
18
21
 
19
22
  # NLP Tools
23
+ require "nlp_toolz/home"
20
24
  require "nlp_toolz/load_jars"
21
25
  require "nlp_toolz/language"
22
26
  require "nlp_toolz/sentences"
@@ -24,29 +28,41 @@ require "nlp_toolz/pos_tags"
24
28
  require "nlp_toolz/tokens"
25
29
  require "nlp_toolz/parser"
26
30
 
31
+
27
32
  module NlpToolz
33
+
34
+
28
35
  module_function
29
36
 
37
+ def check_dependencies
38
+ unless Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
39
+ $stdout.puts "\n--> models and jars not installed,"
40
+ $stdout.puts " install it by running:"
41
+ $stdout.puts "--> $ nlp_toolz init\n".green
42
+ exit
43
+ end
44
+ end
45
+
30
46
  def get_lang(input)
31
47
  NlpToolz::Language.get_language(input)
32
48
  end
33
-
49
+
34
50
  def get_sentences(input,lang = nil)
35
51
  text = NlpToolz::Sentences.new(input,lang)
36
52
  text.split_into_sentences if text.has_model?
37
53
  end
38
-
54
+
39
55
  def tokenize_sentence(input,lang = nil)
40
56
  sentence = NlpToolz::Tokens.new(input,lang)
41
57
  sentence.tokenize
42
58
  end
43
-
59
+
44
60
  def tokenize_text(input,lang = nil)
45
61
  tokenized_text = []
46
62
  get_sentences(input,lang).each do |sentence|
47
63
  tokenized_text << tokenize_sentence(sentence,lang)
48
64
  end
49
-
65
+
50
66
  tokenized_text
51
67
  end
52
68
 
@@ -54,29 +70,30 @@ module NlpToolz
54
70
  sentence = NlpToolz::PosTags.new(input,lang)
55
71
  sentence.get_pos_tags if sentence.has_model?
56
72
  end
57
-
73
+
58
74
  def tag_text(input,lang = nil)
59
75
  tagged_text = []
60
76
  get_sentences(input,lang).each do |sentence|
61
77
  tagged_text << tag_sentence(sentence,lang)
62
78
  end
63
-
79
+
64
80
  tagged_text
65
81
  end
66
-
82
+
67
83
  def parse_sentence(input,lang = nil)
68
84
  text = NlpToolz::Parser.new(input,lang)
69
85
  text.parse_text
70
-
86
+
71
87
  text.parse_hash
72
88
  end
73
-
89
+
74
90
  def parse_text(input,lang = nil)
75
91
  parsed_text = []
76
92
  get_sentences(input,lang).each do |sentence|
77
93
  parsed_text << parse_sentence(sentence,lang)
78
94
  end
79
-
95
+
80
96
  parsed_text
81
97
  end
98
+
82
99
  end
@@ -0,0 +1,3 @@
1
+ module NlpToolz
2
+ HOME = Gem::Specification.find_by_name("nlp_toolz").gem_dir
3
+ end
@@ -7,17 +7,16 @@ module NlpToolz
7
7
 
8
8
  class Language
9
9
 
10
- # load java classes
11
- # Enumeration = Rjb::import("java.util.Enumeration")
12
- HashSet = Rjb::import("java.util.HashSet")
13
- # Hashtable = Rjb::import("java.util.Hashtable")
14
- # Set = Rjb::import("java.util.Set")
10
+ if Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
11
+ # load java classes
12
+ HashSet = Rjb::import("java.util.HashSet")
15
13
 
16
- DataSourceException = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.DataSourceException")
17
- LanIKernel = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.LanIKernel")
18
- Request = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.Request")
19
- RequestException = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.RequestException")
20
- Response = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.Response")
14
+ DataSourceException = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.DataSourceException")
15
+ LanIKernel = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.LanIKernel")
16
+ Request = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.Request")
17
+ RequestException = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.RequestException")
18
+ Response = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.Response")
19
+ end
21
20
 
22
21
  def self.get_language(text = nil)
23
22
  return -1 if text.nil? || text.empty?
@@ -33,7 +32,7 @@ module NlpToolz
33
32
 
34
33
  req = Request.new(text, languages, modus, reduce)
35
34
 
36
- LanIKernel.propertyFile = File.join(MODELS, 'language', 'lanikernel')
35
+ LanIKernel.propertyFile = File.join(MODELS,'language','lanikernel')
37
36
  kernel = LanIKernel.getInstance()
38
37
  res = kernel.evaluate(req)
39
38
 
@@ -1,12 +1,7 @@
1
1
  module NlpToolz
2
- MODELS = File.join(File.dirname(__FILE__), '..', '..', "models")
3
- JARS = File.join(File.dirname(__FILE__), '..', '..', "jars")
4
-
5
- # CLASS_PATH = [
6
- # File.join(JARS, "jwnl-1.3.3.jar"),
7
- # File.join(JARS, "opennlp-tools-1.5.3.jar"),
8
- # File.join(JARS, "opennlp-maxent-3.0.3.jar")
9
- # ].join(":")
2
+ CONFIG = File.join(File.dirname(__FILE__), '..', '..', 'config')
3
+ MODELS = File.join(File.dirname(__FILE__), '..', '..', 'models')
4
+ JARS = File.join(File.dirname(__FILE__), '..', '..', 'jars')
10
5
 
11
6
  CLASS_PATH = Dir.glob(File.join(JARS,'*.jar')).join(':')
12
7
 
@@ -8,10 +8,12 @@ module NlpToolz
8
8
 
9
9
  class PosTags
10
10
 
11
- # load java classes
12
- FileInputStream = Rjb::import('java.io.FileInputStream')
13
- POSModel = Rjb::import('opennlp.tools.postag.POSModel')
14
- POSTaggerME = Rjb::import('opennlp.tools.postag.POSTaggerME')
11
+ if Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
12
+ # load java classes
13
+ FileInputStream = Rjb::import('java.io.FileInputStream')
14
+ POSModel = Rjb::import('opennlp.tools.postag.POSModel')
15
+ POSTaggerME = Rjb::import('opennlp.tools.postag.POSTaggerME')
16
+ end
15
17
 
16
18
  attr_accessor :input, :lang, :model, :model_name, :tokenized
17
19
 
@@ -8,10 +8,12 @@ module NlpToolz
8
8
 
9
9
  class Sentences
10
10
 
11
+ if Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
11
12
  # load java classes
12
- FileInputStream = Rjb::import('java.io.FileInputStream')
13
- SentenceDetectorME = Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')
14
- SentenceModel = Rjb::import('opennlp.tools.sentdetect.SentenceModel')
13
+ FileInputStream = Rjb::import('java.io.FileInputStream')
14
+ SentenceDetectorME = Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')
15
+ SentenceModel = Rjb::import('opennlp.tools.sentdetect.SentenceModel')
16
+ end
15
17
 
16
18
  attr_accessor :input, :lang, :model, :model_name, :sentences
17
19
 
@@ -7,10 +7,12 @@ module NlpToolz
7
7
 
8
8
  class Tokens
9
9
 
10
- # load java classes
11
- FileInputStream = Rjb::import('java.io.FileInputStream')
12
- TokenizerModel = Rjb::import('opennlp.tools.tokenize.TokenizerModel')
13
- TokenizerME = Rjb::import('opennlp.tools.tokenize.TokenizerME')
10
+ if Dir.exist?(File.join(NlpToolz::HOME,'models')) && Dir.exist?(File.join(NlpToolz::HOME,'jars'))
11
+ # load java classes
12
+ FileInputStream = Rjb::import('java.io.FileInputStream')
13
+ TokenizerModel = Rjb::import('opennlp.tools.tokenize.TokenizerModel')
14
+ TokenizerME = Rjb::import('opennlp.tools.tokenize.TokenizerME')
15
+ end
14
16
 
15
17
  attr_accessor :input, :lang, :model, :model_name, :tokens
16
18
 
@@ -4,5 +4,5 @@
4
4
  # date: 2012-10-23
5
5
 
6
6
  module NlpToolz
7
- VERSION = "1.1.0"
7
+ VERSION = "1.1.1"
8
8
  end
@@ -39,4 +39,8 @@ Gem::Specification.new do |gem|
39
39
  gem.add_runtime_dependency "multi_json"
40
40
  gem.add_runtime_dependency "gli"
41
41
  gem.add_runtime_dependency "rake"
42
+
43
+ #for downloading models and jars
44
+ gem.add_runtime_dependency "schiphol"
45
+ gem.add_runtime_dependency "rubyzip"
42
46
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nlp_toolz
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
4
+ version: 1.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - LeFnord
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-12 00:00:00.000000000 Z
11
+ date: 2014-10-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rspec
@@ -192,6 +192,34 @@ dependencies:
192
192
  - - ">="
193
193
  - !ruby/object:Gem::Version
194
194
  version: '0'
195
+ - !ruby/object:Gem::Dependency
196
+ name: schiphol
197
+ requirement: !ruby/object:Gem::Requirement
198
+ requirements:
199
+ - - ">="
200
+ - !ruby/object:Gem::Version
201
+ version: '0'
202
+ type: :runtime
203
+ prerelease: false
204
+ version_requirements: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - ">="
207
+ - !ruby/object:Gem::Version
208
+ version: '0'
209
+ - !ruby/object:Gem::Dependency
210
+ name: rubyzip
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - ">="
214
+ - !ruby/object:Gem::Version
215
+ version: '0'
216
+ type: :runtime
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - ">="
221
+ - !ruby/object:Gem::Version
222
+ version: '0'
195
223
  description: make NLP tools available, from OpenNLP and BerkeleyParser
196
224
  email:
197
225
  - pscholz.le@gmail.com
@@ -212,6 +240,7 @@ files:
212
240
  - lib/nlp_toolz/helpers/string_extended.rb
213
241
  - lib/nlp_toolz/helpers/tmp_file.rb
214
242
  - lib/nlp_toolz/helpers/url_handler.rb
243
+ - lib/nlp_toolz/home.rb
215
244
  - lib/nlp_toolz/language.rb
216
245
  - lib/nlp_toolz/load_jars.rb
217
246
  - lib/nlp_toolz/parser.rb