nlp_toolz 1.0.5 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Guardfile +1 -1
- data/lib/nlp_toolz/language.rb +52 -0
- data/lib/nlp_toolz/load_jars.rb +7 -5
- data/lib/nlp_toolz/parser.rb +4 -5
- data/lib/nlp_toolz/pos_tags.rb +1 -3
- data/lib/nlp_toolz/sentences.rb +1 -3
- data/lib/nlp_toolz/tokens.rb +1 -3
- data/lib/nlp_toolz/version.rb +1 -1
- data/lib/nlp_toolz.rb +2 -4
- data/spec/helpers/string_extended_spec.rb +3 -3
- data/spec/lib/nlp_toolz/language_spec.rb +23 -0
- data/spec/lib/nlp_toolz/parser_spec.rb +20 -20
- data/spec/lib/nlp_toolz/pos_tags_spec.rb +22 -24
- data/spec/lib/nlp_toolz/sentences_spec.rb +19 -19
- data/spec/lib/nlp_toolz/tokens_spec.rb +19 -19
- data/spec/lib/nlp_toolz_spec.rb +12 -12
- metadata +6 -4
- data/lib/nlp_toolz/helpers/lang.rb +0 -26
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ce5f4cad49039b0d8cb6d626facc67a4efa32ae4
|
4
|
+
data.tar.gz: 0565742385f0a34aabe4e456cde014ba2673a589
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1ec11ec4b9b07437fb16f9ab0c181c9cee40a0cc900f90d02d2a6e4fc3bac7efaae890e8eda16bf7dcf8e3595bcb4010cf9d3893bee2a7a937b0fd527c40356f
|
7
|
+
data.tar.gz: 06d53b1bfe11004d0abeba1db130a13f664a054e8ed56f5edb260ec3f8bf189b0f9cb64687a471d2241ffd0d612ae632b50853737282a8d1901ea0645be4426a
|
data/Guardfile
CHANGED
data/lib/nlp_toolz/language.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# author LeFnord
|
3
|
+
# email pscholz.le@gmail.com
|
4
|
+
# date 2014-10-12
|
5
|
+
|
6
|
+
module NlpToolz
|
7
|
+
|
8
|
+
class Language
|
9
|
+
|
10
|
+
# load java classes
|
11
|
+
# Enumeration = Rjb::import("java.util.Enumeration")
|
12
|
+
HashSet = Rjb::import("java.util.HashSet")
|
13
|
+
# Hashtable = Rjb::import("java.util.Hashtable")
|
14
|
+
# Set = Rjb::import("java.util.Set")
|
15
|
+
|
16
|
+
DataSourceException = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.DataSourceException")
|
17
|
+
LanIKernel = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.LanIKernel")
|
18
|
+
Request = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.Request")
|
19
|
+
RequestException = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.RequestException")
|
20
|
+
Response = Rjb::import("de.uni_leipzig.asv.toolbox.jLanI.kernel.Response")
|
21
|
+
|
22
|
+
def self.get_language(text = nil)
|
23
|
+
return -1 if text.nil? || text.empty?
|
24
|
+
lang_probability = identify text
|
25
|
+
lang_probability.first
|
26
|
+
end
|
27
|
+
|
28
|
+
# set language and probability of sentence
|
29
|
+
def self.identify(text)
|
30
|
+
languages = HashSet.new
|
31
|
+
modus = 0
|
32
|
+
reduce = true
|
33
|
+
|
34
|
+
req = Request.new(text, languages, modus, reduce)
|
35
|
+
|
36
|
+
LanIKernel.propertyFile = File.join(MODELS, 'language', 'lanikernel')
|
37
|
+
kernel = LanIKernel.getInstance()
|
38
|
+
res = kernel.evaluate(req)
|
39
|
+
|
40
|
+
@lang,@probability = get_most_probability_lang(res.getResult.toString)
|
41
|
+
end
|
42
|
+
|
43
|
+
private
|
44
|
+
def self.get_most_probability_lang(result)
|
45
|
+
res = []
|
46
|
+
foo = result.sub!("{","").sub!("}","").split(', ').collect{ |x| x.split('=') }
|
47
|
+
foo.each{ |x| res << [x.first,x.last.to_f] }
|
48
|
+
res.max{|a,b| a.last <=> b.last}
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
52
|
+
end
|
data/lib/nlp_toolz/load_jars.rb
CHANGED
@@ -2,11 +2,13 @@ module NlpToolz
|
|
2
2
|
MODELS = File.join(File.dirname(__FILE__), '..', '..', "models")
|
3
3
|
JARS = File.join(File.dirname(__FILE__), '..', '..', "jars")
|
4
4
|
|
5
|
-
CLASS_PATH = [
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
].join(":")
|
5
|
+
# CLASS_PATH = [
|
6
|
+
# File.join(JARS, "jwnl-1.3.3.jar"),
|
7
|
+
# File.join(JARS, "opennlp-tools-1.5.3.jar"),
|
8
|
+
# File.join(JARS, "opennlp-maxent-3.0.3.jar")
|
9
|
+
# ].join(":")
|
10
|
+
|
11
|
+
CLASS_PATH = Dir.glob(File.join(JARS,'*.jar')).join(':')
|
10
12
|
|
11
13
|
Rjb::load(CLASS_PATH,['-X+C','-Xmx4096m','-Djava.awt.headless=true'])
|
12
14
|
# Rjb::load(CLASS_PATH,['-Xmx4096m','-XX:+UseParallelGC','-XX:+UseParallelOldGC','-Djava.awt.headless=true'])
|
data/lib/nlp_toolz/parser.rb
CHANGED
@@ -4,12 +4,11 @@
|
|
4
4
|
# date: 2012-12-10
|
5
5
|
|
6
6
|
module NlpToolz
|
7
|
-
|
7
|
+
|
8
8
|
class Parser
|
9
|
-
|
10
|
-
include Lang
|
9
|
+
|
11
10
|
include TmpFile
|
12
|
-
|
11
|
+
|
13
12
|
# load java classes
|
14
13
|
FileInputStream = Rjb::import('java.io.FileInputStream')
|
15
14
|
|
@@ -18,7 +17,7 @@ module NlpToolz
|
|
18
17
|
|
19
18
|
def initialize(input, lang = nil)
|
20
19
|
@input = input
|
21
|
-
@lang = lang || get_language
|
20
|
+
@lang = lang || NlpToolz::Language.get_language(input)
|
22
21
|
@model_name = "#{@lang}-sm5.gr"
|
23
22
|
get_model
|
24
23
|
end
|
data/lib/nlp_toolz/pos_tags.rb
CHANGED
@@ -8,8 +8,6 @@ module NlpToolz
|
|
8
8
|
|
9
9
|
class PosTags
|
10
10
|
|
11
|
-
include Lang
|
12
|
-
|
13
11
|
# load java classes
|
14
12
|
FileInputStream = Rjb::import('java.io.FileInputStream')
|
15
13
|
POSModel = Rjb::import('opennlp.tools.postag.POSModel')
|
@@ -19,7 +17,7 @@ module NlpToolz
|
|
19
17
|
|
20
18
|
def initialize(input, lang = nil)
|
21
19
|
@input = input
|
22
|
-
@lang = lang || get_language
|
20
|
+
@lang = lang || NlpToolz::Language.get_language(input)
|
23
21
|
@model_name = "#{@lang}-pos-maxent.bin"
|
24
22
|
get_model
|
25
23
|
end
|
data/lib/nlp_toolz/sentences.rb
CHANGED
@@ -8,8 +8,6 @@ module NlpToolz
|
|
8
8
|
|
9
9
|
class Sentences
|
10
10
|
|
11
|
-
include Lang
|
12
|
-
|
13
11
|
# load java classes
|
14
12
|
FileInputStream = Rjb::import('java.io.FileInputStream')
|
15
13
|
SentenceDetectorME = Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')
|
@@ -19,7 +17,7 @@ module NlpToolz
|
|
19
17
|
|
20
18
|
def initialize(input,lang = nil)
|
21
19
|
@input = input
|
22
|
-
@lang = lang || get_language
|
20
|
+
@lang = lang || NlpToolz::Language.get_language(input)
|
23
21
|
@model_name = "#{@lang}-sent.bin"
|
24
22
|
get_model
|
25
23
|
end
|
data/lib/nlp_toolz/tokens.rb
CHANGED
@@ -7,8 +7,6 @@ module NlpToolz
|
|
7
7
|
|
8
8
|
class Tokens
|
9
9
|
|
10
|
-
include Lang
|
11
|
-
|
12
10
|
# load java classes
|
13
11
|
FileInputStream = Rjb::import('java.io.FileInputStream')
|
14
12
|
TokenizerModel = Rjb::import('opennlp.tools.tokenize.TokenizerModel')
|
@@ -18,7 +16,7 @@ module NlpToolz
|
|
18
16
|
|
19
17
|
def initialize(input, lang = nil)
|
20
18
|
@input = input
|
21
|
-
@lang = lang || get_language
|
19
|
+
@lang = lang || NlpToolz::Language.get_language(input)
|
22
20
|
@model_name = "#{@lang}-token.bin"
|
23
21
|
get_model
|
24
22
|
end
|
data/lib/nlp_toolz/version.rb
CHANGED
data/lib/nlp_toolz.rb
CHANGED
@@ -13,24 +13,22 @@ require "multi_json"
|
|
13
13
|
# internal requirements
|
14
14
|
require "nlp_toolz/version"
|
15
15
|
require "nlp_toolz/helpers/url_handler"
|
16
|
-
require "nlp_toolz/helpers/lang"
|
17
16
|
require "nlp_toolz/helpers/string_extended"
|
18
17
|
require "nlp_toolz/helpers/tmp_file"
|
19
18
|
|
20
19
|
# NLP Tools
|
21
20
|
require "nlp_toolz/load_jars"
|
21
|
+
require "nlp_toolz/language"
|
22
22
|
require "nlp_toolz/sentences"
|
23
23
|
require "nlp_toolz/pos_tags"
|
24
24
|
require "nlp_toolz/tokens"
|
25
25
|
require "nlp_toolz/parser"
|
26
26
|
|
27
27
|
module NlpToolz
|
28
|
-
extend Lang
|
29
|
-
|
30
28
|
module_function
|
31
29
|
|
32
30
|
def get_lang(input)
|
33
|
-
NlpToolz.get_language(input)
|
31
|
+
NlpToolz::Language.get_language(input)
|
34
32
|
end
|
35
33
|
|
36
34
|
def get_sentences(input,lang = nil)
|
data/spec/helpers/string_extended_spec.rb
CHANGED
@@ -7,10 +7,10 @@ describe String do
|
|
7
7
|
end
|
8
8
|
|
9
9
|
it "should delete quotations marks" do
|
10
|
-
@a.join("").clean_up.
|
10
|
+
expect(@a.join("").clean_up).to be_empty
|
11
11
|
chars = (@a.length - 1) * 3
|
12
|
-
@a.join(" ap").clean_up.length.
|
13
|
-
@a.join("ap ").clean_up.length.
|
12
|
+
expect(@a.join(" ap").clean_up.length).to be == chars
|
13
|
+
expect(@a.join("ap ").clean_up.length).to be == chars
|
14
14
|
end
|
15
15
|
|
16
16
|
|
data/spec/lib/nlp_toolz/language_spec.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
require "spec_helper"
|
2
|
+
|
3
|
+
describe 'Language' do
|
4
|
+
|
5
|
+
before(:all) do
|
6
|
+
@en_text = "Military historian Basil Liddell Hart famously declared that Sherman was the first modern general."
|
7
|
+
@de_text = "Die erste Ausgabe der von Arwidsson herausgegebenen, kurzlebigen Zeitschrift Abo Morgonblad vom 5. Januar 1821."
|
8
|
+
end
|
9
|
+
|
10
|
+
it 'do nothings if text empty or nil' do
|
11
|
+
res = NlpToolz::Language.get_language
|
12
|
+
expect(res).to be == -1
|
13
|
+
res = NlpToolz::Language.get_language('')
|
14
|
+
expect(res).to be == -1
|
15
|
+
end
|
16
|
+
|
17
|
+
it 'gets language' do
|
18
|
+
res = NlpToolz::Language.get_language @en_text
|
19
|
+
expect(res).to be == 'en'
|
20
|
+
res = NlpToolz::Language.get_language @de_text
|
21
|
+
expect(res).to be == 'de'
|
22
|
+
end
|
23
|
+
end
|
data/spec/lib/nlp_toolz/parser_spec.rb
CHANGED
@@ -7,44 +7,44 @@ describe NlpToolz do
|
|
7
7
|
@text = "Military historian Basil Liddell Hart famously declared that Sherman was the first modern general."
|
8
8
|
@g_text = "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από."
|
9
9
|
end
|
10
|
-
|
10
|
+
|
11
11
|
describe "attributes" do
|
12
12
|
it "should respond to #attribute" do
|
13
13
|
text = NlpToolz::Parser.new(@text)
|
14
|
-
text.
|
15
|
-
text.
|
16
|
-
text.
|
17
|
-
text.
|
18
|
-
text.
|
14
|
+
expect(text).to respond_to(:input)
|
15
|
+
expect(text).to respond_to(:lang)
|
16
|
+
expect(text).to respond_to(:model_name)
|
17
|
+
expect(text).to respond_to(:model)
|
18
|
+
expect(text).to respond_to(:parse_hash)
|
19
19
|
end
|
20
20
|
end
|
21
|
-
|
21
|
+
|
22
22
|
describe "model" do
|
23
23
|
it "should have a model, if lang 'en'" do
|
24
24
|
sent = NlpToolz::Parser.new(@text,'en')
|
25
|
-
sent.model_name.
|
26
|
-
sent.has_model
|
25
|
+
expect(sent.model_name).to be == 'en-sm5.gr'
|
26
|
+
expect(sent.has_model?).to be_truthy
|
27
27
|
end
|
28
|
-
|
28
|
+
|
29
29
|
it "should not have a model, if lang not known" do
|
30
30
|
sent = NlpToolz::Parser.new(@g_text)
|
31
|
-
sent.has_model
|
31
|
+
expect(sent.has_model?).to be_falsey
|
32
32
|
end
|
33
33
|
end
|
34
|
-
|
34
|
+
|
35
35
|
describe "object" do
|
36
36
|
it "should create a valid object" do
|
37
37
|
expect{ text = NlpToolz::Parser.new(@text,"en") }.to_not raise_error
|
38
38
|
end
|
39
|
-
|
39
|
+
|
40
40
|
it "should set the language of input" do
|
41
41
|
text = NlpToolz::Parser.new(@text)
|
42
|
-
text.lang.
|
42
|
+
expect(text.lang).to be == "en"
|
43
43
|
end
|
44
|
-
|
44
|
+
|
45
45
|
it "should build the right model name" do
|
46
46
|
text = NlpToolz::Parser.new(@text)
|
47
|
-
text.model_name.
|
47
|
+
expect(text.model_name).to be == "en-sm5.gr"
|
48
48
|
end
|
49
49
|
end
|
50
50
|
|
@@ -52,15 +52,15 @@ describe NlpToolz do
|
|
52
52
|
it "should store tree in a hash" do
|
53
53
|
text = NlpToolz::Parser.new(@text)
|
54
54
|
text.parse_text
|
55
|
-
text.parse_hash.
|
55
|
+
expect(text.parse_hash).to be_a(Hash)
|
56
56
|
end
|
57
57
|
|
58
58
|
it "should have a token hash after parsing" do
|
59
59
|
text = NlpToolz::Parser.new(@text)
|
60
60
|
text.parse_text
|
61
|
-
text.layer.
|
62
|
-
text.layer.
|
63
|
-
text.layer.
|
61
|
+
expect(text.layer).to be_a Hash
|
62
|
+
expect(text.layer).to include(:tags)
|
63
|
+
expect(text.layer).to include(:tokens)
|
64
64
|
end
|
65
65
|
end
|
66
66
|
end # Parser
|
data/spec/lib/nlp_toolz/pos_tags_spec.rb
CHANGED
@@ -7,60 +7,58 @@ describe NlpToolz do
|
|
7
7
|
@text = "Military historian Basil Liddell Hart famously declared that Sherman was the first modern general."
|
8
8
|
@g_text = "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από."
|
9
9
|
end
|
10
|
-
|
10
|
+
|
11
11
|
describe "attributes" do
|
12
12
|
it "should respond to #attribute" do
|
13
13
|
text = NlpToolz::PosTags.new(@text)
|
14
|
-
text.
|
15
|
-
text.
|
16
|
-
text.
|
17
|
-
text.
|
18
|
-
text.
|
14
|
+
expect(text).to respond_to(:input)
|
15
|
+
expect(text).to respond_to(:lang)
|
16
|
+
expect(text).to respond_to(:model_name)
|
17
|
+
expect(text).to respond_to(:model)
|
18
|
+
expect(text).to respond_to(:tokenized)
|
19
19
|
end
|
20
20
|
end
|
21
|
-
|
21
|
+
|
22
22
|
describe "model" do
|
23
23
|
it "should have a model, if lang 'en'" do
|
24
24
|
sent = NlpToolz::PosTags.new(@text,'en')
|
25
|
-
sent.model_name.
|
26
|
-
sent.has_model
|
25
|
+
expect(sent.model_name).to be == 'en-pos-maxent.bin'
|
26
|
+
expect(sent.has_model?).to be_truthy
|
27
27
|
end
|
28
|
-
|
28
|
+
|
29
29
|
it "should not have a model, if lang not known" do
|
30
30
|
sent = NlpToolz::PosTags.new(@g_text)
|
31
|
-
sent.has_model
|
31
|
+
expect(sent.has_model?).to be_falsey
|
32
32
|
end
|
33
33
|
end
|
34
|
-
|
34
|
+
|
35
35
|
describe "object" do
|
36
36
|
it "should create a valid object" do
|
37
37
|
expect{ text = NlpToolz::PosTags.new(@text,"en") }.to_not raise_error
|
38
38
|
end
|
39
|
-
|
39
|
+
|
40
40
|
it "should set the language of input" do
|
41
41
|
text = NlpToolz::PosTags.new(@text)
|
42
|
-
text.lang.
|
42
|
+
expect(text.lang).to be == "en"
|
43
43
|
end
|
44
|
-
|
44
|
+
|
45
45
|
it "should build the right model name" do
|
46
46
|
text = NlpToolz::PosTags.new(@text)
|
47
|
-
text.model_name.
|
47
|
+
expect(text.model_name).to be == "en-pos-maxent.bin"
|
48
48
|
end
|
49
|
-
|
49
|
+
|
50
50
|
it "should be a hash after pos tagging" do
|
51
51
|
text = NlpToolz::PosTags.new(@text,"en")
|
52
52
|
text.get_pos_tags
|
53
|
-
text.tokenized.
|
54
|
-
text.tokenized.
|
55
|
-
text.tokenized.
|
53
|
+
expect(text.tokenized).to include(:tokens)
|
54
|
+
expect(text.tokenized).to include(:tags)
|
55
|
+
expect(text.tokenized).to be_a Hash
|
56
56
|
end
|
57
|
-
|
57
|
+
|
58
58
|
it "should get pos text of given text" do
|
59
59
|
text = NlpToolz::PosTags.new(@text,"en")
|
60
60
|
text.get_pos_tags
|
61
|
-
text.tokenized[:tokens].
|
62
|
-
text.tokenized[:tags].should have(15).items
|
63
|
-
text.tokenized[:tokens].length.should == text.tokenized[:tags].length
|
61
|
+
expect(text.tokenized[:tokens].length).to be == text.tokenized[:tags].length
|
64
62
|
end
|
65
63
|
end
|
66
64
|
end # POS Tags
|
data/spec/lib/nlp_toolz/sentences_spec.rb
CHANGED
@@ -7,53 +7,53 @@ describe NlpToolz do
|
|
7
7
|
@text = "William Tecumseh Sherman (February 8, 1820 – February 14, 1891) was an American soldier, businessman, educator, and author.
|
8
8
|
He served as a general in the United States Army during the American Civil War (1861–65), receiving both recognition for his outstanding command of military strategy, and criticism for the harshness of the scorched earth policies he implemented in conducting total war against the Confederate States of America.
|
9
9
|
Military historian Basil Liddell Hart famously declared that Sherman was the first modern general."
|
10
|
-
|
10
|
+
|
11
11
|
@g_text = "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από."
|
12
12
|
end
|
13
|
-
|
13
|
+
|
14
14
|
describe "attributes" do
|
15
15
|
it "should respond to #attribute" do
|
16
16
|
sent = NlpToolz::Sentences.new(@text)
|
17
|
-
sent.
|
18
|
-
sent.
|
19
|
-
sent.
|
20
|
-
sent.
|
21
|
-
sent.
|
17
|
+
expect(sent).to respond_to(:input)
|
18
|
+
expect(sent).to respond_to(:lang)
|
19
|
+
expect(sent).to respond_to(:model_name)
|
20
|
+
expect(sent).to respond_to(:model)
|
21
|
+
expect(sent).to respond_to(:sentences)
|
22
22
|
end
|
23
23
|
end
|
24
|
-
|
24
|
+
|
25
25
|
describe "model" do
|
26
26
|
it "should have a model, if lang 'en'" do
|
27
27
|
sent = NlpToolz::Sentences.new(@text,'en')
|
28
|
-
sent.has_model
|
29
|
-
sent.model_name.
|
28
|
+
expect(sent.has_model?).to be_truthy
|
29
|
+
expect(sent.model_name).to be == 'en-sent.bin'
|
30
30
|
end
|
31
|
-
|
31
|
+
|
32
32
|
it "should not have a model, if lang not known" do
|
33
33
|
sent = NlpToolz::Sentences.new(@g_text)
|
34
|
-
sent.has_model
|
34
|
+
expect(sent.has_model?).to be_falsey
|
35
35
|
end
|
36
36
|
end
|
37
|
-
|
37
|
+
|
38
38
|
describe "object" do
|
39
39
|
it "should create a valid object" do
|
40
40
|
expect{ sent = NlpToolz::Sentences.new(@text) }.to_not raise_error
|
41
41
|
end
|
42
|
-
|
42
|
+
|
43
43
|
it "should set the language of input" do
|
44
44
|
sent = NlpToolz::Sentences.new(@text)
|
45
|
-
sent.lang.
|
45
|
+
expect(sent.lang).to be == "en"
|
46
46
|
end
|
47
|
-
|
47
|
+
|
48
48
|
it "should build the right model name" do
|
49
49
|
sent = NlpToolz::Sentences.new(@text)
|
50
|
-
sent.model_name.
|
50
|
+
expect(sent.model_name).to be == "en-sent.bin"
|
51
51
|
end
|
52
|
-
|
52
|
+
|
53
53
|
it "should split incoming text into sentences" do
|
54
54
|
text = NlpToolz::Sentences.new(@text,"en")
|
55
55
|
text.split_into_sentences
|
56
|
-
text.sentences.
|
56
|
+
expect(text.sentences.length).to be == 3
|
57
57
|
end
|
58
58
|
end
|
59
59
|
end # Sentences
|
data/spec/lib/nlp_toolz/tokens_spec.rb
CHANGED
@@ -7,55 +7,55 @@ describe NlpToolz do
|
|
7
7
|
@text = "Military historian Basil Liddell Hart famously declared that Sherman was the first modern general."
|
8
8
|
@g_text = "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από."
|
9
9
|
end
|
10
|
-
|
10
|
+
|
11
11
|
describe "attributes" do
|
12
12
|
it "should respond to #attribute" do
|
13
13
|
text = NlpToolz::Tokens.new(@text)
|
14
|
-
text.
|
15
|
-
text.
|
16
|
-
text.
|
17
|
-
text.
|
18
|
-
text.
|
14
|
+
expect(text).to respond_to(:input)
|
15
|
+
expect(text).to respond_to(:lang)
|
16
|
+
expect(text).to respond_to(:model_name)
|
17
|
+
expect(text).to respond_to(:model)
|
18
|
+
expect(text).to respond_to(:tokens)
|
19
19
|
end
|
20
20
|
end
|
21
|
-
|
21
|
+
|
22
22
|
describe "model" do
|
23
23
|
it "should have a model, if lang 'en'" do
|
24
24
|
sent = NlpToolz::Tokens.new(@text,'en')
|
25
|
-
sent.has_model
|
25
|
+
expect(sent.has_model?).to be_truthy
|
26
26
|
end
|
27
|
-
|
27
|
+
|
28
28
|
it "should not have a model, if lang not known" do
|
29
29
|
sent = NlpToolz::Tokens.new(@g_text)
|
30
|
-
sent.has_model
|
30
|
+
expect(sent.has_model?).to be_falsey
|
31
31
|
end
|
32
32
|
end
|
33
|
-
|
33
|
+
|
34
34
|
describe "object" do
|
35
35
|
it "should create a valid object" do
|
36
36
|
expect{ text = NlpToolz::Tokens.new(@text,"en") }.to_not raise_error
|
37
37
|
end
|
38
|
-
|
38
|
+
|
39
39
|
it "should set the language of input" do
|
40
40
|
text = NlpToolz::Tokens.new(@text)
|
41
|
-
text.lang.
|
41
|
+
expect(text.lang).to be == "en"
|
42
42
|
end
|
43
|
-
|
43
|
+
|
44
44
|
it "should build the right model name" do
|
45
45
|
text = NlpToolz::Tokens.new(@text)
|
46
|
-
text.model_name.
|
46
|
+
expect(text.model_name).to be == "en-token.bin"
|
47
47
|
end
|
48
|
-
|
48
|
+
|
49
49
|
it "should be a arrar after tokenizing" do
|
50
50
|
text = NlpToolz::Tokens.new(@text,"en")
|
51
51
|
text.tokenize
|
52
|
-
text.tokens.
|
52
|
+
expect(text.tokens).to be_a Array
|
53
53
|
end
|
54
|
-
|
54
|
+
|
55
55
|
it "should tokenize given text" do
|
56
56
|
text = NlpToolz::Tokens.new(@text,"en")
|
57
57
|
text.tokenize
|
58
|
-
text.tokens.
|
58
|
+
expect(text.tokens.length).to be == 15
|
59
59
|
end
|
60
60
|
end
|
61
61
|
end # Tokens
|
data/spec/lib/nlp_toolz_spec.rb
CHANGED
@@ -11,33 +11,33 @@ describe NlpToolz do
|
|
11
11
|
describe "detect language" do
|
12
12
|
it "should description" do
|
13
13
|
lang = NlpToolz.get_lang(@text)
|
14
|
-
lang.
|
14
|
+
expect(lang).to be == 'en'
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
18
18
|
describe "sentence detection" do
|
19
19
|
it "should input text split into its sentences" do
|
20
20
|
sentences = NlpToolz.get_sentences(@text)
|
21
|
-
sentences.
|
21
|
+
expect(sentences.length).to be == 3
|
22
22
|
end
|
23
23
|
|
24
24
|
it "should be 'nil', if text lang is unsupported" do
|
25
25
|
sentences = NlpToolz.get_sentences(@g_text)
|
26
|
-
sentences.
|
26
|
+
expect(sentences).to be_nil
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
30
|
describe "tokenizing" do
|
31
31
|
it "should tag a sentence" do
|
32
32
|
tokens = NlpToolz.tokenize_sentence(@sentence)
|
33
|
-
tokens.
|
34
|
-
tokens.
|
33
|
+
expect(tokens.length).to be == 26
|
34
|
+
expect(tokens).to be_a Array
|
35
35
|
end
|
36
36
|
|
37
37
|
it "should tokenize a whole text" do
|
38
38
|
token_arr = NlpToolz.tokenize_text(@text)
|
39
|
-
token_arr.
|
40
|
-
token_arr.first.
|
39
|
+
expect(token_arr.length).to be == 3
|
40
|
+
expect(token_arr.first.length).to be == 26
|
41
41
|
end
|
42
42
|
end
|
43
43
|
|
@@ -45,12 +45,12 @@ describe NlpToolz do
|
|
45
45
|
it "should tag a sentence" do
|
46
46
|
sentence = NlpToolz.get_sentences(@sentence).last
|
47
47
|
tags = NlpToolz.tag_sentence(sentence)
|
48
|
-
tags[:tokens].length.
|
48
|
+
expect(tags[:tokens].length).to be == tags[:tags].length
|
49
49
|
end
|
50
50
|
|
51
51
|
it "should be 'nil', if sentence language not supported " do
|
52
52
|
tags = NlpToolz.tag_sentence(@g_text)
|
53
|
-
tags.
|
53
|
+
expect(tags).to be_nil
|
54
54
|
end
|
55
55
|
end
|
56
56
|
|
@@ -58,12 +58,12 @@ describe NlpToolz do
|
|
58
58
|
it "should parse a sentence" do
|
59
59
|
sentence = NlpToolz.get_sentences(@sentence).last
|
60
60
|
parsed = NlpToolz.parse_sentence(sentence)
|
61
|
-
parsed.
|
61
|
+
expect(parsed).to be_a Hash
|
62
62
|
end
|
63
|
-
|
63
|
+
|
64
64
|
it "should should be 'nil', if sentence language is not supported" do
|
65
65
|
parsed = NlpToolz.parse_sentence(@g_text)
|
66
|
-
parsed.
|
66
|
+
expect(parsed).to be_nil
|
67
67
|
end
|
68
68
|
end
|
69
69
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nlp_toolz
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- LeFnord
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-10-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -209,10 +209,10 @@ files:
|
|
209
209
|
- Rakefile
|
210
210
|
- bin/nlp_toolz
|
211
211
|
- lib/nlp_toolz.rb
|
212
|
-
- lib/nlp_toolz/helpers/lang.rb
|
213
212
|
- lib/nlp_toolz/helpers/string_extended.rb
|
214
213
|
- lib/nlp_toolz/helpers/tmp_file.rb
|
215
214
|
- lib/nlp_toolz/helpers/url_handler.rb
|
215
|
+
- lib/nlp_toolz/language.rb
|
216
216
|
- lib/nlp_toolz/load_jars.rb
|
217
217
|
- lib/nlp_toolz/parser.rb
|
218
218
|
- lib/nlp_toolz/pos_tags.rb
|
@@ -221,6 +221,7 @@ files:
|
|
221
221
|
- lib/nlp_toolz/version.rb
|
222
222
|
- nlp_toolz.gemspec
|
223
223
|
- spec/helpers/string_extended_spec.rb
|
224
|
+
- spec/lib/nlp_toolz/language_spec.rb
|
224
225
|
- spec/lib/nlp_toolz/parser_spec.rb
|
225
226
|
- spec/lib/nlp_toolz/pos_tags_spec.rb
|
226
227
|
- spec/lib/nlp_toolz/sentences_spec.rb
|
@@ -247,12 +248,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
247
248
|
version: '0'
|
248
249
|
requirements: []
|
249
250
|
rubyforge_project:
|
250
|
-
rubygems_version: 2.2.
|
251
|
+
rubygems_version: 2.2.2
|
251
252
|
signing_key:
|
252
253
|
specification_version: 4
|
253
254
|
summary: wrapper around the openNLP toolset
|
254
255
|
test_files:
|
255
256
|
- spec/helpers/string_extended_spec.rb
|
257
|
+
- spec/lib/nlp_toolz/language_spec.rb
|
256
258
|
- spec/lib/nlp_toolz/parser_spec.rb
|
257
259
|
- spec/lib/nlp_toolz/pos_tags_spec.rb
|
258
260
|
- spec/lib/nlp_toolz/sentences_spec.rb
|
data/lib/nlp_toolz/helpers/lang.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
module Lang
|
2
|
-
|
3
|
-
include UrlHandler
|
4
|
-
|
5
|
-
def get_language(text = nil)
|
6
|
-
uri = build_url("arielle.tm.informatik.uni-leipzig.de", 55700, "/langid", nil)
|
7
|
-
|
8
|
-
if @input
|
9
|
-
asv_response = post_data(URI.escape(@input),uri,{'Content-type'=>'text/plain;charset=utf-8'})
|
10
|
-
elsif text
|
11
|
-
asv_response = post_data(URI.escape(text),uri,{'Content-type'=>'text/plain;charset=utf-8'})
|
12
|
-
end
|
13
|
-
response = MultiJson.load(asv_response.body)
|
14
|
-
|
15
|
-
response["lang"]
|
16
|
-
end
|
17
|
-
|
18
|
-
# ToDo 2013-02-26: make different lang identifier available
|
19
|
-
def alternative_langs lang
|
20
|
-
langs = {
|
21
|
-
en: [:eng, :english],
|
22
|
-
de: [:ger, :german]
|
23
|
-
}.each.collect{|x| x.flatten}
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|