nlp_toolz 1.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/lib/nlp_toolz.rb ADDED
@@ -0,0 +1,84 @@
1
+ # coding: utf-8
2
+ # @author: LeFnord
3
+ # @email: pscholz.le@gmail.com
4
+ # @date: 2012-10-23
5
+
6
+ # for java usage
7
+ require "rjb"
8
+
9
+ # external requirements
10
+ require "awesome_print"
11
+ require "multi_json"
12
+
13
+ # internal requirements
14
+ require "nlp_toolz/version"
15
+ require "nlp_toolz/helpers/url_handler"
16
+ require "nlp_toolz/helpers/lang"
17
+ require "nlp_toolz/helpers/string_extended"
18
+ require "nlp_toolz/helpers/tmp_file"
19
+
20
+ # NLP Tools
21
+ require "nlp_toolz/load_jars"
22
+ require "nlp_toolz/sentences"
23
+ require "nlp_toolz/pos_tags"
24
+ require "nlp_toolz/tokens"
25
+ require "nlp_toolz/parser"
26
+
27
# Convenience facade over the NlpToolz tool classes (Sentences, Tokens,
# PosTags and Parser).
#
# All public methods are declared under +module_function+, so they are
# callable directly on the module, e.g. <tt>NlpToolz.get_sentences(text)</tt>.
module NlpToolz
  extend Lang

  module_function

  # Detects the language of +input+.
  #
  # Delegates to NlpToolz.get_language (presumably supplied by the extended
  # Lang helper -- confirm in nlp_toolz/helpers/lang).
  #
  # @param input the text to inspect
  # @return whatever NlpToolz.get_language returns for +input+
  def get_lang(input)
    NlpToolz.get_language(input)
  end

  # Splits +input+ into sentences.
  #
  # @param input the text to split
  # @param lang  optional language, handed to Sentences.new unchanged
  # @return the result of Sentences#split_into_sentences, or +nil+ when the
  #   Sentences object reports no model (+has_model?+ is falsy)
  def get_sentences(input, lang = nil)
    text = NlpToolz::Sentences.new(input, lang)
    text.split_into_sentences if text.has_model?
  end

  # Tokenizes a single sentence.
  #
  # @param input the sentence to tokenize
  # @param lang  optional language, handed to Tokens.new unchanged
  # @return the result of Tokens#tokenize
  def tokenize_sentence(input, lang = nil)
    sentence = NlpToolz::Tokens.new(input, lang)
    sentence.tokenize
  end

  # Tokenizes every sentence of +input+.
  #
  # @param input the text to process
  # @param lang  optional language
  # @return [Array] one #tokenize_sentence result per detected sentence;
  #   empty when #get_sentences returns +nil+
  def tokenize_text(input, lang = nil)
    map_sentences(input, lang) { |sentence| tokenize_sentence(sentence, lang) }
  end

  # POS-tags a single sentence.
  #
  # @param input the sentence to tag
  # @param lang  optional language, handed to PosTags.new unchanged
  # @return the result of PosTags#get_pos_tags, or +nil+ when the PosTags
  #   object reports no model (+has_model?+ is falsy)
  def tag_sentence(input, lang = nil)
    sentence = NlpToolz::PosTags.new(input, lang)
    sentence.get_pos_tags if sentence.has_model?
  end

  # POS-tags every sentence of +input+.
  #
  # @param input the text to process
  # @param lang  optional language
  # @return [Array] one #tag_sentence result per detected sentence;
  #   empty when #get_sentences returns +nil+
  def tag_text(input, lang = nil)
    map_sentences(input, lang) { |sentence| tag_sentence(sentence, lang) }
  end

  # Parses a single sentence.
  #
  # Calls Parser#parse_text and then returns Parser#parse_hash.
  # NOTE(review): unlike #get_sentences and #tag_sentence there is no
  # +has_model?+ guard here; the result for unsupported languages depends on
  # Parser itself -- confirm.
  #
  # @param input the sentence to parse
  # @param lang  optional language, handed to Parser.new unchanged
  # @return the parser's +parse_hash+
  def parse_sentence(input, lang = nil)
    text = NlpToolz::Parser.new(input, lang)
    text.parse_text

    text.parse_hash
  end

  # Parses every sentence of +input+.
  #
  # @param input the text to process
  # @param lang  optional language
  # @return [Array] one #parse_sentence result per detected sentence;
  #   empty when #get_sentences returns +nil+
  def parse_text(input, lang = nil)
    map_sentences(input, lang) { |sentence| parse_sentence(sentence, lang) }
  end

  # Yields each sentence detected in +input+ and collects the block results.
  #
  # #get_sentences returns +nil+ when no sentence model is available; that
  # case is mapped to an empty array instead of raising NoMethodError.
  def map_sentences(input, lang)
    sentences = get_sentences(input, lang)
    return [] unless sentences

    sentences.map { |sentence| yield(sentence) }
  end
  # Internal helper: keep it off the module's public interface.
  private_class_method :map_sentences
end
data/nlp_toolz.gemspec ADDED
@@ -0,0 +1,42 @@
1
+ # coding: utf-8
2
+ # @author: LeFnord
3
+ # @email: pscholz.le@gmail.com
4
+ # @date: 2012-10-23
5
+
6
# Put lib/ on the load path so the version constant can be required below.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'nlp_toolz/version'

Gem::Specification.new do |gem|
  gem.name          = "nlp_toolz"
  gem.version       = NlpToolz::VERSION
  gem.authors       = ["LeFnord"]
  gem.email         = ["pscholz.le@gmail.com"]
  gem.description   = %q{make NLP tools available, from OpenNLP and BerkeleyParser}
  gem.summary       = %q{wrapper around the openNLP toolset}
  gem.homepage      = "https://github.com/LeFnord/nlp_toolz"
  gem.license       = "MIT"

  gem.required_ruby_version = '>= 2.0.0'

  # Package everything tracked by git; executables and test files are
  # derived from that list by path prefix.
  gem.files         = `git ls-files`.split($/)
  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # testing
  gem.add_development_dependency "rspec"

  # guard tooling
  gem.add_development_dependency "guard"
  gem.add_development_dependency "rb-fsevent"
  gem.add_development_dependency "guard-rspec"
  gem.add_development_dependency "guard-bundler"
  gem.add_development_dependency "terminal-notifier-guard"

  # documentation
  gem.add_development_dependency "yard"
  gem.add_development_dependency "syntax"

  # runtime
  gem.add_runtime_dependency "rjb"
  gem.add_runtime_dependency "multi_json"
  # lib/nlp_toolz.rb requires awesome_print when the library is loaded, so it
  # has to be installed together with the gem, not only for development.
  gem.add_runtime_dependency "awesome_print"
  gem.add_runtime_dependency "gli"
  gem.add_runtime_dependency "rake"
end
@@ -0,0 +1,17 @@
1
+ # coding: utf-8
2
+ require "spec_helper"
3
+
4
# Spec for the String#clean_up extension: quotation marks must be removed.
describe String do
  let(:quotation_marks) { %w[ " ' „ “ ‘ ‘ ’ “ ” « » ‹ › ] }

  it "should delete quotations marks" do
    quotation_marks.join("").clean_up.should be_empty

    # Joining inserts (size - 1) separators of three characters each; the
    # expectation is that exactly those characters are left after clean_up.
    expected_length = (quotation_marks.length - 1) * 3
    [" ap", "ap "].each do |separator|
      quotation_marks.join(separator).clean_up.length.should == expected_length
    end
  end
end
@@ -0,0 +1,67 @@
1
+ # coding: utf-8
2
+ require "spec_helper"
3
+
4
# Specs for NlpToolz::Parser: public attributes, model lookup, construction
# and the structures available after parsing.
describe NlpToolz do
  describe "Parser" do
    # English sample sentence; the specs below expect it to be detected as "en".
    let(:english) { "Military historian Basil Liddell Hart famously declared that Sherman was the first modern general." }
    # Greek sample; the specs below expect that no model is found for it.
    let(:greek)   { "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από." }

    describe "attributes" do
      it "should respond to #attribute" do
        parser = NlpToolz::Parser.new(english)

        [:input, :lang, :model_name, :model, :parse_hash].each do |attribute|
          parser.should respond_to(attribute)
        end
      end
    end

    describe "model" do
      it "should have a model, if lang 'en'" do
        parser = NlpToolz::Parser.new(english, 'en')

        parser.model_name.should == 'en-sm5.gr'
        parser.has_model?.should be_true
      end

      it "should not have a model, if lang not known" do
        NlpToolz::Parser.new(greek).has_model?.should be_false
      end
    end

    describe "object" do
      it "should create a valid object" do
        expect { NlpToolz::Parser.new(english, "en") }.to_not raise_error
      end

      it "should set the language of input" do
        NlpToolz::Parser.new(english).lang.should == "en"
      end

      it "should build the right model name" do
        NlpToolz::Parser.new(english).model_name.should == "en-sm5.gr"
      end
    end

    describe "parsing" do
      it "should store tree in a hash" do
        parser = NlpToolz::Parser.new(english)
        parser.parse_text

        parser.parse_hash.should be_a(Hash)
      end

      it "should have a token hash after parsing" do
        parser = NlpToolz::Parser.new(english)
        parser.parse_text

        parser.layer.should be_a Hash
        [:tags, :tokens].each do |key|
          parser.layer.should include(key)
        end
      end
    end
  end # Parser
end # NlpToolz
@@ -0,0 +1,67 @@
1
+ # coding: utf-8
2
+ require "spec_helper"
3
+
4
# Specs for NlpToolz::PosTags: public attributes, model lookup, construction
# and the token/tag hash produced by tagging.
describe NlpToolz do
  describe "POS Tags" do
    # English sample sentence; the specs below expect it to be detected as "en".
    let(:english) { "Military historian Basil Liddell Hart famously declared that Sherman was the first modern general." }
    # Greek sample; the specs below expect that no model is found for it.
    let(:greek)   { "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από." }

    describe "attributes" do
      it "should respond to #attribute" do
        tagger = NlpToolz::PosTags.new(english)

        [:input, :lang, :model_name, :model, :tokenized].each do |attribute|
          tagger.should respond_to(attribute)
        end
      end
    end

    describe "model" do
      it "should have a model, if lang 'en'" do
        tagger = NlpToolz::PosTags.new(english, 'en')

        tagger.model_name.should == 'en-pos-maxent.bin'
        tagger.has_model?.should be_true
      end

      it "should not have a model, if lang not known" do
        NlpToolz::PosTags.new(greek).has_model?.should be_false
      end
    end

    describe "object" do
      it "should create a valid object" do
        expect { NlpToolz::PosTags.new(english, "en") }.to_not raise_error
      end

      it "should set the language of input" do
        NlpToolz::PosTags.new(english).lang.should == "en"
      end

      it "should build the right model name" do
        NlpToolz::PosTags.new(english).model_name.should == "en-pos-maxent.bin"
      end

      it "should be a hash after pos tagging" do
        tagger = NlpToolz::PosTags.new(english, "en")
        tagger.get_pos_tags

        [:tokens, :tags].each do |key|
          tagger.tokenized.should include(key)
        end
        tagger.tokenized.should be_a Hash
      end

      it "should get pos text of given text" do
        tagger = NlpToolz::PosTags.new(english, "en")
        tagger.get_pos_tags
        result = tagger.tokenized

        result[:tokens].should have(15).items
        result[:tags].should have(15).items
        result[:tokens].length.should == result[:tags].length
      end
    end
  end # POS Tags
end # NlpToolz
@@ -0,0 +1,60 @@
1
+ # coding: utf-8
2
+ require "spec_helper"
3
+
4
# Specs for NlpToolz::Sentences: public attributes, model lookup,
# construction and sentence splitting.
describe NlpToolz do
  describe "Sentences" do
    # Three English sentences separated by line breaks inside the literal.
    let(:english) do
      "William Tecumseh Sherman (February 8, 1820 – February 14, 1891) was an American soldier, businessman, educator, and author.
He served as a general in the United States Army during the American Civil War (1861–65), receiving both recognition for his outstanding command of military strategy, and criticism for the harshness of the scorched earth policies he implemented in conducting total war against the Confederate States of America.
Military historian Basil Liddell Hart famously declared that Sherman was the first modern general."
    end

    # Greek sample; the specs below expect that no model is found for it.
    let(:greek) { "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από." }

    describe "attributes" do
      it "should respond to #attribute" do
        splitter = NlpToolz::Sentences.new(english)

        [:input, :lang, :model_name, :model, :sentences].each do |attribute|
          splitter.should respond_to(attribute)
        end
      end
    end

    describe "model" do
      it "should have a model, if lang 'en'" do
        splitter = NlpToolz::Sentences.new(english, 'en')

        splitter.has_model?.should be_true
        splitter.model_name.should == 'en-sent.bin'
      end

      it "should not have a model, if lang not known" do
        NlpToolz::Sentences.new(greek).has_model?.should be_false
      end
    end

    describe "object" do
      it "should create a valid object" do
        expect { NlpToolz::Sentences.new(english) }.to_not raise_error
      end

      it "should set the language of input" do
        NlpToolz::Sentences.new(english).lang.should == "en"
      end

      it "should build the right model name" do
        NlpToolz::Sentences.new(english).model_name.should == "en-sent.bin"
      end

      it "should split incoming text into sentences" do
        splitter = NlpToolz::Sentences.new(english, "en")
        splitter.split_into_sentences

        splitter.sentences.should have(3).items
      end
    end
  end # Sentences
end # NlpToolz
@@ -0,0 +1,62 @@
1
+ # coding: utf-8
2
+ require "spec_helper"
3
+
4
# Specs for NlpToolz::Tokens: public attributes, model lookup, construction
# and tokenizing.
describe NlpToolz do
  describe "Tokens" do
    # English sample sentence; the specs below expect it to be detected as "en".
    let(:english) { "Military historian Basil Liddell Hart famously declared that Sherman was the first modern general." }
    # Greek sample; the specs below expect that no model is found for it.
    let(:greek)   { "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από." }

    describe "attributes" do
      it "should respond to #attribute" do
        tokenizer = NlpToolz::Tokens.new(english)

        [:input, :lang, :model_name, :model, :tokens].each do |attribute|
          tokenizer.should respond_to(attribute)
        end
      end
    end

    describe "model" do
      it "should have a model, if lang 'en'" do
        NlpToolz::Tokens.new(english, 'en').has_model?.should be_true
      end

      it "should not have a model, if lang not known" do
        NlpToolz::Tokens.new(greek).has_model?.should be_false
      end
    end

    describe "object" do
      it "should create a valid object" do
        expect { NlpToolz::Tokens.new(english, "en") }.to_not raise_error
      end

      it "should set the language of input" do
        NlpToolz::Tokens.new(english).lang.should == "en"
      end

      it "should build the right model name" do
        NlpToolz::Tokens.new(english).model_name.should == "en-token.bin"
      end

      it "should be a arrar after tokenizing" do
        tokenizer = NlpToolz::Tokens.new(english, "en")
        tokenizer.tokenize

        tokenizer.tokens.should be_a Array
      end

      it "should tokenize given text" do
        tokenizer = NlpToolz::Tokens.new(english, "en")
        tokenizer.tokenize

        tokenizer.tokens.should have(15).items
      end
    end
  end # Tokens
end # NlpToolz
@@ -0,0 +1,69 @@
1
+ # coding: utf-8
2
+ require "spec_helper"
3
+
4
# Specs for the module-level convenience API of NlpToolz (language
# detection, sentence splitting, tokenizing, tagging and parsing).
describe NlpToolz do
  before(:each) do
    # A single English sentence.
    @sentence = "William Tecumseh Sherman (February 8, 1820 – February 14, 1891) was an American soldier, businessman, educator, and author."
    # Three English sentences on one line.
    @text = "William Tecumseh Sherman (February 8, 1820 – February 14, 1891) was an American soldier, businessman, educator, and author. He served as a general in the United States Army during the American Civil War (1861–65), receiving both recognition for his outstanding command of military strategy, and criticism for the harshness of the scorched earth policies he implemented in conducting total war against the Confederate States of America. Military historian Basil Liddell Hart famously declared that Sherman was the first modern general."
    # Greek sample; the examples below expect nil results for it.
    @g_text = "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από."
  end

  describe "detect language" do
    # Description replaces the former placeholder "should description".
    it "should detect the language of a text" do
      lang = NlpToolz.get_lang(@text)
      lang.should == 'en'
    end
  end

  describe "sentence detection" do
    it "should input text split into its sentences" do
      sentences = NlpToolz.get_sentences(@text)
      sentences.should have(3).items
    end

    it "should be 'nil', if text lang is unsupported" do
      sentences = NlpToolz.get_sentences(@g_text)
      sentences.should be_nil
    end
  end

  describe "tokenizing" do
    # Formerly described as "should tag a sentence", although it exercises
    # tokenize_sentence.
    it "should tokenize a sentence" do
      tokens = NlpToolz.tokenize_sentence(@sentence)
      tokens.should have(26).items
      tokens.should be_a Array
    end

    it "should tokenize a whole text" do
      token_arr = NlpToolz.tokenize_text(@text)
      token_arr.should have(3).items
      token_arr.first.should have(26).items
    end
  end

  describe "tagging" do
    it "should tag a sentence" do
      sentence = NlpToolz.get_sentences(@sentence).last
      tags = NlpToolz.tag_sentence(sentence)
      tags[:tokens].length.should == tags[:tags].length
    end

    it "should be 'nil', if sentence language not supported " do
      tags = NlpToolz.tag_sentence(@g_text)
      tags.should be_nil
    end
  end

  describe "parsing" do
    it "should parse a sentence" do
      sentence = NlpToolz.get_sentences(@sentence).last
      parsed = NlpToolz.parse_sentence(sentence)
      parsed.should be_a Hash
    end

    # Doubled word ("should should") removed from the description.
    it "should be 'nil', if sentence language is not supported" do
      parsed = NlpToolz.parse_sentence(@g_text)
      parsed.should be_nil
    end
  end
end
@@ -0,0 +1,16 @@
1
# Shared RSpec bootstrap.
#
# The load path is extended *before* the library is required, so that
# `require "nlp_toolz"` resolves against this checkout's lib/ directory even
# when the runner did not already add it.
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
$LOAD_PATH.unshift(File.dirname(__FILE__))

require "nlp_toolz"

RSpec.configure do |config|
  # config.treat_symbols_as_metadata_keys_with_true_values = true
  # config.run_all_when_everything_filtered = true
  # config.filter_run :focus

  # Run specs in random order to surface order dependencies. If you find an
  # order dependency and want to debug it, you can fix the order by providing
  # the seed, which is printed after each run.
  #     --seed 1234
  config.order = 'random'
end
+ end