RubyGems - nlp_toolz - Versions diffs - 1.0.3 - Mend

nlp_toolz 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

checksums.yaml +7 -0
data/.gitignore +28 -0
data/.rspec +2 -0
data/Gemfile +6 -0
data/Guardfile +13 -0
data/LICENSE.txt +22 -0
data/README.md +37 -0
data/Rakefile +15 -0
data/bin/nlp_toolz +92 -0
data/lib/nlp_toolz/helpers/lang.rb +36 -0
data/lib/nlp_toolz/helpers/string_extended.rb +20 -0
data/lib/nlp_toolz/helpers/tmp_file.rb +18 -0
data/lib/nlp_toolz/helpers/url_handler.rb +26 -0
data/lib/nlp_toolz/load_jars.rb +22 -0
data/lib/nlp_toolz/parser.rb +146 -0
data/lib/nlp_toolz/pos_tags.rb +77 -0
data/lib/nlp_toolz/sentences.rb +50 -0
data/lib/nlp_toolz/tokens.rb +48 -0
data/lib/nlp_toolz/version.rb +8 -0
data/lib/nlp_toolz.rb +84 -0
data/nlp_toolz.gemspec +42 -0
data/spec/helpers/string_extended_spec.rb +17 -0
data/spec/lib/nlp_toolz/parser_spec.rb +67 -0
data/spec/lib/nlp_toolz/pos_tags_spec.rb +67 -0
data/spec/lib/nlp_toolz/sentences_spec.rb +60 -0
data/spec/lib/nlp_toolz/tokens_spec.rb +62 -0
data/spec/lib/nlp_toolz_spec.rb +69 -0
data/spec/spec_helper.rb +16 -0
metadata +262 -0

data/lib/nlp_toolz.rb ADDED Viewed

@@ -0,0 +1,84 @@
+# coding:  utf-8
+# @author: LeFnord
+# @email:  pscholz.le@gmail.com
+# @date:   2012-10-23
+# for java usage
+require "rjb"
+# external requirements
+require "awesome_print"
+require "multi_json"
+# internal requirements
+require "nlp_toolz/version"
+require "nlp_toolz/helpers/url_handler"
+require "nlp_toolz/helpers/lang"
+require "nlp_toolz/helpers/string_extended"
+require "nlp_toolz/helpers/tmp_file"
+# NLP Tools
+require "nlp_toolz/load_jars"
+require "nlp_toolz/sentences"
+require "nlp_toolz/pos_tags"
+require "nlp_toolz/tokens"
+require "nlp_toolz/parser"
+module NlpToolz
+  extend Lang
+  module_function
+  def get_lang(input)
+    NlpToolz.get_language(input)
+  end
+  def get_sentences(input,lang = nil)
+    text = NlpToolz::Sentences.new(input,lang)
+    text.split_into_sentences if text.has_model?
+  end
+  def tokenize_sentence(input,lang = nil)
+    sentence = NlpToolz::Tokens.new(input,lang)
+    sentence.tokenize
+  end
+  def tokenize_text(input,lang = nil)
+    tokenized_text = []
+    get_sentences(input,lang).each do |sentence|
+      tokenized_text << tokenize_sentence(sentence,lang)
+    end
+    tokenized_text
+  end
+  def tag_sentence(input,lang = nil)
+    sentence = NlpToolz::PosTags.new(input,lang)
+    sentence.get_pos_tags if sentence.has_model?
+  end
+  def tag_text(input,lang = nil)
+    tagged_text = []
+    get_sentences(input,lang).each do |sentence|
+      tagged_text << tag_sentence(sentence,lang)
+    end
+    tagged_text
+  end
+  def parse_sentence(input,lang = nil)
+    text = NlpToolz::Parser.new(input,lang)
+    text.parse_text
+    text.parse_hash
+  end
+  def parse_text(input,lang = nil)
+    parsed_text = []
+    get_sentences(input,lang).each do |sentence|
+      parsed_text << parse_sentence(sentence,lang)
+    end
+    parsed_text
+  end
+end

data/nlp_toolz.gemspec ADDED Viewed

@@ -0,0 +1,42 @@
+# coding: utf-8
+# @author: LeFnord
+# @email:  pscholz.le@gmail.com
+# @date:   2012-10-23
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'nlp_toolz/version'
+Gem::Specification.new do |gem|
+  gem.name          = "nlp_toolz"
+  gem.version       = NlpToolz::VERSION
+  gem.authors       = ["LeFnord"]
+  gem.email         = ["pscholz.le@gmail.com"]
+  gem.description   = %q{make NLP tools available, from OpenNLP and BerkeleyParser}
+  gem.summary       = %q{wrapper around the openNLP toolset}
+  gem.homepage      = "https://github.com/LeFnord/nlp_toolz"
+  gem.license       = "MIT"
+  gem.required_ruby_version = '>= 2.0.0'
+  gem.files         = `git ls-files`.split($/)
+  gem.executables   = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
+  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
+  gem.require_paths = ["lib"]
+  gem.add_development_dependency "rspec"
+  gem.add_development_dependency "guard"
+  gem.add_development_dependency "rb-fsevent"
+  gem.add_development_dependency "guard-rspec"
+  gem.add_development_dependency "guard-bundler"
+  gem.add_development_dependency "terminal-notifier-guard"
+  gem.add_development_dependency "yard"
+  gem.add_development_dependency "syntax"
+  gem.add_development_dependency "awesome_print"
+  gem.add_runtime_dependency "rjb"
+  gem.add_runtime_dependency "multi_json"
+  gem.add_runtime_dependency "gli"
+  gem.add_runtime_dependency "rake"
+end

data/spec/helpers/string_extended_spec.rb ADDED Viewed

@@ -0,0 +1,17 @@
+# coding: utf-8
+require "spec_helper"
+describe String do
+  before(:each) do
+    @a = %w[ " ' „ “ ‘ ‘ ’ “ ” « » ‹ › ]
+  end
+  it "should delete quotations marks" do
+    @a.join("").clean_up.should be_empty
+    chars =  (@a.length - 1) * 3
+    @a.join(" ap").clean_up.length.should == chars
+    @a.join("ap ").clean_up.length.should == chars
+  end
+end

data/spec/lib/nlp_toolz/parser_spec.rb ADDED Viewed

@@ -0,0 +1,67 @@
+# coding: utf-8
+require "spec_helper"
+describe NlpToolz do
+  describe "Parser" do
+    before(:each) do
+      @text = "Military historian Basil Liddell Hart famously declared that Sherman was the first modern general."
+      @g_text = "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από."
+    end
+    describe "attributes" do
+      it "should respond to #attribute" do
+        text = NlpToolz::Parser.new(@text)
+        text.should respond_to(:input)
+        text.should respond_to(:lang)
+        text.should respond_to(:model_name)
+        text.should respond_to(:model)
+        text.should respond_to(:parse_hash)
+      end
+    end
+    describe "model" do
+      it "should have a model, if lang 'en'" do
+        sent = NlpToolz::Parser.new(@text,'en')
+        sent.model_name.should == 'en-sm5.gr'
+        sent.has_model?.should be_true
+      end
+      it "should not have a model, if lang not known" do
+        sent = NlpToolz::Parser.new(@g_text)
+        sent.has_model?.should be_false
+      end
+    end
+    describe "object" do
+      it "should create a valid object" do
+        expect{ text = NlpToolz::Parser.new(@text,"en") }.to_not raise_error
+      end
+      it "should set the language of input" do
+        text = NlpToolz::Parser.new(@text)
+        text.lang.should == "en"
+      end
+      it "should build the right model name" do
+        text = NlpToolz::Parser.new(@text)
+        text.model_name.should == "en-sm5.gr"
+      end
+    end
+    describe "parsing" do
+      it "should store tree in a hash" do
+        text = NlpToolz::Parser.new(@text)
+        text.parse_text
+        text.parse_hash.should be_a(Hash)
+      end
+      it "should have a token hash after parsing" do
+        text = NlpToolz::Parser.new(@text)
+        text.parse_text
+        text.layer.should be_a Hash
+        text.layer.should include(:tags)
+        text.layer.should include(:tokens)
+      end
+    end
+  end # Parser
+end # NlpToolz

data/spec/lib/nlp_toolz/pos_tags_spec.rb ADDED Viewed

@@ -0,0 +1,67 @@
+# coding: utf-8
+require "spec_helper"
+describe NlpToolz do
+  describe "POS Tags" do
+    before(:each) do
+      @text = "Military historian Basil Liddell Hart famously declared that Sherman was the first modern general."
+      @g_text = "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από."
+    end
+    describe "attributes" do
+      it "should respond to #attribute" do
+        text = NlpToolz::PosTags.new(@text)
+        text.should respond_to(:input)
+        text.should respond_to(:lang)
+        text.should respond_to(:model_name)
+        text.should respond_to(:model)
+        text.should respond_to(:tokenized)
+      end
+    end
+    describe "model" do
+      it "should have a model, if lang 'en'" do
+        sent = NlpToolz::PosTags.new(@text,'en')
+        sent.model_name.should == 'en-pos-maxent.bin'
+        sent.has_model?.should be_true
+      end
+      it "should not have a model, if lang not known" do
+        sent = NlpToolz::PosTags.new(@g_text)
+        sent.has_model?.should be_false
+      end
+    end
+    describe "object" do
+      it "should create a valid object" do
+        expect{ text = NlpToolz::PosTags.new(@text,"en") }.to_not raise_error
+      end
+      it "should set the language of input" do
+        text = NlpToolz::PosTags.new(@text)
+        text.lang.should == "en"
+      end
+      it "should build the right model name" do
+        text = NlpToolz::PosTags.new(@text)
+        text.model_name.should == "en-pos-maxent.bin"
+      end
+      it "should be a hash after pos tagging" do
+        text = NlpToolz::PosTags.new(@text,"en")
+        text.get_pos_tags
+        text.tokenized.should include(:tokens)
+        text.tokenized.should include(:tags)
+        text.tokenized.should be_a Hash
+      end
+      it "should get pos text of given text" do
+        text = NlpToolz::PosTags.new(@text,"en")
+        text.get_pos_tags
+        text.tokenized[:tokens].should have(15).items
+        text.tokenized[:tags].should have(15).items
+        text.tokenized[:tokens].length.should == text.tokenized[:tags].length
+      end
+    end
+  end # POS Tags
+end # NlpToolz

data/spec/lib/nlp_toolz/sentences_spec.rb ADDED Viewed

@@ -0,0 +1,60 @@
+# coding: utf-8
+require "spec_helper"
+describe NlpToolz do
+  describe "Sentences" do
+    before(:each) do
+      @text = "William Tecumseh Sherman (February 8, 1820 – February 14, 1891) was an American soldier, businessman, educator, and author.
+               He served as a general in the United States Army during the American Civil War (1861–65), receiving both recognition for his outstanding command of military strategy, and criticism for the harshness of the scorched earth policies he implemented in conducting total war against the Confederate States of America.
+               Military historian Basil Liddell Hart famously declared that Sherman was the first modern general."
+      @g_text = "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από."
+    end
+    describe "attributes" do
+      it "should respond to #attribute" do
+        sent = NlpToolz::Sentences.new(@text)
+        sent.should respond_to(:input)
+        sent.should respond_to(:lang)
+        sent.should respond_to(:model_name)
+        sent.should respond_to(:model)
+        sent.should respond_to(:sentences)
+      end
+    end
+    describe "model" do
+      it "should have a model, if lang 'en'" do
+        sent = NlpToolz::Sentences.new(@text,'en')
+        sent.has_model?.should be_true
+        sent.model_name.should == 'en-sent.bin'
+      end
+      it "should not have a model, if lang not known" do
+        sent = NlpToolz::Sentences.new(@g_text)
+        sent.has_model?.should be_false
+      end
+    end
+    describe "object" do
+      it "should create a valid object" do
+        expect{ sent = NlpToolz::Sentences.new(@text) }.to_not raise_error
+      end
+      it "should set the language of input" do
+        sent = NlpToolz::Sentences.new(@text)
+        sent.lang.should == "en"
+      end
+      it "should build the right model name" do
+        sent = NlpToolz::Sentences.new(@text)
+        sent.model_name.should == "en-sent.bin"
+      end
+      it "should split incoming text into sentences" do
+        text = NlpToolz::Sentences.new(@text,"en")
+        text.split_into_sentences
+        text.sentences.should have(3).items
+      end
+    end
+  end # Sentences
+end # NlpToolz

data/spec/lib/nlp_toolz/tokens_spec.rb ADDED Viewed

@@ -0,0 +1,62 @@
+# coding: utf-8
+require "spec_helper"
+describe NlpToolz do
+  describe "Tokens" do
+    before(:each) do
+      @text = "Military historian Basil Liddell Hart famously declared that Sherman was the first modern general."
+      @g_text = "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από."
+    end
+    describe "attributes" do
+      it "should respond to #attribute" do
+        text = NlpToolz::Tokens.new(@text)
+        text.should respond_to(:input)
+        text.should respond_to(:lang)
+        text.should respond_to(:model_name)
+        text.should respond_to(:model)
+        text.should respond_to(:tokens)
+      end
+    end
+    describe "model" do
+      it "should have a model, if lang 'en'" do
+        sent = NlpToolz::Tokens.new(@text,'en')
+        sent.has_model?.should be_true
+      end
+      it "should not have a model, if lang not known" do
+        sent = NlpToolz::Tokens.new(@g_text)
+        sent.has_model?.should be_false
+      end
+    end
+    describe "object" do
+      it "should create a valid object" do
+        expect{ text = NlpToolz::Tokens.new(@text,"en") }.to_not raise_error
+      end
+      it "should set the language of input" do
+        text = NlpToolz::Tokens.new(@text)
+        text.lang.should == "en"
+      end
+      it "should build the right model name" do
+        text = NlpToolz::Tokens.new(@text)
+        text.model_name.should == "en-token.bin"
+      end
+      it "should be a arrar after tokenizing" do
+        text = NlpToolz::Tokens.new(@text,"en")
+        text.tokenize
+        text.tokens.should be_a Array
+      end
+      it "should tokenize given text" do
+        text = NlpToolz::Tokens.new(@text,"en")
+        text.tokenize
+        text.tokens.should have(15).items
+      end
+    end
+  end # Tokens
+end # NlpToolz

data/spec/lib/nlp_toolz_spec.rb ADDED Viewed

@@ -0,0 +1,69 @@
+# coding: utf-8
+require "spec_helper"
+describe NlpToolz do
+  before(:each) do
+    @sentence = "William Tecumseh Sherman (February 8, 1820 – February 14, 1891) was an American soldier, businessman, educator, and author."
+    @text = "William Tecumseh Sherman (February 8, 1820 – February 14, 1891) was an American soldier, businessman, educator, and author. He served as a general in the United States Army during the American Civil War (1861–65), receiving both recognition for his outstanding command of military strategy, and criticism for the harshness of the scorched earth policies he implemented in conducting total war against the Confederate States of America. Military historian Basil Liddell Hart famously declared that Sherman was the first modern general."
+    @g_text = "μακεδονικού εκκεντροφόρου πολιτισμός του. την ανάφλεξης πολιτισμική. πολιτισμού του να. τόπος επειδή σε. καθορίσουν χρόνια Στα από."
+  end
+  describe "detect language" do
+    it "should description" do
+      lang = NlpToolz.get_lang(@text)
+      lang.should == 'en'
+    end
+  end
+  describe "sentence detection" do
+    it "should input text split into its sentences" do
+      sentences = NlpToolz.get_sentences(@text)
+      sentences.should have(3).items
+    end
+    it "should be 'nil', if text lang is unsupported" do
+      sentences = NlpToolz.get_sentences(@g_text)
+      sentences.should be_nil
+    end
+  end
+  describe "tokenizing" do
+    it "should tag a sentence" do
+      tokens = NlpToolz.tokenize_sentence(@sentence)
+      tokens.should have(26).items
+      tokens.should be_a Array
+    end
+    it "should tokenize a whole text" do
+      token_arr = NlpToolz.tokenize_text(@text)
+      token_arr.should have(3).items
+      token_arr.first.should have(26).items
+    end
+  end
+  describe "tagging" do
+    it "should tag a sentence" do
+      sentence = NlpToolz.get_sentences(@sentence).last
+      tags = NlpToolz.tag_sentence(sentence)
+      tags[:tokens].length.should == tags[:tags].length
+    end
+    it "should be 'nil', if sentence language not supported " do
+      tags = NlpToolz.tag_sentence(@g_text)
+      tags.should be_nil
+    end
+  end
+  describe "parsing" do
+    it "should parse a sentence" do
+      sentence = NlpToolz.get_sentences(@sentence).last
+      parsed = NlpToolz.parse_sentence(sentence)
+      parsed.should be_a Hash
+    end
+    it "should should be 'nil', if sentence language is not supported" do
+      parsed = NlpToolz.parse_sentence(@g_text)
+      parsed.should be_nil
+    end
+  end
+end

data/spec/spec_helper.rb ADDED Viewed

@@ -0,0 +1,16 @@
+require "nlp_toolz"
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+RSpec.configure do |config|
+  # config.treat_symbols_as_metadata_keys_with_true_values = true
+  # config.run_all_when_everything_filtered = true
+  # config.filter_run :focus
+  # Run specs in random order to surface order dependencies. If you find an
+  # order dependency and want to debug it, you can fix the order by providing
+  # the seed, which is printed after each run.
+  #     --seed 1234
+  config.order = 'random'
+end