nlp_toolz 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +28 -0
- data/.rspec +2 -0
- data/Gemfile +6 -0
- data/Guardfile +13 -0
- data/LICENSE.txt +22 -0
- data/README.md +37 -0
- data/Rakefile +15 -0
- data/bin/nlp_toolz +92 -0
- data/lib/nlp_toolz/helpers/lang.rb +36 -0
- data/lib/nlp_toolz/helpers/string_extended.rb +20 -0
- data/lib/nlp_toolz/helpers/tmp_file.rb +18 -0
- data/lib/nlp_toolz/helpers/url_handler.rb +26 -0
- data/lib/nlp_toolz/load_jars.rb +22 -0
- data/lib/nlp_toolz/parser.rb +146 -0
- data/lib/nlp_toolz/pos_tags.rb +77 -0
- data/lib/nlp_toolz/sentences.rb +50 -0
- data/lib/nlp_toolz/tokens.rb +48 -0
- data/lib/nlp_toolz/version.rb +8 -0
- data/lib/nlp_toolz.rb +84 -0
- data/nlp_toolz.gemspec +42 -0
- data/spec/helpers/string_extended_spec.rb +17 -0
- data/spec/lib/nlp_toolz/parser_spec.rb +67 -0
- data/spec/lib/nlp_toolz/pos_tags_spec.rb +67 -0
- data/spec/lib/nlp_toolz/sentences_spec.rb +60 -0
- data/spec/lib/nlp_toolz/tokens_spec.rb +62 -0
- data/spec/lib/nlp_toolz_spec.rb +69 -0
- data/spec/spec_helper.rb +16 -0
- metadata +262 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 71916455cffe07c8464fb8cc1543d7b8a2ea7205
|
4
|
+
data.tar.gz: bc30072b7d62770c3e202e0545137056fe5a6164
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 997d3fc4fb5d9c18546e1ea4c5c8acd19e61ef6979ece0d27cff540cea99c2ecae094fba16a4c3aa25dc05f1fe9282498c228a898b68b4271e493027663e0ba3
|
7
|
+
data.tar.gz: 42d5ea917f3febe6484a80ab085f0b41515540f841edc2de4b219d06456d7d331a750fb306095336918b4c82f4cd184d1dc6099cd4ff0fd51e2cb487adab9944
|
data/.gitignore
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
spec/reports
|
15
|
+
test/tmp
|
16
|
+
test/version_tmp
|
17
|
+
tmp
|
18
|
+
|
19
|
+
.rvmrc
|
20
|
+
|
21
|
+
ToDo.task
|
22
|
+
|
23
|
+
teste.rb
|
24
|
+
|
25
|
+
.DS_Store
|
26
|
+
test-data/
|
27
|
+
jars/*
|
28
|
+
models/*
|
data/.rspec
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# A sample Guardfile
# More info at https://github.com/guard/guard#readme

# re-run the spec that corresponds to a changed lib file
guard :rspec do
  watch(%r{^spec/.+_spec\.rb$})
  watch(%r{^lib/(.+)\.rb$}) { |m| "spec/lib/#{m[1]}_spec.rb" }
  watch('spec/spec_helper.rb') { "spec" }
end

# keep the bundle in sync when the Gemfile or gemspec changes
guard :bundler do
  watch('Gemfile')
  watch(/^.+\.gemspec/)
end
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 LeFnord
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# NlpToolz
|
2
|
+
|
3
|
+
Basic NLP tools, mostly based on [OpenNLP](http://opennlp.apache.org), at this time `sentence finder`, `tokenizer` and `POS tagger` implemented, plus [Berkeley Parser](http://code.google.com/p/berkeleyparser/).
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'nlp_toolz'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install nlp_toolz
|
18
|
+
|
19
|
+
Download jars and model files from [Dropbox](https://www.dropbox.com/sh/1layyjgf5h0wwi3/s2SHAnfVhs) and unzip it in gem folder.
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
see: [nlp_toolz.rb](https://github.com/LeFnord/nlp_toolz/blob/master/lib/nlp_toolz.rb) and specs for usage
|
24
|
+
|
25
|
+
## Contributing
|
26
|
+
|
27
|
+
1. Fork it
|
28
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
29
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
30
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
31
|
+
5. Create new Pull Request
|
32
|
+
|
33
|
+
## Comments
|
34
|
+
|
35
|
+
- removed Celluloid, do concurrency in your app, where it be used
|
36
|
+
- check `load_jars` for JVM parameters
|
37
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
require "awesome_print"
|
4
|
+
require 'rspec/core'
|
5
|
+
require 'rspec/core/rake_task'
|
6
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
7
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
8
|
+
end
|
9
|
+
|
10
|
+
task :default => :spec
|
11
|
+
|
12
|
+
require 'yard'
|
13
|
+
YARD::Rake::YardocTask.new
|
14
|
+
|
15
|
+
Dir["lib/tasks/**/*.rake"].sort.each { |ext| load ext }
|
data/bin/nlp_toolz
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'gli'
|
4
|
+
begin # XXX: Remove this begin/rescue before distributing your app
|
5
|
+
require 'nlp_toolz'
|
6
|
+
rescue LoadError
|
7
|
+
STDERR.puts "In development, you need to use `bundle exec bin/nlp_toolz` to run your app"
|
8
|
+
STDERR.puts "At install-time, RubyGems will make sure lib, etc. are in the load path"
|
9
|
+
STDERR.puts "Feel free to remove this message from bin/NlpToolz now"
|
10
|
+
exit 64
|
11
|
+
end
|
12
|
+
|
13
|
+
include GLI::App
|
14
|
+
|
15
|
+
# helper methods

# Print +this+ to stdout: pretty-printed (awesome_print) when attached
# to a TTY, plain text otherwise so output stays pipe-friendly.
def get_out(this)
  if $stdout.tty?
    ap this
  else
    $stdout.puts this
  end
end

# Resolve a CLI argument: when it names an existing regular file, return
# the file's content; otherwise treat the argument itself as input text.
def get_in(input_arg)
  if File.exist?(input_arg) && !File.directory?(input_arg)
    get_file(input_arg)
  else
    input_arg
  end
end

# Read a whole file and return it tagged as UTF-8; returns "" for an
# empty file.  The block form of File.open guarantees the handle is
# closed (the original leaked it), and the empty-file case previously
# returned nil because `"" if file.nil?` discarded its value.
def get_file(name)
  content = File.open(name) { |f| f.gets(nil) }
  content.nil? ? "" : content.force_encoding("utf-8")
end
|
34
|
+
|
35
|
+
|
36
|
+
program_desc 'running basic NLP tasks'

version NlpToolz::VERSION

# All four subcommands share one shape: an optional -f/--file flag,
# input taken from the flag or the first positional argument, one
# NlpToolz entry point, result printed via get_out.
commands = {
  sent:  ['sentence detection',  :get_sentences],
  parse: ['parsing text',        :parse_text],
  tag:   ['pos tagging of text', :tag_text],
  token: ['tokenizing text',     :tokenize_text]
}

commands.each do |name, (description, entry_point)|
  desc description
  arg_name "Describe arguments to #{name} here"
  command name do |c|
    c.desc 'file input'
    c.arg_name '<path/to/file>'
    c.flag [:f, :file]
    c.action do |_global_options, options, args|
      input = get_in(options[:f] || args.first)
      get_out NlpToolz.public_send(entry_point, input)
    end
  end
end

# returning true lets GLI handle/report the error itself
on_error do |_exception|
  true
end

exit run(ARGV)
|
@@ -0,0 +1,36 @@
|
|
1
|
+
module Lang

  include UrlHandler

  # Detect the language of the input by POSTing it to the ASV
  # language-identification web service and reading the "lang" field of
  # the JSON response.  Uses @input (set by the including class) when
  # present, otherwise the +text+ argument.
  def get_language(text = nil)
    environment = ENV['ENV_NAME'] || 'development'
    # ToDo 2013-03-14: respect environment
    case environment
    when 'development'
      # development -> local
      # uri = build_url("localhost", 9292, "/langid", nil)
      uri = build_url("arielle.tm.informatik.uni-leipzig.de", 55700, "/langid", nil)
    when 'production'
      # production
      uri = build_url("arielle.tm.informatik.uni-leipzig.de", 55700, "/langid", nil)
    end

    # URI.escape was removed in Ruby 3.0; URI::DEFAULT_PARSER.escape is
    # the compatible replacement (same RFC 2396 escaping).
    payload = @input || text
    if payload
      asv_response = post_data(URI::DEFAULT_PARSER.escape(payload), uri,
                               {'Content-type' => 'text/plain;charset=utf-8'})
    end
    response = MultiJson.load(asv_response.body)

    response["lang"]
  end

  # ToDo 2013-02-26: make different lang identifier available
  # NOTE(review): the result of this method appears unused elsewhere in
  # the gem; it returns [[:en, :eng, :english], [:de, :ger, :german]].
  def alternative_langs lang
    langs = {
      en: [:eng, :english],
      de: [:ger, :german]
    }.each.collect{|x| x.flatten}
  end

end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# coding: utf-8

class String
  # ToDo: check abbr against list of ..
  # Normalize text for the NLP tools: strip quotation marks and pad
  # braces/punctuation with spaces so they tokenize as separate words.
  # Invalid/undefined bytes are replaced during the UTF-8 re-encode.
  def clean_up
    encode('UTF-8', :invalid => :replace, :undef => :replace)
      .gsub(/[\p{Pi}\p{Pf}"'„“‘’“”«»‹›]/,'') # quotation marks
      .gsub(/\b\/\b/,' ')
      .gsub(/(\p{Ps})(.)/,'\1 \2')           # left braces
      .gsub(/(.)(\p{Pe})/,'\1 \2')           # right braces
      .gsub(/([\w]{3,})([\.])/,'\1 \2')      # abbreviation?
      .gsub(/(.)([,;:!?]+)/,'\1 \2')         # punctuation
  end

  # Last path component, e.g. "a/b/c.rb".basename => "c.rb".
  def basename
    split("/").last
  end
end
|
20
|
+
|
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'tempfile'

# Small Tempfile helpers used to feed text through external commands
# (e.g. the Berkeley Parser jar).
module TmpFile
  module_function

  # Create a Tempfile, optionally pre-filled with +text+, rewound so the
  # caller can read it from the start.
  def make_tmp_file_from text = nil
    file = ::Tempfile.new('tmp.txt')
    file.write(text) unless text.nil?
    file.rewind
    file
  end

  # Close the handle and remove the file from disk.
  def delete_and_unlink_tmp_file tmp_file
    tmp_file.close
    tmp_file.unlink
  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'uri'
require 'net/http'

# Mixin with small HTTP helpers, used by Lang to talk to the remote
# language-identification service.
module UrlHandler
  module ClassMethods
  end

  # instance methods

  # Build an http:// URI from its parts; +port+ and +query+ may be nil.
  def build_url(host, port, path, query)
    parts = { :host => host, :path => path, :query => query }
    parts[:port] = port unless port.nil?
    URI::HTTP.build(parts)
  end


  # POST +content+ to +uri+ and return the raw Net::HTTP response.
  # NOTE(review): force_encoding mutates the caller's string in place.
  def post_data(content, uri, content_type)
    request = Net::HTTP::Post.new(uri.request_uri, content_type)
    request.body = content.force_encoding("utf-8")
    Net::HTTP.start(uri.host, uri.port) { |http| http.request(request) }
  end

  def self.included(receiver)
    receiver.extend ClassMethods
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Resolve the jar/model directories relative to this file and boot the
# JVM (via Rjb) with the OpenNLP class path.  Must run before any
# Rjb::import call in the tool classes.
module NlpToolz
  MODELS = File.join(File.dirname(__FILE__), '..', '..', "models")
  JARS = File.join(File.dirname(__FILE__), '..', '..', "jars")

  CLASS_PATH = [
    "jwnl-1.3.3.jar",
    "opennlp-tools-1.5.3.jar",
    "opennlp-maxent-3.0.3.jar"
  ].map { |jar| File.join(JARS, jar) }.join(":")

  Rjb::load(CLASS_PATH,['-Xmx4096m','-Djava.awt.headless=true'])
  # alternative GC settings (see benchmarks below):
  # Rjb::load(CLASS_PATH,['-Xmx4096m','-XX:+UseParallelGC','-XX:+UseParallelOldGC','-Djava.awt.headless=true'])
  # Rjb::load(CLASS_PATH,['-Xmx4096m','-XX:+UseConcMarkSweepGC','-Djava.awt.headless=true'])
  # Rjb::load(CLASS_PATH,['-Xmx4096m','-XX:+UseSerialGC','-Djava.awt.headless=true'])
end

# simple example benchmarks, pos tagging 862 phrases:
# /wo extra options -> 656s
# /w ParallelGC -> 657s
# /w ConcMarkSweepGC -> 659s
# /w SerialGC -> 668s
# see: [Java GC tuning](http://www.oracle.com/technetwork/java/javase/gc-tuning-6-140523.html)
|
@@ -0,0 +1,146 @@
|
|
1
|
+
# coding: utf-8
# @author: LeFnord
# @email: pscholz.le@gmail.com
# @date: 2012-12-10

module NlpToolz

  # Constituency parsing via the Berkeley Parser: shells out to the jar
  # with tempfiles for input/output, then converts the bracketed parse
  # tree into a nested Hash.
  class Parser

    include Lang
    include TmpFile

    # load java classes
    FileInputStream = Rjb::import('java.io.FileInputStream')

    attr_reader :parsed
    attr_accessor :input, :lang, :model, :model_name, :parse_hash

    # +lang+ falls back to remote language detection (Lang#get_language).
    def initialize(input, lang = nil)
      @input = input
      @lang = lang || get_language
      @model_name = "#{@lang}-sm5.gr"
      get_model
    end

    # Run the Berkeley Parser over @input and build @parse_hash.
    # Does nothing (returns nil) when no grammar model is installed.
    def parse_text
      parsed = nil
      if self.has_model?
        jar = "#{JARS}/BerkeleyParser-1.7.jar"
        in_file = make_tmp_file_from @input.clean_up
        out_file = make_tmp_file_from
        `java -Xmx4g -jar #{jar} -gr #{@model} -inputFile #{in_file.path} -outputFile #{out_file.path} -tokenize -maxLength 500`.chomp
        # block form closes the handle (the original leaked the File object)
        @parsed = File.open(out_file) { |f| f.gets(nil) }.chomp

        parse_output_to_hash

        delete_and_unlink_tmp_file in_file
        delete_and_unlink_tmp_file out_file
      end
    end

    def has_model?
      @model
    end

    # NOTE(review): +level+ is currently ignored; always returns the
    # first layer (tags/tokens) collected by create_leafs.
    def layer(level = nil)
      @first_layer
    end

    # NOTE(review): shadows Object#hash, which Ruby uses for Hash keys —
    # Parser instances should not be used as hash keys.
    def hash
      @parse_hash
    end

    private

    # helper for ...
    # initialize
    # Resolve the grammar file for @lang; @model stays false when the
    # file is missing (File.exists? is removed in Ruby 3.2).
    def get_model
      model_file = "#{MODELS}/parser/#{@model_name}"
      if File.exist?(model_file)
        @model = model_file
      else
        @model = false
      end
    end

    # convert: #tree -> #hash
    def parse_output_to_hash
      parsed = split_parse_tree(self.parsed)
      nodes = create_leafs(parsed)
      @parse_hash = make_hash_hash(nodes)

      @parse_hash
    end

    # helper for parsing to hash
    ::Leaf = Struct.new(:tag, :token)
    ::Node = Struct.new(:tag, :parent, :childs)

    # 1. split: pad closing parens apart, swap () for {} (which cannot
    # occur as tokens), then split on whitespace.
    def split_parse_tree(parsed)
      bar = parsed.gsub("))", ") )").gsub("))", ") )")
                  .gsub("(", "{")
                  .gsub(")", "}")

      bar.split
    end

    # 2. merge tags and tokens, create leafs
    # Walks the token stream pairing "{TAG" with the following "token}"
    # into Leaf structs; non-terminal markers are kept as strings so
    # make_hash_hash can fold them later.
    def create_leafs(parsed)
      @first_layer = {tags: [],tokens: []}
      leafs = {}
      foo = []
      parsed.each_with_index do |part,i|
        if part =~ /\{([\w\-]+|\$\p{P}|\p{P})/ && parsed[i+1] =~ /([\p{L}\p{N}\-\.]+|\p{P})\}/
          tag = part.gsub("{","")
          token = parsed[i+1].gsub("}","")
          @first_layer[:tags] << tag
          @first_layer[:tokens] << token

          leaf = Leaf.new(tag.to_sym,token)

          # group consecutive leaves under the same parent into an Array
          if foo[foo.length-1].is_a?(Hash)
            foo[foo.length-1] = [foo[foo.length-1], leaf]
          elsif foo[foo.length-1].is_a?(Array)
            foo[foo.length-1] << leaf
          else
            foo << leaf
          end
        elsif part !~ /([\p{L}\p{N}\-]+|\p{P})\}/
          if part =~ /(\{)(.+)/
            foo << "{#{part.gsub("{","")}"
          else
            foo << "#{part}"
          end
        end
      end

      foo
    end

    # Repeatedly folds the innermost "{TAG ... }" span into {TAG => [...]}
    # until only the root remains; recursion terminates when at most 3
    # elements are left and tmp[1] is the completed hash.
    def make_hash_hash(nodes)
      tmp = catch(:done) {
        nodes.reverse.each_with_index do |node,i|
          if node =~ /\{(\w+)/
            key = node.match(/\{(\w+)/)[1].to_sym
            part = []
            nodes[-i-1..-1].each_with_index do |x,ii|
              if x == "}"
                part = {key => nodes[-i..-i+ii-2]}
                throw :done, [nodes[0..-i-2],part,nodes[-i+ii..-1]].flatten
              end
            end
          end
        end
      }
      if tmp.length > 3
        make_hash_hash(tmp)
      else
        tmp[1]
      end
    end

  end # class Parser

end # module NlpToolz
|
@@ -0,0 +1,77 @@
|
|
1
|
+
# coding: utf-8
# @author: LeFnord
# @email: pscholz.le@gmail.com
# @date: 2012-10-24

# ToDo 2012-10-24: add train capabilities
module NlpToolz

  # Part-of-speech tagging via the OpenNLP maxent tagger (through Rjb).
  class PosTags

    include Lang

    # load java classes
    FileInputStream = Rjb::import('java.io.FileInputStream')
    POSModel = Rjb::import('opennlp.tools.postag.POSModel')
    POSTaggerME = Rjb::import('opennlp.tools.postag.POSTaggerME')

    attr_accessor :input, :lang, :model, :model_name, :tokenized

    # +lang+ falls back to remote language detection (Lang#get_language).
    def initialize(input, lang = nil)
      @input = input
      @lang = lang || get_language
      @model_name = "#{@lang}-pos-maxent.bin"
      get_model
    end

    # Tag @input and split the "token/TAG" stream into @tokenized
    # ({tokens: [...], tags: [...]}).  No-op without a model.
    def get_pos_tags
      if self.has_model?
        @tokenized = tokenize_it @tagger.tag(@input.clean_up)
      end
    end

    def tokens
      @tokenized[:tokens]
    end

    def tags
      @tokenized[:tags]
    end

    def has_model?
      @model
    end

    private

    # Load the tagger model for @lang; @model stays false when the file
    # is missing (File.exists? is removed in Ruby 3.2).
    def get_model
      model_file = "#{MODELS}/pos/#{@model_name}"
      if File.exist?(model_file)
        @model = POSModel.new(FileInputStream.new(model_file))
        @tagger = POSTaggerME.new(@model)
      else
        @model = false
      end
    end

    # ToDo 2012-11-28: only a workaround upto the opennlp tokenizer is implemented
    # Splits "token/TAG" pairs; tokens containing extra "/" get the last
    # segment as tag for each preceding segment.  NOTE(review): a token
    # with no "/" at all is silently dropped.
    def tokenize_it stream
      foo = {tokens: [], tags: []}
      stream.split.each do |token|
        splitter = token.split("/")
        if splitter.length == 2
          foo[:tokens] << splitter.first
          foo[:tags] << splitter.last
        else
          splitter[0..-2].each do |splits|
            foo[:tokens] << splits
            foo[:tags] << splitter.last
          end
        end
      end
      foo
    end

  end # class PosTags

end # module NlpToolz
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# coding: utf-8
# @author: LeFnord
# @email: pscholz.le@gmail.com
# @date: 2012-10-23

# ToDo 2012-10-24: add train capabilities
module NlpToolz

  # Sentence boundary detection via the OpenNLP maxent sentence
  # detector (through Rjb).
  class Sentences

    include Lang

    # load java classes
    FileInputStream = Rjb::import('java.io.FileInputStream')
    SentenceDetectorME = Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')
    SentenceModel = Rjb::import('opennlp.tools.sentdetect.SentenceModel')

    attr_accessor :input, :lang, :model, :model_name, :sentences

    # +lang+ falls back to remote language detection (Lang#get_language).
    def initialize(input,lang = nil)
      @input = input
      @lang = lang || get_language
      @model_name = "#{@lang}-sent.bin"
      get_model
    end

    # Split @input into an Array of sentence strings.
    def split_into_sentences
      @sentences = @sentence_detector.sentDetect(@input).to_a
    end

    def has_model?
      @model
    end

    private

    # Load the sentence model for @lang; @model stays false when the
    # file is missing (File.exists? is removed in Ruby 3.2).
    def get_model
      model_file = "#{MODELS}/sent/#{@model_name}"
      if File.exist?(model_file)
        @model = SentenceModel.new(FileInputStream.new(model_file))
        @sentence_detector = SentenceDetectorME.new(@model)
      else
        @model = false
        # raise 'file not found'
      end
    end

  end # class Sentences

end # module NlpToolz
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# coding: utf-8
# @author: LeFnord
# @email: pscholz.le@gmail.com
# @date: 2012-11-30

module NlpToolz

  # Tokenization via the OpenNLP maxent tokenizer (through Rjb).
  class Tokens

    include Lang

    # load java classes
    FileInputStream = Rjb::import('java.io.FileInputStream')
    TokenizerModel = Rjb::import('opennlp.tools.tokenize.TokenizerModel')
    TokenizerME = Rjb::import('opennlp.tools.tokenize.TokenizerME')

    attr_accessor :input, :lang, :model, :model_name, :tokens

    # +lang+ falls back to remote language detection (Lang#get_language).
    def initialize(input, lang = nil)
      @input = input
      @lang = lang || get_language
      @model_name = "#{@lang}-token.bin"
      get_model
    end

    # Tokenize @input with the loaded model.
    def tokenize
      @tokens = @tokenizer.tokenize(@input)
    end

    def has_model?
      @model
    end

    private

    # Load the tokenizer model for @lang; @model stays false when the
    # file is missing (File.exists? is removed in Ruby 3.2).
    def get_model
      model_file = "#{MODELS}/token/#{@model_name}"
      if File.exist?(model_file)
        @model = TokenizerModel.new(FileInputStream.new(model_file))
        @tokenizer = TokenizerME.new(@model)
      else
        @model = false
      end
    end

  end # Class Tokens

end # module NlpToolz
|