nlp_toolz 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +28 -0
- data/.rspec +2 -0
- data/Gemfile +6 -0
- data/Guardfile +13 -0
- data/LICENSE.txt +22 -0
- data/README.md +37 -0
- data/Rakefile +15 -0
- data/bin/nlp_toolz +92 -0
- data/lib/nlp_toolz/helpers/lang.rb +36 -0
- data/lib/nlp_toolz/helpers/string_extended.rb +20 -0
- data/lib/nlp_toolz/helpers/tmp_file.rb +18 -0
- data/lib/nlp_toolz/helpers/url_handler.rb +26 -0
- data/lib/nlp_toolz/load_jars.rb +22 -0
- data/lib/nlp_toolz/parser.rb +146 -0
- data/lib/nlp_toolz/pos_tags.rb +77 -0
- data/lib/nlp_toolz/sentences.rb +50 -0
- data/lib/nlp_toolz/tokens.rb +48 -0
- data/lib/nlp_toolz/version.rb +8 -0
- data/lib/nlp_toolz.rb +84 -0
- data/nlp_toolz.gemspec +42 -0
- data/spec/helpers/string_extended_spec.rb +17 -0
- data/spec/lib/nlp_toolz/parser_spec.rb +67 -0
- data/spec/lib/nlp_toolz/pos_tags_spec.rb +67 -0
- data/spec/lib/nlp_toolz/sentences_spec.rb +60 -0
- data/spec/lib/nlp_toolz/tokens_spec.rb +62 -0
- data/spec/lib/nlp_toolz_spec.rb +69 -0
- data/spec/spec_helper.rb +16 -0
- metadata +262 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 71916455cffe07c8464fb8cc1543d7b8a2ea7205
|
4
|
+
data.tar.gz: bc30072b7d62770c3e202e0545137056fe5a6164
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 997d3fc4fb5d9c18546e1ea4c5c8acd19e61ef6979ece0d27cff540cea99c2ecae094fba16a4c3aa25dc05f1fe9282498c228a898b68b4271e493027663e0ba3
|
7
|
+
data.tar.gz: 42d5ea917f3febe6484a80ab085f0b41515540f841edc2de4b219d06456d7d331a750fb306095336918b4c82f4cd184d1dc6099cd4ff0fd51e2cb487adab9944
|
data/.gitignore
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
spec/reports
|
15
|
+
test/tmp
|
16
|
+
test/version_tmp
|
17
|
+
tmp
|
18
|
+
|
19
|
+
.rvmrc
|
20
|
+
|
21
|
+
ToDo.task
|
22
|
+
|
23
|
+
teste.rb
|
24
|
+
|
25
|
+
.DS_Store
|
26
|
+
test-data/
|
27
|
+
jars/*
|
28
|
+
models/*
|
data/.rspec
ADDED
data/Gemfile
ADDED
data/Guardfile
ADDED
@@ -0,0 +1,13 @@
# A sample Guardfile
# More info at https://github.com/guard/guard#readme

# Re-install the bundle whenever gem metadata changes.
guard :bundler do
  watch('Gemfile')
  watch(/^.+\.gemspec/)
end

# Re-run the matching spec when lib or spec files change.
guard :rspec do
  watch(%r{^spec/.+_spec\.rb$})
  watch(%r{^lib/(.+)\.rb$}) { |match| "spec/lib/#{match[1]}_spec.rb" }
  watch('spec/spec_helper.rb') { "spec" }
end
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 LeFnord
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# NlpToolz
|
2
|
+
|
3
|
+
Basic NLP tools, mostly based on [OpenNLP](http://opennlp.apache.org), at this time `sentence finder`, `tokenizer` and `POS tagger` implemented, plus [Berkeley Parser](http://code.google.com/p/berkeleyparser/).
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'nlp_toolz'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install nlp_toolz
|
18
|
+
|
19
|
+
Download the jars and model files from [Dropbox](https://www.dropbox.com/sh/1layyjgf5h0wwi3/s2SHAnfVhs) and unzip them into the gem folder.
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
see: [nlp_toolz.rb](https://github.com/LeFnord/nlp_toolz/blob/master/lib/nlp_toolz.rb) and specs for usage
|
24
|
+
|
25
|
+
## Contributing
|
26
|
+
|
27
|
+
1. Fork it
|
28
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
29
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
30
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
31
|
+
5. Create new Pull Request
|
32
|
+
|
33
|
+
## Comments
|
34
|
+
|
35
|
+
- removed Celluloid; do concurrency in your app, where it is used
|
36
|
+
- check `load_jars` for JVM parameters
|
37
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
require "awesome_print"
|
4
|
+
require 'rspec/core'
|
5
|
+
require 'rspec/core/rake_task'
|
6
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
7
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
8
|
+
end
|
9
|
+
|
10
|
+
task :default => :spec
|
11
|
+
|
12
|
+
require 'yard'
|
13
|
+
YARD::Rake::YardocTask.new
|
14
|
+
|
15
|
+
Dir["lib/tasks/**/*.rake"].sort.each { |ext| load ext }
|
data/bin/nlp_toolz
ADDED
@@ -0,0 +1,92 @@
#!/usr/bin/env ruby
# CLI entry point for nlp_toolz: exposes sentence detection, parsing,
# POS tagging and tokenization as GLI sub-commands.

require 'gli'
begin # XXX: Remove this begin/rescue before distributing your app
  require 'nlp_toolz'
rescue LoadError
  STDERR.puts "In development, you need to use `bundle exec bin/nlp_toolz` to run your app"
  STDERR.puts "At install-time, RubyGems will make sure lib, etc. are in the load path"
  STDERR.puts "Feel free to remove this message from bin/NlpToolz now"
  exit 64
end

include GLI::App

# helper methods

# Pretty-print when attached to a terminal, plain-print when piped.
def get_out(this)
  ap this if $stdout.tty?
  $stdout.puts this unless $stdout.tty?
end

# Resolve the command-line argument: a readable file path yields the file
# contents, anything else is treated as literal input text.
def get_in(input_arg)
  # File.exist? — File.exists? was removed in Ruby 3.2
  if File.exist?(input_arg) && !File.directory?(input_arg)
    return get_file(input_arg)
  else
    return input_arg
  end
end

# Read a whole file as UTF-8 text; returns "" for an empty file.
def get_file(name)
  # Block form closes the handle (the original leaked the File object), and
  # the original returned nil — not "" — for empty files, because
  # `"" if file.nil?` was not the method's last expression.
  file = File.open(name) { |f| f.gets(nil) }
  return "" if file.nil?
  file.force_encoding("utf-8")
end


program_desc 'running basic NLP tasks'

version NlpToolz::VERSION

desc 'sentence detection'
arg_name 'Describe arguments to sent here'
command :sent do |c|
  c.desc 'file input'
  c.arg_name '<path/to/file>'
  c.flag [:f,:file]
  c.action do |global_options,options,args|
    input = get_in(options[:f] || args.first)
    get_out NlpToolz.get_sentences(input)
  end
end

desc 'parsing text'
arg_name 'Describe arguments to parse here'
command :parse do |c|
  c.desc 'file input'
  c.arg_name '<path/to/file>'
  c.flag [:f,:file]
  c.action do |global_options,options,args|
    input = get_in(options[:f] || args.first)
    get_out NlpToolz.parse_text(input)
  end
end

desc 'pos tagging of text'
arg_name 'Describe arguments to tag here'
command :tag do |c|
  c.desc 'file input'
  c.arg_name '<path/to/file>'
  c.flag [:f,:file]
  c.action do |global_options,options,args|
    input = get_in(options[:f] || args.first)
    get_out NlpToolz.tag_text(input)
  end
end

desc 'tokenizing text'
arg_name 'Describe arguments to token here'
command :token do |c|
  c.desc 'file input'
  c.arg_name '<path/to/file>'
  c.flag [:f,:file]
  c.action do |global_options,options,args|
    input = get_in(options[:f] || args.first)
    get_out NlpToolz.tokenize_text(input)
  end
end

on_error do |exception|
  true
end

exit run(ARGV)
@@ -0,0 +1,36 @@
module Lang

  include UrlHandler

  # Detect the language of a text by POSTing it to a remote langid service
  # and reading the "lang" key of the JSON response.
  # Uses @input from the including class when set, otherwise +text+.
  def get_language(text = nil)
    environment = ENV['ENV_NAME'] || 'development'
    # ToDo 2013-03-14: respect environment
    case environment
    when 'development'
      # development -> local
      # uri = build_url("localhost", 9292, "/langid", nil)
      uri = build_url("arielle.tm.informatik.uni-leipzig.de", 55700, "/langid", nil)
    when 'production'
      # production
      uri = build_url("arielle.tm.informatik.uni-leipzig.de", 55700, "/langid", nil)
    end

    # URI::DEFAULT_PARSER.escape replaces URI.escape, which was deprecated and
    # removed in Ruby 3.0; escaping behavior is the same.
    if @input
      asv_response = post_data(URI::DEFAULT_PARSER.escape(@input),uri,{'Content-type'=>'text/plain;charset=utf-8'})
    elsif text
      asv_response = post_data(URI::DEFAULT_PARSER.escape(text),uri,{'Content-type'=>'text/plain;charset=utf-8'})
    end
    response = MultiJson.load(asv_response.body)

    response["lang"]
  end

  # ToDo 2013-02-26: make different lang identifier available
  # NOTE(review): returns the flattened alias pairs; the +lang+ argument is
  # currently unused — confirm intended behavior before relying on it.
  def alternative_langs lang
    langs = {
      en: [:eng, :english],
      de: [:ger, :german]
    }.each.collect{|x| x.flatten}
  end

end
@@ -0,0 +1,20 @@
# coding: utf-8

# Core String extensions used by nlp_toolz.
class String
  # ToDo: check abbr against list of ..
  # Normalize a string for the tokenizer: force valid UTF-8, strip quotation
  # marks, and pad slashes, braces, word-final periods and punctuation with
  # spaces.
  def clean_up
    normalized     = encode('UTF-8', :invalid => :replace, :undef => :replace)
    without_quotes = normalized.gsub(/[\p{Pi}\p{Pf}"'„“‘’“”«»‹›]/,'')  # quotation marks
    spaced_slashes = without_quotes.gsub(/\b\/\b/,' ')
    spaced_open    = spaced_slashes.gsub(/(\p{Ps})(.)/,'\1 \2')        # left braces
    spaced_close   = spaced_open.gsub(/(.)(\p{Pe})/,'\1 \2')           # right braces
    spaced_abbrev  = spaced_close.gsub(/([\w]{3,})([\.])/,'\1 \2')     # abbreviation?
    spaced_abbrev.gsub(/(.)([,;:!?]+)/,'\1 \2')                        # punctuation
  end

  # Final path segment, e.g. "a/b/c" -> "c".
  def basename
    split("/").last
  end
end
@@ -0,0 +1,18 @@
require 'tempfile'

# Small helpers around Tempfile for feeding text to external tools.
module TmpFile
  module_function

  # Create a Tempfile, optionally pre-filled with +text+, rewound so it can
  # be read immediately.
  def make_tmp_file_from text = nil
    file = ::Tempfile.new('tmp.txt')
    unless text.nil?
      file.write text
    end
    file.rewind
    file
  end

  # Close the Tempfile and remove it from disk.
  def delete_and_unlink_tmp_file tmp_file
    tmp_file.close
    tmp_file.unlink
  end

end
@@ -0,0 +1,26 @@
require 'uri'
require 'net/http'

# Mixin with small HTTP helpers (URL construction, plain POST requests).
module UrlHandler
  module ClassMethods
  end

  # instance methods

  # Build an http:// URI from its parts; +port+ and +query+ may be nil.
  def build_url(host, port, path, query)
    parts = {:host => host, :path => path, :query => query}
    parts[:port] = port unless port.nil?
    URI::HTTP.build(parts)
  end


  # POST +content+ (forced to UTF-8) to +uri+ with the given header hash and
  # return the raw Net::HTTP response object.
  def post_data(content,uri,content_type)
    request = Net::HTTP::Post.new(uri.request_uri,content_type)
    request.body = content.force_encoding("utf-8")
    Net::HTTP.start(uri.host,uri.port) {|http| http.request(request)}
  end

  # Hook: hand including classes the (currently empty) ClassMethods.
  def self.included(receiver)
    receiver.extend ClassMethods
  end
end
@@ -0,0 +1,22 @@
# Resolve model/jar locations relative to this file and boot the JVM via Rjb.
module NlpToolz
  MODELS = File.join(File.dirname(__FILE__), '..', '..', "models")
  JARS = File.join(File.dirname(__FILE__), '..', '..', "jars")

  # Classpath for the OpenNLP stack.
  CLASS_PATH = [
    "jwnl-1.3.3.jar",
    "opennlp-tools-1.5.3.jar",
    "opennlp-maxent-3.0.3.jar"
  ].map { |jar| File.join(JARS, jar) }.join(":")

  Rjb::load(CLASS_PATH,['-Xmx4096m','-Djava.awt.headless=true'])
  # Rjb::load(CLASS_PATH,['-Xmx4096m','-XX:+UseParallelGC','-XX:+UseParallelOldGC','-Djava.awt.headless=true'])
  # Rjb::load(CLASS_PATH,['-Xmx4096m','-XX:+UseConcMarkSweepGC','-Djava.awt.headless=true'])
  # Rjb::load(CLASS_PATH,['-Xmx4096m','-XX:+UseSerialGC','-Djava.awt.headless=true'])
end

# simple example benchmarks, pos tagging 862 phrases:
# /wo extra options   -> 656s
# /w ParallelGC       -> 657s
# /w ConcMarkSweepGC  -> 659s
# /w SerialGC         -> 668s
# see: [Java GC tuning](http://www.oracle.com/technetwork/java/javase/gc-tuning-6-140523.html)
@@ -0,0 +1,146 @@
# coding: utf-8
# @author: LeFnord
# @email: pscholz.le@gmail.com
# @date: 2012-12-10

module NlpToolz

  # Constituency parsing via the external Berkeley Parser jar; the bracketed
  # parse tree is additionally converted into a nested Ruby Hash.
  class Parser

    include Lang
    include TmpFile

    # load java classes
    FileInputStream = Rjb::import('java.io.FileInputStream')

    attr_reader :parsed
    attr_accessor :input, :lang, :model, :model_name, :parse_hash

    # input - text to parse; lang - ISO code, auto-detected when nil.
    def initialize(input, lang = nil)
      @input = input
      @lang = lang || get_language
      @model_name = "#{@lang}-sm5.gr"
      get_model
    end

    # Shell out to the Berkeley Parser via temp files; sets @parsed (raw
    # bracketed tree) and @parse_hash. No-op when no grammar model exists.
    def parse_text
      parsed = nil
      if self.has_model?
        jar = "#{JARS}/BerkeleyParser-1.7.jar"
        in_file = make_tmp_file_from @input.clean_up
        out_file = make_tmp_file_from
        `java -Xmx4g -jar #{jar} -gr #{@model} -inputFile #{in_file.path} -outputFile #{out_file.path} -tokenize -maxLength 500`.chomp
        @parsed = File.open(out_file).gets(nil).chomp

        parse_output_to_hash

        delete_and_unlink_tmp_file in_file
        delete_and_unlink_tmp_file out_file
      end
    end

    def has_model?
      @model
    end

    # First tag/token layer of the parse (the +level+ argument is unused).
    def layer(level = nil)
      @first_layer
    end

    # NOTE(review): shadows Object#hash with a non-Integer result — confirm
    # no caller uses Parser instances as Hash keys.
    def hash
      @parse_hash
    end

    private

    # helper for ...
    # initialize
    # Sets @model to the grammar file path, or false when it is missing.
    def get_model
      model_file = "#{MODELS}/parser/#{@model_name}"
      # File.exist? — File.exists? was removed in Ruby 3.2
      if File.exist?(model_file)
        @model = model_file
      else
        @model = false
      end
    end

    # convert: #tree -> #hash
    def parse_output_to_hash
      parsed = split_parse_tree(self.parsed)
      nodes = create_leafs(parsed)
      @parse_hash = make_hash_hash(nodes)

      @parse_hash
    end

    # helper for parsing to hash
    ::Leaf = Struct.new(:tag, :token)
    ::Node = Struct.new(:tag, :parent, :childs)

    # 1. split the bracketed tree into tokens, rewriting ()/{} for easier matching
    def split_parse_tree(parsed)
      bar = parsed.gsub("))", ") )").gsub("))", ") )")
        .gsub("(", "{")
        .gsub(")", "}")

      bar.split
    end

    # 2. merge tags and tokens, create leafs
    def create_leafs(parsed)
      @first_layer = {tags: [],tokens: []}
      leafs = {}
      foo = []
      parsed.each_with_index do |part,i|
        # "{TAG" followed by "token}" forms a leaf
        if part =~ /\{([\w\-]+|\$\p{P}|\p{P})/ && parsed[i+1] =~ /([\p{L}\p{N}\-\.]+|\p{P})\}/
          tag = part.gsub("{","")
          token = parsed[i+1].gsub("}","")
          @first_layer[:tags] << tag
          @first_layer[:tokens] << token

          leaf = Leaf.new(tag.to_sym,token)

          # group consecutive leafs into arrays
          if foo[foo.length-1].is_a?(Hash)
            foo[foo.length-1] = [foo[foo.length-1], leaf]
          elsif foo[foo.length-1].is_a?(Array)
            foo[foo.length-1] << leaf
          else
            foo << leaf
          end
        elsif part !~ /([\p{L}\p{N}\-]+|\p{P})\}/
          # structural token (opening tag or bare brace)
          if part =~ /(\{)(.+)/
            foo << "{#{part.gsub("{","")}"
          else
            foo << "#{part}"
          end
        end
      end

      foo
    end

    # 3. recursively collapse the flat node list into a nested Hash
    def make_hash_hash(nodes)
      tmp = catch(:done) {
        nodes.reverse.each_with_index do |node,i|
          if node =~ /\{(\w+)/
            key = node.match(/\{(\w+)/)[1].to_sym
            part = []
            nodes[-i-1..-1].each_with_index do |x,ii|
              if x == "}"
                part = {key => nodes[-i..-i+ii-2]}
                throw :done, [nodes[0..-i-2],part,nodes[-i+ii..-1]].flatten
              end
            end
          end
        end
      }
      if tmp.length > 3
        make_hash_hash(tmp)
      else
        tmp[1]
      end
    end

  end # class Parser

end # module NlpToolz
@@ -0,0 +1,77 @@
# coding: utf-8
# @author: LeFnord
# @email: pscholz.le@gmail.com
# @date: 2012-10-24

# ToDo 2012-10-24: add train capabilities
module NlpToolz

  # Part-of-speech tagging backed by the OpenNLP maxent tagger (via Rjb).
  class PosTags

    include Lang

    # load java classes
    FileInputStream = Rjb::import('java.io.FileInputStream')
    POSModel = Rjb::import('opennlp.tools.postag.POSModel')
    POSTaggerME = Rjb::import('opennlp.tools.postag.POSTaggerME')

    attr_accessor :input, :lang, :model, :model_name, :tokenized

    # input - text to tag; lang - ISO code, auto-detected when nil.
    def initialize(input, lang = nil)
      @input = input
      @lang = lang || get_language
      @model_name = "#{@lang}-pos-maxent.bin"
      get_model
    end

    # Tag the input and split the "token/TAG" stream into a
    # {tokens: [...], tags: [...]} hash; nil when no model is available.
    def get_pos_tags
      if self.has_model?
        @tokenized = tokenize_it @tagger.tag(@input.clean_up)
      end
    end

    def tokens
      @tokenized[:tokens]
    end

    def tags
      @tokenized[:tags]
    end

    def has_model?
      @model
    end

    private

    # Load the language model; @model becomes false when the file is missing.
    def get_model
      model_file = "#{MODELS}/pos/#{@model_name}"
      # File.exist? — File.exists? was removed in Ruby 3.2
      if File.exist?(model_file)
        @model = POSModel.new(FileInputStream.new(model_file))
        @tagger = POSTaggerME.new(@model)
      else
        @model = false
      end
    end

    # ToDo 2012-11-28: only a workaround upto the opennlp tokenizer is implemented
    def tokenize_it stream
      foo = {tokens: [], tags: []}
      stream.split.each do |token|
        splitter = token.split("/")
        if splitter.length == 2
          foo[:tokens] << splitter.first
          foo[:tags] << splitter.last
        else
          # tokens containing "/" themselves: keep the last segment as the tag
          splitter[0..-2].each do |splits|
            foo[:tokens] << splits
            foo[:tags] << splitter.last
          end
        end
      end
      foo
    end

  end # class PosTags

end # module NlpToolz
@@ -0,0 +1,50 @@
# coding: utf-8
# @author: LeFnord
# @email: pscholz.le@gmail.com
# @date: 2012-10-23

# ToDo 2012-10-24: add train capabilities
module NlpToolz

  # Sentence boundary detection backed by OpenNLP's SentenceDetectorME (via Rjb).
  class Sentences

    include Lang

    # load java classes
    FileInputStream = Rjb::import('java.io.FileInputStream')
    SentenceDetectorME = Rjb::import('opennlp.tools.sentdetect.SentenceDetectorME')
    SentenceModel = Rjb::import('opennlp.tools.sentdetect.SentenceModel')

    attr_accessor :input, :lang, :model, :model_name, :sentences

    # input - text to split; lang - ISO code, auto-detected when nil.
    def initialize(input,lang = nil)
      @input = input
      @lang = lang || get_language
      @model_name = "#{@lang}-sent.bin"
      get_model
    end

    # Run the detector and store the result as a Ruby Array of sentences.
    def split_into_sentences
      @sentences = @sentence_detector.sentDetect(@input).to_a
    end

    def has_model?
      @model
    end

    private

    # Load the language model; @model becomes false when the file is missing.
    def get_model
      model_file = "#{MODELS}/sent/#{@model_name}"
      # File.exist? — File.exists? was removed in Ruby 3.2
      if File.exist?(model_file)
        @model = SentenceModel.new(FileInputStream.new(model_file))
        @sentence_detector = SentenceDetectorME.new(@model)
      else
        @model = false
        # raise 'file not found'
      end
    end

  end # class Sentences

end # module NlpToolz
@@ -0,0 +1,48 @@
# coding: utf-8
# @author: LeFnord
# @email: pscholz.le@gmail.com
# @date: 2012-11-30

module NlpToolz

  # Tokenization backed by OpenNLP's TokenizerME (via Rjb).
  class Tokens

    include Lang

    # load java classes
    FileInputStream = Rjb::import('java.io.FileInputStream')
    TokenizerModel = Rjb::import('opennlp.tools.tokenize.TokenizerModel')
    TokenizerME = Rjb::import('opennlp.tools.tokenize.TokenizerME')

    attr_accessor :input, :lang, :model, :model_name, :tokens

    # input - text to tokenize; lang - ISO code, auto-detected when nil.
    def initialize(input, lang = nil)
      @input = input
      @lang = lang || get_language
      @model_name = "#{@lang}-token.bin"
      get_model
    end

    # Run the tokenizer over the input and store the token list.
    def tokenize
      @tokens = @tokenizer.tokenize(@input)
    end

    def has_model?
      @model
    end

    private

    # Load the language model; @model becomes false when the file is missing.
    def get_model
      model_file = "#{MODELS}/token/#{@model_name}"
      # File.exist? — File.exists? was removed in Ruby 3.2
      if File.exist?(model_file)
        @model = TokenizerModel.new(FileInputStream.new(model_file))
        @tokenizer = TokenizerME.new(@model)
      else
        @model = false
      end
    end

  end # Class Tokens

end # module NlpToolz