tokenizer 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: abb8db238956f4cbc491e75ec47f7994e572df1d
-   data.tar.gz: 4d69f5ae6fe9c6411b45946098700034c01152fe
+   metadata.gz: f536183270ebc769890d1210adfd255a4df593a1
+   data.tar.gz: 4396fc6566fe0703e326a6c135d644c071fd4f28
  SHA512:
-   metadata.gz: fcf399eb94f200fa1a682dc64193fcb3bef391e2db7eece38f1603181d139368a2036e14058b59c26b75f4643517f10c5f4508a27830561ed197bfa941fa4ad3
-   data.tar.gz: 12f59222b26ec7987f971679b8d7be32fc86468b0741afed1b8607eb3049902082d93c614ce83358e6777a61fa4326c4bd11d5ebc9f12d751eb4fb11d678a8b7
+   metadata.gz: e14294be0b8ac2a341bab0dea72196e76e82a8192432694dcca0a1456d86290f48ba332b56a19a0401b38360643734c99333ef2e70c7a2a9d1801b1c7618a9bf
+   data.tar.gz: d603d33571f3ae9b1a1721784ad5da0e531acd763811989ef34d91b453713b3558bea370a0ca0c13a5c8fef5a5d0ac0a9ab71a2e3be7ac6b47a328624df67d82
data/.yardopts CHANGED
@@ -1,8 +1,10 @@
  --private
  --protected
  --title 'A simple tokenizer for NLP tasks.'
+ --main README.rdoc
  -
-
  CHANGELOG.rdoc
+ README.rdoc
  LICENSE.rdoc
  bin/*
+ lib/**/*
data/README.rdoc CHANGED
@@ -44,16 +44,16 @@ You can use +Tokenizer+ in two ways.
    $ echo 'Hi, ich gehe in die Schule!. | tokenize

  * As a library for embedded tokenization:
-   $ require 'tokenizer'
-   $ de_tokenizer = Tokenizer::Tokenizer.new
-   $ de_tokenizer.tokenize('Ich gehe in die Schule!')
-   $ => ["Ich", "gehe", "in", "die", "Schule", "!"]
+   > require 'tokenizer'
+   > de_tokenizer = Tokenizer::Tokenizer.new
+   > de_tokenizer.tokenize('Ich gehe in die Schule!')
+   > => ["Ich", "gehe", "in", "die", "Schule", "!"]

  * Customizable PRE and POST list
-   $ require 'tokenizer'
-   $ de_tokenizer = Tokenizer::Tokenizer.new(:de, { POST: Tokenizer::Tokenizer::POST + ['|'] })
-   $ de_tokenizer.tokenize('Ich gehe|in die Schule!')
-   $ => ["Ich", "gehe", "|in", "die", "Schule", "!"]
+   > require 'tokenizer'
+   > de_tokenizer = Tokenizer::Tokenizer.new(:de, { post: Tokenizer::Tokenizer::POST + ['|'] })
+   > de_tokenizer.tokenize('Ich gehe|in die Schule!')
+   > => ["Ich", "gehe", "|in", "die", "Schule", "!"]

  See documentation in the Tokenizer::Tokenizer class for details
  on particular methods.
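The library examples above now use irb prompts (>) instead of shell prompts ($), and the customization example passes the lowercase symbol :post instead of POST:. A minimal sketch of the same calls as a plain Ruby script; note that the aggregate POST constant the README still references is removed in 0.2.0 (see the library diff further down), so the custom suffix list here is built from the remaining SIMPLE_POST and PAIR_POST constants:

  require 'tokenizer'

  de_tokenizer = Tokenizer::Tokenizer.new
  de_tokenizer.tokenize('Ich gehe in die Schule!')
  # => ["Ich", "gehe", "in", "die", "Schule", "!"]

  # Custom suffix list with the lowercase :post key (POST itself is gone in 0.2.0).
  custom_post = Tokenizer::Tokenizer::SIMPLE_POST +
                Tokenizer::Tokenizer::PAIR_POST + ['|']
  de_tokenizer = Tokenizer::Tokenizer.new(:de, post: custom_post)
  de_tokenizer.tokenize('Ich gehe|in die Schule!')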
@@ -1,10 +1,9 @@
  #!/usr/bin/env ruby
- # -*- coding: utf-8 -*-

  require 'tokenizer'

- de_tokenizer = Tokenizer::Tokenizer.new
+ tokenizer = Tokenizer::Tokenizer.new

- while record = gets
-   print de_tokenizer.tokenize(record).join("\n")
+ while (line = gets)
+   puts tokenizer.tokenize(line).join("\n")
  end
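The rewritten script reads standard input line by line and prints one token per line. A quick way to exercise it from Ruby, assuming the executable lives at bin/tokenize and the sketch is run from the gem's source checkout (both assumptions, not stated in the diff):

  require 'open3'

  # Feed one sentence on stdin and capture the token-per-line output.
  out, _status = Open3.capture2('ruby', '-Ilib', 'bin/tokenize',
                                stdin_data: "Ich gehe in die Schule!\n")
  puts out
  # Ich
  # gehe
  # in
  # die
  # Schule
  # !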
@@ -4,19 +4,34 @@

  # A namespace for all project related stuff.
  module Tokenizer
+   # Simple whitespace based tokenizer with configurable punctuation detection.
    class Tokenizer
+     # Default whitespace separator.
      FS = Regexp.new('[[:blank:]]+')

-     # spanish marks
-     SIMPLE_PRE = []
-     PAIR_PRE = ['(', '{', '[', '<']
+     # Characters only in the role of splittable prefixes.
+     SIMPLE_PRE = ['¿', '¡']
+
+     # Characters only in the role of splittable suffixes.
      SIMPLE_POST = ['!', '?', ',', ':', ';', '.']
-     PAIR_POST = [')', '}', ']', '>']
+
+     # Characters as splittable prefixes with an optional matching suffix.
+     PAIR_PRE = ['(', '{', '[', '<', '«', '„']
+
+     # Characters as splittable suffixes with an optional matching prefix.
+     PAIR_POST = [')', '}', ']', '>', '»', '“']
+
+     # Characters which can be both prefixes AND suffixes.
      PRE_N_POST = ['"', "'"]

-     PRE = SIMPLE_PRE + PAIR_PRE
-     POST = SIMPLE_POST + PAIR_POST
+     private_constant :FS

+     # @param [Symbol] lang Language identifier.
+     # @param [Hash] options Additional options.
+     # @option options [Array] :pre Array of splittable prefix characters.
+     # @option options [Array] :post Array of splittable suffix characters.
+     # @option options [Array] :pre_n_post Array of characters with
+     #   suffix AND prefix functions.
      def initialize(lang = :de, options = {})
        @lang = lang
        @options = {
@@ -26,39 +41,33 @@ module Tokenizer
        }.merge(options)
      end

+     # @param [String] str String to be tokenized.
+     # @return [Array<String>] Array of tokens.
      def tokenize(str)
-       output = ''
+       tokens = sanitize_input(str).split(FS)
+       return [''] if tokens.empty?

-       fields = str.chomp.split(FS)
+       splittables = SIMPLE_PRE + SIMPLE_POST + PAIR_PRE + PAIR_POST + PRE_N_POST
+       pattern = Regexp.new("[^#{Regexp.escape(splittables.join)}]+")
+       output = []
+       tokens.each do |token|
+         prefix, stem, suffix = token.partition(pattern)
+         output << prefix.split('') unless prefix.empty?
+         output << stem unless stem.empty?
+         output << suffix.split('') unless suffix.empty?
+       end

-       return [''] if fields.empty?
+       output.flatten
+     end

-       fields.each do |field|
-         field.each_char.with_index do |ch, idx|
-           case
-           when @options[:pre].include?(ch)
-             output << "#{ch}\n"
-           when @options[:post].include?(ch)
-             output << "\n#{ch}"
-             if ['?', '!', '.'].include?(ch)
-               output << "\n"
-             end
-           when @options[:pre_n_post].include?(ch)
-             if idx == 0
-               output << "#{ch}\n"
-             elsif idx != 0
-               output << "\n#{ch}"
-             end
-           else
-             output << ch
-           end
-         end
+     alias process tokenize

-         output << "\n"
-       end
+     private

-       # @TODO: Rework the format of the string!
-       output.chomp('').split("\n", -1)
+     # @param [String] User defined string to be tokenized.
+     # @return [String] A new modified string.
+     def sanitize_input(str)
+       str.chomp.strip
      end
    end # class
  end # module
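The new tokenize splits on whitespace and then partitions every token into splittable prefix characters, a stem, and splittable suffix characters, driven by the class constants rather than by per-character case analysis. A short sketch (not part of the diff) exercising the newly added Spanish and German punctuation; the expected arrays follow from the implementation shown above:

  require 'tokenizer'

  t = Tokenizer::Tokenizer.new(:de)

  # SIMPLE_PRE now covers the inverted Spanish marks.
  t.tokenize('¿Vamos a la escuela?')
  # => ["¿", "Vamos", "a", "la", "escuela", "?"]

  # PAIR_PRE and PAIR_POST now cover the German quotes „ and “.
  t.tokenize('Er sagte: „Ich gehe in die Schule!“')
  # => ["Er", "sagte", ":", "„", "Ich", "gehe", "in", "die", "Schule", "!", "“"]

  # process is an alias for tokenize.
  t.process('Hallo, Welt!')
  # => ["Hallo", ",", "Welt", "!"]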
@@ -1,3 +1,3 @@
  module Tokenizer
-   VERSION = '0.1.2'
+   VERSION = '0.2.0'
  end
data/test/regression_tests/test_de_tokenizer.rb CHANGED
@@ -1,10 +1,12 @@
+ # coding: utf-8
  require 'minitest/autorun'
+ require 'minitest/spec'
  require 'tokenizer'

  class TestTokenizer < Minitest::Test

    def setup
-     @de_tokenizer = Tokenizer::Tokenizer.new(:de)
+     @t = Tokenizer::Tokenizer.new(:de)
    end

    def test_constants
@@ -12,14 +14,30 @@ class TestTokenizer < Minitest::Test
    end

    def test_output_type
-     output = @de_tokenizer.tokenize('ich gehe in die Schule')
+     output = @t.tokenize('ich gehe in die Schule')
      assert(output.is_a?(Array))
    end

    def test_tokenization_001
      input = 'Ich ging in die Schule!'
      etalon = %w(Ich ging in die Schule !)
-     output = @de_tokenizer.tokenize(input)
+     output = @t.tokenize(input)
      assert_equal(etalon, output)
    end
+
+   def test_tokenization_002
+     input = '" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .'
+     etalon = %w(" Es ist wirklich schwer zu sagen , welche Positionen er einnimmt , da er sich noch nicht konkret geäußert hat " , beklagen Volkswirte .)
+     output = @t.tokenize(input)
+     assert_equal(etalon, output)
+   end
+ end
+
+ describe Tokenizer do
+   describe 'empty input' do
+     it 'should return an Array with an empty string' do
+       tokens = Tokenizer::Tokenizer.new.tokenize('')
+       tokens.must_equal([''])
+     end
+   end
  end
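The added spec block pins down the empty-input contract using describe/it and must_equal expectations, which the newly required minitest/spec provides. A quick check of the same behaviour outside the test suite:

  require 'tokenizer'

  # Empty input yields an Array containing a single empty string.
  Tokenizer::Tokenizer.new.tokenize('')
  # => [""]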
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: tokenizer
  version: !ruby/object:Gem::Version
-   version: 0.1.2
+   version: 0.2.0
  platform: ruby
  authors:
  - Andrei Beliankou
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-09-03 00:00:00.000000000 Z
+ date: 2016-01-11 00:00:00.000000000 Z
  dependencies: []
  description: A simple multilingual tokenizer for NLP tasks. This tool provides a CLI
    and a library for linguistic tokenization which is an anavoidable step for many
@@ -41,7 +41,8 @@ files:
  - test/development_tests/test_ru_tokenizer_dev.rb
  - test/regression_tests/test_de_tokenizer.rb
  homepage: https://github.com/arbox/tokenizer
- licenses: []
+ licenses:
+ - MIT
  metadata: {}
  post_install_message:
  rdoc_options: []