RubyGems - TokenizerProjectUT - Versions diffs - 0.0.1 - Mend

TokenizerProjectUT 0.0.1

Files changed (11) hide show

data/CHANGELOG.rdoc +6 -0
data/LICENSE.rdoc +4 -0
data/README.rdoc +20 -0
data/bin/tokenize +7 -0
data/lib/tokenizer.rb +3 -0
data/lib/tokenizer/tokenizer.rb +23 -0
data/lib/tokenizer/version.rb +4 -0
data/test/test_de_tokenizer_dev.rb +283 -0
data/test/test_tokenizer.rb +31 -0
data/test/test_version.rb +21 -0
metadata +67 -0

data/CHANGELOG.rdoc ADDED Viewed

@@ -0,0 +1,6 @@
+==COMPLETED
+===0.0.1
+Program structure and very simple tokenization.
+==PLANNED
+===0.0.2
+Correct tokenization of punctuation.

data/LICENSE.rdoc ADDED Viewed

@@ -0,0 +1,4 @@
+= LICENSE file
+== Description
+This file is licensed to David Alfter and may not be used without the consent of the aforementioned person.
+By reading this, you agree to these terms.

data/README.rdoc ADDED Viewed

@@ -0,0 +1,20 @@
+= Tokenizer Project
+== DESCRIPTION
+This is a tokenizer intended for simple tasks.
+=== Implemented Features
+* simple string tokenization
+* test structure
+== Installation
+If you don't know how to install this,... you have a problem.
+== Synopsis
+The synopsis is not implemented.
+== Support
+No support has been implemented.
+== Changelog
+For further information on the evolution of this tokenizer, see the CHANGELOG.rdoc file.
+== Caution
+Keep quiet while using this tokenizer. Loud noises may or may not interfere with function.
+== License
+This file is licensed under the conditions listed in the LICENSE.rdoc file.
+== Version
+The current version of this tokenizer can be found in the lib/tokenizer/version.rb file.

data/bin/tokenize ADDED Viewed

@@ -0,0 +1,7 @@
#!/usr/bin/env ruby
# Command-line front end for the tokenizer.
#
# Reads input line by line via Kernel#gets (the files named on the command
# line, or standard input when none are given), tokenizes each line with a
# German (:de) tokenizer and prints every token on its own line.
require 'tokenizer'

tokenizer = Tokenizer::Tokenizer.new(:de)
# Parentheses mark the assignment in the loop condition as intentional.
while (line = gets)
  # puts writes each element of the returned token array on a separate line.
  puts tokenizer.tokenize(line)
end

data/lib/tokenizer.rb ADDED Viewed

@@ -0,0 +1,3 @@
+require 'tokenizer/version'
+require 'tokenizer/tokenizer'

data/lib/tokenizer/tokenizer.rb ADDED Viewed

@@ -0,0 +1,23 @@
# :title: My cool Tokenizer!!!
# :main: README.rdoc
# The module Tokenizer is the namespace for this project.
module Tokenizer
  # The class Tokenizer defines the tokenizer itself. It currently performs
  # plain whitespace tokenization only.
  class Tokenizer
    # WL is the word limit used by the tokenizer: one or more whitespace
    # characters.
    WL = /\s+/

    # Constructs a Tokenizer for the specified language.
    #
    # lang:: language identifier, +:de+ by default. The value is stored in
    #        +@lang+; #tokenize does not consult it.
    def initialize(lang = :de)
      @lang = lang
    end

    # Returns the tokens contained in the given string as an Array of
    # Strings.
    #
    # str:: the String to split at WL.
    #
    # Splitting at a regexp yields an empty first element when +str+ starts
    # with whitespace; that element is not a token and is dropped. An empty
    # or whitespace-only string yields an empty Array.
    def tokenize(str)
      str.split(WL).reject(&:empty?)
    end
  end
end

data/lib/tokenizer/version.rb ADDED Viewed

@@ -0,0 +1,4 @@
module Tokenizer
  # This is the current version. Frozen so the shared constant cannot be
  # modified in place.
  VERSION = '0.0.1'.freeze
end

data/test/test_de_tokenizer_dev.rb ADDED Viewed

@@ -0,0 +1,283 @@
# -*- coding: utf-8 -*-
require 'test/unit'
require 'tokenizer'

# Development test suite for the German (:de) tokenizer.
#
# Every case pairs an input sentence with the token list ("etalon") the
# tokenizer is expected to return for it. The cases live in one table; a
# test method named test_tokenization_<id> is generated for every id of an
# entry, so a case that used to be written out twice under two numbers is
# now stated once and still runs under both names.
class TestTokenizerDev < Test::Unit::TestCase
  # Each entry is [test ids, input, expected tokens].
  CASES = [
    [%w[001],
     'ich ging? du, und ich nicht (konnte nicht)? Warum?!!',
     %w[ich ging ? du , und ich nicht ( konnte nicht ) ? Warum ? ! !]],
    [%w[002 026],
     "Die deutschen Umlaute und Sonderzeichen, wie in Mäuse, Scheiß und Tütchen, sind blöd!",
     %w[Die deutschen Umlaute und Sonderzeichen , wie in Mäuse , Scheiß und Tütchen , sind blöd !]],
    [%w[003 027],
     "Abkürzungen, wie z.B. usw. und d.h. können zu Problemem führen.",
     %w[Abkürzungen , wie z.B. usw. und d.h. können zu Problemem führen .]],
    [%w[004 028],
     "Es gibt mehr als 1.023.345 Menschen in Deutschland, die keine Tausenderpunkte verstehen.",
     %w[Es gibt mehr als 1.023.345 Menschen in Deutschland , die keine Tausenderpunkte verstehen .]],
    [%w[005 029],
     "Cocktails, wie Apfel-Martini, Rum-Kirsche-Cola und andere, bereiten nicht nur Menschen Probleme.",
     %w[Cocktails , wie Apfel-Martini , Rum-Kirsche-Cola und andere , bereiten nicht nur Menschen Probleme .]],
    [%w[006],
     'Es gibt viele verschiedene Zeichen, die noch in Texten vorkommen können wie - zum Beispiel - diese hier "text" oder (text).',
     %w[Es gibt viele verschiedene Zeichen , die noch in Texten vorkommen können wie - zum Beispiel - diese hier " text " oder ( text ) .]],
    [%w[007],
     "Abkürzungen sind immer ein Problem, da auch Leerzeichen dazwischen stehen können, wie z. B. hier.",
     ["Abkürzungen", "sind", "immer", "ein", "Problem", ",", "da", "auch", "Leerzeichen", "dazwischen", "stehen", "können", ",", "wie", "z. B.", "hier", "."]],
    [%w[008],
     "Außerdem kann es nach Abkürzungen und Satzenden auch mit Großschreibung weiter gehen, bei z.B. Aufzählungen.",
     %w[Außerdem kann es nach Abkürzungen und Satzenden auch mit Großschreibung weiter gehen , bei z.B. Aufzählungen .]],
    [%w[009],
     "Ein weiteres Problem sind solche Getrennt- und Zusammenschreibungen.",
     %w[Ein weiteres Problem sind solche Getrenntschreibungen und Zusammenschreibungen .]],
    [%w[010],
     "In manchen Texten gibt es auch Worttrennung am Zeilen- ende.",
     %w[In manchen Texten gibt es auch Worttrennung am Zeilenende .]],
    # Ellipsis: the ellipsis dots should not be separated from the word.
    [%w[011 030],
     "Der Satz endet in einer Ellips...",
     %w[Der Satz endet in einer Ellips...]],
    # Missing space: /\.\s(?=[A-Z])/ would not separate the sentences.
    [%w[012 031],
     "Der Satz endet.Das Leerzeichen fehlt.",
     %w[Der Satz endet . Das Leerzeichen fehlt .]],
    # Hyphens.
    [%w[013 032],
     "Das Bindeglied - manisch-depressives Verhalten, binden-verbinden",
     %w[Das Bindeglied - manisch-depressives Verhalten , binden - verbinden]],
    # Abbreviations: /\.\s(?=[A-Z])/ would end the sentence after the period.
    [%w[014 033],
     "Der Satz enthielt z.B. Fehler",
     %w[Der Satz enthielt z.B. Fehler]],
    # Missing capital letter: /\.\s(?=[A-Z])/ would not separate the sentences.
    [%w[015 034],
     "Der Satz endet. der Satz beginnt",
     %w[Der Satz endet . der Satz beginnt]],
    # French. Controversial!
    [%w[016 035],
     "L'art de l'univers, c'est un art",
     %w[L' art de l' univers , c'est un art]],
    # James Bond. Controversial!
    [%w[017 036],
     "Bond,... James Bond.",
     %w[Bond , ... James Bond .]],
    # Inches.
    [%w[018 037],
     'The square had four 9" sides',
     %w[The square had four 9" sides]],
    # Abbreviation that is also a lexicon entry: "fig" occurs both as an
    # abbreviation and as a word.
    [%w[019 039],
     "In fig. 3, a fig can be seen. Fig. no. 4 shows no fig.",
     %w[In fig. 3 , a fig can be seen . Fig. no. 4 shows no fig .]],
    # Units that belong together although separated by a space (alternatively
    # joined by the hyphen).
    [%w[020 040],
     "They booked the flight New York-Los Angeles",
     ["They", "booked", "the", "flight", "New York", "-", "Los Angeles"]],
    # Ordinals.
    [%w[021 041],
     "Der 1. Platz ging an den Sieger",
     %w[Der 1. Platz ging an den Sieger]],
    # Clitics. Controversial! Requires a more complex analysis.
    [%w[022 042],
     "Er war's, stimmt's?",
     %w[Er war es , stimmt es ?]],
    # Dates and times.
    [%w[023 043],
     "Es passierte am 13. Januar 2011 um 12:13 Uhr",
     ["Es", "passierte", "am", "13. Januar 2011", "um", "12:13 Uhr"]],
    # Embedded sentences: can lead to ungrammatical sentences.
    [%w[024 044],
     '"This is all?" George asked.',
     %w[This is all ? George asked .]],
    # Embedded sentences 2: ungrammatical sentence "fragte sie."
    [%w[025 046],
     '"Das ist alles?" fragte sie.',
     %w[Das ist alles ? fragte sie .]]
  ].freeze

  # Generate one public test method per id so every original test name exists.
  CASES.each do |ids, input, etalon|
    ids.each do |id|
      define_method("test_tokenization_#{id}") { compare(etalon, input) }
    end
  end

  # Creates the German tokenizer used by every case.
  def setup
    @de_tokenizer = Tokenizer::Tokenizer.new(:de)
  end

  private

  # Tokenizes +input+ and asserts that the result equals +exp_result+.
  def compare(exp_result, input)
    act_result = @de_tokenizer.tokenize(input)
    assert_equal(exp_result, act_result)
  end
end

data/test/test_tokenizer.rb ADDED Viewed

@@ -0,0 +1,31 @@
require 'tokenizer/tokenizer'
require 'test/unit'

# This is the test suite for lib/tokenizer/tokenizer.rb
class TestTokenizer < Test::Unit::TestCase
  # Tokenizes a fixed sample whose two words are separated by a tab, a
  # newline and a space, using a tokenizer built with the default language.
  def setup
    @t = Tokenizer::Tokenizer.new
    @result = @t.tokenize("test\t\n string")
  end

  # The tokenizer must offer a tokenize method.
  def test_has_method
    assert_respond_to(@t, :tokenize)
  end

  # tokenize must return an Array.
  def test_returns_array
    assert_instance_of(Array, @result)
  end

  # The sample contains words, so the result must not be empty.
  def test_array_not_empty
    assert(!@result.empty?)
  end

  # Every token, not just the first one, must be a String.
  def test_array_contains_strings
    @result.each { |token| assert_instance_of(String, token) }
  end

  # A run of several whitespace characters counts as a single separator.
  def test_splits_more_whitespace
    assert_equal(["test", "string"], @result)
  end
end

data/test/test_version.rb ADDED Viewed

@@ -0,0 +1,21 @@
require 'tokenizer'
require 'test/unit'

# This is the test suite for lib/tokenizer/version.rb
#
# The checks only read the Tokenizer::VERSION constant, so no tokenizer
# instance is created.
class TestVersion < Test::Unit::TestCase
  # Test whether version is a string.
  def test_version_is_string
    assert_kind_of(String, Tokenizer::VERSION, "Falsche Klasse fuer Version!")
  end

  # Test whether version is not empty.
  def test_version_not_empty
    assert(!Tokenizer::VERSION.empty?)
  end
end # TestVersion

metadata ADDED Viewed

@@ -0,0 +1,67 @@
+--- !ruby/object:Gem::Specification
+name: TokenizerProjectUT
+version: !ruby/object:Gem::Version
+  version: 0.0.1
+  prerelease:
+platform: ruby
+authors:
+- David Alfter
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2011-11-24 00:00:00.000000000 +01:00
+default_executable:
+dependencies: []
+description: A simple multilingual tokenizer for NLP tasks. This tool provides a CLI
  and a library for linguistic tokenization which is an unavoidable step for many
+  HLT (human language technology) tasks in the preprocessing phase for further syntactic,
+  semantic and other higher level processing goals. Use it for tokenization of German,
+  English and French texts.
+email: s2daalft@uni-trier.de
+executables:
+- tokenize
+extensions: []
+extra_rdoc_files:
+- README.rdoc
+- LICENSE.rdoc
+- CHANGELOG.rdoc
+files:
+- lib/tokenizer.rb
+- lib/tokenizer/tokenizer.rb
+- lib/tokenizer/version.rb
+- README.rdoc
+- LICENSE.rdoc
+- CHANGELOG.rdoc
+- test/test_version.rb
+- test/test_de_tokenizer_dev.rb
+- test/test_tokenizer.rb
+- bin/tokenize
+has_rdoc: true
+homepage:
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project: tokenizer
+rubygems_version: 1.5.2
+signing_key:
+specification_version: 3
+summary: Tokenizer is a linguistic tool intended to split a text into tokens.
+test_files:
+- test/test_version.rb
+- test/test_de_tokenizer_dev.rb
+- test/test_tokenizer.rb