tokenizer 0.0.1.prealpha → 0.1.0
- data/{LICENSE → HISTORY.rdoc} +0 -0
- data/LICENSE.rdoc +19 -0
- data/README.rdoc +36 -0
- data/TODO.rdoc +16 -0
- data/bin/tokenize +7 -5
- data/lib/tokenizer.rb +2 -0
- data/lib/tokenizer/tokenizer.rb +42 -0
- data/lib/tokenizer/version.rb +1 -1
- data/test/test_tokenizer.rb +27 -0
- metadata +43 -31
- data/Rakefile +0 -37
data/{LICENSE → HISTORY.rdoc}
RENAMED
File without changes
data/LICENSE.rdoc
ADDED
@@ -0,0 +1,19 @@
+Copyright (c) 2011- Andrei Beliankou, University of Trier, Germany
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
data/README.rdoc
CHANGED
@@ -0,0 +1,36 @@
+= Tokenizer
+
+* {RubyGems}[http://rubygems.org/gems/tokenizer]
+* {Developers Homepage}[http://www.uni-trier.de/index.php?id=24140]
+
+== DESCRIPTION
+_Tokenizer_ is a linguistic tool intended to split a text into tokens.
+
+== INSTALLATION
+
+_Tokenizer_ is provided as a .gem package. Simply install it via RubyGems.
+
+== SYNOPSIS
+
+You can use _Tokenizer_ in two ways.
+- As a command line tool:
+    $ echo 'Hi, ich gehe in die Schule!' | tokenize
+
+- As a library for embedded tokenization:
+    require 'tokenizer'
+    de_tokenizer = Tokenizer::Tokenizer.new
+    de_tokenizer.tokenize('Ich gehe in die Schule!')
+    # => ["Ich", "gehe", "in", "die", "Schule", "!"]
+
+See the documentation of the Tokenizer::Tokenizer class for details on particular methods.
+
+== SUPPORT
+
+If you have questions, bug reports or any suggestions, please drop me an email :) Any help is deeply appreciated!
+
+== LICENSE
+
+_Tokenizer_ is copyrighted software by Andrei Beliankou, 2011-
+
+You may use, redistribute and change it under the terms
+provided in the LICENSE.rdoc file.
data/TODO.rdoc
ADDED
@@ -0,0 +1,16 @@
+= Milestones for the project _Tokenizer_
+
+== 0.0.1
+- simple tokenization is desired
+== 0.1.0
+- a binary tokenizer and a library for embedded tokenization;
+- separation of punctuation marks.
+== 0.2.0
+== 0.3.0
+== 0.4.0
+== 0.5.0
+== 0.6.0
+== 0.7.0
+== 0.8.0
+== 0.9.0
+== 1.0.0
data/bin/tokenize
CHANGED
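The diff body for bin/tokenize is not rendered on this page, only the +7 -5 summary above. For orientation, here is a minimal sketch of an executable consistent with the README synopsis; the structure and the :de default are assumptions, not the shipped file:

    #!/usr/bin/env ruby
    # Hypothetical sketch of bin/tokenize, not the actual file from this release:
    # read text from STDIN and print one token per line.
    require 'tokenizer'

    tokenizer = Tokenizer::Tokenizer.new(:de)
    STDIN.each_line do |line|
      puts tokenizer.tokenize(line.chomp)
    end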
data/lib/tokenizer/tokenizer.rb
ADDED
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+
+# :title: A simple Tokenizer for NLP Tasks.
+# :main: README.rdoc
+
+# A namespace for all project related stuff.
+module Tokenizer
+
+  class Tokenizer
+    FS = Regexp.new('[[:blank:]]+')
+    # PRE = '[{(\\`"‚„†‡‹‘’“”•–—›'
+    # POST = %w| ] } ' ` " ) , ; : \ ! \ ? \ % ‚ „ … † ‡ ‰ ‹ ‘ ’ “ ” • – — › |
+    POST = %w{! ? , : ; . )}
+    PRE = %w{(}
+
+    def initialize(lang = :de)
+      @lang = lang
+    end
+
+    def tokenize(str)
+      tokens = []
+      token = ''
+      output = ''
+      fields = str.split(FS)
+      fields.each do |field|
+        field.each_char do |ch|
+          if POST.include?(ch)
+            output << "\n#{ch}"
+          elsif PRE.include?(ch)
+            output << "#{ch}\n"
+          else
+            output << ch
+          end
+        end
+        output << "\n"
+      end
+      output.split("\n")
+    end
+
+  end # class
+
+end # module
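The tokenize method above splits the input on blanks (FS), then walks each field character by character: POST characters get a newline inserted before them, PRE characters a newline after them, and the buffer is finally split on newlines. A quick behavioural sketch, assuming the gem is installed so that require 'tokenizer' loads this class:

    require 'tokenizer'

    de_tokenizer = Tokenizer::Tokenizer.new(:de)
    de_tokenizer.tokenize('ich ging (nicht)?!')
    # => ["ich", "ging", "(", "nicht", ")", "?", "!"]

Note that the local variables tokens and token in tokenize are never used; only the output buffer contributes to the result.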
data/lib/tokenizer/version.rb
CHANGED
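The one-line change to version.rb is not rendered either; given the version bump in the page title and the metadata below, it is presumably just the constant update. A sketch, not the verbatim file:

    module Tokenizer
      VERSION = '0.1.0' # presumably bumped from '0.0.1.prealpha'
    end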
data/test/test_tokenizer.rb
CHANGED
@@ -0,0 +1,27 @@
+require 'test/unit'
+require 'tokenizer'
+
+class TestTokenizer < Test::Unit::TestCase
+
+  def setup
+    @de_tokenizer = Tokenizer::Tokenizer.new(:de)
+    @en_tokenizer = Tokenizer::Tokenizer.new(:en)
+    @fr_tokenizer = Tokenizer::Tokenizer.new(:fr)
+  end
+
+  def test_constants
+    assert(Tokenizer::VERSION.is_a?(String) && !Tokenizer::VERSION.empty?)
+  end
+
+  def test_output_type
+    output = @de_tokenizer.tokenize('ich gehe in die Schule')
+    assert(output.is_a?(Array))
+  end
+
+  def test_tokenization_001
+    input = 'ich ging? du, und ich nicht (konnte nicht)? Warum?!!'
+    etalon = %w{ich ging ? du , und ich nicht ( konnte nicht ) ? Warum ? ! !}
+    output = @de_tokenizer.tokenize(input)
+    assert_equal(etalon, output)
+  end
+end
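Because the suite is plain Test::Unit, which runs automatically at process exit, it can be executed without the deleted Rakefile, assuming lib/tokenizer.rb pulls in both the tokenizer class and the version constant:

    $ ruby -Ilib test/test_tokenizer.rb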
metadata
CHANGED
@@ -1,14 +1,8 @@
 --- !ruby/object:Gem::Specification
 name: tokenizer
 version: !ruby/object:Gem::Version
-
-
-  segments:
-  - 0
-  - 0
-  - 1
-  - prealpha
-  version: 0.0.1.prealpha
+  prerelease:
+  version: 0.1.0
 platform: ruby
 authors:
 - Andrei Beliankou
@@ -16,9 +10,31 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-05-
-
-
+date: 2011-05-19 00:00:00 +02:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rdoc
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :development
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: rake
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :development
+  version_requirements: *id002
 description: A simple tokenizer for NLP tasks.
 email: a.belenkow@uni-trier.de
 executables:
@@ -27,21 +43,26 @@ extensions: []
 
 extra_rdoc_files:
 - README.rdoc
--
+- TODO.rdoc
+- LICENSE.rdoc
+- HISTORY.rdoc
 files:
+- lib/tokenizer.rb
+- lib/tokenizer/tokenizer.rb
 - lib/tokenizer/version.rb
-- bin/tokenize
 - README.rdoc
--
--
+- TODO.rdoc
+- LICENSE.rdoc
+- HISTORY.rdoc
 - test/test_tokenizer.rb
+- bin/tokenize
+has_rdoc: true
 homepage: http://www.uni-trier.de/index.php?id=34451
 licenses: []
 
 post_install_message:
-rdoc_options:
-
-- README.rdoc
+rdoc_options: []
+
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
@@ -49,26 +70,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-
-      segments:
-      - 1
-      - 8
-      version: "1.8"
+      version: "1.9"
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
-  - - "
+  - - ">="
     - !ruby/object:Gem::Version
-
-      segments:
-      - 1
-      - 3
-      - 1
-      version: 1.3.1
+      version: "0"
 requirements: []
 
 rubyforge_project: tokenizer
-rubygems_version: 1.
+rubygems_version: 1.5.2
 signing_key:
 specification_version: 3
 summary: Tokenizer is a linguistic tool intended to split a text into tokens.
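For readers more used to gemspec files than to serialized gem metadata: the two new development dependencies correspond to declarations like the following hypothetical snippet (the actual tokenizer.gemspec is not part of this diff):

    Gem::Specification.new do |s|
      s.name    = 'tokenizer'
      s.version = '0.1.0'
      # development dependencies recorded in the metadata above
      s.add_development_dependency 'rdoc', '>= 0'
      s.add_development_dependency 'rake', '>= 0'
    end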
data/Rakefile
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
begin
|
2
|
-
require 'rake'
|
3
|
-
rescue LoadError
|
4
|
-
require 'rubygems'
|
5
|
-
require 'rake'
|
6
|
-
end
|
7
|
-
|
8
|
-
# we can require 'rake/clean' to add 'clobber' and 'clean' tasks
|
9
|
-
require 'rake/clean'
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
SRC = FileList['**/*.rb']
|
14
|
-
|
15
|
-
CLOBBER.include('doc', '**/*.html', '**/*.gem')
|
16
|
-
|
17
|
-
# testing
|
18
|
-
require 'rake/testtask'
|
19
|
-
Rake::TestTask.new do |t|
|
20
|
-
t.test_files = FileList.new('test/**/*.rb').to_a
|
21
|
-
end
|
22
|
-
|
23
|
-
# Build the gem package
|
24
|
-
load 'tokenizer.gemspec'
|
25
|
-
require 'rubygems/package_task'
|
26
|
-
Gem::PackageTask.new(GEMSPEC).define
|
27
|
-
|
28
|
-
# Generate documentation
|
29
|
-
require 'rdoc/task'
|
30
|
-
RDoc::Task.new do |rdoc|
|
31
|
-
rdoc.rdoc_files.include('README', 'LICENSE', 'lib/**/*')
|
32
|
-
end
|
33
|
-
|
34
|
-
desc "Open an irb session preloaded with this library."
|
35
|
-
task :console do
|
36
|
-
sh "irb -rubygems -I lib -r tokenizer.rb"
|
37
|
-
end
|
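With the Rakefile gone, the test, packaging, documentation and console tasks it defined are no longer available from rake. If a build file is reintroduced later, a minimal sketch mirroring just the deleted test task might look like:

    # Hypothetical replacement covering only the test task from the deleted Rakefile.
    require 'rake/testtask'

    Rake::TestTask.new do |t|
      t.libs << 'lib'
      t.test_files = FileList['test/**/*.rb']
    end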