tokenizer 0.0.1.prealpha → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
File without changes
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2011- Andrei Beliankou, University of Trier, Germany
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
@@ -0,0 +1,36 @@
1
+ = Tokenizer
2
+
3
+ * {RubyGems}[http://rubygems.org/gems/tokenizer]
4
+ * {Developers Homepage}[http://www.uni-trier.de/index.php?id=24140]
5
+
6
+ == DESCRIPTION
7
+ This _Tokenizer_ is a linguistic tool intended to split a text into tokens.
8
+
9
+ == INSTALLATION
10
+
11
+ _Tokenizer_ is provided as a .gem package. Simply install it via RubyGems.
12
+
13
+ == SYNOPSIS
14
+
15
+ You can use _Tokenizer_ in two ways.
16
+ - As a command line tool:
17
+ $ echo 'Hi, ich gehe in die Schule!' | tokenize
18
+
19
+ - As a library for embedded tokenization:
20
+ $ require 'tokenizer'
21
+ $ de_tokenizer = Tokenizer::Tokenizer.new
22
+ $ de_tokenizer.tokenize('Ich gehe in die Schule!')
23
+ $ => ["Ich", "gehe", "in", "die", "Schule", "!"]
24
+
25
+ See documentation in the Tokenizer::Tokenizer class for details on particular methods.
26
+
27
+ == SUPPORT
28
+
29
+ If you have questions, bug reports or any suggestions, please drop me an email :) Any help is deeply appreciated!
30
+
31
+ == LICENSE
32
+
33
+ _Tokenizer_ is a copyrighted software by Andrei Beliankou, 2011-
34
+
35
+ You may use, redistribute and change it under the terms
36
+ provided in the LICENSE.rdoc file.
@@ -0,0 +1,16 @@
1
+ = Milestones for the project _Tokenizer_
2
+
3
+ == 0.0.1
4
+ - simple tokenization is desired
5
+ == 0.1.0
6
+ - notion of binary tokenizer and a library for embedded tokenization;
7
+ - separation of punctuation marks.
8
+ == 0.2.0
9
+ == 0.3.0
10
+ == 0.4.0
11
+ == 0.5.0
12
+ == 0.6.0
13
+ == 0.7.0
14
+ == 0.8.0
15
+ == 0.9.0
16
+ == 1.0.0
@@ -1,8 +1,10 @@
1
- #!/usr/bin/ruby -nw
1
+ #!/usr/bin/env ruby
2
+ # -*- coding: utf-8 -*-
2
3
 
3
- BEGIN {
4
- $/ = " "
5
- }
4
+ require 'tokenizer'
6
5
 
7
- puts $_
6
+ de_tokenizer = Tokenizer::Tokenizer.new
8
7
 
8
+ while record = gets
9
+ puts de_tokenizer.tokenize(record)
10
+ end
@@ -0,0 +1,2 @@
1
+ require_relative 'tokenizer/tokenizer'
2
+ require_relative 'tokenizer/version'
@@ -0,0 +1,42 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # :title: A simple Tokenizer for NLP Tasks.
4
+ # :main: README.rdoc
5
+
6
+ # A namespace for all project related stuff.
7
+ module Tokenizer
8
+
9
+ class Tokenizer
10
+ FS = Regexp.new('[[:blank:]]+')
11
+ # PRE = '[{(\\`"‚„†‡‹‘’“”•–—›'
12
+ # POST = %w| ] } ' ` " ) , ; : \ ! \ ? \ % ‚ „ … † ‡ ‰ ‹ ‘ ’ “ ” • – — › |
13
+ POST = %w{! ? , : ; . )}
14
+ PRE = %w{(}
15
+
16
+ def initialize(lang=:de)
17
+ @lang = lang
18
+ end
19
+
20
+ def tokenize(str)
21
+ tokens = []
22
+ token = ''
23
+ output = ''
24
+ fields = str.split(FS)
25
+ fields.each do |field|
26
+ field.each_char do |ch|
27
+ if POST.include?(ch)
28
+ output << "\n#{ch}"
29
+ elsif PRE.include?(ch)
30
+ output << "#{ch}\n"
31
+ else
32
+ output << ch
33
+ end
34
+ end
35
+ output << "\n"
36
+ end
37
+ output.split("\n")
38
+ end
39
+
40
+ end # class
41
+
42
+ end # module
@@ -1,3 +1,3 @@
1
1
  module Tokenizer
2
- VERSION = '0.0.1.prealpha'
2
+ VERSION = '0.1.0'
3
3
  end
@@ -0,0 +1,27 @@
1
+ require 'test/unit'
2
+ require 'tokenizer'
3
+
4
+ class TestTokenizer < Test::Unit::TestCase
5
+
6
+ def setup
7
+ @de_tokenizer = Tokenizer::Tokenizer.new(:de)
8
+ @en_tokenizer = Tokenizer::Tokenizer.new(:en)
9
+ @fr_tokenizer = Tokenizer::Tokenizer.new(:fr)
10
+ end
11
+
12
+ def test_constants
13
+ assert(Tokenizer::VERSION.is_a?(String) && ! Tokenizer::VERSION.empty?)
14
+ end
15
+
16
+ def test_output_type
17
+ output = @de_tokenizer.tokenize('ich gehe in die Schule')
18
+ assert(output.is_a?(Array))
19
+ end
20
+
21
+ def test_tokenization_001
22
+ input = 'ich ging? du, und ich nicht (konnte nicht)? Warum?!!'
23
+ etalon = %w{ ich ging ? du , und ich nicht ( konnte nicht ) ? Warum ? ! !}
24
+ output = @de_tokenizer.tokenize(input)
25
+ assert_equal(etalon, output)
26
+ end
27
+ end
metadata CHANGED
@@ -1,14 +1,8 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- hash: 196621262
5
- prerelease: 6
6
- segments:
7
- - 0
8
- - 0
9
- - 1
10
- - prealpha
11
- version: 0.0.1.prealpha
4
+ prerelease:
5
+ version: 0.1.0
12
6
  platform: ruby
13
7
  authors:
14
8
  - Andrei Beliankou
@@ -16,9 +10,31 @@ autorequire:
16
10
  bindir: bin
17
11
  cert_chain: []
18
12
 
19
- date: 2011-05-05 00:00:00 Z
20
- dependencies: []
21
-
13
+ date: 2011-05-19 00:00:00 +02:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: rdoc
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: "0"
25
+ type: :development
26
+ version_requirements: *id001
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ prerelease: false
30
+ requirement: &id002 !ruby/object:Gem::Requirement
31
+ none: false
32
+ requirements:
33
+ - - ">="
34
+ - !ruby/object:Gem::Version
35
+ version: "0"
36
+ type: :development
37
+ version_requirements: *id002
22
38
  description: A simple tokenizer for NLP tasks.
23
39
  email: a.belenkow@uni-trier.de
24
40
  executables:
@@ -27,21 +43,26 @@ extensions: []
27
43
 
28
44
  extra_rdoc_files:
29
45
  - README.rdoc
30
- - LICENSE
46
+ - TODO.rdoc
47
+ - LICENSE.rdoc
48
+ - HISTORY.rdoc
31
49
  files:
50
+ - lib/tokenizer.rb
51
+ - lib/tokenizer/tokenizer.rb
32
52
  - lib/tokenizer/version.rb
33
- - bin/tokenize
34
53
  - README.rdoc
35
- - LICENSE
36
- - Rakefile
54
+ - TODO.rdoc
55
+ - LICENSE.rdoc
56
+ - HISTORY.rdoc
37
57
  - test/test_tokenizer.rb
58
+ - bin/tokenize
59
+ has_rdoc: true
38
60
  homepage: http://www.uni-trier.de/index.php?id=34451
39
61
  licenses: []
40
62
 
41
63
  post_install_message:
42
- rdoc_options:
43
- - -m
44
- - README.rdoc
64
+ rdoc_options: []
65
+
45
66
  require_paths:
46
67
  - lib
47
68
  required_ruby_version: !ruby/object:Gem::Requirement
@@ -49,26 +70,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
49
70
  requirements:
50
71
  - - ">="
51
72
  - !ruby/object:Gem::Version
52
- hash: 31
53
- segments:
54
- - 1
55
- - 8
56
- version: "1.8"
73
+ version: "1.9"
57
74
  required_rubygems_version: !ruby/object:Gem::Requirement
58
75
  none: false
59
76
  requirements:
60
- - - ">"
77
+ - - ">="
61
78
  - !ruby/object:Gem::Version
62
- hash: 25
63
- segments:
64
- - 1
65
- - 3
66
- - 1
67
- version: 1.3.1
79
+ version: "0"
68
80
  requirements: []
69
81
 
70
82
  rubyforge_project: tokenizer
71
- rubygems_version: 1.7.2
83
+ rubygems_version: 1.5.2
72
84
  signing_key:
73
85
  specification_version: 3
74
86
  summary: Tokenizer is a linguistic tool intended to split a text into tokens.
data/Rakefile DELETED
@@ -1,37 +0,0 @@
1
- begin
2
- require 'rake'
3
- rescue LoadError
4
- require 'rubygems'
5
- require 'rake'
6
- end
7
-
8
- # we can require 'rake/clean' to add 'clobber' and 'clean' tasks
9
- require 'rake/clean'
10
-
11
-
12
-
13
- SRC = FileList['**/*.rb']
14
-
15
- CLOBBER.include('doc', '**/*.html', '**/*.gem')
16
-
17
- # testing
18
- require 'rake/testtask'
19
- Rake::TestTask.new do |t|
20
- t.test_files = FileList.new('test/**/*.rb').to_a
21
- end
22
-
23
- # Build the gem package
24
- load 'tokenizer.gemspec'
25
- require 'rubygems/package_task'
26
- Gem::PackageTask.new(GEMSPEC).define
27
-
28
- # Generate documentation
29
- require 'rdoc/task'
30
- RDoc::Task.new do |rdoc|
31
- rdoc.rdoc_files.include('README', 'LICENSE', 'lib/**/*')
32
- end
33
-
34
- desc "Open an irb session preloaded with this library."
35
- task :console do
36
- sh "irb -rubygems -I lib -r tokenizer.rb"
37
- end