tokenizer 0.0.1.prealpha → 0.1.0
- data/{LICENSE → HISTORY.rdoc} +0 -0
- data/LICENSE.rdoc +19 -0
- data/README.rdoc +36 -0
- data/TODO.rdoc +16 -0
- data/bin/tokenize +7 -5
- data/lib/tokenizer.rb +2 -0
- data/lib/tokenizer/tokenizer.rb +42 -0
- data/lib/tokenizer/version.rb +1 -1
- data/test/test_tokenizer.rb +27 -0
- metadata +43 -31
- data/Rakefile +0 -37
data/{LICENSE → HISTORY.rdoc}
RENAMED
File without changes
data/LICENSE.rdoc
ADDED
@@ -0,0 +1,19 @@
+Copyright (c) 2011- Andrei Beliankou, University of Trier, Germany
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
data/README.rdoc
CHANGED
@@ -0,0 +1,36 @@
+= Tokenizer
+
+* {RubyGems}[http://rubygems.org/gems/tokenizer]
+* {Developers Homepage}[http://www.uni-trier.de/index.php?id=24140]
+
+== DESCRIPTION
+_Tokenizer_ is a linguistic tool intended to split a text into tokens.
+
+== INSTALLATION
+
+_Tokenizer_ is provided as a .gem package. Simply install it via RubyGems.
+
+== SYNOPSIS
+
+You can use _Tokenizer_ in two ways.
+- As a command line tool:
+    $ echo 'Hi, ich gehe in die Schule!' | tokenize
+
+- As a library for embedded tokenization:
+    require 'tokenizer'
+    de_tokenizer = Tokenizer::Tokenizer.new
+    de_tokenizer.tokenize('Ich gehe in die Schule!')
+    # => ["Ich", "gehe", "in", "die", "Schule", "!"]
+
+See the documentation of the Tokenizer::Tokenizer class for details on particular methods.
+
+== SUPPORT
+
+If you have questions, bug reports or any suggestions, please drop me an email :) Any help is deeply appreciated!
+
+== LICENSE
+
+_Tokenizer_ is copyrighted software by Andrei Beliankou, 2011-
+
+You may use, redistribute and change it under the terms
+provided in the LICENSE.rdoc file.
data/TODO.rdoc
ADDED
@@ -0,0 +1,16 @@
+= Milestones for the project _Tokenizer_
+
+== 0.0.1
+- simple tokenization is desired
+== 0.1.0
+- a binary tokenizer and a library for embedded tokenization;
+- separation of punctuation marks.
+== 0.2.0
+== 0.3.0
+== 0.4.0
+== 0.5.0
+== 0.6.0
+== 0.7.0
+== 0.8.0
+== 0.9.0
+== 1.0.0
data/bin/tokenize
CHANGED
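The diff body for bin/tokenize is not rendered on this page, only the +7 -5 summary above. For orientation, here is a minimal sketch of an executable consistent with the README synopsis; the structure and the :de default are assumptions, not the shipped file:

    #!/usr/bin/env ruby
    # Hypothetical sketch of bin/tokenize, not the actual file from this release:
    # read text from STDIN and print one token per line.
    require 'tokenizer'

    tokenizer = Tokenizer::Tokenizer.new(:de)
    STDIN.each_line do |line|
      puts tokenizer.tokenize(line.chomp)
    end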
data/lib/tokenizer/tokenizer.rb
ADDED
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+
+# :title: A simple Tokenizer for NLP Tasks.
+# :main: README.rdoc
+
+# A namespace for all project related stuff.
+module Tokenizer
+
+  class Tokenizer
+    FS = Regexp.new('[[:blank:]]+')
+    # PRE = '[{(\\`"‚„†‡‹‘’“”•–—›'
+    # POST = %w| ] } ' ` " ) , ; : \ ! \ ? \ % ‚ „ … † ‡ ‰ ‹ ‘ ’ “ ” • – — › |
+    POST = %w{! ? , : ; . )}
+    PRE = %w{(}
+
+    def initialize(lang = :de)
+      @lang = lang
+    end
+
+    def tokenize(str)
+      tokens = []
+      token = ''
+      output = ''
+      fields = str.split(FS)
+      fields.each do |field|
+        field.each_char do |ch|
+          if POST.include?(ch)
+            output << "\n#{ch}"
+          elsif PRE.include?(ch)
+            output << "#{ch}\n"
+          else
+            output << ch
+          end
+        end
+        output << "\n"
+      end
+      output.split("\n")
+    end
+
+  end # class
+
+end # module
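The tokenize method above splits the input on blanks (FS), then walks each field character by character: POST characters get a newline inserted before them, PRE characters a newline after them, and the buffer is finally split on newlines. A quick behavioural sketch, assuming the gem is installed so that require 'tokenizer' loads this class:

    require 'tokenizer'

    de_tokenizer = Tokenizer::Tokenizer.new(:de)
    de_tokenizer.tokenize('ich ging (nicht)?!')
    # => ["ich", "ging", "(", "nicht", ")", "?", "!"]

Note that the local variables tokens and token in tokenize are never used; only the output buffer contributes to the result.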
data/lib/tokenizer/version.rb
CHANGED
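The one-line change to version.rb is not rendered either; given the version bump in the page title and the metadata below, it is presumably just the constant update. A sketch, not the verbatim file:

    module Tokenizer
      VERSION = '0.1.0' # presumably bumped from '0.0.1.prealpha'
    end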
data/test/test_tokenizer.rb
CHANGED
@@ -0,0 +1,27 @@
+require 'test/unit'
+require 'tokenizer'
+
+class TestTokenizer < Test::Unit::TestCase
+
+  def setup
+    @de_tokenizer = Tokenizer::Tokenizer.new(:de)
+    @en_tokenizer = Tokenizer::Tokenizer.new(:en)
+    @fr_tokenizer = Tokenizer::Tokenizer.new(:fr)
+  end
+
+  def test_constants
+    assert(Tokenizer::VERSION.is_a?(String) && !Tokenizer::VERSION.empty?)
+  end
+
+  def test_output_type
+    output = @de_tokenizer.tokenize('ich gehe in die Schule')
+    assert(output.is_a?(Array))
+  end
+
+  def test_tokenization_001
+    input = 'ich ging? du, und ich nicht (konnte nicht)? Warum?!!'
+    etalon = %w{ich ging ? du , und ich nicht ( konnte nicht ) ? Warum ? ! !}
+    output = @de_tokenizer.tokenize(input)
+    assert_equal(etalon, output)
+  end
+end
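Because the suite is plain Test::Unit, which runs automatically at process exit, it can be executed without the deleted Rakefile, assuming lib/tokenizer.rb pulls in both the tokenizer class and the version constant:

    $ ruby -Ilib test/test_tokenizer.rb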
metadata
CHANGED
@@ -1,14 +1,8 @@
 --- !ruby/object:Gem::Specification
 name: tokenizer
 version: !ruby/object:Gem::Version
-
-
-  segments:
-  - 0
-  - 0
-  - 1
-  - prealpha
-  version: 0.0.1.prealpha
+  prerelease:
+  version: 0.1.0
 platform: ruby
 authors:
 - Andrei Beliankou
@@ -16,9 +10,31 @@ autorequire:
 bindir: bin
 cert_chain: []
 
-date: 2011-05-
-
-
+date: 2011-05-19 00:00:00 +02:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rdoc
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :development
+  version_requirements: *id001
+- !ruby/object:Gem::Dependency
+  name: rake
+  prerelease: false
+  requirement: &id002 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: "0"
+  type: :development
+  version_requirements: *id002
 description: A simple tokenizer for NLP tasks.
 email: a.belenkow@uni-trier.de
 executables:
@@ -27,21 +43,26 @@ extensions: []
 
 extra_rdoc_files:
 - README.rdoc
--
+- TODO.rdoc
+- LICENSE.rdoc
+- HISTORY.rdoc
 files:
+- lib/tokenizer.rb
+- lib/tokenizer/tokenizer.rb
 - lib/tokenizer/version.rb
-- bin/tokenize
 - README.rdoc
--
--
+- TODO.rdoc
+- LICENSE.rdoc
+- HISTORY.rdoc
 - test/test_tokenizer.rb
+- bin/tokenize
+has_rdoc: true
 homepage: http://www.uni-trier.de/index.php?id=34451
 licenses: []
 
 post_install_message:
-rdoc_options:
-
-- README.rdoc
+rdoc_options: []
+
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
@@ -49,26 +70,17 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-
-      segments:
-      - 1
-      - 8
-      version: "1.8"
+      version: "1.9"
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
-  - - "
+  - - ">="
     - !ruby/object:Gem::Version
-
-      segments:
-      - 1
-      - 3
-      - 1
-      version: 1.3.1
+      version: "0"
 requirements: []
 
 rubyforge_project: tokenizer
-rubygems_version: 1.
+rubygems_version: 1.5.2
 signing_key:
 specification_version: 3
 summary: Tokenizer is a linguistic tool intended to split a text into tokens.
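For readers more used to gemspec files than to serialized gem metadata: the two new development dependencies correspond to declarations like the following hypothetical snippet (the actual tokenizer.gemspec is not part of this diff):

    Gem::Specification.new do |s|
      s.name    = 'tokenizer'
      s.version = '0.1.0'
      # development dependencies recorded in the metadata above
      s.add_development_dependency 'rdoc', '>= 0'
      s.add_development_dependency 'rake', '>= 0'
    end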
data/Rakefile
DELETED
@@ -1,37 +0,0 @@
|
|
1
|
-
begin
|
2
|
-
require 'rake'
|
3
|
-
rescue LoadError
|
4
|
-
require 'rubygems'
|
5
|
-
require 'rake'
|
6
|
-
end
|
7
|
-
|
8
|
-
# we can require 'rake/clean' to add 'clobber' and 'clean' tasks
|
9
|
-
require 'rake/clean'
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
SRC = FileList['**/*.rb']
|
14
|
-
|
15
|
-
CLOBBER.include('doc', '**/*.html', '**/*.gem')
|
16
|
-
|
17
|
-
# testing
|
18
|
-
require 'rake/testtask'
|
19
|
-
Rake::TestTask.new do |t|
|
20
|
-
t.test_files = FileList.new('test/**/*.rb').to_a
|
21
|
-
end
|
22
|
-
|
23
|
-
# Build the gem package
|
24
|
-
load 'tokenizer.gemspec'
|
25
|
-
require 'rubygems/package_task'
|
26
|
-
Gem::PackageTask.new(GEMSPEC).define
|
27
|
-
|
28
|
-
# Generate documentation
|
29
|
-
require 'rdoc/task'
|
30
|
-
RDoc::Task.new do |rdoc|
|
31
|
-
rdoc.rdoc_files.include('README', 'LICENSE', 'lib/**/*')
|
32
|
-
end
|
33
|
-
|
34
|
-
desc "Open an irb session preloaded with this library."
|
35
|
-
task :console do
|
36
|
-
sh "irb -rubygems -I lib -r tokenizer.rb"
|
37
|
-
end
|
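With the Rakefile gone, the test, packaging, documentation and console tasks it defined are no longer available from rake. If a build file is reintroduced later, a minimal sketch mirroring just the deleted test task might look like:

    # Hypothetical replacement covering only the test task from the deleted Rakefile.
    require 'rake/testtask'

    Rake::TestTask.new do |t|
      t.libs << 'lib'
      t.test_files = FileList['test/**/*.rb']
    end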