tokenizer 0.1.1 → 0.1.2
- checksums.yaml +7 -0
- data/README.rdoc +31 -14
- data/bin/tokenize +1 -1
- data/lib/tokenizer/tokenizer.rb +39 -17
- data/lib/tokenizer/version.rb +1 -1
- data/test/{test_by_tokenizer_dev.rb → development_tests/test_by_tokenizer_dev.rb} +0 -0
- data/test/{test_de_tokenizer_dev.rb → development_tests/test_de_tokenizer_dev.rb} +45 -45
- data/test/{test_en_tokenizer_dev.rb → development_tests/test_en_tokenizer_dev.rb} +2 -0
- data/test/{test_fr_tokenizer_dev.rb → development_tests/test_fr_tokenizer_dev.rb} +0 -0
- data/test/{test_it_tokenizer_dev.rb → development_tests/test_it_tokenizer_dev.rb} +0 -0
- data/test/development_tests/test_parameters.rb +26 -0
- data/test/{test_ru_tokenizer_dev.rb → development_tests/test_ru_tokenizer_dev.rb} +0 -0
- data/test/{test_de_tokenizer.rb → regression_tests/test_de_tokenizer.rb} +5 -5
- metadata +52 -98
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: abb8db238956f4cbc491e75ec47f7994e572df1d
+  data.tar.gz: 4d69f5ae6fe9c6411b45946098700034c01152fe
+SHA512:
+  metadata.gz: fcf399eb94f200fa1a682dc64193fcb3bef391e2db7eece38f1603181d139368a2036e14058b59c26b75f4643517f10c5f4508a27830561ed197bfa941fa4ad3
+  data.tar.gz: 12f59222b26ec7987f971679b8d7be32fc86468b0741afed1b8607eb3049902082d93c614ce83358e6777a61fa4326c4bd11d5ebc9f12d751eb4fb11d678a8b7
data/README.rdoc
CHANGED
@@ -1,18 +1,29 @@
 = Tokenizer
 
-
-
-
-
+{RubyGems}[http://rubygems.org/gems/tokenizer] |
+{Homepage}[http://bu.chsta.be/projects/tokenizer] |
+{Source Code}[https://github.com/arbox/tokenizer] |
+{Bug Tracker}[https://github.com/arbox/tokenizer/issues]
+
+{<img src="https://img.shields.io/gem/v/tokenizer.svg" alt="Gem Version" />}[https://rubygems.org/gems/tokenizer]
+{<img src="https://img.shields.io/travis/arbox/tokenizer.svg" alt="Build Status" />}[https://travis-ci.org/arbox/tokenizer]
+{<img src="https://img.shields.io/codeclimate/github/arbox/tokenizer.svg" alt="Code Climate" />}[https://codeclimate.com/github/arbox/tokenizer]
+{<img src="https://img.shields.io/gemnasium/arbox/tokenizer.svg" alt="Dependency Status" />}[https://gemnasium.com/arbox/tokenizer]
 
 == DESCRIPTION
-A simple multilingual tokenizer -- a linguistic tool intended
-
-
-
-
+A simple multilingual tokenizer -- a linguistic tool intended to split a text
+into tokens for NLP tasks. This tool provides a CLI and a library for
+linguistic tokenization which is an anavoidable step for many HLT (Human
+Language Technology) tasks in the preprocessing phase for further syntactic,
+semantic and other higher level processing goals.
+
+Tokenization task involves Sentence Segmentation, Word Segmentation and Boundary
+Disambiguation for the both tasks.
+
+Use it for tokenization of German, English and Dutch texts.
 
-
+=== Implemented Algorithms
+to be ...
 
 == INSTALLATION
 +Tokenizer+ is provided as a .gem package. Simply install it via
@@ -26,7 +37,6 @@ If you want to do a system wide installation, do this as root
 
 Alternatively use your Gemfile for dependency management.
 
-
 == SYNOPSIS
 
 You can use +Tokenizer+ in two ways.
@@ -39,12 +49,19 @@ You can use +Tokenizer+ in two ways.
   $ de_tokenizer.tokenize('Ich gehe in die Schule!')
   $ => ["Ich", "gehe", "in", "die", "Schule", "!"]
 
+* Customizable PRE and POST list
+  $ require 'tokenizer'
+  $ de_tokenizer = Tokenizer::Tokenizer.new(:de, { POST: Tokenizer::Tokenizer::POST + ['|'] })
+  $ de_tokenizer.tokenize('Ich gehe|in die Schule!')
+  $ => ["Ich", "gehe", "|in", "die", "Schule", "!"]
+
 See documentation in the Tokenizer::Tokenizer class for details
 on particular methods.
 
 == SUPPORT
 
-If you have question, bug reports or any suggestions, please drop me an email :)
+If you have question, bug reports or any suggestions, please drop me an email :)
+Any help is deeply appreciated!
 
 == CHANGELOG
 For details on future plan and working progress see CHANGELOG.rdoc.
@@ -59,5 +76,5 @@ Please contact me with your suggestions, bug reports and feature requests.
 
 +Tokenizer+ is a copyrighted software by Andrei Beliankou, 2011-
 
-You may use, redistribute and change it under the terms
-
+You may use, redistribute and change it under the terms provided
+in the LICENSE.rdoc file.
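Taken together, the updated SYNOPSIS amounts to the following usage. This is an illustrative sketch based on the README text above (assuming the released 0.1.2 gem is installed), not an excerpt from the diff itself:

    # Minimal library usage per the README synopsis; assumes `gem install tokenizer`.
    require 'tokenizer'

    de_tokenizer = Tokenizer::Tokenizer.new(:de)
    p de_tokenizer.tokenize('Ich gehe in die Schule!')
    # README-documented result: ["Ich", "gehe", "in", "die", "Schule", "!"]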
data/bin/tokenize
CHANGED
data/lib/tokenizer/tokenizer.rb
CHANGED
@@ -1,42 +1,64 @@
 # -*- coding: utf-8 -*-
-
 # :title: A simple Tokenizer for NLP Tasks.
 # :main: README.rdoc
 
 # A namespace for all project related stuff.
 module Tokenizer
-
   class Tokenizer
     FS = Regexp.new('[[:blank:]]+')
-    # PRE = '[{(\\`"‚„†‡‹‘’“”•–—›'
-    # POST = %w| ] } ' ` " ) , ; : \ ! \ ? \ % ‚ „ … † ‡ ‰ ‹ ‘ ’ “ ” • – — › |
-    POST = %w{! ? , : ; . )}
-    PRE = %w{(}
 
-    def initialize(lang = :de)
+    # spanish marks
+    SIMPLE_PRE = []
+    PAIR_PRE = ['(', '{', '[', '<']
+    SIMPLE_POST = ['!', '?', ',', ':', ';', '.']
+    PAIR_POST = [')', '}', ']', '>']
+    PRE_N_POST = ['"', "'"]
+
+    PRE = SIMPLE_PRE + PAIR_PRE
+    POST = SIMPLE_POST + PAIR_POST
+
+    def initialize(lang = :de, options = {})
       @lang = lang
+      @options = {
+        pre: SIMPLE_PRE + PAIR_PRE,
+        post: SIMPLE_POST + PAIR_POST,
+        pre_n_post: PRE_N_POST
+      }.merge(options)
     end
 
     def tokenize(str)
-      tokens = []
-      token = ''
       output = ''
-      fields = str.chomp.split(FS)
+
+      fields = str.chomp.split(FS)
+
+      return [''] if fields.empty?
+
       fields.each do |field|
-        field.each_char do |ch|
-          if POST.include?(ch)
-            output << "\n#{ch}"
-          elsif PRE.include?(ch)
+        field.each_char.with_index do |ch, idx|
+          case
+          when @options[:pre].include?(ch)
             output << "#{ch}\n"
+          when @options[:post].include?(ch)
+            output << "\n#{ch}"
+            if ['?', '!', '.'].include?(ch)
+              output << "\n"
+            end
+          when @options[:pre_n_post].include?(ch)
+            if idx == 0
+              output << "#{ch}\n"
+            elsif idx != 0
+              output << "\n#{ch}"
+            end
           else
            output << ch
          end
        end
+
        output << "\n"
      end
-      output.split("\n")
-    end
 
+      # @TODO: Rework the format of the string!
+      output.chomp('').split("\n", -1)
+    end
   end # class
-
 end # module
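For readers skimming the hunk above: the new tokenize splits the input on blanks and then walks each field character by character, classifying every character against the configurable pre, post and pre_n_post lists. The standalone sketch below mirrors that logic for illustration only; it is a simplified rewrite, not the gem's source, and it leaves out the trailing-empty-string handling done by chomp('') and split("\n", -1) in the real method:

    # Illustrative re-implementation of the classification logic above (toy sketch).
    PRE        = ['(', '{', '[', '<']             # opening pairs: newline after
    POST       = ['!', '?', ',', ':', ';', '.',   # closing punctuation: newline before;
                  ')', '}', ']', '>']             # sentence enders also get one after
    PRE_N_POST = ['"', "'"]                       # ambiguous quotes: position decides

    def toy_tokenize(str)
      output = ''
      str.chomp.split(/[[:blank:]]+/).each do |field|
        field.each_char.with_index do |ch, idx|
          if PRE.include?(ch)
            output << "#{ch}\n"
          elsif POST.include?(ch)
            output << "\n#{ch}"
            output << "\n" if ['?', '!', '.'].include?(ch)
          elsif PRE_N_POST.include?(ch)
            output << (idx.zero? ? "#{ch}\n" : "\n#{ch}")
          else
            output << ch
          end
        end
        output << "\n"
      end
      output.split("\n")
    end

    p toy_tokenize('Ich gehe in die Schule!')
    # => ["Ich", "gehe", "in", "die", "Schule", "!"]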
data/lib/tokenizer/version.rb
CHANGED
File without changes
data/test/development_tests/test_de_tokenizer_dev.rb
CHANGED
@@ -19,262 +19,262 @@ class TestTokenizerDev < Test::Unit::TestCase
     etalon = %w{Die deutschen Umlaute und Sonderzeichen , wie in Mäuse , Scheiß und Tütchen , sind blöd !}
     compare(etalon, input)
   end
-
+
   def test_tokenization_003
     input = "Abkürzungen, wie z.B. usw. und d.h. können zu Problemem führen."
     etalon = %w{Abkürzungen , wie z.B. usw. und d.h. können zu Problemem führen .}
     compare(etalon, input)
   end
-
+
   def test_tokenization_004
     input = "Es gibt mehr als 1.023.345 Menschen in Deutschland, die keine Tausenderpunkte verstehen."
     etalon = %w{Es gibt mehr als 1.023.345 Menschen in Deutschland , die keine Tausenderpunkte verstehen .}
     compare(etalon, input)
   end
-
+
   def test_tokenization_005
     input = "Cocktails, wie Apfel-Martini, Rum-Kirsche-Cola und andere, bereiten nicht nur Menschen Probleme."
     etalon = %w{ Cocktails , wie Apfel-Martini , Rum-Kirsche-Cola und andere , bereiten nicht nur Menschen Probleme . }
     compare(etalon, input)
   end
-
+
   def test_tokenization_006
     input = 'Es gibt viele verschiedene Zeichen, die noch in Texten vorkommen können wie - zum Beispiel - diese hier "text" oder (text).'
     etalon = %w{Es gibt viele verschiedene Zeichen , die noch in Texten vorkommen können wie - zum Beispiel - diese hier " text " oder ( text ) .}
     compare(etalon, input)
   end
-
+
   def test_tokenization_007
     input = "Abkürzungen sind immer ein Problem, da auch Leerzeichen dazwischen stehen können, wie z. B. hier."
     etalon = ["Abkürzungen", "sind", "immer", "ein", "Problem", ",", "da", "auch", "Leerzeichen", "dazwischen", "stehen", "können", ",", "wie", "z. B.", "hier", "."]
     compare(etalon, input)
   end
-
+
   def test_tokenization_008
     input = "Außerdem kann es nach Abkürzungen und Satzenden auch mit Großschreibung weiter gehen, bei z.B. Aufzählungen."
     etalon = %w{Außerdem kann es nach Abkürzungen und Satzenden auch mit Großschreibung weiter gehen , bei z.B. Aufzählungen .}
     compare(etalon, input)
   end
-
+
   def test_tokenization_009
     input = "Ein weiteres Problem sind solche Getrennt- und Zusammenschreibungen."
     etalon = %w{Ein weiteres Problem sind solche Getrenntschreibungen und Zusammenschreibungen .}
     compare(etalon, input)
   end
-
+
   def test_tokenization_010
     input = "In manchen Texten gibt es auch Worttrennung am Zeilen- ende."
     etalon = %w{In manchen Texten gibt es auch Worttrennung am Zeilenende .}
     compare(etalon, input)
   end
-
+
   def test_tokenization_011 #Ellipsis
     input = "Der Satz endet in einer Ellips..."
     etalon = %w{ Der Satz endet in einer Ellips... } #die elliptischen Punkte sollten nicht vom Wort getrennt werden
     compare(etalon, input)
   end
-
+
   def test_tokenization_012 #Fehlende Leerzeichen
     input = "Der Satz endet.Das Leerzeichen fehlt."
     etalon = %w{ Der Satz endet . Das Leerzeichen fehlt . } #/\.\s(?=[A-Z])/ wuerde die Saetze nicht trennen
     compare(etalon, input)
   end
-
+
   def test_tokenization_013 #Bindestriche
     input = "Das Bindeglied - manisch-depressives Verhalten, binden-verbinden"
     etalon = %w{ Das Bindeglied - manisch-depressives Verhalten , binden - verbinden}
     compare(etalon, input)
   end
-
+
   def test_tokenization_014 #Abkuerzungen
     input = "Der Satz enthielt z.B. Fehler"
     etalon = %w{ Der Satz enthielt z.B. Fehler } #/\.\s(?=[A-Z])/ wuerde hinter Punkt den Satz beenden
     compare(etalon, input)
   end
-
+
   def test_tokenization_015 #Fehlende Grossbuchstaben
     input = "Der Satz endet. der Satz beginnt"
     etalon = %w{ Der Satz endet . der Satz beginnt } #/\.\s(?=[A-Z])/ wuerde die Saetze nicht trennen
     compare(etalon, input)
   end
-
+
   def test_tokenization_016 #Franzoesisch
     input = "L'art de l'univers, c'est un art"
     etalon = %w{ L' art de l' univers , c'est un art } #Kontrovers!
     compare(etalon, input)
   end
-
+
   def test_tokenization_017 #James Bond
     input = "Bond,... James Bond."
     etalon = %w{ Bond , ... James Bond . } #Kontrovers!
     compare(etalon, input)
   end
-
+
   def test_tokenization_018 #Inches
     input = "The square had four 9\" sides"
     etalon = %w{ The square had four 9" sides }
     compare(etalon, input)
   end
-
+
   def test_tokenization_019 #Abkuerzung zugleich Lexikon-Eintrag
     input = "In fig. 3, a fig can be seen. Fig. no. 4 shows no fig."
     etalon = %w{ In fig. 3 , a fig can be seen . Fig. no. 4 shows no fig . } #fig sowohl als Abkuerzung als auch als Wort
     compare(etalon, input)
   end
-
+
   def test_tokenization_020 #Leerzeichen-getrennte Zusammengehörigkeiten
     input = "They booked the flight New York-Los Angeles"
     etalon = ["They", "booked", "the", "flight", "New York", "-", "Los Angeles"] #oder mit Bindestrich verbunden
     compare(etalon, input)
   end
-
+
   def test_tokenization_021 #Ordinale
     input = "Der 1. Platz ging an den Sieger"
     etalon = %w{ Der 1. Platz ging an den Sieger }
     compare(etalon, input)
   end
-
+
   def test_tokenization_022 #Klitika
     input = "Er war's, stimmt's?"
     etalon = %w{ Er war es , stimmt es ? } #Kontrovers! Benoetigt komplexere Analyse
     compare(etalon, input)
   end
-
+
   def test_tokenization_023 #Datums- und Zeitangaben
     input = "Es passierte am 13. Januar 2011 um 12:13 Uhr"
     etalon = [ "Es", "passierte", "am", "13. Januar 2011", "um", "12:13 Uhr"]
     compare(etalon, input)
   end
-
+
   def test_tokenization_024 #Eingebettete Saetze
     input = "\"This is all?\" George asked."
     etalon = %w{ This is all ? George asked . } #kann zu ungrammatischen Saetzen fuehren
     compare(etalon, input)
   end
-
+
   def test_tokenization_025 #Eingebettete Saetze 2
     input = "\"Das ist alles?\" fragte sie."
     etalon = %w{ Das ist alles ? fragte sie . } #ungrammatischer Satz "fragte sie."
     compare(etalon, input)
   end
-
-
+
+
   def test_tokenization_026
     input = "Die deutschen Umlaute und Sonderzeichen, wie in Mäuse, Scheiß und Tütchen, sind blöd!"
     etalon = %w{ Die deutschen Umlaute und Sonderzeichen , wie in Mäuse , Scheiß und Tütchen , sind blöd ! }
     compare(etalon, input)
   end
-
+
   def test_tokenization_027
     input = "Abkürzungen, wie z.B. usw. und d.h. können zu Problemem führen."
     etalon = %w{ Abkürzungen , wie z.B. usw. und d.h. können zu Problemem führen . }
     compare(etalon, input)
   end
-
+
   def test_tokenization_028
     input = "Es gibt mehr als 1.023.345 Menschen in Deutschland, die keine Tausenderpunkte verstehen."
     etalon = %w{ Es gibt mehr als 1.023.345 Menschen in Deutschland , die keine Tausenderpunkte verstehen . }
     compare(etalon, input)
   end
-
+
   def test_tokenization_029
     input = "Cocktails, wie Apfel-Martini, Rum-Kirsche-Cola und andere, bereiten nicht nur Menschen Probleme."
     etalon = %w{ Cocktails , wie Apfel-Martini , Rum-Kirsche-Cola und andere , bereiten nicht nur Menschen Probleme . }
     compare(etalon, input)
   end
-
+
   def test_tokenization_030 #Ellipsis
     input = "Der Satz endet in einer Ellips..."
     etalon = %w{ Der Satz endet in einer Ellips... } #die elliptischen Punkte sollten nicht vom Wort getrennt werden
     compare(etalon, input)
   end
-
+
   def test_tokenization_031 #Fehlende Leerzeichen
     input = "Der Satz endet.Das Leerzeichen fehlt."
     etalon = %w{ Der Satz endet . Das Leerzeichen fehlt . } #/\.\s(?=[A-Z])/ wuerde die Saetze nicht trennen
     compare(etalon, input)
   end
-
+
   def test_tokenization_032 #Bindestriche
     input = "Das Bindeglied - manisch-depressives Verhalten, binden-verbinden"
     etalon = %w{ Das Bindeglied - manisch-depressives Verhalten , binden - verbinden}
     compare(etalon, input)
   end
-
+
   def test_tokenization_033 #Abkuerzungen
     input = "Der Satz enthielt z.B. Fehler"
     etalon = %w{ Der Satz enthielt z.B. Fehler } #/\.\s(?=[A-Z])/ wuerde hinter Punkt den Satz beenden
     compare(etalon, input)
   end
-
+
   def test_tokenization_034 #Fehlende Grossbuchstaben
     input = "Der Satz endet. der Satz beginnt"
     etalon = %w{ Der Satz endet . der Satz beginnt } #/\.\s(?=[A-Z])/ wuerde die Saetze nicht trennen
     compare(etalon, input)
   end
-
+
   def test_tokenization_035 #Franzoesisch
     input = "L'art de l'univers, c'est un art"
     etalon = %w{ L' art de l' univers , c'est un art } #Kontrovers!
     compare(etalon, input)
   end
-
+
   def test_tokenization_036 #James Bond
     input = "Bond,... James Bond."
     etalon = %w{ Bond , ... James Bond . } #Kontrovers!
     compare(etalon, input)
   end
-
+
   def test_tokenization_037 #Inches
     input = "The square had four 9\" sides"
     etalon = %w{ The square had four 9" sides }
     compare(etalon, input)
   end
-
+
   def test_tokenization_039 #Abkuerzung zugleich Lexikon-Eintrag
     input = "In fig. 3, a fig can be seen. Fig. no. 4 shows no fig."
     etalon = %w{ In fig. 3 , a fig can be seen . Fig. no. 4 shows no fig . } #fig sowohl als Abkuerzung als auch als Wort
     compare(etalon, input)
   end
-
+
   def test_tokenization_040 #Leerzeichen-getrennte Zusammengehörigkeiten
     input = "They booked the flight New York-Los Angeles"
     etalon = ["They", "booked", "the", "flight", "New York", "-", "Los Angeles"] #oder mit Bindestrich verbunden
     compare(etalon, input)
   end
-
+
   def test_tokenization_041 #Ordinale
     input = "Der 1. Platz ging an den Sieger"
     etalon = %w{ Der 1. Platz ging an den Sieger }
     compare(etalon, input)
   end
-
+
   def test_tokenization_042 #Klitika
     input = "Er war's, stimmt's?"
     etalon = %w{ Er war es , stimmt es ? } #Kontrovers! Benoetigt komplexere Analyse
     compare(etalon, input)
   end
-
+
   #Datums- und Zeitangaben
-  def test_tokenization_043
+  def test_tokenization_043
     input = "Es passierte am 13. Januar 2011 um 12:13 Uhr"
     etalon = ["Es", "passierte", "am", "13. Januar 2011", "um", "12:13 Uhr"]
     compare(etalon, input)
   end
-
+
   #Eingebettete Sätze
   def test_tokenization_044
     input = '"This is all?" George asked.'
     etalon = %w{ This is all ? George asked . } #kann zu ungrammatischen Saetzen fuehren
     compare(etalon, input)
   end
-
+
   def test_tokenization_046 #Eingebettete Saetze 2
     input = '"Das ist alles?" fragte sie.'
     etalon = %w{Das ist alles ? fragte sie .} #ungrammatischer Satz "fragte sie."
     compare(etalon, input)
   end
-
+
   private
   def compare(exp_result, input)
     act_result = @de_tokenizer.tokenize(input)
File without changes
File without changes
data/test/development_tests/test_parameters.rb
ADDED
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+require 'test/unit'
+require 'tokenizer'
+
+class TestTokenizerDev < Test::Unit::TestCase
+
+  def setup
+    @en_tokenizer = Tokenizer::Tokenizer.new(:en, {PRE: [], POST: ['|']})
+  end
+
+  def test_tokenization_001
+    result = @en_tokenizer.tokenize('testing| new')
+    assert_equal(['testing', '|', 'new', ''], result)
+  end
+
+  def test_tokenization_002
+    result = @en_tokenizer.tokenize('testing, new')
+    assert_equal(['testing,', 'new', ''], result)
+  end
+
+  private
+  def compare(exp_result, input)
+    act_result = @de_tokenizer.tokenize(input)
+    assert_equal(exp_result, act_result)
+  end
+end
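The new development test exercises the options hash introduced in Tokenizer::Tokenizer#initialize by swapping in custom PRE and POST lists. Outside the test harness the same idea reads as follows; this is a sketch, and the expected arrays are simply the ones asserted by the test above, not re-verified here:

    # Sketch mirroring test_parameters.rb; assumes the 0.1.2 gem is installed.
    require 'tokenizer'

    # Custom PRE/POST lists passed as options, exactly as in the test's setup method.
    en_tokenizer = Tokenizer::Tokenizer.new(:en, { PRE: [], POST: ['|'] })

    en_tokenizer.tokenize('testing| new')  # the test expects ['testing', '|', 'new', '']
    en_tokenizer.tokenize('testing, new')  # the test expects ['testing,', 'new', '']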
File without changes
data/test/regression_tests/test_de_tokenizer.rb
CHANGED
@@ -1,14 +1,14 @@
-require 'test/unit'
+require 'minitest/autorun'
 require 'tokenizer'
 
-class TestTokenizer < Test::Unit::TestCase
+class TestTokenizer < Minitest::Test
 
   def setup
     @de_tokenizer = Tokenizer::Tokenizer.new(:de)
   end
 
   def test_constants
-    assert(Tokenizer::VERSION.is_a?(String) && !
+    assert(Tokenizer::VERSION.is_a?(String) && !Tokenizer::VERSION.empty?)
   end
 
   def test_output_type
@@ -17,8 +17,8 @@ class TestTokenizer < Test::Unit::TestCase
   end
 
   def test_tokenization_001
-    input = '
-    etalon = %w
+    input = 'Ich ging in die Schule!'
+    etalon = %w(Ich ging in die Schule !)
     output = @de_tokenizer.tokenize(input)
     assert_equal(etalon, output)
   end
metadata
CHANGED
@@ -1,121 +1,75 @@
---- !ruby/object:Gem::Specification
+--- !ruby/object:Gem::Specification
 name: tokenizer
-version: !ruby/object:Gem::Version
-
-  version: 0.1.1
+version: !ruby/object:Gem::Version
+  version: 0.1.2
 platform: ruby
-authors:
+authors:
 - Andrei Beliankou
 autorequire:
 bindir: bin
 cert_chain: []
-
-
-
-
-
-
-
-
-
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: 3.9.1
-  type: :development
-  version_requirements: *id001
-- !ruby/object:Gem::Dependency
-  name: rake
-  prerelease: false
-  requirement: &id002 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - "="
-      - !ruby/object:Gem::Version
-        version: 0.8.7
-  type: :development
-  version_requirements: *id002
-- !ruby/object:Gem::Dependency
-  name: yard
-  prerelease: false
-  requirement: &id003 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: "0"
-  type: :development
-  version_requirements: *id003
-- !ruby/object:Gem::Dependency
-  name: bundler
-  prerelease: false
-  requirement: &id004 !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ">="
-      - !ruby/object:Gem::Version
-        version: "0"
-  type: :development
-  version_requirements: *id004
-description: A simple multilingual tokenizer for NLP tasks. This tool provides a CLI and a library for linguistic tokenization which is an anavoidable step for many HLT (human language technology) tasks in the preprocessing phase for further syntactic, semantic and other higher level processing goals. Use it for tokenization of German, English and French texts.
-email: a.belenkow@uni-trier.de
-executables:
+date: 2015-09-03 00:00:00.000000000 Z
+dependencies: []
+description: A simple multilingual tokenizer for NLP tasks. This tool provides a CLI
+  and a library for linguistic tokenization which is an anavoidable step for many
+  HLT (human language technology) tasks in the preprocessing phase for further syntactic,
+  semantic and other higher level processing goals. Use it for tokenization of German,
+  English and French texts.
+email: arbox@yandex.ru
+executables:
 - tokenize
 extensions: []
-
-
+extra_rdoc_files:
+- CHANGELOG.rdoc
+- LICENSE.rdoc
 - README.rdoc
+files:
+- ".yardopts"
 - CHANGELOG.rdoc
 - LICENSE.rdoc
-
+- README.rdoc
+- bin/tokenize
 - lib/tokenizer.rb
 - lib/tokenizer/tokenizer.rb
 - lib/tokenizer/version.rb
--
--
--
-- .
-- test/
-- test/
-- test/
-- test/
-
-- test/test_fr_tokenizer_dev.rb
-- test/test_ru_tokenizer_dev.rb
-- bin/tokenize
-has_rdoc: true
-homepage: http://www.uni-trier.de/index.php?id=34451
+- test/development_tests/test_by_tokenizer_dev.rb
+- test/development_tests/test_de_tokenizer_dev.rb
+- test/development_tests/test_en_tokenizer_dev.rb
+- test/development_tests/test_fr_tokenizer_dev.rb
+- test/development_tests/test_it_tokenizer_dev.rb
+- test/development_tests/test_parameters.rb
+- test/development_tests/test_ru_tokenizer_dev.rb
+- test/regression_tests/test_de_tokenizer.rb
+homepage: https://github.com/arbox/tokenizer
 licenses: []
-
+metadata: {}
 post_install_message:
 rdoc_options: []
-
-require_paths:
+require_paths:
 - lib
-required_ruby_version: !ruby/object:Gem::Requirement
-
-  requirements:
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
   - - ">="
-    - !ruby/object:Gem::Version
-      version: 1.
-required_rubygems_version: !ruby/object:Gem::Requirement
-
-  requirements:
+    - !ruby/object:Gem::Version
+      version: 1.9.3
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
   - - ">="
-    - !ruby/object:Gem::Version
-      version:
+    - !ruby/object:Gem::Version
+      version: '0'
 requirements: []
-
 rubyforge_project: tokenizer
-rubygems_version:
+rubygems_version: 2.4.5
 signing_key:
-specification_version:
-summary: Tokenizer is a
-test_files:
-- test/
-- test/test_de_tokenizer_dev.rb
-- test/
-- test/
-- test/test_it_tokenizer_dev.rb
-- test/
-- test/test_ru_tokenizer_dev.rb
+specification_version: 4
+summary: Tokenizer is a tool intended to split a text into tokens.
+test_files:
+- test/development_tests/test_by_tokenizer_dev.rb
+- test/development_tests/test_de_tokenizer_dev.rb
+- test/development_tests/test_en_tokenizer_dev.rb
+- test/development_tests/test_fr_tokenizer_dev.rb
+- test/development_tests/test_it_tokenizer_dev.rb
+- test/development_tests/test_parameters.rb
+- test/development_tests/test_ru_tokenizer_dev.rb
+- test/regression_tests/test_de_tokenizer.rb
+has_rdoc: