pragmatic_tokenizer 1.4.0 → 1.5.0
- checksums.yaml +4 -4
- data/.rubocop.yml +184 -0
- data/.rubocop_todo.yml +66 -0
- data/README.md +0 -7
- data/Rakefile +1 -1
- data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
- data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
- data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages.rb +28 -28
- data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
- data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
- data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +1 -0
- data/spec/languages/bulgarian_spec.rb +17 -13
- data/spec/languages/deutsch_spec.rb +110 -86
- data/spec/languages/english_spec.rb +465 -342
- data/spec/languages/french_spec.rb +3 -2
- data/spec/performance_spec.rb +7 -7
- data/spec/pragmatic_tokenizer_spec.rb +8 -8
- metadata +18 -2
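
The hunks below are from data/spec/languages/deutsch_spec.rb. The 1.5.0 refactor reformats each tokenizer call so that the input text and every option sit on their own line, and rewrites expected token arrays as %w() literals where possible. As a minimal sketch of the call style the updated specs exercise (the input string and the combination of option values here are illustrative, borrowed from the tests below):

    require 'pragmatic_tokenizer'

    # Tokenize a German string; the option names are the ones the specs pass.
    pt = PragmaticTokenizer::Tokenizer.new(
      "Die größte Ausdehnung des Landes beträgt 650 km.",
      language:          'de',
      punctuation:       'none',
      remove_stop_words: true
    )
    pt.tokenize # => array of token strings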
@@ -9,189 +9,210 @@ describe PragmaticTokenizer do
 
   it 'tokenizes a string #002' do
     text = 'Die größte Ausdehnung des Landes vom Westen nach Osten beträgt 650 km – von Nord nach Süd sind es 560 km. Unter den europäischen Staaten ist Weißrussland flächenmäßig an 13'
-    expect(PragmaticTokenizer::Tokenizer.new(
-
-
-
-
-
-
+    expect(PragmaticTokenizer::Tokenizer.new(
+      text,
+      language: 'de',
+      downcase: false,
+      remove_stop_words: true,
+      punctuation: 'none',
+      numbers: :none
+    ).tokenize).to eq(%w(größte Ausdehnung Landes Westen Osten beträgt Nord Süd europäischen Staaten Weißrussland flächenmäßig))
   end
 
   it 'tokenizes a string #003' do
     text = 'Die weißrussischen offiziellen Stellen wie auch die deutsche Diplomatie verwenden in offiziellen deutschsprachigen Texten den Namen Belarus, um die Unterscheidung von Russland zu verdeutlichen.'
-    expect(PragmaticTokenizer::Tokenizer.new(
-
-
+    expect(PragmaticTokenizer::Tokenizer.new(
+      text,
+      language: 'de',
+      downcase: false
     ).tokenize).to eq(["Die", "weißrussischen", "offiziellen", "Stellen", "wie", "auch", "die", "deutsche", "Diplomatie", "verwenden", "in", "offiziellen", "deutschsprachigen", "Texten", "den", "Namen", "Belarus", ",", "um", "die", "Unterscheidung", "von", "Russland", "zu", "verdeutlichen", "."])
   end
 
   it 'tokenizes a string #004' do
     text = 'der Kaffee-Ersatz'
-    expect(PragmaticTokenizer::Tokenizer.new(
-
-
+    expect(PragmaticTokenizer::Tokenizer.new(
+      text,
+      language: 'de',
+      downcase: false
     ).tokenize).to eq(['der', 'Kaffee-Ersatz'])
   end
 
   it 'tokenizes a string #005' do
     text = "Charlie Hebdo backlash over 'racist' Alan Kurdi cartoon - https://t.co/J8N2ylVV3w"
-    expect(PragmaticTokenizer::Tokenizer.new(
-
+    expect(PragmaticTokenizer::Tokenizer.new(
+      text,
+      language: 'de'
     ).tokenize).to eq(["charlie", "hebdo", "backlash", "over", "'", "racist", "'", "alan", "kurdi", "cartoon", "-", "https://t.co/j8n2ylvv3w"])
   end
 
   it 'handles words with a slash 1' do
     text = "We pay 3000 €/month"
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      language: 'de'
     )
     expect(pt.tokenize).to eq(["we", "pay", "3000", "€", "month"])
   end
 
   it 'handles words with a slash 2' do
     text = "Ich frage mich, wieso er nicht Herr der Lage war/ist."
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      language: 'de'
     )
-    expect(pt.tokenize).to eq(
+    expect(pt.tokenize).to eq(%w(ich frage mich wieso er nicht herr der lage war ist))
   end
 
   it 'handles words with a slash 3' do
     text = "Poison gas attack in Ghuta/Syria."
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      language: 'de'
     )
-    expect(pt.tokenize).to eq(
+    expect(pt.tokenize).to eq(%w(poison gas attack in ghuta syria))
   end
 
   it 'handles words with a question mark' do
     text = "Essen á la carte?Man ist versucht…"
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      language: 'de'
     )
-    expect(pt.tokenize).to eq(
+    expect(pt.tokenize).to eq(%w(essen á la carte man ist versucht))
   end
 
   it 'handles apostrophes and quotes 3' do
     text = "Die “Mitte der Gesellschaft” interessiert sich jetzt für “Feminismus”."
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      language: 'de'
     )
-    expect(pt.tokenize).to eq(
+    expect(pt.tokenize).to eq(%w(die mitte der gesellschaft interessiert sich jetzt für feminismus))
   end
 
   it 'handles mentions 1' do
     text = "@RainerSteinke @_Sternchen_2015 1:0 für dich."
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      language: 'de'
     )
     expect(pt.tokenize).to eq(["@rainersteinke", "@_sternchen_2015", "1:0", "für", "dich"])
   end
 
   it 'handles mentions 2' do
     text = "@LandauDaniel @AnthZeto @julianfranz @S_Beck19 Yep!"
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      language: 'de'
     )
     expect(pt.tokenize).to eq(["@landaudaniel", "@anthzeto", "@julianfranz", "@s_beck19", "yep"])
   end
 
   it 'handles old school emoticons 1' do
     text = "du übertreibst maßlos :D"
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      downcase: false,
+      language: 'de'
     )
     expect(pt.tokenize).to eq(["du", "übertreibst", "maßlos", ":D"])
   end
 
   it 'handles words with a symbol suffix' do
     text = "hier ist ein Whirlpool versteckt^^"
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      language: 'de'
     )
-    expect(pt.tokenize).to eq(
+    expect(pt.tokenize).to eq(%w(hier ist ein whirlpool versteckt))
   end
 
   it 'handles hashtags 1' do
     text = "„Was wir tun wird in diesem Land Leben retten“:#Obama"
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      language: 'de'
     )
     expect(pt.tokenize).to eq(["was", "wir", "tun", "wird", "in", "diesem", "land", "leben", "retten", "#obama"])
   end
 
   it 'handles numbers and words' do
     text = "Air Force Once ist 18.270-mal abgehoben."
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      language: 'de'
     )
     expect(pt.tokenize).to eq(["air", "force", "once", "ist", "18.270-mal", "abgehoben"])
   end
 
   it 'maintains the german gender-neutrality form 2' do
     text = "der/die Lehrer_in und seine/ihre Schüler_innen"
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      language: 'de'
    )
-    expect(pt.tokenize).to eq(
+    expect(pt.tokenize).to eq(%w(der die lehrer_in und seine ihre schüler_innen))
   end
 
   it 'handles contractions 1' do
     text = "gibt's"
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      expand_contractions: true,
+      language: 'de'
     )
-    expect(pt.tokenize).to eq(
+    expect(pt.tokenize).to eq(%w(gibt es))
   end
 
   it 'handles contractions 2' do
     text = "gibt‘s schaut’s wenn's g›spür find´s"
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      expand_contractions: true,
+      language: 'de'
     )
-    expect(pt.tokenize).to eq(
+    expect(pt.tokenize).to eq(%w(gibt es schaut es wenn es gespür finde es))
   end
 
   it 'removes English stopwords' do
     text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      filter_languages: [:en],
+      remove_stop_words: true,
+      language: 'de'
     )
     expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."])
   end
 
-
+  it 'removes English and German stopwords' do
     text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      filter_languages: [:en, :de],
+      remove_stop_words: true,
+      language: 'de'
     )
     expect(pt.tokenize).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
   end
 
   it 'does not remove English stopwords' do
     text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
-    pt = PragmaticTokenizer::Tokenizer.new(
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      language: 'de'
     )
     expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "this", "has", "some", "english", "."])
   end
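
The last three tests in this hunk contrast remove_stop_words with filter_languages: only the stop-word lists for the languages named in filter_languages are applied. A short sketch of the difference, with the options and expected outputs copied from the specs above:

    text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."

    # English stop-word list only: the German stop words survive.
    PragmaticTokenizer::Tokenizer.new(
      text,
      filter_languages:  [:en],
      remove_stop_words: true,
      language:          'de'
    ).tokenize
    # => ["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."]

    # English and German lists together: only the content words remain.
    PragmaticTokenizer::Tokenizer.new(
      text,
      filter_languages:  [:en, :de],
      remove_stop_words: true,
      language:          'de'
    ).tokenize
    # => ["lehrer_in", "schüler_innen", ".", "english", "."]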
@@ -201,9 +222,10 @@ describe PragmaticTokenizer do
   it 'maintains the german gender-neutrality form 1' do
     skip "NOT IMPLEMENTED"
     text = "Wir brauchen eine/n erfahrene/n Informatiker/in."
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      language: 'de'
     )
     expect(pt.tokenize).to eq(["wir", "brauchen", "eine/n", "erfahrene/n", "informatiker/in"])
   end
@@ -211,17 +233,19 @@ describe PragmaticTokenizer do
   it 'handles apostrophes and quotes 4' do
     skip "NOT IMPLEMENTED"
     text = "Endlich regnet es ihm nicht mehr auf ́s Haupt!"
-    pt = PragmaticTokenizer::Tokenizer.new(
-
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      punctuation: 'none',
+      language: 'de'
     )
-    expect(pt.tokenize).to eq(
+    expect(pt.tokenize).to eq(%w(endlich regnet es ihm nicht mehr auf́s haupt))
   end
 
   it 'handles abrreviations for languages other than English' do
     text = "Adj. Smith how are ü. today."
-    pt = PragmaticTokenizer::Tokenizer.new(
-
+    pt = PragmaticTokenizer::Tokenizer.new(
+      text,
+      language: :de
     )
     expect(pt.tokenize).to eq(["adj", ".", "smith", "how", "are", "ü.", "today", "."])
   end
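
Note that the final test passes language: :de as a symbol where the other tests pass the string 'de'; the suite exercises both forms. For completeness, a sketch of the contraction expansion tested in the first hunk, with the expected output taken directly from the spec:

    PragmaticTokenizer::Tokenizer.new(
      "gibt's",
      expand_contractions: true,
      language: 'de'
    ).tokenize
    # => ["gibt", "es"]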