pragmatic_tokenizer 1.4.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +184 -0
- data/.rubocop_todo.yml +66 -0
- data/README.md +0 -7
- data/Rakefile +1 -1
- data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
- data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
- data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
- data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages.rb +28 -28
- data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
- data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
- data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +1 -0
- data/spec/languages/bulgarian_spec.rb +17 -13
- data/spec/languages/deutsch_spec.rb +110 -86
- data/spec/languages/english_spec.rb +465 -342
- data/spec/languages/french_spec.rb +3 -2
- data/spec/performance_spec.rb +7 -7
- data/spec/pragmatic_tokenizer_spec.rb +8 -8
- metadata +18 -2
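
Much of this release restyles the spec suite: tokenizer options are now passed one per line as keyword arguments to the constructor. A minimal sketch of the call pattern the updated German specs use (the text, options, and expected tokens are taken directly from spec #002 in the hunk below):

    require 'pragmatic_tokenizer'

    text = 'Die größte Ausdehnung des Landes vom Westen nach Osten beträgt 650 km – von Nord nach Süd sind es 560 km. Unter den europäischen Staaten ist Weißrussland flächenmäßig an 13'
    # Text goes to the constructor along with an options hash; #tokenize returns an array of tokens.
    tokens = PragmaticTokenizer::Tokenizer.new(
        text,
        language:          'de',
        downcase:          false,
        remove_stop_words: true,
        punctuation:       'none',
        numbers:           :none
    ).tokenize
    # Per the spec, this yields:
    # ["größte", "Ausdehnung", "Landes", "Westen", "Osten", "beträgt", "Nord", "Süd",
    #  "europäischen", "Staaten", "Weißrussland", "flächenmäßig"]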
data/spec/languages/deutsch_spec.rb

@@ -9,189 +9,210 @@ describe PragmaticTokenizer do

  it 'tokenizes a string #002' do
    text = 'Die größte Ausdehnung des Landes vom Westen nach Osten beträgt 650 km – von Nord nach Süd sind es 560 km. Unter den europäischen Staaten ist Weißrussland flächenmäßig an 13'
-   expect(PragmaticTokenizer::Tokenizer.new(
-
-
-
-
-
-
+   expect(PragmaticTokenizer::Tokenizer.new(
+       text,
+       language: 'de',
+       downcase: false,
+       remove_stop_words: true,
+       punctuation: 'none',
+       numbers: :none
+   ).tokenize).to eq(%w(größte Ausdehnung Landes Westen Osten beträgt Nord Süd europäischen Staaten Weißrussland flächenmäßig))
  end

  it 'tokenizes a string #003' do
    text = 'Die weißrussischen offiziellen Stellen wie auch die deutsche Diplomatie verwenden in offiziellen deutschsprachigen Texten den Namen Belarus, um die Unterscheidung von Russland zu verdeutlichen.'
-   expect(PragmaticTokenizer::Tokenizer.new(
-
-
+   expect(PragmaticTokenizer::Tokenizer.new(
+       text,
+       language: 'de',
+       downcase: false
    ).tokenize).to eq(["Die", "weißrussischen", "offiziellen", "Stellen", "wie", "auch", "die", "deutsche", "Diplomatie", "verwenden", "in", "offiziellen", "deutschsprachigen", "Texten", "den", "Namen", "Belarus", ",", "um", "die", "Unterscheidung", "von", "Russland", "zu", "verdeutlichen", "."])
  end

  it 'tokenizes a string #004' do
    text = 'der Kaffee-Ersatz'
-   expect(PragmaticTokenizer::Tokenizer.new(
-
-
+   expect(PragmaticTokenizer::Tokenizer.new(
+       text,
+       language: 'de',
+       downcase: false
    ).tokenize).to eq(['der', 'Kaffee-Ersatz'])
  end

  it 'tokenizes a string #005' do
    text = "Charlie Hebdo backlash over 'racist' Alan Kurdi cartoon - https://t.co/J8N2ylVV3w"
-   expect(PragmaticTokenizer::Tokenizer.new(
-
+   expect(PragmaticTokenizer::Tokenizer.new(
+       text,
+       language: 'de'
    ).tokenize).to eq(["charlie", "hebdo", "backlash", "over", "'", "racist", "'", "alan", "kurdi", "cartoon", "-", "https://t.co/j8n2ylvv3w"])
  end

  it 'handles words with a slash 1' do
    text = "We pay 3000 €/month"
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       language: 'de'
    )
    expect(pt.tokenize).to eq(["we", "pay", "3000", "€", "month"])
  end

  it 'handles words with a slash 2' do
    text = "Ich frage mich, wieso er nicht Herr der Lage war/ist."
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       language: 'de'
    )
-   expect(pt.tokenize).to eq(
+   expect(pt.tokenize).to eq(%w(ich frage mich wieso er nicht herr der lage war ist))
  end

  it 'handles words with a slash 3' do
    text = "Poison gas attack in Ghuta/Syria."
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       language: 'de'
    )
-   expect(pt.tokenize).to eq(
+   expect(pt.tokenize).to eq(%w(poison gas attack in ghuta syria))
  end

  it 'handles words with a question mark' do
    text = "Essen á la carte?Man ist versucht…"
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       language: 'de'
    )
-   expect(pt.tokenize).to eq(
+   expect(pt.tokenize).to eq(%w(essen á la carte man ist versucht))
  end

  it 'handles apostrophes and quotes 3' do
    text = "Die “Mitte der Gesellschaft” interessiert sich jetzt für “Feminismus”."
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       language: 'de'
    )
-   expect(pt.tokenize).to eq(
+   expect(pt.tokenize).to eq(%w(die mitte der gesellschaft interessiert sich jetzt für feminismus))
  end

  it 'handles mentions 1' do
    text = "@RainerSteinke @_Sternchen_2015 1:0 für dich."
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       language: 'de'
    )
    expect(pt.tokenize).to eq(["@rainersteinke", "@_sternchen_2015", "1:0", "für", "dich"])
  end

  it 'handles mentions 2' do
    text = "@LandauDaniel @AnthZeto @julianfranz @S_Beck19 Yep!"
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       language: 'de'
    )
    expect(pt.tokenize).to eq(["@landaudaniel", "@anthzeto", "@julianfranz", "@s_beck19", "yep"])
  end

  it 'handles old school emoticons 1' do
    text = "du übertreibst maßlos :D"
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       downcase: false,
+       language: 'de'
    )
    expect(pt.tokenize).to eq(["du", "übertreibst", "maßlos", ":D"])
  end

  it 'handles words with a symbol suffix' do
    text = "hier ist ein Whirlpool versteckt^^"
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       language: 'de'
    )
-   expect(pt.tokenize).to eq(
+   expect(pt.tokenize).to eq(%w(hier ist ein whirlpool versteckt))
  end

  it 'handles hashtags 1' do
    text = "„Was wir tun wird in diesem Land Leben retten“:#Obama"
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       language: 'de'
    )
    expect(pt.tokenize).to eq(["was", "wir", "tun", "wird", "in", "diesem", "land", "leben", "retten", "#obama"])
  end

  it 'handles numbers and words' do
    text = "Air Force Once ist 18.270-mal abgehoben."
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       language: 'de'
    )
    expect(pt.tokenize).to eq(["air", "force", "once", "ist", "18.270-mal", "abgehoben"])
  end

  it 'maintains the german gender-neutrality form 2' do
    text = "der/die Lehrer_in und seine/ihre Schüler_innen"
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       language: 'de'
    )
-   expect(pt.tokenize).to eq(
+   expect(pt.tokenize).to eq(%w(der die lehrer_in und seine ihre schüler_innen))
  end

  it 'handles contractions 1' do
    text = "gibt's"
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       expand_contractions: true,
+       language: 'de'
    )
-   expect(pt.tokenize).to eq(
+   expect(pt.tokenize).to eq(%w(gibt es))
  end

  it 'handles contractions 2' do
    text = "gibt‘s schaut’s wenn's g›spür find´s"
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       expand_contractions: true,
+       language: 'de'
    )
-   expect(pt.tokenize).to eq(
+   expect(pt.tokenize).to eq(%w(gibt es schaut es wenn es gespür finde es))
  end

  it 'removes English stopwords' do
    text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       filter_languages: [:en],
+       remove_stop_words: true,
+       language: 'de'
    )
    expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."])
  end

-
+  it 'removes English and German stopwords' do
    text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       filter_languages: [:en, :de],
+       remove_stop_words: true,
+       language: 'de'
    )
    expect(pt.tokenize).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
  end

  it 'does not remove English stopwords' do
    text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
-   pt = PragmaticTokenizer::Tokenizer.new(
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       language: 'de'
    )
    expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "this", "has", "some", "english", "."])
  end

@@ -201,9 +222,10 @@ describe PragmaticTokenizer do
  it 'maintains the german gender-neutrality form 1' do
    skip "NOT IMPLEMENTED"
    text = "Wir brauchen eine/n erfahrene/n Informatiker/in."
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       language: 'de'
    )
    expect(pt.tokenize).to eq(["wir", "brauchen", "eine/n", "erfahrene/n", "informatiker/in"])
  end

@@ -211,17 +233,19 @@ describe PragmaticTokenizer do
  it 'handles apostrophes and quotes 4' do
    skip "NOT IMPLEMENTED"
    text = "Endlich regnet es ihm nicht mehr auf ́s Haupt!"
-   pt = PragmaticTokenizer::Tokenizer.new(
-
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       punctuation: 'none',
+       language: 'de'
    )
-   expect(pt.tokenize).to eq(
+   expect(pt.tokenize).to eq(%w(endlich regnet es ihm nicht mehr auf́s haupt))
  end

  it 'handles abrreviations for languages other than English' do
    text = "Adj. Smith how are ü. today."
-   pt = PragmaticTokenizer::Tokenizer.new(
-
+   pt = PragmaticTokenizer::Tokenizer.new(
+       text,
+       language: :de
    )
    expect(pt.tokenize).to eq(["adj", ".", "smith", "how", "are", "ü.", "today", "."])
  end
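
The updated specs also exercise cross-language stop-word filtering. A minimal sketch of that option combination, with the input and expected output taken from the 'removes English and German stopwords' example above:

    require 'pragmatic_tokenizer'

    text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
    # filter_languages adds the listed languages' stop-word lists on top of the base language.
    pt = PragmaticTokenizer::Tokenizer.new(
        text,
        filter_languages:  [:en, :de],
        remove_stop_words: true,
        language:          'de'
    )
    pt.tokenize
    # => ["lehrer_in", "schüler_innen", ".", "english", "."]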