pragmatic_tokenizer 1.4.0 → 1.5.0

Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +184 -0
  3. data/.rubocop_todo.yml +66 -0
  4. data/README.md +0 -7
  5. data/Rakefile +1 -1
  6. data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
  7. data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
  8. data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
  9. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
  10. data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
  11. data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
  12. data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
  13. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
  15. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  16. data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
  17. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
  19. data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  22. data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
  23. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  25. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  26. data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
  27. data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
  28. data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
  29. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  30. data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
  31. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  32. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  33. data/lib/pragmatic_tokenizer/languages.rb +28 -28
  34. data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
  35. data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
  36. data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
  37. data/lib/pragmatic_tokenizer/version.rb +1 -1
  38. data/pragmatic_tokenizer.gemspec +1 -0
  39. data/spec/languages/bulgarian_spec.rb +17 -13
  40. data/spec/languages/deutsch_spec.rb +110 -86
  41. data/spec/languages/english_spec.rb +465 -342
  42. data/spec/languages/french_spec.rb +3 -2
  43. data/spec/performance_spec.rb +7 -7
  44. data/spec/pragmatic_tokenizer_spec.rb +8 -8
  45. metadata +18 -2
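
The diff excerpted below is from the German spec (data/spec/languages/deutsch_spec.rb). Two mechanical changes recur throughout, in line with the RuboCop configuration added in this release (.rubocop.yml): the text argument to PragmaticTokenizer::Tokenizer.new moves onto its own line, and string-array expectations containing only bare words become %w() literals. A minimal before/after sketch of the pattern, reusing the "handles words with a slash 1" example from the hunks below (indentation is approximate; the calls themselves are taken verbatim from the diff):

    text = "We pay 3000 €/month"

    # 1.4.0 style: text shares a line with the method call, options trail behind
    pt = PragmaticTokenizer::Tokenizer.new(text,
      punctuation: 'none',
      language: 'de'
    )

    # 1.5.0 style: every argument, including text, sits on its own line
    pt = PragmaticTokenizer::Tokenizer.new(
      text,
      punctuation: 'none',
      language: 'de'
    )

    pt.tokenize
    # => ["we", "pay", "3000", "€", "month"]
    # %w(we pay 3000 € month) is the same array written as a word list.

Both styles construct an identical tokenizer; only the layout changes, so these hunks should not alter behavior.
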
@@ -9,189 +9,210 @@ describe PragmaticTokenizer do
 
     it 'tokenizes a string #002' do
       text = 'Die größte Ausdehnung des Landes vom Westen nach Osten beträgt 650 km – von Nord nach Süd sind es 560 km. Unter den europäischen Staaten ist Weißrussland flächenmäßig an 13'
-      expect(PragmaticTokenizer::Tokenizer.new(text,
-        language: 'de',
-        downcase: false,
-        remove_stop_words: true,
-        punctuation: 'none',
-        numbers: :none
-      ).tokenize).to eq(["größte", "Ausdehnung", "Landes", "Westen", "Osten", "beträgt", "Nord", "Süd", "europäischen", "Staaten", "Weißrussland", "flächenmäßig"])
+      expect(PragmaticTokenizer::Tokenizer.new(
+        text,
+        language: 'de',
+        downcase: false,
+        remove_stop_words: true,
+        punctuation: 'none',
+        numbers: :none
+      ).tokenize).to eq(%w(größte Ausdehnung Landes Westen Osten beträgt Nord Süd europäischen Staaten Weißrussland flächenmäßig))
     end
 
     it 'tokenizes a string #003' do
       text = 'Die weißrussischen offiziellen Stellen wie auch die deutsche Diplomatie verwenden in offiziellen deutschsprachigen Texten den Namen Belarus, um die Unterscheidung von Russland zu verdeutlichen.'
-      expect(PragmaticTokenizer::Tokenizer.new(text,
-        language: 'de',
-        downcase: false
+      expect(PragmaticTokenizer::Tokenizer.new(
+        text,
+        language: 'de',
+        downcase: false
       ).tokenize).to eq(["Die", "weißrussischen", "offiziellen", "Stellen", "wie", "auch", "die", "deutsche", "Diplomatie", "verwenden", "in", "offiziellen", "deutschsprachigen", "Texten", "den", "Namen", "Belarus", ",", "um", "die", "Unterscheidung", "von", "Russland", "zu", "verdeutlichen", "."])
     end
 
     it 'tokenizes a string #004' do
       text = 'der Kaffee-Ersatz'
-      expect(PragmaticTokenizer::Tokenizer.new(text,
-        language: 'de',
-        downcase: false
+      expect(PragmaticTokenizer::Tokenizer.new(
+        text,
+        language: 'de',
+        downcase: false
       ).tokenize).to eq(['der', 'Kaffee-Ersatz'])
     end
 
     it 'tokenizes a string #005' do
       text = "Charlie Hebdo backlash over 'racist' Alan Kurdi cartoon - https://t.co/J8N2ylVV3w"
-      expect(PragmaticTokenizer::Tokenizer.new(text,
-        language: 'de',
+      expect(PragmaticTokenizer::Tokenizer.new(
+        text,
+        language: 'de'
       ).tokenize).to eq(["charlie", "hebdo", "backlash", "over", "'", "racist", "'", "alan", "kurdi", "cartoon", "-", "https://t.co/j8n2ylvv3w"])
     end
 
     it 'handles words with a slash 1' do
       text = "We pay 3000 €/month"
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        language: 'de'
       )
       expect(pt.tokenize).to eq(["we", "pay", "3000", "€", "month"])
     end
 
     it 'handles words with a slash 2' do
       text = "Ich frage mich, wieso er nicht Herr der Lage war/ist."
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        language: 'de'
       )
-      expect(pt.tokenize).to eq(["ich", "frage", "mich", "wieso", "er", "nicht", "herr", "der", "lage", "war", "ist"])
+      expect(pt.tokenize).to eq(%w(ich frage mich wieso er nicht herr der lage war ist))
     end
 
     it 'handles words with a slash 3' do
       text = "Poison gas attack in Ghuta/Syria."
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        language: 'de'
       )
-      expect(pt.tokenize).to eq(["poison", "gas", "attack", "in", "ghuta", "syria"])
+      expect(pt.tokenize).to eq(%w(poison gas attack in ghuta syria))
     end
 
     it 'handles words with a question mark' do
       text = "Essen á la carte?Man ist versucht…"
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        language: 'de'
       )
-      expect(pt.tokenize).to eq(["essen", "á", "la", "carte", "man", "ist", "versucht"])
+      expect(pt.tokenize).to eq(%w(essen á la carte man ist versucht))
     end
 
     it 'handles apostrophes and quotes 3' do
       text = "Die “Mitte der Gesellschaft” interessiert sich jetzt für “Feminismus”."
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        language: 'de'
       )
-      expect(pt.tokenize).to eq(["die", "mitte", "der", "gesellschaft", "interessiert", "sich", "jetzt", "für", "feminismus"])
+      expect(pt.tokenize).to eq(%w(die mitte der gesellschaft interessiert sich jetzt für feminismus))
     end
 
     it 'handles mentions 1' do
       text = "@RainerSteinke @_Sternchen_2015 1:0 für dich."
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        language: 'de'
       )
       expect(pt.tokenize).to eq(["@rainersteinke", "@_sternchen_2015", "1:0", "für", "dich"])
     end
 
     it 'handles mentions 2' do
       text = "@LandauDaniel @AnthZeto @julianfranz @S_Beck19 Yep!"
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        language: 'de'
       )
       expect(pt.tokenize).to eq(["@landaudaniel", "@anthzeto", "@julianfranz", "@s_beck19", "yep"])
     end
 
     it 'handles old school emoticons 1' do
       text = "du übertreibst maßlos :D"
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        downcase: false,
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        downcase: false,
+        language: 'de'
       )
       expect(pt.tokenize).to eq(["du", "übertreibst", "maßlos", ":D"])
     end
 
     it 'handles words with a symbol suffix' do
       text = "hier ist ein Whirlpool versteckt^^"
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        language: 'de'
      )
-      expect(pt.tokenize).to eq(["hier", "ist", "ein", "whirlpool", "versteckt"])
+      expect(pt.tokenize).to eq(%w(hier ist ein whirlpool versteckt))
     end
 
     it 'handles hashtags 1' do
       text = "„Was wir tun wird in diesem Land Leben retten“:#Obama"
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        language: 'de'
       )
       expect(pt.tokenize).to eq(["was", "wir", "tun", "wird", "in", "diesem", "land", "leben", "retten", "#obama"])
     end
 
     it 'handles numbers and words' do
       text = "Air Force Once ist 18.270-mal abgehoben."
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        language: 'de'
       )
       expect(pt.tokenize).to eq(["air", "force", "once", "ist", "18.270-mal", "abgehoben"])
     end
 
     it 'maintains the german gender-neutrality form 2' do
       text = "der/die Lehrer_in und seine/ihre Schüler_innen"
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        language: 'de'
       )
-      expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen"])
+      expect(pt.tokenize).to eq(%w(der die lehrer_in und seine ihre schüler_innen))
     end
 
     it 'handles contractions 1' do
       text = "gibt's"
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        expand_contractions: true,
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        expand_contractions: true,
+        language: 'de'
       )
-      expect(pt.tokenize).to eq(["gibt", "es"])
+      expect(pt.tokenize).to eq(%w(gibt es))
     end
 
     it 'handles contractions 2' do
       text = "gibt‘s schaut’s wenn's g›spür find´s"
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        expand_contractions: true,
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        expand_contractions: true,
+        language: 'de'
      )
-      expect(pt.tokenize).to eq(["gibt", "es", "schaut", "es", "wenn", "es", "gespür", "finde", "es"])
+      expect(pt.tokenize).to eq(%w(gibt es schaut es wenn es gespür finde es))
     end
 
     it 'removes English stopwords' do
       text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        filter_languages: [:en],
-        remove_stop_words: true,
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        filter_languages: [:en],
+        remove_stop_words: true,
+        language: 'de'
       )
       expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."])
     end
 
-    it 'removes English and German stopwords' do
+    it 'removes English and German stopwords' do
       text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        filter_languages: [:en, :de],
-        remove_stop_words: true,
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        filter_languages: [:en, :de],
+        remove_stop_words: true,
+        language: 'de'
       )
       expect(pt.tokenize).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
     end
 
     it 'does not remove English stopwords' do
       text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        language: 'de'
       )
       expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "this", "has", "some", "english", "."])
     end
@@ -201,9 +222,10 @@ describe PragmaticTokenizer do
     it 'maintains the german gender-neutrality form 1' do
       skip "NOT IMPLEMENTED"
       text = "Wir brauchen eine/n erfahrene/n Informatiker/in."
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        language: 'de'
       )
       expect(pt.tokenize).to eq(["wir", "brauchen", "eine/n", "erfahrene/n", "informatiker/in"])
     end
@@ -211,17 +233,19 @@ describe PragmaticTokenizer do
     it 'handles apostrophes and quotes 4' do
       skip "NOT IMPLEMENTED"
       text = "Endlich regnet es ihm nicht mehr auf ́s Haupt!"
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        punctuation: 'none',
-        language: 'de'
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        punctuation: 'none',
+        language: 'de'
       )
-      expect(pt.tokenize).to eq(["endlich", "regnet", "es", "ihm", "nicht", "mehr", "auf́s", "haupt"])
+      expect(pt.tokenize).to eq(%w(endlich regnet es ihm nicht mehr auf́s haupt))
     end
 
     it 'handles abbreviations for languages other than English' do
       text = "Adj. Smith how are ü. today."
-      pt = PragmaticTokenizer::Tokenizer.new(text,
-        language: :de
+      pt = PragmaticTokenizer::Tokenizer.new(
+        text,
+        language: :de
       )
       expect(pt.tokenize).to eq(["adj", ".", "smith", "how", "are", "ü.", "today", "."])
     end
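
The stop-word specs in the first hunk also document how remove_stop_words composes with filter_languages: stop words are dropped for every language listed, not just the tokenizer's primary language. A runnable sketch lifted directly from the "removes English and German stopwords" spec above:

    require 'pragmatic_tokenizer'

    text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."

    # With both :en and :de listed, stop words from both languages are removed;
    # per the spec above, only the gender-neutral nouns and "english" survive.
    pt = PragmaticTokenizer::Tokenizer.new(
      text,
      filter_languages: [:en, :de],
      remove_stop_words: true,
      language: 'de'
    )
    pt.tokenize
    # => ["lehrer_in", "schüler_innen", ".", "english", "."]

With filter_languages: [:en] alone, the German function words (der, die, und, seine, ihre) survive, as the neighboring "removes English stopwords" spec shows.
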