pragmatic_tokenizer 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +184 -0
  3. data/.rubocop_todo.yml +66 -0
  4. data/README.md +0 -7
  5. data/Rakefile +1 -1
  6. data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +2 -2
  7. data/lib/pragmatic_tokenizer/full_stop_separator.rb +6 -6
  8. data/lib/pragmatic_tokenizer/languages/arabic.rb +1 -1
  9. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +1 -1
  10. data/lib/pragmatic_tokenizer/languages/catalan.rb +1 -1
  11. data/lib/pragmatic_tokenizer/languages/common.rb +4 -4
  12. data/lib/pragmatic_tokenizer/languages/czech.rb +1 -1
  13. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/deutsch.rb +94 -23
  15. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  16. data/lib/pragmatic_tokenizer/languages/english.rb +91 -91
  17. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/french.rb +1 -1
  19. data/lib/pragmatic_tokenizer/languages/greek.rb +1 -1
  20. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
  22. data/lib/pragmatic_tokenizer/languages/latvian.rb +1 -1
  23. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  25. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  26. data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
  27. data/lib/pragmatic_tokenizer/languages/romanian.rb +1 -1
  28. data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
  29. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  30. data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
  31. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  32. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  33. data/lib/pragmatic_tokenizer/languages.rb +28 -28
  34. data/lib/pragmatic_tokenizer/post_processor.rb +38 -24
  35. data/lib/pragmatic_tokenizer/pre_processor.rb +148 -118
  36. data/lib/pragmatic_tokenizer/tokenizer.rb +160 -135
  37. data/lib/pragmatic_tokenizer/version.rb +1 -1
  38. data/pragmatic_tokenizer.gemspec +1 -0
  39. data/spec/languages/bulgarian_spec.rb +17 -13
  40. data/spec/languages/deutsch_spec.rb +110 -86
  41. data/spec/languages/english_spec.rb +465 -342
  42. data/spec/languages/french_spec.rb +3 -2
  43. data/spec/performance_spec.rb +7 -7
  44. data/spec/pragmatic_tokenizer_spec.rb +8 -8
  45. metadata +18 -2
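
Most of the churn in the spec diff below is a mechanical restyling rather than a behavior change: the text argument moves onto its own line in the PragmaticTokenizer::Tokenizer.new call, and expected token arrays made up of bare words are rewritten as %w() literals. A minimal sketch of the two call styles, using option values taken from the specs below (the exact indentation in the gem's source may differ):

  require 'pragmatic_tokenizer'

  text = "Ich frage mich, wieso er nicht Herr der Lage war/ist."

  # 1.4.0 spec style: text on the same line that opens the call
  pt = PragmaticTokenizer::Tokenizer.new(text,
                                         punctuation: 'none',
                                         language:    'de')

  # 1.5.0 spec style: every argument on its own line
  pt = PragmaticTokenizer::Tokenizer.new(
    text,
    punctuation: 'none',
    language:    'de'
  )

  # %w(gibt es) is Ruby shorthand for an array of strings, so the
  # rewritten assertions are equivalent to the old literal arrays:
  %w(gibt es) == ["gibt", "es"] # => true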
data/spec/languages/deutsch_spec.rb
@@ -9,189 +9,210 @@ describe PragmaticTokenizer do
 
  it 'tokenizes a string #002' do
  text = 'Die größte Ausdehnung des Landes vom Westen nach Osten beträgt 650 km – von Nord nach Süd sind es 560 km. Unter den europäischen Staaten ist Weißrussland flächenmäßig an 13'
- expect(PragmaticTokenizer::Tokenizer.new(text,
- language: 'de',
- downcase: false,
- remove_stop_words: true,
- punctuation: 'none',
- numbers: :none
- ).tokenize).to eq(["größte", "Ausdehnung", "Landes", "Westen", "Osten", "beträgt", "Nord", "Süd", "europäischen", "Staaten", "Weißrussland", "flächenmäßig"])
+ expect(PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'de',
+ downcase: false,
+ remove_stop_words: true,
+ punctuation: 'none',
+ numbers: :none
+ ).tokenize).to eq(%w(größte Ausdehnung Landes Westen Osten beträgt Nord Süd europäischen Staaten Weißrussland flächenmäßig))
  end
 
  it 'tokenizes a string #003' do
  text = 'Die weißrussischen offiziellen Stellen wie auch die deutsche Diplomatie verwenden in offiziellen deutschsprachigen Texten den Namen Belarus, um die Unterscheidung von Russland zu verdeutlichen.'
- expect(PragmaticTokenizer::Tokenizer.new(text,
- language: 'de',
- downcase: false
+ expect(PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'de',
+ downcase: false
  ).tokenize).to eq(["Die", "weißrussischen", "offiziellen", "Stellen", "wie", "auch", "die", "deutsche", "Diplomatie", "verwenden", "in", "offiziellen", "deutschsprachigen", "Texten", "den", "Namen", "Belarus", ",", "um", "die", "Unterscheidung", "von", "Russland", "zu", "verdeutlichen", "."])
  end
 
  it 'tokenizes a string #004' do
  text = 'der Kaffee-Ersatz'
- expect(PragmaticTokenizer::Tokenizer.new(text,
- language: 'de',
- downcase: false
+ expect(PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'de',
+ downcase: false
  ).tokenize).to eq(['der', 'Kaffee-Ersatz'])
  end
 
  it 'tokenizes a string #005' do
  text = "Charlie Hebdo backlash over 'racist' Alan Kurdi cartoon - https://t.co/J8N2ylVV3w"
- expect(PragmaticTokenizer::Tokenizer.new(text,
- language: 'de',
+ expect(PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'de'
  ).tokenize).to eq(["charlie", "hebdo", "backlash", "over", "'", "racist", "'", "alan", "kurdi", "cartoon", "-", "https://t.co/j8n2ylvv3w"])
  end
 
  it 'handles words with a slash 1' do
  text = "We pay 3000 €/month"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ language: 'de'
  )
  expect(pt.tokenize).to eq(["we", "pay", "3000", "€", "month"])
  end
 
  it 'handles words with a slash 2' do
  text = "Ich frage mich, wieso er nicht Herr der Lage war/ist."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ language: 'de'
  )
- expect(pt.tokenize).to eq(["ich", "frage", "mich", "wieso", "er", "nicht", "herr", "der", "lage", "war", "ist"])
+ expect(pt.tokenize).to eq(%w(ich frage mich wieso er nicht herr der lage war ist))
  end
 
  it 'handles words with a slash 3' do
  text = "Poison gas attack in Ghuta/Syria."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ language: 'de'
  )
- expect(pt.tokenize).to eq(["poison", "gas", "attack", "in", "ghuta", "syria"])
+ expect(pt.tokenize).to eq(%w(poison gas attack in ghuta syria))
  end
 
  it 'handles words with a question mark' do
  text = "Essen á la carte?Man ist versucht…"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ language: 'de'
  )
- expect(pt.tokenize).to eq(["essen", "á", "la", "carte", "man", "ist", "versucht"])
+ expect(pt.tokenize).to eq(%w(essen á la carte man ist versucht))
  end
 
  it 'handles apostrophes and quotes 3' do
  text = "Die “Mitte der Gesellschaft” interessiert sich jetzt für “Feminismus”."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ language: 'de'
  )
- expect(pt.tokenize).to eq(["die", "mitte", "der", "gesellschaft", "interessiert", "sich", "jetzt", "für", "feminismus"])
+ expect(pt.tokenize).to eq(%w(die mitte der gesellschaft interessiert sich jetzt für feminismus))
  end
 
  it 'handles mentions 1' do
  text = "@RainerSteinke @_Sternchen_2015 1:0 für dich."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ language: 'de'
  )
  expect(pt.tokenize).to eq(["@rainersteinke", "@_sternchen_2015", "1:0", "für", "dich"])
  end
 
  it 'handles mentions 2' do
  text = "@LandauDaniel @AnthZeto @julianfranz @S_Beck19 Yep!"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ language: 'de'
  )
  expect(pt.tokenize).to eq(["@landaudaniel", "@anthzeto", "@julianfranz", "@s_beck19", "yep"])
  end
 
  it 'handles old school emoticons 1' do
  text = "du übertreibst maßlos :D"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- downcase: false,
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ downcase: false,
+ language: 'de'
  )
  expect(pt.tokenize).to eq(["du", "übertreibst", "maßlos", ":D"])
  end
 
  it 'handles words with a symbol suffix' do
  text = "hier ist ein Whirlpool versteckt^^"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ language: 'de'
  )
- expect(pt.tokenize).to eq(["hier", "ist", "ein", "whirlpool", "versteckt"])
+ expect(pt.tokenize).to eq(%w(hier ist ein whirlpool versteckt))
  end
 
  it 'handles hashtags 1' do
  text = "„Was wir tun wird in diesem Land Leben retten“:#Obama"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ language: 'de'
  )
  expect(pt.tokenize).to eq(["was", "wir", "tun", "wird", "in", "diesem", "land", "leben", "retten", "#obama"])
  end
 
  it 'handles numbers and words' do
  text = "Air Force Once ist 18.270-mal abgehoben."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ language: 'de'
  )
  expect(pt.tokenize).to eq(["air", "force", "once", "ist", "18.270-mal", "abgehoben"])
  end
 
  it 'maintains the german gender-neutrality form 2' do
  text = "der/die Lehrer_in und seine/ihre Schüler_innen"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ language: 'de'
  )
- expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen"])
+ expect(pt.tokenize).to eq(%w(der die lehrer_in und seine ihre schüler_innen))
  end
 
  it 'handles contractions 1' do
  text = "gibt's"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- expand_contractions: true,
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ expand_contractions: true,
+ language: 'de'
  )
- expect(pt.tokenize).to eq(["gibt", "es"])
+ expect(pt.tokenize).to eq(%w(gibt es))
  end
 
  it 'handles contractions 2' do
  text = "gibt‘s schaut’s wenn's g›spür find´s"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- expand_contractions: true,
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ expand_contractions: true,
+ language: 'de'
  )
- expect(pt.tokenize).to eq(["gibt", "es", "schaut", "es", "wenn", "es", "gespür", "finde", "es"])
+ expect(pt.tokenize).to eq(%w(gibt es schaut es wenn es gespür finde es))
  end
 
  it 'removes English stopwords' do
  text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- filter_languages: [:en],
- remove_stop_words: true,
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ filter_languages: [:en],
+ remove_stop_words: true,
+ language: 'de'
  )
  expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."])
  end
 
- it 'removes English and German stopwords' do
+ it 'removes English and German stopwords' do
  text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- filter_languages: [:en, :de],
- remove_stop_words: true,
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ filter_languages: [:en, :de],
+ remove_stop_words: true,
+ language: 'de'
  )
  expect(pt.tokenize).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
  end
 
  it 'does not remove English stopwords' do
  text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: 'de'
  )
  expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "this", "has", "some", "english", "."])
  end
@@ -201,9 +222,10 @@ describe PragmaticTokenizer do
  it 'maintains the german gender-neutrality form 1' do
  skip "NOT IMPLEMENTED"
  text = "Wir brauchen eine/n erfahrene/n Informatiker/in."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ language: 'de'
  )
  expect(pt.tokenize).to eq(["wir", "brauchen", "eine/n", "erfahrene/n", "informatiker/in"])
  end
@@ -211,17 +233,19 @@ describe PragmaticTokenizer do
  it 'handles apostrophes and quotes 4' do
  skip "NOT IMPLEMENTED"
  text = "Endlich regnet es ihm nicht mehr auf ́s Haupt!"
- pt = PragmaticTokenizer::Tokenizer.new(text,
- punctuation: 'none',
- language: 'de'
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ punctuation: 'none',
+ language: 'de'
  )
- expect(pt.tokenize).to eq(["endlich", "regnet", "es", "ihm", "nicht", "mehr", "auf́s", "haupt"])
+ expect(pt.tokenize).to eq(%w(endlich regnet es ihm nicht mehr auf́s haupt))
  end
 
  it 'handles abrreviations for languages other than English' do
  text = "Adj. Smith how are ü. today."
- pt = PragmaticTokenizer::Tokenizer.new(text,
- language: :de
+ pt = PragmaticTokenizer::Tokenizer.new(
+ text,
+ language: :de
  )
  expect(pt.tokenize).to eq(["adj", ".", "smith", "how", "are", "ü.", "today", "."])
  end
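
The stopword tests above exercise the combination of remove_stop_words and filter_languages (stripping stopwords from more than one language at once). A sketch of how a caller would use that, lifted directly from the 'removes English and German stopwords' spec; the expected output shown is the spec's own assertion:

  require 'pragmatic_tokenizer'

  pt = PragmaticTokenizer::Tokenizer.new(
    "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English.",
    filter_languages:  [:en, :de],
    remove_stop_words: true,
    language:          'de'
  )

  # German stopwords (der, die, und, ...) and English ones (this, has,
  # some) are removed; content tokens and sentence punctuation remain.
  pt.tokenize
  # => ["lehrer_in", "schüler_innen", ".", "english", "."]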