pragmatic_tokenizer 2.1.0 → 2.2.0

Files changed (40)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +77 -13
  3. data/README.md +3 -3
  4. data/lib/pragmatic_tokenizer/full_stop_separator.rb +2 -2
  5. data/lib/pragmatic_tokenizer/languages.rb +27 -26
  6. data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
  7. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +2 -2
  8. data/lib/pragmatic_tokenizer/languages/catalan.rb +2 -2
  9. data/lib/pragmatic_tokenizer/languages/common.rb +11 -11
  10. data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
  11. data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
  12. data/lib/pragmatic_tokenizer/languages/deutsch.rb +4 -4
  13. data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/english.rb +2 -2
  15. data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
  16. data/lib/pragmatic_tokenizer/languages/french.rb +2 -2
  17. data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
  18. data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
  19. data/lib/pragmatic_tokenizer/languages/italian.rb +2 -2
  20. data/lib/pragmatic_tokenizer/languages/latvian.rb +2 -2
  21. data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
  22. data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
  23. data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
  24. data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
  25. data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
  26. data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
  27. data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
  28. data/lib/pragmatic_tokenizer/languages/spanish.rb +2 -2
  29. data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
  30. data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
  31. data/lib/pragmatic_tokenizer/post_processor.rb +11 -13
  32. data/lib/pragmatic_tokenizer/tokenizer.rb +195 -187
  33. data/lib/pragmatic_tokenizer/version.rb +1 -1
  34. data/pragmatic_tokenizer.gemspec +1 -1
  35. data/spec/languages/bulgarian_spec.rb +4 -8
  36. data/spec/languages/deutsch_spec.rb +25 -49
  37. data/spec/languages/english_spec.rb +238 -364
  38. data/spec/languages/french_spec.rb +1 -2
  39. data/spec/performance_spec.rb +15 -16
  40. metadata +4 -4
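
Note on upgrading (a sketch inferred from the spec diffs below, not official release notes): the headline API change in 2.2.0 is that the input text is no longer passed to the Tokenizer constructor but to the tokenize method, so a tokenizer configured once can be reused across many strings. The gemspec also bumps the unicode_case_converter runtime dependency from ~> 0.4 to ~> 1.0. A minimal before/after sketch, using values taken from the specs below:

    # pragmatic_tokenizer 2.1.x: text was bound at construction time
    pt = PragmaticTokenizer::Tokenizer.new("Hello world.", language: 'en')
    pt.tokenize
    # => ["hello", "world", "."]

    # pragmatic_tokenizer 2.2.0: configure once, then pass text to #tokenize
    pt = PragmaticTokenizer::Tokenizer.new(language: 'en')
    pt.tokenize("Hello world.")
    # => ["hello", "world", "."]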
data/lib/pragmatic_tokenizer/version.rb
@@ -1,3 +1,3 @@
  module PragmaticTokenizer
-   VERSION = "2.1.0".freeze
+   VERSION = "2.2.0".freeze
  end
data/pragmatic_tokenizer.gemspec
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
    spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
    spec.require_paths = ["lib"]

-   spec.add_runtime_dependency "unicode_case_converter", "~> 0.4"
+   spec.add_runtime_dependency "unicode_case_converter", "~> 1.0"
    spec.add_development_dependency "bundler", "~> 1.9"
    spec.add_development_dependency "rake", "~> 10.0"
    spec.add_development_dependency "rspec"
data/spec/languages/bulgarian_spec.rb
@@ -5,41 +5,37 @@ describe PragmaticTokenizer do
    it 'tokenizes a string #001' do
      text = 'Стойностни, вкл. български и руски'
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        language: 'bg'
      )
-     expect(pt.tokenize).to eq(["стойностни", ",", "вкл.", "български", "и", "руски"])
+     expect(pt.tokenize(text)).to eq(["стойностни", ",", "вкл.", "български", "и", "руски"])
    end

    it 'tokenizes a string #002' do
      text = 'Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат.'
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        language: 'bg',
        remove_stop_words: true
      )
-     expect(pt.tokenize).to eq(["поставя", "началото", "могъща", "династия", ",", "управлява", "продължение", "150", "саргон", "надделява", "двубой", "владетеля", "град", "ур", "разширява", "териториите", "държавата", "долното", "течение", "тигър", "ефрат", "."])
+     expect(pt.tokenize(text)).to eq(["поставя", "началото", "могъща", "династия", ",", "управлява", "продължение", "150", "саргон", "надделява", "двубой", "владетеля", "град", "ур", "разширява", "териториите", "държавата", "долното", "течение", "тигър", "ефрат", "."])
    end

    it 'tokenizes a string #003' do
      text = 'Без български жертви в Париж.'
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        language: 'bg',
        remove_stop_words: true
      )
-     expect(pt.tokenize).to eq(["български", "жертви", "париж", "."])
+     expect(pt.tokenize(text)).to eq(["български", "жертви", "париж", "."])
    end

    it 'tokenizes a string #004' do
      text = 'Без български жертви в Париж.'
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        language: 'bg',
        remove_stop_words: true,
        downcase: false
      )
-     expect(pt.tokenize).to eq(["български", "жертви", "Париж", "."])
+     expect(pt.tokenize(text)).to eq(["български", "жертви", "Париж", "."])
    end
  end
end
data/spec/languages/deutsch_spec.rb
@@ -4,217 +4,196 @@ describe PragmaticTokenizer do
  context 'Language: German (de)' do
    it 'tokenizes a string #001' do
      text = 'Das steht auf S. 23, s. vorherige Anmerkung.'
-     expect(PragmaticTokenizer::Tokenizer.new(text, language: 'de').tokenize).to eq(['das', 'steht', 'auf', 's.', '23', ',', 's.', 'vorherige', 'anmerkung', '.'])
+     expect(PragmaticTokenizer::Tokenizer.new(language: 'de').tokenize(text)).to eq(['das', 'steht', 'auf', 's.', '23', ',', 's.', 'vorherige', 'anmerkung', '.'])
    end

    it 'tokenizes a string #002' do
      text = 'Die größte Ausdehnung des Landes vom Westen nach Osten beträgt 650 km – von Nord nach Süd sind es 560 km. Unter den europäischen Staaten ist Weißrussland flächenmäßig an 13'
      expect(PragmaticTokenizer::Tokenizer.new(
-       text,
        language: 'de',
        downcase: false,
        remove_stop_words: true,
        punctuation: 'none',
        numbers: :none
-     ).tokenize).to eq(%w(größte Ausdehnung Landes Westen Osten beträgt Nord Süd europäischen Staaten Weißrussland flächenmäßig))
+     ).tokenize(text)).to eq(%w(größte Ausdehnung Landes Westen Osten beträgt Nord Süd europäischen Staaten Weißrussland flächenmäßig))
    end

    it 'tokenizes a string #003' do
      text = 'Die weißrussischen offiziellen Stellen wie auch die deutsche Diplomatie verwenden in offiziellen deutschsprachigen Texten den Namen Belarus, um die Unterscheidung von Russland zu verdeutlichen.'
      expect(PragmaticTokenizer::Tokenizer.new(
-       text,
        language: 'de',
        downcase: false
-     ).tokenize).to eq(["Die", "weißrussischen", "offiziellen", "Stellen", "wie", "auch", "die", "deutsche", "Diplomatie", "verwenden", "in", "offiziellen", "deutschsprachigen", "Texten", "den", "Namen", "Belarus", ",", "um", "die", "Unterscheidung", "von", "Russland", "zu", "verdeutlichen", "."])
+     ).tokenize(text)).to eq(["Die", "weißrussischen", "offiziellen", "Stellen", "wie", "auch", "die", "deutsche", "Diplomatie", "verwenden", "in", "offiziellen", "deutschsprachigen", "Texten", "den", "Namen", "Belarus", ",", "um", "die", "Unterscheidung", "von", "Russland", "zu", "verdeutlichen", "."])
    end

    it 'tokenizes a string #004' do
      text = 'der Kaffee-Ersatz'
      expect(PragmaticTokenizer::Tokenizer.new(
-       text,
        language: 'de',
        downcase: false
-     ).tokenize).to eq(['der', 'Kaffee-Ersatz'])
+     ).tokenize(text)).to eq(['der', 'Kaffee-Ersatz'])
    end

    it 'tokenizes a string #005' do
      text = "Charlie Hebdo backlash over 'racist' Alan Kurdi cartoon - https://t.co/J8N2ylVV3w"
      expect(PragmaticTokenizer::Tokenizer.new(
-       text,
        language: 'de'
-     ).tokenize).to eq(["charlie", "hebdo", "backlash", "over", "'", "racist", "'", "alan", "kurdi", "cartoon", "-", "https://t.co/j8n2ylvv3w"])
+     ).tokenize(text)).to eq(["charlie", "hebdo", "backlash", "over", "'", "racist", "'", "alan", "kurdi", "cartoon", "-", "https://t.co/j8n2ylvv3w"])
    end

    it 'handles words with a slash 1' do
      text = "We pay 3000 €/month"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        language: 'de'
      )
-     expect(pt.tokenize).to eq(["we", "pay", "3000", "€", "month"])
+     expect(pt.tokenize(text)).to eq(["we", "pay", "3000", "€", "month"])
    end

    it 'handles words with a slash 2' do
      text = "Ich frage mich, wieso er nicht Herr der Lage war/ist."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        language: 'de'
      )
-     expect(pt.tokenize).to eq(%w(ich frage mich wieso er nicht herr der lage war ist))
+     expect(pt.tokenize(text)).to eq(%w(ich frage mich wieso er nicht herr der lage war ist))
    end

    it 'handles words with a slash 3' do
      text = "Poison gas attack in Ghuta/Syria."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        language: 'de'
      )
-     expect(pt.tokenize).to eq(%w(poison gas attack in ghuta syria))
+     expect(pt.tokenize(text)).to eq(%w(poison gas attack in ghuta syria))
    end

    it 'handles words with a question mark' do
      text = "Essen á la carte?Man ist versucht…"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        language: 'de'
      )
-     expect(pt.tokenize).to eq(%w(essen á la carte man ist versucht))
+     expect(pt.tokenize(text)).to eq(%w(essen á la carte man ist versucht))
    end

    it 'handles apostrophes and quotes 3' do
      text = "Die “Mitte der Gesellschaft” interessiert sich jetzt für “Feminismus”."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        language: 'de'
      )
-     expect(pt.tokenize).to eq(%w(die mitte der gesellschaft interessiert sich jetzt für feminismus))
+     expect(pt.tokenize(text)).to eq(%w(die mitte der gesellschaft interessiert sich jetzt für feminismus))
    end

    it 'handles mentions 1' do
      text = "@RainerSteinke @_Sternchen_2015 1:0 für dich."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        language: 'de'
      )
-     expect(pt.tokenize).to eq(["@rainersteinke", "@_sternchen_2015", "1:0", "für", "dich"])
+     expect(pt.tokenize(text)).to eq(["@rainersteinke", "@_sternchen_2015", "1:0", "für", "dich"])
    end

    it 'handles mentions 2' do
      text = "@LandauDaniel @AnthZeto @julianfranz @S_Beck19 Yep!"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        language: 'de'
      )
-     expect(pt.tokenize).to eq(["@landaudaniel", "@anthzeto", "@julianfranz", "@s_beck19", "yep"])
+     expect(pt.tokenize(text)).to eq(["@landaudaniel", "@anthzeto", "@julianfranz", "@s_beck19", "yep"])
    end

    it 'handles old school emoticons 1' do
      text = "du übertreibst maßlos :D"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        downcase: false,
        language: 'de'
      )
-     expect(pt.tokenize).to eq(["du", "übertreibst", "maßlos", ":D"])
+     expect(pt.tokenize(text)).to eq(["du", "übertreibst", "maßlos", ":D"])
    end

    it 'handles words with a symbol suffix' do
      text = "hier ist ein Whirlpool versteckt^^"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        language: 'de'
      )
-     expect(pt.tokenize).to eq(%w(hier ist ein whirlpool versteckt))
+     expect(pt.tokenize(text)).to eq(%w(hier ist ein whirlpool versteckt))
    end

    it 'handles hashtags 1' do
      text = "„Was wir tun wird in diesem Land Leben retten“:#Obama"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        language: 'de'
      )
-     expect(pt.tokenize).to eq(["was", "wir", "tun", "wird", "in", "diesem", "land", "leben", "retten", "#obama"])
+     expect(pt.tokenize(text)).to eq(["was", "wir", "tun", "wird", "in", "diesem", "land", "leben", "retten", "#obama"])
    end

    it 'handles numbers and words' do
      text = "Air Force Once ist 18.270-mal abgehoben."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        language: 'de'
      )
-     expect(pt.tokenize).to eq(["air", "force", "once", "ist", "18.270-mal", "abgehoben"])
+     expect(pt.tokenize(text)).to eq(["air", "force", "once", "ist", "18.270-mal", "abgehoben"])
    end

    it 'maintains the german gender-neutrality form 2' do
      text = "der/die Lehrer_in und seine/ihre Schüler_innen"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        language: 'de'
      )
-     expect(pt.tokenize).to eq(%w(der die lehrer_in und seine ihre schüler_innen))
+     expect(pt.tokenize(text)).to eq(%w(der die lehrer_in und seine ihre schüler_innen))
    end

    it 'handles contractions 1' do
      text = "gibt's"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        expand_contractions: true,
        language: 'de'
      )
-     expect(pt.tokenize).to eq(%w(gibt es))
+     expect(pt.tokenize(text)).to eq(%w(gibt es))
    end

    it 'handles contractions 2' do
      text = "gibt‘s schaut’s wenn's g›spür find´s"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        expand_contractions: true,
        language: 'de'
      )
-     expect(pt.tokenize).to eq(%w(gibt es schaut es wenn es gespür finde es))
+     expect(pt.tokenize(text)).to eq(%w(gibt es schaut es wenn es gespür finde es))
    end

    it 'removes English stopwords' do
      text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        filter_languages: [:en],
        remove_stop_words: true,
        language: 'de'
      )
-     expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."])
+     expect(pt.tokenize(text)).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."])
    end

    it 'removes English and German stopwords' do
      text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        filter_languages: [:en, :de],
        remove_stop_words: true,
        language: 'de'
      )
-     expect(pt.tokenize).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
+     expect(pt.tokenize(text)).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
    end

    it 'does not remove English stopwords' do
      text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        language: 'de'
      )
-     expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "this", "has", "some", "english", "."])
+     expect(pt.tokenize(text)).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "this", "has", "some", "english", "."])
    end

    # I don't know how to easily treat these forms, especially the most frequent form
@@ -223,31 +202,28 @@ describe PragmaticTokenizer do
      skip "NOT IMPLEMENTED"
      text = "Wir brauchen eine/n erfahrene/n Informatiker/in."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        language: 'de'
      )
-     expect(pt.tokenize).to eq(["wir", "brauchen", "eine/n", "erfahrene/n", "informatiker/in"])
+     expect(pt.tokenize(text)).to eq(["wir", "brauchen", "eine/n", "erfahrene/n", "informatiker/in"])
    end

    it 'handles apostrophes and quotes 4' do
      skip "NOT IMPLEMENTED"
      text = "Endlich regnet es ihm nicht mehr auf ́s Haupt!"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        punctuation: 'none',
        language: 'de'
      )
-     expect(pt.tokenize).to eq(%w(endlich regnet es ihm nicht mehr auf́s haupt))
+     expect(pt.tokenize(text)).to eq(%w(endlich regnet es ihm nicht mehr auf́s haupt))
    end

    it 'handles abrreviations for languages other than English' do
      text = "Adj. Smith how are ü. today."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        language: :de
      )
-     expect(pt.tokenize).to eq(["adj", ".", "smith", "how", "are", "ü.", "today", "."])
+     expect(pt.tokenize(text)).to eq(["adj", ".", "smith", "how", "are", "ü.", "today", "."])
    end
  end
end
data/spec/languages/english_spec.rb
@@ -6,170 +6,170 @@ describe PragmaticTokenizer do
  context 'no options selected' do
    it 'tokenizes a string #001' do
      text = "Hello world."
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["hello", "world", "."])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["hello", "world", "."])
    end

    it 'tokenizes a string #002' do
      text = "Hello Dr. Death."
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["hello", "dr.", "death", "."])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["hello", "dr.", "death", "."])
    end

    it 'tokenizes a string #003' do
      text = "Hello ____________________ ."
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["hello", "____________________", "."])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["hello", "____________________", "."])
    end

    it 'tokenizes a string #004' do
      text = "It has a state-of-the-art design."
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["it", "has", "a", "state-of-the-art", "design", "."])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["it", "has", "a", "state-of-the-art", "design", "."])
    end

    it 'tokenizes a string #005' do
      text = "Jan. 2015 was 20% colder than now. But not in inter- and outer-space."
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["jan.", "2015", "was", "20%", "colder", "than", "now", ".", "but", "not", "in", "inter", "-", "and", "outer-space", "."])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["jan.", "2015", "was", "20%", "colder", "than", "now", ".", "but", "not", "in", "inter", "-", "and", "outer-space", "."])
    end

    it 'tokenizes a string #006' do
      text = 'Go to http://www.example.com.'
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["go", "to", "http://www.example.com", "."])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["go", "to", "http://www.example.com", "."])
    end

    it 'tokenizes a string #007' do
      text = 'One of the lawyers from ‚Making a Murderer’ admitted a mistake'
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["one", "of", "the", "lawyers", "from", "‚", "making", "a", "murderer", "’", "admitted", "a", "mistake"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["one", "of", "the", "lawyers", "from", "‚", "making", "a", "murderer", "’", "admitted", "a", "mistake"])
    end

    it 'tokenizes a string #008' do
      text = "One of the lawyers from 'Making a Murderer' admitted a mistake"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["one", "of", "the", "lawyers", "from", "'", "making", "a", "murderer", "'", "admitted", "a", "mistake"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["one", "of", "the", "lawyers", "from", "'", "making", "a", "murderer", "'", "admitted", "a", "mistake"])
    end

    it 'tokenizes a string #009' do
      text = "hello ;-) yes"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["hello", ";", "-", ")", "yes"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["hello", ";", "-", ")", "yes"])
    end

    it 'tokenizes a string #010' do
      text = "hello ;)"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["hello", ";", ")"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["hello", ";", ")"])
    end

    it 'tokenizes a string #011' do
      text = "area <0.8 cm2"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["area", "<0.8", "cm2"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["area", "<0.8", "cm2"])
    end

    it 'tokenizes a string #012' do
      text = "area <0.8 cm2"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["area", "<0.8", "cm2"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["area", "<0.8", "cm2"])
    end

    it 'tokenizes a string #013' do
      text = "the “Star-Trek“-Inventor"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["the", "“", "star-trek", "“", "-", "inventor"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["the", "“", "star-trek", "“", "-", "inventor"])
    end

    it 'tokenizes a string #014' do
      text = "#ab-cd"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["#ab-cd"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["#ab-cd"])
    end

    it 'handles numbers with symbols 2' do
      text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals", "!"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals", "!"])
    end

    it 'handles numbers with symbols 3' do
      text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
    end

    it 'splits at a comma' do
      text = "16.1. day one,17.2. day two"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["16.1", ".", "day", "one", ",", "17.2", ".", "day", "two"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["16.1", ".", "day", "one", ",", "17.2", ".", "day", "two"])
    end

    it 'identifies single quotes' do
      text = "Sean Penn Sat for Secret Interview With ‘El Chapo,’ Mexican Drug"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["sean", "penn", "sat", "for", "secret", "interview", "with", "‘", "el", "chapo", ",", "’", "mexican", "drug"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["sean", "penn", "sat", "for", "secret", "interview", "with", "‘", "el", "chapo", ",", "’", "mexican", "drug"])
    end

    it 'identifies prefixed symbols' do
      text = "look:the sky is blue"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["look", ":", "the", "sky", "is", "blue"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["look", ":", "the", "sky", "is", "blue"])
    end

    it 'identifies hashtags with numbers too' do
      text = "this is a sentence.#yay this too.#withnumbers123"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["this", "is", "a", "sentence", ".", "#yay", "this", "too", ".", "#withnumbers123"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["this", "is", "a", "sentence", ".", "#yay", "this", "too", ".", "#withnumbers123"])
    end

    it 'splits emojis' do
      text = "🤔🙄"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["🤔", "🙄"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["🤔", "🙄"])
    end

    it 'handles snowflakes 1' do
      text = "❄️❄️❄️"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["❄️", "❄️", "❄️"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["❄️", "❄️", "❄️"])
    end

    it 'handles snowflakes 2' do
      text = "\u2744\uFE0E\u2744\uFE0E\u2744\uFE0E"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["❄︎", "❄︎", "❄︎"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["❄︎", "❄︎", "❄︎"])
    end

    it 'handles snowflakes 3' do
      text = "\u2744\u2744\u2744"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["\u2744", "\u2744", "\u2744"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["\u2744", "\u2744", "\u2744"])
    end

    it 'separates tokens' do
      text = "football≠soccer"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["football", "≠", "soccer"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["football", "≠", "soccer"])
    end

    it 'deals with missing whitespaces' do
      text = "this is sentence one!this is sentence two.@someone"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["this", "is", "sentence", "one", "!", "this", "is", "sentence", "two", ".", "@someone"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["this", "is", "sentence", "one", "!", "this", "is", "sentence", "two", ".", "@someone"])
    end

    it 'handles weird apostrophes' do
      text = [116, 104, 101, 114, 101, 32, 769, 115, 32, 115, 111, 109, 101, 116, 104, 105, 110, 103].pack("U*")
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["there`s", "something"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["there`s", "something"])
    end

    it 'treats abbreviations always the same' do
      text = "U.S.A. U.S.A. U.S.A."
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(
        ["u.s.a.", "u.s.a.", "u.s.a."]
      )
    end
@@ -178,58 +178,53 @@ describe PragmaticTokenizer do
  context 'user-supplied abbreviations' do
    it 'tokenizes a regular string with an abbreviation' do
      text = "Mr. Smith, hello world."
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["mr.", "smith", ",", "hello", "world", "."])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["mr.", "smith", ",", "hello", "world", "."])
    end

    it 'fails to recognize an English abbreviation if the user supplies an abbreviations array without it' do
      text = "Mr. Smith, hello world."
      abbreviations = ['mrs']
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        abbreviations: abbreviations
      )
-     expect(pt.tokenize).to eq(["mr", ".", "smith", ",", "hello", "world", "."])
+     expect(pt.tokenize(text)).to eq(["mr", ".", "smith", ",", "hello", "world", "."])
    end

    it 'recognizes a user-supplied abbreviation' do
      text = "thisisnotanormalabbreviation. hello world."
      abbreviations = ['thisisnotanormalabbreviation']
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        abbreviations: abbreviations
      )
-     expect(pt.tokenize).to eq(["thisisnotanormalabbreviation.", "hello", "world", "."])
+     expect(pt.tokenize(text)).to eq(["thisisnotanormalabbreviation.", "hello", "world", "."])
    end

    it 'handles an empty user-supplied abbreviation array' do
      text = "thisisnotanormalabbreviation. hello world."
      abbreviations = []
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        abbreviations: abbreviations
      )
-     expect(pt.tokenize).to eq(["thisisnotanormalabbreviation", ".", "hello", "world", "."])
+     expect(pt.tokenize(text)).to eq(["thisisnotanormalabbreviation", ".", "hello", "world", "."])
    end

    it 'handles abrreviations across multiple languages' do
      text = "Mr. Smith how are ü. today."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        filter_languages: [:en, :de]
      )
-     expect(pt.tokenize).to eq(["mr.", "smith", "how", "are", "ü.", "today", "."])
+     expect(pt.tokenize(text)).to eq(["mr.", "smith", "how", "are", "ü.", "today", "."])
    end

    it 'handles abrreviations across multiple languages and user-supplied abbreviations' do
      text = "Adj. Smith how are ü. today. thisisnotanormalabbreviation. is it?"
      abbreviations = ['thisisnotanormalabbreviation']
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        filter_languages: [:en, :de],
        abbreviations: abbreviations
      )
-     expect(pt.tokenize).to eq(["adj.", "smith", "how", "are", "ü.", "today", ".", "thisisnotanormalabbreviation.", "is", "it", "?"])
+     expect(pt.tokenize(text)).to eq(["adj.", "smith", "how", "are", "ü.", "today", ".", "thisisnotanormalabbreviation.", "is", "it", "?"])
    end
  end

@@ -237,90 +232,82 @@ describe PragmaticTokenizer do
    it 'does not expand the contractions' do
      # https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
      text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(['"', 'i', 'said', ',', "'", "what're", 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', "can't", 'afford', 'to', 'do', 'that', '.', '"'])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(['"', 'i', 'said', ',', "'", "what're", 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', "can't", 'afford', 'to', 'do', 'that', '.', '"'])
    end

    it 'expands user-supplied contractions' do
      text = "Hello supa'soo guy."
      contractions = { "supa'soo" => "super smooth" }
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        contractions: contractions,
        expand_contractions: true
      )
-     expect(pt.tokenize).to eq(["hello", "super", "smooth", "guy", "."])
+     expect(pt.tokenize(text)).to eq(["hello", "super", "smooth", "guy", "."])
    end

    it 'does not expands user-supplied contractions' do
      text = "Hello supa'soo guy."
      contractions = { "supa'soo" => "super smooth" }
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        contractions: contractions,
        expand_contractions: false
      )
-     expect(pt.tokenize).to eq(["hello", "supa'soo", "guy", "."])
+     expect(pt.tokenize(text)).to eq(["hello", "supa'soo", "guy", "."])
    end

    it 'expands user-supplied contractions and language contractions' do
      text = "Hello supa'soo guy. auf's wasn't it?"
      contractions = { "supa'soo" => "super smooth" }
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        contractions: contractions,
        expand_contractions: true,
        filter_languages: [:en, :de]
      )
-     expect(pt.tokenize).to eq(["hello", "super", "smooth", "guy", ".", "auf", "das", "was", "not", "it", "?"])
+     expect(pt.tokenize(text)).to eq(["hello", "super", "smooth", "guy", ".", "auf", "das", "was", "not", "it", "?"])
    end

    it 'expands language contractions' do
      text = "Hello supa'soo guy. auf's wasn't it?"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        expand_contractions: true,
        filter_languages: [:en, :de]
      )
-     expect(pt.tokenize).to eq(["hello", "supa'soo", "guy", ".", "auf", "das", "was", "not", "it", "?"])
+     expect(pt.tokenize(text)).to eq(["hello", "supa'soo", "guy", ".", "auf", "das", "was", "not", "it", "?"])
    end

    it 'tokenizes a string #001' do
      # https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
      text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        expand_contractions: true
      )
-     expect(pt.tokenize).to eq(['"', 'i', 'said', ',', "'", 'what', 'are', 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', 'cannot', 'afford', 'to', 'do', 'that', '.', '"'])
+     expect(pt.tokenize(text)).to eq(['"', 'i', 'said', ',', "'", 'what', 'are', 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', 'cannot', 'afford', 'to', 'do', 'that', '.', '"'])
    end

    it 'tokenizes a string #002' do
      # http://nlp.stanford.edu/software/tokenizer.shtml
      text = "\"Oh, no,\" she's saying, \"our $400 blender can't handle something this hard!\""
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        expand_contractions: true
      )
-     expect(pt.tokenize).to eq(['"', 'oh', ',', 'no', ',', '"', 'she', 'is', 'saying', ',', '"', 'our', '$400', 'blender', 'cannot', 'handle', 'something', 'this', 'hard', '!', '"'])
+     expect(pt.tokenize(text)).to eq(['"', 'oh', ',', 'no', ',', '"', 'she', 'is', 'saying', ',', '"', 'our', '$400', 'blender', 'cannot', 'handle', 'something', 'this', 'hard', '!', '"'])
    end

    it 'tokenizes a string #003' do
      text = "Look for his/her account."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        expand_contractions: true
      )
-     expect(pt.tokenize).to eq(["look", "for", "his", "her", "account", "."])
+     expect(pt.tokenize(text)).to eq(["look", "for", "his", "her", "account", "."])
    end

    it 'tokenizes a string #004' do
      text = "I like apples and/or oranges."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        expand_contractions: true
      )
-     expect(pt.tokenize).to eq(["i", "like", "apples", "and", "or", "oranges", "."])
+     expect(pt.tokenize(text)).to eq(["i", "like", "apples", "and", "or", "oranges", "."])
    end
  end

@@ -328,43 +315,39 @@ describe PragmaticTokenizer do
    it 'removes emoji' do
      text = "Return the emoji 👿😍😱🐔🌚. 🌚"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        remove_emoji: true
      )
-     expect(pt.tokenize).to eq(["return", "the", "emoji", "."])
+     expect(pt.tokenize(text)).to eq(["return", "the", "emoji", "."])
    end

    it 'does not remove emoji' do
      text = "Return the emoji 👿😍😱🐔🌚. 🌚"
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["return", "the", "emoji", "👿", "😍", "😱", "🐔", "🌚", ".", "🌚"])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["return", "the", "emoji", "👿", "😍", "😱", "🐔", "🌚", ".", "🌚"])
    end

    it 'removes snowflakes 1' do
      text = "hello❄️❄️❄️"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        remove_emoji: true
      )
-     expect(pt.tokenize).to eq(["hello"])
+     expect(pt.tokenize(text)).to eq(["hello"])
    end

    it 'removes snowflakes 2' do
      text = "hello\u2744\uFE0E\u2744\uFE0E\u2744\uFE0E"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        remove_emoji: true
      )
-     expect(pt.tokenize).to eq(["hello"])
+     expect(pt.tokenize(text)).to eq(["hello"])
    end

    it 'removes snowflakes 3' do
      text = "hello\u2744\u2744\u2744"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        remove_emoji: true
      )
-     expect(pt.tokenize).to eq(["hello"])
+     expect(pt.tokenize(text)).to eq(["hello"])
    end
  end

@@ -372,28 +355,25 @@ describe PragmaticTokenizer do
    it 'tokenizes a string #001' do
      text = "This is a #hashtag yay!"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        hashtags: :remove
      )
-     expect(pt.tokenize).to eq(["this", "is", "a", "yay", "!"])
+     expect(pt.tokenize(text)).to eq(["this", "is", "a", "yay", "!"])
    end

    it 'tokenizes a string #002' do
      text = "This is a #hashtag yay!"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        hashtags: :keep_and_clean
      )
-     expect(pt.tokenize).to eq(["this", "is", "a", "hashtag", "yay", "!"])
+     expect(pt.tokenize(text)).to eq(["this", "is", "a", "hashtag", "yay", "!"])
    end

    it 'tokenizes a string #003' do
      text = "This is a #hashtag yay!"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        hashtags: :keep_original
      )
-     expect(pt.tokenize).to eq(["this", "is", "a", "#hashtag", "yay", "!"])
+     expect(pt.tokenize(text)).to eq(["this", "is", "a", "#hashtag", "yay", "!"])
    end
  end

@@ -401,28 +381,25 @@ describe PragmaticTokenizer do
    it 'tokenizes a string #001' do
      text = "This is a @mention @mention2 yay!"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        mentions: :remove
      )
-     expect(pt.tokenize).to eq(["this", "is", "a", "yay", "!"])
+     expect(pt.tokenize(text)).to eq(["this", "is", "a", "yay", "!"])
    end

    it 'tokenizes a string #002' do
      text = "This is a @mention @mention2 yay!"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        mentions: :keep_and_clean
      )
-     expect(pt.tokenize).to eq(["this", "is", "a", "mention", "mention2", "yay", "!"])
+     expect(pt.tokenize(text)).to eq(["this", "is", "a", "mention", "mention2", "yay", "!"])
    end

    it 'tokenizes a string #003' do
      text = "This is a @mention @mention2 yay!"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        mentions: :keep_original
      )
-     expect(pt.tokenize).to eq(["this", "is", "a", "@mention", "@mention2", "yay", "!"])
+     expect(pt.tokenize(text)).to eq(["this", "is", "a", "@mention", "@mention2", "yay", "!"])
    end
  end

@@ -430,25 +407,23 @@ describe PragmaticTokenizer do
    it 'tokenizes a string #001' do
      text = "Here are some emails jon@hotmail.com ben123@gmail.com."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        remove_emails: :true
      )
-     expect(pt.tokenize).to eq(["here", "are", "some", "emails", "."])
+     expect(pt.tokenize(text)).to eq(["here", "are", "some", "emails", "."])
    end

    it 'tokenizes a string #002' do
      text = "Here are some emails jon@hotmail.com ben123@gmail.com."
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["here", "are", "some", "emails", "jon@hotmail.com", "ben123@gmail.com", "."])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["here", "are", "some", "emails", "jon@hotmail.com", "ben123@gmail.com", "."])
    end

    it 'knows what is not an email address' do
      text = "the great cook.@someone something else@whoever"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        remove_emails: true
      )
-     expect(pt.tokenize).to eq(["the", "great", "cook", ".", "@someone", "something", "else@whoever"])
+     expect(pt.tokenize(text)).to eq(["the", "great", "cook", ".", "@someone", "something", "else@whoever"])
    end
  end

@@ -456,16 +431,15 @@ describe PragmaticTokenizer do
    it 'tokenizes a string #001' do
      text = "Here are some domains and urls google.com https://www.google.com www.google.com."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        remove_urls: :true
      )
-     expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "www.google.com", "."])
+     expect(pt.tokenize(text)).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "www.google.com", "."])
    end

    it 'tokenizes a string #002' do
      text = "Here are some domains and urls google.com https://www.google.com www.google.com."
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "https://www.google.com", "www.google.com", "."])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "https://www.google.com", "www.google.com", "."])
    end
  end

@@ -473,44 +447,40 @@ describe PragmaticTokenizer do
    it 'tokenizes a string #001' do
      text = "Here are some domains and urls google.com https://www.google.com www.google.com."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        remove_domains: :true
      )
-     expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "https://www.google.com", "."])
+     expect(pt.tokenize(text)).to eq(["here", "are", "some", "domains", "and", "urls", "https://www.google.com", "."])
    end

    it 'tokenizes a string #002' do
      text = "Here are some domains and urls google.com https://www.google.com www.google.com."
-     pt = PragmaticTokenizer::Tokenizer.new(text)
-     expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "https://www.google.com", "www.google.com", "."])
+     pt = PragmaticTokenizer::Tokenizer.new
+     expect(pt.tokenize(text)).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "https://www.google.com", "www.google.com", "."])
    end

    it 'knows what is not a domain 1' do
      skip "NOT IMPLEMENTED"
      text = "this is a sentence.and no domain."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        remove_domains: true
      )
-     expect(pt.tokenize).to eq(["this", "is", "a", "sentence", ".", "and", "no", "domain", "."])
+     expect(pt.tokenize(text)).to eq(["this", "is", "a", "sentence", ".", "and", "no", "domain", "."])
    end

    it 'knows what is not a domain 2' do
      text = "former president g.w.bush was..."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        remove_domains: true
      )
-     expect(pt.tokenize).to eq(["former", "president", "g.w.bush", "was", "..."])
+     expect(pt.tokenize(text)).to eq(["former", "president", "g.w.bush", "was", "..."])
    end

    it 'knows what is not a domain 3' do
      text = "2.something-times"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        remove_domains: true
      )
-     expect(pt.tokenize).to eq(["2.something-times"])
+     expect(pt.tokenize(text)).to eq(["2.something-times"])
    end
  end

@@ -518,19 +488,17 @@ describe PragmaticTokenizer do
    it 'tokenizes a string #001' do
      text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        long_word_split: 10
      )
-     expect(pt.tokenize).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14-year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990-years", "needs", "to", "be", "revised", "."])
+     expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14-year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990-years", "needs", "to", "be", "revised", "."])
    end

    it 'tokenizes a string #002' do
      text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        long_word_split: 4
      )
-     expect(pt.tokenize).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
+     expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
    end
  end

@@ -538,154 +506,137 @@ describe PragmaticTokenizer do
    it 'tokenizes a string #001' do
      text = "Hello ---------------."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(["hello", "."])
+     expect(pt.tokenize(text)).to eq(["hello", "."])
    end

    it 'tokenizes a string #002' do
      text = "Hello ____________________ ."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(["hello", "."])
+     expect(pt.tokenize(text)).to eq(["hello", "."])
    end

    it 'tokenizes a string #003' do
      text = "© ABC Company 1994"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(%w(abc company 1994))
+     expect(pt.tokenize(text)).to eq(%w(abc company 1994))
    end

    it 'tokenizes a string #004' do
      text = "This sentence has a long string of dots ......................."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(%w(this sentence has a long string of dots))
+     expect(pt.tokenize(text)).to eq(%w(this sentence has a long string of dots))
    end

    it 'tokenizes a string #005' do
      text = "cnn.com mentions this *funny* #hashtag used by @obama http://cnn.com/something"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(["cnn.com", "mentions", "this", "funny", "#hashtag", "used", "by", "@obama", "http://cnn.com/something"])
+     expect(pt.tokenize(text)).to eq(["cnn.com", "mentions", "this", "funny", "#hashtag", "used", "by", "@obama", "http://cnn.com/something"])
    end

    it 'does not remove a valid hashtag' do
      text = "This #sentence has a long string of dots ......................."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(["this", "#sentence", "has", "a", "long", "string", "of", "dots"])
+     expect(pt.tokenize(text)).to eq(["this", "#sentence", "has", "a", "long", "string", "of", "dots"])
    end

    it 'does not remove a valid mention' do
      text = "This @sentence has a long string of dots ......................."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(["this", "@sentence", "has", "a", "long", "string", "of", "dots"])
+     expect(pt.tokenize(text)).to eq(["this", "@sentence", "has", "a", "long", "string", "of", "dots"])
    end

    it 'cleans words with symbols 1' do
      text = "something.com:article title !!wow look!!1"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(["something.com", "article", "title", "wow", "look"])
+     expect(pt.tokenize(text)).to eq(["something.com", "article", "title", "wow", "look"])
    end

    it 'cleans words with symbols 2' do
      text = "something.com:article title !!wow look!!1!1!11!"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(["something.com", "article", "title", "wow", "look"])
+     expect(pt.tokenize(text)).to eq(["something.com", "article", "title", "wow", "look"])
    end

    it 'identifies prefixed symbols' do
      text = "look:the sky is blue"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(%w(look the sky is blue))
+     expect(pt.tokenize(text)).to eq(%w(look the sky is blue))
    end

    it 'keeps numbers at the end of mentions and hashtags' do
      text = "#le1101 #artistQ21 @someone12 @someoneelse1 and @somebody1980"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(["#le1101", "#artistq21", "@someone12", "@someoneelse1", "and", "@somebody1980"])
+     expect(pt.tokenize(text)).to eq(["#le1101", "#artistq21", "@someone12", "@someoneelse1", "and", "@somebody1980"])
    end

    it 'cleans a prefixed weird hyphen' do
      text = [104, 105, 103, 104, 32, 173, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 32, 97, 110, 100, 32, 173, 119, 105, 110, 100].pack("U*")
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(%w(high temperature and wind))
+     expect(pt.tokenize(text)).to eq(%w(high temperature and wind))
    end

    it 'cleans (r) and (c) and (tm)' do
      text = "the oscar® night ©companyname is a trademark™"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(%w(the oscar night companyname is a trademark))
+     expect(pt.tokenize(text)).to eq(%w(the oscar night companyname is a trademark))
    end

    it 'cleans letters in boxes 1' do
      text = "making🇦🇹postcards"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(%w(making postcards))
+     expect(pt.tokenize(text)).to eq(%w(making postcards))
    end

    it 'removes colons' do
      text = "At 19:30 o'clock: Mad Max: Fury Road"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(["at", "19:30", "o'clock", "mad", "max", "fury", "road"])
+     expect(pt.tokenize(text)).to eq(["at", "19:30", "o'clock", "mad", "max", "fury", "road"])
    end

    it 'removes a hyphen prefix 3' do
      text = "women's clothes and –shoes needed"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(["women's", "clothes", "and", "shoes", "needed"])
+     expect(pt.tokenize(text)).to eq(["women's", "clothes", "and", "shoes", "needed"])
    end

    it 'does not remove tokens with ampersands' do
      text = "you&amp;me"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        clean: true
      )
-     expect(pt.tokenize).to eq(["you", "&", "me"])
+     expect(pt.tokenize(text)).to eq(["you", "&", "me"])
    end
  end

@@ -694,38 +645,34 @@ describe PragmaticTokenizer do
    it 'tokenizes a string #001' do
      # http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
      text = "I.B.M. cat's can't"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        classic_filter: true
      )
-     expect(pt.tokenize).to eq(["ibm", "cat", "can't"])
+     expect(pt.tokenize(text)).to eq(["ibm", "cat", "can't"])
    end

    it 'tokenizes a string #002' do
      # http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
      text = "St.Veit, which usually would be written St. Veit was not visited by B.Obama reported CNN.com"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        classic_filter: true
      )
-     expect(pt.tokenize).to eq(["st.veit", ",", "which", "usually", "would", "be", "written", "st", "veit", "was", "not", "visited", "by", "b.obama", "reported", "cnn.com"])
+     expect(pt.tokenize(text)).to eq(["st.veit", ",", "which", "usually", "would", "be", "written", "st", "veit", "was", "not", "visited", "by", "b.obama", "reported", "cnn.com"])
    end

    it 'optimizes the classic filter' do
      text = "therés something"
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        classic_filter: true
      )
-     expect(pt.tokenize).to eq(%w(there something))
+     expect(pt.tokenize(text)).to eq(%w(there something))
    end

    it 'optimizes the classic filter' do
      text = [116, 104, 101, 114, 101, 32, 769, 115, 32, 115, 111, 109, 101, 116, 104, 105, 110, 103].pack("U*")
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        classic_filter: true
      )
-     expect(pt.tokenize).to eq(%w(there something))
+     expect(pt.tokenize(text)).to eq(%w(there something))
    end
  end

@@ -733,10 +680,9 @@ describe PragmaticTokenizer do
    it 'tokenizes a string #001' do
      text = "Hello Ms. Piggy, this is John. We are selling a new fridge for $5,000. That is a 20% discount over the Nev. retailers. It is a 'MUST BUY', so don't hesistate."
      pt = PragmaticTokenizer::Tokenizer.new(
-       text,
        language: 'en'
      )
-     expect(pt.tokenize).to eq(["hello", "ms.", "piggy", ",", "this", "is", "john", ".", "we", "are", "selling", "a", "new", "fridge", "for", "$5,000", ".", "that", "is", "a", "20%", "discount", "over", "the", "nev.", "retailers", ".", "it", "is", "a", "'", "must", "buy", "'", ",", "so", "don't", "hesistate", "."])
+     expect(pt.tokenize(text)).to eq(["hello", "ms.", "piggy", ",", "this", "is", "john", ".", "we", "are", "selling", "a", "new", "fridge", "for", "$5,000", ".", "that", "is", "a", "20%", "discount", "over", "the", "nev.", "retailers", ".", "it", "is", "a", "'", "must", "buy", "'", ",", "so", "don't", "hesistate", "."])
    end

    it 'tokenizes a string #002' do
@@ -751,10 +697,9 @@ describe PragmaticTokenizer do
  Says Ms. Raines, \'[The judgement] confirms our concern that the absence of
  patent lawyers on the court could prove troublesome.\'"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: 'en'
  )
- expect(pt.tokenize).to eq(['lisa', 'raines', ',', 'a', 'lawyer', 'and', 'director', 'of', 'government', 'relations', 'for', 'the', 'industrial', 'biotechnical', 'association', ',', 'contends', 'that', 'a', 'judge', 'well-versed', 'in', 'patent', 'law', 'and', 'the', 'concerns', 'of', 'research-based', 'industries', 'would', 'have', 'ruled', 'otherwise', '.', 'and', 'judge', 'newman', ',', 'a', 'former', 'patent', 'lawyer', ',', 'wrote', 'in', 'her', 'dissent', 'when', 'the', 'court', 'denied', 'a', 'motion', 'for', 'a', 'rehearing', 'of', 'the', 'case', 'by', 'the', 'full', 'court', ',', "\'", 'the', "panel's", 'judicial', 'legislation', 'has', 'affected', 'an', 'important', 'high-technological', 'industry', ',', 'without', 'regard', 'to', 'the', 'consequences', 'for', 'research', 'and', 'innovation', 'or', 'the', 'public', 'interest', '.', '\'', 'says', 'ms.', 'raines', ',', '\'', '[', 'the', 'judgement', ']', 'confirms', 'our', 'concern', 'that', 'the', 'absence', 'of', 'patent', 'lawyers', 'on', 'the', 'court', 'could', 'prove', 'troublesome', '.', "\'"])
+ expect(pt.tokenize(text)).to eq(['lisa', 'raines', ',', 'a', 'lawyer', 'and', 'director', 'of', 'government', 'relations', 'for', 'the', 'industrial', 'biotechnical', 'association', ',', 'contends', 'that', 'a', 'judge', 'well-versed', 'in', 'patent', 'law', 'and', 'the', 'concerns', 'of', 'research-based', 'industries', 'would', 'have', 'ruled', 'otherwise', '.', 'and', 'judge', 'newman', ',', 'a', 'former', 'patent', 'lawyer', ',', 'wrote', 'in', 'her', 'dissent', 'when', 'the', 'court', 'denied', 'a', 'motion', 'for', 'a', 'rehearing', 'of', 'the', 'case', 'by', 'the', 'full', 'court', ',', "\'", 'the', "panel's", 'judicial', 'legislation', 'has', 'affected', 'an', 'important', 'high-technological', 'industry', ',', 'without', 'regard', 'to', 'the', 'consequences', 'for', 'research', 'and', 'innovation', 'or', 'the', 'public', 'interest', '.', '\'', 'says', 'ms.', 'raines', ',', '\'', '[', 'the', 'judgement', ']', 'confirms', 'our', 'concern', 'that', 'the', 'absence', 'of', 'patent', 'lawyers', 'on', 'the', 'court', 'could', 'prove', 'troublesome', '.', "\'"])
  end
  end
@@ -762,64 +707,57 @@ describe PragmaticTokenizer do
  it 'tokenizes a string #001' do
  text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  numbers: :all
  )
- expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
+ expect(pt.tokenize(text)).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
  end

  it 'tokenizes a string #002' do
  text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  numbers: :none
  )
- expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "dollars", ".", "you", "can", "pay", "at", ",", "after", "it", "is", "."])
+ expect(pt.tokenize(text)).to eq(["hello", ",", "that", "will", "be", "dollars", ".", "you", "can", "pay", "at", ",", "after", "it", "is", "."])
  end

  it 'tokenizes a string #003' do
  text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  numbers: :semi
  )
- expect(pt.tokenize).to eq(["2pac", "u2", "50cent", "blink-182", "$500", "zero7", "m83", "b-52s"])
+ expect(pt.tokenize(text)).to eq(["2pac", "u2", "50cent", "blink-182", "$500", "zero7", "m83", "b-52s"])
  end

  it 'tokenizes a string #004' do
  text = "2pac U2 50cent blink-182 zero7 M83 B-52s 500 Hello"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  numbers: :only
  )
- expect(pt.tokenize).to eq(["2pac", "u2", "50cent", "blink-182", "zero7", "m83", "b-52s", "500"])
+ expect(pt.tokenize(text)).to eq(["2pac", "u2", "50cent", "blink-182", "zero7", "m83", "b-52s", "500"])
  end

  it 'tokenizes a string #005' do
  text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  numbers: :none
  )
- expect(pt.tokenize).to eq([])
+ expect(pt.tokenize(text)).to eq([])
  end

  it 'tokenizes a string #005' do
  text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500 number iv VI"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  numbers: :none
  )
- expect(pt.tokenize).to eq(["number"])
+ expect(pt.tokenize(text)).to eq(["number"])
  end

  it 'tokenizes a string #006' do
  text = "Remove III Roman Numerals and IX. with a period."
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  numbers: :none
  )
- expect(pt.tokenize).to eq(["remove", "roman", "numerals", "and", ".", "with", "a", "period", "."])
+ expect(pt.tokenize(text)).to eq(["remove", "roman", "numerals", "and", ".", "with", "a", "period", "."])
  end
  end
 
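The four numbers modes exercised above, side by side on the fixture from specs #003-#005 (the comments summarize the expectations in this hunk; the :all line over this particular fixture is an inference):

    text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"
    PragmaticTokenizer::Tokenizer.new(numbers: :all).tokenize(text)   # keeps every token
    PragmaticTokenizer::Tokenizer.new(numbers: :semi).tokenize(text)  # drops the bare "500"
    PragmaticTokenizer::Tokenizer.new(numbers: :only).tokenize(text)  # keeps only tokens containing a number
    PragmaticTokenizer::Tokenizer.new(numbers: :none).tokenize(text)  # => [] (Roman numerals count too, per #005/#006)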
@@ -827,10 +765,9 @@ describe PragmaticTokenizer do
  it 'tokenizes a string #001' do
  text = "Let's test the minimum length of fiver."
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  minimum_length: 5
  )
- expect(pt.tokenize).to eq(["let's", "minimum", "length", "fiver"])
+ expect(pt.tokenize(text)).to eq(["let's", "minimum", "length", "fiver"])
  end
  end
 
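minimum_length drops any token shorter than the given number of characters; the spec above as a standalone call (input and output copied verbatim):

    pt = PragmaticTokenizer::Tokenizer.new(minimum_length: 5)
    pt.tokenize("Let's test the minimum length of fiver.")
    # => ["let's", "minimum", "length", "fiver"]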
@@ -838,291 +775,259 @@ describe PragmaticTokenizer do
  it 'tokenizes a string #001' do
  text = "kath. / evang"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(%w(kath evang))
+ expect(pt.tokenize(text)).to eq(%w(kath evang))
  end

  it 'tokenizes a string #002' do
  text = "derStandard.at › Sport"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["derstandard.at", "sport"])
+ expect(pt.tokenize(text)).to eq(["derstandard.at", "sport"])
  end

  it 'tokenizes a string #003' do
  text = "hello ^^"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["hello"])
+ expect(pt.tokenize(text)).to eq(["hello"])
  end

  it 'tokenizes a string #004' do
  text = "This hyphen – is not...or is it? ... It's a - dash... And a horizontal ellipsis…"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["this", "hyphen", "is", "not", "or", "is", "it", "it's", "a", "dash", "and", "a", "horizontal", "ellipsis"])
+ expect(pt.tokenize(text)).to eq(["this", "hyphen", "is", "not", "or", "is", "it", "it's", "a", "dash", "and", "a", "horizontal", "ellipsis"])
  end

  it 'tokenizes a string #005' do
  text = "A sentence. One with two dots.. And with three... Or horizontal ellipsis… which are three dots too."
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(%w(a sentence one with two dots and with three or horizontal ellipsis which are three dots too))
+ expect(pt.tokenize(text)).to eq(%w(a sentence one with two dots and with three or horizontal ellipsis which are three dots too))
  end

  it 'tokenizes a string #006' do
  text = "+++ BREAKING +++ something happened; is it interesting?"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(%w(breaking something happened is it interesting))
+ expect(pt.tokenize(text)).to eq(%w(breaking something happened is it interesting))
  end

  it 'tokenizes a string #007' do
  text = "Some *interesting stuff* is __happening here__"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["some", "*interesting", "stuff*", "is", "__happening", "here__"])
+ expect(pt.tokenize(text)).to eq(["some", "*interesting", "stuff*", "is", "__happening", "here__"])
  end

  it 'tokenizes a string #008' do
  text = "Hello; what is your: name @username **delete**"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["hello", "what", "is", "your", "name", "@username", "**delete**"])
+ expect(pt.tokenize(text)).to eq(["hello", "what", "is", "your", "name", "@username", "**delete**"])
  end

  it 'tokenizes a string #009' do
  text = "hello ;-) yes"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: :none
  )
- expect(pt.tokenize).to eq(%w(hello yes))
+ expect(pt.tokenize(text)).to eq(%w(hello yes))
  end

  it 'tokenizes a string #010' do
  text = "hello ;)"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["hello"])
+ expect(pt.tokenize(text)).to eq(["hello"])
  end

  it 'tokenizes a string #011' do
  text = "Hello ____________________ ."
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: :none
  )
- expect(pt.tokenize).to eq(["hello"])
+ expect(pt.tokenize(text)).to eq(["hello"])
  end

  it 'handles non-domain words with a dot 1' do
  text = "They were being helped.This is solidarity."
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(%w(they were being helped this is solidarity))
+ expect(pt.tokenize(text)).to eq(%w(they were being helped this is solidarity))
  end

  it 'handles non-domain words with a dot 2' do
  text = "picture was taken in sept.2015"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["picture", "was", "taken", "in", "sept.", "2015"])
+ expect(pt.tokenize(text)).to eq(["picture", "was", "taken", "in", "sept.", "2015"])
  end

  it 'handles non-domain words with a dot 3' do
  text = "They were being helped.This is solidarity. See the breaking news stories about X on cnn.com/europe and english.alarabiya.net, here’s a screenshot: https://t.co/s83k28f29d31s83"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["they", "were", "being", "helped", "this", "is", "solidarity", "see", "the", "breaking", "news", "stories", "about", "x", "on", "cnn.com", "europe", "and", "english.alarabiya.net", "here’s", "a", "screenshot", "https://t.co/s83k28f29d31s83"])
+ expect(pt.tokenize(text)).to eq(["they", "were", "being", "helped", "this", "is", "solidarity", "see", "the", "breaking", "news", "stories", "about", "x", "on", "cnn.com", "europe", "and", "english.alarabiya.net", "here’s", "a", "screenshot", "https://t.co/s83k28f29d31s83"])
  end

  it 'handles numbers with symbols 1' do
  text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
+ expect(pt.tokenize(text)).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
  end

  it 'handles numbers with symbols 2' do
  text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
+ expect(pt.tokenize(text)).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
  end

  it 'handles apostrophes and quotes' do
  text = "“Data Visualization: How to Tell Stories with Data — Jeff Korhan” by @AINewsletter"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["data", "visualization", "how", "to", "tell", "stories", "with", "data", "jeff", "korhan", "by", "@ainewsletter"])
+ expect(pt.tokenize(text)).to eq(["data", "visualization", "how", "to", "tell", "stories", "with", "data", "jeff", "korhan", "by", "@ainewsletter"])
  end

  it 'handles mentions' do
  text = ".@someone I disagree"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["@someone", "i", "disagree"])
+ expect(pt.tokenize(text)).to eq(["@someone", "i", "disagree"])
  end

  it 'handles old school emoticons 2' do
  text = "oooh! <3"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["oooh", "<3"])
+ expect(pt.tokenize(text)).to eq(["oooh", "<3"])
  end

  it 'handles old school emoticons 3' do
  text = "@someone &lt;33"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["@someone", "<33"])
+ expect(pt.tokenize(text)).to eq(["@someone", "<33"])
  end

  it 'handles words with a symbol prefix 1' do
  text = "Yes! /cc @someone"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["yes", "cc", "@someone"])
+ expect(pt.tokenize(text)).to eq(["yes", "cc", "@someone"])
  end

  it 'handles words with an emoji suffix' do
  text = "Let's meet there.😝 ok?"
1029
945
  pt = PragmaticTokenizer::Tokenizer.new(
1030
- text,
1031
946
  punctuation: 'none'
1032
947
  )
1033
- expect(pt.tokenize).to eq(["let's", "meet", "there", "😝", "ok"])
948
+ expect(pt.tokenize(text)).to eq(["let's", "meet", "there", "😝", "ok"])
1034
949
  end
1035
950
 
1036
951
  it 'handles words with a symbol prefix 2' do
1037
952
  text = "blah blah |photo by @someone"
1038
953
  pt = PragmaticTokenizer::Tokenizer.new(
1039
- text,
1040
954
  punctuation: 'none'
1041
955
  )
1042
- expect(pt.tokenize).to eq(["blah", "blah", "photo", "by", "@someone"])
956
+ expect(pt.tokenize(text)).to eq(["blah", "blah", "photo", "by", "@someone"])
1043
957
  end
1044
958
 
1045
959
  it 'handles pseudo-contractions' do
1046
960
  text = "I suggest to buy stocks that are low value+have momentum"
1047
961
  pt = PragmaticTokenizer::Tokenizer.new(
1048
- text,
1049
962
  punctuation: 'none'
1050
963
  )
1051
- expect(pt.tokenize).to eq(%w(i suggest to buy stocks that are low value have momentum))
964
+ expect(pt.tokenize(text)).to eq(%w(i suggest to buy stocks that are low value have momentum))
1052
965
  end
1053
966
 
1054
967
  it 'handles apostrophes and quotes 1' do
1055
968
  text = "Watch the video of @amandapalmer's song “Killing Type” here"
1056
969
  pt = PragmaticTokenizer::Tokenizer.new(
1057
- text,
1058
970
  punctuation: 'none'
1059
971
  )
1060
- expect(pt.tokenize).to eq(["watch", "the", "video", "of", "@amandapalmer's", "song", "killing", "type", "here"])
972
+ expect(pt.tokenize(text)).to eq(["watch", "the", "video", "of", "@amandapalmer's", "song", "killing", "type", "here"])
1061
973
  end
1062
974
 
1063
975
  it 'handles apostrophes and quotes 2' do
1064
976
  text = "Watch the video of @amandapalmer`s song “Killing Type” here"
1065
977
  pt = PragmaticTokenizer::Tokenizer.new(
1066
- text,
1067
978
  punctuation: 'none'
1068
979
  )
1069
- expect(pt.tokenize).to eq(["watch", "the", "video", "of", "@amandapalmer`s", "song", "killing", "type", "here"])
980
+ expect(pt.tokenize(text)).to eq(["watch", "the", "video", "of", "@amandapalmer`s", "song", "killing", "type", "here"])
1070
981
  end
1071
982
 
1072
983
  it 'handles numbers suffixed with a symbol' do
1073
984
  text = "4 Things Marketers Must Do Better in 2016: blah"
1074
985
  pt = PragmaticTokenizer::Tokenizer.new(
1075
- text,
1076
986
  punctuation: 'none'
1077
987
  )
1078
- expect(pt.tokenize).to eq(%w(4 things marketers must do better in 2016 blah))
988
+ expect(pt.tokenize(text)).to eq(%w(4 things marketers must do better in 2016 blah))
1079
989
  end
1080
990
 
1081
991
  it 'handles words with an emoticon suffix' do
  skip "NOT IMPLEMENTED"
1083
993
  text = "look, a dog with shoes☺ !!"
1084
994
  pt = PragmaticTokenizer::Tokenizer.new(
1085
- text,
1086
995
  punctuation: 'none'
1087
996
  )
1088
- expect(pt.tokenize).to eq(["look", "a", "dog", "with", "shoes", "☺"])
997
+ expect(pt.tokenize(text)).to eq(["look", "a", "dog", "with", "shoes", "☺"])
1089
998
  end
1090
999
 
1091
1000
  it 'handles emoji 1' do
1092
1001
  text = "How bad!😝"
1093
1002
  pt = PragmaticTokenizer::Tokenizer.new(
1094
- text,
1095
1003
  punctuation: 'none'
1096
1004
  )
1097
- expect(pt.tokenize).to eq(["how", "bad", "😝"])
1005
+ expect(pt.tokenize(text)).to eq(["how", "bad", "😝"])
1098
1006
  end
1099
1007
 
1100
1008
  it 'handles emoji 2' do
1101
1009
  text = "😝How bad!"
1102
1010
  pt = PragmaticTokenizer::Tokenizer.new(
1103
- text,
1104
1011
  punctuation: 'none'
1105
1012
  )
1106
- expect(pt.tokenize).to eq(["😝", "how", "bad"])
1013
+ expect(pt.tokenize(text)).to eq(["😝", "how", "bad"])
1107
1014
  end
1108
1015
 
1109
1016
  it 'identifies old school emoticons' do
1110
1017
  skip "NOT IMPLEMENTED"
1111
1018
  text = 'looking forward to the new kodak super8 camera \o/'
1112
1019
  pt = PragmaticTokenizer::Tokenizer.new(
1113
- text,
1114
1020
  punctuation: 'none'
1115
1021
  )
1116
- expect(pt.tokenize).to eq(["looking", "forward", "to", "the", "new", "kodak", "super8", "camera", '\o/'])
1022
+ expect(pt.tokenize(text)).to eq(["looking", "forward", "to", "the", "new", "kodak", "super8", "camera", '\o/'])
1117
1023
  end
1118
1024
 
1119
1025
  it 'splits at hashtags' do
1120
1026
  text = "some sentence#RT ... i like u2.#bono"
1121
1027
  pt = PragmaticTokenizer::Tokenizer.new(
1122
- text,
1123
1028
  punctuation: :none
1124
1029
  )
1125
- expect(pt.tokenize).to eq(["some", "sentence", "#rt", "i", "like", "u2", "#bono"])
1030
+ expect(pt.tokenize(text)).to eq(["some", "sentence", "#rt", "i", "like", "u2", "#bono"])
1126
1031
  end
1127
1032
  end
1128
1033
 
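All of the specs above use punctuation: 'none'; the next context also exercises 'only'. A sketch of the contrast (these exact outputs are inferred from the surrounding expectations, not copied from a spec):

    text = "Hello, world!"
    PragmaticTokenizer::Tokenizer.new(punctuation: :none).tokenize(text)  # => ["hello", "world"]
    PragmaticTokenizer::Tokenizer.new(punctuation: :only).tokenize(text)  # => [",", "!"]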
@@ -1130,46 +1035,42 @@ describe PragmaticTokenizer do
  it 'removes stop words' do
  text = 'This is a short sentence with explanations and stop words.'
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: 'en',
  remove_stop_words: true
  )
- expect(pt.tokenize).to eq(["short", "sentence", "explanations", "."])
+ expect(pt.tokenize(text)).to eq(["short", "sentence", "explanations", "."])
  end

  it 'removes user-supplied stop words' do
  text = 'This is a short sentence with explanations and stop words.'
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: 'en',
  remove_stop_words: true,
  stop_words: %w(and a)
  )
- expect(pt.tokenize).to eq(["this", "is", "short", "sentence", "with", "explanations", "stop", "words", "."])
+ expect(pt.tokenize(text)).to eq(["this", "is", "short", "sentence", "with", "explanations", "stop", "words", "."])
  end

  it 'removes user-supplied stop words and default stop words' do
  text = 'This is a short sentence with explanations and stop words.'
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: 'en',
  remove_stop_words: true,
  stop_words: ["sentence"],
  filter_languages: [:en]
  )
- expect(pt.tokenize).to eq(["short", "explanations", "."])
+ expect(pt.tokenize(text)).to eq(["short", "explanations", "."])
  end

  it 'removes user-supplied stop words and default stop words across multiple languages' do
  text = 'This is a short sentence with explanations and stop words. And achte German words.'
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: 'en',
  remove_stop_words: true,
  stop_words: ["sentence"],
  filter_languages: [:en, :de]
  )
- expect(pt.tokenize).to eq(["short", "explanations", ".", "german", "."])
+ expect(pt.tokenize(text)).to eq(["short", "explanations", ".", "german", "."])
  end
  end
 
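For reference, the last spec above combines all three stop-word options; as a standalone call in the new 2.2.0 shape (input and output copied verbatim from that spec):

    pt = PragmaticTokenizer::Tokenizer.new(
      language:          'en',
      remove_stop_words: true,
      stop_words:        ["sentence"],
      filter_languages:  [:en, :de]
    )
    pt.tokenize('This is a short sentence with explanations and stop words. And achte German words.')
    # => ["short", "explanations", ".", "german", "."]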
@@ -1177,49 +1078,44 @@ describe PragmaticTokenizer do
  it 'tokenizes a string #001' do
  text = 'His name is Mr. Smith.'
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: 'en',
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith'])
+ expect(pt.tokenize(text)).to eq(['his', 'name', 'is', 'mr.', 'smith'])
  end

  it 'tokenizes a string #002' do
  text = "Hello Ms. Piggy, this is John. We are selling a new fridge for $5,000. That is a 20% discount over the Nev. retailers. It is a 'MUST BUY', so don't hesistate."
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: 'en',
  punctuation: 'only'
  )
- expect(pt.tokenize).to eq([",", ".", ".", ".", "'", "'", ",", "."])
+ expect(pt.tokenize(text)).to eq([",", ".", ".", ".", "'", "'", ",", "."])
  end

  it 'tokenizes a string #003' do
  text = "Hello the a it experiment one fine."
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: 'en',
  remove_stop_words: true
  )
- expect(pt.tokenize).to eq(["experiment", "fine", "."])
+ expect(pt.tokenize(text)).to eq(["experiment", "fine", "."])
  end

  it 'tokenizes a string #004' do
  # https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
  text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  expand_contractions: true,
  remove_stop_words: true,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(%w(crazy sandowsky afford))
+ expect(pt.tokenize(text)).to eq(%w(crazy sandowsky afford))
  end

  it 'tokenizes a string #005' do
  text = "Hello world with a stop word experiment."
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: 'en',
  clean: true,
  numbers: :none,
@@ -1228,64 +1124,58 @@ describe PragmaticTokenizer do
  remove_stop_words: true,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["experiment"])
+ expect(pt.tokenize(text)).to eq(["experiment"])
  end

  it 'tokenizes a string #006' do
  text = "Hello; what is your: name @username **delete**"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  clean: true,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["hello", "what", "is", "your", "name", "@username", "delete"])
+ expect(pt.tokenize(text)).to eq(["hello", "what", "is", "your", "name", "@username", "delete"])
  end

  it 'tokenizes a string #007' do
  text = 'His name is Mr. Smith.'
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: 'en',
  punctuation: 'none',
  downcase: false
  )
- expect(pt.tokenize).to eq(['His', 'name', 'is', 'Mr.', 'Smith'])
+ expect(pt.tokenize(text)).to eq(['His', 'name', 'is', 'Mr.', 'Smith'])
  end

  it 'tokenizes a string #008' do
  text = "Can't go tonight. Didn't finish."
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  downcase: false,
  expand_contractions: true
  )
- expect(pt.tokenize).to eq(["Cannot", "go", "tonight", ".", "Did", "not", "finish", "."])
+ expect(pt.tokenize(text)).to eq(["Cannot", "go", "tonight", ".", "Did", "not", "finish", "."])
  end

  it 'tokenizes a string #009' do
  text = "Some *interesting stuff* is __happening here__"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none',
  clean: true
  )
- expect(pt.tokenize).to eq(%w(some interesting stuff is happening here))
+ expect(pt.tokenize(text)).to eq(%w(some interesting stuff is happening here))
  end

  it 'also allows symbols for options' do
  text = 'His name is Mr. Smith.'
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: :en,
  punctuation: :none
  )
- expect(pt.tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith'])
+ expect(pt.tokenize(text)).to eq(['his', 'name', 'is', 'mr.', 'smith'])
  end

  it 'handles long strings 1' do
  text = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
1287
1178
  pt = PragmaticTokenizer::Tokenizer.new(
1288
- text,
1289
1179
  language: 'en',
1290
1180
  clean: true,
1291
1181
  minimum_length: 3,
@@ -1294,13 +1184,12 @@ describe PragmaticTokenizer do
  numbers: :none,
  punctuation: :none
  )
- expect(pt.tokenize).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"])
+ expect(pt.tokenize(text)).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"])
  end

  it 'handles long strings 2' do
  text = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 10
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: 'en',
  clean: true,
  minimum_length: 3,
@@ -1309,23 +1198,21 @@ describe PragmaticTokenizer do
  numbers: :none,
  punctuation: :none
  )
- expect(pt.tokenize).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"] * 10)
+ expect(pt.tokenize(text)).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"] * 10)
  end

  it 'handles markdown' do
  text = "This is _bold_ and this is *italic*"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none',
  clean: true
  )
- expect(pt.tokenize).to eq(%w(this is bold and this is italic))
+ expect(pt.tokenize(text)).to eq(%w(this is bold and this is italic))
  end

  it 'handles single quotes' do
  text = "Recognised as one of the ‘good’ games."
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: 'en',
  clean: true,
  numbers: :none,
@@ -1334,113 +1221,103 @@ describe PragmaticTokenizer do
  remove_stop_words: true,
  punctuation: :none,
  downcase: true)
- expect(pt.tokenize).to eq(%w(recognised good games))
+ expect(pt.tokenize(text)).to eq(%w(recognised good games))
  end

  it 'removes control characters' do
  text = "\u0000 \u001F \u007FHello test."
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  language: 'en',
  clean: true
  )
- expect(pt.tokenize).to eq(["hello", "test", "."])
+ expect(pt.tokenize(text)).to eq(["hello", "test", "."])
  end

  it 'splits too-long words with hyphens' do
  text = "hi-hat and old-school but not really-important-long-word"
1352
1238
  pt = PragmaticTokenizer::Tokenizer.new(
1353
- text,
1354
1239
  punctuation: 'none',
1355
1240
  long_word_split: 12
1356
1241
  )
1357
- expect(pt.tokenize).to eq(["hi-hat", "and", "old-school", "but", "not", "really", "important", "long", "word"])
1242
+ expect(pt.tokenize(text)).to eq(["hi-hat", "and", "old-school", "but", "not", "really", "important", "long", "word"])
1358
1243
  end
1359
1244
 
1360
1245
  it 'handles hashtags 2' do
1361
1246
  text = "This is the #upper-#limit"
1362
1247
  pt = PragmaticTokenizer::Tokenizer.new(
1363
- text,
1364
1248
  punctuation: 'none',
1365
1249
  hashtags: :keep_and_clean
1366
1250
  )
1367
- expect(pt.tokenize).to eq(%w(this is the upper limit))
1251
+ expect(pt.tokenize(text)).to eq(%w(this is the upper limit))
1368
1252
  end
1369
1253
 
1370
1254
  it 'handles hashtags 3' do
1371
1255
  text = "The #2016-fun has just begun."
1372
1256
  pt = PragmaticTokenizer::Tokenizer.new(
1373
- text,
1374
1257
  punctuation: 'none',
1375
1258
  hashtags: :keep_and_clean
1376
1259
  )
1377
- expect(pt.tokenize).to eq(%w(the 2016 fun has just begun))
1260
+ expect(pt.tokenize(text)).to eq(%w(the 2016 fun has just begun))
1378
1261
  end
1379
1262
 
1380
1263
  it 'does not clean mentions' do
1381
1264
  text = "@_someone_ because @someone and @_someone was taken"
1382
1265
  pt = PragmaticTokenizer::Tokenizer.new(
1383
- text,
1384
1266
  mentions: :keep_original,
1385
1267
  clean: true
1386
1268
  )
1387
- expect(pt.tokenize).to eq(["@_someone_", "because", "@someone", "and", "@_someone", "was", "taken"])
1269
+ expect(pt.tokenize(text)).to eq(["@_someone_", "because", "@someone", "and", "@_someone", "was", "taken"])
1388
1270
  end
1389
1271
 
1390
1272
  it 'removes double single quotes' do
1391
1273
  text = "Strong statement in ''The Day The Earth Caught Fire'' (1961)"
1392
1274
  pt = PragmaticTokenizer::Tokenizer.new(
1393
- text,
1394
1275
  punctuation: :none,
1395
1276
  clean: true
1396
1277
  )
1397
- expect(pt.tokenize).to eq(%w(strong statement in the day the earth caught fire 1961))
1278
+ expect(pt.tokenize(text)).to eq(%w(strong statement in the day the earth caught fire 1961))
1398
1279
  end
1399
1280
 
1400
1281
  it 'removes a hyphen prefix 1' do
1401
1282
  text = "Geopol.-Strategy"
1402
1283
  pt = PragmaticTokenizer::Tokenizer.new(
1403
- text,
1404
1284
  punctuation: :none,
1405
1285
  clean: true
1406
1286
  )
1407
- expect(pt.tokenize).to eq(%w(geopol strategy))
1287
+ expect(pt.tokenize(text)).to eq(%w(geopol strategy))
1408
1288
  end
1409
1289
 
1410
1290
  it 'removes a hyphen prefix 2' do
1411
1291
  text = "The language we use creates the reality we experience.-Michael Hyatt #quote"
1412
1292
  pt = PragmaticTokenizer::Tokenizer.new(
1413
- text,
1414
1293
  punctuation: :none,
1415
1294
  clean: true
1416
1295
  )
1417
- expect(pt.tokenize).to eq(["the", "language", "we", "use", "creates", "the", "reality", "we", "experience", "michael", "hyatt", "#quote"])
1296
+ expect(pt.tokenize(text)).to eq(["the", "language", "we", "use", "creates", "the", "reality", "we", "experience", "michael", "hyatt", "#quote"])
1418
1297
  end
1419
1298
 
1420
1299
  it 'does not remove tokens with ampersands' do
1421
1300
  text = "you&amp;me"
1422
1301
  pt = PragmaticTokenizer::Tokenizer.new(
1423
- text,
1424
1302
  clean: true,
1425
1303
  punctuation: :none
1426
1304
  )
1427
- expect(pt.tokenize).to eq(%w(you me))
1305
+ expect(pt.tokenize(text)).to eq(%w(you me))
1428
1306
  end
1429
1307
 
1430
1308
  it 'cleans percent signs not related to numbers' do
1431
1309
  text = "TudoW%1 provides company users a way to offer each other, and guests, and interpreters%6 free assistance. To date, there have been %2 questions asked."
1432
1310
  pt = PragmaticTokenizer::Tokenizer.new(
1433
- text,
1434
1311
  clean: true,
1435
1312
  numbers: :none,
1436
1313
  punctuation: :none
1437
1314
  )
1438
- expect(pt.tokenize).to eq(%w(tudow provides company users a way to offer each other and guests and interpreters free assistance to date there have been questions asked))
1315
+ expect(pt.tokenize(text)).to eq(%w(tudow provides company users a way to offer each other and guests and interpreters free assistance to date there have been questions asked))
1439
1316
  end
1440
1317
 
1441
1318
  it 'removes non-breaking spaces' do
1442
1319
  text = "%20141201~221624 %User ID,JU,JU John %TU=00000362 %PT-BR %Wordfast    da hello."
1443
- pt = PragmaticTokenizer::Tokenizer.new(text,
1320
+ pt = PragmaticTokenizer::Tokenizer.new(
1444
1321
  language: :en,
1445
1322
  filter_languages: [:en],
1446
1323
  clean: true,
@@ -1456,12 +1333,12 @@ describe PragmaticTokenizer do
  mentions: :remove,
  downcase: true
  )
- expect(pt.tokenize).to eq(["user", "john", "pt-br", "wordfast"])
+ expect(pt.tokenize(text)).to eq(["user", "john", "pt-br", "wordfast"])
  end

- it 'removes hyphens' do
- text = "princípio da legalidade;&tA; EN-US Therefore, ANEEL - though still"
- pt = PragmaticTokenizer::Tokenizer.new(text,
+ it 'removes non-breaking spaces' do
+ text = "%20141201~221624 %User ID,JU,JU John %TU=00000362 %PT-BR %Wordfast    da hello."
+ pt = PragmaticTokenizer::Tokenizer.new(
  language: :en,
  filter_languages: [:en],
  clean: true,
@@ -1477,225 +1354,222 @@ describe PragmaticTokenizer do
  mentions: :remove,
  downcase: true
  )
- expect(pt.tokenize).to eq(["princípio", "legalidade", "en-us", "aneel"])
+ expect(pt.tokenize(text)).to eq(["user", "john", "pt-br", "wordfast"])
  end
-
-
  end
  end
 
  context 'ending punctuation' do
  it 'handles ending question marks' do
  text = 'What is your name?'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["what", "is", "your", "name", "?"])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["what", "is", "your", "name", "?"])
  end

  it 'handles exclamation points' do
  text = 'You are the best!'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["you", "are", "the", "best", "!"])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["you", "are", "the", "best", "!"])
  end

  it 'handles periods' do
  text = 'This way a productive day.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "way", "a", "productive", "day", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["this", "way", "a", "productive", "day", "."])
  end

  it 'handles quotation marks' do
  text = "\"He is not the one you are looking for.\""
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["\"", "he", "is", "not", "the", "one", "you", "are", "looking", "for", ".", "\""])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["\"", "he", "is", "not", "the", "one", "you", "are", "looking", "for", ".", "\""])
  end

  it 'handles single quotation marks' do
  text = "'He is not the one you are looking for.'"
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["'", "he", "is", "not", "the", "one", "you", "are", "looking", "for", ".", "'"])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["'", "he", "is", "not", "the", "one", "you", "are", "looking", "for", ".", "'"])
  end

  it "handles single quotation marks ('twas)" do
  text = "'Twas the night before Christmas and 'twas cloudy."
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["'twas", "the", "night", "before", "christmas", "and", "'twas", "cloudy", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["'twas", "the", "night", "before", "christmas", "and", "'twas", "cloudy", "."])
  end

  it 'handles double quotes at the end of a sentence' do
  text = "She said, \"I love cake.\""
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["she", "said", ",", "\"", "i", "love", "cake", ".", "\""])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["she", "said", ",", "\"", "i", "love", "cake", ".", "\""])
  end

  it 'handles double quotes at the beginning of a sentence' do
  text = "\"I love cake.\", she said to her friend."
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["\"", "i", "love", "cake", ".", "\"", ",", "she", "said", "to", "her", "friend", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["\"", "i", "love", "cake", ".", "\"", ",", "she", "said", "to", "her", "friend", "."])
  end

  it 'handles double quotes in the middle of a sentence' do
  text = "She said, \"I love cake.\" to her friend."
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["she", "said", ",", "\"", "i", "love", "cake", ".", "\"", "to", "her", "friend", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["she", "said", ",", "\"", "i", "love", "cake", ".", "\"", "to", "her", "friend", "."])
  end
  end
 
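Every spec in the context above uses the zero-option defaults, under which punctuation survives as its own tokens; with the 2.2.0 API the no-argument constructor makes this the shortest possible call (input and output copied from the first spec in the context):

    PragmaticTokenizer::Tokenizer.new.tokenize("What is your name?")
    # => ["what", "is", "your", "name", "?"]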
  context 'other punctuation' do
  it 'handles ellipses' do
1536
1411
  text = 'Today is the last day...'
1537
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['today', 'is', 'the', 'last', 'day', '...'])
1412
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(['today', 'is', 'the', 'last', 'day', '...'])
1538
1413
  end
1539
1414
 
1540
1415
  it 'handles special quotes' do
1541
1416
  text = "«That's right», he said."
1542
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["«", "that's", "right", "»", ",", "he", "said", "."])
1417
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["«", "that's", "right", "»", ",", "he", "said", "."])
1543
1418
  end
1544
1419
 
1545
1420
  it 'handles upside down punctuation (¿)' do
1546
1421
  text = "¿Really?"
1547
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["¿", "really", "?"])
1422
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["¿", "really", "?"])
1548
1423
  end
1549
1424
 
1550
1425
  it 'handles upside down punctuation (¡)' do
1551
1426
  text = "¡Really!"
1552
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["¡", "really", "!"])
1427
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["¡", "really", "!"])
1553
1428
  end
1554
1429
 
1555
1430
  it 'handles colons' do
1556
1431
  text = "This was the news: 'Today is the day!'"
1557
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "was", "the", "news", ":", "'", "today", "is", "the", "day", "!", "'"])
1432
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["this", "was", "the", "news", ":", "'", "today", "is", "the", "day", "!", "'"])
1558
1433
  end
1559
1434
 
1560
1435
  it 'handles web addresses' do
1561
1436
  text = "Please visit the site - https://www.tm-town.com"
1562
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["please", "visit", "the", "site", "-", "https://www.tm-town.com"])
1437
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["please", "visit", "the", "site", "-", "https://www.tm-town.com"])
1563
1438
  end
1564
1439
 
1565
1440
  it 'handles multiple colons and web addresses' do
1566
1441
  text = "Please visit the site: https://www.tm-town.com"
1567
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["please", "visit", "the", "site", ":", "https://www.tm-town.com"])
1442
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["please", "visit", "the", "site", ":", "https://www.tm-town.com"])
1568
1443
  end
1569
1444
 
1570
1445
  it 'handles multiple dashes' do
1571
1446
  text = "John--here is your ticket."
1572
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["john", "-", "here", "is", "your", "ticket", "."])
1447
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["john", "-", "here", "is", "your", "ticket", "."])
1573
1448
  end
1574
1449
 
1575
1450
  it 'handles brackets' do
1576
1451
  text = "This is an array: ['Hello']."
1577
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "is", "an", "array", ":", "[", "'", "hello", "'", "]", "."])
1452
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["this", "is", "an", "array", ":", "[", "'", "hello", "'", "]", "."])
1578
1453
  end
1579
1454
 
1580
1455
  it 'handles double question marks' do
1581
1456
  text = "This is a question??"
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "is", "a", "question", "?", "?"])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["this", "is", "a", "question", "?", "?"])
  end
 
  it 'handles multiple ending punctuation' do
  text = "This is a question?!?"
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["this", "is", "a", "question", "?", "!", "?"])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["this", "is", "a", "question", "?", "!", "?"])
  end
 
  it 'handles contractions 1' do
  text = "How'd it go yesterday?"
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["how'd", "it", "go", "yesterday", "?"])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["how'd", "it", "go", "yesterday", "?"])
  end
 
  it 'handles contractions 2' do
  text = "You shouldn't worry."
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["you", "shouldn't", "worry", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["you", "shouldn't", "worry", "."])
  end
 
  it 'handles contractions 3' do
  text = "We've gone too far. It'll be over when we're done."
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["we've", "gone", "too", "far", ".", "it'll", "be", "over", "when", "we're", "done", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["we've", "gone", "too", "far", ".", "it'll", "be", "over", "when", "we're", "done", "."])
  end
 
  it 'handles numbers' do
  text = 'He paid $10,000,000 for the new house which is equivalent to ¥1,000,000,000.00.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['he', 'paid', '$10,000,000', 'for', 'the', 'new', 'house', 'which', 'is', 'equivalent', 'to', '¥1,000,000,000.00', '.'])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(['he', 'paid', '$10,000,000', 'for', 'the', 'new', 'house', 'which', 'is', 'equivalent', 'to', '¥1,000,000,000.00', '.'])
  end
 
  it 'follows the Chicago Manual of Style on punctuation' do
  text = 'An abbreviation that ends with a period must not be left hanging without it (in parentheses, e.g.), and a sentence containing a parenthesis must itself have terminal punctuation (are we almost done?).'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['an', 'abbreviation', 'that', 'ends', 'with', 'a', 'period', 'must', 'not', 'be', 'left', 'hanging', 'without', 'it', '(', 'in', 'parentheses', ',', 'e.g.', ')', ',', 'and', 'a', 'sentence', 'containing', 'a', 'parenthesis', 'must', 'itself', 'have', 'terminal', 'punctuation', '(', 'are', 'we', 'almost', 'done', '?', ')', '.'])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(['an', 'abbreviation', 'that', 'ends', 'with', 'a', 'period', 'must', 'not', 'be', 'left', 'hanging', 'without', 'it', '(', 'in', 'parentheses', ',', 'e.g.', ')', ',', 'and', 'a', 'sentence', 'containing', 'a', 'parenthesis', 'must', 'itself', 'have', 'terminal', 'punctuation', '(', 'are', 'we', 'almost', 'done', '?', ')', '.'])
  end
 
  it 'is case insensitive' do
  text = 'his name is mr. smith, king of the \'entire\' forest.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith', ',', 'king', 'of', 'the', '\'', 'entire', '\'', 'forest', '.'])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(['his', 'name', 'is', 'mr.', 'smith', ',', 'king', 'of', 'the', '\'', 'entire', '\'', 'forest', '.'])
  end
 
  it 'handles web url addresses #1' do
  text = 'Check out http://www.google.com/?this_is_a_url/hello-world.html for more info.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["check", "out", "http://www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["check", "out", "http://www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
  end
 
  it 'handles web url addresses #2' do
  text = 'Check out https://www.google.com/?this_is_a_url/hello-world.html for more info.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["check", "out", "https://www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["check", "out", "https://www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
  end
 
  it 'handles web url addresses #3' do
  text = 'Check out www.google.com/?this_is_a_url/hello-world.html for more info.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["check", "out", "www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["check", "out", "www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
  end
 
  it 'handles email addresses' do
  text = 'Please email example@example.com for more info.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["please", "email", "example@example.com", "for", "more", "info", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["please", "email", "example@example.com", "for", "more", "info", "."])
  end
 
  it 'handles empty tokens' do
  text = "!!!!! https://t.co/xxxx"
  pt = PragmaticTokenizer::Tokenizer.new(
- text,
  punctuation: 'none'
  )
- expect(pt.tokenize).to eq(["https://t.co/xxxx"])
+ expect(pt.tokenize(text)).to eq(["https://t.co/xxxx"])
  end
  end
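
Note: the one change running through all of the specs above is the 2.2.0 API. Configuration options now go to PragmaticTokenizer::Tokenizer.new, and the text is passed to #tokenize, so a single configured instance can be reused across many strings. A minimal sketch of the new call pattern (the sample strings and the outputs in the comments are illustrative, inferred from the behavior these specs assert):

  require 'pragmatic_tokenizer'

  # 2.2.0: options such as punctuation: 'none' belong to the constructor.
  pt = PragmaticTokenizer::Tokenizer.new(punctuation: 'none')

  # The text goes to #tokenize, so the same instance can be reused.
  pt.tokenize("Hello, world!!!")          # => ["hello", "world"]
  pt.tokenize("Visit https://t.co/xxxx")  # => ["visit", "https://t.co/xxxx"]
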
 
  context 'abbreviations' do
  it 'handles military abbreviations' do
  text = 'His name is Col. Smith.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["his", "name", "is", "col.", "smith", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["his", "name", "is", "col.", "smith", "."])
  end
 
  it 'handles institution abbreviations' do
  text = 'She went to East Univ. to get her degree.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["she", "went", "to", "east", "univ.", "to", "get", "her", "degree", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["she", "went", "to", "east", "univ.", "to", "get", "her", "degree", "."])
  end
 
  it 'handles company abbreviations' do
  text = 'He works at ABC Inc. on weekends.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["he", "works", "at", "abc", "inc.", "on", "weekends", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["he", "works", "at", "abc", "inc.", "on", "weekends", "."])
  end
 
  it 'handles old state abbreviations' do
  text = 'He went to school in Mass. back in the day.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["he", "went", "to", "school", "in", "mass.", "back", "in", "the", "day", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["he", "went", "to", "school", "in", "mass.", "back", "in", "the", "day", "."])
  end
 
  it 'handles month abbreviations' do
  text = 'It is cold in Jan. they say.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["it", "is", "cold", "in", "jan.", "they", "say", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["it", "is", "cold", "in", "jan.", "they", "say", "."])
  end
 
  it 'handles miscellaneous abbreviations' do
  text = '1, 2, 3, etc. is the beat.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['1', ',', '2', ',', '3', ',', 'etc.', 'is', 'the', 'beat', '.'])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(['1', ',', '2', ',', '3', ',', 'etc.', 'is', 'the', 'beat', '.'])
  end
 
  it 'handles one letter abbreviations (i.e. Alfred E. Stone)' do
  text = 'Alfred E. Stone is a person.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["alfred", "e.", "stone", "is", "a", "person", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["alfred", "e.", "stone", "is", "a", "person", "."])
  end
 
  it 'handles repeating letter-dot words (i.e. U.S.A. or J.C. Penney)' do
  text = 'The U.S.A. is a country.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["the", "u.s.a.", "is", "a", "country", "."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["the", "u.s.a.", "is", "a", "country", "."])
  end
 
  it 'handles abbreviations that occur at the end of a sentence' do
  text = 'He works at ABC Inc.'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(["he", "works", "at", "abc", "inc."])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["he", "works", "at", "abc", "inc."])
  end
 
  it 'handles punctuation after an abbreviation' do
  text = 'Exclamation point requires both marks (Q.E.D.!).'
- expect(PragmaticTokenizer::Tokenizer.new(text).tokenize).to eq(['exclamation', 'point', 'requires', 'both', 'marks', '(', 'q.e.d.', '!', ')', '.'])
+ expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(['exclamation', 'point', 'requires', 'both', 'marks', '(', 'q.e.d.', '!', ')', '.'])
  end
  end
  end
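
For reference, the default no-option tokenizer exercised throughout the abbreviation specs downcases its input and keeps abbreviations (as well as URLs and email addresses) intact as single tokens, trailing period included. A short sketch reusing one instance against two of the sentences tested above, with the exact outputs the specs assert:

  require 'pragmatic_tokenizer'

  pt = PragmaticTokenizer::Tokenizer.new
  pt.tokenize('His name is Col. Smith.')
  # => ["his", "name", "is", "col.", "smith", "."]
  pt.tokenize('The U.S.A. is a country.')
  # => ["the", "u.s.a.", "is", "a", "country", "."]
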