pragmatic_tokenizer 2.1.0 → 2.2.0
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +77 -13
- data/README.md +3 -3
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +2 -2
- data/lib/pragmatic_tokenizer/languages.rb +27 -26
- data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/catalan.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/common.rb +11 -11
- data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +4 -4
- data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/english.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/french.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/italian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/latvian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/russian.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/spanish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
- data/lib/pragmatic_tokenizer/post_processor.rb +11 -13
- data/lib/pragmatic_tokenizer/tokenizer.rb +195 -187
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +1 -1
- data/spec/languages/bulgarian_spec.rb +4 -8
- data/spec/languages/deutsch_spec.rb +25 -49
- data/spec/languages/english_spec.rb +238 -364
- data/spec/languages/french_spec.rb +1 -2
- data/spec/performance_spec.rb +15 -16
- metadata +4 -4
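Besides the unicode_case_converter dependency bump in the gemspec, the spec diffs below all reflect one API change in 2.2.0: the text to tokenize is no longer passed to the constructor, it is passed to `tokenize` itself, so the constructor receives only options. A minimal sketch of the two call styles (the sample string and the `language: 'en'` option are illustrative choices, not taken from the release notes):

    # 2.1.0 style - the text goes to the constructor and tokenize takes no arguments
    pt = PragmaticTokenizer::Tokenizer.new("Hello world.", language: 'en')
    pt.tokenize
    #=> ["hello", "world", "."]

    # 2.2.0 style - the constructor takes only options; the text is passed to tokenize,
    # so a configured tokenizer instance can presumably be reused across many strings
    pt = PragmaticTokenizer::Tokenizer.new(language: 'en')
    pt.tokenize("Hello world.")
    #=> ["hello", "world", "."]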
data/pragmatic_tokenizer.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
   spec.require_paths = ["lib"]

-  spec.add_runtime_dependency "unicode_case_converter", "~> 0
+  spec.add_runtime_dependency "unicode_case_converter", "~> 1.0"
   spec.add_development_dependency "bundler", "~> 1.9"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec"
data/spec/languages/bulgarian_spec.rb
CHANGED
@@ -5,41 +5,37 @@ describe PragmaticTokenizer do
     it 'tokenizes a string #001' do
       text = 'Стойностни, вкл. български и руски'
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           language: 'bg'
       )
-      expect(pt.tokenize).to eq(["стойностни", ",", "вкл.", "български", "и", "руски"])
+      expect(pt.tokenize(text)).to eq(["стойностни", ",", "вкл.", "български", "и", "руски"])
     end

     it 'tokenizes a string #002' do
       text = 'Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат.'
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           language: 'bg',
           remove_stop_words: true
       )
-      expect(pt.tokenize).to eq(["поставя", "началото", "могъща", "династия", ",", "управлява", "продължение", "150", "саргон", "надделява", "двубой", "владетеля", "град", "ур", "разширява", "териториите", "държавата", "долното", "течение", "тигър", "ефрат", "."])
+      expect(pt.tokenize(text)).to eq(["поставя", "началото", "могъща", "династия", ",", "управлява", "продължение", "150", "саргон", "надделява", "двубой", "владетеля", "град", "ур", "разширява", "териториите", "държавата", "долното", "течение", "тигър", "ефрат", "."])
     end

     it 'tokenizes a string #003' do
       text = 'Без български жертви в Париж.'
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           language: 'bg',
           remove_stop_words: true
       )
-      expect(pt.tokenize).to eq(["български", "жертви", "париж", "."])
+      expect(pt.tokenize(text)).to eq(["български", "жертви", "париж", "."])
     end

     it 'tokenizes a string #004' do
       text = 'Без български жертви в Париж.'
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           language: 'bg',
           remove_stop_words: true,
           downcase: false
       )
-      expect(pt.tokenize).to eq(["български", "жертви", "Париж", "."])
+      expect(pt.tokenize(text)).to eq(["български", "жертви", "Париж", "."])
     end
   end
 end
data/spec/languages/deutsch_spec.rb
CHANGED
@@ -4,217 +4,196 @@ describe PragmaticTokenizer do
   context 'Language: German (de)' do
     it 'tokenizes a string #001' do
       text = 'Das steht auf S. 23, s. vorherige Anmerkung.'
-      expect(PragmaticTokenizer::Tokenizer.new(
+      expect(PragmaticTokenizer::Tokenizer.new(language: 'de').tokenize(text)).to eq(['das', 'steht', 'auf', 's.', '23', ',', 's.', 'vorherige', 'anmerkung', '.'])
     end

     it 'tokenizes a string #002' do
       text = 'Die größte Ausdehnung des Landes vom Westen nach Osten beträgt 650 km – von Nord nach Süd sind es 560 km. Unter den europäischen Staaten ist Weißrussland flächenmäßig an 13'
       expect(PragmaticTokenizer::Tokenizer.new(
-          text,
           language: 'de',
           downcase: false,
           remove_stop_words: true,
           punctuation: 'none',
           numbers: :none
-      ).tokenize).to eq(%w(größte Ausdehnung Landes Westen Osten beträgt Nord Süd europäischen Staaten Weißrussland flächenmäßig))
+      ).tokenize(text)).to eq(%w(größte Ausdehnung Landes Westen Osten beträgt Nord Süd europäischen Staaten Weißrussland flächenmäßig))
     end

     it 'tokenizes a string #003' do
       text = 'Die weißrussischen offiziellen Stellen wie auch die deutsche Diplomatie verwenden in offiziellen deutschsprachigen Texten den Namen Belarus, um die Unterscheidung von Russland zu verdeutlichen.'
       expect(PragmaticTokenizer::Tokenizer.new(
-          text,
           language: 'de',
           downcase: false
-      ).tokenize).to eq(["Die", "weißrussischen", "offiziellen", "Stellen", "wie", "auch", "die", "deutsche", "Diplomatie", "verwenden", "in", "offiziellen", "deutschsprachigen", "Texten", "den", "Namen", "Belarus", ",", "um", "die", "Unterscheidung", "von", "Russland", "zu", "verdeutlichen", "."])
+      ).tokenize(text)).to eq(["Die", "weißrussischen", "offiziellen", "Stellen", "wie", "auch", "die", "deutsche", "Diplomatie", "verwenden", "in", "offiziellen", "deutschsprachigen", "Texten", "den", "Namen", "Belarus", ",", "um", "die", "Unterscheidung", "von", "Russland", "zu", "verdeutlichen", "."])
     end

     it 'tokenizes a string #004' do
       text = 'der Kaffee-Ersatz'
       expect(PragmaticTokenizer::Tokenizer.new(
-          text,
           language: 'de',
           downcase: false
-      ).tokenize).to eq(['der', 'Kaffee-Ersatz'])
+      ).tokenize(text)).to eq(['der', 'Kaffee-Ersatz'])
     end

     it 'tokenizes a string #005' do
       text = "Charlie Hebdo backlash over 'racist' Alan Kurdi cartoon - https://t.co/J8N2ylVV3w"
       expect(PragmaticTokenizer::Tokenizer.new(
-          text,
           language: 'de'
-      ).tokenize).to eq(["charlie", "hebdo", "backlash", "over", "'", "racist", "'", "alan", "kurdi", "cartoon", "-", "https://t.co/j8n2ylvv3w"])
+      ).tokenize(text)).to eq(["charlie", "hebdo", "backlash", "over", "'", "racist", "'", "alan", "kurdi", "cartoon", "-", "https://t.co/j8n2ylvv3w"])
     end

     it 'handles words with a slash 1' do
       text = "We pay 3000 €/month"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           language: 'de'
       )
-      expect(pt.tokenize).to eq(["we", "pay", "3000", "€", "month"])
+      expect(pt.tokenize(text)).to eq(["we", "pay", "3000", "€", "month"])
     end

     it 'handles words with a slash 2' do
       text = "Ich frage mich, wieso er nicht Herr der Lage war/ist."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           language: 'de'
       )
-      expect(pt.tokenize).to eq(%w(ich frage mich wieso er nicht herr der lage war ist))
+      expect(pt.tokenize(text)).to eq(%w(ich frage mich wieso er nicht herr der lage war ist))
     end

     it 'handles words with a slash 3' do
       text = "Poison gas attack in Ghuta/Syria."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           language: 'de'
       )
-      expect(pt.tokenize).to eq(%w(poison gas attack in ghuta syria))
+      expect(pt.tokenize(text)).to eq(%w(poison gas attack in ghuta syria))
     end

     it 'handles words with a question mark' do
       text = "Essen á la carte?Man ist versucht…"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           language: 'de'
       )
-      expect(pt.tokenize).to eq(%w(essen á la carte man ist versucht))
+      expect(pt.tokenize(text)).to eq(%w(essen á la carte man ist versucht))
     end

     it 'handles apostrophes and quotes 3' do
       text = "Die “Mitte der Gesellschaft” interessiert sich jetzt für “Feminismus”."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           language: 'de'
       )
-      expect(pt.tokenize).to eq(%w(die mitte der gesellschaft interessiert sich jetzt für feminismus))
+      expect(pt.tokenize(text)).to eq(%w(die mitte der gesellschaft interessiert sich jetzt für feminismus))
     end

     it 'handles mentions 1' do
       text = "@RainerSteinke @_Sternchen_2015 1:0 für dich."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           language: 'de'
       )
-      expect(pt.tokenize).to eq(["@rainersteinke", "@_sternchen_2015", "1:0", "für", "dich"])
+      expect(pt.tokenize(text)).to eq(["@rainersteinke", "@_sternchen_2015", "1:0", "für", "dich"])
     end

     it 'handles mentions 2' do
       text = "@LandauDaniel @AnthZeto @julianfranz @S_Beck19 Yep!"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           language: 'de'
       )
-      expect(pt.tokenize).to eq(["@landaudaniel", "@anthzeto", "@julianfranz", "@s_beck19", "yep"])
+      expect(pt.tokenize(text)).to eq(["@landaudaniel", "@anthzeto", "@julianfranz", "@s_beck19", "yep"])
     end

     it 'handles old school emoticons 1' do
       text = "du übertreibst maßlos :D"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           downcase: false,
           language: 'de'
       )
-      expect(pt.tokenize).to eq(["du", "übertreibst", "maßlos", ":D"])
+      expect(pt.tokenize(text)).to eq(["du", "übertreibst", "maßlos", ":D"])
     end

     it 'handles words with a symbol suffix' do
       text = "hier ist ein Whirlpool versteckt^^"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           language: 'de'
       )
-      expect(pt.tokenize).to eq(%w(hier ist ein whirlpool versteckt))
+      expect(pt.tokenize(text)).to eq(%w(hier ist ein whirlpool versteckt))
     end

     it 'handles hashtags 1' do
       text = "„Was wir tun wird in diesem Land Leben retten“:#Obama"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           language: 'de'
       )
-      expect(pt.tokenize).to eq(["was", "wir", "tun", "wird", "in", "diesem", "land", "leben", "retten", "#obama"])
+      expect(pt.tokenize(text)).to eq(["was", "wir", "tun", "wird", "in", "diesem", "land", "leben", "retten", "#obama"])
     end

     it 'handles numbers and words' do
       text = "Air Force Once ist 18.270-mal abgehoben."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           language: 'de'
       )
-      expect(pt.tokenize).to eq(["air", "force", "once", "ist", "18.270-mal", "abgehoben"])
+      expect(pt.tokenize(text)).to eq(["air", "force", "once", "ist", "18.270-mal", "abgehoben"])
     end

     it 'maintains the german gender-neutrality form 2' do
       text = "der/die Lehrer_in und seine/ihre Schüler_innen"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           language: 'de'
       )
-      expect(pt.tokenize).to eq(%w(der die lehrer_in und seine ihre schüler_innen))
+      expect(pt.tokenize(text)).to eq(%w(der die lehrer_in und seine ihre schüler_innen))
     end

     it 'handles contractions 1' do
       text = "gibt's"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           expand_contractions: true,
           language: 'de'
       )
-      expect(pt.tokenize).to eq(%w(gibt es))
+      expect(pt.tokenize(text)).to eq(%w(gibt es))
     end

     it 'handles contractions 2' do
       text = "gibt‘s schaut’s wenn's g›spür find´s"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           expand_contractions: true,
           language: 'de'
       )
-      expect(pt.tokenize).to eq(%w(gibt es schaut es wenn es gespür finde es))
+      expect(pt.tokenize(text)).to eq(%w(gibt es schaut es wenn es gespür finde es))
     end

     it 'removes English stopwords' do
       text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           filter_languages: [:en],
           remove_stop_words: true,
           language: 'de'
       )
-      expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."])
+      expect(pt.tokenize(text)).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."])
     end

     it 'removes English and German stopwords' do
       text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           filter_languages: [:en, :de],
           remove_stop_words: true,
           language: 'de'
       )
-      expect(pt.tokenize).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
+      expect(pt.tokenize(text)).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
     end

     it 'does not remove English stopwords' do
       text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           language: 'de'
       )
-      expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "this", "has", "some", "english", "."])
+      expect(pt.tokenize(text)).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "this", "has", "some", "english", "."])
     end

     # I don't know how to easily treat these forms, especially the most frequent form
@@ -223,31 +202,28 @@ describe PragmaticTokenizer do
       skip "NOT IMPLEMENTED"
       text = "Wir brauchen eine/n erfahrene/n Informatiker/in."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           language: 'de'
       )
-      expect(pt.tokenize).to eq(["wir", "brauchen", "eine/n", "erfahrene/n", "informatiker/in"])
+      expect(pt.tokenize(text)).to eq(["wir", "brauchen", "eine/n", "erfahrene/n", "informatiker/in"])
     end

     it 'handles apostrophes and quotes 4' do
       skip "NOT IMPLEMENTED"
       text = "Endlich regnet es ihm nicht mehr auf ́s Haupt!"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           punctuation: 'none',
           language: 'de'
       )
-      expect(pt.tokenize).to eq(%w(endlich regnet es ihm nicht mehr auf́s haupt))
+      expect(pt.tokenize(text)).to eq(%w(endlich regnet es ihm nicht mehr auf́s haupt))
     end

     it 'handles abrreviations for languages other than English' do
       text = "Adj. Smith how are ü. today."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           language: :de
       )
-      expect(pt.tokenize).to eq(["adj", ".", "smith", "how", "are", "ü.", "today", "."])
+      expect(pt.tokenize(text)).to eq(["adj", ".", "smith", "how", "are", "ü.", "today", "."])
     end
   end
 end
data/spec/languages/english_spec.rb
CHANGED
@@ -6,170 +6,170 @@ describe PragmaticTokenizer do
   context 'no options selected' do
     it 'tokenizes a string #001' do
       text = "Hello world."
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["hello", "world", "."])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["hello", "world", "."])
     end

     it 'tokenizes a string #002' do
       text = "Hello Dr. Death."
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["hello", "dr.", "death", "."])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["hello", "dr.", "death", "."])
     end

     it 'tokenizes a string #003' do
       text = "Hello ____________________ ."
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["hello", "____________________", "."])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["hello", "____________________", "."])
     end

     it 'tokenizes a string #004' do
       text = "It has a state-of-the-art design."
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["it", "has", "a", "state-of-the-art", "design", "."])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["it", "has", "a", "state-of-the-art", "design", "."])
     end

     it 'tokenizes a string #005' do
       text = "Jan. 2015 was 20% colder than now. But not in inter- and outer-space."
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["jan.", "2015", "was", "20%", "colder", "than", "now", ".", "but", "not", "in", "inter", "-", "and", "outer-space", "."])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["jan.", "2015", "was", "20%", "colder", "than", "now", ".", "but", "not", "in", "inter", "-", "and", "outer-space", "."])
     end

     it 'tokenizes a string #006' do
       text = 'Go to http://www.example.com.'
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["go", "to", "http://www.example.com", "."])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["go", "to", "http://www.example.com", "."])
     end

     it 'tokenizes a string #007' do
       text = 'One of the lawyers from ‚Making a Murderer’ admitted a mistake'
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["one", "of", "the", "lawyers", "from", "‚", "making", "a", "murderer", "’", "admitted", "a", "mistake"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["one", "of", "the", "lawyers", "from", "‚", "making", "a", "murderer", "’", "admitted", "a", "mistake"])
     end

     it 'tokenizes a string #008' do
       text = "One of the lawyers from 'Making a Murderer' admitted a mistake"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["one", "of", "the", "lawyers", "from", "'", "making", "a", "murderer", "'", "admitted", "a", "mistake"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["one", "of", "the", "lawyers", "from", "'", "making", "a", "murderer", "'", "admitted", "a", "mistake"])
     end

     it 'tokenizes a string #009' do
       text = "hello ;-) yes"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["hello", ";", "-", ")", "yes"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["hello", ";", "-", ")", "yes"])
     end

     it 'tokenizes a string #010' do
       text = "hello ;)"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["hello", ";", ")"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["hello", ";", ")"])
     end

     it 'tokenizes a string #011' do
       text = "area <0.8 cm2"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["area", "<0.8", "cm2"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["area", "<0.8", "cm2"])
     end

     it 'tokenizes a string #012' do
       text = "area <0.8 cm2"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["area", "<0.8", "cm2"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["area", "<0.8", "cm2"])
     end

     it 'tokenizes a string #013' do
       text = "the “Star-Trek“-Inventor"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["the", "“", "star-trek", "“", "-", "inventor"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["the", "“", "star-trek", "“", "-", "inventor"])
     end

     it 'tokenizes a string #014' do
       text = "#ab-cd"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["#ab-cd"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["#ab-cd"])
     end

     it 'handles numbers with symbols 2' do
       text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals", "!"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals", "!"])
     end

     it 'handles numbers with symbols 3' do
       text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
     end

     it 'splits at a comma' do
       text = "16.1. day one,17.2. day two"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["16.1", ".", "day", "one", ",", "17.2", ".", "day", "two"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["16.1", ".", "day", "one", ",", "17.2", ".", "day", "two"])
     end

     it 'identifies single quotes' do
       text = "Sean Penn Sat for Secret Interview With ‘El Chapo,’ Mexican Drug"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["sean", "penn", "sat", "for", "secret", "interview", "with", "‘", "el", "chapo", ",", "’", "mexican", "drug"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["sean", "penn", "sat", "for", "secret", "interview", "with", "‘", "el", "chapo", ",", "’", "mexican", "drug"])
     end

     it 'identifies prefixed symbols' do
       text = "look:the sky is blue"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["look", ":", "the", "sky", "is", "blue"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["look", ":", "the", "sky", "is", "blue"])
     end

     it 'identifies hashtags with numbers too' do
       text = "this is a sentence.#yay this too.#withnumbers123"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["this", "is", "a", "sentence", ".", "#yay", "this", "too", ".", "#withnumbers123"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["this", "is", "a", "sentence", ".", "#yay", "this", "too", ".", "#withnumbers123"])
     end

     it 'splits emojis' do
       text = "🤔🙄"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["🤔", "🙄"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["🤔", "🙄"])
     end

     it 'handles snowflakes 1' do
       text = "❄️❄️❄️"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["❄️", "❄️", "❄️"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["❄️", "❄️", "❄️"])
     end

     it 'handles snowflakes 2' do
       text = "\u2744\uFE0E\u2744\uFE0E\u2744\uFE0E"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["❄︎", "❄︎", "❄︎"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["❄︎", "❄︎", "❄︎"])
     end

     it 'handles snowflakes 3' do
       text = "\u2744\u2744\u2744"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["\u2744", "\u2744", "\u2744"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["\u2744", "\u2744", "\u2744"])
     end

     it 'separates tokens' do
       text = "football≠soccer"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["football", "≠", "soccer"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["football", "≠", "soccer"])
     end

     it 'deals with missing whitespaces' do
       text = "this is sentence one!this is sentence two.@someone"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["this", "is", "sentence", "one", "!", "this", "is", "sentence", "two", ".", "@someone"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["this", "is", "sentence", "one", "!", "this", "is", "sentence", "two", ".", "@someone"])
     end

     it 'handles weird apostrophes' do
       text = [116, 104, 101, 114, 101, 32, 769, 115, 32, 115, 111, 109, 101, 116, 104, 105, 110, 103].pack("U*")
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["there`s", "something"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["there`s", "something"])
     end

     it 'treats abbreviations always the same' do
       text = "U.S.A. U.S.A. U.S.A."
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(
           ["u.s.a.", "u.s.a.", "u.s.a."]
       )
     end
@@ -178,58 +178,53 @@ describe PragmaticTokenizer do
   context 'user-supplied abbreviations' do
     it 'tokenizes a regular string with an abbreviation' do
       text = "Mr. Smith, hello world."
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["mr.", "smith", ",", "hello", "world", "."])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["mr.", "smith", ",", "hello", "world", "."])
     end

     it 'fails to recognize an English abbreviation if the user supplies an abbreviations array without it' do
       text = "Mr. Smith, hello world."
       abbreviations = ['mrs']
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           abbreviations: abbreviations
       )
-      expect(pt.tokenize).to eq(["mr", ".", "smith", ",", "hello", "world", "."])
+      expect(pt.tokenize(text)).to eq(["mr", ".", "smith", ",", "hello", "world", "."])
     end

     it 'recognizes a user-supplied abbreviation' do
       text = "thisisnotanormalabbreviation. hello world."
       abbreviations = ['thisisnotanormalabbreviation']
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           abbreviations: abbreviations
       )
-      expect(pt.tokenize).to eq(["thisisnotanormalabbreviation.", "hello", "world", "."])
+      expect(pt.tokenize(text)).to eq(["thisisnotanormalabbreviation.", "hello", "world", "."])
     end

     it 'handles an empty user-supplied abbreviation array' do
       text = "thisisnotanormalabbreviation. hello world."
       abbreviations = []
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           abbreviations: abbreviations
       )
-      expect(pt.tokenize).to eq(["thisisnotanormalabbreviation", ".", "hello", "world", "."])
+      expect(pt.tokenize(text)).to eq(["thisisnotanormalabbreviation", ".", "hello", "world", "."])
     end

     it 'handles abrreviations across multiple languages' do
       text = "Mr. Smith how are ü. today."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           filter_languages: [:en, :de]
       )
-      expect(pt.tokenize).to eq(["mr.", "smith", "how", "are", "ü.", "today", "."])
+      expect(pt.tokenize(text)).to eq(["mr.", "smith", "how", "are", "ü.", "today", "."])
     end

     it 'handles abrreviations across multiple languages and user-supplied abbreviations' do
       text = "Adj. Smith how are ü. today. thisisnotanormalabbreviation. is it?"
       abbreviations = ['thisisnotanormalabbreviation']
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           filter_languages: [:en, :de],
           abbreviations: abbreviations
       )
-      expect(pt.tokenize).to eq(["adj.", "smith", "how", "are", "ü.", "today", ".", "thisisnotanormalabbreviation.", "is", "it", "?"])
+      expect(pt.tokenize(text)).to eq(["adj.", "smith", "how", "are", "ü.", "today", ".", "thisisnotanormalabbreviation.", "is", "it", "?"])
     end
   end

@@ -237,90 +232,82 @@ describe PragmaticTokenizer do
     it 'does not expand the contractions' do
       # https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
       text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(['"', 'i', 'said', ',', "'", "what're", 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', "can't", 'afford', 'to', 'do', 'that', '.', '"'])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(['"', 'i', 'said', ',', "'", "what're", 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', "can't", 'afford', 'to', 'do', 'that', '.', '"'])
     end

     it 'expands user-supplied contractions' do
       text = "Hello supa'soo guy."
       contractions = { "supa'soo" => "super smooth" }
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           contractions: contractions,
           expand_contractions: true
       )
-      expect(pt.tokenize).to eq(["hello", "super", "smooth", "guy", "."])
+      expect(pt.tokenize(text)).to eq(["hello", "super", "smooth", "guy", "."])
     end

     it 'does not expands user-supplied contractions' do
       text = "Hello supa'soo guy."
       contractions = { "supa'soo" => "super smooth" }
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           contractions: contractions,
           expand_contractions: false
       )
-      expect(pt.tokenize).to eq(["hello", "supa'soo", "guy", "."])
+      expect(pt.tokenize(text)).to eq(["hello", "supa'soo", "guy", "."])
     end

     it 'expands user-supplied contractions and language contractions' do
       text = "Hello supa'soo guy. auf's wasn't it?"
       contractions = { "supa'soo" => "super smooth" }
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           contractions: contractions,
           expand_contractions: true,
           filter_languages: [:en, :de]
       )
-      expect(pt.tokenize).to eq(["hello", "super", "smooth", "guy", ".", "auf", "das", "was", "not", "it", "?"])
+      expect(pt.tokenize(text)).to eq(["hello", "super", "smooth", "guy", ".", "auf", "das", "was", "not", "it", "?"])
     end

     it 'expands language contractions' do
       text = "Hello supa'soo guy. auf's wasn't it?"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           expand_contractions: true,
           filter_languages: [:en, :de]
       )
-      expect(pt.tokenize).to eq(["hello", "supa'soo", "guy", ".", "auf", "das", "was", "not", "it", "?"])
+      expect(pt.tokenize(text)).to eq(["hello", "supa'soo", "guy", ".", "auf", "das", "was", "not", "it", "?"])
     end

     it 'tokenizes a string #001' do
       # https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
       text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           expand_contractions: true
       )
-      expect(pt.tokenize).to eq(['"', 'i', 'said', ',', "'", 'what', 'are', 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', 'cannot', 'afford', 'to', 'do', 'that', '.', '"'])
+      expect(pt.tokenize(text)).to eq(['"', 'i', 'said', ',', "'", 'what', 'are', 'you', '?', 'crazy', '?', "'", '"', 'said', 'sandowsky', '.', '"', 'i', 'cannot', 'afford', 'to', 'do', 'that', '.', '"'])
     end

     it 'tokenizes a string #002' do
       # http://nlp.stanford.edu/software/tokenizer.shtml
       text = "\"Oh, no,\" she's saying, \"our $400 blender can't handle something this hard!\""
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           expand_contractions: true
       )
-      expect(pt.tokenize).to eq(['"', 'oh', ',', 'no', ',', '"', 'she', 'is', 'saying', ',', '"', 'our', '$400', 'blender', 'cannot', 'handle', 'something', 'this', 'hard', '!', '"'])
+      expect(pt.tokenize(text)).to eq(['"', 'oh', ',', 'no', ',', '"', 'she', 'is', 'saying', ',', '"', 'our', '$400', 'blender', 'cannot', 'handle', 'something', 'this', 'hard', '!', '"'])
     end

     it 'tokenizes a string #003' do
       text = "Look for his/her account."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           expand_contractions: true
       )
-      expect(pt.tokenize).to eq(["look", "for", "his", "her", "account", "."])
+      expect(pt.tokenize(text)).to eq(["look", "for", "his", "her", "account", "."])
     end

     it 'tokenizes a string #004' do
       text = "I like apples and/or oranges."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           expand_contractions: true
       )
-      expect(pt.tokenize).to eq(["i", "like", "apples", "and", "or", "oranges", "."])
+      expect(pt.tokenize(text)).to eq(["i", "like", "apples", "and", "or", "oranges", "."])
     end
   end

@@ -328,43 +315,39 @@ describe PragmaticTokenizer do
     it 'removes emoji' do
       text = "Return the emoji 👿😍😱🐔🌚. 🌚"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           remove_emoji: true
       )
-      expect(pt.tokenize).to eq(["return", "the", "emoji", "."])
+      expect(pt.tokenize(text)).to eq(["return", "the", "emoji", "."])
     end

     it 'does not remove emoji' do
       text = "Return the emoji 👿😍😱🐔🌚. 🌚"
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["return", "the", "emoji", "👿", "😍", "😱", "🐔", "🌚", ".", "🌚"])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["return", "the", "emoji", "👿", "😍", "😱", "🐔", "🌚", ".", "🌚"])
     end

     it 'removes snowflakes 1' do
       text = "hello❄️❄️❄️"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           remove_emoji: true
       )
-      expect(pt.tokenize).to eq(["hello"])
+      expect(pt.tokenize(text)).to eq(["hello"])
     end

     it 'removes snowflakes 2' do
       text = "hello\u2744\uFE0E\u2744\uFE0E\u2744\uFE0E"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           remove_emoji: true
       )
-      expect(pt.tokenize).to eq(["hello"])
+      expect(pt.tokenize(text)).to eq(["hello"])
     end

     it 'removes snowflakes 3' do
       text = "hello\u2744\u2744\u2744"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           remove_emoji: true
       )
-      expect(pt.tokenize).to eq(["hello"])
+      expect(pt.tokenize(text)).to eq(["hello"])
     end
   end

@@ -372,28 +355,25 @@ describe PragmaticTokenizer do
     it 'tokenizes a string #001' do
       text = "This is a #hashtag yay!"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           hashtags: :remove
       )
-      expect(pt.tokenize).to eq(["this", "is", "a", "yay", "!"])
+      expect(pt.tokenize(text)).to eq(["this", "is", "a", "yay", "!"])
     end

     it 'tokenizes a string #002' do
       text = "This is a #hashtag yay!"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           hashtags: :keep_and_clean
       )
-      expect(pt.tokenize).to eq(["this", "is", "a", "hashtag", "yay", "!"])
+      expect(pt.tokenize(text)).to eq(["this", "is", "a", "hashtag", "yay", "!"])
     end

     it 'tokenizes a string #003' do
       text = "This is a #hashtag yay!"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           hashtags: :keep_original
       )
-      expect(pt.tokenize).to eq(["this", "is", "a", "#hashtag", "yay", "!"])
+      expect(pt.tokenize(text)).to eq(["this", "is", "a", "#hashtag", "yay", "!"])
     end
   end

@@ -401,28 +381,25 @@ describe PragmaticTokenizer do
     it 'tokenizes a string #001' do
       text = "This is a @mention @mention2 yay!"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           mentions: :remove
       )
-      expect(pt.tokenize).to eq(["this", "is", "a", "yay", "!"])
+      expect(pt.tokenize(text)).to eq(["this", "is", "a", "yay", "!"])
     end

     it 'tokenizes a string #002' do
       text = "This is a @mention @mention2 yay!"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           mentions: :keep_and_clean
       )
-      expect(pt.tokenize).to eq(["this", "is", "a", "mention", "mention2", "yay", "!"])
+      expect(pt.tokenize(text)).to eq(["this", "is", "a", "mention", "mention2", "yay", "!"])
     end

     it 'tokenizes a string #003' do
       text = "This is a @mention @mention2 yay!"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           mentions: :keep_original
       )
-      expect(pt.tokenize).to eq(["this", "is", "a", "@mention", "@mention2", "yay", "!"])
+      expect(pt.tokenize(text)).to eq(["this", "is", "a", "@mention", "@mention2", "yay", "!"])
     end
   end

@@ -430,25 +407,23 @@ describe PragmaticTokenizer do
     it 'tokenizes a string #001' do
       text = "Here are some emails jon@hotmail.com ben123@gmail.com."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           remove_emails: :true
       )
-      expect(pt.tokenize).to eq(["here", "are", "some", "emails", "."])
+      expect(pt.tokenize(text)).to eq(["here", "are", "some", "emails", "."])
     end

     it 'tokenizes a string #002' do
       text = "Here are some emails jon@hotmail.com ben123@gmail.com."
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["here", "are", "some", "emails", "jon@hotmail.com", "ben123@gmail.com", "."])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["here", "are", "some", "emails", "jon@hotmail.com", "ben123@gmail.com", "."])
     end

     it 'knows what is not an email address' do
       text = "the great cook.@someone something else@whoever"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           remove_emails: true
       )
-      expect(pt.tokenize).to eq(["the", "great", "cook", ".", "@someone", "something", "else@whoever"])
+      expect(pt.tokenize(text)).to eq(["the", "great", "cook", ".", "@someone", "something", "else@whoever"])
     end
   end

@@ -456,16 +431,15 @@ describe PragmaticTokenizer do
     it 'tokenizes a string #001' do
       text = "Here are some domains and urls google.com https://www.google.com www.google.com."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           remove_urls: :true
       )
-      expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "www.google.com", "."])
+      expect(pt.tokenize(text)).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "www.google.com", "."])
     end

     it 'tokenizes a string #002' do
       text = "Here are some domains and urls google.com https://www.google.com www.google.com."
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "https://www.google.com", "www.google.com", "."])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "https://www.google.com", "www.google.com", "."])
     end
   end

@@ -473,44 +447,40 @@ describe PragmaticTokenizer do
     it 'tokenizes a string #001' do
       text = "Here are some domains and urls google.com https://www.google.com www.google.com."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           remove_domains: :true
       )
-      expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "https://www.google.com", "."])
+      expect(pt.tokenize(text)).to eq(["here", "are", "some", "domains", "and", "urls", "https://www.google.com", "."])
     end

     it 'tokenizes a string #002' do
       text = "Here are some domains and urls google.com https://www.google.com www.google.com."
-      pt = PragmaticTokenizer::Tokenizer.new
-      expect(pt.tokenize).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "https://www.google.com", "www.google.com", "."])
+      pt = PragmaticTokenizer::Tokenizer.new
+      expect(pt.tokenize(text)).to eq(["here", "are", "some", "domains", "and", "urls", "google.com", "https://www.google.com", "www.google.com", "."])
     end

     it 'knows what is not a domain 1' do
       skip "NOT IMPLEMENTED"
       text = "this is a sentence.and no domain."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           remove_domains: true
       )
-      expect(pt.tokenize).to eq(["this", "is", "a", "sentence", ".", "and", "no", "domain", "."])
+      expect(pt.tokenize(text)).to eq(["this", "is", "a", "sentence", ".", "and", "no", "domain", "."])
     end

     it 'knows what is not a domain 2' do
       text = "former president g.w.bush was..."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           remove_domains: true
       )
-      expect(pt.tokenize).to eq(["former", "president", "g.w.bush", "was", "..."])
+      expect(pt.tokenize(text)).to eq(["former", "president", "g.w.bush", "was", "..."])
     end

     it 'knows what is not a domain 3' do
       text = "2.something-times"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           remove_domains: true
       )
-      expect(pt.tokenize).to eq(["2.something-times"])
+      expect(pt.tokenize(text)).to eq(["2.something-times"])
     end
   end

@@ -518,19 +488,17 @@ describe PragmaticTokenizer do
     it 'tokenizes a string #001' do
       text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           long_word_split: 10
       )
-      expect(pt.tokenize).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14-year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990-years", "needs", "to", "be", "revised", "."])
+      expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14-year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990-years", "needs", "to", "be", "revised", "."])
     end

     it 'tokenizes a string #002' do
       text = "Some main-categories of the mathematics-test have sub-examples that most 14-year olds can't answer, therefor the implementation-instruction made in the 1990-years needs to be revised."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           long_word_split: 4
       )
-      expect(pt.tokenize).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
+      expect(pt.tokenize(text)).to eq(["some", "main", "categories", "of", "the", "mathematics", "test", "have", "sub", "examples", "that", "most", "14", "year", "olds", "can't", "answer", ",", "therefor", "the", "implementation", "instruction", "made", "in", "the", "1990", "years", "needs", "to", "be", "revised", "."])
     end
   end

@@ -538,154 +506,137 @@ describe PragmaticTokenizer do
     it 'tokenizes a string #001' do
       text = "Hello ---------------."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(["hello", "."])
+      expect(pt.tokenize(text)).to eq(["hello", "."])
     end

     it 'tokenizes a string #002' do
       text = "Hello ____________________ ."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(["hello", "."])
+      expect(pt.tokenize(text)).to eq(["hello", "."])
     end

     it 'tokenizes a string #003' do
       text = "© ABC Company 1994"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(%w(abc company 1994))
+      expect(pt.tokenize(text)).to eq(%w(abc company 1994))
     end

     it 'tokenizes a string #004' do
       text = "This sentence has a long string of dots ......................."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(%w(this sentence has a long string of dots))
+      expect(pt.tokenize(text)).to eq(%w(this sentence has a long string of dots))
     end

     it 'tokenizes a string #005' do
       text = "cnn.com mentions this *funny* #hashtag used by @obama http://cnn.com/something"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(["cnn.com", "mentions", "this", "funny", "#hashtag", "used", "by", "@obama", "http://cnn.com/something"])
+      expect(pt.tokenize(text)).to eq(["cnn.com", "mentions", "this", "funny", "#hashtag", "used", "by", "@obama", "http://cnn.com/something"])
     end

     it 'does not remove a valid hashtag' do
       text = "This #sentence has a long string of dots ......................."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(["this", "#sentence", "has", "a", "long", "string", "of", "dots"])
+      expect(pt.tokenize(text)).to eq(["this", "#sentence", "has", "a", "long", "string", "of", "dots"])
     end

     it 'does not remove a valid mention' do
       text = "This @sentence has a long string of dots ......................."
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(["this", "@sentence", "has", "a", "long", "string", "of", "dots"])
+      expect(pt.tokenize(text)).to eq(["this", "@sentence", "has", "a", "long", "string", "of", "dots"])
     end

     it 'cleans words with symbols 1' do
       text = "something.com:article title !!wow look!!1"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(["something.com", "article", "title", "wow", "look"])
+      expect(pt.tokenize(text)).to eq(["something.com", "article", "title", "wow", "look"])
     end

     it 'cleans words with symbols 2' do
       text = "something.com:article title !!wow look!!1!1!11!"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(["something.com", "article", "title", "wow", "look"])
+      expect(pt.tokenize(text)).to eq(["something.com", "article", "title", "wow", "look"])
     end

     it 'identifies prefixed symbols' do
       text = "look:the sky is blue"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(%w(look the sky is blue))
+      expect(pt.tokenize(text)).to eq(%w(look the sky is blue))
     end

     it 'keeps numbers at the end of mentions and hashtags' do
       text = "#le1101 #artistQ21 @someone12 @someoneelse1 and @somebody1980"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(["#le1101", "#artistq21", "@someone12", "@someoneelse1", "and", "@somebody1980"])
+      expect(pt.tokenize(text)).to eq(["#le1101", "#artistq21", "@someone12", "@someoneelse1", "and", "@somebody1980"])
     end

     it 'cleans a prefixed weird hyphen' do
       text = [104, 105, 103, 104, 32, 173, 116, 101, 109, 112, 101, 114, 97, 116, 117, 114, 101, 32, 97, 110, 100, 32, 173, 119, 105, 110, 100].pack("U*")
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(%w(high temperature and wind))
+      expect(pt.tokenize(text)).to eq(%w(high temperature and wind))
     end

     it 'cleans (r) and (c) and (tm)' do
       text = "the oscar® night ©companyname is a trademark™"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(%w(the oscar night companyname is a trademark))
+      expect(pt.tokenize(text)).to eq(%w(the oscar night companyname is a trademark))
     end

     it 'cleans letters in boxes 1' do
       text = "making🇦🇹postcards"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(%w(making postcards))
+      expect(pt.tokenize(text)).to eq(%w(making postcards))
     end

     it 'removes colons' do
       text = "At 19:30 o'clock: Mad Max: Fury Road"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(["at", "19:30", "o'clock", "mad", "max", "fury", "road"])
+      expect(pt.tokenize(text)).to eq(["at", "19:30", "o'clock", "mad", "max", "fury", "road"])
     end

     it 'removes a hyphen prefix 3' do
       text = "women's clothes and –shoes needed"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(["women's", "clothes", "and", "shoes", "needed"])
+      expect(pt.tokenize(text)).to eq(["women's", "clothes", "and", "shoes", "needed"])
     end

     it 'does not remove tokens with ampersands' do
       text = "you&me"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           clean: true
       )
-      expect(pt.tokenize).to eq(["you", "&", "me"])
+      expect(pt.tokenize(text)).to eq(["you", "&", "me"])
     end
   end

@@ -694,38 +645,34 @@ describe PragmaticTokenizer do
       # http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
       text = "I.B.M. cat's can't"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           classic_filter: true
       )
-      expect(pt.tokenize).to eq(["ibm", "cat", "can't"])
+      expect(pt.tokenize(text)).to eq(["ibm", "cat", "can't"])
     end

     it 'tokenizes a string #002' do
       # http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
       text = "St.Veit, which usually would be written St. Veit was not visited by B.Obama reported CNN.com"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           classic_filter: true
       )
-      expect(pt.tokenize).to eq(["st.veit", ",", "which", "usually", "would", "be", "written", "st", "veit", "was", "not", "visited", "by", "b.obama", "reported", "cnn.com"])
+      expect(pt.tokenize(text)).to eq(["st.veit", ",", "which", "usually", "would", "be", "written", "st", "veit", "was", "not", "visited", "by", "b.obama", "reported", "cnn.com"])
     end

     it 'optimizes the classic filter' do
       text = "therés something"
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           classic_filter: true
       )
-      expect(pt.tokenize).to eq(%w(there something))
+      expect(pt.tokenize(text)).to eq(%w(there something))
     end

     it 'optimizes the classic filter' do
       text = [116, 104, 101, 114, 101, 32, 769, 115, 32, 115, 111, 109, 101, 116, 104, 105, 110, 103].pack("U*")
       pt = PragmaticTokenizer::Tokenizer.new(
-          text,
           classic_filter: true
       )
-      expect(pt.tokenize).to eq(%w(there something))
+      expect(pt.tokenize(text)).to eq(%w(there something))
     end
   end

@@ -733,10 +680,9 @@ describe PragmaticTokenizer do
|
|
733
680
|
it 'tokenizes a string #001' do
|
734
681
|
text = "Hello Ms. Piggy, this is John. We are selling a new fridge for $5,000. That is a 20% discount over the Nev. retailers. It is a 'MUST BUY', so don't hesistate."
|
735
682
|
pt = PragmaticTokenizer::Tokenizer.new(
|
736
|
-
text,
|
737
683
|
language: 'en'
|
738
684
|
)
|
739
|
-
expect(pt.tokenize).to eq(["hello", "ms.", "piggy", ",", "this", "is", "john", ".", "we", "are", "selling", "a", "new", "fridge", "for", "$5,000", ".", "that", "is", "a", "20%", "discount", "over", "the", "nev.", "retailers", ".", "it", "is", "a", "'", "must", "buy", "'", ",", "so", "don't", "hesistate", "."])
|
685
|
+
expect(pt.tokenize(text)).to eq(["hello", "ms.", "piggy", ",", "this", "is", "john", ".", "we", "are", "selling", "a", "new", "fridge", "for", "$5,000", ".", "that", "is", "a", "20%", "discount", "over", "the", "nev.", "retailers", ".", "it", "is", "a", "'", "must", "buy", "'", ",", "so", "don't", "hesistate", "."])
|
740
686
|
end
|
741
687
|
|
742
688
|
it 'tokenizes a string #002' do
|
@@ -751,10 +697,9 @@ describe PragmaticTokenizer do
|
|
751
697
|
Says Ms. Raines, \'[The judgement] confirms our concern that the absence of
|
752
698
|
patent lawyers on the court could prove troublesome.\'"
|
753
699
|
pt = PragmaticTokenizer::Tokenizer.new(
|
754
|
-
text,
|
755
700
|
language: 'en'
|
756
701
|
)
|
757
|
-
expect(pt.tokenize).to eq(['lisa', 'raines', ',', 'a', 'lawyer', 'and', 'director', 'of', 'government', 'relations', 'for', 'the', 'industrial', 'biotechnical', 'association', ',', 'contends', 'that', 'a', 'judge', 'well-versed', 'in', 'patent', 'law', 'and', 'the', 'concerns', 'of', 'research-based', 'industries', 'would', 'have', 'ruled', 'otherwise', '.', 'and', 'judge', 'newman', ',', 'a', 'former', 'patent', 'lawyer', ',', 'wrote', 'in', 'her', 'dissent', 'when', 'the', 'court', 'denied', 'a', 'motion', 'for', 'a', 'rehearing', 'of', 'the', 'case', 'by', 'the', 'full', 'court', ',', "\'", 'the', "panel's", 'judicial', 'legislation', 'has', 'affected', 'an', 'important', 'high-technological', 'industry', ',', 'without', 'regard', 'to', 'the', 'consequences', 'for', 'research', 'and', 'innovation', 'or', 'the', 'public', 'interest', '.', '\'', 'says', 'ms.', 'raines', ',', '\'', '[', 'the', 'judgement', ']', 'confirms', 'our', 'concern', 'that', 'the', 'absence', 'of', 'patent', 'lawyers', 'on', 'the', 'court', 'could', 'prove', 'troublesome', '.', "\'"])
|
702
|
+
expect(pt.tokenize(text)).to eq(['lisa', 'raines', ',', 'a', 'lawyer', 'and', 'director', 'of', 'government', 'relations', 'for', 'the', 'industrial', 'biotechnical', 'association', ',', 'contends', 'that', 'a', 'judge', 'well-versed', 'in', 'patent', 'law', 'and', 'the', 'concerns', 'of', 'research-based', 'industries', 'would', 'have', 'ruled', 'otherwise', '.', 'and', 'judge', 'newman', ',', 'a', 'former', 'patent', 'lawyer', ',', 'wrote', 'in', 'her', 'dissent', 'when', 'the', 'court', 'denied', 'a', 'motion', 'for', 'a', 'rehearing', 'of', 'the', 'case', 'by', 'the', 'full', 'court', ',', "\'", 'the', "panel's", 'judicial', 'legislation', 'has', 'affected', 'an', 'important', 'high-technological', 'industry', ',', 'without', 'regard', 'to', 'the', 'consequences', 'for', 'research', 'and', 'innovation', 'or', 'the', 'public', 'interest', '.', '\'', 'says', 'ms.', 'raines', ',', '\'', '[', 'the', 'judgement', ']', 'confirms', 'our', 'concern', 'that', 'the', 'absence', 'of', 'patent', 'lawyers', 'on', 'the', 'court', 'could', 'prove', 'troublesome', '.', "\'"])
|
758
703
|
end
|
759
704
|
end
|
760
705
|
|
@@ -762,64 +707,57 @@ describe PragmaticTokenizer do
|
|
762
707
|
it 'tokenizes a string #001' do
|
763
708
|
text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
|
764
709
|
pt = PragmaticTokenizer::Tokenizer.new(
|
765
|
-
text,
|
766
710
|
numbers: :all
|
767
711
|
)
|
768
|
-
expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
|
712
|
+
expect(pt.tokenize(text)).to eq(["hello", ",", "that", "will", "be", "$5", "dollars", ".", "you", "can", "pay", "at", "5:00", ",", "after", "it", "is", "500", "."])
|
769
713
|
end
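A minimal sketch of the numbers: filter with the new call style, reusing the inputs and expectations of specs #003 and #005 that follow (per those expectations, :semi drops purely numeric tokens while :none drops every token containing a number):

    text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"

    PragmaticTokenizer::Tokenizer.new(numbers: :semi).tokenize(text)
    # => ["2pac", "u2", "50cent", "blink-182", "$500", "zero7", "m83", "b-52s"]

    PragmaticTokenizer::Tokenizer.new(numbers: :none).tokenize(text)
    # => []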
|
770
714
|
|
771
715
|
it 'tokenizes a string #002' do
|
772
716
|
text = "Hello, that will be $5 dollars. You can pay at 5:00, after it is 500."
|
773
717
|
pt = PragmaticTokenizer::Tokenizer.new(
|
774
|
-
text,
|
775
718
|
numbers: :none
|
776
719
|
)
|
777
|
-
expect(pt.tokenize).to eq(["hello", ",", "that", "will", "be", "dollars", ".", "you", "can", "pay", "at", ",", "after", "it", "is", "."])
|
720
|
+
expect(pt.tokenize(text)).to eq(["hello", ",", "that", "will", "be", "dollars", ".", "you", "can", "pay", "at", ",", "after", "it", "is", "."])
|
778
721
|
end
|
779
722
|
|
780
723
|
it 'tokenizes a string #003' do
|
781
724
|
text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"
|
782
725
|
pt = PragmaticTokenizer::Tokenizer.new(
|
783
|
-
text,
|
784
726
|
numbers: :semi
|
785
727
|
)
|
786
|
-
expect(pt.tokenize).to eq(["2pac", "u2", "50cent", "blink-182", "$500", "zero7", "m83", "b-52s"])
|
728
|
+
expect(pt.tokenize(text)).to eq(["2pac", "u2", "50cent", "blink-182", "$500", "zero7", "m83", "b-52s"])
|
787
729
|
end
|
788
730
|
|
789
731
|
it 'tokenizes a string #004' do
|
790
732
|
text = "2pac U2 50cent blink-182 zero7 M83 B-52s 500 Hello"
|
791
733
|
pt = PragmaticTokenizer::Tokenizer.new(
|
792
|
-
text,
|
793
734
|
numbers: :only
|
794
735
|
)
|
795
|
-
expect(pt.tokenize).to eq(["2pac", "u2", "50cent", "blink-182", "zero7", "m83", "b-52s", "500"])
|
736
|
+
expect(pt.tokenize(text)).to eq(["2pac", "u2", "50cent", "blink-182", "zero7", "m83", "b-52s", "500"])
|
796
737
|
end
|
797
738
|
|
798
739
|
it 'tokenizes a string #005' do
|
799
740
|
text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500"
|
800
741
|
pt = PragmaticTokenizer::Tokenizer.new(
|
801
|
-
text,
|
802
742
|
numbers: :none
|
803
743
|
)
|
804
|
-
expect(pt.tokenize).to eq([])
|
744
|
+
expect(pt.tokenize(text)).to eq([])
|
805
745
|
end
|
806
746
|
|
807
747
|
it 'tokenizes a string #005' do
|
808
748
|
text = "2pac U2 50cent blink-182 $500 zero7 M83 B-52s 500 number iv VI"
|
809
749
|
pt = PragmaticTokenizer::Tokenizer.new(
|
810
|
-
text,
|
811
750
|
numbers: :none
|
812
751
|
)
|
813
|
-
expect(pt.tokenize).to eq(["number"])
|
752
|
+
expect(pt.tokenize(text)).to eq(["number"])
|
814
753
|
end
|
815
754
|
|
816
755
|
it 'tokenizes a string #006' do
|
817
756
|
text = "Remove III Roman Numerals and IX. with a period."
|
818
757
|
pt = PragmaticTokenizer::Tokenizer.new(
|
819
|
-
text,
|
820
758
|
numbers: :none
|
821
759
|
)
|
822
|
-
expect(pt.tokenize).to eq(["remove", "roman", "numerals", "and", ".", "with", "a", "period", "."])
|
760
|
+
expect(pt.tokenize(text)).to eq(["remove", "roman", "numerals", "and", ".", "with", "a", "period", "."])
|
823
761
|
end
|
824
762
|
end
|
825
763
|
|
@@ -827,10 +765,9 @@ describe PragmaticTokenizer do
|
|
827
765
|
it 'tokenizes a string #001' do
|
828
766
|
text = "Let's test the minimum length of fiver."
|
829
767
|
pt = PragmaticTokenizer::Tokenizer.new(
|
830
|
-
text,
|
831
768
|
minimum_length: 5
|
832
769
|
)
|
833
|
-
expect(pt.tokenize).to eq(["let's", "minimum", "length", "fiver"])
|
770
|
+
expect(pt.tokenize(text)).to eq(["let's", "minimum", "length", "fiver"])
|
834
771
|
end
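A minimal sketch of minimum_length with the 2.2.0 API, mirroring the spec above: tokens shorter than the given character count are dropped.

    pt = PragmaticTokenizer::Tokenizer.new(minimum_length: 5)
    pt.tokenize("Let's test the minimum length of fiver.")
    # => ["let's", "minimum", "length", "fiver"]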
|
835
772
|
end
|
836
773
|
|
@@ -838,291 +775,259 @@ describe PragmaticTokenizer do
|
|
838
775
|
it 'tokenizes a string #001' do
|
839
776
|
text = "kath. / evang"
|
840
777
|
pt = PragmaticTokenizer::Tokenizer.new(
|
841
|
-
text,
|
842
778
|
punctuation: 'none'
|
843
779
|
)
|
844
|
-
expect(pt.tokenize).to eq(%w(kath evang))
|
780
|
+
expect(pt.tokenize(text)).to eq(%w(kath evang))
|
845
781
|
end
|
846
782
|
|
847
783
|
it 'tokenizes a string #002' do
|
848
784
|
text = "derStandard.at › Sport"
|
849
785
|
pt = PragmaticTokenizer::Tokenizer.new(
|
850
|
-
text,
|
851
786
|
punctuation: 'none'
|
852
787
|
)
|
853
|
-
expect(pt.tokenize).to eq(["derstandard.at", "sport"])
|
788
|
+
expect(pt.tokenize(text)).to eq(["derstandard.at", "sport"])
|
854
789
|
end
|
855
790
|
|
856
791
|
it 'tokenizes a string #003' do
|
857
792
|
text = "hello ^^"
|
858
793
|
pt = PragmaticTokenizer::Tokenizer.new(
|
859
|
-
text,
|
860
794
|
punctuation: 'none'
|
861
795
|
)
|
862
|
-
expect(pt.tokenize).to eq(["hello"])
|
796
|
+
expect(pt.tokenize(text)).to eq(["hello"])
|
863
797
|
end
|
864
798
|
|
865
799
|
it 'tokenizes a string #004' do
|
866
800
|
text = "This hyphen – is not...or is it? ... It's a - dash... And a horizontal ellipsis…"
|
867
801
|
pt = PragmaticTokenizer::Tokenizer.new(
|
868
|
-
text,
|
869
802
|
punctuation: 'none'
|
870
803
|
)
|
871
|
-
expect(pt.tokenize).to eq(["this", "hyphen", "is", "not", "or", "is", "it", "it's", "a", "dash", "and", "a", "horizontal", "ellipsis"])
|
804
|
+
expect(pt.tokenize(text)).to eq(["this", "hyphen", "is", "not", "or", "is", "it", "it's", "a", "dash", "and", "a", "horizontal", "ellipsis"])
|
872
805
|
end
|
873
806
|
|
874
807
|
it 'tokenizes a string #005' do
|
875
808
|
text = "A sentence. One with two dots.. And with three... Or horizontal ellipsis… which are three dots too."
|
876
809
|
pt = PragmaticTokenizer::Tokenizer.new(
|
877
|
-
text,
|
878
810
|
punctuation: 'none'
|
879
811
|
)
|
880
|
-
expect(pt.tokenize).to eq(%w(a sentence one with two dots and with three or horizontal ellipsis which are three dots too))
|
812
|
+
expect(pt.tokenize(text)).to eq(%w(a sentence one with two dots and with three or horizontal ellipsis which are three dots too))
|
881
813
|
end
|
882
814
|
|
883
815
|
it 'tokenizes a string #006' do
|
884
816
|
text = "+++ BREAKING +++ something happened; is it interesting?"
|
885
817
|
pt = PragmaticTokenizer::Tokenizer.new(
|
886
|
-
text,
|
887
818
|
punctuation: 'none'
|
888
819
|
)
|
889
|
-
expect(pt.tokenize).to eq(%w(breaking something happened is it interesting))
|
820
|
+
expect(pt.tokenize(text)).to eq(%w(breaking something happened is it interesting))
|
890
821
|
end
|
891
822
|
|
892
823
|
it 'tokenizes a string #007' do
|
893
824
|
text = "Some *interesting stuff* is __happening here__"
|
894
825
|
pt = PragmaticTokenizer::Tokenizer.new(
|
895
|
-
text,
|
896
826
|
punctuation: 'none'
|
897
827
|
)
|
898
|
-
expect(pt.tokenize).to eq(["some", "*interesting", "stuff*", "is", "__happening", "here__"])
|
828
|
+
expect(pt.tokenize(text)).to eq(["some", "*interesting", "stuff*", "is", "__happening", "here__"])
|
899
829
|
end
|
900
830
|
|
901
831
|
it 'tokenizes a string #008' do
|
902
832
|
text = "Hello; what is your: name @username **delete**"
|
903
833
|
pt = PragmaticTokenizer::Tokenizer.new(
|
904
|
-
text,
|
905
834
|
punctuation: 'none'
|
906
835
|
)
|
907
|
-
expect(pt.tokenize).to eq(["hello", "what", "is", "your", "name", "@username", "**delete**"])
|
836
|
+
expect(pt.tokenize(text)).to eq(["hello", "what", "is", "your", "name", "@username", "**delete**"])
|
908
837
|
end
|
909
838
|
|
910
839
|
it 'tokenizes a string #009' do
|
911
840
|
text = "hello ;-) yes"
|
912
841
|
pt = PragmaticTokenizer::Tokenizer.new(
|
913
|
-
text,
|
914
842
|
punctuation: :none
|
915
843
|
)
|
916
|
-
expect(pt.tokenize).to eq(%w(hello yes))
|
844
|
+
expect(pt.tokenize(text)).to eq(%w(hello yes))
|
917
845
|
end
|
918
846
|
|
919
847
|
it 'tokenizes a string #010' do
|
920
848
|
text = "hello ;)"
|
921
849
|
pt = PragmaticTokenizer::Tokenizer.new(
|
922
|
-
text,
|
923
850
|
punctuation: 'none'
|
924
851
|
)
|
925
|
-
expect(pt.tokenize).to eq(["hello"])
|
852
|
+
expect(pt.tokenize(text)).to eq(["hello"])
|
926
853
|
end
|
927
854
|
|
928
855
|
it 'tokenizes a string #011' do
|
929
856
|
text = "Hello ____________________ ."
|
930
857
|
pt = PragmaticTokenizer::Tokenizer.new(
|
931
|
-
text,
|
932
858
|
punctuation: :none
|
933
859
|
)
|
934
|
-
expect(pt.tokenize).to eq(["hello"])
|
860
|
+
expect(pt.tokenize(text)).to eq(["hello"])
|
935
861
|
end
|
936
862
|
|
937
863
|
it 'handles non-domain words with a dot 1' do
|
938
864
|
text = "They were being helped.This is solidarity."
|
939
865
|
pt = PragmaticTokenizer::Tokenizer.new(
|
940
|
-
text,
|
941
866
|
punctuation: 'none'
|
942
867
|
)
|
943
|
-
expect(pt.tokenize).to eq(%w(they were being helped this is solidarity))
|
868
|
+
expect(pt.tokenize(text)).to eq(%w(they were being helped this is solidarity))
|
944
869
|
end
|
945
870
|
|
946
871
|
it 'handles non-domain words with a dot 2' do
|
947
872
|
text = "picture was taken in sept.2015"
|
948
873
|
pt = PragmaticTokenizer::Tokenizer.new(
|
949
|
-
text,
|
950
874
|
punctuation: 'none'
|
951
875
|
)
|
952
|
-
expect(pt.tokenize).to eq(["picture", "was", "taken", "in", "sept.", "2015"])
|
876
|
+
expect(pt.tokenize(text)).to eq(["picture", "was", "taken", "in", "sept.", "2015"])
|
953
877
|
end
|
954
878
|
|
955
879
|
it 'handles non-domain words with a dot 3' do
|
956
880
|
text = "They were being helped.This is solidarity. See the breaking news stories about X on cnn.com/europe and english.alarabiya.net, here’s a screenshot: https://t.co/s83k28f29d31s83"
|
957
881
|
pt = PragmaticTokenizer::Tokenizer.new(
|
958
|
-
text,
|
959
882
|
punctuation: 'none'
|
960
883
|
)
|
961
|
-
expect(pt.tokenize).to eq(["they", "were", "being", "helped", "this", "is", "solidarity", "see", "the", "breaking", "news", "stories", "about", "x", "on", "cnn.com", "europe", "and", "english.alarabiya.net", "here’s", "a", "screenshot", "https://t.co/s83k28f29d31s83"])
|
884
|
+
expect(pt.tokenize(text)).to eq(["they", "were", "being", "helped", "this", "is", "solidarity", "see", "the", "breaking", "news", "stories", "about", "x", "on", "cnn.com", "europe", "and", "english.alarabiya.net", "here’s", "a", "screenshot", "https://t.co/s83k28f29d31s83"])
|
962
885
|
end
|
963
886
|
|
964
887
|
it 'handles numbers with symbols 1' do
|
965
888
|
text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
|
966
889
|
pt = PragmaticTokenizer::Tokenizer.new(
|
967
|
-
text,
|
968
890
|
punctuation: 'none'
|
969
891
|
)
|
970
|
-
expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
|
892
|
+
expect(pt.tokenize(text)).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
|
971
893
|
end
|
972
894
|
|
973
895
|
it 'handles numbers with symbols 2' do
|
974
896
|
text = "Pittsburgh Steelers won 18:16 against Cincinnati Bengals!"
|
975
897
|
pt = PragmaticTokenizer::Tokenizer.new(
|
976
|
-
text,
|
977
898
|
punctuation: 'none'
|
978
899
|
)
|
979
|
-
expect(pt.tokenize).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
|
900
|
+
expect(pt.tokenize(text)).to eq(["pittsburgh", "steelers", "won", "18:16", "against", "cincinnati", "bengals"])
|
980
901
|
end
|
981
902
|
|
982
903
|
it 'handles apostrophes and quotes' do
|
983
904
|
text = "“Data Visualization: How to Tell Stories with Data — Jeff Korhan” by @AINewsletter"
|
984
905
|
pt = PragmaticTokenizer::Tokenizer.new(
|
985
|
-
text,
|
986
906
|
punctuation: 'none'
|
987
907
|
)
|
988
|
-
expect(pt.tokenize).to eq(["data", "visualization", "how", "to", "tell", "stories", "with", "data", "jeff", "korhan", "by", "@ainewsletter"])
|
908
|
+
expect(pt.tokenize(text)).to eq(["data", "visualization", "how", "to", "tell", "stories", "with", "data", "jeff", "korhan", "by", "@ainewsletter"])
|
989
909
|
end
|
990
910
|
|
991
911
|
it 'handles mentions' do
|
992
912
|
text = ".@someone I disagree"
|
993
913
|
pt = PragmaticTokenizer::Tokenizer.new(
|
994
|
-
text,
|
995
914
|
punctuation: 'none'
|
996
915
|
)
|
997
|
-
expect(pt.tokenize).to eq(["@someone", "i", "disagree"])
|
916
|
+
expect(pt.tokenize(text)).to eq(["@someone", "i", "disagree"])
|
998
917
|
end
|
999
918
|
|
1000
919
|
it 'handles old school emoticons 2' do
|
1001
920
|
text = "oooh! <3"
|
1002
921
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1003
|
-
text,
|
1004
922
|
punctuation: 'none'
|
1005
923
|
)
|
1006
|
-
expect(pt.tokenize).to eq(["oooh", "<3"])
|
924
|
+
expect(pt.tokenize(text)).to eq(["oooh", "<3"])
|
1007
925
|
end
|
1008
926
|
|
1009
927
|
it 'handles old school emoticons 3' do
|
1010
928
|
text = "@someone <33"
|
1011
929
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1012
|
-
text,
|
1013
930
|
punctuation: 'none'
|
1014
931
|
)
|
1015
|
-
expect(pt.tokenize).to eq(["@someone", "<33"])
|
932
|
+
expect(pt.tokenize(text)).to eq(["@someone", "<33"])
|
1016
933
|
end
|
1017
934
|
|
1018
935
|
it 'handles words with a symbol prefix 1' do
|
1019
936
|
text = "Yes! /cc @someone"
|
1020
937
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1021
|
-
text,
|
1022
938
|
punctuation: 'none'
|
1023
939
|
)
|
1024
|
-
expect(pt.tokenize).to eq(["yes", "cc", "@someone"])
|
940
|
+
expect(pt.tokenize(text)).to eq(["yes", "cc", "@someone"])
|
1025
941
|
end
|
1026
942
|
|
1027
943
|
it 'handles words with a emoji suffix' do
|
1028
944
|
text = "Let's meet there.😝 ok?"
|
1029
945
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1030
|
-
text,
|
1031
946
|
punctuation: 'none'
|
1032
947
|
)
|
1033
|
-
expect(pt.tokenize).to eq(["let's", "meet", "there", "😝", "ok"])
|
948
|
+
expect(pt.tokenize(text)).to eq(["let's", "meet", "there", "😝", "ok"])
|
1034
949
|
end
|
1035
950
|
|
1036
951
|
it 'handles words with a symbol prefix 2' do
|
1037
952
|
text = "blah blah |photo by @someone"
|
1038
953
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1039
|
-
text,
|
1040
954
|
punctuation: 'none'
|
1041
955
|
)
|
1042
|
-
expect(pt.tokenize).to eq(["blah", "blah", "photo", "by", "@someone"])
|
956
|
+
expect(pt.tokenize(text)).to eq(["blah", "blah", "photo", "by", "@someone"])
|
1043
957
|
end
|
1044
958
|
|
1045
959
|
it 'handles pseudo-contractions' do
|
1046
960
|
text = "I suggest to buy stocks that are low value+have momentum"
|
1047
961
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1048
|
-
text,
|
1049
962
|
punctuation: 'none'
|
1050
963
|
)
|
1051
|
-
expect(pt.tokenize).to eq(%w(i suggest to buy stocks that are low value have momentum))
|
964
|
+
expect(pt.tokenize(text)).to eq(%w(i suggest to buy stocks that are low value have momentum))
|
1052
965
|
end
|
1053
966
|
|
1054
967
|
it 'handles apostrophes and quotes 1' do
|
1055
968
|
text = "Watch the video of @amandapalmer's song “Killing Type” here"
|
1056
969
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1057
|
-
text,
|
1058
970
|
punctuation: 'none'
|
1059
971
|
)
|
1060
|
-
expect(pt.tokenize).to eq(["watch", "the", "video", "of", "@amandapalmer's", "song", "killing", "type", "here"])
|
972
|
+
expect(pt.tokenize(text)).to eq(["watch", "the", "video", "of", "@amandapalmer's", "song", "killing", "type", "here"])
|
1061
973
|
end
|
1062
974
|
|
1063
975
|
it 'handles apostrophes and quotes 2' do
|
1064
976
|
text = "Watch the video of @amandapalmer`s song “Killing Type” here"
|
1065
977
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1066
|
-
text,
|
1067
978
|
punctuation: 'none'
|
1068
979
|
)
|
1069
|
-
expect(pt.tokenize).to eq(["watch", "the", "video", "of", "@amandapalmer`s", "song", "killing", "type", "here"])
|
980
|
+
expect(pt.tokenize(text)).to eq(["watch", "the", "video", "of", "@amandapalmer`s", "song", "killing", "type", "here"])
|
1070
981
|
end
|
1071
982
|
|
1072
983
|
it 'handles numbers suffixed with a symbol' do
|
1073
984
|
text = "4 Things Marketers Must Do Better in 2016: blah"
|
1074
985
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1075
|
-
text,
|
1076
986
|
punctuation: 'none'
|
1077
987
|
)
|
1078
|
-
expect(pt.tokenize).to eq(%w(4 things marketers must do better in 2016 blah))
|
988
|
+
expect(pt.tokenize(text)).to eq(%w(4 things marketers must do better in 2016 blah))
|
1079
989
|
end
|
1080
990
|
|
1081
991
|
it 'handles words with a emoticon suffix' do
|
1082
992
|
skip "NOT IMPLEMENTED"
|
1083
993
|
text = "look, a dog with shoes☺ !!"
|
1084
994
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1085
|
-
text,
|
1086
995
|
punctuation: 'none'
|
1087
996
|
)
|
1088
|
-
expect(pt.tokenize).to eq(["look", "a", "dog", "with", "shoes", "☺"])
|
997
|
+
expect(pt.tokenize(text)).to eq(["look", "a", "dog", "with", "shoes", "☺"])
|
1089
998
|
end
|
1090
999
|
|
1091
1000
|
it 'handles emoji 1' do
|
1092
1001
|
text = "How bad!😝"
|
1093
1002
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1094
|
-
text,
|
1095
1003
|
punctuation: 'none'
|
1096
1004
|
)
|
1097
|
-
expect(pt.tokenize).to eq(["how", "bad", "😝"])
|
1005
|
+
expect(pt.tokenize(text)).to eq(["how", "bad", "😝"])
|
1098
1006
|
end
|
1099
1007
|
|
1100
1008
|
it 'handles emoji 2' do
|
1101
1009
|
text = "😝How bad!"
|
1102
1010
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1103
|
-
text,
|
1104
1011
|
punctuation: 'none'
|
1105
1012
|
)
|
1106
|
-
expect(pt.tokenize).to eq(["😝", "how", "bad"])
|
1013
|
+
expect(pt.tokenize(text)).to eq(["😝", "how", "bad"])
|
1107
1014
|
end
|
1108
1015
|
|
1109
1016
|
it 'identifies old school emoticons' do
|
1110
1017
|
skip "NOT IMPLEMENTED"
|
1111
1018
|
text = 'looking forward to the new kodak super8 camera \o/'
|
1112
1019
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1113
|
-
text,
|
1114
1020
|
punctuation: 'none'
|
1115
1021
|
)
|
1116
|
-
expect(pt.tokenize).to eq(["looking", "forward", "to", "the", "new", "kodak", "super8", "camera", '\o/'])
|
1022
|
+
expect(pt.tokenize(text)).to eq(["looking", "forward", "to", "the", "new", "kodak", "super8", "camera", '\o/'])
|
1117
1023
|
end
|
1118
1024
|
|
1119
1025
|
it 'splits at hashtags' do
|
1120
1026
|
text = "some sentence#RT ... i like u2.#bono"
|
1121
1027
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1122
|
-
text,
|
1123
1028
|
punctuation: :none
|
1124
1029
|
)
|
1125
|
-
expect(pt.tokenize).to eq(["some", "sentence", "#rt", "i", "like", "u2", "#bono"])
|
1030
|
+
expect(pt.tokenize(text)).to eq(["some", "sentence", "#rt", "i", "like", "u2", "#bono"])
|
1126
1031
|
end
|
1127
1032
|
end
|
1128
1033
|
|
@@ -1130,46 +1035,42 @@ describe PragmaticTokenizer do
|
|
1130
1035
|
it 'removes stop words' do
|
1131
1036
|
text = 'This is a short sentence with explanations and stop words.'
|
1132
1037
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1133
|
-
text,
|
1134
1038
|
language: 'en',
|
1135
1039
|
remove_stop_words: true
|
1136
1040
|
)
|
1137
|
-
expect(pt.tokenize).to eq(["short", "sentence", "explanations", "."])
|
1041
|
+
expect(pt.tokenize(text)).to eq(["short", "sentence", "explanations", "."])
|
1138
1042
|
end
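A minimal sketch of stop-word removal with the 2.2.0 API, using the same sentence and expectations as the specs in this context: remove_stop_words applies the built-in list for the configured language, and, per the expectation shown, a user-supplied stop_words list takes its place.

    text = 'This is a short sentence with explanations and stop words.'

    PragmaticTokenizer::Tokenizer.new(language: 'en', remove_stop_words: true).tokenize(text)
    # => ["short", "sentence", "explanations", "."]

    PragmaticTokenizer::Tokenizer.new(language: 'en', remove_stop_words: true, stop_words: %w(and a)).tokenize(text)
    # => ["this", "is", "short", "sentence", "with", "explanations", "stop", "words", "."]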
|
1139
1043
|
|
1140
1044
|
it 'removes user-supplied stop words' do
|
1141
1045
|
text = 'This is a short sentence with explanations and stop words.'
|
1142
1046
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1143
|
-
text,
|
1144
1047
|
language: 'en',
|
1145
1048
|
remove_stop_words: true,
|
1146
1049
|
stop_words: %w(and a)
|
1147
1050
|
)
|
1148
|
-
expect(pt.tokenize).to eq(["this", "is", "short", "sentence", "with", "explanations", "stop", "words", "."])
|
1051
|
+
expect(pt.tokenize(text)).to eq(["this", "is", "short", "sentence", "with", "explanations", "stop", "words", "."])
|
1149
1052
|
end
|
1150
1053
|
|
1151
1054
|
it 'removes user-supplied stop words and default stop words' do
|
1152
1055
|
text = 'This is a short sentence with explanations and stop words.'
|
1153
1056
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1154
|
-
text,
|
1155
1057
|
language: 'en',
|
1156
1058
|
remove_stop_words: true,
|
1157
1059
|
stop_words: ["sentence"],
|
1158
1060
|
filter_languages: [:en]
|
1159
1061
|
)
|
1160
|
-
expect(pt.tokenize).to eq(["short", "explanations", "."])
|
1062
|
+
expect(pt.tokenize(text)).to eq(["short", "explanations", "."])
|
1161
1063
|
end
|
1162
1064
|
|
1163
1065
|
it 'removes user-supplied stop words and default stop words across multiple languages' do
|
1164
1066
|
text = 'This is a short sentence with explanations and stop words. And achte German words.'
|
1165
1067
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1166
|
-
text,
|
1167
1068
|
language: 'en',
|
1168
1069
|
remove_stop_words: true,
|
1169
1070
|
stop_words: ["sentence"],
|
1170
1071
|
filter_languages: [:en, :de]
|
1171
1072
|
)
|
1172
|
-
expect(pt.tokenize).to eq(["short", "explanations", ".", "german", "."])
|
1073
|
+
expect(pt.tokenize(text)).to eq(["short", "explanations", ".", "german", "."])
|
1173
1074
|
end
|
1174
1075
|
end
|
1175
1076
|
|
@@ -1177,49 +1078,44 @@ describe PragmaticTokenizer do
|
|
1177
1078
|
it 'tokenizes a string #001' do
|
1178
1079
|
text = 'His name is Mr. Smith.'
|
1179
1080
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1180
|
-
text,
|
1181
1081
|
language: 'en',
|
1182
1082
|
punctuation: 'none'
|
1183
1083
|
)
|
1184
|
-
expect(pt.tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith'])
|
1084
|
+
expect(pt.tokenize(text)).to eq(['his', 'name', 'is', 'mr.', 'smith'])
|
1185
1085
|
end
|
1186
1086
|
|
1187
1087
|
it 'tokenizes a string #002' do
|
1188
1088
|
text = "Hello Ms. Piggy, this is John. We are selling a new fridge for $5,000. That is a 20% discount over the Nev. retailers. It is a 'MUST BUY', so don't hesistate."
|
1189
1089
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1190
|
-
text,
|
1191
1090
|
language: 'en',
|
1192
1091
|
punctuation: 'only'
|
1193
1092
|
)
|
1194
|
-
expect(pt.tokenize).to eq([",", ".", ".", ".", "'", "'", ",", "."])
|
1093
|
+
expect(pt.tokenize(text)).to eq([",", ".", ".", ".", "'", "'", ",", "."])
|
1195
1094
|
end
|
1196
1095
|
|
1197
1096
|
it 'tokenizes a string #003' do
|
1198
1097
|
text = "Hello the a it experiment one fine."
|
1199
1098
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1200
|
-
text,
|
1201
1099
|
language: 'en',
|
1202
1100
|
remove_stop_words: true
|
1203
1101
|
)
|
1204
|
-
expect(pt.tokenize).to eq(["experiment", "fine", "."])
|
1102
|
+
expect(pt.tokenize(text)).to eq(["experiment", "fine", "."])
|
1205
1103
|
end
|
1206
1104
|
|
1207
1105
|
it 'tokenizes a string #004' do
|
1208
1106
|
# https://www.ibm.com/developerworks/community/blogs/nlp/entry/tokenization?lang=en
|
1209
1107
|
text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
|
1210
1108
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1211
|
-
text,
|
1212
1109
|
expand_contractions: true,
|
1213
1110
|
remove_stop_words: true,
|
1214
1111
|
punctuation: 'none'
|
1215
1112
|
)
|
1216
|
-
expect(pt.tokenize).to eq(%w(crazy sandowsky afford))
|
1113
|
+
expect(pt.tokenize(text)).to eq(%w(crazy sandowsky afford))
|
1217
1114
|
end
|
1218
1115
|
|
1219
1116
|
it 'tokenizes a string #005' do
|
1220
1117
|
text = "Hello world with a stop word experiment."
|
1221
1118
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1222
|
-
text,
|
1223
1119
|
language: 'en',
|
1224
1120
|
clean: true,
|
1225
1121
|
numbers: :none,
|
@@ -1228,64 +1124,58 @@ describe PragmaticTokenizer do
|
|
1228
1124
|
remove_stop_words: true,
|
1229
1125
|
punctuation: 'none'
|
1230
1126
|
)
|
1231
|
-
expect(pt.tokenize).to eq(["experiment"])
|
1127
|
+
expect(pt.tokenize(text)).to eq(["experiment"])
|
1232
1128
|
end
|
1233
1129
|
|
1234
1130
|
it 'tokenizes a string #006' do
|
1235
1131
|
text = "Hello; what is your: name @username **delete**"
|
1236
1132
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1237
|
-
text,
|
1238
1133
|
clean: true,
|
1239
1134
|
punctuation: 'none'
|
1240
1135
|
)
|
1241
|
-
expect(pt.tokenize).to eq(["hello", "what", "is", "your", "name", "@username", "delete"])
|
1136
|
+
expect(pt.tokenize(text)).to eq(["hello", "what", "is", "your", "name", "@username", "delete"])
|
1242
1137
|
end
|
1243
1138
|
|
1244
1139
|
it 'tokenizes a string #007' do
|
1245
1140
|
text = 'His name is Mr. Smith.'
|
1246
1141
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1247
|
-
text,
|
1248
1142
|
language: 'en',
|
1249
1143
|
punctuation: 'none',
|
1250
1144
|
downcase: false
|
1251
1145
|
)
|
1252
|
-
expect(pt.tokenize).to eq(['His', 'name', 'is', 'Mr.', 'Smith'])
|
1146
|
+
expect(pt.tokenize(text)).to eq(['His', 'name', 'is', 'Mr.', 'Smith'])
|
1253
1147
|
end
|
1254
1148
|
|
1255
1149
|
it 'tokenizes a string #008' do
|
1256
1150
|
text = "Can't go tonight. Didn't finish."
|
1257
1151
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1258
|
-
text,
|
1259
1152
|
downcase: false,
|
1260
1153
|
expand_contractions: true
|
1261
1154
|
)
|
1262
|
-
expect(pt.tokenize).to eq(["Cannot", "go", "tonight", ".", "Did", "not", "finish", "."])
|
1155
|
+
expect(pt.tokenize(text)).to eq(["Cannot", "go", "tonight", ".", "Did", "not", "finish", "."])
|
1263
1156
|
end
|
1264
1157
|
|
1265
1158
|
it 'tokenizes a string #009' do
|
1266
1159
|
text = "Some *interesting stuff* is __happening here__"
|
1267
1160
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1268
|
-
text,
|
1269
1161
|
punctuation: 'none',
|
1270
1162
|
clean: true
|
1271
1163
|
)
|
1272
|
-
expect(pt.tokenize).to eq(%w(some interesting stuff is happening here))
|
1164
|
+
expect(pt.tokenize(text)).to eq(%w(some interesting stuff is happening here))
|
1273
1165
|
end
|
1274
1166
|
|
1275
1167
|
it 'also allows symbols for options' do
|
1276
1168
|
text = 'His name is Mr. Smith.'
|
1277
1169
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1278
|
-
text,
|
1279
1170
|
language: :en,
|
1280
1171
|
punctuation: :none
|
1281
1172
|
)
|
1282
|
-
expect(pt.tokenize).to eq(['his', 'name', 'is', 'mr.', 'smith'])
|
1173
|
+
expect(pt.tokenize(text)).to eq(['his', 'name', 'is', 'mr.', 'smith'])
|
1283
1174
|
end
|
1284
1175
|
|
1285
1176
|
it 'handles long strings 1' do
|
1286
1177
|
text = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
|
1287
1178
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1288
|
-
text,
|
1289
1179
|
language: 'en',
|
1290
1180
|
clean: true,
|
1291
1181
|
minimum_length: 3,
|
@@ -1294,13 +1184,12 @@ describe PragmaticTokenizer do
|
|
1294
1184
|
numbers: :none,
|
1295
1185
|
punctuation: :none
|
1296
1186
|
)
|
1297
|
-
expect(pt.tokenize).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"])
|
1187
|
+
expect(pt.tokenize(text)).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"])
|
1298
1188
|
end
|
1299
1189
|
|
1300
1190
|
it 'handles long strings 2' do
|
1301
1191
|
text = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 10
|
1302
1192
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1303
|
-
text,
|
1304
1193
|
language: 'en',
|
1305
1194
|
clean: true,
|
1306
1195
|
minimum_length: 3,
|
@@ -1309,23 +1198,21 @@ describe PragmaticTokenizer do
|
|
1309
1198
|
numbers: :none,
|
1310
1199
|
punctuation: :none
|
1311
1200
|
)
|
1312
|
-
expect(pt.tokenize).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"] * 10)
|
1201
|
+
expect(pt.tokenize(text)).to eq(["jonas", "jonas", "smith", "jonas", "smith", "turn", "jane", "party", "closed", "deal", "pitt", "briggs", "noon", "jane", "closed", "deal", "pitt", "briggs", "closed", "yesterday", "mt.", "fuji", "st.", "michael's", "church", "st.", "light", "jfk", "jr.", "book", "visited", "u.s.a.", "year", "live", "e.u.", "live", "u.s.", "work", "u.s.", "government", "virginia", "lived", "u.s.", "years", "bag", "bag", "teaches", "science", "worked", "years", "engineer", "local", "university", "email", "jane.doe@example.com", "email", "site", "check", "turned", "great", "turned", "great", "turned", "great", "held", "book", "long", "time", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "item", "list", "item", "sentence", "cut", "middle", "pdf", "cold", "night", "city", "features", "contact", "manager", "events", "activities", "treasure", "works", "yahoo", "accounting", "department", "good", "team", "albert", "jones", "yesterday", "thoreau", "argues", "simplifying", "one’s", "life", "laws", "universe", "complex", "bohr", "analogy", "parallel", "stairways", "smith", "left", "sentence", "omission", "ellipsis", "marks", "preceded", "space", "sentence", "period", "sentence", "meant", "left", "store", "habit", "weakened", "combining", "self-interpreting", "compounds", "practice", "abandoned"] * 10)
|
1313
1202
|
end
|
1314
1203
|
|
1315
1204
|
it 'handles markdown' do
|
1316
1205
|
text = "This is _bold_ and this is *italic*"
|
1317
1206
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1318
|
-
text,
|
1319
1207
|
punctuation: 'none',
|
1320
1208
|
clean: true
|
1321
1209
|
)
|
1322
|
-
expect(pt.tokenize).to eq(%w(this is bold and this is italic))
|
1210
|
+
expect(pt.tokenize(text)).to eq(%w(this is bold and this is italic))
|
1323
1211
|
end
|
1324
1212
|
|
1325
1213
|
it 'handles single quotes' do
|
1326
1214
|
text = "Recognised as one of the ‘good’ games."
|
1327
1215
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1328
|
-
text,
|
1329
1216
|
language: 'en',
|
1330
1217
|
clean: true,
|
1331
1218
|
numbers: :none,
|
@@ -1334,113 +1221,103 @@ describe PragmaticTokenizer do
|
|
1334
1221
|
remove_stop_words: true,
|
1335
1222
|
punctuation: :none,
|
1336
1223
|
downcase: true)
|
1337
|
-
expect(pt.tokenize).to eq(%w(recognised good games))
|
1224
|
+
expect(pt.tokenize(text)).to eq(%w(recognised good games))
|
1338
1225
|
end
|
1339
1226
|
|
1340
1227
|
it 'removes control characters' do
|
1341
1228
|
text = "\u0000 \u001F \u007FHello test."
|
1342
1229
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1343
|
-
text,
|
1344
1230
|
language: 'en',
|
1345
1231
|
clean: true
|
1346
1232
|
)
|
1347
|
-
expect(pt.tokenize).to eq(["hello", "test", "."])
|
1233
|
+
expect(pt.tokenize(text)).to eq(["hello", "test", "."])
|
1348
1234
|
end
|
1349
1235
|
|
1350
1236
|
it 'splits too long words with hypens' do
|
1351
1237
|
text = "hi-hat and old-school but not really-important-long-word"
|
1352
1238
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1353
|
-
text,
|
1354
1239
|
punctuation: 'none',
|
1355
1240
|
long_word_split: 12
|
1356
1241
|
)
|
1357
|
-
expect(pt.tokenize).to eq(["hi-hat", "and", "old-school", "but", "not", "really", "important", "long", "word"])
|
1242
|
+
expect(pt.tokenize(text)).to eq(["hi-hat", "and", "old-school", "but", "not", "really", "important", "long", "word"])
|
1358
1243
|
end
|
1359
1244
|
|
1360
1245
|
it 'handles hashtags 2' do
|
1361
1246
|
text = "This is the #upper-#limit"
|
1362
1247
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1363
|
-
text,
|
1364
1248
|
punctuation: 'none',
|
1365
1249
|
hashtags: :keep_and_clean
|
1366
1250
|
)
|
1367
|
-
expect(pt.tokenize).to eq(%w(this is the upper limit))
|
1251
|
+
expect(pt.tokenize(text)).to eq(%w(this is the upper limit))
|
1368
1252
|
end
|
1369
1253
|
|
1370
1254
|
it 'handles hashtags 3' do
|
1371
1255
|
text = "The #2016-fun has just begun."
|
1372
1256
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1373
|
-
text,
|
1374
1257
|
punctuation: 'none',
|
1375
1258
|
hashtags: :keep_and_clean
|
1376
1259
|
)
|
1377
|
-
expect(pt.tokenize).to eq(%w(the 2016 fun has just begun))
|
1260
|
+
expect(pt.tokenize(text)).to eq(%w(the 2016 fun has just begun))
|
1378
1261
|
end
|
1379
1262
|
|
1380
1263
|
it 'does not clean mentions' do
|
1381
1264
|
text = "@_someone_ because @someone and @_someone was taken"
|
1382
1265
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1383
|
-
text,
|
1384
1266
|
mentions: :keep_original,
|
1385
1267
|
clean: true
|
1386
1268
|
)
|
1387
|
-
expect(pt.tokenize).to eq(["@_someone_", "because", "@someone", "and", "@_someone", "was", "taken"])
|
1269
|
+
expect(pt.tokenize(text)).to eq(["@_someone_", "because", "@someone", "and", "@_someone", "was", "taken"])
|
1388
1270
|
end
|
1389
1271
|
|
1390
1272
|
it 'removes double single quotes' do
|
1391
1273
|
text = "Strong statement in ''The Day The Earth Caught Fire'' (1961)"
|
1392
1274
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1393
|
-
text,
|
1394
1275
|
punctuation: :none,
|
1395
1276
|
clean: true
|
1396
1277
|
)
|
1397
|
-
expect(pt.tokenize).to eq(%w(strong statement in the day the earth caught fire 1961))
|
1278
|
+
expect(pt.tokenize(text)).to eq(%w(strong statement in the day the earth caught fire 1961))
|
1398
1279
|
end
|
1399
1280
|
|
1400
1281
|
it 'removes a hyphen prefix 1' do
|
1401
1282
|
text = "Geopol.-Strategy"
|
1402
1283
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1403
|
-
text,
|
1404
1284
|
punctuation: :none,
|
1405
1285
|
clean: true
|
1406
1286
|
)
|
1407
|
-
expect(pt.tokenize).to eq(%w(geopol strategy))
|
1287
|
+
expect(pt.tokenize(text)).to eq(%w(geopol strategy))
|
1408
1288
|
end
|
1409
1289
|
|
1410
1290
|
it 'removes a hyphen prefix 2' do
|
1411
1291
|
text = "The language we use creates the reality we experience.-Michael Hyatt #quote"
|
1412
1292
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1413
|
-
text,
|
1414
1293
|
punctuation: :none,
|
1415
1294
|
clean: true
|
1416
1295
|
)
|
1417
|
-
expect(pt.tokenize).to eq(["the", "language", "we", "use", "creates", "the", "reality", "we", "experience", "michael", "hyatt", "#quote"])
|
1296
|
+
expect(pt.tokenize(text)).to eq(["the", "language", "we", "use", "creates", "the", "reality", "we", "experience", "michael", "hyatt", "#quote"])
|
1418
1297
|
end
|
1419
1298
|
|
1420
1299
|
it 'does not remove tokens with ampersands' do
|
1421
1300
|
text = "you&me"
|
1422
1301
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1423
|
-
text,
|
1424
1302
|
clean: true,
|
1425
1303
|
punctuation: :none
|
1426
1304
|
)
|
1427
|
-
expect(pt.tokenize).to eq(%w(you me))
|
1305
|
+
expect(pt.tokenize(text)).to eq(%w(you me))
|
1428
1306
|
end
|
1429
1307
|
|
1430
1308
|
it 'cleans percent signs not related to numbers' do
|
1431
1309
|
text = "TudoW%1 provides company users a way to offer each other, and guests, and interpreters%6 free assistance. To date, there have been %2 questions asked."
|
1432
1310
|
pt = PragmaticTokenizer::Tokenizer.new(
|
1433
|
-
text,
|
1434
1311
|
clean: true,
|
1435
1312
|
numbers: :none,
|
1436
1313
|
punctuation: :none
|
1437
1314
|
)
|
1438
|
-
expect(pt.tokenize).to eq(%w(tudow provides company users a way to offer each other and guests and interpreters free assistance to date there have been questions asked))
|
1315
|
+
expect(pt.tokenize(text)).to eq(%w(tudow provides company users a way to offer each other and guests and interpreters free assistance to date there have been questions asked))
|
1439
1316
|
end
|
1440
1317
|
|
1441
1318
|
it 'removes non-breaking spaces' do
|
1442
1319
|
text = "%20141201~221624 %User ID,JU,JU John %TU=00000362 %PT-BR %Wordfast da hello."
|
1443
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1320
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1444
1321
|
language: :en,
|
1445
1322
|
filter_languages: [:en],
|
1446
1323
|
clean: true,
|
@@ -1456,12 +1333,12 @@ describe PragmaticTokenizer do
|
|
1456
1333
|
mentions: :remove,
|
1457
1334
|
downcase: true
|
1458
1335
|
)
|
1459
|
-
expect(pt.tokenize).to eq(["user", "john", "pt-br", "wordfast"])
|
1336
|
+
expect(pt.tokenize(text)).to eq(["user", "john", "pt-br", "wordfast"])
|
1460
1337
|
end
|
1461
1338
|
|
1462
|
-
it 'removes
|
1463
|
-
text = "
|
1464
|
-
pt = PragmaticTokenizer::Tokenizer.new(
|
1339
|
+
it 'removes non-breaking spaces' do
|
1340
|
+
text = "%20141201~221624 %User ID,JU,JU John %TU=00000362 %PT-BR %Wordfast da hello."
|
1341
|
+
pt = PragmaticTokenizer::Tokenizer.new(
|
1465
1342
|
language: :en,
|
1466
1343
|
filter_languages: [:en],
|
1467
1344
|
clean: true,
|
@@ -1477,225 +1354,222 @@ describe PragmaticTokenizer do
|
|
1477
1354
|
mentions: :remove,
|
1478
1355
|
downcase: true
|
1479
1356
|
)
|
1480
|
-
expect(pt.tokenize).to eq(["
|
1357
|
+
expect(pt.tokenize(text)).to eq(["user", "john", "pt-br", "wordfast"])
|
1481
1358
|
end
|
1482
|
-
|
1483
|
-
|
1484
1359
|
end
|
1485
1360
|
end
|
1486
1361
|
|
1487
1362
|
context 'ending punctutation' do
|
1488
1363
|
it 'handles ending question marks' do
|
1489
1364
|
text = 'What is your name?'
|
1490
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1365
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["what", "is", "your", "name", "?"])
|
1491
1366
|
end
|
1492
1367
|
|
1493
1368
|
it 'handles exclamation points' do
|
1494
1369
|
text = 'You are the best!'
|
1495
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1370
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["you", "are", "the", "best", "!"])
|
1496
1371
|
end
|
1497
1372
|
|
1498
1373
|
it 'handles periods' do
|
1499
1374
|
text = 'This way a productive day.'
|
1500
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1375
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["this", "way", "a", "productive", "day", "."])
|
1501
1376
|
end
|
1502
1377
|
|
1503
1378
|
it 'handles quotation marks' do
|
1504
1379
|
text = "\"He is not the one you are looking for.\""
|
1505
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1380
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["\"", "he", "is", "not", "the", "one", "you", "are", "looking", "for", ".", "\""])
|
1506
1381
|
end
|
1507
1382
|
|
1508
1383
|
it 'handles single quotation marks' do
|
1509
1384
|
text = "'He is not the one you are looking for.'"
|
1510
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1385
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["'", "he", "is", "not", "the", "one", "you", "are", "looking", "for", ".", "'"])
|
1511
1386
|
end
|
1512
1387
|
|
1513
1388
|
it "handles single quotation marks ('twas)" do
|
1514
1389
|
text = "'Twas the night before Christmas and 'twas cloudy."
|
1515
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1390
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["'twas", "the", "night", "before", "christmas", "and", "'twas", "cloudy", "."])
|
1516
1391
|
end
|
1517
1392
|
|
1518
1393
|
it 'handles double quotes at the end of a sentence' do
|
1519
1394
|
text = "She said, \"I love cake.\""
|
1520
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1395
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["she", "said", ",", "\"", "i", "love", "cake", ".", "\""])
|
1521
1396
|
end
|
1522
1397
|
|
1523
1398
|
it 'handles double quotes at the beginning of a sentence' do
|
1524
1399
|
text = "\"I love cake.\", she said to her friend."
|
1525
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1400
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["\"", "i", "love", "cake", ".", "\"", ",", "she", "said", "to", "her", "friend", "."])
|
1526
1401
|
end
|
1527
1402
|
|
1528
1403
|
it 'handles double quotes in the middle of a sentence' do
|
1529
1404
|
text = "She said, \"I love cake.\" to her friend."
|
1530
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1405
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["she", "said", ",", "\"", "i", "love", "cake", ".", "\"", "to", "her", "friend", "."])
|
1531
1406
|
end
|
1532
1407
|
end
|
1533
1408
|
|
1534
1409
|
context 'other punctutation' do
|
1535
1410
|
it 'handles ellipses' do
|
1536
1411
|
text = 'Today is the last day...'
|
1537
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1412
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(['today', 'is', 'the', 'last', 'day', '...'])
|
1538
1413
|
end
|
1539
1414
|
|
1540
1415
|
it 'handles special quotes' do
|
1541
1416
|
text = "«That's right», he said."
|
1542
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1417
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["«", "that's", "right", "»", ",", "he", "said", "."])
|
1543
1418
|
end
|
1544
1419
|
|
1545
1420
|
it 'handles upside down punctuation (¿)' do
|
1546
1421
|
text = "¿Really?"
|
1547
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1422
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["¿", "really", "?"])
|
1548
1423
|
end
|
1549
1424
|
|
1550
1425
|
it 'handles upside down punctuation (¡)' do
|
1551
1426
|
text = "¡Really!"
|
1552
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1427
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["¡", "really", "!"])
|
1553
1428
|
end
|
1554
1429
|
|
1555
1430
|
it 'handles colons' do
|
1556
1431
|
text = "This was the news: 'Today is the day!'"
|
1557
|
-
expect(PragmaticTokenizer::Tokenizer.new(text)
|
1432
|
+
expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["this", "was", "the", "news", ":", "'", "today", "is", "the", "day", "!", "'"])
|
1558
1433
|
end
|
1559
1434
|
|
1560
1435
|
      it 'handles web addresses' do
        text = "Please visit the site - https://www.tm-town.com"
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["please", "visit", "the", "site", "-", "https://www.tm-town.com"])
      end

      it 'handles multiple colons and web addresses' do
        text = "Please visit the site: https://www.tm-town.com"
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["please", "visit", "the", "site", ":", "https://www.tm-town.com"])
      end

      it 'handles multiple dashes' do
        text = "John--here is your ticket."
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["john", "-", "here", "is", "your", "ticket", "."])
      end

      it 'handles brackets' do
        text = "This is an array: ['Hello']."
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["this", "is", "an", "array", ":", "[", "'", "hello", "'", "]", "."])
      end

      it 'handles double question marks' do
        text = "This is a question??"
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["this", "is", "a", "question", "?", "?"])
      end

      it 'handles multiple ending punctuation' do
        text = "This is a question?!?"
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["this", "is", "a", "question", "?", "!", "?"])
      end

      it 'handles contractions 1' do
        text = "How'd it go yesterday?"
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["how'd", "it", "go", "yesterday", "?"])
      end

      it 'handles contractions 2' do
        text = "You shouldn't worry."
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["you", "shouldn't", "worry", "."])
      end

      it 'handles contractions 3' do
        text = "We've gone too far. It'll be over when we're done."
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["we've", "gone", "too", "far", ".", "it'll", "be", "over", "when", "we're", "done", "."])
      end

      it 'handles numbers' do
        text = 'He paid $10,000,000 for the new house which is equivalent to ¥1,000,000,000.00.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(['he', 'paid', '$10,000,000', 'for', 'the', 'new', 'house', 'which', 'is', 'equivalent', 'to', '¥1,000,000,000.00', '.'])
      end

      it 'follows the Chicago Manual of Style on punctuation' do
        text = 'An abbreviation that ends with a period must not be left hanging without it (in parentheses, e.g.), and a sentence containing a parenthesis must itself have terminal punctuation (are we almost done?).'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(['an', 'abbreviation', 'that', 'ends', 'with', 'a', 'period', 'must', 'not', 'be', 'left', 'hanging', 'without', 'it', '(', 'in', 'parentheses', ',', 'e.g.', ')', ',', 'and', 'a', 'sentence', 'containing', 'a', 'parenthesis', 'must', 'itself', 'have', 'terminal', 'punctuation', '(', 'are', 'we', 'almost', 'done', '?', ')', '.'])
      end

      it 'is case insensitive' do
        text = 'his name is mr. smith, king of the \'entire\' forest.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(['his', 'name', 'is', 'mr.', 'smith', ',', 'king', 'of', 'the', '\'', 'entire', '\'', 'forest', '.'])
      end

      it 'handles web url addresses #1' do
        text = 'Check out http://www.google.com/?this_is_a_url/hello-world.html for more info.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["check", "out", "http://www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
      end

      it 'handles web url addresses #2' do
        text = 'Check out https://www.google.com/?this_is_a_url/hello-world.html for more info.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["check", "out", "https://www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
      end

      it 'handles web url addresses #3' do
        text = 'Check out www.google.com/?this_is_a_url/hello-world.html for more info.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["check", "out", "www.google.com/?this_is_a_url/hello-world.html", "for", "more", "info", "."])
      end

      it 'handles email addresses' do
        text = 'Please email example@example.com for more info.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["please", "email", "example@example.com", "for", "more", "info", "."])
      end

      it 'handles empty tokens' do
        text = "!!!!! https://t.co/xxxx"
        pt = PragmaticTokenizer::Tokenizer.new(
-
          text,
          punctuation: 'none'
        )
-
        expect(pt.tokenize).to eq(["https://t.co/xxxx"])
+
        expect(pt.tokenize(text)).to eq(["https://t.co/xxxx"])
      end
    end

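The 'handles empty tokens' hunk shows the other half of the migration: options such as punctuation: 'none' stay in the constructor, and only the text argument moves to #tokenize. A short sketch of reusing one configured instance, assuming the defaults exercised above (the second input and its comment are illustrative):

    require 'pragmatic_tokenizer'

    # Options configure the instance once; each #tokenize call supplies its own text.
    pt = PragmaticTokenizer::Tokenizer.new(punctuation: 'none')
    pt.tokenize("!!!!! https://t.co/xxxx") # => ["https://t.co/xxxx"], as asserted in the spec
    pt.tokenize("Really?!")                # illustrative: punctuation-only tokens dropped here as well
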
    context 'abbreviations' do
      it 'handles military abbreviations' do
        text = 'His name is Col. Smith.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["his", "name", "is", "col.", "smith", "."])
      end

      it 'handles institution abbreviations' do
        text = 'She went to East Univ. to get her degree.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["she", "went", "to", "east", "univ.", "to", "get", "her", "degree", "."])
      end

      it 'handles company abbreviations' do
        text = 'He works at ABC Inc. on weekends.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["he", "works", "at", "abc", "inc.", "on", "weekends", "."])
      end

      it 'handles old state abbreviations' do
        text = 'He went to school in Mass. back in the day.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["he", "went", "to", "school", "in", "mass.", "back", "in", "the", "day", "."])
      end

      it 'handles month abbreviations' do
        text = 'It is cold in Jan. they say.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["it", "is", "cold", "in", "jan.", "they", "say", "."])
      end

      it 'handles miscellaneous abbreviations' do
        text = '1, 2, 3, etc. is the beat.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(['1', ',', '2', ',', '3', ',', 'etc.', 'is', 'the', 'beat', '.'])
      end

      it 'handles one letter abbreviations (i.e. Alfred E. Stone)' do
        text = 'Alfred E. Stone is a person.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["alfred", "e.", "stone", "is", "a", "person", "."])
      end

      it 'handles repeating letter-dot words (i.e. U.S.A. or J.C. Penney)' do
        text = 'The U.S.A. is a country.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["the", "u.s.a.", "is", "a", "country", "."])
      end

      it 'handles abbreviations that occur at the end of a sentence' do
        text = 'He works at ABC Inc.'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(["he", "works", "at", "abc", "inc."])
      end

      it 'handles punctuation after an abbreviation' do
        text = 'Exclamation point requires both marks (Q.E.D.!).'
-
        expect(PragmaticTokenizer::Tokenizer.new(text)
+
        expect(PragmaticTokenizer::Tokenizer.new.tokenize(text)).to eq(['exclamation', 'point', 'requires', 'both', 'marks', '(', 'q.e.d.', '!', ')', '.'])
      end
    end
  end