pragmatic_tokenizer 1.3.1 → 1.4.0
- checksums.yaml +4 -4
- data/README.md +8 -9
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +3 -3
- data/spec/languages/bulgarian_spec.rb +41 -0
- data/spec/languages/deutsch_spec.rb +229 -0
- data/spec/languages/english_spec.rb +1535 -0
- data/spec/languages/french_spec.rb +13 -0
- data/spec/performance_spec.rb +62 -0
- data/spec/pragmatic_tokenizer_spec.rb +41 -0
- data/spec/spec_helper.rb +2 -0
- metadata +17 -5
- data/bin/console +0 -14
- data/bin/setup +0 -7
checksums.yaml
CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 46da26b8afc38bfc2699a2875d9786be47702160
+  data.tar.gz: 9073a0505244bb97d7fa51c48e9fe4ab30983c90
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8397bca6ada5fae51d1894b26154b5f9fed73e375b9e78b2803da892c20e5f32efb2722f0fe767cbc171586a5a6979a720041de5f0fd5c50824a0498c55b8394
+  data.tar.gz: f58cae264490ce16bef8e5512760e992dd38a11ced8fa872ab5faff2e92f61e6414f7e9ffd732f1e2910de560b2e9a2e98e1d59aa7039426ee0fbf3d8daaaa1f
```
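To check a downloaded gem against these sums, a minimal sketch using Ruby's standard Digest library; the extraction step and file names assume the usual `.gem` archive layout (a tar containing `metadata.gz` and `data.tar.gz`), which is not part of this diff:

```ruby
require 'digest'

# Assumes the gem archive was unpacked first, e.g.:
#   tar -xf pragmatic_tokenizer-1.4.0.gem
# which yields metadata.gz, data.tar.gz, and checksums.yaml.gz.
%w[metadata.gz data.tar.gz].each do |file|
  puts "#{file}: #{Digest::SHA512.file(file).hexdigest}"
end
# Compare the printed digests with the SHA512 entries above.
```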
data/README.md
CHANGED
````diff
@@ -8,17 +8,16 @@ Pragmatic Tokenizer is a multilingual tokenizer to split a string into tokens.
 
 Add this line to your application's Gemfile:
 
-
-
+**Ruby**
+```
+gem install pragmatic_tokenizer
 ```
 
-
-
-
-
-
-
-$ gem install pragmatic_tokenizer
+**Ruby on Rails**
+Add this line to your application’s Gemfile:
+```ruby
+gem 'pragmatic_tokenizer'
+```
 
 ## Usage
 
````
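The reworked install section leads straight into usage; the constructor-plus-options API it documents is the same one exercised by the new specs below. A minimal sketch (the expected output assumes the tokenizer's default downcasing, which the specs below confirm is on by default):

```ruby
require 'pragmatic_tokenizer'

# Tokenize a simple English sentence with the default options.
pt = PragmaticTokenizer::Tokenizer.new("Hello world.", language: 'en')
pt.tokenize
# => ["hello", "world", "."]
```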
data/pragmatic_tokenizer.gemspec
CHANGED
```diff
@@ -13,9 +13,9 @@ Gem::Specification.new do |spec|
   spec.description = %q{A multilingual tokenizer to split a string into tokens.}
   spec.homepage = "https://github.com/diasks2/pragmatic_tokenizer"
 
-  spec.files = `git ls-files -z`.split("\x0")
-  spec.
-  spec.
+  spec.files = `git ls-files -z`.split("\x0")
+  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
   spec.require_paths = ["lib"]
 
   spec.add_runtime_dependency "unicode"
```
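The restored `executables` and `test_files` assignments rely on `Enumerable#grep`, with and without a block; a quick illustration of what those expressions evaluate to (the file names here are hypothetical):

```ruby
files = ["bin/console", "lib/pragmatic_tokenizer.rb", "spec/spec_helper.rb"]

# With a block, grep maps each matching element through the block.
files.grep(%r{^bin/}) { |f| File.basename(f) }
# => ["console"]

# Without a block, grep returns the matching elements as-is.
files.grep(%r{^(test|spec|features)/})
# => ["spec/spec_helper.rb"]
```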
data/spec/languages/bulgarian_spec.rb
ADDED
@@ -0,0 +1,41 @@
```ruby
require 'spec_helper'

describe PragmaticTokenizer do
  context 'Language: Bulgarian (bg)' do
    it 'tokenizes a string #001' do
      text = 'Стойностни, вкл. български и руски'
      pt = PragmaticTokenizer::Tokenizer.new(text,
        language: 'bg'
      )
      expect(pt.tokenize).to eq(["стойностни", ",", "вкл.", "български", "и", "руски"])
    end

    it 'tokenizes a string #002' do
      text = 'Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат.'
      pt = PragmaticTokenizer::Tokenizer.new(text,
        language: 'bg',
        remove_stop_words: true
      )
      expect(pt.tokenize).to eq(["поставя", "началото", "могъща", "династия", ",", "управлява", "продължение", "150", "саргон", "надделява", "двубой", "владетеля", "град", "ур", "разширява", "териториите", "държавата", "долното", "течение", "тигър", "ефрат", "."])
    end

    it 'tokenizes a string #003' do
      text = 'Без български жертви в Париж.'
      pt = PragmaticTokenizer::Tokenizer.new(text,
        language: 'bg',
        remove_stop_words: true
      )
      expect(pt.tokenize).to eq(["български", "жертви", "париж", "."])
    end

    it 'tokenizes a string #004' do
      text = 'Без български жертви в Париж.'
      pt = PragmaticTokenizer::Tokenizer.new(text,
        language: 'bg',
        remove_stop_words: true,
        downcase: false
      )
      expect(pt.tokenize).to eq(["български", "жертви", "Париж", "."])
    end
  end
end
```
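Specs #003 and #004 differ only in the `downcase` flag; side by side, with the outputs taken from the assertions above:

```ruby
require 'pragmatic_tokenizer'

text = 'Без български жертви в Париж.'  # "No Bulgarian casualties in Paris."

# Stop words removed, tokens downcased (the default):
PragmaticTokenizer::Tokenizer.new(text, language: 'bg', remove_stop_words: true).tokenize
# => ["български", "жертви", "париж", "."]

# Stop words removed, original case preserved:
PragmaticTokenizer::Tokenizer.new(text, language: 'bg', remove_stop_words: true, downcase: false).tokenize
# => ["български", "жертви", "Париж", "."]
```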
data/spec/languages/deutsch_spec.rb
ADDED
@@ -0,0 +1,229 @@
```ruby
require 'spec_helper'

describe PragmaticTokenizer do
  context 'Language: German (de)' do
    it 'tokenizes a string #001' do
      text = 'Das steht auf S. 23, s. vorherige Anmerkung.'
      expect(PragmaticTokenizer::Tokenizer.new(text, language: 'de').tokenize).to eq(['das', 'steht', 'auf', 's.', '23', ',', 's.', 'vorherige', 'anmerkung', '.'])
    end

    it 'tokenizes a string #002' do
      text = 'Die größte Ausdehnung des Landes vom Westen nach Osten beträgt 650 km – von Nord nach Süd sind es 560 km. Unter den europäischen Staaten ist Weißrussland flächenmäßig an 13'
      expect(PragmaticTokenizer::Tokenizer.new(text,
        language: 'de',
        downcase: false,
        remove_stop_words: true,
        punctuation: 'none',
        numbers: :none
      ).tokenize).to eq(["größte", "Ausdehnung", "Landes", "Westen", "Osten", "beträgt", "Nord", "Süd", "europäischen", "Staaten", "Weißrussland", "flächenmäßig"])
    end

    it 'tokenizes a string #003' do
      text = 'Die weißrussischen offiziellen Stellen wie auch die deutsche Diplomatie verwenden in offiziellen deutschsprachigen Texten den Namen Belarus, um die Unterscheidung von Russland zu verdeutlichen.'
      expect(PragmaticTokenizer::Tokenizer.new(text,
        language: 'de',
        downcase: false
      ).tokenize).to eq(["Die", "weißrussischen", "offiziellen", "Stellen", "wie", "auch", "die", "deutsche", "Diplomatie", "verwenden", "in", "offiziellen", "deutschsprachigen", "Texten", "den", "Namen", "Belarus", ",", "um", "die", "Unterscheidung", "von", "Russland", "zu", "verdeutlichen", "."])
    end

    it 'tokenizes a string #004' do
      text = 'der Kaffee-Ersatz'
      expect(PragmaticTokenizer::Tokenizer.new(text,
        language: 'de',
        downcase: false
      ).tokenize).to eq(['der', 'Kaffee-Ersatz'])
    end

    it 'tokenizes a string #005' do
      text = "Charlie Hebdo backlash over 'racist' Alan Kurdi cartoon - https://t.co/J8N2ylVV3w"
      expect(PragmaticTokenizer::Tokenizer.new(text,
        language: 'de',
      ).tokenize).to eq(["charlie", "hebdo", "backlash", "over", "'", "racist", "'", "alan", "kurdi", "cartoon", "-", "https://t.co/j8n2ylvv3w"])
    end

    it 'handles words with a slash 1' do
      text = "We pay 3000 €/month"
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        language: 'de'
      )
      expect(pt.tokenize).to eq(["we", "pay", "3000", "€", "month"])
    end

    it 'handles words with a slash 2' do
      text = "Ich frage mich, wieso er nicht Herr der Lage war/ist."
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        language: 'de'
      )
      expect(pt.tokenize).to eq(["ich", "frage", "mich", "wieso", "er", "nicht", "herr", "der", "lage", "war", "ist"])
    end

    it 'handles words with a slash 3' do
      text = "Poison gas attack in Ghuta/Syria."
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        language: 'de'
      )
      expect(pt.tokenize).to eq(["poison", "gas", "attack", "in", "ghuta", "syria"])
    end

    it 'handles words with a question mark' do
      text = "Essen á la carte?Man ist versucht…"
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        language: 'de'
      )
      expect(pt.tokenize).to eq(["essen", "á", "la", "carte", "man", "ist", "versucht"])
    end

    it 'handles apostrophes and quotes 3' do
      text = "Die “Mitte der Gesellschaft” interessiert sich jetzt für “Feminismus”."
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        language: 'de'
      )
      expect(pt.tokenize).to eq(["die", "mitte", "der", "gesellschaft", "interessiert", "sich", "jetzt", "für", "feminismus"])
    end

    it 'handles mentions 1' do
      text = "@RainerSteinke @_Sternchen_2015 1:0 für dich."
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        language: 'de'
      )
      expect(pt.tokenize).to eq(["@rainersteinke", "@_sternchen_2015", "1:0", "für", "dich"])
    end

    it 'handles mentions 2' do
      text = "@LandauDaniel @AnthZeto @julianfranz @S_Beck19 Yep!"
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        language: 'de'
      )
      expect(pt.tokenize).to eq(["@landaudaniel", "@anthzeto", "@julianfranz", "@s_beck19", "yep"])
    end

    it 'handles old school emoticons 1' do
      text = "du übertreibst maßlos :D"
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        downcase: false,
        language: 'de'
      )
      expect(pt.tokenize).to eq(["du", "übertreibst", "maßlos", ":D"])
    end

    it 'handles words with a symbol suffix' do
      text = "hier ist ein Whirlpool versteckt^^"
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        language: 'de'
      )
      expect(pt.tokenize).to eq(["hier", "ist", "ein", "whirlpool", "versteckt"])
    end

    it 'handles hashtags 1' do
      text = "„Was wir tun wird in diesem Land Leben retten“:#Obama"
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        language: 'de'
      )
      expect(pt.tokenize).to eq(["was", "wir", "tun", "wird", "in", "diesem", "land", "leben", "retten", "#obama"])
    end

    it 'handles numbers and words' do
      text = "Air Force Once ist 18.270-mal abgehoben."
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        language: 'de'
      )
      expect(pt.tokenize).to eq(["air", "force", "once", "ist", "18.270-mal", "abgehoben"])
    end

    it 'maintains the german gender-neutrality form 2' do
      text = "der/die Lehrer_in und seine/ihre Schüler_innen"
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        language: 'de'
      )
      expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen"])
    end

    it 'handles contractions 1' do
      text = "gibt's"
      pt = PragmaticTokenizer::Tokenizer.new(text,
        expand_contractions: true,
        language: 'de'
      )
      expect(pt.tokenize).to eq(["gibt", "es"])
    end

    it 'handles contractions 2' do
      text = "gibt‘s schaut’s wenn's g›spür find´s"
      pt = PragmaticTokenizer::Tokenizer.new(text,
        expand_contractions: true,
        language: 'de'
      )
      expect(pt.tokenize).to eq(["gibt", "es", "schaut", "es", "wenn", "es", "gespür", "finde", "es"])
    end

    it 'removes English stopwords' do
      text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
      pt = PragmaticTokenizer::Tokenizer.new(text,
        filter_languages: [:en],
        remove_stop_words: true,
        language: 'de'
      )
      expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."])
    end

    it 'removes English and German stopwords' do
      text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
      pt = PragmaticTokenizer::Tokenizer.new(text,
        filter_languages: [:en, :de],
        remove_stop_words: true,
        language: 'de'
      )
      expect(pt.tokenize).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
    end

    it 'does not remove English stopwords' do
      text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
      pt = PragmaticTokenizer::Tokenizer.new(text,
        language: 'de'
      )
      expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "this", "has", "some", "english", "."])
    end

    # I don't know how to easily treat these forms, especially the most frequent form
    # that attaches "Innen" (plural) or "In" (singular) (with a capital I) to a word.
    it 'maintains the german gender-neutrality form 1' do
      skip "NOT IMPLEMENTED"
      text = "Wir brauchen eine/n erfahrene/n Informatiker/in."
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        language: 'de'
      )
      expect(pt.tokenize).to eq(["wir", "brauchen", "eine/n", "erfahrene/n", "informatiker/in"])
    end

    it 'handles apostrophes and quotes 4' do
      skip "NOT IMPLEMENTED"
      text = "Endlich regnet es ihm nicht mehr auf ́s Haupt!"
      pt = PragmaticTokenizer::Tokenizer.new(text,
        punctuation: 'none',
        language: 'de'
      )
      expect(pt.tokenize).to eq(["endlich", "regnet", "es", "ihm", "nicht", "mehr", "auf́s", "haupt"])
    end

    it 'handles abbreviations for languages other than English' do
      text = "Adj. Smith how are ü. today."
      pt = PragmaticTokenizer::Tokenizer.new(text,
        language: :de
      )
      expect(pt.tokenize).to eq(["adj", ".", "smith", "how", "are", "ü.", "today", "."])
    end
  end
end
```
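Beyond the options seen in the Bulgarian spec, the German spec exercises `expand_contractions` and multi-language stop-word filtering via `filter_languages`; a short sketch with outputs taken from the assertions above:

```ruby
require 'pragmatic_tokenizer'

# Expand German contractions into their full forms:
PragmaticTokenizer::Tokenizer.new("gibt's", language: 'de', expand_contractions: true).tokenize
# => ["gibt", "es"]

# Remove stop words of several languages from mixed-language text:
text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
PragmaticTokenizer::Tokenizer.new(text,
  language: 'de',
  filter_languages: [:en, :de],
  remove_stop_words: true
).tokenize
# => ["lehrer_in", "schüler_innen", ".", "english", "."]
```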