pragmatic_tokenizer 1.3.1 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +8 -9
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +3 -3
- data/spec/languages/bulgarian_spec.rb +41 -0
- data/spec/languages/deutsch_spec.rb +229 -0
- data/spec/languages/english_spec.rb +1535 -0
- data/spec/languages/french_spec.rb +13 -0
- data/spec/performance_spec.rb +62 -0
- data/spec/pragmatic_tokenizer_spec.rb +41 -0
- data/spec/spec_helper.rb +2 -0
- metadata +17 -5
- data/bin/console +0 -14
- data/bin/setup +0 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 46da26b8afc38bfc2699a2875d9786be47702160
+  data.tar.gz: 9073a0505244bb97d7fa51c48e9fe4ab30983c90
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8397bca6ada5fae51d1894b26154b5f9fed73e375b9e78b2803da892c20e5f32efb2722f0fe767cbc171586a5a6979a720041de5f0fd5c50824a0498c55b8394
+  data.tar.gz: f58cae264490ce16bef8e5512760e992dd38a11ced8fa872ab5faff2e92f61e6414f7e9ffd732f1e2910de560b2e9a2e98e1d59aa7039426ee0fbf3d8daaaa1f
data/README.md
CHANGED
@@ -8,17 +8,16 @@ Pragmatic Tokenizer is a multilingual tokenizer to split a string into tokens.
 
 Add this line to your application's Gemfile:
 
-
-
+**Ruby**
+```
+gem install pragmatic_tokenizer
 ```
 
-
-
-
-
-
-
-$ gem install pragmatic_tokenizer
+**Ruby on Rails**
+Add this line to your application’s Gemfile:
+```ruby
+gem 'pragmatic_tokenizer'
+```
 
 ## Usage
 
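The README hunk above only covers installation. For orientation, the language specs added later in this diff exercise the tokenizer roughly as in the following minimal sketch; the example string and expected output are copied from German spec #001 below, and the constructor signature is the one those specs use, not text taken from the README itself:

require 'pragmatic_tokenizer'

# Options such as language:, remove_stop_words:, downcase:, punctuation: and
# expand_contractions: appear throughout the specs added in this release.
pt = PragmaticTokenizer::Tokenizer.new("Das steht auf S. 23, s. vorherige Anmerkung.", language: 'de')
pt.tokenize
# => ["das", "steht", "auf", "s.", "23", ",", "s.", "vorherige", "anmerkung", "."]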
data/pragmatic_tokenizer.gemspec
CHANGED
@@ -13,9 +13,9 @@ Gem::Specification.new do |spec|
   spec.description = %q{A multilingual tokenizer to split a string into tokens.}
   spec.homepage = "https://github.com/diasks2/pragmatic_tokenizer"
 
-  spec.files = `git ls-files -z`.split("\x0")
-  spec.
-  spec.
+  spec.files = `git ls-files -z`.split("\x0")
+  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
   spec.require_paths = ["lib"]
 
   spec.add_runtime_dependency "unicode"
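The two grep lines added above derive the executable and test-file lists from spec.files. As a standalone illustration of what those expressions return, using a hypothetical file list rather than the gem's actual git ls-files output:

# Hypothetical stand-in for `git ls-files -z`.split("\x0")
files = ["bin/console", "lib/pragmatic_tokenizer.rb", "spec/languages/deutsch_spec.rb"]

# Entries under bin/ are mapped through the block to their basenames.
files.grep(%r{^bin/}) { |f| File.basename(f) }   # => ["console"]

# Without a block, grep simply returns the matching entries.
files.grep(%r{^(test|spec|features)/})           # => ["spec/languages/deutsch_spec.rb"]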
data/spec/languages/bulgarian_spec.rb
ADDED
@@ -0,0 +1,41 @@
+require 'spec_helper'
+
+describe PragmaticTokenizer do
+  context 'Language: Bulgarian (bg)' do
+    it 'tokenizes a string #001' do
+      text = 'Стойностни, вкл. български и руски'
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        language: 'bg'
+      )
+      expect(pt.tokenize).to eq(["стойностни", ",", "вкл.", "български", "и", "руски"])
+    end
+
+    it 'tokenizes a string #002' do
+      text = 'Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат.'
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        language: 'bg',
+        remove_stop_words: true
+      )
+      expect(pt.tokenize).to eq(["поставя", "началото", "могъща", "династия", ",", "управлява", "продължение", "150", "саргон", "надделява", "двубой", "владетеля", "град", "ур", "разширява", "териториите", "държавата", "долното", "течение", "тигър", "ефрат", "."])
+    end
+
+    it 'tokenizes a string #003' do
+      text = 'Без български жертви в Париж.'
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        language: 'bg',
+        remove_stop_words: true
+      )
+      expect(pt.tokenize).to eq(["български", "жертви", "париж", "."])
+    end
+
+    it 'tokenizes a string #004' do
+      text = 'Без български жертви в Париж.'
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        language: 'bg',
+        remove_stop_words: true,
+        downcase: false
+      )
+      expect(pt.tokenize).to eq(["български", "жертви", "Париж", "."])
+    end
+  end
+end
data/spec/languages/deutsch_spec.rb
ADDED
@@ -0,0 +1,229 @@
+require 'spec_helper'
+
+describe PragmaticTokenizer do
+  context 'Language: German (de)' do
+    it 'tokenizes a string #001' do
+      text = 'Das steht auf S. 23, s. vorherige Anmerkung.'
+      expect(PragmaticTokenizer::Tokenizer.new(text, language: 'de').tokenize).to eq(['das', 'steht', 'auf', 's.', '23', ',', 's.', 'vorherige', 'anmerkung', '.'])
+    end
+
+    it 'tokenizes a string #002' do
+      text = 'Die größte Ausdehnung des Landes vom Westen nach Osten beträgt 650 km – von Nord nach Süd sind es 560 km. Unter den europäischen Staaten ist Weißrussland flächenmäßig an 13'
+      expect(PragmaticTokenizer::Tokenizer.new(text,
+        language: 'de',
+        downcase: false,
+        remove_stop_words: true,
+        punctuation: 'none',
+        numbers: :none
+      ).tokenize).to eq(["größte", "Ausdehnung", "Landes", "Westen", "Osten", "beträgt", "Nord", "Süd", "europäischen", "Staaten", "Weißrussland", "flächenmäßig"])
+    end
+
+    it 'tokenizes a string #003' do
+      text = 'Die weißrussischen offiziellen Stellen wie auch die deutsche Diplomatie verwenden in offiziellen deutschsprachigen Texten den Namen Belarus, um die Unterscheidung von Russland zu verdeutlichen.'
+      expect(PragmaticTokenizer::Tokenizer.new(text,
+        language: 'de',
+        downcase: false
+      ).tokenize).to eq(["Die", "weißrussischen", "offiziellen", "Stellen", "wie", "auch", "die", "deutsche", "Diplomatie", "verwenden", "in", "offiziellen", "deutschsprachigen", "Texten", "den", "Namen", "Belarus", ",", "um", "die", "Unterscheidung", "von", "Russland", "zu", "verdeutlichen", "."])
+    end
+
+    it 'tokenizes a string #004' do
+      text = 'der Kaffee-Ersatz'
+      expect(PragmaticTokenizer::Tokenizer.new(text,
+        language: 'de',
+        downcase: false
+      ).tokenize).to eq(['der', 'Kaffee-Ersatz'])
+    end
+
+    it 'tokenizes a string #005' do
+      text = "Charlie Hebdo backlash over 'racist' Alan Kurdi cartoon - https://t.co/J8N2ylVV3w"
+      expect(PragmaticTokenizer::Tokenizer.new(text,
+        language: 'de',
+      ).tokenize).to eq(["charlie", "hebdo", "backlash", "over", "'", "racist", "'", "alan", "kurdi", "cartoon", "-", "https://t.co/j8n2ylvv3w"])
+    end
+
+    it 'handles words with a slash 1' do
+      text = "We pay 3000 €/month"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["we", "pay", "3000", "€", "month"])
+    end
+
+    it 'handles words with a slash 2' do
+      text = "Ich frage mich, wieso er nicht Herr der Lage war/ist."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["ich", "frage", "mich", "wieso", "er", "nicht", "herr", "der", "lage", "war", "ist"])
+    end
+
+    it 'handles words with a slash 3' do
+      text = "Poison gas attack in Ghuta/Syria."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["poison", "gas", "attack", "in", "ghuta", "syria"])
+    end
+
+    it 'handles words with a question mark' do
+      text = "Essen á la carte?Man ist versucht…"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["essen", "á", "la", "carte", "man", "ist", "versucht"])
+    end
+
+    it 'handles apostrophes and quotes 3' do
+      text = "Die “Mitte der Gesellschaft” interessiert sich jetzt für “Feminismus”."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["die", "mitte", "der", "gesellschaft", "interessiert", "sich", "jetzt", "für", "feminismus"])
+    end
+
+    it 'handles mentions 1' do
+      text = "@RainerSteinke @_Sternchen_2015 1:0 für dich."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["@rainersteinke", "@_sternchen_2015", "1:0", "für", "dich"])
+    end
+
+    it 'handles mentions 2' do
+      text = "@LandauDaniel @AnthZeto @julianfranz @S_Beck19 Yep!"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["@landaudaniel", "@anthzeto", "@julianfranz", "@s_beck19", "yep"])
+    end
+
+    it 'handles old school emoticons 1' do
+      text = "du übertreibst maßlos :D"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        downcase: false,
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["du", "übertreibst", "maßlos", ":D"])
+    end
+
+    it 'handles words with a symbol suffix' do
+      text = "hier ist ein Whirlpool versteckt^^"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["hier", "ist", "ein", "whirlpool", "versteckt"])
+    end
+
+    it 'handles hashtags 1' do
+      text = "„Was wir tun wird in diesem Land Leben retten“:#Obama"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["was", "wir", "tun", "wird", "in", "diesem", "land", "leben", "retten", "#obama"])
+    end
+
+    it 'handles numbers and words' do
+      text = "Air Force Once ist 18.270-mal abgehoben."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["air", "force", "once", "ist", "18.270-mal", "abgehoben"])
+    end
+
+    it 'maintains the german gender-neutrality form 2' do
+      text = "der/die Lehrer_in und seine/ihre Schüler_innen"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen"])
+    end
+
+    it 'handles contractions 1' do
+      text = "gibt's"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        expand_contractions: true,
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["gibt", "es"])
+    end
+
+    it 'handles contractions 2' do
+      text = "gibt‘s schaut’s wenn's g›spür find´s"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        expand_contractions: true,
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["gibt", "es", "schaut", "es", "wenn", "es", "gespür", "finde", "es"])
+    end
+
+    it 'removes English stopwords' do
+      text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        filter_languages: [:en],
+        remove_stop_words: true,
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."])
+    end
+
+    it 'removes English and German stopwords' do
+      text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        filter_languages: [:en, :de],
+        remove_stop_words: true,
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
+    end
+
+    it 'does not remove English stopwords' do
+      text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "this", "has", "some", "english", "."])
+    end
+
+    # I don't know how to easily treat these forms, especially the most frequent form
+    # that attaches "Innen" (plural) or "In" (singular) (with a capital I) to a word.
+    it 'maintains the german gender-neutrality form 1' do
+      skip "NOT IMPLEMENTED"
+      text = "Wir brauchen eine/n erfahrene/n Informatiker/in."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["wir", "brauchen", "eine/n", "erfahrene/n", "informatiker/in"])
+    end
+
+    it 'handles apostrophes and quotes 4' do
+      skip "NOT IMPLEMENTED"
+      text = "Endlich regnet es ihm nicht mehr auf ́s Haupt!"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["endlich", "regnet", "es", "ihm", "nicht", "mehr", "auf́s", "haupt"])
+    end
+
+    it 'handles abrreviations for languages other than English' do
+      text = "Adj. Smith how are ü. today."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        language: :de
+      )
+      expect(pt.tokenize).to eq(["adj", ".", "smith", "how", "are", "ü.", "today", "."])
+    end
+  end
+end