pragmatic_tokenizer 1.3.1 → 1.4.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 600855c6a883dea597e6abd87b43721579680fe2
-  data.tar.gz: f0dc608d45aac9701b6ded9b5de001cde3b987e9
+  metadata.gz: 46da26b8afc38bfc2699a2875d9786be47702160
+  data.tar.gz: 9073a0505244bb97d7fa51c48e9fe4ab30983c90
 SHA512:
-  metadata.gz: 93ac2871c7e053060289f5fd758a090470c5dc7a98a9d68453c965701e425c63ef0825b83715e081e3c69c2d31617efa1738fdca36411537e622b6409d975416
-  data.tar.gz: 2ec0fbdbdfdb29f91324853aea7ccd992ec4e3ca04274d97aead2eccc84b819b83e351e5554caac36211040508dfa01f0f8877c73a87a33c79bf5567f580f3f8
+  metadata.gz: 8397bca6ada5fae51d1894b26154b5f9fed73e375b9e78b2803da892c20e5f32efb2722f0fe767cbc171586a5a6979a720041de5f0fd5c50824a0498c55b8394
+  data.tar.gz: f58cae264490ce16bef8e5512760e992dd38a11ced8fa872ab5faff2e92f61e6414f7e9ffd732f1e2910de560b2e9a2e98e1d59aa7039426ee0fbf3d8daaaa1f
data/README.md CHANGED
@@ -8,17 +8,16 @@ Pragmatic Tokenizer is a multilingual tokenizer to split a string into tokens.
 
 Add this line to your application's Gemfile:
 
-```ruby
-gem 'pragmatic_tokenizer'
+**Ruby**
+```
+gem install pragmatic_tokenizer
 ```
 
-And then execute:
-
-    $ bundle
-
-Or install it yourself as:
-
-    $ gem install pragmatic_tokenizer
+**Ruby on Rails**
+Add this line to your application’s Gemfile:
+```ruby
+gem 'pragmatic_tokenizer'
+```
 
 ## Usage
 
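The Usage section itself is not shown in this diff. As a minimal sketch of the API that the new language specs below exercise (the constructor options and the `tokenize` method are taken from those specs; the sample text and the `'en'` language code are illustrative assumptions):

```ruby
require 'pragmatic_tokenizer'

text = "Hello world. This is a sample."
# Options mirror those used in the specs below; all are optional.
pt = PragmaticTokenizer::Tokenizer.new(text,
  language: 'en',          # ISO 639-1 code ('bg', 'de', ... in the specs)
  remove_stop_words: true  # drop stop words for the given language
)
pt.tokenize # => an array of token strings, downcased by default
```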
data/lib/pragmatic_tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "1.3.1"
+  VERSION = "1.4.0"
 end
data/pragmatic_tokenizer.gemspec CHANGED
@@ -13,9 +13,9 @@ Gem::Specification.new do |spec|
   spec.description = %q{A multilingual tokenizer to split a string into tokens.}
   spec.homepage = "https://github.com/diasks2/pragmatic_tokenizer"
 
-  spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
-  spec.bindir = "exe"
-  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.files = `git ls-files -z`.split("\x0")
+  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
   spec.require_paths = ["lib"]
 
   spec.add_runtime_dependency "unicode"
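In short, 1.4.0 packages every tracked file (specs included, now listed via `spec.test_files`) and looks for executables under `bin/` instead of `exe/`. A quick sketch of what the two `grep` calls select; the file list here is a hypothetical stand-in:

```ruby
# Hypothetical stand-in for `git ls-files -z`.split("\x0")
files = [
  "bin/console",
  "lib/pragmatic_tokenizer.rb",
  "spec/languages/german_spec.rb"
]

# Enumerable#grep with a block maps each matching element through the block:
files.grep(%r{^bin/}) { |f| File.basename(f) }
# => ["console"]

# Without a block, grep simply filters by the pattern:
files.grep(%r{^(test|spec|features)/})
# => ["spec/languages/german_spec.rb"]
```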
data/spec/languages/bulgarian_spec.rb ADDED
@@ -0,0 +1,41 @@
+require 'spec_helper'
+
+describe PragmaticTokenizer do
+  context 'Language: Bulgarian (bg)' do
+    it 'tokenizes a string #001' do
+      text = 'Стойностни, вкл. български и руски'
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        language: 'bg'
+      )
+      expect(pt.tokenize).to eq(["стойностни", ",", "вкл.", "български", "и", "руски"])
+    end
+
+    it 'tokenizes a string #002' do
+      text = 'Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат.'
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        language: 'bg',
+        remove_stop_words: true
+      )
+      expect(pt.tokenize).to eq(["поставя", "началото", "могъща", "династия", ",", "управлява", "продължение", "150", "саргон", "надделява", "двубой", "владетеля", "град", "ур", "разширява", "териториите", "държавата", "долното", "течение", "тигър", "ефрат", "."])
+    end
+
+    it 'tokenizes a string #003' do
+      text = 'Без български жертви в Париж.'
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        language: 'bg',
+        remove_stop_words: true
+      )
+      expect(pt.tokenize).to eq(["български", "жертви", "париж", "."])
+    end
+
+    it 'tokenizes a string #004' do
+      text = 'Без български жертви в Париж.'
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        language: 'bg',
+        remove_stop_words: true,
+        downcase: false
+      )
+      expect(pt.tokenize).to eq(["български", "жертви", "Париж", "."])
+    end
+  end
+end
data/spec/languages/german_spec.rb ADDED
@@ -0,0 +1,229 @@
+require 'spec_helper'
+
+describe PragmaticTokenizer do
+  context 'Language: German (de)' do
+    it 'tokenizes a string #001' do
+      text = 'Das steht auf S. 23, s. vorherige Anmerkung.'
+      expect(PragmaticTokenizer::Tokenizer.new(text, language: 'de').tokenize).to eq(['das', 'steht', 'auf', 's.', '23', ',', 's.', 'vorherige', 'anmerkung', '.'])
+    end
+
+    it 'tokenizes a string #002' do
+      text = 'Die größte Ausdehnung des Landes vom Westen nach Osten beträgt 650 km – von Nord nach Süd sind es 560 km. Unter den europäischen Staaten ist Weißrussland flächenmäßig an 13'
+      expect(PragmaticTokenizer::Tokenizer.new(text,
+        language: 'de',
+        downcase: false,
+        remove_stop_words: true,
+        punctuation: 'none',
+        numbers: :none
+      ).tokenize).to eq(["größte", "Ausdehnung", "Landes", "Westen", "Osten", "beträgt", "Nord", "Süd", "europäischen", "Staaten", "Weißrussland", "flächenmäßig"])
+    end
+
+    it 'tokenizes a string #003' do
+      text = 'Die weißrussischen offiziellen Stellen wie auch die deutsche Diplomatie verwenden in offiziellen deutschsprachigen Texten den Namen Belarus, um die Unterscheidung von Russland zu verdeutlichen.'
+      expect(PragmaticTokenizer::Tokenizer.new(text,
+        language: 'de',
+        downcase: false
+      ).tokenize).to eq(["Die", "weißrussischen", "offiziellen", "Stellen", "wie", "auch", "die", "deutsche", "Diplomatie", "verwenden", "in", "offiziellen", "deutschsprachigen", "Texten", "den", "Namen", "Belarus", ",", "um", "die", "Unterscheidung", "von", "Russland", "zu", "verdeutlichen", "."])
+    end
+
+    it 'tokenizes a string #004' do
+      text = 'der Kaffee-Ersatz'
+      expect(PragmaticTokenizer::Tokenizer.new(text,
+        language: 'de',
+        downcase: false
+      ).tokenize).to eq(['der', 'Kaffee-Ersatz'])
+    end
+
+    it 'tokenizes a string #005' do
+      text = "Charlie Hebdo backlash over 'racist' Alan Kurdi cartoon - https://t.co/J8N2ylVV3w"
+      expect(PragmaticTokenizer::Tokenizer.new(text,
+        language: 'de',
+      ).tokenize).to eq(["charlie", "hebdo", "backlash", "over", "'", "racist", "'", "alan", "kurdi", "cartoon", "-", "https://t.co/j8n2ylvv3w"])
+    end
+
+    it 'handles words with a slash 1' do
+      text = "We pay 3000 €/month"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["we", "pay", "3000", "€", "month"])
+    end
+
+    it 'handles words with a slash 2' do
+      text = "Ich frage mich, wieso er nicht Herr der Lage war/ist."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["ich", "frage", "mich", "wieso", "er", "nicht", "herr", "der", "lage", "war", "ist"])
+    end
+
+    it 'handles words with a slash 3' do
+      text = "Poison gas attack in Ghuta/Syria."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["poison", "gas", "attack", "in", "ghuta", "syria"])
+    end
+
+    it 'handles words with a question mark' do
+      text = "Essen á la carte?Man ist versucht…"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["essen", "á", "la", "carte", "man", "ist", "versucht"])
+    end
+
+    it 'handles apostrophes and quotes 3' do
+      text = "Die “Mitte der Gesellschaft” interessiert sich jetzt für “Feminismus”."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["die", "mitte", "der", "gesellschaft", "interessiert", "sich", "jetzt", "für", "feminismus"])
+    end
+
+    it 'handles mentions 1' do
+      text = "@RainerSteinke @_Sternchen_2015 1:0 für dich."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["@rainersteinke", "@_sternchen_2015", "1:0", "für", "dich"])
+    end
+
+    it 'handles mentions 2' do
+      text = "@LandauDaniel @AnthZeto @julianfranz @S_Beck19 Yep!"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["@landaudaniel", "@anthzeto", "@julianfranz", "@s_beck19", "yep"])
+    end
+
+    it 'handles old school emoticons 1' do
+      text = "du übertreibst maßlos :D"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        downcase: false,
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["du", "übertreibst", "maßlos", ":D"])
+    end
+
+    it 'handles words with a symbol suffix' do
+      text = "hier ist ein Whirlpool versteckt^^"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["hier", "ist", "ein", "whirlpool", "versteckt"])
+    end
+
+    it 'handles hashtags 1' do
+      text = "„Was wir tun wird in diesem Land Leben retten“:#Obama"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["was", "wir", "tun", "wird", "in", "diesem", "land", "leben", "retten", "#obama"])
+    end
+
+    it 'handles numbers and words' do
+      text = "Air Force Once ist 18.270-mal abgehoben."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["air", "force", "once", "ist", "18.270-mal", "abgehoben"])
+    end
+
+    it 'maintains the german gender-neutrality form 2' do
+      text = "der/die Lehrer_in und seine/ihre Schüler_innen"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen"])
+    end
+
+    it 'handles contractions 1' do
+      text = "gibt's"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        expand_contractions: true,
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["gibt", "es"])
+    end
+
+    it 'handles contractions 2' do
+      text = "gibt‘s schaut’s wenn's g›spür find´s"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        expand_contractions: true,
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["gibt", "es", "schaut", "es", "wenn", "es", "gespür", "finde", "es"])
+    end
+
+    it 'removes English stopwords' do
+      text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        filter_languages: [:en],
+        remove_stop_words: true,
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."])
+    end
+
+    it 'removes English and German stopwords' do
+      text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        filter_languages: [:en, :de],
+        remove_stop_words: true,
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
+    end
+
+    it 'does not remove English stopwords' do
+      text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "this", "has", "some", "english", "."])
+    end
+
+    # I don't know how to easily treat these forms, especially the most frequent form
+    # that attaches "Innen" (plural) or "In" (singular) (with a capital I) to a word.
+    it 'maintains the german gender-neutrality form 1' do
+      skip "NOT IMPLEMENTED"
+      text = "Wir brauchen eine/n erfahrene/n Informatiker/in."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["wir", "brauchen", "eine/n", "erfahrene/n", "informatiker/in"])
+    end
+
+    it 'handles apostrophes and quotes 4' do
+      skip "NOT IMPLEMENTED"
+      text = "Endlich regnet es ihm nicht mehr auf ́s Haupt!"
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        punctuation: 'none',
+        language: 'de'
+      )
+      expect(pt.tokenize).to eq(["endlich", "regnet", "es", "ihm", "nicht", "mehr", "auf́s", "haupt"])
+    end
+
+    it 'handles abbreviations for languages other than English' do
+      text = "Adj. Smith how are ü. today."
+      pt = PragmaticTokenizer::Tokenizer.new(text,
+        language: :de
+      )
+      expect(pt.tokenize).to eq(["adj", ".", "smith", "how", "are", "ü.", "today", "."])
+    end
+  end
+end