pragmatic_tokenizer 1.3.1 → 1.4.0

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 600855c6a883dea597e6abd87b43721579680fe2
- data.tar.gz: f0dc608d45aac9701b6ded9b5de001cde3b987e9
+ metadata.gz: 46da26b8afc38bfc2699a2875d9786be47702160
+ data.tar.gz: 9073a0505244bb97d7fa51c48e9fe4ab30983c90
  SHA512:
- metadata.gz: 93ac2871c7e053060289f5fd758a090470c5dc7a98a9d68453c965701e425c63ef0825b83715e081e3c69c2d31617efa1738fdca36411537e622b6409d975416
- data.tar.gz: 2ec0fbdbdfdb29f91324853aea7ccd992ec4e3ca04274d97aead2eccc84b819b83e351e5554caac36211040508dfa01f0f8877c73a87a33c79bf5567f580f3f8
+ metadata.gz: 8397bca6ada5fae51d1894b26154b5f9fed73e375b9e78b2803da892c20e5f32efb2722f0fe767cbc171586a5a6979a720041de5f0fd5c50824a0498c55b8394
+ data.tar.gz: f58cae264490ce16bef8e5512760e992dd38a11ced8fa872ab5faff2e92f61e6414f7e9ffd732f1e2910de560b2e9a2e98e1d59aa7039426ee0fbf3d8daaaa1f
data/README.md CHANGED
@@ -8,17 +8,16 @@ Pragmatic Tokenizer is a multilingual tokenizer to split a string into tokens.
 
  Add this line to your application's Gemfile:
 
- ```ruby
- gem 'pragmatic_tokenizer'
+ **Ruby**
+ ```
+ gem install pragmatic_tokenizer
  ```
 
- And then execute:
-
-     $ bundle
-
- Or install it yourself as:
-
-     $ gem install pragmatic_tokenizer
+ **Ruby on Rails**
+ Add this line to your application’s Gemfile:
+ ```ruby
+ gem 'pragmatic_tokenizer'
+ ```
 
  ## Usage
 
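For orientation before the newly added spec files, here is a minimal Ruby usage sketch of the tokenizer API those specs exercise. The inputs and expected token arrays are copied from the German spec added in this release; only the `require` line is an assumption about how the gem is loaded.

```ruby
require 'pragmatic_tokenizer'

# Default behaviour: tokens are downcased and punctuation is kept as
# separate tokens (input and output taken from "tokenizes a string #001").
pt = PragmaticTokenizer::Tokenizer.new('Das steht auf S. 23, s. vorherige Anmerkung.', language: 'de')
pt.tokenize
# => ["das", "steht", "auf", "s.", "23", ",", "s.", "vorherige", "anmerkung", "."]

# Options exercised by the new specs: remove stop words and filter stop
# words from additional languages (taken from "removes English and German stopwords").
PragmaticTokenizer::Tokenizer.new('der/die Lehrer_in und seine/ihre Schüler_innen. This has some English.',
  language:          'de',
  filter_languages:  [:en, :de],
  remove_stop_words: true
).tokenize
# => ["lehrer_in", "schüler_innen", ".", "english", "."]
```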
data/lib/pragmatic_tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
  module PragmaticTokenizer
-   VERSION = "1.3.1"
+   VERSION = "1.4.0"
  end
data/pragmatic_tokenizer.gemspec CHANGED
@@ -13,9 +13,9 @@ Gem::Specification.new do |spec|
  spec.description = %q{A multilingual tokenizer to split a string into tokens.}
  spec.homepage = "https://github.com/diasks2/pragmatic_tokenizer"
 
- spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
- spec.bindir = "exe"
- spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+ spec.files = `git ls-files -z`.split("\x0")
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
 
  spec.require_paths = ["lib"]
 
  spec.add_runtime_dependency "unicode"
@@ -0,0 +1,41 @@
+ require 'spec_helper'
+
+ describe PragmaticTokenizer do
+   context 'Language: Bulgarian (bg)' do
+     it 'tokenizes a string #001' do
+       text = 'Стойностни, вкл. български и руски'
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         language: 'bg'
+       )
+       expect(pt.tokenize).to eq(["стойностни", ",", "вкл.", "български", "и", "руски"])
+     end
+
+     it 'tokenizes a string #002' do
+       text = 'Той поставя началото на могъща династия, която управлява в продължение на 150 г. Саргон надделява в двубой с владетеля на град Ур и разширява териториите на държавата си по долното течение на Тигър и Ефрат.'
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         language: 'bg',
+         remove_stop_words: true
+       )
+       expect(pt.tokenize).to eq(["поставя", "началото", "могъща", "династия", ",", "управлява", "продължение", "150", "саргон", "надделява", "двубой", "владетеля", "град", "ур", "разширява", "териториите", "държавата", "долното", "течение", "тигър", "ефрат", "."])
+     end
+
+     it 'tokenizes a string #003' do
+       text = 'Без български жертви в Париж.'
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         language: 'bg',
+         remove_stop_words: true
+       )
+       expect(pt.tokenize).to eq(["български", "жертви", "париж", "."])
+     end
+
+     it 'tokenizes a string #004' do
+       text = 'Без български жертви в Париж.'
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         language: 'bg',
+         remove_stop_words: true,
+         downcase: false
+       )
+       expect(pt.tokenize).to eq(["български", "жертви", "Париж", "."])
+     end
+   end
+ end
@@ -0,0 +1,229 @@
+ require 'spec_helper'
+
+ describe PragmaticTokenizer do
+   context 'Language: German (de)' do
+     it 'tokenizes a string #001' do
+       text = 'Das steht auf S. 23, s. vorherige Anmerkung.'
+       expect(PragmaticTokenizer::Tokenizer.new(text, language: 'de').tokenize).to eq(['das', 'steht', 'auf', 's.', '23', ',', 's.', 'vorherige', 'anmerkung', '.'])
+     end
+
+     it 'tokenizes a string #002' do
+       text = 'Die größte Ausdehnung des Landes vom Westen nach Osten beträgt 650 km – von Nord nach Süd sind es 560 km. Unter den europäischen Staaten ist Weißrussland flächenmäßig an 13'
+       expect(PragmaticTokenizer::Tokenizer.new(text,
+         language: 'de',
+         downcase: false,
+         remove_stop_words: true,
+         punctuation: 'none',
+         numbers: :none
+       ).tokenize).to eq(["größte", "Ausdehnung", "Landes", "Westen", "Osten", "beträgt", "Nord", "Süd", "europäischen", "Staaten", "Weißrussland", "flächenmäßig"])
+     end
+
+     it 'tokenizes a string #003' do
+       text = 'Die weißrussischen offiziellen Stellen wie auch die deutsche Diplomatie verwenden in offiziellen deutschsprachigen Texten den Namen Belarus, um die Unterscheidung von Russland zu verdeutlichen.'
+       expect(PragmaticTokenizer::Tokenizer.new(text,
+         language: 'de',
+         downcase: false
+       ).tokenize).to eq(["Die", "weißrussischen", "offiziellen", "Stellen", "wie", "auch", "die", "deutsche", "Diplomatie", "verwenden", "in", "offiziellen", "deutschsprachigen", "Texten", "den", "Namen", "Belarus", ",", "um", "die", "Unterscheidung", "von", "Russland", "zu", "verdeutlichen", "."])
+     end
+
+     it 'tokenizes a string #004' do
+       text = 'der Kaffee-Ersatz'
+       expect(PragmaticTokenizer::Tokenizer.new(text,
+         language: 'de',
+         downcase: false
+       ).tokenize).to eq(['der', 'Kaffee-Ersatz'])
+     end
+
+     it 'tokenizes a string #005' do
+       text = "Charlie Hebdo backlash over 'racist' Alan Kurdi cartoon - https://t.co/J8N2ylVV3w"
+       expect(PragmaticTokenizer::Tokenizer.new(text,
+         language: 'de',
+       ).tokenize).to eq(["charlie", "hebdo", "backlash", "over", "'", "racist", "'", "alan", "kurdi", "cartoon", "-", "https://t.co/j8n2ylvv3w"])
+     end
+
+     it 'handles words with a slash 1' do
+       text = "We pay 3000 €/month"
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["we", "pay", "3000", "€", "month"])
+     end
+
+     it 'handles words with a slash 2' do
+       text = "Ich frage mich, wieso er nicht Herr der Lage war/ist."
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["ich", "frage", "mich", "wieso", "er", "nicht", "herr", "der", "lage", "war", "ist"])
+     end
+
+     it 'handles words with a slash 3' do
+       text = "Poison gas attack in Ghuta/Syria."
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["poison", "gas", "attack", "in", "ghuta", "syria"])
+     end
+
+     it 'handles words with a question mark' do
+       text = "Essen á la carte?Man ist versucht…"
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["essen", "á", "la", "carte", "man", "ist", "versucht"])
+     end
+
+     it 'handles apostrophes and quotes 3' do
+       text = "Die “Mitte der Gesellschaft” interessiert sich jetzt für “Feminismus”."
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["die", "mitte", "der", "gesellschaft", "interessiert", "sich", "jetzt", "für", "feminismus"])
+     end
+
+     it 'handles mentions 1' do
+       text = "@RainerSteinke @_Sternchen_2015 1:0 für dich."
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["@rainersteinke", "@_sternchen_2015", "1:0", "für", "dich"])
+     end
+
+     it 'handles mentions 2' do
+       text = "@LandauDaniel @AnthZeto @julianfranz @S_Beck19 Yep!"
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["@landaudaniel", "@anthzeto", "@julianfranz", "@s_beck19", "yep"])
+     end
+
+     it 'handles old school emoticons 1' do
+       text = "du übertreibst maßlos :D"
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         downcase: false,
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["du", "übertreibst", "maßlos", ":D"])
+     end
+
+     it 'handles words with a symbol suffix' do
+       text = "hier ist ein Whirlpool versteckt^^"
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["hier", "ist", "ein", "whirlpool", "versteckt"])
+     end
+
+     it 'handles hashtags 1' do
+       text = "„Was wir tun wird in diesem Land Leben retten“:#Obama"
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["was", "wir", "tun", "wird", "in", "diesem", "land", "leben", "retten", "#obama"])
+     end
+
+     it 'handles numbers and words' do
+       text = "Air Force Once ist 18.270-mal abgehoben."
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["air", "force", "once", "ist", "18.270-mal", "abgehoben"])
+     end
+
+     it 'maintains the german gender-neutrality form 2' do
+       text = "der/die Lehrer_in und seine/ihre Schüler_innen"
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen"])
+     end
+
+     it 'handles contractions 1' do
+       text = "gibt's"
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         expand_contractions: true,
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["gibt", "es"])
+     end
+
+     it 'handles contractions 2' do
+       text = "gibt‘s schaut’s wenn's g›spür find´s"
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         expand_contractions: true,
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["gibt", "es", "schaut", "es", "wenn", "es", "gespür", "finde", "es"])
+     end
+
+     it 'removes English stopwords' do
+       text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         filter_languages: [:en],
+         remove_stop_words: true,
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "english", "."])
+     end
+
+     it 'removes English and German stopwords' do
+       text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         filter_languages: [:en, :de],
+         remove_stop_words: true,
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["lehrer_in", "schüler_innen", ".", "english", "."])
+     end
+
+     it 'does not remove English stopwords' do
+       text = "der/die Lehrer_in und seine/ihre Schüler_innen. This has some English."
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["der", "die", "lehrer_in", "und", "seine", "ihre", "schüler_innen", ".", "this", "has", "some", "english", "."])
+     end
+
+     # I don't know how to easily treat these forms, especially the most frequent form
+     # that attaches "Innen" (plural) or "In" (singular) (with a capital I) to a word.
+     it 'maintains the german gender-neutrality form 1' do
+       skip "NOT IMPLEMENTED"
+       text = "Wir brauchen eine/n erfahrene/n Informatiker/in."
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["wir", "brauchen", "eine/n", "erfahrene/n", "informatiker/in"])
+     end
+
+     it 'handles apostrophes and quotes 4' do
+       skip "NOT IMPLEMENTED"
+       text = "Endlich regnet es ihm nicht mehr auf ́s Haupt!"
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         punctuation: 'none',
+         language: 'de'
+       )
+       expect(pt.tokenize).to eq(["endlich", "regnet", "es", "ihm", "nicht", "mehr", "auf́s", "haupt"])
+     end
+
+     it 'handles abrreviations for languages other than English' do
+       text = "Adj. Smith how are ü. today."
+       pt = PragmaticTokenizer::Tokenizer.new(text,
+         language: :de
+       )
+       expect(pt.tokenize).to eq(["adj", ".", "smith", "how", "are", "ü.", "today", "."])
+     end
+   end
+ end