pragmatic_tokenizer 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +106 -95
- data/lib/pragmatic_tokenizer/languages/common.rb +9 -1
- data/lib/pragmatic_tokenizer/languages/english.rb +10 -0
- data/lib/pragmatic_tokenizer/languages/french.rb +11 -1
- data/lib/pragmatic_tokenizer/processor.rb +8 -13
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b8f3d1c236d1faafc68ff8d689de291a59596b1f
|
4
|
+
data.tar.gz: ed010aae17417aa9ea87a1fa817f893c893e5d77
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e10fbb56ec2097ef2854a0517f2b4add98479fecb0318c813c791f0b3872b7ba93a489e409de14e83b5327a8e12ec64747ef64846f57c32e0625d994c50f9331
|
7
|
+
data.tar.gz: e6a8f35c84c6e1029d26dd7981c2f8bd3f43803d095ecd45de0d2ab2132399f1dc652704269217573dbcb4b99aa438b751be57ffc26fc39b8e9b3b4554a2d56e
|
data/README.md
CHANGED
@@ -70,7 +70,16 @@ Or install it yourself as:
|
|
70
70
|
##### `remove_numbers`
|
71
71
|
**default** = `'false'`
|
72
72
|
- `true`
|
73
|
-
Removes any token that contains a number
|
73
|
+
Removes any token that contains a number.
|
74
|
+
- `false`
|
75
|
+
Leaves tokens as is.
|
76
|
+
|
77
|
+
<hr>
|
78
|
+
|
79
|
+
##### `remove_roman_numerals`
|
80
|
+
**default** = `'false'`
|
81
|
+
- `true`
|
82
|
+
Removes any token that contains a Roman numeral.
|
74
83
|
- `false`
|
75
84
|
Leaves tokens as is.
|
76
85
|
|
@@ -124,145 +133,147 @@ PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
|
|
124
133
|
|
125
134
|
## Language Support
|
126
135
|
|
127
|
-
The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated.
|
136
|
+
The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated.
|
137
|
+
|
138
|
+
*N.B. - contractions might not be applicable for all languages below - in that case the CONTRACTIONS hash should stay empty.*
|
128
139
|
|
129
140
|
##### English
|
130
|
-
Specs: Yes
|
131
|
-
Abbreviations: Yes
|
132
|
-
Stop Words: Yes
|
133
|
-
Contractions: Yes
|
141
|
+
Specs: Yes
|
142
|
+
Abbreviations: Yes
|
143
|
+
Stop Words: Yes
|
144
|
+
Contractions: Yes
|
134
145
|
|
135
146
|
##### Arabic
|
136
|
-
Specs: No
|
137
|
-
Abbreviations: Yes
|
138
|
-
Stop Words: Yes
|
139
|
-
Contractions: No
|
147
|
+
Specs: No
|
148
|
+
Abbreviations: Yes
|
149
|
+
Stop Words: Yes
|
150
|
+
Contractions: No
|
140
151
|
|
141
152
|
##### Bulgarian
|
142
|
-
Specs: No
|
143
|
-
Abbreviations: Yes
|
144
|
-
Stop Words: Yes
|
145
|
-
Contractions: No
|
153
|
+
Specs: No
|
154
|
+
Abbreviations: Yes
|
155
|
+
Stop Words: Yes
|
156
|
+
Contractions: No
|
146
157
|
|
147
158
|
##### Catalan
|
148
|
-
Specs: No
|
149
|
-
Abbreviations: No
|
150
|
-
Stop Words: Yes
|
151
|
-
Contractions: No
|
159
|
+
Specs: No
|
160
|
+
Abbreviations: No
|
161
|
+
Stop Words: Yes
|
162
|
+
Contractions: No
|
152
163
|
|
153
164
|
##### Czech
|
154
|
-
Specs: No
|
155
|
-
Abbreviations: No
|
156
|
-
Stop Words: Yes
|
157
|
-
Contractions: No
|
165
|
+
Specs: No
|
166
|
+
Abbreviations: No
|
167
|
+
Stop Words: Yes
|
168
|
+
Contractions: No
|
158
169
|
|
159
170
|
##### Danish
|
160
|
-
Specs: No
|
161
|
-
Abbreviations: No
|
162
|
-
Stop Words: Yes
|
163
|
-
Contractions: No
|
171
|
+
Specs: No
|
172
|
+
Abbreviations: No
|
173
|
+
Stop Words: Yes
|
174
|
+
Contractions: No
|
164
175
|
|
165
|
-
#####
|
166
|
-
Specs: More needed
|
167
|
-
Abbreviations: Yes
|
168
|
-
Stop Words: Yes
|
169
|
-
Contractions: No
|
176
|
+
##### Deutsch
|
177
|
+
Specs: More needed
|
178
|
+
Abbreviations: Yes
|
179
|
+
Stop Words: Yes
|
180
|
+
Contractions: No
|
170
181
|
|
171
182
|
##### Finnish
|
172
|
-
Specs: No
|
173
|
-
Abbreviations: No
|
174
|
-
Stop Words: Yes
|
175
|
-
Contractions: No
|
183
|
+
Specs: No
|
184
|
+
Abbreviations: No
|
185
|
+
Stop Words: Yes
|
186
|
+
Contractions: No
|
176
187
|
|
177
188
|
##### French
|
178
|
-
Specs: More needed
|
179
|
-
Abbreviations: Yes
|
180
|
-
Stop Words: Yes
|
181
|
-
Contractions: No
|
189
|
+
Specs: More needed
|
190
|
+
Abbreviations: Yes
|
191
|
+
Stop Words: Yes
|
192
|
+
Contractions: No
|
182
193
|
|
183
194
|
##### Greek
|
184
|
-
Specs: No
|
185
|
-
Abbreviations: No
|
186
|
-
Stop Words: Yes
|
187
|
-
Contractions: No
|
195
|
+
Specs: No
|
196
|
+
Abbreviations: No
|
197
|
+
Stop Words: Yes
|
198
|
+
Contractions: No
|
188
199
|
|
189
200
|
##### Indonesian
|
190
|
-
Specs: No
|
191
|
-
Abbreviations: No
|
192
|
-
Stop Words: Yes
|
193
|
-
Contractions: No
|
201
|
+
Specs: No
|
202
|
+
Abbreviations: No
|
203
|
+
Stop Words: Yes
|
204
|
+
Contractions: No
|
194
205
|
|
195
206
|
##### Italian
|
196
|
-
Specs: No
|
197
|
-
Abbreviations: Yes
|
198
|
-
Stop Words: Yes
|
199
|
-
Contractions: No
|
207
|
+
Specs: No
|
208
|
+
Abbreviations: Yes
|
209
|
+
Stop Words: Yes
|
210
|
+
Contractions: No
|
200
211
|
|
201
212
|
##### Latvian
|
202
|
-
Specs: No
|
203
|
-
Abbreviations: No
|
204
|
-
Stop Words: Yes
|
205
|
-
Contractions: No
|
213
|
+
Specs: No
|
214
|
+
Abbreviations: No
|
215
|
+
Stop Words: Yes
|
216
|
+
Contractions: No
|
206
217
|
|
207
218
|
##### Norwegian
|
208
|
-
Specs: No
|
209
|
-
Abbreviations: No
|
210
|
-
Stop Words: Yes
|
211
|
-
Contractions: No
|
219
|
+
Specs: No
|
220
|
+
Abbreviations: No
|
221
|
+
Stop Words: Yes
|
222
|
+
Contractions: No
|
212
223
|
|
213
224
|
##### Persian
|
214
|
-
Specs: No
|
215
|
-
Abbreviations: No
|
216
|
-
Stop Words: Yes
|
217
|
-
Contractions: No
|
225
|
+
Specs: No
|
226
|
+
Abbreviations: No
|
227
|
+
Stop Words: Yes
|
228
|
+
Contractions: No
|
218
229
|
|
219
230
|
##### Polish
|
220
|
-
Specs: No
|
221
|
-
Abbreviations: Yes
|
222
|
-
Stop Words: Yes
|
223
|
-
Contractions: No
|
231
|
+
Specs: No
|
232
|
+
Abbreviations: Yes
|
233
|
+
Stop Words: Yes
|
234
|
+
Contractions: No
|
224
235
|
|
225
236
|
##### Portuguese
|
226
|
-
Specs: No
|
227
|
-
Abbreviations: No
|
228
|
-
Stop Words: Yes
|
229
|
-
Contractions: No
|
237
|
+
Specs: No
|
238
|
+
Abbreviations: No
|
239
|
+
Stop Words: Yes
|
240
|
+
Contractions: No
|
230
241
|
|
231
242
|
##### Romanian
|
232
|
-
Specs: No
|
233
|
-
Abbreviations: No
|
234
|
-
Stop Words: Yes
|
235
|
-
Contractions: No
|
243
|
+
Specs: No
|
244
|
+
Abbreviations: No
|
245
|
+
Stop Words: Yes
|
246
|
+
Contractions: No
|
236
247
|
|
237
248
|
##### Russian
|
238
|
-
Specs: No
|
239
|
-
Abbreviations: Yes
|
240
|
-
Stop Words: Yes
|
241
|
-
Contractions: No
|
249
|
+
Specs: No
|
250
|
+
Abbreviations: Yes
|
251
|
+
Stop Words: Yes
|
252
|
+
Contractions: No
|
242
253
|
|
243
254
|
##### Slovak
|
244
|
-
Specs: No
|
245
|
-
Abbreviations: No
|
246
|
-
Stop Words: Yes
|
247
|
-
Contractions: No
|
255
|
+
Specs: No
|
256
|
+
Abbreviations: No
|
257
|
+
Stop Words: Yes
|
258
|
+
Contractions: No
|
248
259
|
|
249
260
|
##### Spanish
|
250
|
-
Specs: No
|
251
|
-
Abbreviations: Yes
|
252
|
-
Stop Words: Yes
|
253
|
-
Contractions: Yes
|
261
|
+
Specs: No
|
262
|
+
Abbreviations: Yes
|
263
|
+
Stop Words: Yes
|
264
|
+
Contractions: Yes
|
254
265
|
|
255
266
|
##### Swedish
|
256
|
-
Specs: No
|
257
|
-
Abbreviations: No
|
258
|
-
Stop Words: Yes
|
259
|
-
Contractions: No
|
267
|
+
Specs: No
|
268
|
+
Abbreviations: No
|
269
|
+
Stop Words: Yes
|
270
|
+
Contractions: No
|
260
271
|
|
261
272
|
##### Turkish
|
262
|
-
Specs: No
|
263
|
-
Abbreviations: No
|
264
|
-
Stop Words: Yes
|
265
|
-
Contractions: No
|
273
|
+
Specs: No
|
274
|
+
Abbreviations: No
|
275
|
+
Stop Words: Yes
|
276
|
+
Contractions: No
|
266
277
|
|
267
278
|
## Development
|
268
279
|
|
@@ -9,7 +9,15 @@ module PragmaticTokenizer
|
|
9
9
|
ABBREVIATIONS = []
|
10
10
|
STOP_WORDS = []
|
11
11
|
CONTRACTIONS = {}
|
12
|
+
|
13
|
+
class SingleQuotes
|
14
|
+
def handle_single_quotes(text)
|
15
|
+
text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
|
16
|
+
text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
|
17
|
+
# Separate right single quotes
|
18
|
+
text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
|
19
|
+
end
|
20
|
+
end
|
12
21
|
end
|
13
22
|
end
|
14
23
|
end
|
15
|
-
|
@@ -86,6 +86,16 @@ module PragmaticTokenizer
|
|
86
86
|
"will-o'-the-wisp" => "will-of-the-wisp",
|
87
87
|
"'twas" => "it was"
|
88
88
|
}
|
89
|
+
class SingleQuotes
|
90
|
+
def handle_single_quotes(text)
|
91
|
+
text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
|
92
|
+
# Convert left quotes to special character except for 'Twas or 'twas
|
93
|
+
text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
|
94
|
+
text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
|
95
|
+
# Separate right single quotes
|
96
|
+
text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
|
97
|
+
end
|
98
|
+
end
|
89
99
|
end
|
90
100
|
end
|
91
101
|
end
|
@@ -5,6 +5,16 @@ module PragmaticTokenizer
|
|
5
5
|
ABBREVIATIONS = ["a.c.n", "a.m", "al", "ann", "apr", "art", "auj", "av", "b.p", "boul", "c.-à-d", "c.n", "c.n.s", "c.p.i", "c.q.f.d", "c.s", "ca", "cf", "ch.-l", "chap", "co", "contr", "dir", "e.g", "e.v", "env", "etc", "ex", "fasc", "fig", "fr", "fém", "hab", "i.e", "ibid", "id", "inf", "l.d", "lib", "ll.aa", "ll.aa.ii", "ll.aa.rr", "ll.aa.ss", "ll.ee", "ll.mm", "ll.mm.ii.rr", "loc.cit", "ltd", "masc", "mm", "ms", "n.b", "n.d", "n.d.a", "n.d.l.r", "n.d.t", "n.p.a.i", "n.s", "n/réf", "nn.ss", "p.c.c", "p.ex", "p.j", "p.s", "pl", "pp", "r.-v", "r.a.s", "r.i.p", "r.p", "s.a", "s.a.i", "s.a.r", "s.a.s", "s.e", "s.m", "s.m.i.r", "s.s", "sec", "sect", "sing", "sq", "sqq", "ss", "suiv", "sup", "suppl", "t.s.v.p", "tél", "vb", "vol", "vs", "x.o", "z.i", "éd"]
|
6
6
|
STOP_WORDS = ["a", "à", "â", "abord", "afin", "ah", "ai", "aie", "ainsi", "allaient", "allo", "allô", "allons", "après", "assez", "attendu", "au", "aucun", "aucune", "aujourd", "aujourd'hui", "auquel", "aura", "auront", "aussi", "autre", "autres", "aux", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avoir", "ayant", "b", "bah", "beaucoup", "bien", "bigre", "boum", "bravo", "brrr", "c", "ça", "car", "ce", "ceci", "cela", "celle", "celle-ci", "celle-là", "celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "cent", "cependant", "certain", "certaine", "certaines", "certains", "certes", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "chacun", "chaque", "cher", "chère", "chères", "chers", "chez", "chiche", "chut", "ci", "cinq", "cinquantaine", "cinquante", "cinquantième", "cinquième", "clac", "clic", "combien", "comme", "comment", "compris", "concernant", "contre", "couic", "crac", "d", "da", "dans", "de", "debout", "dedans", "dehors", "delà", "depuis", "derrière", "des", "dès", "désormais", "desquelles", "desquels", "dessous", "dessus", "deux", "deuxième", "deuxièmement", "devant", "devers", "devra", "différent", "différente", "différentes", "différents", "dire", "divers", "diverse", "diverses", "dix", "dix-huit", "dixième", "dix-neuf", "dix-sept", "doit", "doivent", "donc", "dont", "douze", "douzième", "dring", "du", "duquel", "durant", "e", "effet", "eh", "elle", "elle-même", "elles", "elles-mêmes", "en", "encore", "entre", "envers", "environ", "es", "ès", "est", "et", "etant", "étaient", "étais", "était", "étant", "etc", "été", "etre", "être", "eu", "euh", "eux", "eux-mêmes", "excepté", "f", "façon", "fais", "faisaient", "faisant", "fait", "feront", "fi", "flac", "floc", "font", "g", "gens", "h", "ha", "hé", "hein", "hélas", "hem", "hep", "hi", "ho", "holà", "hop", "hormis", "hors", "hou", "houp", "hue", "hui", "huit", "huitième", "hum", "hurrah", "i", "il", "ils", "importe", "j", "je", "jusqu", "jusque", "k", "l", 
"la", "là", "laquelle", "las", "le", "lequel", "les", "lès", "lesquelles", "lesquels", "leur", "leurs", "longtemps", "lorsque", "lui", "lui-même", "m", "ma", "maint", "mais", "malgré", "me", "même", "mêmes", "merci", "mes", "mien", "mienne", "miennes", "miens", "mille", "mince", "moi", "moi-même", "moins", "mon", "moyennant", "n", "na", "ne", "néanmoins", "neuf", "neuvième", "ni", "nombreuses", "nombreux", "non", "nos", "notre", "nôtre", "nôtres", "nous", "nous-mêmes", "nul", "o", "o|", "ô", "oh", "ohé", "olé", "ollé", "on", "ont", "onze", "onzième", "ore", "ou", "où", "ouf", "ouias", "oust", "ouste", "outre", "p", "paf", "pan", "par", "parmi", "partant", "particulier", "particulière", "particulièrement", "pas", "passé", "pendant", "personne", "peu", "peut", "peuvent", "peux", "pff", "pfft", "pfut", "pif", "plein", "plouf", "plus", "plusieurs", "plutôt", "pouah", "pour", "pourquoi", "premier", "première", "premièrement", "près", "proche", "psitt", "puisque", "q", "qu", "quand", "quant", "quanta", "quant-à-soi", "quarante", "quatorze", "quatre", "quatre-vingt", "quatrième", "quatrièmement", "que", "quel", "quelconque", "quelle", "quelles", "quelque", "quelques", "quelqu'un", "quels", "qui", "quiconque", "quinze", "quoi", "quoique", "r", "revoici", "revoilà", "rien", "s", "sa", "sacrebleu", "sans", "sapristi", "sauf", "se", "seize", "selon", "sept", "septième", "sera", "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "six", "sixième", "soi", "soi-même", "soit", "soixante", "son", "sont", "sous", "stop", "suis", "suivant", "sur", "surtout", "t", "ta", "tac", "tant", "te", "té", "tel", "telle", "tellement", "telles", "tels", "tenant", "tes", "tic", "tien", "tienne", "tiennes", "tiens", "toc", "toi", "toi-même", "ton", "touchant", "toujours", "tous", "tout", "toute", "toutes", "treize", "trente", "très", "trois", "troisième", "troisièmement", "trop", "tsoin", "tsouin", "tu", "u", "un", "une", "unes", "uns", "v", "va", "vais", "vas", "vé", "vers", 
"via", "vif", "vifs", "vingt", "vivat", "vive", "vives", "vlan", "voici", "voilà", "vont", "vos", "votre", "vôtre", "vôtres", "vous", "vous-mêmes", "vu", "w", "x", "y", "z", "zut", "alors", "aucuns", "bon", "devrait", "dos", "droite", "début", "essai", "faites", "fois", "force", "haut", "ici", "juste", "maintenant", "mine", "mot", "nommés", "nouveaux", "parce", "parole", "personnes", "pièce", "plupart", "seulement", "soyez", "sujet", "tandis", "valeur", "voie", "voient", "état", "étions", "d'un", "d'une"]
|
7
7
|
CONTRACTIONS = {}
|
8
|
+
|
9
|
+
class SingleQuotes
|
10
|
+
def handle_single_quotes(text)
|
11
|
+
text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
|
12
|
+
text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
|
13
|
+
text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
|
14
|
+
text.gsub!(/l\'/, '\1 l\' \2') || text
|
15
|
+
text.gsub!(/L\'/, '\1 L\' \2') || text
|
16
|
+
end
|
17
|
+
end
|
8
18
|
end
|
9
19
|
end
|
10
|
-
end
|
20
|
+
end
|
@@ -24,18 +24,17 @@ module PragmaticTokenizer
|
|
24
24
|
|
25
25
|
def convert_dbl_quotes(text)
|
26
26
|
# Convert left double quotes to special character
|
27
|
-
text.gsub!(/"(?=.*\w)/o, ' ' + convert_punct_to_sym('"') + ' ') || text
|
27
|
+
text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
|
28
28
|
# Convert remaining quotes to special character
|
29
|
-
text.gsub!(/"/, ' ' + convert_punct_to_sym('"') + ' ') || text
|
29
|
+
text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
|
30
30
|
end
|
31
31
|
|
32
32
|
def convert_sgl_quotes(text)
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + convert_punct_to_sym("'") + ' ' } || text
|
33
|
+
if defined? @language::SingleQuotes
|
34
|
+
@language::SingleQuotes.new.handle_single_quotes(text)
|
35
|
+
else
|
36
|
+
PragmaticTokenizer::Languages::Common::SingleQuotes.new.handle_single_quotes(text)
|
37
|
+
end
|
39
38
|
end
|
40
39
|
|
41
40
|
def shift_multiple_dash(text)
|
@@ -71,7 +70,7 @@ module PragmaticTokenizer
|
|
71
70
|
!(/\A\d+/ == text.partition(':').last[0]) &&
|
72
71
|
!(/\A\d+/ == text.partition(':').first[-1])
|
73
72
|
# Ignore web addresses
|
74
|
-
text.gsub!(/(?<=[http|https]):(?=\/\/)/, convert_punct_to_sym(":")) || text
|
73
|
+
text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
|
75
74
|
text.gsub!(/:/o, ' :') || text
|
76
75
|
end
|
77
76
|
|
@@ -124,10 +123,6 @@ module PragmaticTokenizer
|
|
124
123
|
cleaned_tokens
|
125
124
|
end
|
126
125
|
|
127
|
-
def convert_punct_to_sym(punctuation)
|
128
|
-
PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[punctuation]
|
129
|
-
end
|
130
|
-
|
131
126
|
def convert_sym_to_punct(token)
|
132
127
|
symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token)
|
133
128
|
if symbol.nil?
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-01-
|
11
|
+
date: 2016-01-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|