pragmatic_tokenizer 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +106 -95
- data/lib/pragmatic_tokenizer/languages/common.rb +9 -1
- data/lib/pragmatic_tokenizer/languages/english.rb +10 -0
- data/lib/pragmatic_tokenizer/languages/french.rb +11 -1
- data/lib/pragmatic_tokenizer/processor.rb +8 -13
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b8f3d1c236d1faafc68ff8d689de291a59596b1f
|
|
4
|
+
data.tar.gz: ed010aae17417aa9ea87a1fa817f893c893e5d77
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: e10fbb56ec2097ef2854a0517f2b4add98479fecb0318c813c791f0b3872b7ba93a489e409de14e83b5327a8e12ec64747ef64846f57c32e0625d994c50f9331
|
|
7
|
+
data.tar.gz: e6a8f35c84c6e1029d26dd7981c2f8bd3f43803d095ecd45de0d2ab2132399f1dc652704269217573dbcb4b99aa438b751be57ffc26fc39b8e9b3b4554a2d56e
|
data/README.md
CHANGED
|
@@ -70,7 +70,16 @@ Or install it yourself as:
|
|
|
70
70
|
##### `remove_numbers`
|
|
71
71
|
**default** = `'false'`
|
|
72
72
|
- `true`
|
|
73
|
-
Removes any token that contains a number
|
|
73
|
+
Removes any token that contains a number.
|
|
74
|
+
- `false`
|
|
75
|
+
Leaves tokens as is.
|
|
76
|
+
|
|
77
|
+
<hr>
|
|
78
|
+
|
|
79
|
+
##### `remove_roman_numerals`
|
|
80
|
+
**default** = `'false'`
|
|
81
|
+
- `true`
|
|
82
|
+
Removes any token that contains a Roman numeral.
|
|
74
83
|
- `false`
|
|
75
84
|
Leaves tokens as is.
|
|
76
85
|
|
|
@@ -124,145 +133,147 @@ PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
|
|
|
124
133
|
|
|
125
134
|
## Language Support
|
|
126
135
|
|
|
127
|
-
The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated.
|
|
136
|
+
The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated.
|
|
137
|
+
|
|
138
|
+
*N.B. - contractions might not be applicable for all languages below - in that case the CONTRACTIONS hash should stay empty.*
|
|
128
139
|
|
|
129
140
|
##### English
|
|
130
|
-
Specs: Yes
|
|
131
|
-
Abbreviations: Yes
|
|
132
|
-
Stop Words: Yes
|
|
133
|
-
Contractions: Yes
|
|
141
|
+
Specs: Yes
|
|
142
|
+
Abbreviations: Yes
|
|
143
|
+
Stop Words: Yes
|
|
144
|
+
Contractions: Yes
|
|
134
145
|
|
|
135
146
|
##### Arabic
|
|
136
|
-
Specs: No
|
|
137
|
-
Abbreviations: Yes
|
|
138
|
-
Stop Words: Yes
|
|
139
|
-
Contractions: No
|
|
147
|
+
Specs: No
|
|
148
|
+
Abbreviations: Yes
|
|
149
|
+
Stop Words: Yes
|
|
150
|
+
Contractions: No
|
|
140
151
|
|
|
141
152
|
##### Bulgarian
|
|
142
|
-
Specs: No
|
|
143
|
-
Abbreviations: Yes
|
|
144
|
-
Stop Words: Yes
|
|
145
|
-
Contractions: No
|
|
153
|
+
Specs: No
|
|
154
|
+
Abbreviations: Yes
|
|
155
|
+
Stop Words: Yes
|
|
156
|
+
Contractions: No
|
|
146
157
|
|
|
147
158
|
##### Catalan
|
|
148
|
-
Specs: No
|
|
149
|
-
Abbreviations: No
|
|
150
|
-
Stop Words: Yes
|
|
151
|
-
Contractions: No
|
|
159
|
+
Specs: No
|
|
160
|
+
Abbreviations: No
|
|
161
|
+
Stop Words: Yes
|
|
162
|
+
Contractions: No
|
|
152
163
|
|
|
153
164
|
##### Czech
|
|
154
|
-
Specs: No
|
|
155
|
-
Abbreviations: No
|
|
156
|
-
Stop Words: Yes
|
|
157
|
-
Contractions: No
|
|
165
|
+
Specs: No
|
|
166
|
+
Abbreviations: No
|
|
167
|
+
Stop Words: Yes
|
|
168
|
+
Contractions: No
|
|
158
169
|
|
|
159
170
|
##### Danish
|
|
160
|
-
Specs: No
|
|
161
|
-
Abbreviations: No
|
|
162
|
-
Stop Words: Yes
|
|
163
|
-
Contractions: No
|
|
171
|
+
Specs: No
|
|
172
|
+
Abbreviations: No
|
|
173
|
+
Stop Words: Yes
|
|
174
|
+
Contractions: No
|
|
164
175
|
|
|
165
|
-
#####
|
|
166
|
-
Specs: More needed
|
|
167
|
-
Abbreviations: Yes
|
|
168
|
-
Stop Words: Yes
|
|
169
|
-
Contractions: No
|
|
176
|
+
##### Deutsch
|
|
177
|
+
Specs: More needed
|
|
178
|
+
Abbreviations: Yes
|
|
179
|
+
Stop Words: Yes
|
|
180
|
+
Contractions: No
|
|
170
181
|
|
|
171
182
|
##### Finnish
|
|
172
|
-
Specs: No
|
|
173
|
-
Abbreviations: No
|
|
174
|
-
Stop Words: Yes
|
|
175
|
-
Contractions: No
|
|
183
|
+
Specs: No
|
|
184
|
+
Abbreviations: No
|
|
185
|
+
Stop Words: Yes
|
|
186
|
+
Contractions: No
|
|
176
187
|
|
|
177
188
|
##### French
|
|
178
|
-
Specs: More needed
|
|
179
|
-
Abbreviations: Yes
|
|
180
|
-
Stop Words: Yes
|
|
181
|
-
Contractions: No
|
|
189
|
+
Specs: More needed
|
|
190
|
+
Abbreviations: Yes
|
|
191
|
+
Stop Words: Yes
|
|
192
|
+
Contractions: No
|
|
182
193
|
|
|
183
194
|
##### Greek
|
|
184
|
-
Specs: No
|
|
185
|
-
Abbreviations: No
|
|
186
|
-
Stop Words: Yes
|
|
187
|
-
Contractions: No
|
|
195
|
+
Specs: No
|
|
196
|
+
Abbreviations: No
|
|
197
|
+
Stop Words: Yes
|
|
198
|
+
Contractions: No
|
|
188
199
|
|
|
189
200
|
##### Indonesian
|
|
190
|
-
Specs: No
|
|
191
|
-
Abbreviations: No
|
|
192
|
-
Stop Words: Yes
|
|
193
|
-
Contractions: No
|
|
201
|
+
Specs: No
|
|
202
|
+
Abbreviations: No
|
|
203
|
+
Stop Words: Yes
|
|
204
|
+
Contractions: No
|
|
194
205
|
|
|
195
206
|
##### Italian
|
|
196
|
-
Specs: No
|
|
197
|
-
Abbreviations: Yes
|
|
198
|
-
Stop Words: Yes
|
|
199
|
-
Contractions: No
|
|
207
|
+
Specs: No
|
|
208
|
+
Abbreviations: Yes
|
|
209
|
+
Stop Words: Yes
|
|
210
|
+
Contractions: No
|
|
200
211
|
|
|
201
212
|
##### Latvian
|
|
202
|
-
Specs: No
|
|
203
|
-
Abbreviations: No
|
|
204
|
-
Stop Words: Yes
|
|
205
|
-
Contractions: No
|
|
213
|
+
Specs: No
|
|
214
|
+
Abbreviations: No
|
|
215
|
+
Stop Words: Yes
|
|
216
|
+
Contractions: No
|
|
206
217
|
|
|
207
218
|
##### Norwegian
|
|
208
|
-
Specs: No
|
|
209
|
-
Abbreviations: No
|
|
210
|
-
Stop Words: Yes
|
|
211
|
-
Contractions: No
|
|
219
|
+
Specs: No
|
|
220
|
+
Abbreviations: No
|
|
221
|
+
Stop Words: Yes
|
|
222
|
+
Contractions: No
|
|
212
223
|
|
|
213
224
|
##### Persian
|
|
214
|
-
Specs: No
|
|
215
|
-
Abbreviations: No
|
|
216
|
-
Stop Words: Yes
|
|
217
|
-
Contractions: No
|
|
225
|
+
Specs: No
|
|
226
|
+
Abbreviations: No
|
|
227
|
+
Stop Words: Yes
|
|
228
|
+
Contractions: No
|
|
218
229
|
|
|
219
230
|
##### Polish
|
|
220
|
-
Specs: No
|
|
221
|
-
Abbreviations: Yes
|
|
222
|
-
Stop Words: Yes
|
|
223
|
-
Contractions: No
|
|
231
|
+
Specs: No
|
|
232
|
+
Abbreviations: Yes
|
|
233
|
+
Stop Words: Yes
|
|
234
|
+
Contractions: No
|
|
224
235
|
|
|
225
236
|
##### Portuguese
|
|
226
|
-
Specs: No
|
|
227
|
-
Abbreviations: No
|
|
228
|
-
Stop Words: Yes
|
|
229
|
-
Contractions: No
|
|
237
|
+
Specs: No
|
|
238
|
+
Abbreviations: No
|
|
239
|
+
Stop Words: Yes
|
|
240
|
+
Contractions: No
|
|
230
241
|
|
|
231
242
|
##### Romanian
|
|
232
|
-
Specs: No
|
|
233
|
-
Abbreviations: No
|
|
234
|
-
Stop Words: Yes
|
|
235
|
-
Contractions: No
|
|
243
|
+
Specs: No
|
|
244
|
+
Abbreviations: No
|
|
245
|
+
Stop Words: Yes
|
|
246
|
+
Contractions: No
|
|
236
247
|
|
|
237
248
|
##### Russian
|
|
238
|
-
Specs: No
|
|
239
|
-
Abbreviations: Yes
|
|
240
|
-
Stop Words: Yes
|
|
241
|
-
Contractions: No
|
|
249
|
+
Specs: No
|
|
250
|
+
Abbreviations: Yes
|
|
251
|
+
Stop Words: Yes
|
|
252
|
+
Contractions: No
|
|
242
253
|
|
|
243
254
|
##### Slovak
|
|
244
|
-
Specs: No
|
|
245
|
-
Abbreviations: No
|
|
246
|
-
Stop Words: Yes
|
|
247
|
-
Contractions: No
|
|
255
|
+
Specs: No
|
|
256
|
+
Abbreviations: No
|
|
257
|
+
Stop Words: Yes
|
|
258
|
+
Contractions: No
|
|
248
259
|
|
|
249
260
|
##### Spanish
|
|
250
|
-
Specs: No
|
|
251
|
-
Abbreviations: Yes
|
|
252
|
-
Stop Words: Yes
|
|
253
|
-
Contractions: Yes
|
|
261
|
+
Specs: No
|
|
262
|
+
Abbreviations: Yes
|
|
263
|
+
Stop Words: Yes
|
|
264
|
+
Contractions: Yes
|
|
254
265
|
|
|
255
266
|
##### Swedish
|
|
256
|
-
Specs: No
|
|
257
|
-
Abbreviations: No
|
|
258
|
-
Stop Words: Yes
|
|
259
|
-
Contractions: No
|
|
267
|
+
Specs: No
|
|
268
|
+
Abbreviations: No
|
|
269
|
+
Stop Words: Yes
|
|
270
|
+
Contractions: No
|
|
260
271
|
|
|
261
272
|
##### Turkish
|
|
262
|
-
Specs: No
|
|
263
|
-
Abbreviations: No
|
|
264
|
-
Stop Words: Yes
|
|
265
|
-
Contractions: No
|
|
273
|
+
Specs: No
|
|
274
|
+
Abbreviations: No
|
|
275
|
+
Stop Words: Yes
|
|
276
|
+
Contractions: No
|
|
266
277
|
|
|
267
278
|
## Development
|
|
268
279
|
|
|
data/lib/pragmatic_tokenizer/languages/common.rb
CHANGED
|
@@ -9,7 +9,15 @@ module PragmaticTokenizer
|
|
|
9
9
|
ABBREVIATIONS = []
|
|
10
10
|
STOP_WORDS = []
|
|
11
11
|
CONTRACTIONS = {}
|
|
12
|
+
|
|
13
|
+
class SingleQuotes
|
|
14
|
+
def handle_single_quotes(text)
|
|
15
|
+
text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
|
|
16
|
+
text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
|
|
17
|
+
# Separate right single quotes
|
|
18
|
+
text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
|
|
19
|
+
end
|
|
20
|
+
end
|
|
12
21
|
end
|
|
13
22
|
end
|
|
14
23
|
end
|
|
15
|
-
|
|
data/lib/pragmatic_tokenizer/languages/english.rb
CHANGED
|
@@ -86,6 +86,16 @@ module PragmaticTokenizer
|
|
|
86
86
|
"will-o'-the-wisp" => "will-of-the-wisp",
|
|
87
87
|
"'twas" => "it was"
|
|
88
88
|
}
|
|
89
|
+
class SingleQuotes
|
|
90
|
+
def handle_single_quotes(text)
|
|
91
|
+
text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
|
|
92
|
+
# Convert left quotes to special character except for 'Twas or 'twas
|
|
93
|
+
text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
|
|
94
|
+
text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
|
|
95
|
+
# Separate right single quotes
|
|
96
|
+
text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
|
|
97
|
+
end
|
|
98
|
+
end
|
|
89
99
|
end
|
|
90
100
|
end
|
|
91
101
|
end
|
|
data/lib/pragmatic_tokenizer/languages/french.rb
CHANGED
|
@@ -5,6 +5,16 @@ module PragmaticTokenizer
|
|
|
5
5
|
ABBREVIATIONS = ["a.c.n", "a.m", "al", "ann", "apr", "art", "auj", "av", "b.p", "boul", "c.-à-d", "c.n", "c.n.s", "c.p.i", "c.q.f.d", "c.s", "ca", "cf", "ch.-l", "chap", "co", "contr", "dir", "e.g", "e.v", "env", "etc", "ex", "fasc", "fig", "fr", "fém", "hab", "i.e", "ibid", "id", "inf", "l.d", "lib", "ll.aa", "ll.aa.ii", "ll.aa.rr", "ll.aa.ss", "ll.ee", "ll.mm", "ll.mm.ii.rr", "loc.cit", "ltd", "masc", "mm", "ms", "n.b", "n.d", "n.d.a", "n.d.l.r", "n.d.t", "n.p.a.i", "n.s", "n/réf", "nn.ss", "p.c.c", "p.ex", "p.j", "p.s", "pl", "pp", "r.-v", "r.a.s", "r.i.p", "r.p", "s.a", "s.a.i", "s.a.r", "s.a.s", "s.e", "s.m", "s.m.i.r", "s.s", "sec", "sect", "sing", "sq", "sqq", "ss", "suiv", "sup", "suppl", "t.s.v.p", "tél", "vb", "vol", "vs", "x.o", "z.i", "éd"]
|
|
6
6
|
STOP_WORDS = ["a", "à", "â", "abord", "afin", "ah", "ai", "aie", "ainsi", "allaient", "allo", "allô", "allons", "après", "assez", "attendu", "au", "aucun", "aucune", "aujourd", "aujourd'hui", "auquel", "aura", "auront", "aussi", "autre", "autres", "aux", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avoir", "ayant", "b", "bah", "beaucoup", "bien", "bigre", "boum", "bravo", "brrr", "c", "ça", "car", "ce", "ceci", "cela", "celle", "celle-ci", "celle-là", "celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "cent", "cependant", "certain", "certaine", "certaines", "certains", "certes", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "chacun", "chaque", "cher", "chère", "chères", "chers", "chez", "chiche", "chut", "ci", "cinq", "cinquantaine", "cinquante", "cinquantième", "cinquième", "clac", "clic", "combien", "comme", "comment", "compris", "concernant", "contre", "couic", "crac", "d", "da", "dans", "de", "debout", "dedans", "dehors", "delà", "depuis", "derrière", "des", "dès", "désormais", "desquelles", "desquels", "dessous", "dessus", "deux", "deuxième", "deuxièmement", "devant", "devers", "devra", "différent", "différente", "différentes", "différents", "dire", "divers", "diverse", "diverses", "dix", "dix-huit", "dixième", "dix-neuf", "dix-sept", "doit", "doivent", "donc", "dont", "douze", "douzième", "dring", "du", "duquel", "durant", "e", "effet", "eh", "elle", "elle-même", "elles", "elles-mêmes", "en", "encore", "entre", "envers", "environ", "es", "ès", "est", "et", "etant", "étaient", "étais", "était", "étant", "etc", "été", "etre", "être", "eu", "euh", "eux", "eux-mêmes", "excepté", "f", "façon", "fais", "faisaient", "faisant", "fait", "feront", "fi", "flac", "floc", "font", "g", "gens", "h", "ha", "hé", "hein", "hélas", "hem", "hep", "hi", "ho", "holà", "hop", "hormis", "hors", "hou", "houp", "hue", "hui", "huit", "huitième", "hum", "hurrah", "i", "il", "ils", "importe", "j", "je", "jusqu", "jusque", "k", "l", 
"la", "là", "laquelle", "las", "le", "lequel", "les", "lès", "lesquelles", "lesquels", "leur", "leurs", "longtemps", "lorsque", "lui", "lui-même", "m", "ma", "maint", "mais", "malgré", "me", "même", "mêmes", "merci", "mes", "mien", "mienne", "miennes", "miens", "mille", "mince", "moi", "moi-même", "moins", "mon", "moyennant", "n", "na", "ne", "néanmoins", "neuf", "neuvième", "ni", "nombreuses", "nombreux", "non", "nos", "notre", "nôtre", "nôtres", "nous", "nous-mêmes", "nul", "o", "o|", "ô", "oh", "ohé", "olé", "ollé", "on", "ont", "onze", "onzième", "ore", "ou", "où", "ouf", "ouias", "oust", "ouste", "outre", "p", "paf", "pan", "par", "parmi", "partant", "particulier", "particulière", "particulièrement", "pas", "passé", "pendant", "personne", "peu", "peut", "peuvent", "peux", "pff", "pfft", "pfut", "pif", "plein", "plouf", "plus", "plusieurs", "plutôt", "pouah", "pour", "pourquoi", "premier", "première", "premièrement", "près", "proche", "psitt", "puisque", "q", "qu", "quand", "quant", "quanta", "quant-à-soi", "quarante", "quatorze", "quatre", "quatre-vingt", "quatrième", "quatrièmement", "que", "quel", "quelconque", "quelle", "quelles", "quelque", "quelques", "quelqu'un", "quels", "qui", "quiconque", "quinze", "quoi", "quoique", "r", "revoici", "revoilà", "rien", "s", "sa", "sacrebleu", "sans", "sapristi", "sauf", "se", "seize", "selon", "sept", "septième", "sera", "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "six", "sixième", "soi", "soi-même", "soit", "soixante", "son", "sont", "sous", "stop", "suis", "suivant", "sur", "surtout", "t", "ta", "tac", "tant", "te", "té", "tel", "telle", "tellement", "telles", "tels", "tenant", "tes", "tic", "tien", "tienne", "tiennes", "tiens", "toc", "toi", "toi-même", "ton", "touchant", "toujours", "tous", "tout", "toute", "toutes", "treize", "trente", "très", "trois", "troisième", "troisièmement", "trop", "tsoin", "tsouin", "tu", "u", "un", "une", "unes", "uns", "v", "va", "vais", "vas", "vé", "vers", 
"via", "vif", "vifs", "vingt", "vivat", "vive", "vives", "vlan", "voici", "voilà", "vont", "vos", "votre", "vôtre", "vôtres", "vous", "vous-mêmes", "vu", "w", "x", "y", "z", "zut", "alors", "aucuns", "bon", "devrait", "dos", "droite", "début", "essai", "faites", "fois", "force", "haut", "ici", "juste", "maintenant", "mine", "mot", "nommés", "nouveaux", "parce", "parole", "personnes", "pièce", "plupart", "seulement", "soyez", "sujet", "tandis", "valeur", "voie", "voient", "état", "étions", "d'un", "d'une"]
|
|
7
7
|
CONTRACTIONS = {}
|
|
8
|
+
|
|
9
|
+
class SingleQuotes
|
|
10
|
+
def handle_single_quotes(text)
|
|
11
|
+
text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
|
|
12
|
+
text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
|
|
13
|
+
text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
|
|
14
|
+
text.gsub!(/l\'/, '\1 l\' \2') || text
|
|
15
|
+
text.gsub!(/L\'/, '\1 L\' \2') || text
|
|
16
|
+
end
|
|
17
|
+
end
|
|
8
18
|
end
|
|
9
19
|
end
|
|
10
|
-
end
|
|
20
|
+
end
|
|
data/lib/pragmatic_tokenizer/processor.rb
CHANGED
|
@@ -24,18 +24,17 @@ module PragmaticTokenizer
|
|
|
24
24
|
|
|
25
25
|
def convert_dbl_quotes(text)
|
|
26
26
|
# Convert left double quotes to special character
|
|
27
|
-
text.gsub!(/"(?=.*\w)/o, ' ' +
|
|
27
|
+
text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
|
|
28
28
|
# Convert remaining quotes to special character
|
|
29
|
-
text.gsub!(/"/, ' ' +
|
|
29
|
+
text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
|
|
30
30
|
end
|
|
31
31
|
|
|
32
32
|
def convert_sgl_quotes(text)
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + convert_punct_to_sym("'") + ' ' } || text
|
|
33
|
+
if defined? @language::SingleQuotes
|
|
34
|
+
@language::SingleQuotes.new.handle_single_quotes(text)
|
|
35
|
+
else
|
|
36
|
+
PragmaticTokenizer::Languages::Common::SingleQuotes.new.handle_single_quotes(text)
|
|
37
|
+
end
|
|
39
38
|
end
|
|
40
39
|
|
|
41
40
|
def shift_multiple_dash(text)
|
|
@@ -71,7 +70,7 @@ module PragmaticTokenizer
|
|
|
71
70
|
!(/\A\d+/ == text.partition(':').last[0]) &&
|
|
72
71
|
!(/\A\d+/ == text.partition(':').first[-1])
|
|
73
72
|
# Ignore web addresses
|
|
74
|
-
text.gsub!(/(?<=[http|https]):(?=\/\/)/,
|
|
73
|
+
text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
|
|
75
74
|
text.gsub!(/:/o, ' :') || text
|
|
76
75
|
end
|
|
77
76
|
|
|
@@ -124,10 +123,6 @@ module PragmaticTokenizer
|
|
|
124
123
|
cleaned_tokens
|
|
125
124
|
end
|
|
126
125
|
|
|
127
|
-
def convert_punct_to_sym(punctuation)
|
|
128
|
-
PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[punctuation]
|
|
129
|
-
end
|
|
130
|
-
|
|
131
126
|
def convert_sym_to_punct(token)
|
|
132
127
|
symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token)
|
|
133
128
|
if symbol.nil?
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: pragmatic_tokenizer
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.5
|
|
4
|
+
version: 0.1.6
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Kevin S. Dias
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2016-01-
|
|
11
|
+
date: 2016-01-07 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|