pragmatic_tokenizer 0.1.5 → 0.1.6

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a52658ccd583baac9e69b37649e5403ad6f3549b
-  data.tar.gz: 95335fab907589faa066f7a76bd8a6df4d9f2f70
+  metadata.gz: b8f3d1c236d1faafc68ff8d689de291a59596b1f
+  data.tar.gz: ed010aae17417aa9ea87a1fa817f893c893e5d77
 SHA512:
-  metadata.gz: f7b42942e2676cc22a807938244fc38b7e4d4097bcdb3a6f0c08fe94fccc923cce50e8328e91efdfdd93f758771dda08b3ea432bb67099536630d4a9d9d9f4e9
-  data.tar.gz: d78a40443dc9948866d1f271e47af355a8838e38dbd932d282d7b0c905ad06dfb392f65ba7cc2df6a891098ff52b9a795317b1a458363214930c08650cc6ef96
+  metadata.gz: e10fbb56ec2097ef2854a0517f2b4add98479fecb0318c813c791f0b3872b7ba93a489e409de14e83b5327a8e12ec64747ef64846f57c32e0625d994c50f9331
+  data.tar.gz: e6a8f35c84c6e1029d26dd7981c2f8bd3f43803d095ecd45de0d2ab2132399f1dc652704269217573dbcb4b99aa438b751be57ffc26fc39b8e9b3b4554a2d56e
data/README.md CHANGED
@@ -70,7 +70,16 @@ Or install it yourself as:
 ##### `remove_numbers`
 **default** = `'false'`
 - `true`
-  Removes any token that contains a number or Roman numeral.
+  Removes any token that contains a number.
+- `false`
+  Leaves tokens as is.
+
+<hr>
+
+##### `remove_roman_numerals`
+**default** = `'false'`
+- `true`
+  Removes any token that contains a Roman numeral.
 - `false`
   Leaves tokens as is.
 
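With `remove_numbers` narrowed and `remove_roman_numerals` split out into its own option, the two filters can now be toggled independently. A minimal sketch of the difference, using the constructor form documented in this README (the sample text and the exact tokens returned are illustrative assumptions, not taken from the gem's specs):

```ruby
require 'pragmatic_tokenizer'

text = "Chapter iv covers the 3 main topics."

# remove_numbers now drops only tokens containing digits ("3");
# tokens containing Roman numerals ("iv") survive.
PragmaticTokenizer::Tokenizer.new(text, remove_numbers: true).tokenize

# remove_roman_numerals drops only tokens containing Roman numerals ("iv");
# digit tokens ("3") survive.
PragmaticTokenizer::Tokenizer.new(text, remove_roman_numerals: true).tokenize
```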
@@ -124,145 +133,147 @@ PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
 
 ## Language Support
 
-The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated. *N.B. - contractions might not be applicable for all languages below - in that case the CONTRACTIONS hash should stay empty.*
+The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated.
+
+*N.B. - contractions might not be applicable for all languages below - in that case the CONTRACTIONS hash should stay empty.*
 
 ##### English
-Specs: Yes
-Abbreviations: Yes
-Stop Words: Yes
-Contractions: Yes
+Specs: Yes
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: Yes
 
 ##### Arabic
-Specs: No
-Abbreviations: Yes
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
 
 ##### Bulgarian
-Specs: No
-Abbreviations: Yes
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
 
 ##### Catalan
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
 ##### Czech
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
 ##### Danish
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
-##### German
-Specs: More needed
-Abbreviations: Yes
-Stop Words: Yes
-Contractions: No
+##### Deutsch
+Specs: More needed
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
 
 ##### Finnish
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
 ##### French
-Specs: More needed
-Abbreviations: Yes
-Stop Words: Yes
-Contractions: No
+Specs: More needed
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
 
 ##### Greek
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
 ##### Indonesian
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
 ##### Italian
-Specs: No
-Abbreviations: Yes
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
 
 ##### Latvian
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
 ##### Norwegian
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
 ##### Persian
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
 ##### Polish
-Specs: No
-Abbreviations: Yes
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
 
 ##### Portuguese
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
 ##### Romanian
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
 ##### Russian
-Specs: No
-Abbreviations: Yes
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
 
 ##### Slovak
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
 ##### Spanish
-Specs: No
-Abbreviations: Yes
-Stop Words: Yes
-Contractions: Yes
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: Yes
 
 ##### Swedish
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
 ##### Turkish
-Specs: No
-Abbreviations: No
-Stop Words: Yes
-Contractions: No
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
 
 ## Development
 
@@ -9,7 +9,15 @@ module PragmaticTokenizer
       ABBREVIATIONS = []
       STOP_WORDS = []
       CONTRACTIONS = {}
+
+      class SingleQuotes
+        def handle_single_quotes(text)
+          text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
+          text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
+          # Separate right single quotes
+          text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+        end
+      end
     end
   end
 end
-
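This first Ruby hunk (evidently the shared/default language definition, since its `ABBREVIATIONS`, `STOP_WORDS`, and `CONTRACTIONS` are empty) extracts single-quote handling into a `SingleQuotes` class. A minimal sketch of calling it directly, assuming it is reachable as `PragmaticTokenizer::Languages::Common::SingleQuotes`, as the processor change further down suggests:

```ruby
require 'pragmatic_tokenizer'

# The three gsub! passes pad left and right single quotes with spaces
# and swap them for a placeholder symbol from PUNCTUATION_MAP, so a
# later whitespace split keeps each quote as its own token.
handler = PragmaticTokenizer::Languages::Common::SingleQuotes.new
handler.handle_single_quotes("He said 'hello' to everyone.".dup) # .dup: gsub! mutates
```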
@@ -86,6 +86,16 @@ module PragmaticTokenizer
         "will-o'-the-wisp" => "will-of-the-wisp",
         "'twas" => "it was"
       }
+      class SingleQuotes
+        def handle_single_quotes(text)
+          text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
+          # Convert left quotes to special character except for 'Twas or 'twas
+          text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+          text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
+          # Separate right single quotes
+          text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+        end
+      end
     end
   end
 end
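The English override differs from the common handler in one pass: the left-quote substitution skips `'twas`/`'Twas`, so the `"'twas" => "it was"` contraction defined just above can still match later in the pipeline. A rough illustration (the `Languages::English` constant path is an assumption based on the gem's naming scheme):

```ruby
# 'Twas keeps its leading apostrophe instead of having the quote
# split off, leaving the token intact for contraction expansion.
handler = PragmaticTokenizer::Languages::English::SingleQuotes.new
handler.handle_single_quotes("'Twas the night before Christmas.".dup)
```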
@@ -5,6 +5,16 @@ module PragmaticTokenizer
       ABBREVIATIONS = ["a.c.n", "a.m", "al", "ann", "apr", "art", "auj", "av", "b.p", "boul", "c.-à-d", "c.n", "c.n.s", "c.p.i", "c.q.f.d", "c.s", "ca", "cf", "ch.-l", "chap", "co", "contr", "dir", "e.g", "e.v", "env", "etc", "ex", "fasc", "fig", "fr", "fém", "hab", "i.e", "ibid", "id", "inf", "l.d", "lib", "ll.aa", "ll.aa.ii", "ll.aa.rr", "ll.aa.ss", "ll.ee", "ll.mm", "ll.mm.ii.rr", "loc.cit", "ltd", "masc", "mm", "ms", "n.b", "n.d", "n.d.a", "n.d.l.r", "n.d.t", "n.p.a.i", "n.s", "n/réf", "nn.ss", "p.c.c", "p.ex", "p.j", "p.s", "pl", "pp", "r.-v", "r.a.s", "r.i.p", "r.p", "s.a", "s.a.i", "s.a.r", "s.a.s", "s.e", "s.m", "s.m.i.r", "s.s", "sec", "sect", "sing", "sq", "sqq", "ss", "suiv", "sup", "suppl", "t.s.v.p", "tél", "vb", "vol", "vs", "x.o", "z.i", "éd"]
       STOP_WORDS = ["a", "à", "â", "abord", "afin", "ah", "ai", "aie", "ainsi", "allaient", "allo", "allô", "allons", "après", "assez", "attendu", "au", "aucun", "aucune", "aujourd", "aujourd'hui", "auquel", "aura", "auront", "aussi", "autre", "autres", "aux", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avoir", "ayant", "b", "bah", "beaucoup", "bien", "bigre", "boum", "bravo", "brrr", "c", "ça", "car", "ce", "ceci", "cela", "celle", "celle-ci", "celle-là", "celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "cent", "cependant", "certain", "certaine", "certaines", "certains", "certes", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "chacun", "chaque", "cher", "chère", "chères", "chers", "chez", "chiche", "chut", "ci", "cinq", "cinquantaine", "cinquante", "cinquantième", "cinquième", "clac", "clic", "combien", "comme", "comment", "compris", "concernant", "contre", "couic", "crac", "d", "da", "dans", "de", "debout", "dedans", "dehors", "delà", "depuis", "derrière", "des", "dès", "désormais", "desquelles", "desquels", "dessous", "dessus", "deux", "deuxième", "deuxièmement", "devant", "devers", "devra", "différent", "différente", "différentes", "différents", "dire", "divers", "diverse", "diverses", "dix", "dix-huit", "dixième", "dix-neuf", "dix-sept", "doit", "doivent", "donc", "dont", "douze", "douzième", "dring", "du", "duquel", "durant", "e", "effet", "eh", "elle", "elle-même", "elles", "elles-mêmes", "en", "encore", "entre", "envers", "environ", "es", "ès", "est", "et", "etant", "étaient", "étais", "était", "étant", "etc", "été", "etre", "être", "eu", "euh", "eux", "eux-mêmes", "excepté", "f", "façon", "fais", "faisaient", "faisant", "fait", "feront", "fi", "flac", "floc", "font", "g", "gens", "h", "ha", "hé", "hein", "hélas", "hem", "hep", "hi", "ho", "holà", "hop", "hormis", "hors", "hou", "houp", "hue", "hui", "huit", "huitième", "hum", "hurrah", "i", "il", "ils", "importe", "j", "je", "jusqu", "jusque", "k", "l", "la", "là", "laquelle", "las", "le", "lequel", "les", "lès", "lesquelles", "lesquels", "leur", "leurs", "longtemps", "lorsque", "lui", "lui-même", "m", "ma", "maint", "mais", "malgré", "me", "même", "mêmes", "merci", "mes", "mien", "mienne", "miennes", "miens", "mille", "mince", "moi", "moi-même", "moins", "mon", "moyennant", "n", "na", "ne", "néanmoins", "neuf", "neuvième", "ni", "nombreuses", "nombreux", "non", "nos", "notre", "nôtre", "nôtres", "nous", "nous-mêmes", "nul", "o", "o|", "ô", "oh", "ohé", "olé", "ollé", "on", "ont", "onze", "onzième", "ore", "ou", "où", "ouf", "ouias", "oust", "ouste", "outre", "p", "paf", "pan", "par", "parmi", "partant", "particulier", "particulière", "particulièrement", "pas", "passé", "pendant", "personne", "peu", "peut", "peuvent", "peux", "pff", "pfft", "pfut", "pif", "plein", "plouf", "plus", "plusieurs", "plutôt", "pouah", "pour", "pourquoi", "premier", "première", "premièrement", "près", "proche", "psitt", "puisque", "q", "qu", "quand", "quant", "quanta", "quant-à-soi", "quarante", "quatorze", "quatre", "quatre-vingt", "quatrième", "quatrièmement", "que", "quel", "quelconque", "quelle", "quelles", "quelque", "quelques", "quelqu'un", "quels", "qui", "quiconque", "quinze", "quoi", "quoique", "r", "revoici", "revoilà", "rien", "s", "sa", "sacrebleu", "sans", "sapristi", "sauf", "se", "seize", "selon", "sept", "septième", "sera", "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "six", "sixième", "soi", "soi-même", "soit", "soixante", "son", "sont", "sous", "stop", "suis", "suivant", "sur", "surtout", "t", "ta", "tac", "tant", "te", "té", "tel", "telle", "tellement", "telles", "tels", "tenant", "tes", "tic", "tien", "tienne", "tiennes", "tiens", "toc", "toi", "toi-même", "ton", "touchant", "toujours", "tous", "tout", "toute", "toutes", "treize", "trente", "très", "trois", "troisième", "troisièmement", "trop", "tsoin", "tsouin", "tu", "u", "un", "une", "unes", "uns", "v", "va", "vais", "vas", "vé", "vers", "via", "vif", "vifs", "vingt", "vivat", "vive", "vives", "vlan", "voici", "voilà", "vont", "vos", "votre", "vôtre", "vôtres", "vous", "vous-mêmes", "vu", "w", "x", "y", "z", "zut", "alors", "aucuns", "bon", "devrait", "dos", "droite", "début", "essai", "faites", "fois", "force", "haut", "ici", "juste", "maintenant", "mine", "mot", "nommés", "nouveaux", "parce", "parole", "personnes", "pièce", "plupart", "seulement", "soyez", "sujet", "tandis", "valeur", "voie", "voient", "état", "étions", "d'un", "d'une"]
       CONTRACTIONS = {}
+
+      class SingleQuotes
+        def handle_single_quotes(text)
+          text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+          text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
+          text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
+          text.gsub!(/l\'/, '\1 l\' \2') || text
+          text.gsub!(/L\'/, '\1 L\' \2') || text
+        end
+      end
     end
   end
-end
+end
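The French handler adds two passes that split the elided articles `l'` and `L'` off the following word. (Note that the `\1` and `\2` backreferences in those replacement strings have no matching capture groups, so Ruby expands them to empty strings; the effective replacement is just a space-padded `l'`.) A rough sketch of the intended effect (the `Languages::French` constant path and the sample sentence are assumptions):

```ruby
# "l'avion" is split into "l'" and "avion", so the elided article
# and the noun become separate tokens after the whitespace split.
handler = PragmaticTokenizer::Languages::French::SingleQuotes.new
handler.handle_single_quotes("Qui a pris l'avion ?".dup)
```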
@@ -24,18 +24,17 @@ module PragmaticTokenizer
 
     def convert_dbl_quotes(text)
       # Convert left double quotes to special character
-      text.gsub!(/"(?=.*\w)/o, ' ' + convert_punct_to_sym('"') + ' ') || text
+      text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
       # Convert remaining quotes to special character
-      text.gsub!(/"/, ' ' + convert_punct_to_sym('"') + ' ') || text
+      text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
     end
 
     def convert_sgl_quotes(text)
-      text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + convert_punct_to_sym("'") + ' ') || text
-      # Convert left quotes to special character except for 'Twas or 'twas
-      text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + convert_punct_to_sym("'") + ' ' : ' ' + convert_punct_to_sym("'") + ' ' } || text
-      text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + convert_punct_to_sym("'")) || text
-      # Separate right single quotes
-      text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + convert_punct_to_sym("'") + ' ' } || text
+      if defined? @language::SingleQuotes
+        @language::SingleQuotes.new.handle_single_quotes(text)
+      else
+        PragmaticTokenizer::Languages::Common::SingleQuotes.new.handle_single_quotes(text)
+      end
     end
 
     def shift_multiple_dash(text)
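`convert_sgl_quotes` is now a thin dispatcher: if the loaded language module defines its own `SingleQuotes` class it wins, otherwise the shared handler is used. The same pattern in isolation, as a minimal sketch (the method and parameter names here are illustrative, not the gem's):

```ruby
# Pick the language-specific quote handler when one exists, otherwise
# fall back to the shared default. `defined?` returns nil (falsey)
# instead of raising NameError when the constant is missing.
def single_quotes_handler(language_module)
  if defined? language_module::SingleQuotes
    language_module::SingleQuotes.new
  else
    PragmaticTokenizer::Languages::Common::SingleQuotes.new
  end
end
```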
@@ -71,7 +70,7 @@ module PragmaticTokenizer
         !(/\A\d+/ == text.partition(':').last[0]) &&
         !(/\A\d+/ == text.partition(':').first[-1])
       # Ignore web addresses
-      text.gsub!(/(?<=[http|https]):(?=\/\/)/, convert_punct_to_sym(":")) || text
+      text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
       text.gsub!(/:/o, ' :') || text
     end
 
@@ -124,10 +123,6 @@ module PragmaticTokenizer
       cleaned_tokens
     end
 
-    def convert_punct_to_sym(punctuation)
-      PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[punctuation]
-    end
-
     def convert_sym_to_punct(token)
       symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token)
       if symbol.nil?
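With the one-line `convert_punct_to_sym` wrapper deleted, every call site indexes `PUNCTUATION_MAP` directly, and `convert_sym_to_punct` maps the placeholder symbols back once tokenization is done. A sketch of the round trip (using `Hash#key` for the reverse lookup is an assumption; the gem's actual reverse mapping may differ):

```ruby
map = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP

placeholder = map['"']            # punctuation -> placeholder symbol
# ...text is tokenized with placeholders standing in for punctuation...
original = map.key(placeholder)   # placeholder -> original punctuation
```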
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "0.1.5"
+  VERSION = "0.1.6"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 0.1.6
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-01-06 00:00:00.000000000 Z
+date: 2016-01-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler