pragmatic_tokenizer 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a52658ccd583baac9e69b37649e5403ad6f3549b
4
- data.tar.gz: 95335fab907589faa066f7a76bd8a6df4d9f2f70
3
+ metadata.gz: b8f3d1c236d1faafc68ff8d689de291a59596b1f
4
+ data.tar.gz: ed010aae17417aa9ea87a1fa817f893c893e5d77
5
5
  SHA512:
6
- metadata.gz: f7b42942e2676cc22a807938244fc38b7e4d4097bcdb3a6f0c08fe94fccc923cce50e8328e91efdfdd93f758771dda08b3ea432bb67099536630d4a9d9d9f4e9
7
- data.tar.gz: d78a40443dc9948866d1f271e47af355a8838e38dbd932d282d7b0c905ad06dfb392f65ba7cc2df6a891098ff52b9a795317b1a458363214930c08650cc6ef96
6
+ metadata.gz: e10fbb56ec2097ef2854a0517f2b4add98479fecb0318c813c791f0b3872b7ba93a489e409de14e83b5327a8e12ec64747ef64846f57c32e0625d994c50f9331
7
+ data.tar.gz: e6a8f35c84c6e1029d26dd7981c2f8bd3f43803d095ecd45de0d2ab2132399f1dc652704269217573dbcb4b99aa438b751be57ffc26fc39b8e9b3b4554a2d56e
data/README.md CHANGED
@@ -70,7 +70,16 @@ Or install it yourself as:
70
70
  ##### `remove_numbers`
71
71
  **default** = `'false'`
72
72
  - `true`
73
- Removes any token that contains a number or Roman numeral.
73
+ Removes any token that contains a number.
74
+ - `false`
75
+ Leaves tokens as is.
76
+
77
+ <hr>
78
+
79
+ ##### `remove_roman_numerals`
80
+ **default** = `'false'`
81
+ - `true`
82
+ Removes any token that contains a Roman numeral.
74
83
  - `false`
75
84
  Leaves tokens as is.
76
85
 
@@ -124,145 +133,147 @@ PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
124
133
 
125
134
  ## Language Support
126
135
 
127
- The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated. *N.B. - contractions might not be applicable for all languages below - in that case the CONTRACTIONS hash should stay empty.*
136
+ The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated.
137
+
138
+ *N.B. - contractions might not be applicable for all languages below - in that case the CONTRACTIONS hash should stay empty.*
128
139
 
129
140
  ##### English
130
- Specs: Yes
131
- Abbreviations: Yes
132
- Stop Words: Yes
133
- Contractions: Yes
141
+ Specs: Yes
142
+ Abbreviations: Yes
143
+ Stop Words: Yes
144
+ Contractions: Yes
134
145
 
135
146
  ##### Arabic
136
- Specs: No
137
- Abbreviations: Yes
138
- Stop Words: Yes
139
- Contractions: No
147
+ Specs: No
148
+ Abbreviations: Yes
149
+ Stop Words: Yes
150
+ Contractions: No
140
151
 
141
152
  ##### Bulgarian
142
- Specs: No
143
- Abbreviations: Yes
144
- Stop Words: Yes
145
- Contractions: No
153
+ Specs: No
154
+ Abbreviations: Yes
155
+ Stop Words: Yes
156
+ Contractions: No
146
157
 
147
158
  ##### Catalan
148
- Specs: No
149
- Abbreviations: No
150
- Stop Words: Yes
151
- Contractions: No
159
+ Specs: No
160
+ Abbreviations: No
161
+ Stop Words: Yes
162
+ Contractions: No
152
163
 
153
164
  ##### Czech
154
- Specs: No
155
- Abbreviations: No
156
- Stop Words: Yes
157
- Contractions: No
165
+ Specs: No
166
+ Abbreviations: No
167
+ Stop Words: Yes
168
+ Contractions: No
158
169
 
159
170
  ##### Danish
160
- Specs: No
161
- Abbreviations: No
162
- Stop Words: Yes
163
- Contractions: No
171
+ Specs: No
172
+ Abbreviations: No
173
+ Stop Words: Yes
174
+ Contractions: No
164
175
 
165
- ##### German
166
- Specs: More needed
167
- Abbreviations: Yes
168
- Stop Words: Yes
169
- Contractions: No
176
+ ##### Deutsch
177
+ Specs: More needed
178
+ Abbreviations: Yes
179
+ Stop Words: Yes
180
+ Contractions: No
170
181
 
171
182
  ##### Finnish
172
- Specs: No
173
- Abbreviations: No
174
- Stop Words: Yes
175
- Contractions: No
183
+ Specs: No
184
+ Abbreviations: No
185
+ Stop Words: Yes
186
+ Contractions: No
176
187
 
177
188
  ##### French
178
- Specs: More needed
179
- Abbreviations: Yes
180
- Stop Words: Yes
181
- Contractions: No
189
+ Specs: More needed
190
+ Abbreviations: Yes
191
+ Stop Words: Yes
192
+ Contractions: No
182
193
 
183
194
  ##### Greek
184
- Specs: No
185
- Abbreviations: No
186
- Stop Words: Yes
187
- Contractions: No
195
+ Specs: No
196
+ Abbreviations: No
197
+ Stop Words: Yes
198
+ Contractions: No
188
199
 
189
200
  ##### Indonesian
190
- Specs: No
191
- Abbreviations: No
192
- Stop Words: Yes
193
- Contractions: No
201
+ Specs: No
202
+ Abbreviations: No
203
+ Stop Words: Yes
204
+ Contractions: No
194
205
 
195
206
  ##### Italian
196
- Specs: No
197
- Abbreviations: Yes
198
- Stop Words: Yes
199
- Contractions: No
207
+ Specs: No
208
+ Abbreviations: Yes
209
+ Stop Words: Yes
210
+ Contractions: No
200
211
 
201
212
  ##### Latvian
202
- Specs: No
203
- Abbreviations: No
204
- Stop Words: Yes
205
- Contractions: No
213
+ Specs: No
214
+ Abbreviations: No
215
+ Stop Words: Yes
216
+ Contractions: No
206
217
 
207
218
  ##### Norwegian
208
- Specs: No
209
- Abbreviations: No
210
- Stop Words: Yes
211
- Contractions: No
219
+ Specs: No
220
+ Abbreviations: No
221
+ Stop Words: Yes
222
+ Contractions: No
212
223
 
213
224
  ##### Persian
214
- Specs: No
215
- Abbreviations: No
216
- Stop Words: Yes
217
- Contractions: No
225
+ Specs: No
226
+ Abbreviations: No
227
+ Stop Words: Yes
228
+ Contractions: No
218
229
 
219
230
  ##### Polish
220
- Specs: No
221
- Abbreviations: Yes
222
- Stop Words: Yes
223
- Contractions: No
231
+ Specs: No
232
+ Abbreviations: Yes
233
+ Stop Words: Yes
234
+ Contractions: No
224
235
 
225
236
  ##### Portuguese
226
- Specs: No
227
- Abbreviations: No
228
- Stop Words: Yes
229
- Contractions: No
237
+ Specs: No
238
+ Abbreviations: No
239
+ Stop Words: Yes
240
+ Contractions: No
230
241
 
231
242
  ##### Romanian
232
- Specs: No
233
- Abbreviations: No
234
- Stop Words: Yes
235
- Contractions: No
243
+ Specs: No
244
+ Abbreviations: No
245
+ Stop Words: Yes
246
+ Contractions: No
236
247
 
237
248
  ##### Russian
238
- Specs: No
239
- Abbreviations: Yes
240
- Stop Words: Yes
241
- Contractions: No
249
+ Specs: No
250
+ Abbreviations: Yes
251
+ Stop Words: Yes
252
+ Contractions: No
242
253
 
243
254
  ##### Slovak
244
- Specs: No
245
- Abbreviations: No
246
- Stop Words: Yes
247
- Contractions: No
255
+ Specs: No
256
+ Abbreviations: No
257
+ Stop Words: Yes
258
+ Contractions: No
248
259
 
249
260
  ##### Spanish
250
- Specs: No
251
- Abbreviations: Yes
252
- Stop Words: Yes
253
- Contractions: Yes
261
+ Specs: No
262
+ Abbreviations: Yes
263
+ Stop Words: Yes
264
+ Contractions: Yes
254
265
 
255
266
  ##### Swedish
256
- Specs: No
257
- Abbreviations: No
258
- Stop Words: Yes
259
- Contractions: No
267
+ Specs: No
268
+ Abbreviations: No
269
+ Stop Words: Yes
270
+ Contractions: No
260
271
 
261
272
  ##### Turkish
262
- Specs: No
263
- Abbreviations: No
264
- Stop Words: Yes
265
- Contractions: No
273
+ Specs: No
274
+ Abbreviations: No
275
+ Stop Words: Yes
276
+ Contractions: No
266
277
 
267
278
  ## Development
268
279
 
@@ -9,7 +9,15 @@ module PragmaticTokenizer
9
9
  ABBREVIATIONS = []
10
10
  STOP_WORDS = []
11
11
  CONTRACTIONS = {}
12
+
13
+ class SingleQuotes
14
+ def handle_single_quotes(text)
15
+ text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
16
+ text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
17
+ # Separate right single quotes
18
+ text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
19
+ end
20
+ end
12
21
  end
13
22
  end
14
23
  end
15
-
@@ -86,6 +86,16 @@ module PragmaticTokenizer
86
86
  "will-o'-the-wisp" => "will-of-the-wisp",
87
87
  "'twas" => "it was"
88
88
  }
89
+ class SingleQuotes
90
+ def handle_single_quotes(text)
91
+ text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
92
+ # Convert left quotes to special character except for 'Twas or 'twas
93
+ text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
94
+ text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
95
+ # Separate right single quotes
96
+ text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
97
+ end
98
+ end
89
99
  end
90
100
  end
91
101
  end
@@ -5,6 +5,16 @@ module PragmaticTokenizer
5
5
  ABBREVIATIONS = ["a.c.n", "a.m", "al", "ann", "apr", "art", "auj", "av", "b.p", "boul", "c.-à-d", "c.n", "c.n.s", "c.p.i", "c.q.f.d", "c.s", "ca", "cf", "ch.-l", "chap", "co", "contr", "dir", "e.g", "e.v", "env", "etc", "ex", "fasc", "fig", "fr", "fém", "hab", "i.e", "ibid", "id", "inf", "l.d", "lib", "ll.aa", "ll.aa.ii", "ll.aa.rr", "ll.aa.ss", "ll.ee", "ll.mm", "ll.mm.ii.rr", "loc.cit", "ltd", "masc", "mm", "ms", "n.b", "n.d", "n.d.a", "n.d.l.r", "n.d.t", "n.p.a.i", "n.s", "n/réf", "nn.ss", "p.c.c", "p.ex", "p.j", "p.s", "pl", "pp", "r.-v", "r.a.s", "r.i.p", "r.p", "s.a", "s.a.i", "s.a.r", "s.a.s", "s.e", "s.m", "s.m.i.r", "s.s", "sec", "sect", "sing", "sq", "sqq", "ss", "suiv", "sup", "suppl", "t.s.v.p", "tél", "vb", "vol", "vs", "x.o", "z.i", "éd"]
6
6
  STOP_WORDS = ["a", "à", "â", "abord", "afin", "ah", "ai", "aie", "ainsi", "allaient", "allo", "allô", "allons", "après", "assez", "attendu", "au", "aucun", "aucune", "aujourd", "aujourd'hui", "auquel", "aura", "auront", "aussi", "autre", "autres", "aux", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avoir", "ayant", "b", "bah", "beaucoup", "bien", "bigre", "boum", "bravo", "brrr", "c", "ça", "car", "ce", "ceci", "cela", "celle", "celle-ci", "celle-là", "celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "cent", "cependant", "certain", "certaine", "certaines", "certains", "certes", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "chacun", "chaque", "cher", "chère", "chères", "chers", "chez", "chiche", "chut", "ci", "cinq", "cinquantaine", "cinquante", "cinquantième", "cinquième", "clac", "clic", "combien", "comme", "comment", "compris", "concernant", "contre", "couic", "crac", "d", "da", "dans", "de", "debout", "dedans", "dehors", "delà", "depuis", "derrière", "des", "dès", "désormais", "desquelles", "desquels", "dessous", "dessus", "deux", "deuxième", "deuxièmement", "devant", "devers", "devra", "différent", "différente", "différentes", "différents", "dire", "divers", "diverse", "diverses", "dix", "dix-huit", "dixième", "dix-neuf", "dix-sept", "doit", "doivent", "donc", "dont", "douze", "douzième", "dring", "du", "duquel", "durant", "e", "effet", "eh", "elle", "elle-même", "elles", "elles-mêmes", "en", "encore", "entre", "envers", "environ", "es", "ès", "est", "et", "etant", "étaient", "étais", "était", "étant", "etc", "été", "etre", "être", "eu", "euh", "eux", "eux-mêmes", "excepté", "f", "façon", "fais", "faisaient", "faisant", "fait", "feront", "fi", "flac", "floc", "font", "g", "gens", "h", "ha", "hé", "hein", "hélas", "hem", "hep", "hi", "ho", "holà", "hop", "hormis", "hors", "hou", "houp", "hue", "hui", "huit", "huitième", "hum", "hurrah", "i", "il", "ils", "importe", "j", "je", "jusqu", "jusque", "k", "l", "la", "là", "laquelle", "las", "le", "lequel", "les", "lès", "lesquelles", "lesquels", "leur", "leurs", "longtemps", "lorsque", "lui", "lui-même", "m", "ma", "maint", "mais", "malgré", "me", "même", "mêmes", "merci", "mes", "mien", "mienne", "miennes", "miens", "mille", "mince", "moi", "moi-même", "moins", "mon", "moyennant", "n", "na", "ne", "néanmoins", "neuf", "neuvième", "ni", "nombreuses", "nombreux", "non", "nos", "notre", "nôtre", "nôtres", "nous", "nous-mêmes", "nul", "o", "o|", "ô", "oh", "ohé", "olé", "ollé", "on", "ont", "onze", "onzième", "ore", "ou", "où", "ouf", "ouias", "oust", "ouste", "outre", "p", "paf", "pan", "par", "parmi", "partant", "particulier", "particulière", "particulièrement", "pas", "passé", "pendant", "personne", "peu", "peut", "peuvent", "peux", "pff", "pfft", "pfut", "pif", "plein", "plouf", "plus", "plusieurs", "plutôt", "pouah", "pour", "pourquoi", "premier", "première", "premièrement", "près", "proche", "psitt", "puisque", "q", "qu", "quand", "quant", "quanta", "quant-à-soi", "quarante", "quatorze", "quatre", "quatre-vingt", "quatrième", "quatrièmement", "que", "quel", "quelconque", "quelle", "quelles", "quelque", "quelques", "quelqu'un", "quels", "qui", "quiconque", "quinze", "quoi", "quoique", "r", "revoici", "revoilà", "rien", "s", "sa", "sacrebleu", "sans", "sapristi", "sauf", "se", "seize", "selon", "sept", "septième", "sera", "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "six", "sixième", "soi", "soi-même", "soit", "soixante", "son", "sont", "sous", "stop", "suis", "suivant", "sur", "surtout", "t", "ta", "tac", "tant", "te", "té", "tel", "telle", "tellement", "telles", "tels", "tenant", "tes", "tic", "tien", "tienne", "tiennes", "tiens", "toc", "toi", "toi-même", "ton", "touchant", "toujours", "tous", "tout", "toute", "toutes", "treize", "trente", "très", "trois", "troisième", "troisièmement", "trop", "tsoin", "tsouin", "tu", "u", "un", "une", "unes", "uns", "v", "va", "vais", "vas", "vé", "vers", "via", "vif", "vifs", "vingt", "vivat", "vive", "vives", "vlan", "voici", "voilà", "vont", "vos", "votre", "vôtre", "vôtres", "vous", "vous-mêmes", "vu", "w", "x", "y", "z", "zut", "alors", "aucuns", "bon", "devrait", "dos", "droite", "début", "essai", "faites", "fois", "force", "haut", "ici", "juste", "maintenant", "mine", "mot", "nommés", "nouveaux", "parce", "parole", "personnes", "pièce", "plupart", "seulement", "soyez", "sujet", "tandis", "valeur", "voie", "voient", "état", "étions", "d'un", "d'une"]
7
7
  CONTRACTIONS = {}
8
+
9
+ class SingleQuotes
10
+ def handle_single_quotes(text)
11
+ text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
12
+ text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ') || text
13
+ text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
14
+ text.gsub!(/l\'/, '\1 l\' \2') || text
15
+ text.gsub!(/L\'/, '\1 L\' \2') || text
16
+ end
17
+ end
8
18
  end
9
19
  end
10
- end
20
+ end
@@ -24,18 +24,17 @@ module PragmaticTokenizer
24
24
 
25
25
  def convert_dbl_quotes(text)
26
26
  # Convert left double quotes to special character
27
- text.gsub!(/"(?=.*\w)/o, ' ' + convert_punct_to_sym('"') + ' ') || text
27
+ text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
28
28
  # Convert remaining quotes to special character
29
- text.gsub!(/"/, ' ' + convert_punct_to_sym('"') + ' ') || text
29
+ text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
30
30
  end
31
31
 
32
32
  def convert_sgl_quotes(text)
33
- text.gsub!(/`(?!`)(?=.*\w)/o, ' ' + convert_punct_to_sym("'") + ' ') || text
34
- # Convert left quotes to special character except for 'Twas or 'twas
35
- text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + convert_punct_to_sym("'") + ' ' : ' ' + convert_punct_to_sym("'") + ' ' } || text
36
- text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + convert_punct_to_sym("'")) || text
37
- # Separate right single quotes
38
- text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + convert_punct_to_sym("'") + ' ' } || text
33
+ if defined? @language::SingleQuotes
34
+ @language::SingleQuotes.new.handle_single_quotes(text)
35
+ else
36
+ PragmaticTokenizer::Languages::Common::SingleQuotes.new.handle_single_quotes(text)
37
+ end
39
38
  end
40
39
 
41
40
  def shift_multiple_dash(text)
@@ -71,7 +70,7 @@ module PragmaticTokenizer
71
70
  !(/\A\d+/ == text.partition(':').last[0]) &&
72
71
  !(/\A\d+/ == text.partition(':').first[-1])
73
72
  # Ignore web addresses
74
- text.gsub!(/(?<=[http|https]):(?=\/\/)/, convert_punct_to_sym(":")) || text
73
+ text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
75
74
  text.gsub!(/:/o, ' :') || text
76
75
  end
77
76
 
@@ -124,10 +123,6 @@ module PragmaticTokenizer
124
123
  cleaned_tokens
125
124
  end
126
125
 
127
- def convert_punct_to_sym(punctuation)
128
- PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[punctuation]
129
- end
130
-
131
126
  def convert_sym_to_punct(token)
132
127
  symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token)
133
128
  if symbol.nil?
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.6"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-01-06 00:00:00.000000000 Z
11
+ date: 2016-01-07 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler