pragmatic_tokenizer 0.1.4 → 0.1.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 77bf20ef9b491ecf72bd541712fda8338e4f318e
-  data.tar.gz: 6a8c8464d224532a05f80832142ef9a6b0b1c26b
+  metadata.gz: a52658ccd583baac9e69b37649e5403ad6f3549b
+  data.tar.gz: 95335fab907589faa066f7a76bd8a6df4d9f2f70
 SHA512:
-  metadata.gz: 93ad61db52cb42f5ec17cfec971bb1505e7b8d2d9e2c40713be01f46fc0eebc2c8de302ad36c90bc9864696ba4b895050f9041d2bd0d2622b8b4f5d9d94118f5
-  data.tar.gz: f6ef9861babfc3e28ca90624e396f821cbeaa09da189f3ecec60a0724b15538d39afe920b1808cbb6a1f527044fe761745947af551d321cab79bdfe11b418f80
+  metadata.gz: f7b42942e2676cc22a807938244fc38b7e4d4097bcdb3a6f0c08fe94fccc923cce50e8328e91efdfdd93f758771dda08b3ea432bb67099536630d4a9d9d9f4e9
+  data.tar.gz: d78a40443dc9948866d1f271e47af355a8838e38dbd932d282d7b0c905ad06dfb392f65ba7cc2df6a891098ff52b9a795317b1a458363214930c08650cc6ef96
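These are the standard RubyGems digests of the two archives packed inside the `.gem`. A minimal sketch of recomputing them for verification (the local paths assume the `.gem` has been unpacked with `tar -xf` and are illustrative):

```ruby
require 'digest'

# Compare the output against the SHA1/SHA512 entries in checksums.yaml.
puts Digest::SHA1.hexdigest(File.binread('metadata.gz'))
puts Digest::SHA512.hexdigest(File.binread('data.tar.gz'))
```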
data/README.md CHANGED
@@ -61,7 +61,7 @@ Or install it yourself as:
 ##### `clean`
 **default** = `'false'`
 - `true`
-  Removes tokens consisting of only hypens or underscores as well as some special characters (®, ©, ™).
+  Removes tokens consisting of only hyphens, underscores, or periods, as well as some special characters (®, ©, ™).
 - `false`
   Leaves tokens as is.
 
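A hedged usage sketch of the updated `clean` behavior, which now also drops tokens made of only periods (the sample input and expected output are illustrative, not taken from the gem's specs):

```ruby
require 'pragmatic_tokenizer'

text = "Some dashes ---- some dots .... and a mark ®"
PragmaticTokenizer::Tokenizer.new(text, clean: true).tokenize
# => ["some", "dashes", "some", "dots", "and", "a", "mark"]
```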
@@ -77,7 +77,7 @@ Or install it yourself as:
 <hr>
 
 ##### `minimum_length`
-**default** = `0`
+**default** = `0`
 The minimum number of characters a token should be.
 
 **Example Usage**
@@ -122,6 +122,148 @@ PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
 # => ["minimum", "length"]
 ```
 
+## Language Support
+
+The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated. *N.B. Contractions might not be applicable to all of the languages below; in that case the CONTRACTIONS hash should stay empty.*
+
+##### English
+Specs: Yes
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: Yes
+
+##### Arabic
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Bulgarian
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Catalan
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Czech
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Danish
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### German
+Specs: More needed
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Finnish
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### French
+Specs: More needed
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Greek
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Indonesian
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Italian
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Latvian
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Norwegian
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Persian
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Polish
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Portuguese
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Romanian
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Russian
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Slovak
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Spanish
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: Yes
+
+##### Swedish
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Turkish
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
 ## Development
 
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
data/lib/pragmatic_tokenizer/languages/common.rb CHANGED
@@ -2,7 +2,7 @@ module PragmaticTokenizer
   module Languages
     module Common
       PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»']
-      PUNCTUATION_MAP = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', "☮", '', '', '', '', '', '', '']
+      PUNCTUATION_MAP = { "。" => "", "." => "", "." => "", "!" => "", "!" => "", "?" => "", "?" => "", "、" => "", "¡" => "", "¿" => "", "„" => "", "“" => "", "[" => "", "]" => "", "\"" => "", "#" => "", "$" => "", "%" => "", "&" => "", "(" => "", ")" => "", "*" => "", "+" => "", "," => "", ":" => "", ";" => "", "<" => "", "=" => "", ">" => "", "@" => "", "^" => "", "_" => "", "`" => "", "'" => "☮", "{" => "", "|" => "", "}" => "", "~" => "", "-" => "", "«" => "", "»" => "" }
       SEMI_PUNCTUATION = ['。', '.', '.']
       ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
       SPECIAL_CHARACTERS = ['®', '©', '™']
@@ -11,4 +11,5 @@ module PragmaticTokenizer
       CONTRACTIONS = {}
     end
   end
-end
+end
+
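Replacing the parallel array with a hash makes both directions of the punctuation mapping direct lookups. A minimal sketch using the one pairing that survives verbatim above (`"'" => "☮"`); in the released gem each punctuation mark maps to its own unique placeholder symbol:

```ruby
map = { "'" => "☮" } # a single PUNCTUATION_MAP entry, for illustration

map["'"]     # => "☮"  forward lookup (formerly PUNCTUATION.index plus an array index)
map.key("☮") # => "'"  inverse lookup used when restoring punctuation
```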
data/lib/pragmatic_tokenizer/languages/spanish.rb CHANGED
@@ -4,7 +4,10 @@ module PragmaticTokenizer
       include Languages::Common
       ABBREVIATIONS = ["a", "a.c", "a/c", "abr", "adj", "admón", "aero", "afmo", "ago", "almte", "ambi", "an", "anfi", "ante", "anti", "ap", "apdo", "archi", "arci", "arq", "art", "atte", "auto", "av", "avda", "bco", "bi", "bibl", "bien", "bis", "bs. as", "c", "c.f", "c.g", "c/c", "c/u", "cap", "cc.aa", "cdad", "cm", "co", "com", "con", "contra", "cra", "crio", "cta", "cuadri", "cuasi", "cuatri", "cv", "d.e.p", "da", "dcha", "dcho", "de", "deci", "dep", "des", "di", "dic", "dicc", "dir", "dis", "dn", "doc", "dom", "dpto", "dr", "dra", "dto", "ecto", "ee", "ej", "en", "endo", "entlo", "entre", "epi", "equi", "esq", "etc", "ex", "excmo", "ext", "extra", "f.c", "fca", "fdo", "febr", "ff. aa", "ff.cc", "fig", "fil", "fra", "g.p", "g/p", "geo", "gob", "gr", "gral", "grs", "hemi", "hetero", "hiper", "hipo", "hnos", "homo", "hs", "i", "igl", "iltre", "im", "imp", "impr", "impto", "in", "incl", "infra", "ing", "inst", "inter", "intra", "iso", "izdo", "izq", "izqdo", "j.c", "jue", "jul", "jun", "kg", "km", "lcdo", "ldo", "let", "lic", "ltd", "lun", "macro", "mar", "may", "mega", "mg", "micro", "min", "mini", "mié", "mm", "mono", "mt", "multi", "máx", "mín", "n. del t", "n.b", "neo", "no", "nos", "nov", "ntra. sra", "núm", "oct", "omni", "p", "p.a", "p.d", "p.ej", "p.v.p", "para", "pen", "ph", "ph.d", "pluri", "poli", "pos", "post", "pp", "ppal", "pre", "prev", "pro", "prof", "prov", "pseudo", "ptas", "pts", "pza", "pág", "págs", "párr", "párrf", "q.e.g.e", "q.e.p.d", "q.e.s.m", "re", "reg", "rep", "retro", "rr. hh", "rte", "s", "s. a", "s.a.r", "s.e", "s.l", "s.r.c", "s.r.l", "s.s.s", "s/n", "sdad", "seg", "semi", "sept", "seudo", "sig", "sobre", "sr", "sra", "sres", "srta", "sta", "sto", "sub", "super", "supra", "sáb", "t.v.e", "tamb", "tel", "tfno", "trans", "tras", "tri", "ud", "uds", "ulter", "ultra", "un", "uni", "univ", "uu", "v.b", "v.e", "vd", "vds", "vice", "vid", "vie", "vol", "vs", "vto", "yuxta"]
       STOP_WORDS = ["algún", "alguna", "algunas", "alguno", "algunos", "ambos", "ampleamos", "ante", "antes", "aquel", "aquellas", "aquellos", "aqui", "arriba", "atras", "bajo", "bastante", "bien", "cada", "cierta", "ciertas", "cierto", "ciertos", "como", "con", "conseguimos", "conseguir", "consigo", "consigue", "consiguen", "consigues", "cual", "cuando", "dentro", "desde", "donde", "dos", "el", "ellas", "ellos", "empleais", "emplean", "emplear", "empleas", "empleo", "en", "encima", "entonces", "entre", "era", "eramos", "eran", "eras", "eres", "es", "esta", "estaba", "estado", "estais", "estamos", "estan", "estoy", "fin", "fue", "fueron", "fui", "fuimos", "gueno", "ha", "hace", "haceis", "hacemos", "hacen", "hacer", "haces", "hago", "incluso", "intenta", "intentais", "intentamos", "intentan", "intentar", "intentas", "intento", "ir", "la", "largo", "las", "lo", "los", "mientras", "mio", "modo", "muchos", "muy", "nos", "nosotros", "otro", "para", "pero", "podeis", "podemos", "poder", "podria", "podriais", "podriamos", "podrian", "podrias", "por", "por qué", "porque", "primero", "puede", "pueden", "puedo", "quien", "sabe", "sabeis", "sabemos", "saben", "saber", "sabes", "ser", "si", "siendo", "sin", "sobre", "sois", "solamente", "solo", "somos", "soy", "su", "sus", "también", "teneis", "tenemos", "tener", "tengo", "tiempo", "tiene", "tienen", "todo", "trabaja", "trabajais", "trabajamos", "trabajan", "trabajar", "trabajas", "trabajo", "tras", "tuyo", "ultimo", "un", "una", "unas", "uno", "unos", "usa", "usais", "usamos", "usan", "usar", "usas", "uso", "va", "vais", "valor", "vamos", "van", "vaya", "verdad", "verdadera", "verdadero", "vosotras", "vosotros", "voy", "yo", "él", "ésta", "éstas", "éste", "éstos", "última", "últimas", "último", "últimos", "a", "añadió", "aún", "actualmente", "adelante", "además", "afirmó", "agregó", "ahí", "ahora", "al", "algún", "algo", "alrededor", "anterior", "apenas", "aproximadamente", "aquí", "así", "aseguró", "aunque", "ayer", "buen", "buena", "buenas", "bueno", "buenos", "cómo", "casi", "cerca", "cinco", "comentó", "conocer", "consideró", "considera", "contra", "cosas", "creo", "cuales", "cualquier", "cuanto", "cuatro", "cuenta", "da", "dado", "dan", "dar", "de", "debe", "deben", "debido", "decir", "dejó", "del", "demás", "después", "dice", "dicen", "dicho", "dieron", "diferente", "diferentes", "dijeron", "dijo", "dio", "durante", "e", "ejemplo", "ella", "ello", "embargo", "encuentra", "esa", "esas", "ese", "eso", "esos", "está", "están", "estaban", "estar", "estará", "estas", "este", "esto", "estos", "estuvo", "ex", "existe", "existen", "explicó", "expresó", "fuera", "gran", "grandes", "había", "habían", "haber", "habrá", "hacerlo", "hacia", "haciendo", "han", "hasta", "hay", "haya", "he", "hecho", "hemos", "hicieron", "hizo", "hoy", "hubo", "igual", "indicó", "informó", "junto", "lado", "le", "les", "llegó", "lleva", "llevar", "luego", "lugar", "más", "manera", "manifestó", "mayor", "me", "mediante", "mejor", "mencionó", "menos", "mi", "misma", "mismas", "mismo", "mismos", "momento", "mucha", "muchas", "mucho", "nada", "nadie", "ni", "ningún", "ninguna", "ningunas", "ninguno", "ningunos", "no", "nosotras", "nuestra", "nuestras", "nuestro", "nuestros", "nueva", "nuevas", "nuevo", "nuevos", "nunca", "o", "ocho", "otra", "otras", "otros", "parece", "parte", "partir", "pasada", "pasado", "pesar", "poca", "pocas", "poco", "pocos", "podrá", "podrán", "podría", "podrían", "poner", "posible", "próximo", "próximos", "primer", "primera", "primeros", "principalmente", "propia", "propias", "propio", "propios", "pudo", "pueda", "pues", "qué", "que", "quedó", "queremos", "quién", "quienes", "quiere", "realizó", "realizado", "realizar", "respecto", "sí", "sólo", "se", "señaló", "sea", "sean", "según", "segunda", "segundo", "seis", "será", "serán", "sería", "sido", "siempre", "siete", "sigue", "siguiente", "sino", "sola", "solas", "solos", "son", "tal", "tampoco", "tan", "tanto", "tenía", "tendrá", "tendrán", "tenga", "tenido", "tercera", "toda", "todas", "todavía", "todos", "total", "trata", "través", "tres", "tuvo", "usted", "varias", "varios", "veces", "ver", "vez", "y", "ya"]
-      CONTRACTIONS = {}
+      CONTRACTIONS = {
+        "al" => "a el",
+        "del" => "de el"
+      }
     end
   end
 end
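A hedged sketch of what the new Spanish CONTRACTIONS entries enable through the existing `expand_contractions` option (the output shown is illustrative and depends on the tokenizer's other defaults):

```ruby
require 'pragmatic_tokenizer'

text = "Fuimos al cine después del trabajo."
PragmaticTokenizer::Tokenizer.new(text, language: 'es', expand_contractions: true).tokenize
# => ["fuimos", "a", "el", "cine", "después", "de", "el", "trabajo", "."]
```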
data/lib/pragmatic_tokenizer/processor.rb CHANGED
@@ -124,16 +124,17 @@ module PragmaticTokenizer
       cleaned_tokens
     end
 
-    def convert_punct_to_sym(p)
-      index = PragmaticTokenizer::Languages::Common::PUNCTUATION.index(p)
-      PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[index]
+    def convert_punct_to_sym(punctuation)
+      PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[punctuation]
     end
 
-    def convert_sym_to_punct(p)
-      PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.each_with_index do |m, i|
-        return p.gsub!(m, PragmaticTokenizer::Languages::Common::PUNCTUATION[i]) if p.include?(m)
+    def convert_sym_to_punct(token)
+      symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token)
+      if symbol.nil?
+        return token
+      else
+        return token.gsub!(symbol[0], PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(symbol[0]))
       end
-      p
     end
   end
 end
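A minimal sketch of the round trip the rewritten pair of methods performs, again using the `"'" => "☮"` pairing visible in this diff. Note that `String#gsub!` returns `nil` when nothing is replaced, but the `else` branch only runs after the regex has matched, so a substitution always occurs:

```ruby
require 'pragmatic_tokenizer'

map = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
placeholder = map["'"]          # => "☮"      punctuation to symbol
"don☮t".gsub("☮", map.key("☮")) # => "don't"  symbol back to punctuation
```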
data/lib/pragmatic_tokenizer/tokenizer.rb CHANGED
@@ -4,8 +4,8 @@ require 'pragmatic_tokenizer/languages'
 module PragmaticTokenizer
   class Tokenizer
 
-    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length
-    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0)
+    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals
+    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false)
       unless punctuation.eql?('all') ||
              punctuation.eql?('semi') ||
              punctuation.eql?('none') ||
@@ -33,11 +33,12 @@ module PragmaticTokenizer
       @clean = clean
       @remove_numbers = remove_numbers
       @minimum_length = minimum_length
+      @remove_roman_numerals = remove_roman_numerals
     end
 
     def tokenize
       return [] unless text
-      remove_short_tokens(delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text)))))))
+      remove_short_tokens(delete_roman_numerals(delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text))))))))
     end
 
     private
@@ -54,12 +55,17 @@ module PragmaticTokenizer
 
     def delete_numbers(tokens)
       return tokens unless remove_numbers
-      tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") }
+      tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
+    end
+
+    def delete_roman_numerals(tokens)
+      return tokens unless remove_roman_numerals
+      tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") } if remove_roman_numerals
     end
 
     def cleaner(tokens)
       return tokens unless clean
-      tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) }
+      tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) || t =~ /\A\.{2,}\z/ }
     end
 
     def remove_punctuation(tokens)
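A hedged usage sketch of the new `remove_roman_numerals` option, now split out from `remove_numbers` (the expected output is illustrative):

```ruby
require 'pragmatic_tokenizer'

text = "Chapter iv follows chapter iii."
PragmaticTokenizer::Tokenizer.new(text, remove_roman_numerals: true).tokenize
# => ["chapter", "follows", "chapter", "."]
```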
data/lib/pragmatic_tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "0.1.4"
+  VERSION = "0.1.5"
 end
data/pragmatic_tokenizer.gemspec CHANGED
@@ -21,4 +21,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "bundler", "~> 1.9"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec"
+  spec.add_development_dependency "stackprof"
 end
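The new stackprof development dependency suggests profiling work on the tokenizer. A hedged sketch of one way it could be used (this setup is an assumption, not something shipped in the gem):

```ruby
require 'stackprof'
require 'pragmatic_tokenizer'

# Sample CPU time spent tokenizing and print the hottest frames.
profile = StackProf.run(mode: :cpu) do
  1_000.times { PragmaticTokenizer::Tokenizer.new("Hello, profiling world.").tokenize }
end
StackProf::Report.new(profile).print_text
```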
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.1.5
 platform: ruby
 authors:
 - Kevin S. Dias
@@ -52,6 +52,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: stackprof
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: A multilingual tokenizer to split a string into tokens.
 email:
 - diasks2@gmail.com