pragmatic_tokenizer 0.1.4 → 0.1.5

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 77bf20ef9b491ecf72bd541712fda8338e4f318e
-  data.tar.gz: 6a8c8464d224532a05f80832142ef9a6b0b1c26b
+  metadata.gz: a52658ccd583baac9e69b37649e5403ad6f3549b
+  data.tar.gz: 95335fab907589faa066f7a76bd8a6df4d9f2f70
 SHA512:
-  metadata.gz: 93ad61db52cb42f5ec17cfec971bb1505e7b8d2d9e2c40713be01f46fc0eebc2c8de302ad36c90bc9864696ba4b895050f9041d2bd0d2622b8b4f5d9d94118f5
-  data.tar.gz: f6ef9861babfc3e28ca90624e396f821cbeaa09da189f3ecec60a0724b15538d39afe920b1808cbb6a1f527044fe761745947af551d321cab79bdfe11b418f80
+  metadata.gz: f7b42942e2676cc22a807938244fc38b7e4d4097bcdb3a6f0c08fe94fccc923cce50e8328e91efdfdd93f758771dda08b3ea432bb67099536630d4a9d9d9f4e9
+  data.tar.gz: d78a40443dc9948866d1f271e47af355a8838e38dbd932d282d7b0c905ad06dfb392f65ba7cc2df6a891098ff52b9a795317b1a458363214930c08650cc6ef96
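These are the standard RubyGems digests of the two archives packed inside the `.gem`. A minimal sketch of recomputing them for verification (the local paths assume the `.gem` has been unpacked with `tar -xf` and are illustrative):

```ruby
require 'digest'

# Compare the output against the SHA1/SHA512 entries in checksums.yaml.
puts Digest::SHA1.hexdigest(File.binread('metadata.gz'))
puts Digest::SHA512.hexdigest(File.binread('data.tar.gz'))
```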
data/README.md CHANGED
@@ -61,7 +61,7 @@ Or install it yourself as:
 ##### `clean`
 **default** = `'false'`
 - `true`
-  Removes tokens consisting of only hypens or underscores as well as some special characters (®, ©, ™).
+  Removes tokens consisting of only hyphens, underscores, or periods, as well as some special characters (®, ©, ™).
 - `false`
   Leaves tokens as is.
 
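A hedged usage sketch of the updated `clean` behavior, which now also drops tokens made of only periods (the sample input and expected output are illustrative, not taken from the gem's specs):

```ruby
require 'pragmatic_tokenizer'

text = "Some dashes ---- some dots .... and a mark ®"
PragmaticTokenizer::Tokenizer.new(text, clean: true).tokenize
# => ["some", "dashes", "some", "dots", "and", "a", "mark"]
```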
@@ -77,7 +77,7 @@ Or install it yourself as:
 <hr>
 
 ##### `minimum_length`
-**default** = `0`
+**default** = `0`
 The minimum number of characters a token should be.
 
 **Example Usage**
@@ -122,6 +122,148 @@ PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
 # => ["minimum", "length"]
 ```
 
+## Language Support
+
+The following lists the current level of support for different languages. Pull requests or help for any languages that are not fully supported would be greatly appreciated. *N.B. Contractions might not be applicable to all of the languages below; in that case the CONTRACTIONS hash should stay empty.*
+
+##### English
+Specs: Yes
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: Yes
+
+##### Arabic
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Bulgarian
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Catalan
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Czech
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Danish
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### German
+Specs: More needed
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Finnish
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### French
+Specs: More needed
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Greek
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Indonesian
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Italian
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Latvian
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Norwegian
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Persian
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Polish
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Portuguese
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Romanian
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Russian
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: No
+
+##### Slovak
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Spanish
+Specs: No
+Abbreviations: Yes
+Stop Words: Yes
+Contractions: Yes
+
+##### Swedish
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
+##### Turkish
+Specs: No
+Abbreviations: No
+Stop Words: Yes
+Contractions: No
+
 ## Development
 
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
data/lib/pragmatic_tokenizer/languages/common.rb CHANGED
@@ -2,7 +2,7 @@ module PragmaticTokenizer
   module Languages
     module Common
       PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»']
-      PUNCTUATION_MAP = ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', "☮", '', '', '', '', '', '', '']
+      PUNCTUATION_MAP = { "。" => "", "." => "", "." => "", "!" => "", "!" => "", "?" => "", "?" => "", "、" => "", "¡" => "", "¿" => "", "„" => "", "“" => "", "[" => "", "]" => "", "\"" => "", "#" => "", "$" => "", "%" => "", "&" => "", "(" => "", ")" => "", "*" => "", "+" => "", "," => "", ":" => "", ";" => "", "<" => "", "=" => "", ">" => "", "@" => "", "^" => "", "_" => "", "`" => "", "'" => "☮", "{" => "", "|" => "", "}" => "", "~" => "", "-" => "", "«" => "", "»" => "" }
       SEMI_PUNCTUATION = ['。', '.', '.']
       ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
       SPECIAL_CHARACTERS = ['®', '©', '™']
@@ -11,4 +11,5 @@ module PragmaticTokenizer
       CONTRACTIONS = {}
     end
   end
-end
+end
+
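Replacing the parallel array with a hash makes both directions of the punctuation mapping direct lookups. A minimal sketch using the one pairing that survives verbatim above (`"'" => "☮"`); in the released gem each punctuation mark maps to its own unique placeholder symbol:

```ruby
map = { "'" => "☮" } # a single PUNCTUATION_MAP entry, for illustration

map["'"]     # => "☮"  forward lookup (formerly PUNCTUATION.index plus an array index)
map.key("☮") # => "'"  inverse lookup used when restoring punctuation
```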
data/lib/pragmatic_tokenizer/languages/spanish.rb CHANGED
@@ -4,7 +4,10 @@ module PragmaticTokenizer
       include Languages::Common
       ABBREVIATIONS = ["a", "a.c", "a/c", "abr", "adj", "admón", "aero", "afmo", "ago", "almte", "ambi", "an", "anfi", "ante", "anti", "ap", "apdo", "archi", "arci", "arq", "art", "atte", "auto", "av", "avda", "bco", "bi", "bibl", "bien", "bis", "bs. as", "c", "c.f", "c.g", "c/c", "c/u", "cap", "cc.aa", "cdad", "cm", "co", "com", "con", "contra", "cra", "crio", "cta", "cuadri", "cuasi", "cuatri", "cv", "d.e.p", "da", "dcha", "dcho", "de", "deci", "dep", "des", "di", "dic", "dicc", "dir", "dis", "dn", "doc", "dom", "dpto", "dr", "dra", "dto", "ecto", "ee", "ej", "en", "endo", "entlo", "entre", "epi", "equi", "esq", "etc", "ex", "excmo", "ext", "extra", "f.c", "fca", "fdo", "febr", "ff. aa", "ff.cc", "fig", "fil", "fra", "g.p", "g/p", "geo", "gob", "gr", "gral", "grs", "hemi", "hetero", "hiper", "hipo", "hnos", "homo", "hs", "i", "igl", "iltre", "im", "imp", "impr", "impto", "in", "incl", "infra", "ing", "inst", "inter", "intra", "iso", "izdo", "izq", "izqdo", "j.c", "jue", "jul", "jun", "kg", "km", "lcdo", "ldo", "let", "lic", "ltd", "lun", "macro", "mar", "may", "mega", "mg", "micro", "min", "mini", "mié", "mm", "mono", "mt", "multi", "máx", "mín", "n. del t", "n.b", "neo", "no", "nos", "nov", "ntra. sra", "núm", "oct", "omni", "p", "p.a", "p.d", "p.ej", "p.v.p", "para", "pen", "ph", "ph.d", "pluri", "poli", "pos", "post", "pp", "ppal", "pre", "prev", "pro", "prof", "prov", "pseudo", "ptas", "pts", "pza", "pág", "págs", "párr", "párrf", "q.e.g.e", "q.e.p.d", "q.e.s.m", "re", "reg", "rep", "retro", "rr. hh", "rte", "s", "s. a", "s.a.r", "s.e", "s.l", "s.r.c", "s.r.l", "s.s.s", "s/n", "sdad", "seg", "semi", "sept", "seudo", "sig", "sobre", "sr", "sra", "sres", "srta", "sta", "sto", "sub", "super", "supra", "sáb", "t.v.e", "tamb", "tel", "tfno", "trans", "tras", "tri", "ud", "uds", "ulter", "ultra", "un", "uni", "univ", "uu", "v.b", "v.e", "vd", "vds", "vice", "vid", "vie", "vol", "vs", "vto", "yuxta"]
       STOP_WORDS = ["algún", "alguna", "algunas", "alguno", "algunos", "ambos", "ampleamos", "ante", "antes", "aquel", "aquellas", "aquellos", "aqui", "arriba", "atras", "bajo", "bastante", "bien", "cada", "cierta", "ciertas", "cierto", "ciertos", "como", "con", "conseguimos", "conseguir", "consigo", "consigue", "consiguen", "consigues", "cual", "cuando", "dentro", "desde", "donde", "dos", "el", "ellas", "ellos", "empleais", "emplean", "emplear", "empleas", "empleo", "en", "encima", "entonces", "entre", "era", "eramos", "eran", "eras", "eres", "es", "esta", "estaba", "estado", "estais", "estamos", "estan", "estoy", "fin", "fue", "fueron", "fui", "fuimos", "gueno", "ha", "hace", "haceis", "hacemos", "hacen", "hacer", "haces", "hago", "incluso", "intenta", "intentais", "intentamos", "intentan", "intentar", "intentas", "intento", "ir", "la", "largo", "las", "lo", "los", "mientras", "mio", "modo", "muchos", "muy", "nos", "nosotros", "otro", "para", "pero", "podeis", "podemos", "poder", "podria", "podriais", "podriamos", "podrian", "podrias", "por", "por qué", "porque", "primero", "puede", "pueden", "puedo", "quien", "sabe", "sabeis", "sabemos", "saben", "saber", "sabes", "ser", "si", "siendo", "sin", "sobre", "sois", "solamente", "solo", "somos", "soy", "su", "sus", "también", "teneis", "tenemos", "tener", "tengo", "tiempo", "tiene", "tienen", "todo", "trabaja", "trabajais", "trabajamos", "trabajan", "trabajar", "trabajas", "trabajo", "tras", "tuyo", "ultimo", "un", "una", "unas", "uno", "unos", "usa", "usais", "usamos", "usan", "usar", "usas", "uso", "va", "vais", "valor", "vamos", "van", "vaya", "verdad", "verdadera", "verdadero", "vosotras", "vosotros", "voy", "yo", "él", "ésta", "éstas", "éste", "éstos", "última", "últimas", "último", "últimos", "a", "añadió", "aún", "actualmente", "adelante", "además", "afirmó", "agregó", "ahí", "ahora", "al", "algún", "algo", "alrededor", "anterior", "apenas", "aproximadamente", "aquí", "así", "aseguró", "aunque", "ayer", "buen", "buena", "buenas", "bueno", "buenos", "cómo", "casi", "cerca", "cinco", "comentó", "conocer", "consideró", "considera", "contra", "cosas", "creo", "cuales", "cualquier", "cuanto", "cuatro", "cuenta", "da", "dado", "dan", "dar", "de", "debe", "deben", "debido", "decir", "dejó", "del", "demás", "después", "dice", "dicen", "dicho", "dieron", "diferente", "diferentes", "dijeron", "dijo", "dio", "durante", "e", "ejemplo", "ella", "ello", "embargo", "encuentra", "esa", "esas", "ese", "eso", "esos", "está", "están", "estaban", "estar", "estará", "estas", "este", "esto", "estos", "estuvo", "ex", "existe", "existen", "explicó", "expresó", "fuera", "gran", "grandes", "había", "habían", "haber", "habrá", "hacerlo", "hacia", "haciendo", "han", "hasta", "hay", "haya", "he", "hecho", "hemos", "hicieron", "hizo", "hoy", "hubo", "igual", "indicó", "informó", "junto", "lado", "le", "les", "llegó", "lleva", "llevar", "luego", "lugar", "más", "manera", "manifestó", "mayor", "me", "mediante", "mejor", "mencionó", "menos", "mi", "misma", "mismas", "mismo", "mismos", "momento", "mucha", "muchas", "mucho", "nada", "nadie", "ni", "ningún", "ninguna", "ningunas", "ninguno", "ningunos", "no", "nosotras", "nuestra", "nuestras", "nuestro", "nuestros", "nueva", "nuevas", "nuevo", "nuevos", "nunca", "o", "ocho", "otra", "otras", "otros", "parece", "parte", "partir", "pasada", "pasado", "pesar", "poca", "pocas", "poco", "pocos", "podrá", "podrán", "podría", "podrían", "poner", "posible", "próximo", "próximos", "primer", "primera", "primeros", "principalmente", "propia", "propias", "propio", "propios", "pudo", "pueda", "pues", "qué", "que", "quedó", "queremos", "quién", "quienes", "quiere", "realizó", "realizado", "realizar", "respecto", "sí", "sólo", "se", "señaló", "sea", "sean", "según", "segunda", "segundo", "seis", "será", "serán", "sería", "sido", "siempre", "siete", "sigue", "siguiente", "sino", "sola", "solas", "solos", "son", "tal", "tampoco", "tan", "tanto", "tenía", "tendrá", "tendrán", "tenga", "tenido", "tercera", "toda", "todas", "todavía", "todos", "total", "trata", "través", "tres", "tuvo", "usted", "varias", "varios", "veces", "ver", "vez", "y", "ya"]
-      CONTRACTIONS = {}
+      CONTRACTIONS = {
+        "al" => "a el",
+        "del" => "de el"
+      }
     end
   end
 end
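A hedged sketch of what the new Spanish CONTRACTIONS entries enable through the existing `expand_contractions` option (the output shown is illustrative and depends on the tokenizer's other defaults):

```ruby
require 'pragmatic_tokenizer'

text = "Fuimos al cine después del trabajo."
PragmaticTokenizer::Tokenizer.new(text, language: 'es', expand_contractions: true).tokenize
# => ["fuimos", "a", "el", "cine", "después", "de", "el", "trabajo", "."]
```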
data/lib/pragmatic_tokenizer/processor.rb CHANGED
@@ -124,16 +124,17 @@ module PragmaticTokenizer
       cleaned_tokens
     end
 
-    def convert_punct_to_sym(p)
-      index = PragmaticTokenizer::Languages::Common::PUNCTUATION.index(p)
-      PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[index]
+    def convert_punct_to_sym(punctuation)
+      PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[punctuation]
     end
 
-    def convert_sym_to_punct(p)
-      PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.each_with_index do |m, i|
-        return p.gsub!(m, PragmaticTokenizer::Languages::Common::PUNCTUATION[i]) if p.include?(m)
+    def convert_sym_to_punct(token)
+      symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token)
+      if symbol.nil?
+        return token
+      else
+        return token.gsub!(symbol[0], PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(symbol[0]))
       end
-      p
     end
   end
 end
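A minimal sketch of the round trip the rewritten pair of methods performs, again using the `"'" => "☮"` pairing visible in this diff. Note that `String#gsub!` returns `nil` when nothing is replaced, but the `else` branch only runs after the regex has matched, so a substitution always occurs:

```ruby
require 'pragmatic_tokenizer'

map = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
placeholder = map["'"]          # => "☮"      punctuation to symbol
"don☮t".gsub("☮", map.key("☮")) # => "don't"  symbol back to punctuation
```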
data/lib/pragmatic_tokenizer/tokenizer.rb CHANGED
@@ -4,8 +4,8 @@ require 'pragmatic_tokenizer/languages'
 module PragmaticTokenizer
   class Tokenizer
 
-    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length
-    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0)
+    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals
+    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false)
       unless punctuation.eql?('all') ||
              punctuation.eql?('semi') ||
              punctuation.eql?('none') ||
@@ -33,11 +33,12 @@ module PragmaticTokenizer
       @clean = clean
       @remove_numbers = remove_numbers
       @minimum_length = minimum_length
+      @remove_roman_numerals = remove_roman_numerals
     end
 
     def tokenize
       return [] unless text
-      remove_short_tokens(delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text)))))))
+      remove_short_tokens(delete_roman_numerals(delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text))))))))
     end
 
     private
@@ -54,12 +55,17 @@ module PragmaticTokenizer
 
     def delete_numbers(tokens)
       return tokens unless remove_numbers
-      tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") }
+      tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
+    end
+
+    def delete_roman_numerals(tokens)
+      return tokens unless remove_roman_numerals
+      tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") } if remove_roman_numerals
     end
 
     def cleaner(tokens)
       return tokens unless clean
-      tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) }
+      tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) || t =~ /\A\.{2,}\z/ }
     end
 
     def remove_punctuation(tokens)
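A hedged usage sketch of the new `remove_roman_numerals` option, now split out from `remove_numbers` (the expected output is illustrative):

```ruby
require 'pragmatic_tokenizer'

text = "Chapter iv follows chapter iii."
PragmaticTokenizer::Tokenizer.new(text, remove_roman_numerals: true).tokenize
# => ["chapter", "follows", "chapter", "."]
```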
data/lib/pragmatic_tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "0.1.4"
+  VERSION = "0.1.5"
 end
data/pragmatic_tokenizer.gemspec CHANGED
@@ -21,4 +21,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "bundler", "~> 1.9"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec"
+  spec.add_development_dependency "stackprof"
 end
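The new stackprof development dependency suggests profiling work on the tokenizer. A hedged sketch of one way it could be used (this setup is an assumption, not something shipped in the gem):

```ruby
require 'stackprof'
require 'pragmatic_tokenizer'

# Sample CPU time spent tokenizing and print the hottest frames.
profile = StackProf.run(mode: :cpu) do
  1_000.times { PragmaticTokenizer::Tokenizer.new("Hello, profiling world.").tokenize }
end
StackProf::Report.new(profile).print_text
```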
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.1.5
 platform: ruby
 authors:
 - Kevin S. Dias
@@ -52,6 +52,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: stackprof
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description: A multilingual tokenizer to split a string into tokens.
 email:
 - diasks2@gmail.com