RubyGems - pragmatic_tokenizer - Versions diffs - 1.6.0 → 2.0.0 - Mend

pragmatic_tokenizer 1.6.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

checksums.yaml +4 -4
data/lib/pragmatic_tokenizer/full_stop_separator.rb +4 -4
data/lib/pragmatic_tokenizer/languages/spanish.rb +1 -1
data/lib/pragmatic_tokenizer/post_processor.rb +1 -1
data/lib/pragmatic_tokenizer/pre_processor.rb +0 -5
data/lib/pragmatic_tokenizer/tokenizer.rb +12 -8
data/lib/pragmatic_tokenizer/version.rb +1 -1
data/pragmatic_tokenizer.gemspec +1 -1
data/spec/languages/english_spec.rb +0 -21
data/spec/performance_spec.rb +29 -27
metadata +6 -6

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 38306989cff2d5d9e437a4ff939e1a86943b5cc1
-  data.tar.gz: 5c513f3497b6097005342927d7914b698023112e
+  metadata.gz: e72eaf6ddb3c1c7d0d0dc5f57727f796e01f5280
+  data.tar.gz: 80511d5cbcc8c84d1b3a70b70c8ddd070b1f7170
 SHA512:
-  metadata.gz: 0c6f611b2fc1ee8b379743fe16e8e4ea0033e439f543bbb47a4224b6bd42b7f9e276faa7e3d89aaacfc6292babe9a85bf83d09fe4f4004f88de04ea42feb57b5
-  data.tar.gz: f8b952202a33dddddb8a414e2d1cf7fbbcd1cc20edd937b79f8982d8c65f6ee2b3c12a55db475697758e4612ff25aff5dfc77f88b36599f1f1a020ba6f7e059c
+  metadata.gz: 9b81069f61fecd39d437699bb23090af30bb3a4231b6c8523f0b0a41e1f77d7a511cd2d54997db802f77d4743aead0446bcc8afd3d449007b4e5c37212dd9b65
+  data.tar.gz: 6f5c29155a382aae49949b554d8854e0b74a2967a4d92f5cc4f8dc3ef787c45ee061d7aa6ed5c3f510dd133ea52f199ca226113e5e6530c5f852eae7a08556a5

data/lib/pragmatic_tokenizer/full_stop_separator.rb CHANGED Viewed

@@ -23,7 +23,7 @@ module PragmaticTokenizer
           if downcase
             abbreviation = abbr[w]
           else
-            abbreviation = abbr[Unicode.downcase(w)]
+            abbreviation = abbr[UnicodeCaseConverter::Converter.new(w).downcase]
           end
           unless abbreviation || w =~ /\A[a-z]\z/i ||
                  w =~ /[a-z](?:\.[a-z])+\z/i
@@ -35,11 +35,11 @@ module PragmaticTokenizer
         cleaned_tokens << tokens[i]
       end
       if downcase
-        abbreviation = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
+        abbr_included = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
       else
-        abbreviation = abbreviations.include?(Unicode.downcase(cleaned_tokens[-1]).chomp(".")) unless cleaned_tokens[-1].nil?
+        abbr_included = abbreviations.include?(UnicodeCaseConverter::Converter.new(cleaned_tokens[-1]).downcase.chomp(".")) unless cleaned_tokens[-1].nil?
       end
-      if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviation
+      if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbr_included
         cleaned_tokens[-1] = Regexp.last_match(1)
         cleaned_tokens.push '.'
       end

data/lib/pragmatic_tokenizer/languages/spanish.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module PragmaticTokenizer
   module Languages
     module Spanish
       include Languages::Common
-      ABBREVIATIONS = ["a", "a.c", "a/c", "abr", "adj", "admón", "aero", "afmo", "ago", "almte", "ambi", "an", "anfi", "ante", "anti", "ap", "apdo", "archi", "arci", "arq", "art", "atte", "auto", "av", "avda", "bco", "bi", "bibl", "bien", "bis", "bs. as", "c", "c.f", "c.g", "c/c", "c/u", "cap", "cc.aa", "cdad", "cm", "co", "com", "con", "contra", "cra", "crio", "cta", "cuadri", "cuasi", "cuatri", "cv", "d.e.p", "da", "das", "dcha", "dcho", "de", "deci", "dep", "des", "di", "dic", "dicc", "dir", "dis", "dn", "do", "doc", "dom", "dpto", "dr", "dra", "dto", "ecto", "ee", "ej", "en", "endo", "entlo", "entre", "epi", "equi", "esq", "etc", "ex", "excmo", "ext", "extra", "f.c", "fca", "fdo", "febr", "ff. aa", "ff.cc", "fig", "fil", "fra", "g.p", "g/p", "geo", "gob", "gr", "gral", "grs", "hemi", "hetero", "hiper", "hipo", "hnos", "homo", "hs", "i", "igl", "iltre", "im", "imp", "impr", "impto", "in", "incl", "infra", "ing", "inst", "inter", "intra", "iso", "izdo", "izq", "izqdo", "j.c", "jue", "jul", "jun", "kg", "km", "lcdo", "ldo", "let", "lic", "ltd", "lun", "macro", "mar", "may", "mega", "mg", "micro", "min", "mini", "mié", "mm", "mono", "mt", "multi", "máx", "mín", "n. del t", "n.b", "neo", "no", "nos", "nov", "ntra. sra", "núm", "oct", "omni", "p", "p.a", "p.d", "p.ej", "p.v.p", "para", "pen", "ph", "ph.d", "pluri", "poli", "pos", "post", "pp", "ppal", "pre", "prev", "pro", "prof", "prov", "pseudo", "ptas", "pts", "pza", "pág", "págs", "párr", "párrf", "q.e.g.e", "q.e.p.d", "q.e.s.m", "re", "reg", "rep", "retro", "rr. hh", "rte", "s", "s. a", "s.a.r", "s.e", "s.l", "s.r.c", "s.r.l", "s.s.s", "s/n", "sdad", "seg", "semi", "sept", "seudo", "sig", "sobre", "sr", "sra", "sres", "srta", "sta", "sto", "sub", "super", "supra", "sáb", "t.v.e", "tamb", "tel", "tfno", "trans", "tras", "tri", "ud", "uds", "ulter", "ultra", "un", "uni", "univ", "uu", "v.b", "v.e", "vd", "vds", "vice", "vid", "vie", "vol", "vs", "vto", "yuxta"].freeze
+      ABBREVIATIONS = ["a", "a.c", "a/c", "abr", "adj", "admón", "aero", "afmo", "ago", "almte", "ambi", "an", "anfi", "ante", "anti", "ap", "apdo", "archi", "arci", "arq", "art", "atte", "auto", "av", "avda", "bco", "bi", "bibl", "bien", "bis", "bs. as", "c", "c.f", "c.g", "c/c", "c/u", "cap", "cc.aa", "cdad", "cm", "co", "com", "con", "contra", "cra", "crio", "cta", "cuadri", "cuasi", "cuatri", "cv", "d.e.p", "da", "dcha", "dcho", "de", "deci", "dep", "des", "di", "dic", "dicc", "dir", "dis", "dn", "doc", "dom", "dpto", "dr", "dra", "dto", "ecto", "ee", "ej", "en", "endo", "entlo", "entre", "epi", "equi", "esq", "etc", "ex", "excmo", "ext", "extra", "f.c", "fca", "fdo", "febr", "ff. aa", "ff.cc", "fig", "fil", "fra", "g.p", "g/p", "geo", "gob", "gr", "gral", "grs", "hemi", "hetero", "hiper", "hipo", "hnos", "homo", "hs", "i", "igl", "iltre", "im", "imp", "impr", "impto", "in", "incl", "infra", "ing", "inst", "inter", "intra", "iso", "izdo", "izq", "izqdo", "j.c", "jue", "jul", "jun", "kg", "km", "lcdo", "ldo", "let", "lic", "ltd", "lun", "macro", "mar", "may", "mega", "mg", "micro", "min", "mini", "mié", "mm", "mono", "mt", "multi", "máx", "mín", "n. del t", "n.b", "neo", "no", "nos", "nov", "ntra. sra", "núm", "oct", "omni", "p", "p.a", "p.d", "p.ej", "p.v.p", "para", "pen", "ph", "ph.d", "pluri", "poli", "pos", "post", "pp", "ppal", "pre", "prev", "pro", "prof", "prov", "pseudo", "ptas", "pts", "pza", "pág", "págs", "párr", "párrf", "q.e.g.e", "q.e.p.d", "q.e.s.m", "re", "reg", "rep", "retro", "rr. hh", "rte", "s", "s. a", "s.a.r", "s.e", "s.l", "s.r.c", "s.r.l", "s.s.s", "s/n", "sdad", "seg", "semi", "sept", "seudo", "sig", "sobre", "sr", "sra", "sres", "srta", "sta", "sto", "sub", "super", "supra", "sáb", "t.v.e", "tamb", "tel", "tfno", "trans", "tras", "tri", "ud", "uds", "ulter", "ultra", "un", "uni", "univ", "uu", "v.b", "v.e", "vd", "vds", "vice", "vid", "vie", "vol", "vs", "vto", "yuxta"].freeze
       STOP_WORDS = ["algún", "alguna", "algunas", "alguno", "algunos", "ambos", "ampleamos", "ante", "antes", "aquel", "aquellas", "aquellos", "aqui", "arriba", "atras", "bajo", "bastante", "bien", "cada", "cierta", "ciertas", "cierto", "ciertos", "como", "con", "conseguimos", "conseguir", "consigo", "consigue", "consiguen", "consigues", "cual", "cuando", "dentro", "desde", "donde", "dos", "el", "ellas", "ellos", "empleais", "emplean", "emplear", "empleas", "empleo", "en", "encima", "entonces", "entre", "era", "eramos", "eran", "eras", "eres", "es", "esta", "estaba", "estado", "estais", "estamos", "estan", "estoy", "fin", "fue", "fueron", "fui", "fuimos", "gueno", "ha", "hace", "haceis", "hacemos", "hacen", "hacer", "haces", "hago", "incluso", "intenta", "intentais", "intentamos", "intentan", "intentar", "intentas", "intento", "ir", "la", "largo", "las", "lo", "los", "mientras", "mio", "modo", "muchos", "muy", "nos", "nosotros", "otro", "para", "pero", "podeis", "podemos", "poder", "podria", "podriais", "podriamos", "podrian", "podrias", "por", "por qué", "porque", "primero", "puede", "pueden", "puedo", "quien", "sabe", "sabeis", "sabemos", "saben", "saber", "sabes", "ser", "si", "siendo", "sin", "sobre", "sois", "solamente", "solo", "somos", "soy", "su", "sus", "también", "teneis", "tenemos", "tener", "tengo", "tiempo", "tiene", "tienen", "todo", "trabaja", "trabajais", "trabajamos", "trabajan", "trabajar", "trabajas", "trabajo", "tras", "tuyo", "ultimo", "un", "una", "unas", "uno", "unos", "usa", "usais", "usamos", "usan", "usar", "usas", "uso", "va", "vais", "valor", "vamos", "van", "vaya", "verdad", "verdadera", "verdadero", "vosotras", "vosotros", "voy", "yo", "él", "ésta", "éstas", "éste", "éstos", "última", "últimas", "último", "últimos", "a", "añadió", "aún", "actualmente", "adelante", "además", "afirmó", "agregó", "ahí", "ahora", "al", "algún", "algo", "alrededor", "anterior", "apenas", "aproximadamente", "aquí", "así", "aseguró", "aunque", "ayer", "buen", "buena", "buenas", "bueno", "buenos", "cómo", "casi", "cerca", "cinco", "comentó", "conocer", "consideró", "considera", "contra", "cosas", "creo", "cuales", "cualquier", "cuanto", "cuatro", "cuenta", "da", "dado", "dan", "dar", "de", "debe", "deben", "debido", "decir", "dejó", "del", "demás", "después", "dice", "dicen", "dicho", "dieron", "diferente", "diferentes", "dijeron", "dijo", "dio", "durante", "e", "ejemplo", "ella", "ello", "embargo", "encuentra", "esa", "esas", "ese", "eso", "esos", "está", "están", "estaban", "estar", "estará", "estas", "este", "esto", "estos", "estuvo", "ex", "existe", "existen", "explicó", "expresó", "fuera", "gran", "grandes", "había", "habían", "haber", "habrá", "hacerlo", "hacia", "haciendo", "han", "hasta", "hay", "haya", "he", "hecho", "hemos", "hicieron", "hizo", "hoy", "hubo", "igual", "indicó", "informó", "junto", "lado", "le", "les", "llegó", "lleva", "llevar", "luego", "lugar", "más", "manera", "manifestó", "mayor", "me", "mediante", "mejor", "mencionó", "menos", "mi", "misma", "mismas", "mismo", "mismos", "momento", "mucha", "muchas", "mucho", "nada", "nadie", "ni", "ningún", "ninguna", "ningunas", "ninguno", "ningunos", "no", "nosotras", "nuestra", "nuestras", "nuestro", "nuestros", "nueva", "nuevas", "nuevo", "nuevos", "nunca", "o", "ocho", "otra", "otras", "otros", "parece", "parte", "partir", "pasada", "pasado", "pesar", "poca", "pocas", "poco", "pocos", "podrá", "podrán", "podría", "podrían", "poner", "posible", "próximo", "próximos", "primer", "primera", "primeros", "principalmente", "propia", "propias", "propio", "propios", "pudo", "pueda", "pues", "qué", "que", "quedó", "queremos", "quién", "quienes", "quiere", "realizó", "realizado", "realizar", "respecto", "sí", "sólo", "se", "señaló", "sea", "sean", "según", "segunda", "segundo", "seis", "será", "serán", "sería", "sido", "siempre", "siete", "sigue", "siguiente", "sino", "sola", "solas", "solos", "son", "tal", "tampoco", "tan", "tanto", "tenía", "tendrá", "tendrán", "tenga", "tenido", "tercera", "toda", "todas", "todavía", "todos", "total", "trata", "través", "tres", "tuvo", "usted", "varias", "varios", "veces", "ver", "vez", "y", "ya"].freeze
       CONTRACTIONS = {
           "al"  => "a el",

data/lib/pragmatic_tokenizer/post_processor.rb CHANGED Viewed

@@ -96,7 +96,7 @@ module PragmaticTokenizer
         if downcase
           token.split(/(\.)/)[0]
         else
-          Unicode.downcase(token.split(/(\.)/)[0])
+          UnicodeCaseConverter::Converter.new(token.split(/(\.)/)[0]).downcase
         end
       end

data/lib/pragmatic_tokenizer/pre_processor.rb CHANGED Viewed

@@ -2,7 +2,6 @@ module PragmaticTokenizer
   module PreProcessor
     def pre_process(language: Languages::Common)
-      remove_non_breaking_space!
       shift_comma!
       shift_multiple_dash!
       shift_inverted_question_mark!
@@ -32,10 +31,6 @@ module PragmaticTokenizer
     private
-      def remove_non_breaking_space!
-         gsub!(/\u{00A0}/, ''.freeze)
-      end
       # Shift commas off everything but numbers
       def shift_comma!
         gsub!(/,(?!\d)/o, ' , '.freeze)

data/lib/pragmatic_tokenizer/tokenizer.rb CHANGED Viewed

@@ -4,7 +4,7 @@ require 'pragmatic_tokenizer/pre_processor'
 require 'pragmatic_tokenizer/post_processor'
 require 'pragmatic_tokenizer/full_stop_separator'
 require 'pragmatic_tokenizer/ending_punctuation_separator'
-require 'unicode'
+require 'unicode_case_converter'
 module PragmaticTokenizer
   class Tokenizer
@@ -128,10 +128,11 @@ module PragmaticTokenizer
       def post_process(text)
         if downcase
-          @tokens = PostProcessor.new(text: Unicode.downcase(text), abbreviations: abbreviations, downcase: downcase).post_process
+          @tokens = PostProcessor.new(text: UnicodeCaseConverter::Converter.new(text).downcase, abbreviations: abbreviations, downcase: downcase).post_process
         else
           @tokens = PostProcessor.new(text: text, abbreviations: abbreviations, downcase: downcase).post_process
         end
+        # downcase! if downcase
         expand_contractions!(contractions) if expand_contractions
         clean! if clean
         classic_filter! if classic_filter
@@ -149,6 +150,10 @@ module PragmaticTokenizer
         @tokens.reject(&:empty?)
       end
+      def downcase!
+        @tokens.map! { |t| UnicodeCaseConverter::Converter.new(t).downcase }
+      end
       def expand_contractions!(contractions)
         @tokens = if downcase
                     @tokens.flat_map do |t|
@@ -162,11 +167,11 @@ module PragmaticTokenizer
                     end
                   else
                     @tokens.flat_map do |t|
-                      if contractions.key?(Unicode.downcase(t.gsub(/[‘’‚‛‹›＇´`]/, "'")))
-                        contractions[Unicode.downcase(t.gsub(/[‘’‚‛‹›＇´`]/, "'"))]
+                      if contractions.key?(UnicodeCaseConverter::Converter.new(t.gsub(/[‘’‚‛‹›＇´`]/, "'")).downcase)
+                        contractions[UnicodeCaseConverter::Converter.new(t.gsub(/[‘’‚‛‹›＇´`]/, "'")).downcase]
                             .split(' ')
                             .each_with_index
-                            .map { |token, i| i.eql?(0) ? Unicode.capitalize(token) : token }
+                            .map { |token, i| i.eql?(0) ? UnicodeCaseConverter::Converter.new(token).capitalize : token }
                             .flatten
                       else
                         t
@@ -191,7 +196,6 @@ module PragmaticTokenizer
             .map { |t| t.gsub(/\u{00AD}/, '') }
             .map { |t| t.gsub(/\A(-|–)/, '') }
             .map { |t| t.gsub(/[®©]/, '') }
-            .map { |t| t.gsub(/\A\%/, '') }
             .map { |t| t.gsub(/[\u{1F100}-\u{1F1FF}]/, '') }
             .delete_if do |t|
           t =~ /\A-+\z/ ||
@@ -215,7 +219,7 @@ module PragmaticTokenizer
           if downcase
             @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") }
           else
-            @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode.downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode.downcase(t)}.") }
+            @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(UnicodeCaseConverter::Converter.new(t).downcase) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{UnicodeCaseConverter::Converter.new(t).downcase}.") }
           end
         when 'only'
           @tokens.delete_if { |t| t =~ /\A\D+\z/ }
@@ -241,7 +245,7 @@ module PragmaticTokenizer
         if downcase
           @tokens -= stop_words
         else
-          @tokens.delete_if { |t| stop_words.include?(Unicode.downcase(t)) }
+          @tokens.delete_if { |t| stop_words.include?(UnicodeCaseConverter::Converter.new(t).downcase) }
         end
       end

data/lib/pragmatic_tokenizer/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "1.6.0".freeze
+  VERSION = "2.0.0".freeze
 end

data/pragmatic_tokenizer.gemspec CHANGED Viewed

@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
   spec.require_paths = ["lib"]
-  spec.add_runtime_dependency "unicode"
+  spec.add_runtime_dependency "unicode_case_converter", "~> 0.4"
   spec.add_development_dependency "bundler", "~> 1.9"
   spec.add_development_dependency "rake", "~> 10.0"
   spec.add_development_dependency "rspec"

data/spec/languages/english_spec.rb CHANGED Viewed

@@ -1437,27 +1437,6 @@ describe PragmaticTokenizer do
           )
           expect(pt.tokenize).to eq(%w(tudow provides company users a way to offer each other and guests and interpreters free assistance to date there have been questions asked))
         end
-        it 'removes non-breaking spaces' do
-          text = "%20141201~221624  %User ID,JU,JU John %TU=00000362  %PT-BR  %Wordfast    da hello."
-          pt = PragmaticTokenizer::Tokenizer.new(text,
-            language: :en,
-            filter_languages: [:en],
-            clean: true,
-            numbers: :none,
-            minimum_length: 3,
-            expand_contractions: true,
-            remove_stop_words: true,
-            punctuation: :none,
-            remove_emails: true,
-            remove_domains: true,
-            remove_urls: true,
-            hashtags: :remove,
-            mentions: :remove,
-            downcase: true
-          )
-          expect(pt.tokenize).to eq(["user", "john", "pt-br", "wordfast"])
-        end
       end
     end

data/spec/performance_spec.rb CHANGED Viewed

@@ -8,18 +8,21 @@ describe PragmaticTokenizer do
   # it 'is fast?' do
   #   string = "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
-  #   data = StackProf.run(mode: :cpu, interval: 1000) do
-  #     PragmaticTokenizer::Tokenizer.new(string * 100,
-  #       language: 'en',
-  #       clean: true,
-  #       minimum_length: 3,
-  #       expand_contractions: true,
-  #       remove_stop_words: true,
-  #       numbers: :none,
-  #       punctuation: :none
-  #     ).tokenize
+  #   benchmark do
+  #     10.times do
+  #       data = StackProf.run(mode: :cpu, interval: 1000) do
+  #         PragmaticTokenizer::Tokenizer.new(string * 100,
+  #           language: 'en',
+  #           clean: true,
+  #           remove_numbers: true,
+  #           minimum_length: 3,
+  #           expand_contractions: true,
+  #           remove_stop_words: true
+  #         ).tokenize
+  #       end
+  #       puts StackProf::Report.new(data).print_text
+  #     end
   #   end
-  #   puts StackProf::Report.new(data).print_text
   # end
   # 26.8
@@ -27,22 +30,21 @@ describe PragmaticTokenizer do
   # 9.6
   # 23.25
   # 24.2
-  # 23.2
-  # it 'is fast? (long strings)' do
-  #   string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
-  #   puts "LENGTH: #{string.length}"
-  #   benchmark do
-  #     PragmaticTokenizer::Tokenizer.new(string,
-  #       language: 'en',
-  #       clean: true,
-  #       minimum_length: 3,
-  #       expand_contractions: true,
-  #       remove_stop_words: true,
-  #       numbers: :none,
-  #       punctuation: :none
-  #     ).tokenize
-  #   end
-  # end
+  it 'is fast? (long strings)' do
+    string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
+    puts "LENGTH: #{string.length}"
+    benchmark do
+      PragmaticTokenizer::Tokenizer.new(string,
+        language: 'en',
+        clean: true,
+        minimum_length: 3,
+        expand_contractions: true,
+        remove_stop_words: true,
+        numbers: :none,
+        punctuation: :none
+      ).tokenize
+    end
+  end
   # it 'is the baseline' do
   #   string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 1.6.0
+  version: 2.0.0
 platform: ruby
 authors:
 - Kevin S. Dias
@@ -11,19 +11,19 @@ cert_chain: []
 date: 2016-01-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: unicode
+  name: unicode_case_converter
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.4'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.4'
 - !ruby/object:Gem::Dependency
   name: bundler
   requirement: !ruby/object:Gem::Requirement