pragmatic_tokenizer 1.6.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 38306989cff2d5d9e437a4ff939e1a86943b5cc1
4
- data.tar.gz: 5c513f3497b6097005342927d7914b698023112e
3
+ metadata.gz: e72eaf6ddb3c1c7d0d0dc5f57727f796e01f5280
4
+ data.tar.gz: 80511d5cbcc8c84d1b3a70b70c8ddd070b1f7170
5
5
  SHA512:
6
- metadata.gz: 0c6f611b2fc1ee8b379743fe16e8e4ea0033e439f543bbb47a4224b6bd42b7f9e276faa7e3d89aaacfc6292babe9a85bf83d09fe4f4004f88de04ea42feb57b5
7
- data.tar.gz: f8b952202a33dddddb8a414e2d1cf7fbbcd1cc20edd937b79f8982d8c65f6ee2b3c12a55db475697758e4612ff25aff5dfc77f88b36599f1f1a020ba6f7e059c
6
+ metadata.gz: 9b81069f61fecd39d437699bb23090af30bb3a4231b6c8523f0b0a41e1f77d7a511cd2d54997db802f77d4743aead0446bcc8afd3d449007b4e5c37212dd9b65
7
+ data.tar.gz: 6f5c29155a382aae49949b554d8854e0b74a2967a4d92f5cc4f8dc3ef787c45ee061d7aa6ed5c3f510dd133ea52f199ca226113e5e6530c5f852eae7a08556a5
@@ -23,7 +23,7 @@ module PragmaticTokenizer
23
23
  if downcase
24
24
  abbreviation = abbr[w]
25
25
  else
26
- abbreviation = abbr[Unicode.downcase(w)]
26
+ abbreviation = abbr[UnicodeCaseConverter::Converter.new(w).downcase]
27
27
  end
28
28
  unless abbreviation || w =~ /\A[a-z]\z/i ||
29
29
  w =~ /[a-z](?:\.[a-z])+\z/i
@@ -35,11 +35,11 @@ module PragmaticTokenizer
35
35
  cleaned_tokens << tokens[i]
36
36
  end
37
37
  if downcase
38
- abbreviation = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
38
+ abbr_included = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
39
39
  else
40
- abbreviation = abbreviations.include?(Unicode.downcase(cleaned_tokens[-1]).chomp(".")) unless cleaned_tokens[-1].nil?
40
+ abbr_included = abbreviations.include?(UnicodeCaseConverter::Converter.new(cleaned_tokens[-1]).downcase.chomp(".")) unless cleaned_tokens[-1].nil?
41
41
  end
42
- if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviation
42
+ if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbr_included
43
43
  cleaned_tokens[-1] = Regexp.last_match(1)
44
44
  cleaned_tokens.push '.'
45
45
  end
@@ -2,7 +2,7 @@ module PragmaticTokenizer
2
2
  module Languages
3
3
  module Spanish
4
4
  include Languages::Common
5
- ABBREVIATIONS = ["a", "a.c", "a/c", "abr", "adj", "admón", "aero", "afmo", "ago", "almte", "ambi", "an", "anfi", "ante", "anti", "ap", "apdo", "archi", "arci", "arq", "art", "atte", "auto", "av", "avda", "bco", "bi", "bibl", "bien", "bis", "bs. as", "c", "c.f", "c.g", "c/c", "c/u", "cap", "cc.aa", "cdad", "cm", "co", "com", "con", "contra", "cra", "crio", "cta", "cuadri", "cuasi", "cuatri", "cv", "d.e.p", "da", "das", "dcha", "dcho", "de", "deci", "dep", "des", "di", "dic", "dicc", "dir", "dis", "dn", "do", "doc", "dom", "dpto", "dr", "dra", "dto", "ecto", "ee", "ej", "en", "endo", "entlo", "entre", "epi", "equi", "esq", "etc", "ex", "excmo", "ext", "extra", "f.c", "fca", "fdo", "febr", "ff. aa", "ff.cc", "fig", "fil", "fra", "g.p", "g/p", "geo", "gob", "gr", "gral", "grs", "hemi", "hetero", "hiper", "hipo", "hnos", "homo", "hs", "i", "igl", "iltre", "im", "imp", "impr", "impto", "in", "incl", "infra", "ing", "inst", "inter", "intra", "iso", "izdo", "izq", "izqdo", "j.c", "jue", "jul", "jun", "kg", "km", "lcdo", "ldo", "let", "lic", "ltd", "lun", "macro", "mar", "may", "mega", "mg", "micro", "min", "mini", "mié", "mm", "mono", "mt", "multi", "máx", "mín", "n. del t", "n.b", "neo", "no", "nos", "nov", "ntra. sra", "núm", "oct", "omni", "p", "p.a", "p.d", "p.ej", "p.v.p", "para", "pen", "ph", "ph.d", "pluri", "poli", "pos", "post", "pp", "ppal", "pre", "prev", "pro", "prof", "prov", "pseudo", "ptas", "pts", "pza", "pág", "págs", "párr", "párrf", "q.e.g.e", "q.e.p.d", "q.e.s.m", "re", "reg", "rep", "retro", "rr. hh", "rte", "s", "s. a", "s.a.r", "s.e", "s.l", "s.r.c", "s.r.l", "s.s.s", "s/n", "sdad", "seg", "semi", "sept", "seudo", "sig", "sobre", "sr", "sra", "sres", "srta", "sta", "sto", "sub", "super", "supra", "sáb", "t.v.e", "tamb", "tel", "tfno", "trans", "tras", "tri", "ud", "uds", "ulter", "ultra", "un", "uni", "univ", "uu", "v.b", "v.e", "vd", "vds", "vice", "vid", "vie", "vol", "vs", "vto", "yuxta"].freeze
5
+ ABBREVIATIONS = ["a", "a.c", "a/c", "abr", "adj", "admón", "aero", "afmo", "ago", "almte", "ambi", "an", "anfi", "ante", "anti", "ap", "apdo", "archi", "arci", "arq", "art", "atte", "auto", "av", "avda", "bco", "bi", "bibl", "bien", "bis", "bs. as", "c", "c.f", "c.g", "c/c", "c/u", "cap", "cc.aa", "cdad", "cm", "co", "com", "con", "contra", "cra", "crio", "cta", "cuadri", "cuasi", "cuatri", "cv", "d.e.p", "da", "dcha", "dcho", "de", "deci", "dep", "des", "di", "dic", "dicc", "dir", "dis", "dn", "doc", "dom", "dpto", "dr", "dra", "dto", "ecto", "ee", "ej", "en", "endo", "entlo", "entre", "epi", "equi", "esq", "etc", "ex", "excmo", "ext", "extra", "f.c", "fca", "fdo", "febr", "ff. aa", "ff.cc", "fig", "fil", "fra", "g.p", "g/p", "geo", "gob", "gr", "gral", "grs", "hemi", "hetero", "hiper", "hipo", "hnos", "homo", "hs", "i", "igl", "iltre", "im", "imp", "impr", "impto", "in", "incl", "infra", "ing", "inst", "inter", "intra", "iso", "izdo", "izq", "izqdo", "j.c", "jue", "jul", "jun", "kg", "km", "lcdo", "ldo", "let", "lic", "ltd", "lun", "macro", "mar", "may", "mega", "mg", "micro", "min", "mini", "mié", "mm", "mono", "mt", "multi", "máx", "mín", "n. del t", "n.b", "neo", "no", "nos", "nov", "ntra. sra", "núm", "oct", "omni", "p", "p.a", "p.d", "p.ej", "p.v.p", "para", "pen", "ph", "ph.d", "pluri", "poli", "pos", "post", "pp", "ppal", "pre", "prev", "pro", "prof", "prov", "pseudo", "ptas", "pts", "pza", "pág", "págs", "párr", "párrf", "q.e.g.e", "q.e.p.d", "q.e.s.m", "re", "reg", "rep", "retro", "rr. hh", "rte", "s", "s. a", "s.a.r", "s.e", "s.l", "s.r.c", "s.r.l", "s.s.s", "s/n", "sdad", "seg", "semi", "sept", "seudo", "sig", "sobre", "sr", "sra", "sres", "srta", "sta", "sto", "sub", "super", "supra", "sáb", "t.v.e", "tamb", "tel", "tfno", "trans", "tras", "tri", "ud", "uds", "ulter", "ultra", "un", "uni", "univ", "uu", "v.b", "v.e", "vd", "vds", "vice", "vid", "vie", "vol", "vs", "vto", "yuxta"].freeze
6
6
  STOP_WORDS = ["algún", "alguna", "algunas", "alguno", "algunos", "ambos", "ampleamos", "ante", "antes", "aquel", "aquellas", "aquellos", "aqui", "arriba", "atras", "bajo", "bastante", "bien", "cada", "cierta", "ciertas", "cierto", "ciertos", "como", "con", "conseguimos", "conseguir", "consigo", "consigue", "consiguen", "consigues", "cual", "cuando", "dentro", "desde", "donde", "dos", "el", "ellas", "ellos", "empleais", "emplean", "emplear", "empleas", "empleo", "en", "encima", "entonces", "entre", "era", "eramos", "eran", "eras", "eres", "es", "esta", "estaba", "estado", "estais", "estamos", "estan", "estoy", "fin", "fue", "fueron", "fui", "fuimos", "gueno", "ha", "hace", "haceis", "hacemos", "hacen", "hacer", "haces", "hago", "incluso", "intenta", "intentais", "intentamos", "intentan", "intentar", "intentas", "intento", "ir", "la", "largo", "las", "lo", "los", "mientras", "mio", "modo", "muchos", "muy", "nos", "nosotros", "otro", "para", "pero", "podeis", "podemos", "poder", "podria", "podriais", "podriamos", "podrian", "podrias", "por", "por qué", "porque", "primero", "puede", "pueden", "puedo", "quien", "sabe", "sabeis", "sabemos", "saben", "saber", "sabes", "ser", "si", "siendo", "sin", "sobre", "sois", "solamente", "solo", "somos", "soy", "su", "sus", "también", "teneis", "tenemos", "tener", "tengo", "tiempo", "tiene", "tienen", "todo", "trabaja", "trabajais", "trabajamos", "trabajan", "trabajar", "trabajas", "trabajo", "tras", "tuyo", "ultimo", "un", "una", "unas", "uno", "unos", "usa", "usais", "usamos", "usan", "usar", "usas", "uso", "va", "vais", "valor", "vamos", "van", "vaya", "verdad", "verdadera", "verdadero", "vosotras", "vosotros", "voy", "yo", "él", "ésta", "éstas", "éste", "éstos", "última", "últimas", "último", "últimos", "a", "añadió", "aún", "actualmente", "adelante", "además", "afirmó", "agregó", "ahí", "ahora", "al", "algún", "algo", "alrededor", "anterior", "apenas", "aproximadamente", "aquí", "así", "aseguró", "aunque", "ayer", "buen", "buena", "buenas", "bueno", "buenos", "cómo", "casi", "cerca", "cinco", "comentó", "conocer", "consideró", "considera", "contra", "cosas", "creo", "cuales", "cualquier", "cuanto", "cuatro", "cuenta", "da", "dado", "dan", "dar", "de", "debe", "deben", "debido", "decir", "dejó", "del", "demás", "después", "dice", "dicen", "dicho", "dieron", "diferente", "diferentes", "dijeron", "dijo", "dio", "durante", "e", "ejemplo", "ella", "ello", "embargo", "encuentra", "esa", "esas", "ese", "eso", "esos", "está", "están", "estaban", "estar", "estará", "estas", "este", "esto", "estos", "estuvo", "ex", "existe", "existen", "explicó", "expresó", "fuera", "gran", "grandes", "había", "habían", "haber", "habrá", "hacerlo", "hacia", "haciendo", "han", "hasta", "hay", "haya", "he", "hecho", "hemos", "hicieron", "hizo", "hoy", "hubo", "igual", "indicó", "informó", "junto", "lado", "le", "les", "llegó", "lleva", "llevar", "luego", "lugar", "más", "manera", "manifestó", "mayor", "me", "mediante", "mejor", "mencionó", "menos", "mi", "misma", "mismas", "mismo", "mismos", "momento", "mucha", "muchas", "mucho", "nada", "nadie", "ni", "ningún", "ninguna", "ningunas", "ninguno", "ningunos", "no", "nosotras", "nuestra", "nuestras", "nuestro", "nuestros", "nueva", "nuevas", "nuevo", "nuevos", "nunca", "o", "ocho", "otra", "otras", "otros", "parece", "parte", "partir", "pasada", "pasado", "pesar", "poca", "pocas", "poco", "pocos", "podrá", "podrán", "podría", "podrían", "poner", "posible", "próximo", "próximos", "primer", "primera", "primeros", "principalmente", "propia", "propias", "propio", "propios", "pudo", "pueda", "pues", "qué", "que", "quedó", "queremos", "quién", "quienes", "quiere", "realizó", "realizado", "realizar", "respecto", "sí", "sólo", "se", "señaló", "sea", "sean", "según", "segunda", "segundo", "seis", "será", "serán", "sería", "sido", "siempre", "siete", "sigue", "siguiente", "sino", "sola", "solas", "solos", "son", "tal", "tampoco", "tan", "tanto", "tenía", "tendrá", "tendrán", "tenga", "tenido", "tercera", "toda", "todas", "todavía", "todos", "total", "trata", "través", "tres", "tuvo", "usted", "varias", "varios", "veces", "ver", "vez", "y", "ya"].freeze
7
7
  CONTRACTIONS = {
8
8
  "al" => "a el",
@@ -96,7 +96,7 @@ module PragmaticTokenizer
96
96
  if downcase
97
97
  token.split(/(\.)/)[0]
98
98
  else
99
- Unicode.downcase(token.split(/(\.)/)[0])
99
+ UnicodeCaseConverter::Converter.new(token.split(/(\.)/)[0]).downcase
100
100
  end
101
101
  end
102
102
 
@@ -2,7 +2,6 @@ module PragmaticTokenizer
2
2
  module PreProcessor
3
3
 
4
4
  def pre_process(language: Languages::Common)
5
- remove_non_breaking_space!
6
5
  shift_comma!
7
6
  shift_multiple_dash!
8
7
  shift_inverted_question_mark!
@@ -32,10 +31,6 @@ module PragmaticTokenizer
32
31
 
33
32
  private
34
33
 
35
- def remove_non_breaking_space!
36
- gsub!(/\u{00A0}/, ''.freeze)
37
- end
38
-
39
34
  # Shift commas off everything but numbers
40
35
  def shift_comma!
41
36
  gsub!(/,(?!\d)/o, ' , '.freeze)
@@ -4,7 +4,7 @@ require 'pragmatic_tokenizer/pre_processor'
4
4
  require 'pragmatic_tokenizer/post_processor'
5
5
  require 'pragmatic_tokenizer/full_stop_separator'
6
6
  require 'pragmatic_tokenizer/ending_punctuation_separator'
7
- require 'unicode'
7
+ require 'unicode_case_converter'
8
8
 
9
9
  module PragmaticTokenizer
10
10
  class Tokenizer
@@ -128,10 +128,11 @@ module PragmaticTokenizer
128
128
 
129
129
  def post_process(text)
130
130
  if downcase
131
- @tokens = PostProcessor.new(text: Unicode.downcase(text), abbreviations: abbreviations, downcase: downcase).post_process
131
+ @tokens = PostProcessor.new(text: UnicodeCaseConverter::Converter.new(text).downcase, abbreviations: abbreviations, downcase: downcase).post_process
132
132
  else
133
133
  @tokens = PostProcessor.new(text: text, abbreviations: abbreviations, downcase: downcase).post_process
134
134
  end
135
+ # downcase! if downcase
135
136
  expand_contractions!(contractions) if expand_contractions
136
137
  clean! if clean
137
138
  classic_filter! if classic_filter
@@ -149,6 +150,10 @@ module PragmaticTokenizer
149
150
  @tokens.reject(&:empty?)
150
151
  end
151
152
 
153
+ def downcase!
154
+ @tokens.map! { |t| UnicodeCaseConverter::Converter.new(t).downcase }
155
+ end
156
+
152
157
  def expand_contractions!(contractions)
153
158
  @tokens = if downcase
154
159
  @tokens.flat_map do |t|
@@ -162,11 +167,11 @@ module PragmaticTokenizer
162
167
  end
163
168
  else
164
169
  @tokens.flat_map do |t|
165
- if contractions.key?(Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'")))
166
- contractions[Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))]
170
+ if contractions.key?(UnicodeCaseConverter::Converter.new(t.gsub(/[‘’‚‛‹›'´`]/, "'")).downcase)
171
+ contractions[UnicodeCaseConverter::Converter.new(t.gsub(/[‘’‚‛‹›'´`]/, "'")).downcase]
167
172
  .split(' ')
168
173
  .each_with_index
169
- .map { |token, i| i.eql?(0) ? Unicode.capitalize(token) : token }
174
+ .map { |token, i| i.eql?(0) ? UnicodeCaseConverter::Converter.new(token).capitalize : token }
170
175
  .flatten
171
176
  else
172
177
  t
@@ -191,7 +196,6 @@ module PragmaticTokenizer
191
196
  .map { |t| t.gsub(/\u{00AD}/, '') }
192
197
  .map { |t| t.gsub(/\A(-|–)/, '') }
193
198
  .map { |t| t.gsub(/[®©]/, '') }
194
- .map { |t| t.gsub(/\A\%/, '') }
195
199
  .map { |t| t.gsub(/[\u{1F100}-\u{1F1FF}]/, '') }
196
200
  .delete_if do |t|
197
201
  t =~ /\A-+\z/ ||
@@ -215,7 +219,7 @@ module PragmaticTokenizer
215
219
  if downcase
216
220
  @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") }
217
221
  else
218
- @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode.downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode.downcase(t)}.") }
222
+ @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(UnicodeCaseConverter::Converter.new(t).downcase) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{UnicodeCaseConverter::Converter.new(t).downcase}.") }
219
223
  end
220
224
  when 'only'
221
225
  @tokens.delete_if { |t| t =~ /\A\D+\z/ }
@@ -241,7 +245,7 @@ module PragmaticTokenizer
241
245
  if downcase
242
246
  @tokens -= stop_words
243
247
  else
244
- @tokens.delete_if { |t| stop_words.include?(Unicode.downcase(t)) }
248
+ @tokens.delete_if { |t| stop_words.include?(UnicodeCaseConverter::Converter.new(t).downcase) }
245
249
  end
246
250
  end
247
251
 
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "1.6.0".freeze
2
+ VERSION = "2.0.0".freeze
3
3
  end
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ["lib"]
20
20
 
21
- spec.add_runtime_dependency "unicode"
21
+ spec.add_runtime_dependency "unicode_case_converter", "~> 0.4"
22
22
  spec.add_development_dependency "bundler", "~> 1.9"
23
23
  spec.add_development_dependency "rake", "~> 10.0"
24
24
  spec.add_development_dependency "rspec"
@@ -1437,27 +1437,6 @@ describe PragmaticTokenizer do
1437
1437
  )
1438
1438
  expect(pt.tokenize).to eq(%w(tudow provides company users a way to offer each other and guests and interpreters free assistance to date there have been questions asked))
1439
1439
  end
1440
-
1441
- it 'removes non-breaking spaces' do
1442
- text = "%20141201~221624 %User ID,JU,JU John %TU=00000362 %PT-BR %Wordfast    da hello."
1443
- pt = PragmaticTokenizer::Tokenizer.new(text,
1444
- language: :en,
1445
- filter_languages: [:en],
1446
- clean: true,
1447
- numbers: :none,
1448
- minimum_length: 3,
1449
- expand_contractions: true,
1450
- remove_stop_words: true,
1451
- punctuation: :none,
1452
- remove_emails: true,
1453
- remove_domains: true,
1454
- remove_urls: true,
1455
- hashtags: :remove,
1456
- mentions: :remove,
1457
- downcase: true
1458
- )
1459
- expect(pt.tokenize).to eq(["user", "john", "pt-br", "wordfast"])
1460
- end
1461
1440
  end
1462
1441
  end
1463
1442
 
@@ -8,18 +8,21 @@ describe PragmaticTokenizer do
8
8
 
9
9
  # it 'is fast?' do
10
10
  # string = "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
11
- # data = StackProf.run(mode: :cpu, interval: 1000) do
12
- # PragmaticTokenizer::Tokenizer.new(string * 100,
13
- # language: 'en',
14
- # clean: true,
15
- # minimum_length: 3,
16
- # expand_contractions: true,
17
- # remove_stop_words: true,
18
- # numbers: :none,
19
- # punctuation: :none
20
- # ).tokenize
11
+ # benchmark do
12
+ # 10.times do
13
+ # data = StackProf.run(mode: :cpu, interval: 1000) do
14
+ # PragmaticTokenizer::Tokenizer.new(string * 100,
15
+ # language: 'en',
16
+ # clean: true,
17
+ # remove_numbers: true,
18
+ # minimum_length: 3,
19
+ # expand_contractions: true,
20
+ # remove_stop_words: true
21
+ # ).tokenize
22
+ # end
23
+ # puts StackProf::Report.new(data).print_text
24
+ # end
21
25
  # end
22
- # puts StackProf::Report.new(data).print_text
23
26
  # end
24
27
 
25
28
  # 26.8
@@ -27,22 +30,21 @@ describe PragmaticTokenizer do
27
30
  # 9.6
28
31
  # 23.25
29
32
  # 24.2
30
- # 23.2
31
- # it 'is fast? (long strings)' do
32
- # string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
33
- # puts "LENGTH: #{string.length}"
34
- # benchmark do
35
- # PragmaticTokenizer::Tokenizer.new(string,
36
- # language: 'en',
37
- # clean: true,
38
- # minimum_length: 3,
39
- # expand_contractions: true,
40
- # remove_stop_words: true,
41
- # numbers: :none,
42
- # punctuation: :none
43
- # ).tokenize
44
- # end
45
- # end
33
+ it 'is fast? (long strings)' do
34
+ string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
35
+ puts "LENGTH: #{string.length}"
36
+ benchmark do
37
+ PragmaticTokenizer::Tokenizer.new(string,
38
+ language: 'en',
39
+ clean: true,
40
+ minimum_length: 3,
41
+ expand_contractions: true,
42
+ remove_stop_words: true,
43
+ numbers: :none,
44
+ punctuation: :none
45
+ ).tokenize
46
+ end
47
+ end
46
48
 
47
49
  # it 'is the baseline' do
48
50
  # string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.6.0
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
@@ -11,19 +11,19 @@ cert_chain: []
11
11
  date: 2016-01-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: unicode
14
+ name: unicode_case_converter
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: '0.4'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ">="
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '0.4'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: bundler
29
29
  requirement: !ruby/object:Gem::Requirement