pragmatic_tokenizer 1.6.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +4 -4
- data/lib/pragmatic_tokenizer/languages/spanish.rb +1 -1
- data/lib/pragmatic_tokenizer/post_processor.rb +1 -1
- data/lib/pragmatic_tokenizer/pre_processor.rb +0 -5
- data/lib/pragmatic_tokenizer/tokenizer.rb +12 -8
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/pragmatic_tokenizer.gemspec +1 -1
- data/spec/languages/english_spec.rb +0 -21
- data/spec/performance_spec.rb +29 -27
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e72eaf6ddb3c1c7d0d0dc5f57727f796e01f5280
|
4
|
+
data.tar.gz: 80511d5cbcc8c84d1b3a70b70c8ddd070b1f7170
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9b81069f61fecd39d437699bb23090af30bb3a4231b6c8523f0b0a41e1f77d7a511cd2d54997db802f77d4743aead0446bcc8afd3d449007b4e5c37212dd9b65
|
7
|
+
data.tar.gz: 6f5c29155a382aae49949b554d8854e0b74a2967a4d92f5cc4f8dc3ef787c45ee061d7aa6ed5c3f510dd133ea52f199ca226113e5e6530c5f852eae7a08556a5
|
@@ -23,7 +23,7 @@ module PragmaticTokenizer
|
|
23
23
|
if downcase
|
24
24
|
abbreviation = abbr[w]
|
25
25
|
else
|
26
|
-
abbreviation = abbr[
|
26
|
+
abbreviation = abbr[UnicodeCaseConverter::Converter.new(w).downcase]
|
27
27
|
end
|
28
28
|
unless abbreviation || w =~ /\A[a-z]\z/i ||
|
29
29
|
w =~ /[a-z](?:\.[a-z])+\z/i
|
@@ -35,11 +35,11 @@ module PragmaticTokenizer
|
|
35
35
|
cleaned_tokens << tokens[i]
|
36
36
|
end
|
37
37
|
if downcase
|
38
|
-
|
38
|
+
abbr_included = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
|
39
39
|
else
|
40
|
-
|
40
|
+
abbr_included = abbreviations.include?(UnicodeCaseConverter::Converter.new(cleaned_tokens[-1]).downcase.chomp(".")) unless cleaned_tokens[-1].nil?
|
41
41
|
end
|
42
|
-
if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !
|
42
|
+
if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbr_included
|
43
43
|
cleaned_tokens[-1] = Regexp.last_match(1)
|
44
44
|
cleaned_tokens.push '.'
|
45
45
|
end
|
@@ -2,7 +2,7 @@ module PragmaticTokenizer
|
|
2
2
|
module Languages
|
3
3
|
module Spanish
|
4
4
|
include Languages::Common
|
5
|
-
ABBREVIATIONS = ["a", "a.c", "a/c", "abr", "adj", "admón", "aero", "afmo", "ago", "almte", "ambi", "an", "anfi", "ante", "anti", "ap", "apdo", "archi", "arci", "arq", "art", "atte", "auto", "av", "avda", "bco", "bi", "bibl", "bien", "bis", "bs. as", "c", "c.f", "c.g", "c/c", "c/u", "cap", "cc.aa", "cdad", "cm", "co", "com", "con", "contra", "cra", "crio", "cta", "cuadri", "cuasi", "cuatri", "cv", "d.e.p", "da", "
|
5
|
+
ABBREVIATIONS = ["a", "a.c", "a/c", "abr", "adj", "admón", "aero", "afmo", "ago", "almte", "ambi", "an", "anfi", "ante", "anti", "ap", "apdo", "archi", "arci", "arq", "art", "atte", "auto", "av", "avda", "bco", "bi", "bibl", "bien", "bis", "bs. as", "c", "c.f", "c.g", "c/c", "c/u", "cap", "cc.aa", "cdad", "cm", "co", "com", "con", "contra", "cra", "crio", "cta", "cuadri", "cuasi", "cuatri", "cv", "d.e.p", "da", "dcha", "dcho", "de", "deci", "dep", "des", "di", "dic", "dicc", "dir", "dis", "dn", "doc", "dom", "dpto", "dr", "dra", "dto", "ecto", "ee", "ej", "en", "endo", "entlo", "entre", "epi", "equi", "esq", "etc", "ex", "excmo", "ext", "extra", "f.c", "fca", "fdo", "febr", "ff. aa", "ff.cc", "fig", "fil", "fra", "g.p", "g/p", "geo", "gob", "gr", "gral", "grs", "hemi", "hetero", "hiper", "hipo", "hnos", "homo", "hs", "i", "igl", "iltre", "im", "imp", "impr", "impto", "in", "incl", "infra", "ing", "inst", "inter", "intra", "iso", "izdo", "izq", "izqdo", "j.c", "jue", "jul", "jun", "kg", "km", "lcdo", "ldo", "let", "lic", "ltd", "lun", "macro", "mar", "may", "mega", "mg", "micro", "min", "mini", "mié", "mm", "mono", "mt", "multi", "máx", "mín", "n. del t", "n.b", "neo", "no", "nos", "nov", "ntra. sra", "núm", "oct", "omni", "p", "p.a", "p.d", "p.ej", "p.v.p", "para", "pen", "ph", "ph.d", "pluri", "poli", "pos", "post", "pp", "ppal", "pre", "prev", "pro", "prof", "prov", "pseudo", "ptas", "pts", "pza", "pág", "págs", "párr", "párrf", "q.e.g.e", "q.e.p.d", "q.e.s.m", "re", "reg", "rep", "retro", "rr. hh", "rte", "s", "s. a", "s.a.r", "s.e", "s.l", "s.r.c", "s.r.l", "s.s.s", "s/n", "sdad", "seg", "semi", "sept", "seudo", "sig", "sobre", "sr", "sra", "sres", "srta", "sta", "sto", "sub", "super", "supra", "sáb", "t.v.e", "tamb", "tel", "tfno", "trans", "tras", "tri", "ud", "uds", "ulter", "ultra", "un", "uni", "univ", "uu", "v.b", "v.e", "vd", "vds", "vice", "vid", "vie", "vol", "vs", "vto", "yuxta"].freeze
|
6
6
|
STOP_WORDS = ["algún", "alguna", "algunas", "alguno", "algunos", "ambos", "ampleamos", "ante", "antes", "aquel", "aquellas", "aquellos", "aqui", "arriba", "atras", "bajo", "bastante", "bien", "cada", "cierta", "ciertas", "cierto", "ciertos", "como", "con", "conseguimos", "conseguir", "consigo", "consigue", "consiguen", "consigues", "cual", "cuando", "dentro", "desde", "donde", "dos", "el", "ellas", "ellos", "empleais", "emplean", "emplear", "empleas", "empleo", "en", "encima", "entonces", "entre", "era", "eramos", "eran", "eras", "eres", "es", "esta", "estaba", "estado", "estais", "estamos", "estan", "estoy", "fin", "fue", "fueron", "fui", "fuimos", "gueno", "ha", "hace", "haceis", "hacemos", "hacen", "hacer", "haces", "hago", "incluso", "intenta", "intentais", "intentamos", "intentan", "intentar", "intentas", "intento", "ir", "la", "largo", "las", "lo", "los", "mientras", "mio", "modo", "muchos", "muy", "nos", "nosotros", "otro", "para", "pero", "podeis", "podemos", "poder", "podria", "podriais", "podriamos", "podrian", "podrias", "por", "por qué", "porque", "primero", "puede", "pueden", "puedo", "quien", "sabe", "sabeis", "sabemos", "saben", "saber", "sabes", "ser", "si", "siendo", "sin", "sobre", "sois", "solamente", "solo", "somos", "soy", "su", "sus", "también", "teneis", "tenemos", "tener", "tengo", "tiempo", "tiene", "tienen", "todo", "trabaja", "trabajais", "trabajamos", "trabajan", "trabajar", "trabajas", "trabajo", "tras", "tuyo", "ultimo", "un", "una", "unas", "uno", "unos", "usa", "usais", "usamos", "usan", "usar", "usas", "uso", "va", "vais", "valor", "vamos", "van", "vaya", "verdad", "verdadera", "verdadero", "vosotras", "vosotros", "voy", "yo", "él", "ésta", "éstas", "éste", "éstos", "última", "últimas", "último", "últimos", "a", "añadió", "aún", "actualmente", "adelante", "además", "afirmó", "agregó", "ahí", "ahora", "al", "algún", "algo", "alrededor", "anterior", "apenas", "aproximadamente", "aquí", "así", "aseguró", "aunque", "ayer", "buen", "buena", "buenas", "bueno", "buenos", "cómo", "casi", "cerca", "cinco", "comentó", "conocer", "consideró", "considera", "contra", "cosas", "creo", "cuales", "cualquier", "cuanto", "cuatro", "cuenta", "da", "dado", "dan", "dar", "de", "debe", "deben", "debido", "decir", "dejó", "del", "demás", "después", "dice", "dicen", "dicho", "dieron", "diferente", "diferentes", "dijeron", "dijo", "dio", "durante", "e", "ejemplo", "ella", "ello", "embargo", "encuentra", "esa", "esas", "ese", "eso", "esos", "está", "están", "estaban", "estar", "estará", "estas", "este", "esto", "estos", "estuvo", "ex", "existe", "existen", "explicó", "expresó", "fuera", "gran", "grandes", "había", "habían", "haber", "habrá", "hacerlo", "hacia", "haciendo", "han", "hasta", "hay", "haya", "he", "hecho", "hemos", "hicieron", "hizo", "hoy", "hubo", "igual", "indicó", "informó", "junto", "lado", "le", "les", "llegó", "lleva", "llevar", "luego", "lugar", "más", "manera", "manifestó", "mayor", "me", "mediante", "mejor", "mencionó", "menos", "mi", "misma", "mismas", "mismo", "mismos", "momento", "mucha", "muchas", "mucho", "nada", "nadie", "ni", "ningún", "ninguna", "ningunas", "ninguno", "ningunos", "no", "nosotras", "nuestra", "nuestras", "nuestro", "nuestros", "nueva", "nuevas", "nuevo", "nuevos", "nunca", "o", "ocho", "otra", "otras", "otros", "parece", "parte", "partir", "pasada", "pasado", "pesar", "poca", "pocas", "poco", "pocos", "podrá", "podrán", "podría", "podrían", "poner", "posible", "próximo", "próximos", "primer", "primera", "primeros", "principalmente", "propia", "propias", "propio", "propios", "pudo", "pueda", "pues", "qué", "que", "quedó", "queremos", "quién", "quienes", "quiere", "realizó", "realizado", "realizar", "respecto", "sí", "sólo", "se", "señaló", "sea", "sean", "según", "segunda", "segundo", "seis", "será", "serán", "sería", "sido", "siempre", "siete", "sigue", "siguiente", "sino", "sola", "solas", "solos", "son", "tal", "tampoco", "tan", "tanto", "tenía", "tendrá", "tendrán", "tenga", "tenido", "tercera", "toda", "todas", "todavía", "todos", "total", "trata", "través", "tres", "tuvo", "usted", "varias", "varios", "veces", "ver", "vez", "y", "ya"].freeze
|
7
7
|
CONTRACTIONS = {
|
8
8
|
"al" => "a el",
|
@@ -2,7 +2,6 @@ module PragmaticTokenizer
|
|
2
2
|
module PreProcessor
|
3
3
|
|
4
4
|
def pre_process(language: Languages::Common)
|
5
|
-
remove_non_breaking_space!
|
6
5
|
shift_comma!
|
7
6
|
shift_multiple_dash!
|
8
7
|
shift_inverted_question_mark!
|
@@ -32,10 +31,6 @@ module PragmaticTokenizer
|
|
32
31
|
|
33
32
|
private
|
34
33
|
|
35
|
-
def remove_non_breaking_space!
|
36
|
-
gsub!(/\u{00A0}/, ''.freeze)
|
37
|
-
end
|
38
|
-
|
39
34
|
# Shift commas off everything but numbers
|
40
35
|
def shift_comma!
|
41
36
|
gsub!(/,(?!\d)/o, ' , '.freeze)
|
@@ -4,7 +4,7 @@ require 'pragmatic_tokenizer/pre_processor'
|
|
4
4
|
require 'pragmatic_tokenizer/post_processor'
|
5
5
|
require 'pragmatic_tokenizer/full_stop_separator'
|
6
6
|
require 'pragmatic_tokenizer/ending_punctuation_separator'
|
7
|
-
require '
|
7
|
+
require 'unicode_case_converter'
|
8
8
|
|
9
9
|
module PragmaticTokenizer
|
10
10
|
class Tokenizer
|
@@ -128,10 +128,11 @@ module PragmaticTokenizer
|
|
128
128
|
|
129
129
|
def post_process(text)
|
130
130
|
if downcase
|
131
|
-
@tokens = PostProcessor.new(text:
|
131
|
+
@tokens = PostProcessor.new(text: UnicodeCaseConverter::Converter.new(text).downcase, abbreviations: abbreviations, downcase: downcase).post_process
|
132
132
|
else
|
133
133
|
@tokens = PostProcessor.new(text: text, abbreviations: abbreviations, downcase: downcase).post_process
|
134
134
|
end
|
135
|
+
# downcase! if downcase
|
135
136
|
expand_contractions!(contractions) if expand_contractions
|
136
137
|
clean! if clean
|
137
138
|
classic_filter! if classic_filter
|
@@ -149,6 +150,10 @@ module PragmaticTokenizer
|
|
149
150
|
@tokens.reject(&:empty?)
|
150
151
|
end
|
151
152
|
|
153
|
+
def downcase!
|
154
|
+
@tokens.map! { |t| UnicodeCaseConverter::Converter.new(t).downcase }
|
155
|
+
end
|
156
|
+
|
152
157
|
def expand_contractions!(contractions)
|
153
158
|
@tokens = if downcase
|
154
159
|
@tokens.flat_map do |t|
|
@@ -162,11 +167,11 @@ module PragmaticTokenizer
|
|
162
167
|
end
|
163
168
|
else
|
164
169
|
@tokens.flat_map do |t|
|
165
|
-
if contractions.key?(
|
166
|
-
contractions[
|
170
|
+
if contractions.key?(UnicodeCaseConverter::Converter.new(t.gsub(/[‘’‚‛‹›'´`]/, "'")).downcase)
|
171
|
+
contractions[UnicodeCaseConverter::Converter.new(t.gsub(/[‘’‚‛‹›'´`]/, "'")).downcase]
|
167
172
|
.split(' ')
|
168
173
|
.each_with_index
|
169
|
-
.map { |token, i| i.eql?(0) ?
|
174
|
+
.map { |token, i| i.eql?(0) ? UnicodeCaseConverter::Converter.new(token).capitalize : token }
|
170
175
|
.flatten
|
171
176
|
else
|
172
177
|
t
|
@@ -191,7 +196,6 @@ module PragmaticTokenizer
|
|
191
196
|
.map { |t| t.gsub(/\u{00AD}/, '') }
|
192
197
|
.map { |t| t.gsub(/\A(-|–)/, '') }
|
193
198
|
.map { |t| t.gsub(/[®©]/, '') }
|
194
|
-
.map { |t| t.gsub(/\A\%/, '') }
|
195
199
|
.map { |t| t.gsub(/[\u{1F100}-\u{1F1FF}]/, '') }
|
196
200
|
.delete_if do |t|
|
197
201
|
t =~ /\A-+\z/ ||
|
@@ -215,7 +219,7 @@ module PragmaticTokenizer
|
|
215
219
|
if downcase
|
216
220
|
@tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") }
|
217
221
|
else
|
218
|
-
@tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(
|
222
|
+
@tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(UnicodeCaseConverter::Converter.new(t).downcase) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{UnicodeCaseConverter::Converter.new(t).downcase}.") }
|
219
223
|
end
|
220
224
|
when 'only'
|
221
225
|
@tokens.delete_if { |t| t =~ /\A\D+\z/ }
|
@@ -241,7 +245,7 @@ module PragmaticTokenizer
|
|
241
245
|
if downcase
|
242
246
|
@tokens -= stop_words
|
243
247
|
else
|
244
|
-
@tokens.delete_if { |t| stop_words.include?(
|
248
|
+
@tokens.delete_if { |t| stop_words.include?(UnicodeCaseConverter::Converter.new(t).downcase) }
|
245
249
|
end
|
246
250
|
end
|
247
251
|
|
data/pragmatic_tokenizer.gemspec
CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ["lib"]
|
20
20
|
|
21
|
-
spec.add_runtime_dependency "
|
21
|
+
spec.add_runtime_dependency "unicode_case_converter", "~> 0.4"
|
22
22
|
spec.add_development_dependency "bundler", "~> 1.9"
|
23
23
|
spec.add_development_dependency "rake", "~> 10.0"
|
24
24
|
spec.add_development_dependency "rspec"
|
@@ -1437,27 +1437,6 @@ describe PragmaticTokenizer do
|
|
1437
1437
|
)
|
1438
1438
|
expect(pt.tokenize).to eq(%w(tudow provides company users a way to offer each other and guests and interpreters free assistance to date there have been questions asked))
|
1439
1439
|
end
|
1440
|
-
|
1441
|
-
it 'removes non-breaking spaces' do
|
1442
|
-
text = "%20141201~221624 %User ID,JU,JU John %TU=00000362 %PT-BR %Wordfast da hello."
|
1443
|
-
pt = PragmaticTokenizer::Tokenizer.new(text,
|
1444
|
-
language: :en,
|
1445
|
-
filter_languages: [:en],
|
1446
|
-
clean: true,
|
1447
|
-
numbers: :none,
|
1448
|
-
minimum_length: 3,
|
1449
|
-
expand_contractions: true,
|
1450
|
-
remove_stop_words: true,
|
1451
|
-
punctuation: :none,
|
1452
|
-
remove_emails: true,
|
1453
|
-
remove_domains: true,
|
1454
|
-
remove_urls: true,
|
1455
|
-
hashtags: :remove,
|
1456
|
-
mentions: :remove,
|
1457
|
-
downcase: true
|
1458
|
-
)
|
1459
|
-
expect(pt.tokenize).to eq(["user", "john", "pt-br", "wordfast"])
|
1460
|
-
end
|
1461
1440
|
end
|
1462
1441
|
end
|
1463
1442
|
|
data/spec/performance_spec.rb
CHANGED
@@ -8,18 +8,21 @@ describe PragmaticTokenizer do
|
|
8
8
|
|
9
9
|
# it 'is fast?' do
|
10
10
|
# string = "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
11
|
+
# benchmark do
|
12
|
+
# 10.times do
|
13
|
+
# data = StackProf.run(mode: :cpu, interval: 1000) do
|
14
|
+
# PragmaticTokenizer::Tokenizer.new(string * 100,
|
15
|
+
# language: 'en',
|
16
|
+
# clean: true,
|
17
|
+
# remove_numbers: true,
|
18
|
+
# minimum_length: 3,
|
19
|
+
# expand_contractions: true,
|
20
|
+
# remove_stop_words: true
|
21
|
+
# ).tokenize
|
22
|
+
# end
|
23
|
+
# puts StackProf::Report.new(data).print_text
|
24
|
+
# end
|
21
25
|
# end
|
22
|
-
# puts StackProf::Report.new(data).print_text
|
23
26
|
# end
|
24
27
|
|
25
28
|
# 26.8
|
@@ -27,22 +30,21 @@ describe PragmaticTokenizer do
|
|
27
30
|
# 9.6
|
28
31
|
# 23.25
|
29
32
|
# 24.2
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
# end
|
33
|
+
it 'is fast? (long strings)' do
|
34
|
+
string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
|
35
|
+
puts "LENGTH: #{string.length}"
|
36
|
+
benchmark do
|
37
|
+
PragmaticTokenizer::Tokenizer.new(string,
|
38
|
+
language: 'en',
|
39
|
+
clean: true,
|
40
|
+
minimum_length: 3,
|
41
|
+
expand_contractions: true,
|
42
|
+
remove_stop_words: true,
|
43
|
+
numbers: :none,
|
44
|
+
punctuation: :none
|
45
|
+
).tokenize
|
46
|
+
end
|
47
|
+
end
|
46
48
|
|
47
49
|
# it 'is the baseline' do
|
48
50
|
# string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
@@ -11,19 +11,19 @@ cert_chain: []
|
|
11
11
|
date: 2016-01-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
|
-
name:
|
14
|
+
name: unicode_case_converter
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
19
|
+
version: '0.4'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
26
|
+
version: '0.4'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: bundler
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|