pragmatic_tokenizer 1.5.0 → 1.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +3 -3
- data/.rubocop_todo.yml +16 -15
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +15 -4
- data/lib/pragmatic_tokenizer/post_processor.rb +100 -51
- data/lib/pragmatic_tokenizer/tokenizer.rb +12 -9
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- data/spec/performance_spec.rb +12 -14
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b2372718412cd437eac22feb65b9bdae0798c26c
|
4
|
+
data.tar.gz: a9f6d1d84b00494232fa7b84a82de5c4cd8c7700
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2f7c7dad88d79b99de60a5073d4af18cbae5d6395b141425d76b3edc278e2260a7623abc1a3aa9f861593d6b935a06254d2248fc1408e223d01a0f56fd12c39d
|
7
|
+
data.tar.gz: 5973501489bd774e81eb68b9fc0b8e1b061a3f13d360f0cbc888975c44c43169f720efd9f76ce5f20f3e853248cdd4a013502e8076193f30169660bc3595a446
|
data/.rubocop.yml
CHANGED
@@ -49,9 +49,9 @@ Style/MultilineMethodCallIndentation:
|
|
49
49
|
EnforcedStyle: indented
|
50
50
|
IndentationWidth: 4
|
51
51
|
|
52
|
-
|
53
|
-
|
54
|
-
|
52
|
+
Style/MultilineOperationIndentation:
|
53
|
+
EnforcedStyle: indented
|
54
|
+
IndentationWidth: 4
|
55
55
|
|
56
56
|
# these are not continued (e.g. 'private'), so we keep them at 2 chars
|
57
57
|
Style/AccessModifierIndentation:
|
data/.rubocop_todo.yml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# This configuration was generated by
|
2
2
|
# `rubocop --auto-gen-config`
|
3
|
-
# on 2016-01-
|
3
|
+
# on 2016-01-24 21:09:34 +0100 using RuboCop version 0.36.0.
|
4
4
|
# The point is for the user to remove these configuration records
|
5
5
|
# one by one as the offenses are removed from the code base.
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
@@ -8,30 +8,30 @@
|
|
8
8
|
|
9
9
|
# Offense count: 11
|
10
10
|
Metrics/AbcSize:
|
11
|
-
Max:
|
11
|
+
Max: 118
|
12
12
|
|
13
13
|
# Offense count: 2
|
14
14
|
# Configuration parameters: CountComments.
|
15
15
|
Metrics/ClassLength:
|
16
|
-
Max:
|
16
|
+
Max: 218
|
17
17
|
|
18
|
-
# Offense count:
|
18
|
+
# Offense count: 7
|
19
19
|
Metrics/CyclomaticComplexity:
|
20
|
-
Max:
|
20
|
+
Max: 40
|
21
21
|
|
22
|
-
# Offense count:
|
22
|
+
# Offense count: 7
|
23
23
|
# Configuration parameters: CountComments.
|
24
24
|
Metrics/MethodLength:
|
25
25
|
Max: 57
|
26
26
|
|
27
|
-
# Offense count:
|
27
|
+
# Offense count: 2
|
28
28
|
# Configuration parameters: CountComments.
|
29
29
|
Metrics/ModuleLength:
|
30
|
-
Max:
|
30
|
+
Max: 140
|
31
31
|
|
32
32
|
# Offense count: 6
|
33
33
|
Metrics/PerceivedComplexity:
|
34
|
-
Max:
|
34
|
+
Max: 41
|
35
35
|
|
36
36
|
# Offense count: 4
|
37
37
|
# Cop supports --auto-correct.
|
@@ -43,10 +43,12 @@ Style/CommentIndentation:
|
|
43
43
|
Style/Documentation:
|
44
44
|
Enabled: false
|
45
45
|
|
46
|
-
# Offense count:
|
47
|
-
|
48
|
-
|
49
|
-
|
46
|
+
# Offense count: 17
|
47
|
+
# Cop supports --auto-correct.
|
48
|
+
# Configuration parameters: EnforcedStyle, SupportedStyles, IndentationWidth.
|
49
|
+
# SupportedStyles: aligned, indented
|
50
|
+
Style/MultilineOperationIndentation:
|
51
|
+
Enabled: false
|
50
52
|
|
51
53
|
# Offense count: 1
|
52
54
|
# Configuration parameters: SuspiciousParamNames.
|
@@ -55,12 +57,11 @@ Style/OptionHash:
|
|
55
57
|
Exclude:
|
56
58
|
- 'lib/pragmatic_tokenizer/tokenizer.rb'
|
57
59
|
|
58
|
-
# Offense count:
|
60
|
+
# Offense count: 3
|
59
61
|
# Cop supports --auto-correct.
|
60
62
|
# Configuration parameters: EnforcedStyle, SupportedStyles, AllowInnerSlashes.
|
61
63
|
# SupportedStyles: slashes, percent_r, mixed
|
62
64
|
Style/RegexpLiteral:
|
63
65
|
Exclude:
|
64
66
|
- 'lib/pragmatic_tokenizer/post_processor.rb'
|
65
|
-
- 'lib/pragmatic_tokenizer/pre_processor.rb'
|
66
67
|
- 'lib/pragmatic_tokenizer/tokenizer.rb'
|
@@ -4,10 +4,11 @@ module PragmaticTokenizer
|
|
4
4
|
# This class separates true full stops while ignoring
|
5
5
|
# periods that are part of an abbreviation
|
6
6
|
class FullStopSeparator
|
7
|
-
attr_reader :tokens, :abbreviations
|
8
|
-
def initialize(tokens:, abbreviations:)
|
7
|
+
attr_reader :tokens, :abbreviations, :downcase
|
8
|
+
def initialize(tokens:, abbreviations:, downcase:)
|
9
9
|
@tokens = tokens
|
10
10
|
@abbreviations = abbreviations
|
11
|
+
@downcase = downcase
|
11
12
|
end
|
12
13
|
|
13
14
|
def separate
|
@@ -19,7 +20,12 @@ module PragmaticTokenizer
|
|
19
20
|
tokens.each_with_index do |_t, i|
|
20
21
|
if tokens[i + 1] && tokens[i] =~ /\A(.+)\.\z/
|
21
22
|
w = Regexp.last_match(1)
|
22
|
-
|
23
|
+
if downcase
|
24
|
+
abbreviation = abbr[w]
|
25
|
+
else
|
26
|
+
abbreviation = abbr[Unicode.downcase(w)]
|
27
|
+
end
|
28
|
+
unless abbreviation || w =~ /\A[a-z]\z/i ||
|
23
29
|
w =~ /[a-z](?:\.[a-z])+\z/i
|
24
30
|
cleaned_tokens << w
|
25
31
|
cleaned_tokens << '.'
|
@@ -28,7 +34,12 @@ module PragmaticTokenizer
|
|
28
34
|
end
|
29
35
|
cleaned_tokens << tokens[i]
|
30
36
|
end
|
31
|
-
if
|
37
|
+
if downcase
|
38
|
+
abbreviation = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
|
39
|
+
else
|
40
|
+
abbreviation = abbreviations.include?(Unicode.downcase(cleaned_tokens[-1]).chomp(".")) unless cleaned_tokens[-1].nil?
|
41
|
+
end
|
42
|
+
if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviation
|
32
43
|
cleaned_tokens[-1] = Regexp.last_match(1)
|
33
44
|
cleaned_tokens.push '.'
|
34
45
|
end
|
@@ -1,66 +1,115 @@
|
|
1
1
|
module PragmaticTokenizer
|
2
2
|
class PostProcessor
|
3
3
|
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
REGEX_SYMBOL = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/.freeze
|
5
|
+
REGEXP_COMMAS = /^(,|‚)+/.freeze
|
6
|
+
REGEXP_SINGLE_QUOTES = /(.+)(’|'|‘|`)$/.freeze
|
7
|
+
REGEXP_SLASH = /^(?!(https?:|www\.))(.*)\/(.*)/.freeze
|
8
|
+
REGEXP_QUESTION_MARK = /^(?!(https?:|www\.))(.*)(\?)(.*)/.freeze
|
9
|
+
REGEXP_PLUS_SIGN = /(.+)\+(.+)/.freeze
|
10
|
+
REGEXP_COLON = /^(\:)(\S{2,})/.freeze
|
11
|
+
REGEXP_EMOJI = /(\u{2744}[\u{FE0E}|\u{FE0F}])/.freeze
|
12
|
+
|
13
|
+
REGEX_UNIFIED1 = Regexp.union(REGEXP_SLASH,
|
14
|
+
REGEXP_QUESTION_MARK,
|
15
|
+
REGEXP_PLUS_SIGN,
|
16
|
+
REGEXP_COLON,
|
17
|
+
REGEXP_EMOJI,
|
18
|
+
PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX,
|
19
|
+
PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX
|
20
|
+
).freeze
|
21
|
+
|
22
|
+
REGEX_UNIFIED2 = Regexp.union(REGEXP_SINGLE_QUOTES,
|
23
|
+
REGEXP_COMMAS
|
24
|
+
).freeze
|
25
|
+
|
26
|
+
attr_reader :text, :abbreviations, :downcase
|
27
|
+
|
28
|
+
def initialize(text:, abbreviations:, downcase:)
|
29
|
+
@text = text
|
7
30
|
@abbreviations = abbreviations
|
31
|
+
@downcase = downcase
|
8
32
|
end
|
9
33
|
|
10
34
|
def post_process
|
11
|
-
tokens
|
12
|
-
.flat_map { |t| (t[0] == '‚' || t[0] == ',') && t.length > 1 ? t.split(/(,|‚)/).flatten : t }
|
13
|
-
.flat_map { |t| (t[-1] == '’' || t[-1] == "'" || t[-1] == '‘' || t[-1] == '`') && t.length > 1 ? t.split(/(’|'|‘|`)/).flatten : t }
|
14
|
-
.map { |t| convert_sym_to_punct(t) }
|
15
|
-
full_stop_separated_tokens = FullStopSeparator.new(tokens: tokens, abbreviations: abbreviations).separate
|
16
|
-
EndingPunctuationSeparator.new(tokens: EndingPunctuationSeparator.new(tokens: full_stop_separated_tokens).separate.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
|
17
|
-
.flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub(/\?/, '\1 \2').split(' ').flatten : t }
|
18
|
-
.flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
|
19
|
-
.flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub(/\./, '\1 ').split(' ').flatten : t }
|
20
|
-
.flat_map { |t| t =~ /\A\:\S{2,}/ ? t.gsub(/\:/, ': ').split(' ').flatten : t }
|
21
|
-
.flat_map do |t|
|
22
|
-
(
|
23
|
-
if t.include?(".") &&
|
24
|
-
t !~ /(http|https|www)(\.|:)/ &&
|
25
|
-
t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
|
26
|
-
t !~ /\.[a-z]{2}/ &&
|
27
|
-
t.length > 2 &&
|
28
|
-
t !~ /\A[a-zA-Z]{1}\./ &&
|
29
|
-
t.count(".") == 1 &&
|
30
|
-
t !~ /\d+/ &&
|
31
|
-
!abbreviations.include?(Unicode.downcase(t.split(".")[0].nil? ? '' : t.split(".")[0])) &&
|
32
|
-
t !~ /\S+(@|@)\S+/
|
33
|
-
t.gsub(/\./, '\1 . \2').split(' ').flatten
|
34
|
-
else
|
35
|
-
t
|
36
|
-
end)
|
37
|
-
end
|
38
|
-
.flat_map do |t|
|
39
|
-
(
|
40
|
-
if t.include?(".") &&
|
41
|
-
t !~ /(http|https|www)(\.|:)/ &&
|
42
|
-
t.length > 1 &&
|
43
|
-
t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
|
44
|
-
t !~ /\S+(@|@)\S+/ &&
|
45
|
-
abbreviations.include?(Unicode.downcase(t.split(".")[0].nil? ? '' : t.split(".")[0]))
|
46
|
-
t.gsub(/\./, '\1. \2').split(' ').flatten
|
47
|
-
else
|
48
|
-
t
|
49
|
-
end)
|
50
|
-
end
|
51
|
-
.flat_map { |t| t =~ /\u{2744}\u{FE0F}/ ? t.gsub(/\u{2744}\u{FE0F}/, " \u{2744}\u{FE0F} ").split(' ').flatten : t }
|
52
|
-
.flat_map { |t| t =~ /\u{2744}\u{FE0E}/ ? t.gsub(/\u{2744}\u{FE0E}/, " \u{2744}\u{FE0E} ").split(' ').flatten : t }
|
53
|
-
.flat_map { |t| t =~ /(\A|\S)\u{2744}[^\u{FE0E}|\u{FE0F}]/ ? t.gsub(/\u{2744}/, " \u{2744} ").split(' ').flatten : t }
|
54
|
-
.flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
|
55
|
-
.flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
|
56
|
-
).separate
|
35
|
+
EndingPunctuationSeparator.new(tokens: method_name3).separate
|
57
36
|
end
|
58
37
|
|
59
38
|
private
|
60
39
|
|
40
|
+
def method_name3
|
41
|
+
separated = EndingPunctuationSeparator.new(tokens: full_stop_separated_tokens).separate
|
42
|
+
procs = [unified1, split_unknown_period1, split_unknown_period2, split_emoji]
|
43
|
+
procs.reduce(separated) { |a, e| a.flat_map(&e) }
|
44
|
+
end
|
45
|
+
|
46
|
+
def unified1
|
47
|
+
proc { |token| token.split(REGEX_UNIFIED1) }
|
48
|
+
end
|
49
|
+
|
50
|
+
def full_stop_separated_tokens
|
51
|
+
FullStopSeparator.new(tokens: split_and_convert_commas_and_quotes, abbreviations: abbreviations, downcase: downcase).separate
|
52
|
+
end
|
53
|
+
|
54
|
+
def split_and_convert_commas_and_quotes
|
55
|
+
text
|
56
|
+
.split
|
57
|
+
.flat_map { |token| token.split(REGEX_UNIFIED2) }
|
58
|
+
.flat_map { |token| convert_sym_to_punct(token) }
|
59
|
+
end
|
60
|
+
|
61
|
+
def split_emoji
|
62
|
+
proc { |token| (token =~ /(\A|\S)\u{2744}[^\u{FE0E}|\u{FE0F}]/) ? token.split(/(\u{2744})/) : token }
|
63
|
+
end
|
64
|
+
|
65
|
+
def split_unknown_period1
|
66
|
+
proc { |token| unknown_period1?(token) ? token.split(/(.*\.)/) : token }
|
67
|
+
end
|
68
|
+
|
69
|
+
def split_unknown_period2
|
70
|
+
proc { |token| unknown_period2?(token) ? token.split(/(\.)/) : token }
|
71
|
+
end
|
72
|
+
|
73
|
+
def unknown_period1?(token)
|
74
|
+
token.include?(".") &&
|
75
|
+
token !~ /(http|https|www)(\.|:)/ &&
|
76
|
+
token.length > 1 &&
|
77
|
+
token !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
|
78
|
+
token !~ /\S+(@|@)\S+/ &&
|
79
|
+
abbreviations.include?(extract_abbreviation(token))
|
80
|
+
end
|
81
|
+
|
82
|
+
def unknown_period2?(token)
|
83
|
+
token.include?(".") &&
|
84
|
+
token !~ /(http|https|www)(\.|:)/ &&
|
85
|
+
token !~ /\.(com|net|org|edu|gov|mil|int)/ &&
|
86
|
+
token !~ /\.[a-zA-Z]{2}(\s|\z)/ &&
|
87
|
+
token.length > 2 &&
|
88
|
+
token !~ /\A[a-zA-Z]{1}\./ &&
|
89
|
+
token.count(".") == 1 &&
|
90
|
+
token !~ /\d+/ &&
|
91
|
+
!abbreviations.include?(extract_abbreviation(token)) &&
|
92
|
+
token !~ /\S+(@|@)\S+/
|
93
|
+
end
|
94
|
+
|
95
|
+
def extract_abbreviation(token)
|
96
|
+
if downcase
|
97
|
+
token.split(/(\.)/)[0]
|
98
|
+
else
|
99
|
+
Unicode.downcase(token.split(/(\.)/)[0])
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
61
103
|
def convert_sym_to_punct(token)
|
62
|
-
symbol_matches =
|
63
|
-
symbol_matches.nil?
|
104
|
+
symbol_matches = REGEX_SYMBOL.match(token)
|
105
|
+
if symbol_matches.nil?
|
106
|
+
token
|
107
|
+
else
|
108
|
+
pattern = symbol_matches[0]
|
109
|
+
replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(pattern)
|
110
|
+
token.gsub!(pattern, replacement)
|
111
|
+
end
|
64
112
|
end
|
113
|
+
|
65
114
|
end
|
66
115
|
end
|
@@ -127,8 +127,11 @@ module PragmaticTokenizer
|
|
127
127
|
end
|
128
128
|
|
129
129
|
def post_process(text)
|
130
|
-
|
131
|
-
|
130
|
+
if downcase
|
131
|
+
@tokens = PostProcessor.new(text: Unicode.downcase(text), abbreviations: abbreviations, downcase: downcase).post_process
|
132
|
+
else
|
133
|
+
@tokens = PostProcessor.new(text: text, abbreviations: abbreviations, downcase: downcase).post_process
|
134
|
+
end
|
132
135
|
expand_contractions!(contractions) if expand_contractions
|
133
136
|
clean! if clean
|
134
137
|
classic_filter! if classic_filter
|
@@ -146,15 +149,11 @@ module PragmaticTokenizer
|
|
146
149
|
@tokens.reject(&:empty?)
|
147
150
|
end
|
148
151
|
|
149
|
-
def downcase!
|
150
|
-
@tokens.map! { |t| Unicode.downcase(t) }
|
151
|
-
end
|
152
|
-
|
153
152
|
def expand_contractions!(contractions)
|
154
153
|
@tokens = if downcase
|
155
154
|
@tokens.flat_map do |t|
|
156
|
-
if contractions.key?(
|
157
|
-
contractions[
|
155
|
+
if contractions.key?(t.gsub(/[‘’‚‛‹›'´`]/, "'"))
|
156
|
+
contractions[t.gsub(/[‘’‚‛‹›'´`]/, "'")]
|
158
157
|
.split(' ')
|
159
158
|
.flatten
|
160
159
|
else
|
@@ -212,7 +211,11 @@ module PragmaticTokenizer
|
|
212
211
|
when 'semi'
|
213
212
|
@tokens.delete_if { |t| t =~ /\A\d+\z/ }
|
214
213
|
when 'none'
|
215
|
-
|
214
|
+
if downcase
|
215
|
+
@tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") }
|
216
|
+
else
|
217
|
+
@tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode.downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode.downcase(t)}.") }
|
218
|
+
end
|
216
219
|
when 'only'
|
217
220
|
@tokens.delete_if { |t| t =~ /\A\D+\z/ }
|
218
221
|
end
|
data/spec/performance_spec.rb
CHANGED
@@ -8,21 +8,18 @@ describe PragmaticTokenizer do
|
|
8
8
|
|
9
9
|
# it 'is fast?' do
|
10
10
|
# string = "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
# ).tokenize
|
22
|
-
# end
|
23
|
-
# puts StackProf::Report.new(data).print_text
|
24
|
-
# end
|
11
|
+
# data = StackProf.run(mode: :cpu, interval: 1000) do
|
12
|
+
# PragmaticTokenizer::Tokenizer.new(string * 100,
|
13
|
+
# language: 'en',
|
14
|
+
# clean: true,
|
15
|
+
# minimum_length: 3,
|
16
|
+
# expand_contractions: true,
|
17
|
+
# remove_stop_words: true,
|
18
|
+
# numbers: :none,
|
19
|
+
# punctuation: :none
|
20
|
+
# ).tokenize
|
25
21
|
# end
|
22
|
+
# puts StackProf::Report.new(data).print_text
|
26
23
|
# end
|
27
24
|
|
28
25
|
# 26.8
|
@@ -30,6 +27,7 @@ describe PragmaticTokenizer do
|
|
30
27
|
# 9.6
|
31
28
|
# 23.25
|
32
29
|
# 24.2
|
30
|
+
# 23.2
|
33
31
|
# it 'is fast? (long strings)' do
|
34
32
|
# string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
|
35
33
|
# puts "LENGTH: #{string.length}"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.5.
|
4
|
+
version: 1.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-01-
|
11
|
+
date: 2016-01-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|