pragmatic_tokenizer 1.5.0 → 1.5.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 41ba3758c8ff32d83451b66e0b28ca8f33248843
4
- data.tar.gz: 1f96d2fbf5a7a66a3031631b58a9b91b77991fc5
3
+ metadata.gz: b2372718412cd437eac22feb65b9bdae0798c26c
4
+ data.tar.gz: a9f6d1d84b00494232fa7b84a82de5c4cd8c7700
5
5
  SHA512:
6
- metadata.gz: 7e62958fbf69b55d62c00e391a9c6ed8da4c55c0336e0160c172363198d3a1a711dbdc6310a94e54a9a65316b9c34913a886b92ee7eb6afe78530556a185663b
7
- data.tar.gz: 9fbb6d481494ef235fd1d74b9c34833bb96887c29d530c28add6ee3e5109379b051f2a7520208f9385d686e0ec2ba8c655c849dfb43a6d7b850c3ec0092b526f
6
+ metadata.gz: 2f7c7dad88d79b99de60a5073d4af18cbae5d6395b141425d76b3edc278e2260a7623abc1a3aa9f861593d6b935a06254d2248fc1408e223d01a0f56fd12c39d
7
+ data.tar.gz: 5973501489bd774e81eb68b9fc0b8e1b061a3f13d360f0cbc888975c44c43169f720efd9f76ce5f20f3e853248cdd4a013502e8076193f30169660bc3595a446
data/.rubocop.yml CHANGED
@@ -49,9 +49,9 @@ Style/MultilineMethodCallIndentation:
49
49
  EnforcedStyle: indented
50
50
  IndentationWidth: 4
51
51
 
52
- # unsure how we'd want it
53
- #Style/MultilineOperationIndentation:
54
- # IndentationWidth: 4
52
+ Style/MultilineOperationIndentation:
53
+ EnforcedStyle: indented
54
+ IndentationWidth: 4
55
55
 
56
56
  # these are not continued (e.g. 'private'), so we keep them at 2 chars
57
57
  Style/AccessModifierIndentation:
data/.rubocop_todo.yml CHANGED
@@ -1,6 +1,6 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2016-01-23 03:18:41 +0100 using RuboCop version 0.36.0.
3
+ # on 2016-01-24 21:09:34 +0100 using RuboCop version 0.36.0.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
@@ -8,30 +8,30 @@
8
8
 
9
9
  # Offense count: 11
10
10
  Metrics/AbcSize:
11
- Max: 137
11
+ Max: 118
12
12
 
13
13
  # Offense count: 2
14
14
  # Configuration parameters: CountComments.
15
15
  Metrics/ClassLength:
16
- Max: 214
16
+ Max: 218
17
17
 
18
- # Offense count: 9
18
+ # Offense count: 7
19
19
  Metrics/CyclomaticComplexity:
20
- Max: 41
20
+ Max: 40
21
21
 
22
- # Offense count: 8
22
+ # Offense count: 7
23
23
  # Configuration parameters: CountComments.
24
24
  Metrics/MethodLength:
25
25
  Max: 57
26
26
 
27
- # Offense count: 1
27
+ # Offense count: 2
28
28
  # Configuration parameters: CountComments.
29
29
  Metrics/ModuleLength:
30
- Max: 134
30
+ Max: 140
31
31
 
32
32
  # Offense count: 6
33
33
  Metrics/PerceivedComplexity:
34
- Max: 43
34
+ Max: 41
35
35
 
36
36
  # Offense count: 4
37
37
  # Cop supports --auto-correct.
@@ -43,10 +43,12 @@ Style/CommentIndentation:
43
43
  Style/Documentation:
44
44
  Enabled: false
45
45
 
46
- # Offense count: 2
47
- Style/MultilineBlockChain:
48
- Exclude:
49
- - 'lib/pragmatic_tokenizer/post_processor.rb'
46
+ # Offense count: 17
47
+ # Cop supports --auto-correct.
48
+ # Configuration parameters: EnforcedStyle, SupportedStyles, IndentationWidth.
49
+ # SupportedStyles: aligned, indented
50
+ Style/MultilineOperationIndentation:
51
+ Enabled: false
50
52
 
51
53
  # Offense count: 1
52
54
  # Configuration parameters: SuspiciousParamNames.
@@ -55,12 +57,11 @@ Style/OptionHash:
55
57
  Exclude:
56
58
  - 'lib/pragmatic_tokenizer/tokenizer.rb'
57
59
 
58
- # Offense count: 4
60
+ # Offense count: 3
59
61
  # Cop supports --auto-correct.
60
62
  # Configuration parameters: EnforcedStyle, SupportedStyles, AllowInnerSlashes.
61
63
  # SupportedStyles: slashes, percent_r, mixed
62
64
  Style/RegexpLiteral:
63
65
  Exclude:
64
66
  - 'lib/pragmatic_tokenizer/post_processor.rb'
65
- - 'lib/pragmatic_tokenizer/pre_processor.rb'
66
67
  - 'lib/pragmatic_tokenizer/tokenizer.rb'
@@ -4,10 +4,11 @@ module PragmaticTokenizer
4
4
  # This class separates true full stops while ignoring
5
5
  # periods that are part of an abbreviation
6
6
  class FullStopSeparator
7
- attr_reader :tokens, :abbreviations
8
- def initialize(tokens:, abbreviations:)
7
+ attr_reader :tokens, :abbreviations, :downcase
8
+ def initialize(tokens:, abbreviations:, downcase:)
9
9
  @tokens = tokens
10
10
  @abbreviations = abbreviations
11
+ @downcase = downcase
11
12
  end
12
13
 
13
14
  def separate
@@ -19,7 +20,12 @@ module PragmaticTokenizer
19
20
  tokens.each_with_index do |_t, i|
20
21
  if tokens[i + 1] && tokens[i] =~ /\A(.+)\.\z/
21
22
  w = Regexp.last_match(1)
22
- unless abbr[Unicode.downcase(w)] || w =~ /\A[a-z]\z/i ||
23
+ if downcase
24
+ abbreviation = abbr[w]
25
+ else
26
+ abbreviation = abbr[Unicode.downcase(w)]
27
+ end
28
+ unless abbreviation || w =~ /\A[a-z]\z/i ||
23
29
  w =~ /[a-z](?:\.[a-z])+\z/i
24
30
  cleaned_tokens << w
25
31
  cleaned_tokens << '.'
@@ -28,7 +34,12 @@ module PragmaticTokenizer
28
34
  end
29
35
  cleaned_tokens << tokens[i]
30
36
  end
31
- if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviations.include?(Unicode.downcase(cleaned_tokens[-1]).chomp("."))
37
+ if downcase
38
+ abbreviation = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
39
+ else
40
+ abbreviation = abbreviations.include?(Unicode.downcase(cleaned_tokens[-1]).chomp(".")) unless cleaned_tokens[-1].nil?
41
+ end
42
+ if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviation
32
43
  cleaned_tokens[-1] = Regexp.last_match(1)
33
44
  cleaned_tokens.push '.'
34
45
  end
@@ -1,66 +1,115 @@
1
1
  module PragmaticTokenizer
2
2
  class PostProcessor
3
3
 
4
- attr_reader :text, :abbreviations
5
- def initialize(text:, abbreviations:)
6
- @text = text
4
+ REGEX_SYMBOL = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/.freeze
5
+ REGEXP_COMMAS = /^(,|‚)+/.freeze
6
+ REGEXP_SINGLE_QUOTES = /(.+)(’|'|‘|`)$/.freeze
7
+ REGEXP_SLASH = /^(?!(https?:|www\.))(.*)\/(.*)/.freeze
8
+ REGEXP_QUESTION_MARK = /^(?!(https?:|www\.))(.*)(\?)(.*)/.freeze
9
+ REGEXP_PLUS_SIGN = /(.+)\+(.+)/.freeze
10
+ REGEXP_COLON = /^(\:)(\S{2,})/.freeze
11
+ REGEXP_EMOJI = /(\u{2744}[\u{FE0E}|\u{FE0F}])/.freeze
12
+
13
+ REGEX_UNIFIED1 = Regexp.union(REGEXP_SLASH,
14
+ REGEXP_QUESTION_MARK,
15
+ REGEXP_PLUS_SIGN,
16
+ REGEXP_COLON,
17
+ REGEXP_EMOJI,
18
+ PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX,
19
+ PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX
20
+ ).freeze
21
+
22
+ REGEX_UNIFIED2 = Regexp.union(REGEXP_SINGLE_QUOTES,
23
+ REGEXP_COMMAS
24
+ ).freeze
25
+
26
+ attr_reader :text, :abbreviations, :downcase
27
+
28
+ def initialize(text:, abbreviations:, downcase:)
29
+ @text = text
7
30
  @abbreviations = abbreviations
31
+ @downcase = downcase
8
32
  end
9
33
 
10
34
  def post_process
11
- tokens = text.split
12
- .flat_map { |t| (t[0] == '‚' || t[0] == ',') && t.length > 1 ? t.split(/(,|‚)/).flatten : t }
13
- .flat_map { |t| (t[-1] == '’' || t[-1] == "'" || t[-1] == '‘' || t[-1] == '`') && t.length > 1 ? t.split(/(’|'|‘|`)/).flatten : t }
14
- .map { |t| convert_sym_to_punct(t) }
15
- full_stop_separated_tokens = FullStopSeparator.new(tokens: tokens, abbreviations: abbreviations).separate
16
- EndingPunctuationSeparator.new(tokens: EndingPunctuationSeparator.new(tokens: full_stop_separated_tokens).separate.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
17
- .flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub(/\?/, '\1 \2').split(' ').flatten : t }
18
- .flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
19
- .flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub(/\./, '\1 ').split(' ').flatten : t }
20
- .flat_map { |t| t =~ /\A\:\S{2,}/ ? t.gsub(/\:/, ': ').split(' ').flatten : t }
21
- .flat_map do |t|
22
- (
23
- if t.include?(".") &&
24
- t !~ /(http|https|www)(\.|:)/ &&
25
- t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
26
- t !~ /\.[a-z]{2}/ &&
27
- t.length > 2 &&
28
- t !~ /\A[a-zA-Z]{1}\./ &&
29
- t.count(".") == 1 &&
30
- t !~ /\d+/ &&
31
- !abbreviations.include?(Unicode.downcase(t.split(".")[0].nil? ? '' : t.split(".")[0])) &&
32
- t !~ /\S+(@|@)\S+/
33
- t.gsub(/\./, '\1 . \2').split(' ').flatten
34
- else
35
- t
36
- end)
37
- end
38
- .flat_map do |t|
39
- (
40
- if t.include?(".") &&
41
- t !~ /(http|https|www)(\.|:)/ &&
42
- t.length > 1 &&
43
- t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
44
- t !~ /\S+(@|@)\S+/ &&
45
- abbreviations.include?(Unicode.downcase(t.split(".")[0].nil? ? '' : t.split(".")[0]))
46
- t.gsub(/\./, '\1. \2').split(' ').flatten
47
- else
48
- t
49
- end)
50
- end
51
- .flat_map { |t| t =~ /\u{2744}\u{FE0F}/ ? t.gsub(/\u{2744}\u{FE0F}/, " \u{2744}\u{FE0F} ").split(' ').flatten : t }
52
- .flat_map { |t| t =~ /\u{2744}\u{FE0E}/ ? t.gsub(/\u{2744}\u{FE0E}/, " \u{2744}\u{FE0E} ").split(' ').flatten : t }
53
- .flat_map { |t| t =~ /(\A|\S)\u{2744}[^\u{FE0E}|\u{FE0F}]/ ? t.gsub(/\u{2744}/, " \u{2744} ").split(' ').flatten : t }
54
- .flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
55
- .flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
56
- ).separate
35
+ EndingPunctuationSeparator.new(tokens: method_name3).separate
57
36
  end
58
37
 
59
38
  private
60
39
 
40
+ def method_name3
41
+ separated = EndingPunctuationSeparator.new(tokens: full_stop_separated_tokens).separate
42
+ procs = [unified1, split_unknown_period1, split_unknown_period2, split_emoji]
43
+ procs.reduce(separated) { |a, e| a.flat_map(&e) }
44
+ end
45
+
46
+ def unified1
47
+ proc { |token| token.split(REGEX_UNIFIED1) }
48
+ end
49
+
50
+ def full_stop_separated_tokens
51
+ FullStopSeparator.new(tokens: split_and_convert_commas_and_quotes, abbreviations: abbreviations, downcase: downcase).separate
52
+ end
53
+
54
+ def split_and_convert_commas_and_quotes
55
+ text
56
+ .split
57
+ .flat_map { |token| token.split(REGEX_UNIFIED2) }
58
+ .flat_map { |token| convert_sym_to_punct(token) }
59
+ end
60
+
61
+ def split_emoji
62
+ proc { |token| (token =~ /(\A|\S)\u{2744}[^\u{FE0E}|\u{FE0F}]/) ? token.split(/(\u{2744})/) : token }
63
+ end
64
+
65
+ def split_unknown_period1
66
+ proc { |token| unknown_period1?(token) ? token.split(/(.*\.)/) : token }
67
+ end
68
+
69
+ def split_unknown_period2
70
+ proc { |token| unknown_period2?(token) ? token.split(/(\.)/) : token }
71
+ end
72
+
73
+ def unknown_period1?(token)
74
+ token.include?(".") &&
75
+ token !~ /(http|https|www)(\.|:)/ &&
76
+ token.length > 1 &&
77
+ token !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
78
+ token !~ /\S+(@|@)\S+/ &&
79
+ abbreviations.include?(extract_abbreviation(token))
80
+ end
81
+
82
+ def unknown_period2?(token)
83
+ token.include?(".") &&
84
+ token !~ /(http|https|www)(\.|:)/ &&
85
+ token !~ /\.(com|net|org|edu|gov|mil|int)/ &&
86
+ token !~ /\.[a-zA-Z]{2}(\s|\z)/ &&
87
+ token.length > 2 &&
88
+ token !~ /\A[a-zA-Z]{1}\./ &&
89
+ token.count(".") == 1 &&
90
+ token !~ /\d+/ &&
91
+ !abbreviations.include?(extract_abbreviation(token)) &&
92
+ token !~ /\S+(@|@)\S+/
93
+ end
94
+
95
+ def extract_abbreviation(token)
96
+ if downcase
97
+ token.split(/(\.)/)[0]
98
+ else
99
+ Unicode.downcase(token.split(/(\.)/)[0])
100
+ end
101
+ end
102
+
61
103
  def convert_sym_to_punct(token)
62
- symbol_matches = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/.match(token)
63
- symbol_matches.nil? ? token : token.gsub!(symbol_matches[0], PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(symbol_matches[0]))
104
+ symbol_matches = REGEX_SYMBOL.match(token)
105
+ if symbol_matches.nil?
106
+ token
107
+ else
108
+ pattern = symbol_matches[0]
109
+ replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(pattern)
110
+ token.gsub!(pattern, replacement)
111
+ end
64
112
  end
113
+
65
114
  end
66
115
  end
@@ -127,8 +127,11 @@ module PragmaticTokenizer
127
127
  end
128
128
 
129
129
  def post_process(text)
130
- @tokens = PostProcessor.new(text: text, abbreviations: abbreviations).post_process
131
- downcase! if downcase
130
+ if downcase
131
+ @tokens = PostProcessor.new(text: Unicode.downcase(text), abbreviations: abbreviations, downcase: downcase).post_process
132
+ else
133
+ @tokens = PostProcessor.new(text: text, abbreviations: abbreviations, downcase: downcase).post_process
134
+ end
132
135
  expand_contractions!(contractions) if expand_contractions
133
136
  clean! if clean
134
137
  classic_filter! if classic_filter
@@ -146,15 +149,11 @@ module PragmaticTokenizer
146
149
  @tokens.reject(&:empty?)
147
150
  end
148
151
 
149
- def downcase!
150
- @tokens.map! { |t| Unicode.downcase(t) }
151
- end
152
-
153
152
  def expand_contractions!(contractions)
154
153
  @tokens = if downcase
155
154
  @tokens.flat_map do |t|
156
- if contractions.key?(Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'")))
157
- contractions[Unicode.downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))]
155
+ if contractions.key?(t.gsub(/[‘’‚‛‹›'´`]/, "'"))
156
+ contractions[t.gsub(/[‘’‚‛‹›'´`]/, "'")]
158
157
  .split(' ')
159
158
  .flatten
160
159
  else
@@ -212,7 +211,11 @@ module PragmaticTokenizer
212
211
  when 'semi'
213
212
  @tokens.delete_if { |t| t =~ /\A\d+\z/ }
214
213
  when 'none'
215
- @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode.downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode.downcase(t)}.") }
214
+ if downcase
215
+ @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") }
216
+ else
217
+ @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode.downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode.downcase(t)}.") }
218
+ end
216
219
  when 'only'
217
220
  @tokens.delete_if { |t| t =~ /\A\D+\z/ }
218
221
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "1.5.0".freeze
2
+ VERSION = "1.5.1".freeze
3
3
  end
@@ -8,21 +8,18 @@ describe PragmaticTokenizer do
8
8
 
9
9
  # it 'is fast?' do
10
10
  # string = "Hello World. My name is Jonas. What is your name? My name is Jonas. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ."
11
- # benchmark do
12
- # 10.times do
13
- # data = StackProf.run(mode: :cpu, interval: 1000) do
14
- # PragmaticTokenizer::Tokenizer.new(string * 100,
15
- # language: 'en',
16
- # clean: true,
17
- # remove_numbers: true,
18
- # minimum_length: 3,
19
- # expand_contractions: true,
20
- # remove_stop_words: true
21
- # ).tokenize
22
- # end
23
- # puts StackProf::Report.new(data).print_text
24
- # end
11
+ # data = StackProf.run(mode: :cpu, interval: 1000) do
12
+ # PragmaticTokenizer::Tokenizer.new(string * 100,
13
+ # language: 'en',
14
+ # clean: true,
15
+ # minimum_length: 3,
16
+ # expand_contractions: true,
17
+ # remove_stop_words: true,
18
+ # numbers: :none,
19
+ # punctuation: :none
20
+ # ).tokenize
25
21
  # end
22
+ # puts StackProf::Report.new(data).print_text
26
23
  # end
27
24
 
28
25
  # 26.8
@@ -30,6 +27,7 @@ describe PragmaticTokenizer do
30
27
  # 9.6
31
28
  # 23.25
32
29
  # 24.2
30
+ # 23.2
33
31
  # it 'is fast? (long strings)' do
34
32
  # string = "Hello World. My name is Jonas. What is your name? My name is Jonas IV Smith. There it is! I found it. My name is Jonas E. Smith. Please turn to p. 55. Were Jane and co. at the party? They closed the deal with Pitt, Briggs & Co. at noon. Let's ask Jane and co. They should know. They closed the deal with Pitt, Briggs & Co. It closed yesterday. I can't see Mt. Fuji from here. St. Michael's Church is on 5th st. near the light. That is JFK Jr.'s book. I visited the U.S.A. last year. I live in the E.U. How about you? I live in the U.S. How about you? I work for the U.S. Government in Virginia. I have lived in the U.S. for 20 years. She has $100.00 in her bag. She has $100.00. It is in her bag. He teaches science (He previously worked for 5 years as an engineer.) at the local University. Her email is Jane.Doe@example.com. I sent her an email. The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. She turned to him, 'This is great.' she said. She turned to him, \"This is great.\" she said. She turned to him, \"This is great.\" She held the book out to show him. Hello!! Long time no see. Hello?? Who is there? Hello!? Is that you? Hello?! Is that you? 1.) The first item 2.) The second item 1.) The first item. 2.) The second item. 1) The first item 2) The second item 1) The first item. 2) The second item. 1. The first item 2. The second item 1. The first item. 2. The second item. • 9. The first item • 10. The second item ⁃9. The first item ⁃10. The second item a. The first item b. The second item c. The third list item This is a sentence\ncut off in the middle because pdf. It was a cold \nnight in the city. features\ncontact manager\nevents, activities\n You can find it at N°. 1026.253.553. That is where the treasure is. She works at Yahoo! in the accounting department. We make a good team, you and I. Did you see Albert I. Jones yesterday? Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” \"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55). If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. I never meant that.... She left the store. I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . ." * 1000
35
33
  # puts "LENGTH: #{string.length}"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.0
4
+ version: 1.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-24 00:00:00.000000000 Z
11
+ date: 2016-01-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode