pragmatic_tokenizer 3.0.0 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in the public registry.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 241faea11370fc685c55a22eae88d9af30fa955c
-  data.tar.gz: 2daa6aa5bae004836538b4bd632067074782a87b
+  metadata.gz: 2f5112bf38a65d6cfc437fd9b735a5011479807d
+  data.tar.gz: 3963cf07246508f250bc2391fccf4245f1d275b0
 SHA512:
-  metadata.gz: 54f9fb11af6e42f4e35d6a878dae45e5fd0850793671ae7023dd8f8e17f7e307625f9b8497c5912b8c14312235150ad2cd19cf7f15fa693ac7cee427827677ef
-  data.tar.gz: 2ea45b90bfc8df8044ebab404e89e0936ce391e0d7e5206fc60e775ca65cb67af0fabe3ec787855b359ac0712bdece0eaefdb2dc9bcb0635c41fc0ad604a1bdf
+  metadata.gz: fc9fe3c9b9c6aca7ca355ac6a1000f973cdc4f1bd94babd27bad78add4b761cc030bbeddac8ad9deea3ab9e0c4849efa918f2808de198c288fecb7d58af0a4df
+  data.tar.gz: 4c28cb848d3ad5b7f29284b2f0a2d63bc9860a4d9c306d5f173af3f2c98f54241e5bf6a190380dd15e431f792860303aceea58460c58032a2579cc07b2df8cba
@@ -4,46 +4,60 @@ module PragmaticTokenizer
   # This class separates true full stops while ignoring
   # periods that are part of an abbreviation
   class FullStopSeparator
-    attr_reader :tokens, :abbreviations, :downcase
+
+    REGEXP_ENDS_WITH_DOT = /\A(.+)\.\z/
+    REGEXP_ONLY_LETTERS = /\A[a-z]\z/i
+    REGEXP_UNKNOWN1 = /[a-z](?:\.[a-z])+\z/i
+    REGEXP_UNKNOWN2 = /\A(.*\w)\.\z/
+    DOT = '.'.freeze
+
     def initialize(tokens:, abbreviations:, downcase:)
-      @tokens = tokens
+      @tokens        = tokens
       @abbreviations = abbreviations
-      @downcase = downcase
+      @downcase      = downcase
     end
 
     def separate
-      abbr = {}
-      abbreviations.each do |i|
-        abbr[i] = true
-      end
-      cleaned_tokens = []
-      tokens.each_with_index do |_t, i|
-        if tokens[i + 1] && tokens[i] =~ /\A(.+)\.\z/
-          w = Regexp.last_match(1)
-          if downcase
-            abbreviation = abbr[w]
-          else
-            abbreviation = abbr[Unicode.downcase(w)]
-          end
-          unless abbreviation || w =~ /\A[a-z]\z/i ||
-                 w =~ /[a-z](?:\.[a-z])+\z/i
-            cleaned_tokens << w
-            cleaned_tokens << '.'
-            next
+      create_cleaned_tokens
+      replace_last_token unless @cleaned_tokens.empty?
+      @cleaned_tokens
+    end
+
+    private
+
+    def create_cleaned_tokens
+      @cleaned_tokens = []
+      @tokens.each_with_index do |token, position|
+        if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
+          match = Regexp.last_match(1)
+          if unknown_method1(match)
+            @cleaned_tokens += [match, DOT]
+            next
+          end
         end
+        @cleaned_tokens << token
       end
-      cleaned_tokens << tokens[i]
     end
-      if downcase
-        abbreviation = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
-      else
-        abbreviation = abbreviations.include?(Unicode.downcase(cleaned_tokens[-1]).chomp(".")) unless cleaned_tokens[-1].nil?
+
+    def unknown_method1(token)
+      !abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_UNKNOWN1
+    end
+
+    def abbreviation?(token)
+      @abbreviations.include?(inverse_case(token))
     end
-      if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviation
-        cleaned_tokens[-1] = Regexp.last_match(1)
-        cleaned_tokens.push '.'
+
+    def inverse_case(token)
+      @downcase ? token : Unicode.downcase(token)
     end
-      cleaned_tokens
-    end
+
+    def replace_last_token
+      last_token = @cleaned_tokens[-1]
+      return if abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_UNKNOWN2
+      @cleaned_tokens[-1] = Regexp.last_match(1)
+      @cleaned_tokens << DOT
+    end
+
   end
+
 end
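Note: this hunk refactors the class that the file list under metadata below identifies as lib/pragmatic_tokenizer/full_stop_separator.rb: the inline regexes become frozen constants and the body of separate is split into private helpers, with no change to the public interface. A minimal usage sketch; the constructor keywords and separate come from the hunk itself, while the token values and the Set are invented for illustration:

    require 'pragmatic_tokenizer'
    require 'set'

    # "Mr." is in the abbreviation list, so only the sentence-final
    # period should be detached as its own token.
    separator = PragmaticTokenizer::FullStopSeparator.new(
      tokens:        ["Mr.", "Smith", "arrived."],
      abbreviations: Set.new(["mr"]),
      downcase:      false
    )
    separator.separate
    # expected => ["Mr.", "Smith", "arrived", "."]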
@@ -15,14 +15,25 @@ module PragmaticTokenizer
     EMOTICON_REGEX = /(?::|;|=)(?:-)?(?:\)|D|P)/
 
     class SingleQuotes
+
+      REGEXP_LEFT_QUOTES1 = /(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o
+      REGEXP_LEFT_QUOTES2 = /(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o
+      REGEXP_LEFT_QUOTES3 = /(\W|^)'(?=.*\w)/o
+      REGEXP_RIGHT_SIDE_QUOTES = /(\w|\D)'(?!')(?=\W|$)/o
+
       def handle_single_quotes(text)
         # Convert left quotes to special character except for 'Twas or 'twas
-        text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { Regexp.last_match(1) ? Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
-        text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { Regexp.last_match(1) ? Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
-        text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
-        # Separate right single quotes
-        text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+        replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+        text.gsub!(REGEXP_LEFT_QUOTES1, "\\1 #{replacement} ")
+        text.gsub!(REGEXP_LEFT_QUOTES3, ' ' << replacement)
+        text.gsub!(REGEXP_RIGHT_SIDE_QUOTES, "\\1 #{replacement} ")
+
+        replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘".freeze]
+        text.gsub!(REGEXP_LEFT_QUOTES2, "\\1 #{replacement} ")
+
+        text
       end
+
     end
   end
 end
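Note: the "|| text" tails on the removed lines existed because String#gsub! returns nil when the pattern does not match. The rewrite drops the per-match blocks in favor of "\1" backreference replacement strings, mutates text in place, and returns it once at the end. A sketch of the nil behavior, with invented strings:

    text = "no quotes here"
    text.gsub!(/'/, "☮")           # => nil; text is unchanged
    text = "don't"
    text.gsub!(/(n)'(t)/, '\1☮\2') # => "don☮t"; returns the mutated string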
@@ -95,16 +95,29 @@ module PragmaticTokenizer
         "will-o'-the-wisp" => "will-of-the-wisp",
         "'twas" => "it was"
       }.freeze
+
       class SingleQuotes
+
+        REGEXP_LEFT_QUOTES1 = /(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o
+        REGEXP_LEFT_QUOTES2 = /(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o
+        REGEXP_LEFT_QUOTES3 = /(\W|^)'(?=.*\w)/o
+        REGEXP_RIGHT_SIDE_QUOTES = /(\w|\D)'(?!')(?=\W|$)/o
+
         def handle_single_quotes(text)
           # Convert left quotes to special character except for 'Twas or 'twas
-          text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { Regexp.last_match(1) ? Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
-          text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { Regexp.last_match(1) ? Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
-          text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
-          # Separate right single quotes
-          text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+          text.gsub!(REGEXP_LEFT_QUOTES1, "\\1 #{replacement} ")
+          text.gsub!(REGEXP_LEFT_QUOTES3, ' ' << replacement)
+          text.gsub!(REGEXP_RIGHT_SIDE_QUOTES, "\\1 #{replacement} ")
+
+          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘".freeze]
+          text.gsub!(REGEXP_LEFT_QUOTES2, "\\1 #{replacement} ")
+
+          text
         end
+
       end
+
     end
   end
 end
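Note: this hunk applies the same rewrite to a second SingleQuotes class (the surrounding CONTRACTIONS entries suggest a language-specific rule set). One detail worth flagging: the expression ' ' << replacement appends to its receiver instead of allocating a third string the way + does, which is safe here only because the receiver is a fresh string literal on each call:

    replacement = "☮"
    prefix = ' '
    prefix << replacement  # => " ☮"; prefix itself now holds " ☮"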
@@ -7,11 +7,17 @@ module PragmaticTokenizer
     CONTRACTIONS = {}.freeze
 
     class SingleQuotes
+
+      REGEXP_UNKNOWN1 = /(\w|\D)'(?!')(?=\W|$)/o
+      REGEXP_UNKNOWN2 = /(\W|^)'(?=.*\w)/o
+
       def handle_single_quotes(text)
-        text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
-        text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
-        text.gsub!(/l\'/, '\1 l☮ \2') || text
-        text.gsub!(/L\'/, '\1 L☮ \2') || text
+        replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+        text.gsub!(REGEXP_UNKNOWN1, "\\1 #{replacement} ")
+        text.gsub!(REGEXP_UNKNOWN2, ' ' << replacement)
+        text.gsub!(/l\'/, '\1 l☮ \2')
+        text.gsub!(/L\'/, '\1 L☮ \2')
+        text
       end
     end
   end
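Note: in this hunk (the l'/L' elision handling suggests the French rules) the replacement '\1 l☮ \2' names capture groups that the pattern /l\'/ never defines. Ruby substitutes empty strings for backreferences to nonexistent groups, so the net effect is simply replacing l' with " l☮ ". A sketch with an invented string:

    "l'eau".gsub(/l\'/, '\1 l☮ \2')  # => " l☮ eau" (\1 and \2 expand to "")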
@@ -20,6 +20,7 @@ module PragmaticTokenizer
 
     REGEX_UNIFIED2 = Regexp.union(REGEXP_SINGLE_QUOTES,
                                   REGEXP_COMMAS)
+    REGEXP_UNKNOWN1 = /(?<=\S)([。.!!??]+)$/
 
     attr_reader :text, :abbreviations, :downcase
 
@@ -30,17 +31,21 @@ module PragmaticTokenizer
     end
 
     def post_process
-      EndingPunctuationSeparator.new(tokens: method_name3).separate
+      separate_ending_punctuation(method_name3)
     end
 
     private
 
     def method_name3
-      separated = EndingPunctuationSeparator.new(tokens: full_stop_separated_tokens).separate
+      separated = separate_ending_punctuation(full_stop_separated_tokens)
       procs = [unified1, split_unknown_period1, split_unknown_period2, split_emoji]
       procs.reduce(separated) { |a, e| a.flat_map(&e) }
     end
 
+    def separate_ending_punctuation(tokens)
+      tokens.flat_map { |token| token.split(REGEXP_UNKNOWN1) }
+    end
+
     def unified1
       proc { |token| token.split(REGEX_UNIFIED1) }
     end
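Note: the new separate_ending_punctuation helper leans on the fact that String#split keeps any delimiter text captured by a group in the pattern, which is what lets REGEXP_UNKNOWN1 (added in the previous hunk) detach trailing punctuation in a single pass:

    "done!?".split(/(?<=\S)([。.!!??]+)$/)  # => ["done", "!?"]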
@@ -91,11 +96,8 @@ module PragmaticTokenizer
     end
 
     def extract_abbreviation(token)
-      if downcase
-        token.split(/(\.)/)[0]
-      else
-        Unicode.downcase(token.split(/(\.)/)[0])
-      end
+      before_first_dot = token[0, token.index('.'.freeze)]
+      downcase ? before_first_dot : Unicode.downcase(before_first_dot)
     end
 
     def convert_sym_to_punct(token)
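Note: token[0, token.index('.'.freeze)] slices off everything before the first dot, replacing the throwaway array built by token.split(/(\.)/)[0]. It assumes the token contains a dot; String#index returns nil otherwise, and the slice would then raise a TypeError. A worked example with an invented token:

    token = "e.g."
    token.index('.')           # => 1
    token[0, token.index('.')] # => "e" (same result as token.split(/(\.)/)[0])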
@@ -59,15 +59,15 @@ module PragmaticTokenizer
     end
 
     def shift_horizontal_ellipsis!
-      gsub!(/(…+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
+      gsub!(/(…+)/o, ' \1 ')
     end
 
     def shift_ellipse_two_dots!
-      gsub!(/(\.\.+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
+      gsub!(/(\.\.+)/o, ' \1 ')
     end
 
     def shift_ellipse_three_dots!
-      gsub!(/(\.\.\.+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
+      gsub!(/(\.\.\.+)/o, ' \1 ')
     end
 
     def shift_no_space_mention!
@@ -98,11 +98,11 @@ module PragmaticTokenizer
     end
 
     def shift_bracket!
-      gsub!(/([\(\[\{\}\]\)])/o) { ' ' + Regexp.last_match(1) + ' '.freeze }
+      gsub!(/([\(\[\{\}\]\)])/o, ' \1 ')
     end
 
     def shift_semicolon!
-      gsub!(/([;])/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
+      gsub!(/([;])/o, ' \1 ')
     end
 
     def shift_percent!
@@ -138,7 +138,7 @@ module PragmaticTokenizer
 
     def replace_left_quotes!(style, replacement_key)
       replacement = replacement_for_key(replacement_key)
-      gsub!(/#{style}(?=.*\w)/o, ' '.freeze + replacement + ' '.freeze)
+      gsub!(/#{style}(?=.*\w)/o, ' ' << replacement << ' ')
     end
 
     def replace_remaining_double_quotes!
@@ -149,7 +149,7 @@ module PragmaticTokenizer
 
     def replace_remaining_quotes!(style, replacement_key)
       replacement = replacement_for_key(replacement_key)
-      gsub!(/#{style}/, ' '.freeze + replacement + ' '.freeze)
+      gsub!(/#{style}/, ' ' << replacement << ' ')
     end
 
     def convert_sgl_quotes!(language)
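Note: every change in the four pre-processor hunks above is the same mechanical rewrite: a block that rebuilt " match " by hand becomes a ' \1 ' replacement string (single-quoted, so the \1 reaches the regexp engine instead of being read as a string escape). The two forms produce identical output; the string form avoids a block invocation per match:

    "a...b".gsub(/(\.\.\.+)/o) { ' ' + Regexp.last_match(1) + ' ' }  # => "a ... b"
    "a...b".gsub(/(\.\.\.+)/o, ' \1 ')                               # => "a ... b"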
@@ -3,8 +3,8 @@ require 'pragmatic_tokenizer/languages'
 require 'pragmatic_tokenizer/pre_processor'
 require 'pragmatic_tokenizer/post_processor'
 require 'pragmatic_tokenizer/full_stop_separator'
-require 'pragmatic_tokenizer/ending_punctuation_separator'
 require 'unicode'
+require 'set'
 
 module PragmaticTokenizer
   class Tokenizer
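Note: the dropped require matches the file deleted at the bottom of this diff. Nothing in the visible hunks explains the added require 'set'; a plausible reading (an assumption, not confirmed here) is that abbreviation lists are now held in a Set, so the include? calls in FullStopSeparator become constant-time lookups:

    require 'set'
    abbreviations = Set.new(%w[mr mrs dr])
    abbreviations.include?('dr')  # => true; O(1) vs O(n) for an Array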
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "3.0.0".freeze
+  VERSION = "3.0.1".freeze
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.0
+  version: 3.0.1
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-02-14 00:00:00.000000000 Z
+date: 2016-02-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -111,7 +111,6 @@ files:
 - README.md
 - Rakefile
 - lib/pragmatic_tokenizer.rb
-- lib/pragmatic_tokenizer/ending_punctuation_separator.rb
 - lib/pragmatic_tokenizer/full_stop_separator.rb
 - lib/pragmatic_tokenizer/languages.rb
 - lib/pragmatic_tokenizer/languages/arabic.rb
@@ -1,31 +0,0 @@
-# -*- encoding : utf-8 -*-
-
-module PragmaticTokenizer
-  # This class separates ending punctuation from a token
-  class EndingPunctuationSeparator
-    attr_reader :tokens
-    def initialize(tokens:)
-      @tokens = tokens
-    end
-
-    def separate
-      cleaned_tokens = []
-      tokens.each do |a|
-        split_punctuation = a.scan(/(?<=\S)[。.!!??]+$/)
-        if split_punctuation[0].nil?
-          cleaned_tokens << a
-        else
-          cleaned_tokens << a.tr(split_punctuation[0], '')
-          if split_punctuation[0].length.eql?(1)
-            cleaned_tokens << split_punctuation[0]
-          else
-            split_punctuation[0].split("").each do |s|
-              cleaned_tokens << s
-            end
-          end
-        end
-      end
-      cleaned_tokens
-    end
-  end
-end
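Note: the deleted EndingPunctuationSeparator and its replacement, the post-processor's separate_ending_punctuation, are not exactly equivalent. The old code tr'd the matched punctuation out of the token (removing those characters anywhere in the string, not only at the end) and emitted a multi-character run one character at a time; the new one-liner splits once and keeps the run intact. A comparison with invented input:

    # Old behavior: "wow!!" -> ["wow", "!", "!"]
    # New behavior:
    "wow!!".split(/(?<=\S)([。.!!??]+)$/)  # => ["wow", "!!"]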