pragmatic_tokenizer 3.0.0 → 3.0.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 241faea11370fc685c55a22eae88d9af30fa955c
- data.tar.gz: 2daa6aa5bae004836538b4bd632067074782a87b
+ metadata.gz: 2f5112bf38a65d6cfc437fd9b735a5011479807d
+ data.tar.gz: 3963cf07246508f250bc2391fccf4245f1d275b0
  SHA512:
- metadata.gz: 54f9fb11af6e42f4e35d6a878dae45e5fd0850793671ae7023dd8f8e17f7e307625f9b8497c5912b8c14312235150ad2cd19cf7f15fa693ac7cee427827677ef
- data.tar.gz: 2ea45b90bfc8df8044ebab404e89e0936ce391e0d7e5206fc60e775ca65cb67af0fabe3ec787855b359ac0712bdece0eaefdb2dc9bcb0635c41fc0ad604a1bdf
+ metadata.gz: fc9fe3c9b9c6aca7ca355ac6a1000f973cdc4f1bd94babd27bad78add4b761cc030bbeddac8ad9deea3ab9e0c4849efa918f2808de198c288fecb7d58af0a4df
+ data.tar.gz: 4c28cb848d3ad5b7f29284b2f0a2d63bc9860a4d9c306d5f173af3f2c98f54241e5bf6a190380dd15e431f792860303aceea58460c58032a2579cc07b2df8cba
lib/pragmatic_tokenizer/full_stop_separator.rb CHANGED
@@ -4,46 +4,60 @@ module PragmaticTokenizer
  # This class separates true full stops while ignoring
  # periods that are part of an abbreviation
  class FullStopSeparator
- attr_reader :tokens, :abbreviations, :downcase
+
+ REGEXP_ENDS_WITH_DOT = /\A(.+)\.\z/
+ REGEXP_ONLY_LETTERS = /\A[a-z]\z/i
+ REGEXP_UNKNOWN1 = /[a-z](?:\.[a-z])+\z/i
+ REGEXP_UNKNOWN2 = /\A(.*\w)\.\z/
+ DOT = '.'.freeze
+
  def initialize(tokens:, abbreviations:, downcase:)
- @tokens = tokens
+ @tokens = tokens
  @abbreviations = abbreviations
- @downcase = downcase
+ @downcase = downcase
  end

  def separate
- abbr = {}
- abbreviations.each do |i|
- abbr[i] = true
- end
- cleaned_tokens = []
- tokens.each_with_index do |_t, i|
- if tokens[i + 1] && tokens[i] =~ /\A(.+)\.\z/
- w = Regexp.last_match(1)
- if downcase
- abbreviation = abbr[w]
- else
- abbreviation = abbr[Unicode.downcase(w)]
- end
- unless abbreviation || w =~ /\A[a-z]\z/i ||
- w =~ /[a-z](?:\.[a-z])+\z/i
- cleaned_tokens << w
- cleaned_tokens << '.'
- next
+ create_cleaned_tokens
+ replace_last_token unless @cleaned_tokens.empty?
+ @cleaned_tokens
+ end
+
+ private
+
+ def create_cleaned_tokens
+ @cleaned_tokens = []
+ @tokens.each_with_index do |token, position|
+ if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
+ match = Regexp.last_match(1)
+ if unknown_method1(match)
+ @cleaned_tokens += [match, DOT]
+ next
+ end
  end
+ @cleaned_tokens << token
  end
- cleaned_tokens << tokens[i]
  end
- if downcase
- abbreviation = abbreviations.include?(cleaned_tokens[-1].chomp(".")) unless cleaned_tokens[-1].nil?
- else
- abbreviation = abbreviations.include?(Unicode.downcase(cleaned_tokens[-1]).chomp(".")) unless cleaned_tokens[-1].nil?
+
+ def unknown_method1(token)
+ !abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_UNKNOWN1
+ end
+
+ def abbreviation?(token)
+ @abbreviations.include?(inverse_case(token))
  end
- if cleaned_tokens[-1] && cleaned_tokens[-1] =~ /\A(.*\w)\.\z/ && !abbreviation
- cleaned_tokens[-1] = Regexp.last_match(1)
- cleaned_tokens.push '.'
+
+ def inverse_case(token)
+ @downcase ? token : Unicode.downcase(token)
  end
- cleaned_tokens
- end
+
+ def replace_last_token
+ last_token = @cleaned_tokens[-1]
+ return if abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_UNKNOWN2
+ @cleaned_tokens[-1] = Regexp.last_match(1)
+ @cleaned_tokens << DOT
+ end
+
  end
+
  end
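
A minimal usage sketch of the refactored class (illustrative values, not from the diff; any collection responding to include? works for abbreviations:, such as the Set implied by the require 'set' change further down):

    require 'set'
    require 'pragmatic_tokenizer/full_stop_separator'

    # With downcase: true the tokens are assumed to be downcased already,
    # so inverse_case returns them untouched and Unicode.downcase is skipped.
    separator = PragmaticTokenizer::FullStopSeparator.new(
      tokens:        ["mr.", "smith", "arrived."],
      abbreviations: Set.new(["mr"]),
      downcase:      true
    )
    separator.separate # => ["mr.", "smith", "arrived", "."]

replace_last_token splits the sentence-final period off "arrived.", while "mr." survives intact because abbreviation?("mr") is true.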
@@ -15,14 +15,25 @@ module PragmaticTokenizer
  EMOTICON_REGEX = /(?::|;|=)(?:-)?(?:\)|D|P)/

  class SingleQuotes
+
+ REGEXP_LEFT_QUOTES1 = /(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o
+ REGEXP_LEFT_QUOTES2 = /(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o
+ REGEXP_LEFT_QUOTES3 = /(\W|^)'(?=.*\w)/o
+ REGEXP_RIGHT_SIDE_QUOTES = /(\w|\D)'(?!')(?=\W|$)/o
+
  def handle_single_quotes(text)
  # Convert left quotes to special character except for 'Twas or 'twas
- text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { Regexp.last_match(1) ? Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
- text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { Regexp.last_match(1) ? Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
- text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
- # Separate right single quotes
- text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+ replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+ text.gsub!(REGEXP_LEFT_QUOTES1, "\\1 #{replacement} ")
+ text.gsub!(REGEXP_LEFT_QUOTES3, ' ' << replacement)
+ text.gsub!(REGEXP_RIGHT_SIDE_QUOTES, "\\1 #{replacement} ")
+
+ replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘".freeze]
+ text.gsub!(REGEXP_LEFT_QUOTES2, "\\1 #{replacement} ")
+
+ text
  end
+
  end
  end
  end
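
The rewrite leans on Ruby's replacement-string syntax: in a double-quoted replacement, "\\1" reinserts the first capture, which is what the old blocks rebuilt by hand from Regexp.last_match(1). The old trailing || text guards existed only because String#gsub! returns nil when nothing matches; the new code ignores the gsub! return values and returns text explicitly. A standalone check of the equivalence, with ☮ standing in for whatever PUNCTUATION_MAP maps the quote to:

    regex = /(\w|\D)'(?!')(?=\W|$)/
    a = "the dogs' bone".gsub(regex) { Regexp.last_match(1) + ' ' + '☮' + ' ' }
    b = "the dogs' bone".gsub(regex, "\\1 ☮ ")
    a == b # => true, both "the dogs ☮  bone"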
@@ -95,16 +95,29 @@ module PragmaticTokenizer
  "will-o'-the-wisp" => "will-of-the-wisp",
  "'twas" => "it was"
  }.freeze
+
  class SingleQuotes
+
+ REGEXP_LEFT_QUOTES1 = /(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o
+ REGEXP_LEFT_QUOTES2 = /(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o
+ REGEXP_LEFT_QUOTES3 = /(\W|^)'(?=.*\w)/o
+ REGEXP_RIGHT_SIDE_QUOTES = /(\w|\D)'(?!')(?=\W|$)/o
+
  def handle_single_quotes(text)
  # Convert left quotes to special character except for 'Twas or 'twas
- text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { Regexp.last_match(1) ? Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
- text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { Regexp.last_match(1) ? Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
- text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
- # Separate right single quotes
- text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
+ replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+ text.gsub!(REGEXP_LEFT_QUOTES1, "\\1 #{replacement} ")
+ text.gsub!(REGEXP_LEFT_QUOTES3, ' ' << replacement)
+ text.gsub!(REGEXP_RIGHT_SIDE_QUOTES, "\\1 #{replacement} ")
+
+ replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘".freeze]
+ text.gsub!(REGEXP_LEFT_QUOTES2, "\\1 #{replacement} ")
+
+ text
  end
+
  end
+
  end
  end
  end
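
A quick check of the 'Twas carve-out in REGEXP_LEFT_QUOTES1 (regex copied from the constant above; the /o flag is dropped here since it only matters with interpolation):

    regex = /(\W|^)'(?=.*\w)(?!twas)(?!Twas)/
    "'Twas the night"[regex] # => nil, left for the "'twas" CONTRACTIONS entry
    "'hello there"[regex]    # => "'", converted to the placeholder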
@@ -7,11 +7,17 @@ module PragmaticTokenizer
  CONTRACTIONS = {}.freeze

  class SingleQuotes
+
+ REGEXP_UNKNOWN1 = /(\w|\D)'(?!')(?=\W|$)/o
+ REGEXP_UNKNOWN2 = /(\W|^)'(?=.*\w)/o
+
  def handle_single_quotes(text)
- text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { Regexp.last_match(1) + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
- text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
- text.gsub!(/l\'/, '\1 l☮ \2') || text
- text.gsub!(/L\'/, '\1 L☮ \2') || text
+ replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+ text.gsub!(REGEXP_UNKNOWN1, "\\1 #{replacement} ")
+ text.gsub!(REGEXP_UNKNOWN2, ' ' << replacement)
+ text.gsub!(/l\'/, '\1 l☮ \2')
+ text.gsub!(/L\'/, '\1 L☮ \2')
+ text
  end
  end
  end
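
One quirk carries over unchanged: /l\'/ and /L\'/ define no capture groups, so the \1 and \2 in those single-quoted replacements expand to empty strings and each l' simply becomes " l☮ ":

    "l'amour".gsub(/l\'/, '\1 l☮ \2') # => " l☮ amour"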
lib/pragmatic_tokenizer/post_processor.rb CHANGED
@@ -20,6 +20,7 @@ module PragmaticTokenizer

  REGEX_UNIFIED2 = Regexp.union(REGEXP_SINGLE_QUOTES,
  REGEXP_COMMAS)
+ REGEXP_UNKNOWN1 = /(?<=\S)([。.!!??]+)$/

  attr_reader :text, :abbreviations, :downcase

@@ -30,17 +31,21 @@ module PragmaticTokenizer
  end

  def post_process
- EndingPunctuationSeparator.new(tokens: method_name3).separate
+ separate_ending_punctuation(method_name3)
  end

  private

  def method_name3
- separated = EndingPunctuationSeparator.new(tokens: full_stop_separated_tokens).separate
+ separated = separate_ending_punctuation(full_stop_separated_tokens)
  procs = [unified1, split_unknown_period1, split_unknown_period2, split_emoji]
  procs.reduce(separated) { |a, e| a.flat_map(&e) }
  end

+ def separate_ending_punctuation(tokens)
+ tokens.flat_map { |token| token.split(REGEXP_UNKNOWN1) }
+ end
+
  def unified1
  proc { |token| token.split(REGEX_UNIFIED1) }
  end
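
The new separate_ending_punctuation relies on String#split keeping captured separators: because REGEXP_UNKNOWN1 wraps the punctuation run in parentheses, the run comes back as an element of its own. A sketch with that regex:

    regex = /(?<=\S)([。.!!??]+)$/
    "Hello!".split(regex)  # => ["Hello", "!"]
    "wait...".split(regex) # => ["wait", "..."]

Note the run stays together ("..." is one element), whereas the removed EndingPunctuationSeparator emitted each mark separately; see the comparison after the deleted file at the end of this diff.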
@@ -91,11 +96,8 @@ module PragmaticTokenizer
  end

  def extract_abbreviation(token)
- if downcase
- token.split(/(\.)/)[0]
- else
- Unicode.downcase(token.split(/(\.)/)[0])
- end
+ before_first_dot = token[0, token.index('.'.freeze)]
+ downcase ? before_first_dot : Unicode.downcase(before_first_dot)
  end

  def convert_sym_to_punct(token)
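
token[0, token.index('.'.freeze)] slices off everything before the first dot, the same prefix the old token.split(/(\.)/)[0] produced, without building the intermediate array:

    "e.g.".split(/(\.)/)[0]      # => "e"
    "e.g."[0, "e.g.".index('.')] # => "e"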
lib/pragmatic_tokenizer/pre_processor.rb CHANGED
@@ -59,15 +59,15 @@ module PragmaticTokenizer
  end

  def shift_horizontal_ellipsis!
- gsub!(/(…+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
+ gsub!(/(…+)/o, ' \1 ')
  end

  def shift_ellipse_two_dots!
- gsub!(/(\.\.+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
+ gsub!(/(\.\.+)/o, ' \1 ')
  end

  def shift_ellipse_three_dots!
- gsub!(/(\.\.\.+)/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
+ gsub!(/(\.\.\.+)/o, ' \1 ')
  end

  def shift_no_space_mention!
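
In a single-quoted replacement string the backslash survives, so gsub! reads \1 as a backreference and ' \1 ' pads the captured run with spaces exactly as the old blocks did:

    "wait… what".gsub(/(…+)/, ' \1 ') # => "wait …  what"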
@@ -98,11 +98,11 @@ module PragmaticTokenizer
  end

  def shift_bracket!
- gsub!(/([\(\[\{\}\]\)])/o) { ' ' + Regexp.last_match(1) + ' '.freeze }
+ gsub!(/([\(\[\{\}\]\)])/o, ' \1 ')
  end

  def shift_semicolon!
- gsub!(/([;])/o) { ' '.freeze + Regexp.last_match(1) + ' '.freeze }
+ gsub!(/([;])/o, ' \1 ')
  end

  def shift_percent!
@@ -138,7 +138,7 @@ module PragmaticTokenizer

  def replace_left_quotes!(style, replacement_key)
  replacement = replacement_for_key(replacement_key)
- gsub!(/#{style}(?=.*\w)/o, ' '.freeze + replacement + ' '.freeze)
+ gsub!(/#{style}(?=.*\w)/o, ' ' << replacement << ' ')
  end

  def replace_remaining_double_quotes!
@@ -149,7 +149,7 @@ module PragmaticTokenizer
  end

  def replace_remaining_quotes!(style, replacement_key)
  replacement = replacement_for_key(replacement_key)
- gsub!(/#{style}/, ' '.freeze + replacement + ' '.freeze)
+ gsub!(/#{style}/, ' ' << replacement << ' ')
  end

  def convert_sgl_quotes!(language)
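
' ' << replacement << ' ' produces the same string as ' '.freeze + replacement + ' '.freeze, but appends onto the fresh ' ' literal in place instead of allocating an intermediate string per +:

    (' ' << 'X' << ' ') == (' ' + 'X' + ' ') # => true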
lib/pragmatic_tokenizer.rb CHANGED
@@ -3,8 +3,8 @@ require 'pragmatic_tokenizer/languages'
  require 'pragmatic_tokenizer/pre_processor'
  require 'pragmatic_tokenizer/post_processor'
  require 'pragmatic_tokenizer/full_stop_separator'
- require 'pragmatic_tokenizer/ending_punctuation_separator'
  require 'unicode'
+ require 'set'

  module PragmaticTokenizer
  class Tokenizer
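
The new require 'set' lines up with the Set-based abbreviation lookups above: Set#include? is a hash lookup rather than Array#include?'s linear scan, which matters when every dotted token is checked against the abbreviation list. Illustrative values:

    require 'set'
    abbreviations = Set.new(%w[mr mrs dr])
    abbreviations.include?('dr') # => true, constant-time on average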
lib/pragmatic_tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
  module PragmaticTokenizer
- VERSION = "3.0.0".freeze
+ VERSION = "3.0.1".freeze
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: pragmatic_tokenizer
  version: !ruby/object:Gem::Version
- version: 3.0.0
+ version: 3.0.1
  platform: ruby
  authors:
  - Kevin S. Dias
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-02-14 00:00:00.000000000 Z
+ date: 2016-02-16 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: unicode
@@ -111,7 +111,6 @@ files:
  - README.md
  - Rakefile
  - lib/pragmatic_tokenizer.rb
- - lib/pragmatic_tokenizer/ending_punctuation_separator.rb
  - lib/pragmatic_tokenizer/full_stop_separator.rb
  - lib/pragmatic_tokenizer/languages.rb
  - lib/pragmatic_tokenizer/languages/arabic.rb
lib/pragmatic_tokenizer/ending_punctuation_separator.rb DELETED
@@ -1,31 +0,0 @@
- # -*- encoding : utf-8 -*-
-
- module PragmaticTokenizer
- # This class separates ending punctuation from a token
- class EndingPunctuationSeparator
- attr_reader :tokens
- def initialize(tokens:)
- @tokens = tokens
- end
-
- def separate
- cleaned_tokens = []
- tokens.each do |a|
- split_punctuation = a.scan(/(?<=\S)[。.!!??]+$/)
- if split_punctuation[0].nil?
- cleaned_tokens << a
- else
- cleaned_tokens << a.tr(split_punctuation[0], '')
- if split_punctuation[0].length.eql?(1)
- cleaned_tokens << split_punctuation[0]
- else
- split_punctuation[0].split("").each do |s|
- cleaned_tokens << s
- end
- end
- end
- end
- cleaned_tokens
- end
- end
- end
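
For reference, the removed class and its replacement differ on runs of trailing punctuation: the old code pushed each mark as a separate token, while the split in PostProcessor#separate_ending_punctuation keeps the run together (unless a later proc splits it again):

    "Really??".scan(/(?<=\S)[。.!!??]+$/)    # => ["??"], old code then emitted "?", "?"
    "Really??".split(/(?<=\S)([。.!!??]+)$/) # => ["Really", "??"]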