pragmatic_tokenizer 3.0.0 → 3.0.1
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions exactly as they appear in the public registry.
- checksums.yaml +4 -4
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +45 -31
- data/lib/pragmatic_tokenizer/languages/common.rb +16 -5
- data/lib/pragmatic_tokenizer/languages/english.rb +18 -5
- data/lib/pragmatic_tokenizer/languages/french.rb +10 -4
- data/lib/pragmatic_tokenizer/post_processor.rb +9 -7
- data/lib/pragmatic_tokenizer/pre_processor.rb +7 -7
- data/lib/pragmatic_tokenizer/tokenizer.rb +1 -1
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +2 -3
- data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +0 -31
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2f5112bf38a65d6cfc437fd9b735a5011479807d
+  data.tar.gz: 3963cf07246508f250bc2391fccf4245f1d275b0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: fc9fe3c9b9c6aca7ca355ac6a1000f973cdc4f1bd94babd27bad78add4b761cc030bbeddac8ad9deea3ab9e0c4849efa918f2808de198c288fecb7d58af0a4df
+  data.tar.gz: 4c28cb848d3ad5b7f29284b2f0a2d63bc9860a4d9c306d5f173af3f2c98f54241e5bf6a190380dd15e431f792860303aceea58460c58032a2579cc07b2df8cba
data/lib/pragmatic_tokenizer/full_stop_separator.rb
CHANGED
@@ -4,46 +4,60 @@ module PragmaticTokenizer
   # This class separates true full stops while ignoring
   # periods that are part of an abbreviation
   class FullStopSeparator
-
+
+    REGEXP_ENDS_WITH_DOT = /\A(.+)\.\z/
+    REGEXP_ONLY_LETTERS = /\A[a-z]\z/i
+    REGEXP_UNKNOWN1 = /[a-z](?:\.[a-z])+\z/i
+    REGEXP_UNKNOWN2 = /\A(.*\w)\.\z/
+    DOT = '.'.freeze
+
     def initialize(tokens:, abbreviations:, downcase:)
-      @tokens
+      @tokens = tokens
       @abbreviations = abbreviations
-      @downcase
+      @downcase = downcase
     end
 
     def separate
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      cleaned_tokens << '.'
-      next
+      create_cleaned_tokens
+      replace_last_token unless @cleaned_tokens.empty?
+      @cleaned_tokens
+    end
+
+    private
+
+    def create_cleaned_tokens
+      @cleaned_tokens = []
+      @tokens.each_with_index do |token, position|
+        if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
+          match = Regexp.last_match(1)
+          if unknown_method1(match)
+            @cleaned_tokens += [match, DOT]
+            next
+          end
         end
+        @cleaned_tokens << token
       end
-      cleaned_tokens << tokens[i]
     end
-
-
-
-
+
+    def unknown_method1(token)
+      !abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_UNKNOWN1
+    end
+
+    def abbreviation?(token)
+      @abbreviations.include?(inverse_case(token))
     end
-
-
-
+
+    def inverse_case(token)
+      @downcase ? token : Unicode.downcase(token)
     end
-
-
+
+    def replace_last_token
+      last_token = @cleaned_tokens[-1]
+      return if abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_UNKNOWN2
+      @cleaned_tokens[-1] = Regexp.last_match(1)
+      @cleaned_tokens << DOT
+    end
+
   end
+
 end
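To make the refactored control flow concrete, here is a minimal usage sketch. The token list, abbreviation list, and expected output are illustrative, not fixtures from the gem:

require 'pragmatic_tokenizer'

# "Mr." survives because abbreviation?("Mr") downcases it and finds "mr"
# in the abbreviation list; replace_last_token then splits the true full
# stop off the final token.
separator = PragmaticTokenizer::FullStopSeparator.new(
  tokens:        ["Mr.", "Smith", "arrived."],
  abbreviations: ["mr"],
  downcase:      false
)
separator.separate # => ["Mr.", "Smith", "arrived", "."]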
data/lib/pragmatic_tokenizer/languages/common.rb
CHANGED
@@ -15,14 +15,25 @@ module PragmaticTokenizer
       EMOTICON_REGEX = /(?::|;|=)(?:-)?(?:\)|D|P)/
 
       class SingleQuotes
+
+        REGEXP_LEFT_QUOTES1 = /(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o
+        REGEXP_LEFT_QUOTES2 = /(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o
+        REGEXP_LEFT_QUOTES3 = /(\W|^)'(?=.*\w)/o
+        REGEXP_RIGHT_SIDE_QUOTES = /(\w|\D)'(?!')(?=\W|$)/o
+
         def handle_single_quotes(text)
           # Convert left quotes to special character except for 'Twas or 'twas
-
-          text.gsub!(
-          text.gsub!(
-
-
+          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+          text.gsub!(REGEXP_LEFT_QUOTES1, "\\1 #{replacement} ")
+          text.gsub!(REGEXP_LEFT_QUOTES3, ' ' << replacement)
+          text.gsub!(REGEXP_RIGHT_SIDE_QUOTES, "\\1 #{replacement} ")
+
+          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘".freeze]
+          text.gsub!(REGEXP_LEFT_QUOTES2, "\\1 #{replacement} ")
+
+          text
         end
+
       end
     end
   end
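The rewrite appears to extract the previously inline regexes into named constants; handle_single_quotes still swaps each single quote for a space-padded placeholder drawn from PUNCTUATION_MAP so that a later whitespace split isolates it. A rough sketch (the input is illustrative, and the ☮ glyph for PUNCTUATION_MAP["'"] is inferred from the french.rb hunk below):

quotes = PragmaticTokenizer::Languages::Common::SingleQuotes.new
quotes.handle_single_quotes("she said 'hello there' and left".dup)
# => roughly "she said  ☮ hello there ☮  and left"; both quote marks
#    are now stand-alone placeholder tokens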
data/lib/pragmatic_tokenizer/languages/english.rb
CHANGED
@@ -95,16 +95,29 @@ module PragmaticTokenizer
         "will-o'-the-wisp" => "will-of-the-wisp",
         "'twas" => "it was"
       }.freeze
+
       class SingleQuotes
+
+        REGEXP_LEFT_QUOTES1 = /(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o
+        REGEXP_LEFT_QUOTES2 = /(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o
+        REGEXP_LEFT_QUOTES3 = /(\W|^)'(?=.*\w)/o
+        REGEXP_RIGHT_SIDE_QUOTES = /(\w|\D)'(?!')(?=\W|$)/o
+
         def handle_single_quotes(text)
           # Convert left quotes to special character except for 'Twas or 'twas
-
-          text.gsub!(
-          text.gsub!(
-
-
+          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+          text.gsub!(REGEXP_LEFT_QUOTES1, "\\1 #{replacement} ")
+          text.gsub!(REGEXP_LEFT_QUOTES3, ' ' << replacement)
+          text.gsub!(REGEXP_RIGHT_SIDE_QUOTES, "\\1 #{replacement} ")
+
+          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘".freeze]
+          text.gsub!(REGEXP_LEFT_QUOTES2, "\\1 #{replacement} ")
+
+          text
         end
+
       end
+
     end
   end
 end
data/lib/pragmatic_tokenizer/languages/french.rb
CHANGED
@@ -7,11 +7,17 @@ module PragmaticTokenizer
       CONTRACTIONS = {}.freeze
 
       class SingleQuotes
+
+        REGEXP_UNKNOWN1 = /(\w|\D)'(?!')(?=\W|$)/o
+        REGEXP_UNKNOWN2 = /(\W|^)'(?=.*\w)/o
+
         def handle_single_quotes(text)
-
-          text.gsub!(
-          text.gsub!(
-          text.gsub!(/
+          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'".freeze]
+          text.gsub!(REGEXP_UNKNOWN1, "\\1 #{replacement} ")
+          text.gsub!(REGEXP_UNKNOWN2, ' ' << replacement)
+          text.gsub!(/l\'/, '\1 l☮ \2')
+          text.gsub!(/L\'/, '\1 L☮ \2')
+          text
        end
      end
    end
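One quirk worth noting: /l\'/ and /L\'/ define no capture groups, so the \1 and \2 backreferences in their replacement strings expand to empty strings, and only the space-padded l☮ / L☮ survives. A quick check in plain Ruby:

"l'avion".gsub(/l\'/, '\1 l☮ \2') # => " l☮ avion"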
data/lib/pragmatic_tokenizer/post_processor.rb
CHANGED
@@ -20,6 +20,7 @@ module PragmaticTokenizer
 
     REGEX_UNIFIED2 = Regexp.union(REGEXP_SINGLE_QUOTES,
                                   REGEXP_COMMAS)
+    REGEXP_UNKNOWN1 = /(?<=\S)([。.!!??]+)$/
 
     attr_reader :text, :abbreviations, :downcase
 
@@ -30,17 +31,21 @@ module PragmaticTokenizer
     end
 
     def post_process
-
+      separate_ending_punctuation(method_name3)
     end
 
     private
 
     def method_name3
-      separated =
+      separated = separate_ending_punctuation(full_stop_separated_tokens)
       procs = [unified1, split_unknown_period1, split_unknown_period2, split_emoji]
       procs.reduce(separated) { |a, e| a.flat_map(&e) }
     end
 
+    def separate_ending_punctuation(tokens)
+      tokens.flat_map { |token| token.split(REGEXP_UNKNOWN1) }
+    end
+
     def unified1
       proc { |token| token.split(REGEX_UNIFIED1) }
     end
@@ -91,11 +96,8 @@ module PragmaticTokenizer
     end
 
     def extract_abbreviation(token)
-
-
-      else
-        Unicode.downcase(token.split(/(\.)/)[0])
-      end
+      before_first_dot = token[0, token.index('.'.freeze)]
+      downcase ? before_first_dot : Unicode.downcase(before_first_dot)
     end
 
     def convert_sym_to_punct(token)
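The new REGEXP_UNKNOWN1 is the same ending-punctuation pattern that the now-deleted EndingPunctuationSeparator scanned for (see the removed file at the end of this diff), so that class's job appears to have been folded into separate_ending_punctuation. Because the pattern keeps a capture group, String#split returns each punctuation run as its own token. An illustrative check:

pattern = /(?<=\S)([。.!!??]+)$/

"Really??".split(pattern)                          # => ["Really", "??"]
["Hi!", "there"].flat_map { |t| t.split(pattern) } # => ["Hi", "!", "there"]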
data/lib/pragmatic_tokenizer/pre_processor.rb
CHANGED
@@ -59,15 +59,15 @@ module PragmaticTokenizer
     end
 
     def shift_horizontal_ellipsis!
-      gsub!(/(…+)/o
+      gsub!(/(…+)/o, ' \1 ')
     end
 
     def shift_ellipse_two_dots!
-      gsub!(/(\.\.+)/o
+      gsub!(/(\.\.+)/o, ' \1 ')
     end
 
     def shift_ellipse_three_dots!
-      gsub!(/(\.\.\.+)/o
+      gsub!(/(\.\.\.+)/o, ' \1 ')
     end
 
     def shift_no_space_mention!
@@ -98,11 +98,11 @@ module PragmaticTokenizer
     end
 
     def shift_bracket!
-      gsub!(/([\(\[\{\}\]\)])/o
+      gsub!(/([\(\[\{\}\]\)])/o, ' \1 ')
     end
 
     def shift_semicolon!
-      gsub!(/([;])/o
+      gsub!(/([;])/o, ' \1 ')
     end
 
     def shift_percent!
@@ -138,7 +138,7 @@ module PragmaticTokenizer
 
     def replace_left_quotes!(style, replacement_key)
       replacement = replacement_for_key(replacement_key)
-      gsub!(/#{style}(?=.*\w)/o, ' '
+      gsub!(/#{style}(?=.*\w)/o, ' ' << replacement << ' ')
     end
 
     def replace_remaining_double_quotes!
@@ -149,7 +149,7 @@ module PragmaticTokenizer
 
     def replace_remaining_quotes!(style, replacement_key)
       replacement = replacement_for_key(replacement_key)
-      gsub!(/#{style}/, ' '
+      gsub!(/#{style}/, ' ' << replacement << ' ')
     end
 
     def convert_sgl_quotes!(language)
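All of these helpers share one idiom: the replacement ' \1 ' pads whatever the regex captures with spaces, so a later whitespace split turns the punctuation into a stand-alone token. A self-contained illustration (not from the gem's test suite):

padded = "wait...what".gsub(/(\.\.\.+)/, ' \1 ')
padded       # => "wait ... what"
padded.split # => ["wait", "...", "what"]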
data/lib/pragmatic_tokenizer/tokenizer.rb
CHANGED
@@ -3,8 +3,8 @@ require 'pragmatic_tokenizer/languages'
 require 'pragmatic_tokenizer/pre_processor'
 require 'pragmatic_tokenizer/post_processor'
 require 'pragmatic_tokenizer/full_stop_separator'
-require 'pragmatic_tokenizer/ending_punctuation_separator'
 require 'unicode'
+require 'set'
 
 module PragmaticTokenizer
   class Tokenizer
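The require swap drops the deleted EndingPunctuationSeparator and pulls in Ruby's stdlib set. A plausible reading, inferred from this diff rather than a changelog, is that abbreviation lists are now held as Sets, making the include? call in FullStopSeparator#abbreviation? a constant-time hash probe instead of an array scan:

require 'set'

abbreviations = Set.new(%w[mr mrs dr])
abbreviations.include?('dr') # => true, in O(1) rather than O(n)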
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.0
+  version: 3.0.1
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-02-
+date: 2016-02-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -111,7 +111,6 @@ files:
 - README.md
 - Rakefile
 - lib/pragmatic_tokenizer.rb
-- lib/pragmatic_tokenizer/ending_punctuation_separator.rb
 - lib/pragmatic_tokenizer/full_stop_separator.rb
 - lib/pragmatic_tokenizer/languages.rb
 - lib/pragmatic_tokenizer/languages/arabic.rb
data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb
DELETED
@@ -1,31 +0,0 @@
-# -*- encoding : utf-8 -*-
-
-module PragmaticTokenizer
-  # This class separates ending punctuation from a token
-  class EndingPunctuationSeparator
-    attr_reader :tokens
-    def initialize(tokens:)
-      @tokens = tokens
-    end
-
-    def separate
-      cleaned_tokens = []
-      tokens.each do |a|
-        split_punctuation = a.scan(/(?<=\S)[。.!!??]+$/)
-        if split_punctuation[0].nil?
-          cleaned_tokens << a
-        else
-          cleaned_tokens << a.tr(split_punctuation[0], '')
-          if split_punctuation[0].length.eql?(1)
-            cleaned_tokens << split_punctuation[0]
-          else
-            split_punctuation[0].split("").each do |s|
-              cleaned_tokens << s
-            end
-          end
-        end
-      end
-      cleaned_tokens
-    end
-  end
-end