RubyGems - pragmatic_tokenizer - Versions diffs - 3.0.5 → 3.0.6 - Mend

pragmatic_tokenizer 3.0.5 → 3.0.6

Files changed (6)

checksums.yaml +4 -4
data/lib/pragmatic_tokenizer/full_stop_separator.rb +10 -14
data/lib/pragmatic_tokenizer/post_processor.rb +53 -47
data/lib/pragmatic_tokenizer/tokenizer.rb +2 -2
data/lib/pragmatic_tokenizer/version.rb +1 -1
metadata +3 -3

checksums.yaml (changed)

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 3992076b1304fc76da055925e851e5d61b27dea6
-  data.tar.gz: ab52d479ad9f83018e18fa6c8966cd6213813646
+  metadata.gz: 746fe8bd11bb0bd75cd7553a7f52d37810a3962f
+  data.tar.gz: 2ec1b073ec014f15a7820297cfdaa46457b94130
 SHA512:
-  metadata.gz: 325bba401a3cc218aa984e88828775a1718d11b8f6170d950563cdf90ef5f3a5755feaaaa6760a37a8c29fa63002c36ea48b530fb601c91ea953197e93fc7159
-  data.tar.gz: af2d68f841b70444ce90d5ad00b4d0cb0d33ce1d72d254d4f4cecdc10e11bab5954e8757757663097e53e29c6f143039e5d00764412e78870c933e6e784157d5
+  metadata.gz: e0a368fe63c7fd4b6f2d0f5636abd922797ff0cc84cd41fdd728803f245d5380e746a3dba02daa585dbe26c7ac84f11f94ac18cdec928dbfe8560a1a45c833d9
+  data.tar.gz: 178b2cc47e431cbc6c11ddd4fecd55394dc5498cd98651c4a632f1c923b2fd2ca73ed71c65353f9d33662141b51b51b5e4d03db51dade1876d5de1c16781359f

data/lib/pragmatic_tokenizer/full_stop_separator.rb (changed)

@@ -17,7 +17,7 @@ module PragmaticTokenizer
     end
     def separate
-      create_cleaned_tokens
+      @cleaned_tokens = create_cleaned_tokens
       replace_last_token unless @cleaned_tokens.empty?
       @cleaned_tokens
     end
@@ -25,21 +25,15 @@ module PragmaticTokenizer
     private
       def create_cleaned_tokens
-        @cleaned_tokens = []
-        @tokens.each_with_index do |token, position|
-          if @tokens[position + 1] && token =~ REGEXP_ENDS_WITH_DOT
-            match = Regexp.last_match(1)
-            if abbreviation?(match)
-              @cleaned_tokens += [match, DOT]
-              next
-            end
-          end
-          @cleaned_tokens << token
-        end
+        @tokens[0..-2]
+            .flat_map { |token| abbreviation?(token) ? [token[0..-2], DOT] : token }
+            .push(@tokens.last)
       end
       def abbreviation?(token)
-        !defined_abbreviation?(token) && token !~ REGEXP_ONLY_LETTERS && token !~ REGEXP_ABBREVIATION
+        return false unless token.end_with?(DOT) && token.length > 1
+        shortened = token.chomp(DOT)
+        !defined_abbreviation?(shortened) && shortened !~ REGEXP_ONLY_LETTERS && shortened !~ REGEXP_ABBREVIATION
       end
       def defined_abbreviation?(token)
@@ -52,7 +46,9 @@ module PragmaticTokenizer
       def replace_last_token
         last_token = @cleaned_tokens[-1]
-        return if defined_abbreviation?(last_token.chomp(DOT)) || last_token !~ REGEXP_ENDS_WITH_DOT
+        return unless last_token.end_with?(DOT) && last_token.length > 1
+        shortened = last_token.chomp(DOT)
+        return if defined_abbreviation?(shortened) || last_token !~ REGEXP_ENDS_WITH_DOT
         @cleaned_tokens[-1] = Regexp.last_match(1)
         @cleaned_tokens << DOT
       end

data/lib/pragmatic_tokenizer/post_processor.rb (changed)

@@ -1,26 +1,43 @@
 module PragmaticTokenizer
   class PostProcessor
-    REGEX_SYMBOL         = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/
-    REGEXP_COMMAS        = /^(,|‚)+/
-    REGEXP_SINGLE_QUOTES = /(.+)(’|'|‘|`)$/
-    REGEXP_SLASH         = /^(?!(https?:|www\.))(.*)\/(.*)/
-    REGEXP_QUESTION_MARK = /^(?!(https?:|www\.))(.*)(\?)(.*)/
+    DOT                       = '.'.freeze
+    RANGE_DINGBATS            = '[\u2701-\u27BE]'.freeze # e.g. ✁✎✳❄➾
+    RANGE_VARIATION_SELECTORS = '[\uFE00-\uFE0F]'.freeze # alter the previous character
+    RANGE_FULLWIDTH           = '[\uFF01-\ufF1F]'.freeze # e.g. ！＂＃＇？
+    REGEXP_COMMAS        = /^([,‚])+/
+    REGEXP_SINGLE_QUOTES = /(.+)([’'‘`])$/
+    REGEXP_SLASH         = /^(?!(https?:|www\.))(.*)\//
+    REGEXP_QUESTION_MARK = /^(?!(https?:|www\.))(.*)(\?)/
     REGEXP_PLUS_SIGN     = /(.+)\+(.+)/
-    REGEXP_COLON         = /^(\:)(\S{2,})/
-    REGEXP_EMOJI         = /(\u{2744}[\u{FE0E}|\u{FE0F}])/
+    REGEXP_COLON         = /^(:)(\S{2,})/
+    REGEXP_DINGBATS      = /(#{RANGE_DINGBATS}#{RANGE_VARIATION_SELECTORS}*)/
+    REGEXP_ENDING_PUNCT  = /(?<=\S)([#{RANGE_FULLWIDTH}!?]+)$/
+    REGEXP_DOMAIN        = /^((https?:\/\/|)?[a-z0-9]+([\-\.][a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?)$/ix
+    REGEXP_EMAIL         = /\S+[＠@]\S+/
+    REGEXP_DOMAIN_START  = /^(https?:|www\.|[[:alpha:]]\.)/
+    REGEXP_DOMAIN_END    = /\.(com|net|org|edu|gov|mil|int|[[:alpha:]]{2})$/
+    REGEXP_DIGIT         = /[[:digit:]]+/
+    REGEXP_PERIOD1       = /(.*\.)/
+    REGEXP_PERIOD2       = /(\.)/
     REGEX_UNIFIED1       = Regexp.union(REGEXP_SLASH,
                                         REGEXP_QUESTION_MARK,
                                         REGEXP_PLUS_SIGN,
                                         REGEXP_COLON,
-                                        REGEXP_EMOJI,
+                                        REGEXP_DINGBATS,
                                         PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX,
                                         PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX)
     REGEX_UNIFIED2       = Regexp.union(REGEXP_SINGLE_QUOTES,
                                         REGEXP_COMMAS)
-    REGEXP_UNKNOWN1      = /(?<=\S)([。．！!?？]+)$/
+    REGEX_DOMAIN_EMAIL   = Regexp.union(REGEXP_DOMAIN,
+                                        REGEXP_EMAIL)
+    REGEX_DOMAIN         = Regexp.union(REGEXP_DOMAIN_START,
+                                        REGEXP_DOMAIN_END)
     attr_reader :text, :abbreviations, :downcase
@@ -31,19 +48,24 @@ module PragmaticTokenizer
     end
     def post_process
-      separate_ending_punctuation(post_process_punctuation)
+      procs.reduce(full_stop_separated_tokens) { |a, e| a.flat_map(&e) }
     end
     private
-      def post_process_punctuation
-        separated = separate_ending_punctuation(full_stop_separated_tokens)
-        procs     = [unified1, split_unknown_period1, split_unknown_period2, split_emoji]
-        procs.reduce(separated) { |a, e| a.flat_map(&e) }
+      # note: we need to run #separate_ending_punctuation twice. maybe there's a better solution?
+      def procs
+        [
+            separate_ending_punctuation,
+            unified1,
+            split_unknown_period1,
+            split_unknown_period2,
+            separate_ending_punctuation
+        ]
       end
-      def separate_ending_punctuation(tokens)
-        tokens.flat_map { |token| token.split(REGEXP_UNKNOWN1) }
+      def separate_ending_punctuation
+        proc { |token| token.split(REGEXP_ENDING_PUNCT) }
       end
       def unified1
@@ -51,64 +73,48 @@ module PragmaticTokenizer
       end
       def full_stop_separated_tokens
-        FullStopSeparator.new(tokens: split_and_convert_commas_and_quotes, abbreviations: abbreviations, downcase: downcase).separate
+        FullStopSeparator.new(tokens: split_convert_commas_quotes, abbreviations: abbreviations, downcase: downcase).separate
       end
-      def split_and_convert_commas_and_quotes
+      def split_convert_commas_quotes
         text
             .split
             .flat_map { |token| token.split(REGEX_UNIFIED2) }
             .flat_map { |token| convert_sym_to_punct(token) }
       end
-      def split_emoji
-        proc { |token| (token =~ /(\A|\S)\u{2744}[^\u{FE0E}|\u{FE0F}]/) ? token.split(/(\u{2744})/) : token }
-      end
       def split_unknown_period1
-        proc { |token| unknown_period1?(token) ? token.split(/(.*\.)/) : token }
+        proc { |token| unknown_period1?(token) ? token.split(REGEXP_PERIOD1) : token }
       end
       def split_unknown_period2
-        proc { |token| unknown_period2?(token) ? token.split(/(\.)/) : token }
+        proc { |token| unknown_period2?(token) ? token.split(REGEXP_PERIOD2) : token }
       end
       def unknown_period1?(token)
-        token.include?(".") &&
-            token !~ /(http|https|www)(\.|:)/ &&
+        token.include?(DOT) &&
             token.length > 1 &&
-            token !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
-            token !~ /\S+(＠|@)\S+/ &&
+            token !~ REGEX_DOMAIN_EMAIL &&
             abbreviations.include?(extract_abbreviation(token))
       end
       def unknown_period2?(token)
-        token.include?(".") &&
-            token !~ /(http|https|www)(\.|:)/ &&
-            token !~ /\.(com|net|org|edu|gov|mil|int)/ &&
-            token !~ /\.[a-zA-Z]{2}(\s|\z)/ &&
-            token.length > 2 &&
-            token !~ /\A[a-zA-Z]{1}\./ &&
-            token.count(".") == 1 &&
-            token !~ /\d+/ &&
-            !abbreviations.include?(extract_abbreviation(token)) &&
-            token !~ /\S+(＠|@)\S+/
+        token.include?(DOT) &&
+            token !~ REGEX_DOMAIN &&
+            token !~ REGEXP_DIGIT &&
+            token.count(DOT) == 1 &&
+            !abbreviations.include?(extract_abbreviation(token))
       end
       def extract_abbreviation(token)
-        before_first_dot = token[0, token.index('.'.freeze)]
+        before_first_dot = token[0, token.index(DOT)]
         downcase ? before_first_dot : Unicode.downcase(before_first_dot)
       end
       def convert_sym_to_punct(token)
-        symbol_matches = REGEX_SYMBOL.match(token)
-        if symbol_matches.nil?
-          token
-        else
-          pattern     = symbol_matches[0]
-          replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(pattern)
-          token.gsub!(pattern, replacement)
-        end
+        PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
+            .each { |pattern, replacement| break if token.sub!(replacement, pattern) }
+        token
       end
   end

data/lib/pragmatic_tokenizer/tokenizer.rb (changed)

@@ -64,7 +64,7 @@ module PragmaticTokenizer
     REGEXP_NO_NUMBERS          = /\A\D+\z/
     REGEXP_NUMBER              = /\D*\d+\d*/
     REGEXP_CONSECUTIVE_DOTS    = /\A\.{2,}\z/
-    REGEXP_CHUNK_STRING        = /.{,10000}(?=\s|\z)/m
+    REGEXP_CHUNK_STRING        = /\S.{1,10000}(?!\S)/m
     # @param [Hash] opts optional arguments
@@ -150,7 +150,7 @@ module PragmaticTokenizer
     def tokenize(text)
       return [] unless text
-      raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
+      raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
       CGI.unescapeHTML(text)
           .scan(REGEXP_CHUNK_STRING)
           .flat_map { |segment| post_process(pre_process(segment)) }

data/lib/pragmatic_tokenizer/version.rb (changed)

@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "3.0.5".freeze
+  VERSION = "3.0.6".freeze
 end

metadata (changed)

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.5
+  version: 3.0.6
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-09-19 00:00:00.000000000 Z
+date: 2018-03-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -169,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.6.12
+rubygems_version: 2.6.14
 signing_key:
 specification_version: 4
 summary: A multilingual tokenizer