pragmatic_tokenizer 3.0.5 → 3.0.6
This diff shows the changes between package versions as they appear in their respective public registries. It covers only publicly available releases and is provided for informational purposes.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 746fe8bd11bb0bd75cd7553a7f52d37810a3962f
+  data.tar.gz: 2ec1b073ec014f15a7820297cfdaa46457b94130
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e0a368fe63c7fd4b6f2d0f5636abd922797ff0cc84cd41fdd728803f245d5380e746a3dba02daa585dbe26c7ac84f11f94ac18cdec928dbfe8560a1a45c833d9
+  data.tar.gz: 178b2cc47e431cbc6c11ddd4fecd55394dc5498cd98651c4a632f1c923b2fd2ca73ed71c65353f9d33662141b51b51b5e4d03db51dade1876d5de1c16781359f
lib/pragmatic_tokenizer/full_stop_separator.rb
CHANGED
@@ -17,7 +17,7 @@ module PragmaticTokenizer
     end
 
     def separate
-      create_cleaned_tokens
+      @cleaned_tokens = create_cleaned_tokens
       replace_last_token unless @cleaned_tokens.empty?
       @cleaned_tokens
     end
@@ -25,21 +25,15 @@ module PragmaticTokenizer
     private
 
     def create_cleaned_tokens
-      @
-
-
-          match = Regexp.last_match(1)
-          if abbreviation?(match)
-            @cleaned_tokens += [match, DOT]
-            next
-          end
-        end
-        @cleaned_tokens << token
-      end
+      @tokens[0..-2]
+          .flat_map { |token| abbreviation?(token) ? [token[0..-2], DOT] : token }
+          .push(@tokens.last)
     end
 
     def abbreviation?(token)
-
+      return false unless token.end_with?(DOT) && token.length > 1
+      shortened = token.chomp(DOT)
+      !defined_abbreviation?(shortened) && shortened !~ REGEXP_ONLY_LETTERS && shortened !~ REGEXP_ABBREVIATION
     end
 
     def defined_abbreviation?(token)
@@ -52,7 +46,9 @@ module PragmaticTokenizer
 
     def replace_last_token
       last_token = @cleaned_tokens[-1]
-      return
+      return unless last_token.end_with?(DOT) && last_token.length > 1
+      shortened = last_token.chomp(DOT)
+      return if defined_abbreviation?(shortened) || last_token !~ REGEXP_ENDS_WITH_DOT
       @cleaned_tokens[-1] = Regexp.last_match(1)
       @cleaned_tokens << DOT
     end
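The rewritten create_cleaned_tokens replaces the old each/next loop with a single flat_map pipeline: every token except the last either passes through unchanged or is split into the bare token plus a trailing DOT, while the final token is appended untouched for replace_last_token to deal with afterwards. A minimal standalone sketch of that pattern (split_dot? and the two-entry abbreviation list are illustrative stand-ins, not the gem's real abbreviation? logic):

DOT = '.'.freeze
KNOWN_ABBREVIATIONS = %w[dr etc].freeze # hypothetical stand-in for the gem's lookup

# Peel the final dot off a token unless it looks like a known abbreviation.
def split_dot?(token)
  token.end_with?(DOT) && token.length > 1 &&
    !KNOWN_ABBREVIATIONS.include?(token.chomp(DOT).downcase)
end

tokens  = %w[Meet Dr. Smith today. Bring snacks.]
cleaned = tokens[0..-2]
          .flat_map { |token| split_dot?(token) ? [token[0..-2], DOT] : token }
          .push(tokens.last)
p cleaned # => ["Meet", "Dr.", "Smith", "today", ".", "Bring", "snacks."]

Keeping the last token out of the loop is deliberate: its trailing dot may be a genuine full stop, which replace_last_token decides separately using defined_abbreviation? and the REGEXP_ENDS_WITH_DOT check shown above.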
lib/pragmatic_tokenizer/post_processor.rb
CHANGED
@@ -1,26 +1,43 @@
 module PragmaticTokenizer
   class PostProcessor
 
-
-
-
-
-
+    DOT                       = '.'.freeze
+    RANGE_DINGBATS            = '[\u2701-\u27BE]'.freeze # e.g. ✁✎✳❄➾
+    RANGE_VARIATION_SELECTORS = '[\uFE00-\uFE0F]'.freeze # alter the previous character
+    RANGE_FULLWIDTH           = '[\uFF01-\uFF1F]'.freeze # e.g. ！＂＃＇？
+
+    REGEXP_COMMAS             = /^([,‚])+/
+    REGEXP_SINGLE_QUOTES      = /(.+)([’'‘`])$/
+    REGEXP_SLASH              = /^(?!(https?:|www\.))(.*)\//
+    REGEXP_QUESTION_MARK      = /^(?!(https?:|www\.))(.*)(\?)/
     REGEXP_PLUS_SIGN          = /(.+)\+(.+)/
-    REGEXP_COLON              = /^(
-
+    REGEXP_COLON              = /^(:)(\S{2,})/
+    REGEXP_DINGBATS           = /(#{RANGE_DINGBATS}#{RANGE_VARIATION_SELECTORS}*)/
+    REGEXP_ENDING_PUNCT       = /(?<=\S)([#{RANGE_FULLWIDTH}!?]+)$/
+    REGEXP_DOMAIN             = /^((https?:\/\/|)?[a-z0-9]+([\-\.][a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?)$/ix
+    REGEXP_EMAIL              = /\S+[@＠]\S+/
+    REGEXP_DOMAIN_START       = /^(https?:|www\.|[[:alpha:]]\.)/
+    REGEXP_DOMAIN_END         = /\.(com|net|org|edu|gov|mil|int|[[:alpha:]]{2})$/
+    REGEXP_DIGIT              = /[[:digit:]]+/
+    REGEXP_PERIOD1            = /(.*\.)/
+    REGEXP_PERIOD2            = /(\.)/
 
     REGEX_UNIFIED1 = Regexp.union(REGEXP_SLASH,
                                   REGEXP_QUESTION_MARK,
                                   REGEXP_PLUS_SIGN,
                                   REGEXP_COLON,
-
+                                  REGEXP_DINGBATS,
                                   PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX,
                                   PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX)
 
     REGEX_UNIFIED2 = Regexp.union(REGEXP_SINGLE_QUOTES,
                                   REGEXP_COMMAS)
-
+
+    REGEX_DOMAIN_EMAIL = Regexp.union(REGEXP_DOMAIN,
+                                      REGEXP_EMAIL)
+
+    REGEX_DOMAIN = Regexp.union(REGEXP_DOMAIN_START,
+                                REGEXP_DOMAIN_END)
 
     attr_reader :text, :abbreviations, :downcase
 
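The new constants hoist every inline pattern to the top of the class, and Regexp.union folds related ones into a single pattern so each token needs only one match attempt per group. A small sketch of the combined domain-or-email check (the two patterns are copied from the diff; the character class in REGEXP_EMAIL pairs the ASCII @ with its fullwidth counterpart ＠, and the sample strings below are arbitrary):

REGEXP_DOMAIN = /^((https?:\/\/|)?[a-z0-9]+([\-\.][a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?)$/ix
REGEXP_EMAIL  = /\S+[@＠]\S+/
REGEX_DOMAIN_EMAIL = Regexp.union(REGEXP_DOMAIN, REGEXP_EMAIL)

p 'example.com' =~ REGEX_DOMAIN_EMAIL # => 0 (matches as a domain)
p 'a@b.org'     =~ REGEX_DOMAIN_EMAIL # => 0 (matches as an email)
p 'etc.'        =~ REGEX_DOMAIN_EMAIL # => nil (neither, so splitting may proceed)

In unknown_period1? further down, token !~ REGEX_DOMAIN_EMAIL is what lets URL- and address-like tokens keep their dots.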
@@ -31,19 +48,24 @@ module PragmaticTokenizer
     end
 
     def post_process
-
+      procs.reduce(full_stop_separated_tokens) { |a, e| a.flat_map(&e) }
     end
 
     private
 
-
-
-
-
+    # note: we need to run #separate_ending_punctuation twice. maybe there's a better solution?
+    def procs
+      [
+        separate_ending_punctuation,
+        unified1,
+        split_unknown_period1,
+        split_unknown_period2,
+        separate_ending_punctuation
+      ]
     end
 
-    def separate_ending_punctuation
-
+    def separate_ending_punctuation
+      proc { |token| token.split(REGEXP_ENDING_PUNCT) }
     end
 
     def unified1
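The new #post_process threads the token list through each proc in turn; because flat_map flattens exactly one level, a step may return either a single token or an array of replacement tokens. A self-contained sketch of the reduce/flat_map composition (the two sample procs are simplified stand-ins, not the gem's actual steps):

# Each step maps one token to a token or to an array of tokens.
procs = [
  proc { |token| token.split(/(!+)$/) },                          # peel trailing "!" runs
  proc { |token| token.include?('+') ? token.split('+') : token } # break "a+b" pairs
]

tokens = ['really+fun', 'wow!!']
result = procs.reduce(tokens) { |a, e| a.flat_map(&e) }
p result # => ["really", "fun", "wow", "!!"]

separate_ending_punctuation is registered both first and last, presumably because intermediate splits can expose fresh trailing punctuation; the comment in the diff flags the double pass as a known wart.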
@@ -51,64 +73,48 @@ module PragmaticTokenizer
     end
 
     def full_stop_separated_tokens
-      FullStopSeparator.new(tokens:
+      FullStopSeparator.new(tokens: split_convert_commas_quotes, abbreviations: abbreviations, downcase: downcase).separate
     end
 
-    def
+    def split_convert_commas_quotes
       text
           .split
           .flat_map { |token| token.split(REGEX_UNIFIED2) }
           .flat_map { |token| convert_sym_to_punct(token) }
     end
 
-    def split_emoji
-      proc { |token| (token =~ /(\A|\S)\u{2744}[^\u{FE0E}|\u{FE0F}]/) ? token.split(/(\u{2744})/) : token }
-    end
-
     def split_unknown_period1
-      proc { |token| unknown_period1?(token) ? token.split(
+      proc { |token| unknown_period1?(token) ? token.split(REGEXP_PERIOD1) : token }
     end
 
     def split_unknown_period2
-      proc { |token| unknown_period2?(token) ? token.split(
+      proc { |token| unknown_period2?(token) ? token.split(REGEXP_PERIOD2) : token }
     end
 
     def unknown_period1?(token)
-      token.include?(
-        token !~ /(http|https|www)(\.|:)/ &&
+      token.include?(DOT) &&
         token.length > 1 &&
-        token !~
-        token !~ /\S+(@|＠)\S+/ &&
+        token !~ REGEX_DOMAIN_EMAIL &&
         abbreviations.include?(extract_abbreviation(token))
     end
 
     def unknown_period2?(token)
-      token.include?(
-      token !~
-      token !~
-      token
-      token
-        token !~ /\A[a-zA-Z]{1}\./ &&
-        token.count(".") == 1 &&
-        token !~ /\d+/ &&
-        !abbreviations.include?(extract_abbreviation(token)) &&
-        token !~ /\S+(@|＠)\S+/
+      token.include?(DOT) &&
+        token !~ REGEX_DOMAIN &&
+        token !~ REGEXP_DIGIT &&
+        token.count(DOT) == 1 &&
+        !abbreviations.include?(extract_abbreviation(token))
     end
 
     def extract_abbreviation(token)
-      before_first_dot = token[0, token.index(
+      before_first_dot = token[0, token.index(DOT)]
       downcase ? before_first_dot : Unicode.downcase(before_first_dot)
     end
 
     def convert_sym_to_punct(token)
-
-
-
-      else
-        pattern     = symbol_matches[0]
-        replacement = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(pattern)
-        token.gsub!(pattern, replacement)
-      end
+      PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
+          .each { |pattern, replacement| break if token.sub!(replacement, pattern) }
+      token
     end
 
   end
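The rewritten convert_sym_to_punct leans on String#sub! returning nil when no substitution happened: the each loop over the punctuation map stops at the first placeholder that is actually restored. A sketch with an invented two-entry map (the real one is PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP):

# Hypothetical map in the same shape: punctuation mark => placeholder symbol.
PUNCTUATION_MAP = { '.' => '&dot;', ',' => '&comma;' }.freeze

def convert_sym_to_punct(token)
  # sub! returns nil when nothing was replaced, so break only fires once
  # a placeholder has been swapped back to its punctuation mark.
  PUNCTUATION_MAP.each { |pattern, replacement| break if token.sub!(replacement, pattern) }
  token
end

p convert_sym_to_punct('end&dot;') # => "end."
p convert_sym_to_punct('plain')    # => "plain"

Compared with the old gsub!-based branch, this restores at most one placeholder per token and never scans the map further than it must.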
lib/pragmatic_tokenizer/tokenizer.rb
CHANGED
@@ -64,7 +64,7 @@ module PragmaticTokenizer
     REGEXP_NO_NUMBERS       = /\A\D+\z/
     REGEXP_NUMBER           = /\D*\d+\d*/
    REGEXP_CONSECUTIVE_DOTS = /\A\.{2,}\z/
-    REGEXP_CHUNK_STRING     =
+    REGEXP_CHUNK_STRING     = /\S.{1,10000}(?!\S)/m
 
     # @param [Hash] opts optional arguments
 
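The filled-in REGEXP_CHUNK_STRING feeds the .scan call in #tokenize, slicing arbitrarily long input into chunks of roughly 10,000 characters before pre- and post-processing: each chunk starts on a non-space character, and the (?!\S) lookahead forbids ending mid-word. The same pattern with a toy limit of 10, so the behavior is visible on a short string:

# /m lets "." cross newlines, so a chunk can span multiple lines.
chunk = /\S.{1,10}(?!\S)/m

p "the quick brown fox jumps".scan(chunk)
# => ["the quick", "brown fox", "jumps"]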
@@ -150,7 +150,7 @@ module PragmaticTokenizer
 
     def tokenize(text)
       return [] unless text
-      raise "In
+      raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
       CGI.unescapeHTML(text)
          .scan(REGEXP_CHUNK_STRING)
          .flat_map { |segment| post_process(pre_process(segment)) }
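The sharper raise message comes with a subclass-friendly type check: Module#<= is true for a class compared against itself or an ancestor and nil for unrelated classes, so String subclasses now pass while everything else still raises. For instance:

# Module#<= : true for self or a subclass, nil when unrelated.
class Emphatic < String; end

p Emphatic <= String # => true
p String   <= String # => true
p Symbol   <= String # => nil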
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 3.0.5
+  version: 3.0.6
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2018-03-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -169,7 +169,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.6.
+rubygems_version: 2.6.14
 signing_key:
 specification_version: 4
 summary: A multilingual tokenizer