RubyGems - pragmatic_tokenizer - Versions diffs - 0.5.0 → 1.0.0 - Mend

pragmatic_tokenizer 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)

checksums.yaml +4 -4
data/README.md +133 -151
data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +31 -0
data/lib/pragmatic_tokenizer/full_stop_separator.rb +38 -0
data/lib/pragmatic_tokenizer/languages/arabic.rb +3 -3
data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
data/lib/pragmatic_tokenizer/languages/catalan.rb +3 -3
data/lib/pragmatic_tokenizer/languages/common.rb +14 -8
data/lib/pragmatic_tokenizer/languages/czech.rb +3 -3
data/lib/pragmatic_tokenizer/languages/danish.rb +3 -3
data/lib/pragmatic_tokenizer/languages/deutsch.rb +2 -2
data/lib/pragmatic_tokenizer/languages/dutch.rb +3 -3
data/lib/pragmatic_tokenizer/languages/english.rb +2 -2
data/lib/pragmatic_tokenizer/languages/finnish.rb +3 -3
data/lib/pragmatic_tokenizer/languages/french.rb +3 -3
data/lib/pragmatic_tokenizer/languages/greek.rb +3 -3
data/lib/pragmatic_tokenizer/languages/indonesian.rb +3 -3
data/lib/pragmatic_tokenizer/languages/italian.rb +3 -3
data/lib/pragmatic_tokenizer/languages/latvian.rb +3 -3
data/lib/pragmatic_tokenizer/languages/norwegian.rb +3 -3
data/lib/pragmatic_tokenizer/languages/persian.rb +3 -3
data/lib/pragmatic_tokenizer/languages/polish.rb +3 -3
data/lib/pragmatic_tokenizer/languages/portuguese.rb +3 -3
data/lib/pragmatic_tokenizer/languages/romanian.rb +3 -3
data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
data/lib/pragmatic_tokenizer/languages/slovak.rb +3 -3
data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
data/lib/pragmatic_tokenizer/languages/swedish.rb +3 -3
data/lib/pragmatic_tokenizer/languages/turkish.rb +3 -3
data/lib/pragmatic_tokenizer/languages.rb +0 -2
data/lib/pragmatic_tokenizer/post_processor.rb +49 -0
data/lib/pragmatic_tokenizer/{processor.rb → pre_processor.rb} +35 -98
data/lib/pragmatic_tokenizer/tokenizer.rb +186 -159
data/lib/pragmatic_tokenizer/version.rb +1 -1
metadata +6 -3

data/lib/pragmatic_tokenizer/tokenizer.rb CHANGED Viewed

@@ -1,221 +1,248 @@
 # -*- encoding : utf-8 -*-
 require 'pragmatic_tokenizer/languages'
+require 'pragmatic_tokenizer/pre_processor'
+require 'pragmatic_tokenizer/post_processor'
+require 'pragmatic_tokenizer/full_stop_separator'
+require 'pragmatic_tokenizer/ending_punctuation_separator'
 require 'unicode'
 module PragmaticTokenizer
   class Tokenizer
-    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase, :remove_en_stop_words
-    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true, remove_en_stop_words: false)
+    attr_reader :text, :punctuation, :language_module, :expand_contractions, :numbers, :minimum_length, :downcase, :classic_filter, :filter_languages, :abbreviations, :contractions, :clean, :remove_stop_words, :stop_words, :remove_emoji, :remove_emails, :mentions, :hashtags, :remove_urls, :remove_domains, :long_word_split
+    # @param [String] text to be tokenized
+    # @param [Hash] opts optional arguments
+    # @option opts [Array] :filter_languages - user-supplied array of languages from which that language's stop words, abbreviations and contractions should be used when calculating the resulting tokens - array elements should be of the String class or can be symbols
+    # @option opts [String] :language - two character ISO 639-1 code - can be a String or symbol (i.e. :en or 'en')
+    # @option opts [Boolean] :expand_contractions - (default: false)
+    # @option opts [Boolean] :remove_stop_words - (default: false)
+    # @option opts [Array] :abbreviations - user-supplied array of abbreviations (each element should be downcased with final period removed) - array elements should be of the String class
+    # @option opts [Array] :stop_words - user-supplied array of stop words - array elements should be of the String class
+    # @option opts [Hash]  :contractions - user-supplied hash of contractions (key is the contracted form; value is the expanded form - both the key and value should be downcased)
+    # @option opts [String] :punctuation - see description below - can be a String or symbol (i.e. :none or 'none')
+      # Punctuation 'all': Does not remove any punctuation from the result
+      # Punctuation 'semi': Removes common punctuation (such as full stops)
+      # and does not remove less common punctuation (such as questions marks)
+      # This is useful for text alignment as less common punctuation can help
+      # identify a sentence (like a fingerprint) while common punctuation
+      # (like stop words) should be removed.
+      # Punctuation 'none': Removes all punctuation from the result
+      # Punctuation 'only': Removes everything except punctuation. The
+      # returned result is an array of only the punctuation.
+    # @option opts [String] :numbers - see description below - can be a String or symbol (i.e. :none or 'none')
+      # Numbers 'all': Does not remove any numbers from the result
+      # Numbers 'semi': Removes tokens that include only digits
+      # Numbers 'none': Removes all tokens that include a number from the result (including Roman numerals)
+      # Numbers 'only': Removes everything except tokens that include a number
+    # @option opts [Integer] :minimum_length - minimum length of the token in characters
+    # @option opts [Integer] :long_word_split - the specified length to split long words at any hyphen or underscore.
+    # @option opts [String] :mentions - :remove (will completely remove it), :keep_and_clean (will prefix) and :keep_original (don't alter the token at all). - can be a String or symbol (i.e. :keep_and_clean or 'keep_and_clean')
+    # @option opts [String] :hashtags - :remove (will completely remove it), :keep_and_clean (will prefix) and :keep_original (don't alter the token at all). - can be a String or symbol (i.e. :keep_and_clean or 'keep_and_clean')
+    # @option opts [Boolean] :downcase - (default: true)
+    # @option opts [Boolean] :classic_filter - removes dots from acronyms and 's from the end of tokens - (default: false)
+    # @option opts [Boolean] :remove_emoji - (default: false)
+    # @option opts [Boolean] :remove_emails - (default: false)
+    # @option opts [Boolean] :remove_urls - (default: false)
+    # @option opts [Boolean] :remove_domains - (default: false)
+    def initialize(text, opts = {})
+      @text                     = CGI.unescapeHTML(text)
+      @filter_languages         = opts[:filter_languages] || []
+      @language                 = opts[:language] || 'en'
+      @language_module          = Languages.get_language_by_code(@language.to_s)
+      @expand_contractions      = opts[:expand_contractions] || false
+      @remove_stop_words        = opts[:remove_stop_words] || false
+      if @filter_languages.empty?
+        @abbreviations          = opts[:abbreviations] || @language_module::ABBREVIATIONS
+        @contractions           = opts[:contractions] || @language_module::CONTRACTIONS
+        @stop_words             = opts[:stop_words] || @language_module::STOP_WORDS
+      else
+        merged_abbreviations = []
+        @filter_languages.map { |l| merged_abbreviations << Languages.get_language_by_code(l.to_s)::ABBREVIATIONS.flatten }
+        merged_abbreviations << opts[:abbreviations].flatten unless opts[:abbreviations].nil?
+        @abbreviations          =  merged_abbreviations.flatten
+        merged_contractions = {}
+        @filter_languages.map { |l| merged_contractions = merged_contractions.merge(Languages.get_language_by_code(l.to_s)::CONTRACTIONS) }
+        merged_contractions = merged_contractions.merge(opts[:contractions]) unless opts[:contractions].nil?
+        @contractions           =  merged_contractions
+        merged_stop_words = []
+        @filter_languages.map { |l| merged_stop_words << Languages.get_language_by_code(l.to_s)::STOP_WORDS.flatten }
+        merged_stop_words << opts[:stop_words].flatten unless opts[:stop_words].nil?
+        @stop_words             =  merged_stop_words.flatten
+      end
+      @punctuation              = opts[:punctuation] || 'all'
+      @numbers                  = opts[:numbers] || 'all'
+      @minimum_length           = opts[:minimum_length] || 0
+      @long_word_split          = opts[:long_word_split]
+      @mentions                 = opts[:mentions] || 'keep_original'
+      @hashtags                 = opts[:hashtags] || 'keep_original'
+      @downcase                 = opts[:downcase].nil? ? true : opts[:downcase]
+      @clean                    = opts[:clean] || false
+      @classic_filter           = opts[:classic_filter] || false
+      @remove_emoji             = opts[:remove_emoji] || false
+      @remove_emails            = opts[:remove_emails] || false
+      @remove_urls              = opts[:remove_urls] || false
+      @remove_domains           = opts[:remove_domains] || false
       unless punctuation.to_s.eql?('all') ||
         punctuation.to_s.eql?('semi') ||
         punctuation.to_s.eql?('none') ||
         punctuation.to_s.eql?('only')
         raise "Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'"
-        # Punctuation 'all': Does not remove any punctuation from the result
-        # Punctuation 'semi': Removes common punctuation (such as full stops)
-        # and does not remove less common punctuation (such as questions marks)
-        # This is useful for text alignment as less common punctuation can help
-        # identify a sentence (like a fingerprint) while common punctuation
-        # (like stop words) should be removed.
-        # Punctuation 'none': Removes all punctuation from the result
-        # Punctuation 'only': Removes everything except punctuation. The
-        # returned result is an array of only the punctuation.
+      end
+      unless numbers.to_s.eql?('all') ||
+        numbers.to_s.eql?('semi') ||
+        numbers.to_s.eql?('none') ||
+        numbers.to_s.eql?('only')
+        raise "Numbers argument can be only be nil, 'all', 'semi', 'none', or 'only'"
+      end
+      unless mentions.to_s.eql?('keep_original') ||
+        mentions.to_s.eql?('keep_and_clean') ||
+        mentions.to_s.eql?('remove')
+        raise "Mentions argument can be only be nil, 'keep_original', 'keep_and_clean', or 'remove'"
       end
       raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
-      @text = CGI.unescapeHTML(text)
-      @language = language.to_s
-      @language_module = Languages.get_language_by_code(language.to_s)
-      @punctuation = punctuation.to_s
-      @remove_stop_words = remove_stop_words
-      @expand_contractions = expand_contractions
-      @clean = clean
-      @remove_numbers = remove_numbers
-      @minimum_length = minimum_length
-      @remove_roman_numerals = remove_roman_numerals
-      @downcase = downcase
-      @remove_en_stop_words = remove_en_stop_words
+      raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless minimum_length.class == Fixnum || minimum_length.nil?
+      raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless long_word_split.class == Fixnum || long_word_split.nil?
     end
     def tokenize
       return [] unless text
       tokens = []
       text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
-        tokens << delete_en_stop_words(
-          delete_stop_words(
-          downcase_tokens(
-          cleaner(
-          remove_short_tokens(
-          delete_numbers(
-          delete_roman_numerals(
-          find_contractions(
-          remove_punctuation(
-          split_at_middle_period_1(
-          split_at_middle_period_2(
-          split_beginning_period(
-          split_at_plus_sign(
-          shift_no_spaces_between_sentences(
-          split_at_forward_slash(
-            processor.new(language: language_module).process(text: segment)
-          ))))))))))))))).reject { |t| t.empty? }
+        tokens << post_process(PreProcessor.new(language: language_module).pre_process(text: segment))
       end
       tokens.flatten
     end
-    def domains
-      text.split(' ').delete_if { |t| t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }.map { |t| t.chomp('.').chomp(',').chomp(';').chomp(':') }
-    end
-    def urls
-      text.split(' ').delete_if { |t| t !~ /(http|https|www)(\.|:)/ }.map { |t| t.chomp('.') }
-    end
-    def emails
-      text.split(' ').delete_if { |t| t !~ /\S+(＠|@)\S+/ }.map { |t| t.chomp('.') }
-    end
-    def hashtags
-      text.split(' ').delete_if { |t| t !~ /(#|＃)/ }.map { |t| t.chomp('.') }
-    end
-    def mentions
-      text.split(' ').delete_if { |t| t !~ /(@|＠)/ }.map { |t| t.chomp('.') }
-    end
-    def emoticons
-      text.scan(/(?::|;|=)(?:-)?(?:\)|D|P)/)
-    end
-    def emoji
-      # https://github.com/franklsf95/ruby-emoji-regex
-      text.scan(/[\u{203C}\u{2049}\u{20E3}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{23E9}-\u{23EC}\u{23F0}\u{23F3}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2601}\u{260E}\u{2611}\u{2614}-\u{2615}\u{261D}\u{263A}\u{2648}-\u{2653}\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267F}\u{2693}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26CE}\u{26D4}\u{26EA}\u{26F2}-\u{26F3}\u{26F5}\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270C}\u{270F}\u{2712}\u{2714}\u{2716}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E7}-\u{1F1EC}\u{1F1EE}-\u{1F1F0}\u{1F1F3}\u{1F1F5}\u{1F1F7}-\u{1F1FA}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F320}\u{1F330}-\u{1F335}\u{1F337}-\u{1F37C}\u{1F380}-\u{1F393}\u{1F3A0}-\u{1F3C4}\u{1F3C6}-\u{1F3CA}\u{1F3E0}-\u{1F3F0}\u{1F400}-\u{1F43E}\u{1F440}\u{1F442}-\u{1F4F7}\u{1F4F9}-\u{1F4FC}\u{1F500}-\u{1F507}\u{1F509}-\u{1F53D}\u{1F550}-\u{1F567}\u{1F5FB}-\u{1F640}\u{1F645}-\u{1F64F}\u{1F680}-\u{1F68A}]/)
-    end
     private
-    def processor
-      language_module::Processor
-    rescue
-      Processor
-    end
-    def split_at_middle_period_1(tokens)
-      tokens.flat_map { |t| t.include?(".") &&
-        t !~ /(http|https|www)(\.|:)/ &&
-        t.length > 1 &&
-        t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
-        t !~ /\S+(＠|@)\S+/ &&
-        language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) ? t.gsub!(/\./, '\1. \2').split(' ').flatten : t }
-    end
-    def split_at_middle_period_2(tokens)
-      tokens.flat_map { |t| t.include?(".") &&
-        t !~ /(http|https|www)(\.|:)/ &&
-        t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
-        t !~ /\.[a-z]{2}/ &&
-        t.length > 2 &&
-        t.count(".") == 1 &&
-        t !~ /\d+/ &&
-        !language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) &&
-        t !~ /\S+(＠|@)\S+/ ? t.gsub!(/\./, '\1 . \2').split(' ').flatten : t }
-    end
-    def split_beginning_period(tokens)
-      tokens.flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub!(/\./, '\1 ').split(' ').flatten : t }
-    end
-    def shift_no_spaces_between_sentences(tokens)
-      tokens.flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub!(/\?/, '\1 \2').split(' ').flatten : t }
-    end
-    def downcase_tokens(tokens)
-      return tokens unless downcase
-      tokens.map { |t| Unicode::downcase(t) }
-    end
-    def remove_short_tokens(tokens)
-      tokens.delete_if { |t| t.length < minimum_length }
-    end
-    def delete_numbers(tokens)
-      return tokens unless remove_numbers
-      tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
-    end
-    def delete_roman_numerals(tokens)
-      return tokens unless remove_roman_numerals
-      tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") } if remove_roman_numerals
+    def post_process(text)
+      @tokens = PostProcessor.new(text: text, abbreviations: abbreviations).post_process
+      downcase! if downcase
+      expand_contractions!(contractions) if expand_contractions
+      clean! if clean
+      classic_filter! if classic_filter
+      process_numbers!
+      remove_short_tokens! if minimum_length > 0
+      process_punctuation!
+      remove_stop_words!(stop_words) if remove_stop_words
+      remove_emoji! if remove_emoji
+      remove_emails! if remove_emails
+      mentions! if mentions
+      hashtags! if hashtags
+      remove_urls! if remove_urls
+      remove_domains! if remove_domains
+      split_long_words! if long_word_split
+      @tokens.reject { |t| t.empty? }
+    end
+    def downcase!
+      @tokens.map! { |t| Unicode::downcase(t) }
+    end
+    def expand_contractions!(contractions)
+      if downcase
+        @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›＇´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›＇´`]/, "'"))].split(' ').flatten : t }
+      else
+        @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›＇´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›＇´`]/, "'"))].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
+      end
     end
-    def cleaner(tokens)
-      return tokens unless clean
-      tokens.flat_map { |t| t =~ /(\A|\s)\@/ ? t.gsub!(/\@/, '\1 ').split(' ').flatten : t }
-        .flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
+    def clean!
+      @tokens = @tokens.flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
         .flat_map { |t| t =~ /\_+(?=\s)/ ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
         .flat_map { |t| t =~ /(?<=\A)\_+/ ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
         .flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
         .flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
+        .map { |t| t.gsub(/[[:cntrl:]]/, '') }
         .delete_if { |t| t =~ /\A-+\z/ ||
         PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
         t =~ /\A\.{2,}\z/ || t.include?("\\") ||
         t.length > 50 ||
-        (t.length > 1 && t =~ /[#&*+<=>@^|~]/i)
+        (t.length > 1 && t =~ /[&*+<=>^|~]/i)
       }
     end
-    def remove_punctuation(tokens)
-      case punctuation
-      when 'all'
-        tokens
+    def classic_filter!
+      @tokens.map! { |t| abbreviations.include?(t.chomp(".")) ? t.gsub('.', '').chomp("'s").chomp("’s").chomp("`s") : t.chomp("'s").chomp("’s").chomp("`s") }
+    end
+    def process_numbers!
+      case numbers.to_s
       when 'semi'
-        tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
+        @tokens.delete_if { |t| t =~ /\A\d+\z/ }
       when 'none'
-        tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
+        @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") }
       when 'only'
-        only_punctuation(tokens)
+        @tokens.delete_if { |t| t =~ /\A\D+\z/ }
       end
     end
-    def only_punctuation(tokens)
-      tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+    def remove_short_tokens!
+      @tokens.delete_if { |t| t.length < minimum_length }
     end
-    def delete_stop_words(tokens)
-      return tokens unless remove_stop_words && language_module::STOP_WORDS
-      if downcase
-        tokens.map { |t| Unicode::downcase(t) } - language_module::STOP_WORDS
-      else
-        tokens.delete_if { |t| language_module::STOP_WORDS.include?(Unicode::downcase(t)) }
+    def process_punctuation!
+      case punctuation.to_s
+      when 'semi'
+        @tokens = @tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
+      when 'none'
+        @tokens =  @tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
+      when 'only'
+        @tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
       end
     end
-    def delete_en_stop_words(tokens)
-      return tokens unless remove_en_stop_words
+    def remove_stop_words!(stop_words)
       if downcase
-        tokens.map { |t| Unicode::downcase(t) } - PragmaticTokenizer::Languages::English::STOP_WORDS
+        @tokens = @tokens - stop_words
       else
-        tokens.delete_if { |t| PragmaticTokenizer::Languages::English::STOP_WORDS.include?(Unicode::downcase(t)) }
+        @tokens.delete_if { |t| stop_words.include?(Unicode::downcase(t)) }
       end
     end
-    def split_at_forward_slash(tokens)
-      tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
+    def remove_emoji!
+      @tokens.delete_if { |t| t =~ PragmaticTokenizer::Languages::Common::EMOJI_REGEX }
     end
-    def split_at_plus_sign(tokens)
-      tokens.flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
+    def remove_emails!
+      @tokens.delete_if { |t| t =~ /\S+(＠|@)\S+/ }.map { |t| t.chomp('.') }
     end
-    def find_contractions(tokens)
-      return tokens unless expand_contractions && language_module::CONTRACTIONS
-      if downcase
-        tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›＇´`]/, "'"))) ? language_module::CONTRACTIONS[Unicode::downcase(t.gsub(/[‘’‚‛‹›＇´`]/, "'"))].split(' ').flatten : t }
-      else
-        tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›＇´`]/, "'"))) ? language_module::CONTRACTIONS[Unicode::downcase(t.gsub(/[‘’‚‛‹›＇´`]/, "'"))].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
+    def mentions!
+      case mentions.to_s
+      when 'remove'
+        @tokens.delete_if { |t| t =~ /\A(@|＠)/ }
+      when 'keep_and_clean'
+        @tokens.map! { |t| t =~ /\A(@|＠)/ ? t.gsub!(/(?<=\A)(@|＠)/, '') : t }
+      end
+    end
+    def hashtags!
+      case hashtags.to_s
+      when 'remove'
+        @tokens.delete_if { |t| t =~ /\A(#|＃)/ }
+      when 'keep_and_clean'
+        @tokens.map! { |t| t =~ /\A(#|＃)/ ? t.gsub!(/(?<=\A)(#|＃)/, '') : t }
       end
     end
+    def remove_urls!
+      @tokens.delete_if { |t| t =~ /(http|https)(\.|:)/ }
+    end
+    def remove_domains!
+      @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
+    end
+    def split_long_words!
+      @tokens.map! { |t| t.length > long_word_split ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
+        .map! { |t| t.length > long_word_split ? t.gsub(/\_/, '\1 \2').split(' ').flatten : t }
+    end
   end
-end
+end

data/lib/pragmatic_tokenizer/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "0.5.0"
+  VERSION = "1.0.0"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 1.0.0
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-01-15 00:00:00.000000000 Z
+date: 2016-01-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode
@@ -97,6 +97,8 @@ files:
 - bin/console
 - bin/setup
 - lib/pragmatic_tokenizer.rb
+- lib/pragmatic_tokenizer/ending_punctuation_separator.rb
+- lib/pragmatic_tokenizer/full_stop_separator.rb
 - lib/pragmatic_tokenizer/languages.rb
 - lib/pragmatic_tokenizer/languages/arabic.rb
 - lib/pragmatic_tokenizer/languages/bulgarian.rb
@@ -123,7 +125,8 @@ files:
 - lib/pragmatic_tokenizer/languages/spanish.rb
 - lib/pragmatic_tokenizer/languages/swedish.rb
 - lib/pragmatic_tokenizer/languages/turkish.rb
-- lib/pragmatic_tokenizer/processor.rb
+- lib/pragmatic_tokenizer/post_processor.rb
+- lib/pragmatic_tokenizer/pre_processor.rb
 - lib/pragmatic_tokenizer/tokenizer.rb
 - lib/pragmatic_tokenizer/version.rb
 - pragmatic_tokenizer.gemspec