pragmatic_tokenizer 3.0.4 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
 - data/lib/pragmatic_tokenizer/languages.rb +26 -26
 - data/lib/pragmatic_tokenizer/languages/arabic.rb +2 -2
 - data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
 - data/lib/pragmatic_tokenizer/languages/common.rb +14 -24
 - data/lib/pragmatic_tokenizer/languages/czech.rb +2 -2
 - data/lib/pragmatic_tokenizer/languages/danish.rb +2 -2
 - data/lib/pragmatic_tokenizer/languages/deutsch.rb +3 -93
 - data/lib/pragmatic_tokenizer/languages/dutch.rb +2 -2
 - data/lib/pragmatic_tokenizer/languages/english.rb +11 -14
 - data/lib/pragmatic_tokenizer/languages/finnish.rb +2 -2
 - data/lib/pragmatic_tokenizer/languages/french.rb +36 -9
 - data/lib/pragmatic_tokenizer/languages/greek.rb +2 -2
 - data/lib/pragmatic_tokenizer/languages/indonesian.rb +2 -2
 - data/lib/pragmatic_tokenizer/languages/italian.rb +1 -1
 - data/lib/pragmatic_tokenizer/languages/norwegian.rb +2 -2
 - data/lib/pragmatic_tokenizer/languages/persian.rb +2 -2
 - data/lib/pragmatic_tokenizer/languages/polish.rb +2 -2
 - data/lib/pragmatic_tokenizer/languages/portuguese.rb +1 -1
 - data/lib/pragmatic_tokenizer/languages/romanian.rb +2 -2
 - data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
 - data/lib/pragmatic_tokenizer/languages/slovak.rb +2 -2
 - data/lib/pragmatic_tokenizer/languages/swedish.rb +2 -2
 - data/lib/pragmatic_tokenizer/languages/turkish.rb +2 -2
 - data/lib/pragmatic_tokenizer/post_processor.rb +42 -88
 - data/lib/pragmatic_tokenizer/pre_processor.rb +33 -142
 - data/lib/pragmatic_tokenizer/regex.rb +150 -0
 - data/lib/pragmatic_tokenizer/tokenizer.rb +81 -115
 - data/lib/pragmatic_tokenizer/version.rb +1 -1
 - data/pragmatic_tokenizer.gemspec +5 -6
 - data/spec/languages/english_spec.rb +13 -0
 - data/spec/languages/french_spec.rb +2 -2
 - data/spec/performance_spec.rb +0 -1
 - data/spec/spec_helper.rb +1 -1
 - metadata +12 -12
 - data/lib/pragmatic_tokenizer/full_stop_separator.rb +0 -62
 
data/lib/pragmatic_tokenizer/regex.rb

@@ -0,0 +1,150 @@
+module PragmaticTokenizer
+  class Regex
+
+    # Things that can or should be done:
+    # - check where the use of unicode categories helps (\p{Abbreviation})
+    # - use URI.parse and other libraries instead of regexp to identify urls, domains, emails
+    # - check multiple domain regex, we have spec issues when using one or the other
+    # - check multiple punctuation regex
+
+    # Text that needs to be tokenized is initially split into chunks of this length:
+    CHUNK_LONG_INPUT_TEXT         = /\S.{1,10000}(?!\S)/m
+
+    # Ranges
+    RANGE_DINGBATS                = /[\u2701-\u27BE]/ # e.g. ✁✎✳❄➾
+    RANGE_VARIATION_SELECTORS     = /[\uFE00-\uFE0F]/ # alter the previous character
+    RANGE_FULLWIDTH               = /[\uFF01-\uFF1F]/ # e.g. ！＂＃＇？
+    RANGE_ALPHANUMERIC_SUPPLEMENT = /[\u{1F100}-\u{1F1FF}]/
+    RANGE_UNUSUAL_AND_EMOJI       = /[\u203C-\u3299\u{1F000}-\u{1F644}]/
+
+    # Regular expressions which do not need to capture anything are enclosed in /(?: … )/ to enhance performance
+    COLON1                        = /(?:(:)([[:print:]]{2,}))/ # two non-space after colon prevent matching emoticons
+    COLON2                        = /(?::)/
+    COMMAS                        = /(?:([,‚])+)/
+    ENCLOSED_PLUS                 = /(?:([[:print:]]+)\+([[:print:]]+))/
+    EMAIL                         = /(?:[[:print:]]+[@＠][[:print:]]+\.[[:print:]]+)/
+    DIGIT                         = /(?:[[:digit:]]+)/
+    ASTERISK                      = /(?:\*+)/
+    UNDERSCORE                    = /(?:_+)/
+    HYPHEN_OR_UNDERSCORE          = /(?:[-_])/
+    LONG_WORD_SPLIT               = /(?:[-_\/—–])/
+    PERIOD_AND_PRIOR              = /(?:(.+\.))/
+    PERIOD_ONLY                   = /(?:(\.))/
+    CONTRACTIONS                  = /(?:[‘’‚‛‹›'´`])/
+    PUNCTUATION1                  = /(?:([\p{Pd}\p{Pe}\p{Pf}\p{Pi}\p{Ps}])+)/ # all punctuation categories except Pc (Connector) and Po (other)
+    PUNCTUATION2                  = /(?:(?<=\S)([!?#{RANGE_FULLWIDTH.source}]+))/
+    PUNCTUATION3                  = /(?:[!%\-–\u00AD]+)/
+    PUNCTUATION4                  = /(?:[.．。]+)/
+    DINGBATS                      = /(?:(#{RANGE_DINGBATS.source}#{RANGE_VARIATION_SELECTORS.source}*+))/
+    NO_BREAK_SPACE                = /(?:\u00A0+)/
+    HTTP                          = /(?:https?:\/\/)/
+    TIME_WITH_COLON               = /(?:\d:\d)/
+    DOMAIN_PREFIX                 = /(?:https?:\/\/|www\.|[[:alpha:]]\.)/
+    DOMAIN_SUFFIX                 = /(?:[[:alpha:]]\.(?:com|net|org|edu|gov|mil|int|[[:alpha:]]{2}))/
+    DOMAIN1                       = /(?:((https?:\/\/|)[[:print:]]+\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?))/
+    DOMAIN2                       = /(?:[[:alnum:]]{2,}([\-.][[:alnum:]]+)*\.[[:alpha:]]{2,6}(:[0-9]{1,5})?(\/[[:print:]]*+)?)/
+    NOT_URL                       = /(?:^(?!#{DOMAIN_PREFIX.source})([[:print:]]*))/
+    HASHTAG_OR_MENTION            = /(?:[@#＠＃][[:print:]]+)/
+    HASHTAG                       = /(?:[#＃][[:print:]]+)/
+    MENTION                       = /(?:[@＠][[:print:]]+)/
+    HASHTAG_WITH_HYPHEN           = /(?:^([#＃][[:digit:]]+)-)/
+    ONE_AS_EXCLAMATION            = /(?:\D1+)/
+    ONES_EXCLAMATIONS             = /(?:!+(1*+!*+)*+)/
+    MANY_PERIODS                  = /(?:^\.{2,}$)/
+    COPYRIGHT_TRADEMARK           = /(?:[®©™]+)/
+    CONTROL_CHARACTER             = /(?:[[:cntrl:]]+)/ # matches any character with hexadecimal value 00 through 1F or 7F.
+    APOSTROPHE_AND_S              = /(?:['’`́]s)/
+    ALSO_DECIMALS                 = /(?:[[:alpha:]]*+[[:digit:]]+)/
+    ACUTE_ACCENT_S                = /(?:\s\u0301(?=s))/
+
+    # Regular expressions used to capture items
+    CAPTURE_UNUSUAL_AND_EMOJI     = /(#{RANGE_UNUSUAL_AND_EMOJI.source})/
+    QUESTION_MARK_NOT_URL         = /#{NOT_URL.source}(\?)/
+    # Should we change specs and also capture "/", just like we capture ":" and "?"
+    SLASH_NOT_URL                 = /#{NOT_URL.source}\//
+    SHIFT_BOUNDARY_CHARACTERS     = /([;^&|…«»„“¿¡≠]+)/
+    MULTIPLE_DOTS                 = /(\.{2,})/ # we keep all dashes
+    MULTIPLE_DASHES               = /(-){2,}/ # we only keep first dash
+    BRACKET                       = /([{}()\[\]])/
+    EXCLAMATION_BETWEEN_ALPHA     = /(?<=[[:alpha:]])(!)(?=[[:alpha:]])/
+    PERCENT_BEFORE_DIGIT          = /(%)\d+/
+    COMMA_BEFORE_NON_DIGIT        = /(,)(?=\D)/
+    COMMA_AFTER_NON_DIGIT         = /(?<=\D)(,)/
+    COLON_IN_URL                  = /(?<=[(https?|ftp)]):(?=\/\/)/
+    QUOTE_BEFORE_PRINT            = /(('')|["“])(?=[[:print:]])/
+    QUOTE                         = /('')|["”]/
+    HYPHEN_AFTER_NON_WORD         = /(?<=\W)(-)/
+    HYPHEN_BEFORE_NON_WORD        = /(-)(?=\W)/
+
+    STARTS_WITH_COMMAS            = /^#{COMMAS.source}/
+    STARTS_WITH_HTTP              = /^#{HTTP.source}/
+    STARTS_WITH_DOMAIN            = /^#{DOMAIN_PREFIX.source}/
+    STARTS_WITH_COLON1            = /^#{COLON1.source}/
+    STARTS_WITH_UNDERSCORE        = /^#{UNDERSCORE.source}/
+    STARTS_WITH_PUNCTUATION3      = /^#{PUNCTUATION3.source}/
+
+    ENDS_WITH_DOMAIN              = /#{DOMAIN_SUFFIX.source}$/
+    ENDS_WITH_PUNCTUATION1        = /#{PUNCTUATION1.source}$/
+    ENDS_WITH_PUNCTUATION2        = /#{PUNCTUATION2.source}$/
+    ENDS_WITH_COLON2              = /#{COLON2.source}$/
+    ENDS_WITH_UNDERSCORE          = /#{UNDERSCORE.source}$/
+    ENDS_WITH_ONES_EXCLAMATIONS   = /#{ONES_EXCLAMATIONS.source}$/
+    ENDS_WITH_EXCITED_ONE         = /#{ONE_AS_EXCLAMATION.source}$/
+    ENDS_WITH_APOSTROPHE_AND_S    = /#{APOSTROPHE_AND_S.source}$/
+    ENDS_WITH_ALPHA               = /[[:alpha:]]$/
+    ENDS_WITH_DIGIT               = /[[:digit:]]$/
+
+    ONLY_DECIMALS                 = /(?:^[[:digit:]]+$)/
+    NO_DECIMALS                   = /(?:^\D+$)/
+    ONLY_PUNCTUATION              = /^[[[:punct:]]^|+]+$/
+    ONLY_ROMAN_NUMERALS           = /^(?=[MDCLXVI])M*(C[MD]|D?C*)(X[CL]|L?X*)(I[XV]|V?I*)$/i
+    ONLY_EMAIL                    = /^#{EMAIL}$/
+    ONLY_HASHTAG_MENTION          = /^#{HASHTAG_OR_MENTION}$/
+    ONLY_HASHTAG                  = /^#{HASHTAG}$/
+    ONLY_MENTION                  = /^#{MENTION}$/
+    ONLY_DOMAIN1                  = /^#{DOMAIN1}$/
+    ONLY_DOMAIN2                  = /^#{DOMAIN2}$/
+    ONLY_DOMAIN3                  = Regexp.union(STARTS_WITH_DOMAIN, ENDS_WITH_DOMAIN)
+    DOMAIN_OR_EMAIL               = Regexp.union(ONLY_DOMAIN1, ONLY_EMAIL)
+    UNDERSCORES_ASTERISK          = Regexp.union(STARTS_WITH_UNDERSCORE, ENDS_WITH_UNDERSCORE, ASTERISK)
+    NO_DECIMALS_NO_NUMERALS       = Regexp.union(ALSO_DECIMALS, ONLY_ROMAN_NUMERALS)
+
+    COMMAS_OR_PUNCTUATION = Regexp.union(
+        STARTS_WITH_COMMAS,
+        ENDS_WITH_PUNCTUATION1,
+        ENDS_WITH_PUNCTUATION2
+    )
+
+    # Can this constant name be clarified?
+    VARIOUS = Regexp.union(
+        SLASH_NOT_URL,
+        QUESTION_MARK_NOT_URL,
+        ENCLOSED_PLUS,
+        STARTS_WITH_COLON1,
+        DINGBATS,
+        HASHTAG_WITH_HYPHEN,
+        CAPTURE_UNUSUAL_AND_EMOJI
+    )
+
+    IRRELEVANT_CHARACTERS = Regexp.union(
+        STARTS_WITH_PUNCTUATION3,
+        ENDS_WITH_COLON2,
+        ENDS_WITH_ONES_EXCLAMATIONS,
+        CONTROL_CHARACTER,
+        COPYRIGHT_TRADEMARK,
+        RANGE_ALPHANUMERIC_SUPPLEMENT
+    )
+
+    PRE_PROCESS = Regexp.union(
+        SHIFT_BOUNDARY_CHARACTERS,
+        MULTIPLE_DOTS,
+        BRACKET,
+        MULTIPLE_DASHES,
+        EXCLAMATION_BETWEEN_ALPHA,
+        PERCENT_BEFORE_DIGIT,
+        COMMA_BEFORE_NON_DIGIT,
+        COMMA_AFTER_NON_DIGIT
+    )
+
+  end
+end
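The new Regex class gathers every pattern the tokenizer relies on into one namespace. A minimal sketch of how a few of these constants behave, assuming the released gem is loaded via require 'pragmatic_tokenizer'; the return values are what the patterns imply, not part of the diff:

    require 'pragmatic_tokenizer'

    # Input text is first cut into chunks of at most ~10,000 characters,
    # each ending on a whitespace boundary:
    chunks = ("word " * 5_000).scan(PragmaticTokenizer::Regex::CHUNK_LONG_INPUT_TEXT)
    chunks.length                                               # => 3

    # Token-level tests used by the post-processing options:
    "XVI"   =~ PragmaticTokenizer::Regex::ONLY_ROMAN_NUMERALS   # => 0   (match)
    "word"  =~ PragmaticTokenizer::Regex::ONLY_ROMAN_NUMERALS   # => nil
    "!!!"   =~ PragmaticTokenizer::Regex::ONLY_PUNCTUATION      # => 0   (match)
    "#ruby" =~ PragmaticTokenizer::Regex::ONLY_HASHTAG          # => 0   (match)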
data/lib/pragmatic_tokenizer/tokenizer.rb

@@ -1,70 +1,22 @@
-# -*- encoding : utf-8 -*-
 require 'set'
 require 'cgi'
+require 'pragmatic_tokenizer/regex'
 require 'pragmatic_tokenizer/languages'
 require 'pragmatic_tokenizer/pre_processor'
 require 'pragmatic_tokenizer/post_processor'
-require 'pragmatic_tokenizer/full_stop_separator'
 require 'unicode'

 module PragmaticTokenizer
   class Tokenizer

-
-    NUMBERS_OPTIONS           = Set.new([
-    MENTIONS_OPTIONS          = Set.new([
+    PUNCTUATION_OPTIONS       = Set.new(%i[all semi none only]).freeze
+    NUMBERS_OPTIONS           = Set.new(%i[all semi none only]).freeze
+    MENTIONS_OPTIONS          = Set.new(%i[keep_original keep_and_clean remove]).freeze
     MAX_TOKEN_LENGTH          = 50
-
-
-
-
-    REGEX_URL                 = /(http|https)(\.|:)/
-    REGEX_HYPHEN              = /\-/
-    REGEX_LONG_WORD           = /\-|\_/
-    REGEXP_SPLIT_CHECK        = /@|＠|(http)/
-    REGEX_CONTRACTIONS        = /[‘’‚‛‹›'´`]/
-    REGEX_APOSTROPHE_S        = /['’`́]s$/
-    REGEX_EMAIL               = /\S+(@|＠)\S+\.\S+/
-    REGEX_HASHTAG_OR_MENTION  = /[@＠#|＃]/
-    REGEX_UNDERSCORE_AT_START = /(?<=\A)\_+/
-    REGEX_UNDERSCORE_AT_END   = /\_+(?=\z)/
-    REGEX_ASTERISK            = /\*+/
-    REGEX_UNIFIED1            = Regexp.union(REGEX_UNDERSCORE_AT_START,
-                                             REGEX_UNDERSCORE_AT_END,
-                                             REGEX_ASTERISK)
-    # https://en.wikipedia.org/wiki/Control_character
-    # matches any character with hexadecimal value 00 through 1F or 7F.
-    # Rubular: http://rubular.com/r/E83fpBoDjI
-    REGEXP_CONTROL                  = /[[:cntrl:]]/
-    REGEXP_ENDING_COLON             = /\:(?=\z)/
-    REGEXP_EXCLAMATION_AT_START     = /(?<=\A)!+(?=.+)/
-    REGEXP_EXCLAMATION_AT_END       = /!+(1*!*)*(?=\z)/
-    REGEXP_HYPHEN_AT_START          = /\A(-|–|\u{00AD})/
-    REGEXP_SPECIAL_SYMBOL           = /[®©]/
-    REGEXP_PERCENT_AT_START         = /\A\%/
-    # https://codepoints.net/enclosed_alphanumeric_supplement
-    REGEXP_ALPHANUMERIC_SUPPLEMENT  = /[\u{1F100}-\u{1F1FF}]/
-    REGEX_UNIFIED2                  = Regexp.union(REGEXP_CONTROL,
-                                                   REGEXP_ENDING_COLON,
-                                                   REGEXP_EXCLAMATION_AT_START,
-                                                   REGEXP_EXCLAMATION_AT_END,
-                                                   REGEXP_HYPHEN_AT_START,
-                                                   REGEXP_SPECIAL_SYMBOL,
-                                                   REGEXP_PERCENT_AT_START,
-                                                   REGEXP_ALPHANUMERIC_SUPPLEMENT)
-    REGEXP_ONE_AS_EXCLAMATION  = /(?<=\D)1+(?=\z)/
-    REGEXP_HASHTAG_AT_START    = /(?<=\A)(#|＃)/
-    REGEXP_AT_SIGN_AT_START    = /(?<=\A)(@|＠)/
-    REGEXP_HYPHEN_HASTAG       = /\A(#|＃)\S+-/
-    REGEXP_EMOJI_SNOWFLAKE     = /\u{2744}[\u{FE0F}|\u{FE0E}]?/
-    REGEX_EMOJI_UNIFIED        = Regexp.union(REGEXP_EMOJI_SNOWFLAKE,
-                                              PragmaticTokenizer::Languages::Common::EMOJI_REGEX)
-    REGEXP_PUNCTUATION_ONLY    = /\A[[:punct:]]+\z/
-    REGEXP_NUMBER_ONLY         = /\A\d+\z/
-    REGEXP_NO_NUMBERS          = /\A\D+\z/
-    REGEXP_NUMBER              = /\D*\d+\d*/
-    REGEXP_CONSECUTIVE_DOTS    = /\A\.{2,}\z/
-    REGEXP_CHUNK_STRING        = /.{,10000}(?=\s|\z)/m
+    NOTHING                   = ''.freeze
+    DOT                       = '.'.freeze
+    SPACE                     = ' '.freeze
+    SINGLE_QUOTE              = "'".freeze

     # @param [Hash] opts optional arguments

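The frozen option sets above are what the constructor now validates against. An illustrative construction, with option keys taken from the validation messages and instance variables elsewhere in this diff:

    require 'pragmatic_tokenizer'

    tokenizer = PragmaticTokenizer::Tokenizer.new(
      punctuation: :none,    # must be one of PUNCTUATION_OPTIONS
      numbers:     :semi,    # must be one of NUMBERS_OPTIONS
      mentions:    :remove   # must be one of MENTIONS_OPTIONS
    )

    # A value outside the set raises during initialization:
    PragmaticTokenizer::Tokenizer.new(mentions: :destroy)
    # RuntimeError: Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove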
@@ -124,7 +76,7 @@ module PragmaticTokenizer
       @abbreviations       = Set.new(opts[:abbreviations])
       @stop_words          = Set.new(opts[:stop_words])

-      #
+      # Why do we treat stop words differently than abbreviations and contractions? (we don't use @language_module::STOP_WORDS when passing @filter_languages)
       @contractions.merge!(@language_module::CONTRACTIONS) if @contractions.empty?
       @abbreviations       += @language_module::ABBREVIATIONS if @abbreviations.empty?
       @stop_words          += @language_module::STOP_WORDS if @stop_words.empty?
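User-supplied abbreviations, stop words, and contractions take precedence; the language-module defaults are merged in only when nothing was passed. A short sketch (the values are illustrative):

    # Custom lists replace the language defaults entirely:
    custom = PragmaticTokenizer::Tokenizer.new(
      abbreviations: ['dr', 'prof'],
      stop_words:    ['the', 'a', 'an']
    )

    # With no arguments, CONTRACTIONS, ABBREVIATIONS, and STOP_WORDS
    # fall back to the selected @language_module.
    default = PragmaticTokenizer::Tokenizer.new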
@@ -136,34 +88,43 @@ module PragmaticTokenizer
         @stop_words    += language::STOP_WORDS
       end

-      raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless
+      raise "Punctuation argument can be only be nil, :all, :semi, :none, or :only" unless PUNCTUATION_OPTIONS.include?(@punctuation)
       raise "Numbers argument can be only be nil, :all, :semi, :none, or :only" unless NUMBERS_OPTIONS.include?(@numbers)
       raise "Mentions argument can be only be nil, :keep_original, :keep_and_clean, or :remove" unless MENTIONS_OPTIONS.include?(@mentions)
-
-
+
+      integer_class = Gem::Version.new(RUBY_VERSION) < Gem::Version.new('2.4.0') ? Fixnum : Integer
+
+      raise "In Pragmatic Tokenizer minimum_length must be an Integer"  unless @minimum_length.class  == integer_class || @minimum_length.nil?
+      raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless @long_word_split.class == integer_class || @long_word_split.nil?
     end

     # @param [String] text to be tokenized

     def tokenize(text)
       return [] unless text
-      raise "In
+      raise "In PragmaticTokenizer text must be a String or subclass of String" unless text.class <= String
       CGI.unescapeHTML(text)
-          .scan(
-          .flat_map { |segment|
+          .scan(Regex::CHUNK_LONG_INPUT_TEXT)
+          .flat_map { |segment| process_segment(segment) }
     end

     private

-      def
-
+      def process_segment(segment)
+        pre_processed = pre_process(segment)
+        cased_segment = chosen_case(pre_processed)
+        @tokens       = PostProcessor.new(text: cased_segment, abbreviations: @abbreviations, downcase: @downcase).call
+        post_process_tokens
+      end
+
+      def pre_process(segment)
+        segment
            .extend(PragmaticTokenizer::PreProcessor)
            .pre_process(language: @language_module)
       end

-      def
-
-        remove_various!
+      def post_process_tokens
+        remove_by_options!
         process_numbers!
         process_punctuation!
         expand_contractions! if @expand_contractions
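The reworked tokenize method is now a small pipeline: guard the input, unescape HTML entities, split the text into chunks with Regex::CHUNK_LONG_INPUT_TEXT, and feed each chunk to process_segment. Behaviour implied by the lines above (illustrative):

    tokenizer = PragmaticTokenizer::Tokenizer.new

    tokenizer.tokenize(nil)  # => []  (nil short-circuits before any processing)
    tokenizer.tokenize(42)
    # RuntimeError: In PragmaticTokenizer text must be a String or subclass of String

    tokenizer.tokenize("Hello &amp; goodbye.")
    # CGI.unescapeHTML restores "&" before pre-processing, so with the default
    # options "&" should come back as its own token alongside "hello", "goodbye" and ".".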
@@ -177,45 +138,45 @@ module PragmaticTokenizer
         @tokens.reject(&:empty?)
       end

-      def run_post_processor(text)
-        PostProcessor.new(
-            text:          chosen_case(text),
-            abbreviations: @abbreviations,
-            downcase:      @downcase
-        ).post_process
-      end
-
       def expand_contractions!
-        @tokens = @tokens.flat_map { |
+        @tokens = @tokens.flat_map { |token| expand_token_contraction(token) }
       end

       def expand_token_contraction(token)
-        normalized = inverse_case(token.gsub(
+        normalized = inverse_case(token.gsub(Regex::CONTRACTIONS, SINGLE_QUOTE))
         return token unless @contractions.key?(normalized)
-        result    = @contractions[normalized].split(
+        result    = @contractions[normalized].split(SPACE)
         result[0] = Unicode.capitalize(result[0]) unless @downcase
         result
       end

       def clean!
         @tokens = @tokens
-            .flat_map
-            .map!
-            .
-            .delete_if { |t| unclean_token?(t) }
+            .flat_map  { |token| split_underscores_asterisk(token) }
+            .map!      { |token| remove_irrelevant_characters(token) }
+            .delete_if { |token| many_dots?(token) }
       end

-      def
-        return
-
-
-
+      def split_underscores_asterisk(token)
+        return token if token =~ Regex::ONLY_HASHTAG_MENTION
+        token.split(Regex::UNDERSCORES_ASTERISK)
+      end
+
+      def remove_irrelevant_characters(token)
+        token.gsub!(Regex::IRRELEVANT_CHARACTERS, NOTHING)
+        return token if token =~ Regex::ONLY_HASHTAG_MENTION
+        token.gsub!(Regex::ENDS_WITH_EXCITED_ONE, NOTHING)
+        token
+      end
+
+      def many_dots?(token)
+        token =~ Regex::MANY_PERIODS
       end

       def classic_filter!
         @tokens.map! do |token|
-          token.delete!(
-          token.sub!(
+          token.delete!(DOT) if @abbreviations.include?(token.chomp(DOT))
+          token.sub!(Regex::ENDS_WITH_APOSTROPHE_AND_S, NOTHING)
           token
         end
       end
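Contraction expansion and clean! are now composed from the small named helpers above. An illustrative run, assuming the English CONTRACTIONS map carries an entry such as "can't" => "cannot":

    tokenizer = PragmaticTokenizer::Tokenizer.new(expand_contractions: true)
    tokenizer.tokenize("I can't go.")
    # => ["i", "cannot", "go", "."]  (curly and straight apostrophes are normalized first)

    cleaner = PragmaticTokenizer::Tokenizer.new(clean: true)
    cleaner.tokenize("_hello_ **world**")
    # split_underscores_asterisk and remove_irrelevant_characters should strip the
    # stray underscores and asterisks, leaving "hello" and "world".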
@@ -223,26 +184,26 @@ module PragmaticTokenizer
       def process_numbers!
         case @numbers
         when :semi
-          @tokens.delete_if { |
+          @tokens.delete_if { |token| token =~ Regex::ONLY_DECIMALS }
         when :none
-          @tokens.delete_if { |
+          @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS_NO_NUMERALS }
         when :only
-          @tokens.delete_if { |
+          @tokens.delete_if { |token| token =~ Regex::NO_DECIMALS }
         end
       end

       def remove_short_tokens!
-        @tokens.delete_if { |
+        @tokens.delete_if { |token| token.length < @minimum_length }
       end

       def process_punctuation!
         case @punctuation
         when :semi
-          @tokens.delete_if { |
+          @tokens.delete_if { |token| token =~ Regex::PUNCTUATION4 }
         when :none
-          @tokens.delete_if { |
+          @tokens.delete_if { |token| token =~ Regex::ONLY_PUNCTUATION }
         when :only
-          @tokens.keep_if
+          @tokens.keep_if   { |token| token =~ Regex::ONLY_PUNCTUATION }
         end
       end

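Each setting maps directly onto one of the Regex constants referenced above. Illustrative calls; the comments describe what the patterns imply, not a verified run:

    text = "See chapter IV, call 911."

    PragmaticTokenizer::Tokenizer.new(numbers: :semi).tokenize(text)  # drops "911" (digits only)
    PragmaticTokenizer::Tokenizer.new(numbers: :none).tokenize(text)  # also drops digit-bearing tokens and Roman numerals such as "iv"
    PragmaticTokenizer::Tokenizer.new(numbers: :only).tokenize(text)  # keeps only tokens that contain digits

    PragmaticTokenizer::Tokenizer.new(punctuation: :only).tokenize("Hi, there!")
    # => [",", "!"]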
@@ -253,45 +214,50 @@ module PragmaticTokenizer
       def mentions!
         case @mentions
         when :remove
-          @tokens.delete_if { |
+          @tokens.delete_if { |token| token =~ Regex::ONLY_MENTION }
         when :keep_and_clean
-          @tokens.map!
+          @tokens.map!      { |token| token =~ Regex::ONLY_MENTION ? token[1..-1] : token }
         end
       end

       def hashtags!
         case @hashtags
         when :remove
-          @tokens.delete_if { |
+          @tokens.delete_if { |token| token =~ Regex::ONLY_HASHTAG }
         when :keep_and_clean
-          @tokens
-                  .flat_map { |t| t =~ REGEXP_HYPHEN_HASTAG ? t.split(REGEX_HYPHEN) : t }
-                  .map { |t| t =~ REGEXP_HASHTAG_AT_START ? t.gsub!(REGEXP_HASHTAG_AT_START, EMPTY_STRING) : t }
+          @tokens.map!      { |token| token =~ Regex::ONLY_HASHTAG ? token[1..-1] : token }
         end
       end

-      def
-        @tokens.delete_if { |
+      def remove_by_options!
+        @tokens.delete_if { |token| token =~ regex_by_options }
       end

-      def
-        @
+      def regex_by_options
+        @regex_by_options ||= begin
           regex_array = []
-          regex_array <<
-          regex_array <<
-          regex_array <<
-          regex_array <<
+          regex_array << Regex::RANGE_UNUSUAL_AND_EMOJI if @remove_emoji
+          regex_array << Regex::ONLY_EMAIL              if @remove_emails
+          regex_array << Regex::STARTS_WITH_HTTP        if @remove_urls
+          regex_array << Regex::ONLY_DOMAIN2            if @remove_domains
           Regexp.union(regex_array)
         end
       end

       def split_long_words!
-        @tokens = @tokens
-
+        @tokens = @tokens.flat_map { |token| split_long_word(token) }
+      end
+
+      def split_long_word(token)
+        return token unless @long_word_split
+        return token if token.length <= @long_word_split
+        return token if token =~ Regex::ONLY_HASHTAG_MENTION
+        return token if token =~ Regex::DOMAIN_OR_EMAIL
+        token.split(Regex::LONG_WORD_SPLIT)
       end

-      def chosen_case(
-        @downcase ? Unicode.downcase(
+      def chosen_case(text)
+        @downcase ? Unicode.downcase(text) : text
       end

       def inverse_case(token)
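Putting the remaining options together. The keys follow the instance variables used above (an assumption, since the option parsing itself is not part of this hunk), and the comments describe the expected effect rather than a verified run:

    tokenizer = PragmaticTokenizer::Tokenizer.new(
      mentions:        :keep_and_clean,  # "@user" should become "user" (leading @ stripped)
      hashtags:        :remove,          # "#promo" should be deleted outright
      remove_urls:     true,             # tokens starting with http:// or https:// are deleted
      remove_emails:   true,             # tokens matching ONLY_EMAIL are deleted
      long_word_split: 10                # longer tokens are split on -, _, /, — and –
    )

    tokenizer.tokenize("@user shared https://example.com #promo super-long-compound-word")
    # expected: ["user", "shared", "super", "long", "compound", "word"]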