pragmatic_tokenizer 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +11 -11
- data/lib/pragmatic_tokenizer/languages/common.rb +2 -1
- data/lib/pragmatic_tokenizer/languages/english.rb +1 -0
- data/lib/pragmatic_tokenizer/post_processor.rb +2 -2
- data/lib/pragmatic_tokenizer/pre_processor.rb +6 -0
- data/lib/pragmatic_tokenizer/tokenizer.rb +10 -3
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +2 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA1:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 8cba2ce060ad1d9ffc74953a9e3a9504b1c8ed13
         | 
| 4 | 
            +
              data.tar.gz: 3d96486358f974ce30165199381b27c3e01f7625
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 8cc83be7dc5d9db9dd03d8895ea0a0d7bb1856b0f7b82698362dfb97f8e6cd32c3cbaccbb320ce04cb193594c765ae4cbbce5cb24634f3fe0aced37298ce75c5
         | 
| 7 | 
            +
              data.tar.gz: f7edb187f8cc2f60aad58d79eac454b556dec53d77aa86a90d55d054d009783be8b4619da22550a79a01ac91540a7b1e3d5ea509e4c482821e32517f976c6208
         | 
    
        data/README.md
    CHANGED
    
    | @@ -85,7 +85,7 @@ options = { | |
| 85 85 | 
             
            <hr>
         | 
| 86 86 |  | 
| 87 87 | 
             
            ##### `remove_stop_words`
         | 
| 88 | 
            -
              **default** = ` | 
| 88 | 
            +
              **default** = `false`
         | 
| 89 89 | 
             
            - `true`  
         | 
| 90 90 | 
             
              Removes all stop words.
         | 
| 91 91 | 
             
            - `false`   
         | 
| @@ -94,7 +94,7 @@ options = { | |
| 94 94 | 
             
            <hr>
         | 
| 95 95 |  | 
| 96 96 | 
             
            ##### `expand_contractions`
         | 
| 97 | 
            -
              **default** = ` | 
| 97 | 
            +
              **default** = `false`
         | 
| 98 98 | 
             
            - `true`  
         | 
| 99 99 | 
             
              Expands contractions (i.e. i'll -> i will).
         | 
| 100 100 | 
             
            - `false`   
         | 
| @@ -135,7 +135,7 @@ options = { | |
| 135 135 | 
             
            <hr>
         | 
| 136 136 |  | 
| 137 137 | 
             
            ##### `remove_emoji`
         | 
| 138 | 
            -
              **default** = ` | 
| 138 | 
            +
              **default** = `false`
         | 
| 139 139 | 
             
            - `true`  
         | 
| 140 140 | 
             
              Removes any token that contains an emoji.
         | 
| 141 141 | 
             
            - `false`   
         | 
| @@ -144,7 +144,7 @@ options = { | |
| 144 144 | 
             
            <hr>
         | 
| 145 145 |  | 
| 146 146 | 
             
            ##### `remove_urls`
         | 
| 147 | 
            -
              **default** = ` | 
| 147 | 
            +
              **default** = `false`
         | 
| 148 148 | 
             
            - `true`  
         | 
| 149 149 | 
             
              Removes any token that contains a URL.
         | 
| 150 150 | 
             
            - `false`   
         | 
| @@ -153,7 +153,7 @@ options = { | |
| 153 153 | 
             
            <hr>
         | 
| 154 154 |  | 
| 155 155 | 
             
            ##### `remove_domains`
         | 
| 156 | 
            -
              **default** = ` | 
| 156 | 
            +
              **default** = `false`
         | 
| 157 157 | 
             
            - `true`  
         | 
| 158 158 | 
             
              Removes any token that contains a domain.
         | 
| 159 159 | 
             
            - `false`   
         | 
| @@ -162,7 +162,7 @@ options = { | |
| 162 162 | 
             
            <hr>
         | 
| 163 163 |  | 
| 164 164 | 
             
            ##### `remove_domains`
         | 
| 165 | 
            -
              **default** = ` | 
| 165 | 
            +
              **default** = `false`
         | 
| 166 166 | 
             
            - `true`  
         | 
| 167 167 | 
             
              Removes any token that contains a domain.
         | 
| 168 168 | 
             
            - `false`   
         | 
| @@ -171,7 +171,7 @@ options = { | |
| 171 171 | 
             
            <hr>
         | 
| 172 172 |  | 
| 173 173 | 
             
            ##### `clean`
         | 
| 174 | 
            -
              **default** = ` | 
| 174 | 
            +
              **default** = `false`
         | 
| 175 175 | 
             
            - `true`  
         | 
| 176 176 | 
             
              Removes tokens consisting of only hyphens, underscores, or periods as well as some special characters (®, ©, ™). Also removes long tokens or tokens with a backslash.
         | 
| 177 177 | 
             
            - `false`   
         | 
| @@ -180,7 +180,7 @@ options = { | |
| 180 180 | 
             
            <hr>
         | 
| 181 181 |  | 
| 182 182 | 
             
            ##### `hashtags`
         | 
| 183 | 
            -
              **default** =  | 
| 183 | 
            +
              **default** = `:keep_original`
         | 
| 184 184 | 
             
            - `:keep_original`  
         | 
| 185 185 | 
             
              Does not alter the token at all.
         | 
| 186 186 | 
             
            - `:keep_and_clean`   
         | 
| @@ -191,7 +191,7 @@ options = { | |
| 191 191 | 
             
            <hr>
         | 
| 192 192 |  | 
| 193 193 | 
             
            ##### `mentions`
         | 
| 194 | 
            -
              **default** =  | 
| 194 | 
            +
              **default** = `:keep_original`
         | 
| 195 195 | 
             
            - `:keep_original`  
         | 
| 196 196 | 
             
              Does not alter the token at all.
         | 
| 197 197 | 
             
            - `:keep_and_clean`   
         | 
| @@ -202,7 +202,7 @@ options = { | |
| 202 202 | 
             
            <hr>
         | 
| 203 203 |  | 
| 204 204 | 
             
            ##### `classic_filter`
         | 
| 205 | 
            -
              **default** = ` | 
| 205 | 
            +
              **default** = `false`
         | 
| 206 206 | 
             
            - `true`  
         | 
| 207 207 | 
             
              Removes dots from acronyms and 's from the end of tokens.
         | 
| 208 208 | 
             
            - `false`   
         | 
| @@ -211,7 +211,7 @@ options = { | |
| 211 211 | 
             
            <hr>
         | 
| 212 212 |  | 
| 213 213 | 
             
            ##### `downcase`
         | 
| 214 | 
            -
              **default** = ` | 
| 214 | 
            +
              **default** = `true`
         | 
| 215 215 |  | 
| 216 216 | 
             
            <hr>
         | 
| 217 217 |  | 
| @@ -2,7 +2,7 @@ module PragmaticTokenizer | |
| 2 2 | 
             
              module Languages
         | 
| 3 3 | 
             
                module Common
         | 
| 4 4 | 
             
                  PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^', '”'].freeze
         | 
| 5 | 
            -
                  PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘" }.freeze
         | 
| 5 | 
            +
                  PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘", "‘" => "⚭" }.freeze
         | 
| 6 6 | 
             
                  SEMI_PUNCTUATION = ['。', '.', '.'].freeze
         | 
| 7 7 | 
             
                  ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix'].freeze
         | 
| 8 8 | 
             
                  SPECIAL_CHARACTERS = ['®', '©', '™'].freeze
         | 
| @@ -18,6 +18,7 @@ module PragmaticTokenizer | |
| 18 18 | 
             
                    def handle_single_quotes(text)
         | 
| 19 19 | 
             
                      # Convert left quotes to special character except for 'Twas or 'twas
         | 
| 20 20 | 
             
                      text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
         | 
| 21 | 
            +
                      text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
         | 
| 21 22 | 
             
                      text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
         | 
| 22 23 | 
             
                      # Separate right single quotes
         | 
| 23 24 | 
             
                      text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
         | 
| @@ -99,6 +99,7 @@ module PragmaticTokenizer | |
| 99 99 | 
             
                    def handle_single_quotes(text)
         | 
| 100 100 | 
             
                      # Convert left quotes to special character except for 'Twas or 'twas
         | 
| 101 101 | 
             
                      text.gsub!(/(\W|^)'(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
         | 
| 102 | 
            +
                      text.gsub!(/(\W|^)‘(?=.*\w)(?!twas)(?!Twas)/o) { $1 ? $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' : ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["‘"] + ' ' } || text
         | 
| 102 103 | 
             
                      text.gsub!(/(\W|^)'(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"]) || text
         | 
| 103 104 | 
             
                      # Separate right single quotes
         | 
| 104 105 | 
             
                      text.gsub!(/(\w|\D)'(?!')(?=\W|$)/o) { $1 + ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP["'"] + ' ' } || text
         | 
| @@ -17,6 +17,7 @@ module PragmaticTokenizer | |
| 17 17 | 
             
                    .flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub(/\?/, '\1 \2').split(' ').flatten : t }
         | 
| 18 18 | 
             
                    .flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
         | 
| 19 19 | 
             
                    .flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub(/\./, '\1 ').split(' ').flatten : t }
         | 
| 20 | 
            +
                    .flat_map { |t| t =~ /\A\:\S{2,}/ ? t.gsub(/\:/, ': ').split(' ').flatten : t }
         | 
| 20 21 | 
             
                    .flat_map { |t| t.include?(".") &&
         | 
| 21 22 | 
             
                      t !~ /(http|https|www)(\.|:)/ &&
         | 
| 22 23 | 
             
                      t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
         | 
| @@ -35,14 +36,13 @@ module PragmaticTokenizer | |
| 35 36 | 
             
                      abbreviations.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) ? t.gsub(/\./, '\1. \2').split(' ').flatten : t }
         | 
| 36 37 | 
             
                    .flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::PREFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
         | 
| 37 38 | 
             
                    .flat_map { |t| t =~ PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX ? t.gsub(PragmaticTokenizer::Languages::Common::POSTFIX_EMOJI_REGEX, '\1 \2').split(' ').flatten : t }
         | 
| 38 | 
            -
                    .flat_map { |t| t =~ /\A(#|#)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
         | 
| 39 39 | 
             
                  ).separate
         | 
| 40 40 | 
             
                end
         | 
| 41 41 |  | 
| 42 42 | 
             
                private
         | 
| 43 43 |  | 
| 44 44 | 
             
                def convert_sym_to_punct(token)
         | 
| 45 | 
            -
                  symbol_matches = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘]/.match(token)
         | 
| 45 | 
            +
                  symbol_matches = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘ ⚭]/.match(token)
         | 
| 46 46 | 
             
                  symbol_matches.nil? ? token : token.gsub!(symbol_matches[0], PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP.key(symbol_matches[0]))
         | 
| 47 47 | 
             
                end
         | 
| 48 48 | 
             
              end
         | 
| @@ -16,6 +16,7 @@ module PragmaticTokenizer | |
| 16 16 | 
             
                  shift_bracket(text)
         | 
| 17 17 | 
             
                  shift_semicolon(text)
         | 
| 18 18 | 
             
                  shift_caret(text)
         | 
| 19 | 
            +
                  shift_hashtag(text)
         | 
| 19 20 | 
             
                  shift_vertical_bar(text)
         | 
| 20 21 | 
             
                  convert_dbl_quotes(text)
         | 
| 21 22 | 
             
                  convert_sgl_quotes(text)
         | 
| @@ -29,6 +30,7 @@ module PragmaticTokenizer | |
| 29 30 | 
             
                def shift_comma(text)
         | 
| 30 31 | 
             
                  # Shift commas off everything but numbers
         | 
| 31 32 | 
             
                  text.gsub!(/,(?!\d)/o, ' , ') || text
         | 
| 33 | 
            +
                  text.gsub!(/(?<=\D),(?=\S+)/, ' , ') || text
         | 
| 32 34 | 
             
                end
         | 
| 33 35 |  | 
| 34 36 | 
             
                def shift_multiple_dash(text)
         | 
| @@ -78,6 +80,10 @@ module PragmaticTokenizer | |
| 78 80 | 
             
                  text.gsub!(/\^/, ' ^ ') || text
         | 
| 79 81 | 
             
                end
         | 
| 80 82 |  | 
| 83 | 
            +
                def shift_hashtag(text)
         | 
| 84 | 
            +
                  text.gsub!(/(?<=\S)(#|#)(?=\S)/, ' \1\2') || text
         | 
| 85 | 
            +
                end
         | 
| 86 | 
            +
             | 
| 81 87 | 
             
                def shift_vertical_bar(text)
         | 
| 82 88 | 
             
                  text.gsub!(/\|/, ' | ') || text
         | 
| 83 89 | 
             
                end
         | 
| @@ -160,11 +160,17 @@ module PragmaticTokenizer | |
| 160 160 | 
             
                    .flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
         | 
| 161 161 | 
             
                    .flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
         | 
| 162 162 | 
             
                    .map { |t| t.gsub(/[[:cntrl:]]/, '') }
         | 
| 163 | 
            +
                    .map { |t| t.gsub(/(?<=\A)\:(?=.+)/, '') }
         | 
| 164 | 
            +
                    .map { |t| t.gsub(/(?<=\A)!+(?=.+)/, '') }
         | 
| 165 | 
            +
                    .map { |t| t.gsub(/1+(?=\z)/, '') }
         | 
| 166 | 
            +
                    .map { |t| t.gsub(/!+(?=\z)/, '') }
         | 
| 167 | 
            +
                    .map { |t| t.gsub(/!+(1*!*)*(?=\z)/, '') }
         | 
| 163 168 | 
             
                    .delete_if { |t| t =~ /\A-+\z/ ||
         | 
| 164 169 | 
             
                    PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
         | 
| 165 170 | 
             
                    t =~ /\A\.{2,}\z/ || t.include?("\\") ||
         | 
| 166 171 | 
             
                    t.length > 50 ||
         | 
| 167 | 
            -
                    (t.length > 1 && t =~ /[&*+<=>^|~]/i)
         | 
| 172 | 
            +
                    (t.length > 1 && t =~ /[&*+<=>^|~]/i) ||
         | 
| 173 | 
            +
                    (t.length == 1 && t =~ /\:/)
         | 
| 168 174 | 
             
                  }
         | 
| 169 175 | 
             
                end
         | 
| 170 176 |  | 
| @@ -211,7 +217,7 @@ module PragmaticTokenizer | |
| 211 217 | 
             
                end
         | 
| 212 218 |  | 
| 213 219 | 
             
                def remove_emails!
         | 
| 214 | 
            -
                  @tokens.delete_if { |t| t =~ /\S+(@|@)\S+/ }.map { |t| t.chomp('.') }
         | 
| 220 | 
            +
                  @tokens.delete_if { |t| t =~ /\S+(@|@)\S+\.\S+/ }.map { |t| t.chomp('.') }
         | 
| 215 221 | 
             
                end
         | 
| 216 222 |  | 
| 217 223 | 
             
                def mentions!
         | 
| @@ -228,6 +234,7 @@ module PragmaticTokenizer | |
| 228 234 | 
             
                  when 'remove'
         | 
| 229 235 | 
             
                    @tokens.delete_if { |t| t =~ /\A(#|#)/ }
         | 
| 230 236 | 
             
                  when 'keep_and_clean'
         | 
| 237 | 
            +
                    @tokens = @tokens.flat_map { |t| t =~ /\A(#|#)\S+-/ ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
         | 
| 231 238 | 
             
                    @tokens.map! { |t| t =~ /\A(#|#)/ ? t.gsub!(/(?<=\A)(#|#)/, '') : t }
         | 
| 232 239 | 
             
                  end
         | 
| 233 240 | 
             
                end
         | 
| @@ -237,7 +244,7 @@ module PragmaticTokenizer | |
| 237 244 | 
             
                end
         | 
| 238 245 |  | 
| 239 246 | 
             
                def remove_domains!
         | 
| 240 | 
            -
                  @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9] | 
| 247 | 
            +
                  @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]{2,}([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
         | 
| 241 248 | 
             
                end
         | 
| 242 249 |  | 
| 243 250 | 
             
                def split_long_words!
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: pragmatic_tokenizer
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 1.0. | 
| 4 | 
            +
              version: 1.0.1
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Kevin S. Dias
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: exe
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2016-01- | 
| 11 | 
            +
            date: 2016-01-20 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: unicode
         |