pragmatic_tokenizer 0.2.4 → 0.3.0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 414905d0126493ffc7224055dd2f79010061662c
+  data.tar.gz: 2b0995cc2b16cef7f7a521a65f90011118fd70f7
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 62772587ff880bd192c504f9f319e1dd9e11c89bac1e37e67ef95d3f42f451fb3a770f5c57eb7152919c301e49e4d5c06a71001b27781e8ca5228bf4ab29c082
+  data.tar.gz: 977cc2c5fd69d0bca8e619860ee78e1161cb509da734ad0290b7dd54401822f399315d7e10606f8f3750a53314167568af6beccd688870bdacc04832c150d606
@@ -1,8 +1,8 @@
 module PragmaticTokenizer
   module Languages
     module Common
-      PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^']
-      PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚" }
+      PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^', '”']
+      PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘" }
       SEMI_PUNCTUATION = ['。', '.', '.']
       ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
       SPECIAL_CHARACTERS = ['®', '©', '™']
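Note: 0.3.0 adds the right double quotation mark ” to PUNCTUATION and pairs it with ⚘ in PUNCTUATION_MAP. The map exists so the tokenizer can mask punctuation with rare symbols while it splits text, then restore it afterwards (see convert_sym_to_punct further down). A minimal round-trip sketch, illustrative only, using just the new entry:

    # Mirrors the "”" => "⚘" pair added above; not the gem's actual code path.
    map = { "”" => "⚘" }
    masked   = "he said ”hi”".gsub("”", map["”"])   # => "he said ⚘hi⚘"
    restored = masked.gsub("⚘", map.invert["⚘"])    # => "he said ”hi”"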
@@ -15,9 +15,8 @@ module PragmaticTokenizer
         shift_colon(text)
         shift_bracket(text)
         shift_semicolon(text)
-        shift_underscore(text)
-        shift_asterisk(text)
-        shift_at_symbol(text)
+        shift_caret(text)
+        shift_vertical_bar(text)
         convert_dbl_quotes(text)
         convert_sgl_quotes(text)
         shift_beginning_hyphen(text)
@@ -35,8 +34,10 @@ module PragmaticTokenizer
       def convert_dbl_quotes(text)
         # Convert left double quotes to special character
         text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+        text.gsub!(/“(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['“'] + ' ') || text
         # Convert remaining quotes to special character
         text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+        text.gsub!(/”/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['”'] + ' ') || text
       end
 
       def convert_sgl_quotes(text)
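With the two added gsub! calls, curly double quotes are now masked the same way straight quotes already were: the opening “ only when a word character follows, the closing ” unconditionally. A rough sketch of the effect, with the map lookups inlined (⚃ and ⚘ are the PUNCTUATION_MAP entries for “ and ”):

    text = 'She said “hi” twice'
    text.gsub!(/“(?=.*\w)/, ' ⚃ ')
    text.gsub!(/”/, ' ⚘ ')
    text  # => "She said  ⚃ hi ⚘  twice" (extra padding disappears at the whitespace split)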
@@ -51,6 +52,10 @@ module PragmaticTokenizer
         text.gsub!(/--+/o, ' - ') || text
       end
 
+      def shift_vertical_bar(text)
+        text.gsub!(/\|/, ' | ') || text
+      end
+
       def shift_comma(text)
         # Shift commas off everything but numbers
         text.gsub!(/,(?!\d)/o, ' , ') || text
@@ -83,34 +88,26 @@ module PragmaticTokenizer
         text.gsub!(/([\(\[\{\}\]\)])/o) { ' ' + $1 + ' ' } || text
       end
 
-      def shift_underscore(text)
-        text.gsub!(/(?<=\s)\_+/, ' \1') || text
-        text.gsub!(/\_+(?=\s)/, ' \1') || text
-        text.gsub!(/(?<=\A)\_+/, '\1 ') || text
-        text.gsub!(/\_+(?=\z)/, ' \1') || text
-      end
-
-      def shift_asterisk(text)
-        text.gsub!(/\*+/, ' \1 ') || text
-      end
-
-      def shift_at_symbol(text)
-        text.gsub!(/(\A|\s)\@/, '\1 ') || text
-      end
-
       def shift_colon(text)
+        puts "Text: #{text}"
         return text unless text.include?(':') &&
-          text.partition(':').last[0] !~ /\A\d+/ &&
-          text.partition(':').first[-1] !~ /\A\d+/
+          text.partition(':').last[0] !~ /\A\d+/ &&
+          text.partition(':').first[-1] !~ /\A\d+/
+        puts "YOYOYO"
         # Ignore web addresses
         text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
         text.gsub!(/:/o, ' :') || text
+        text.gsub!(/(?<=\s):(?=\#)/, ': ') || text
       end
 
       def shift_semicolon(text)
         text.gsub!(/([;])/o) { ' ' + $1 + ' ' } || text
       end
 
+      def shift_caret(text)
+        text.gsub!(/\^/, ' ^ ') || text
+      end
+
       def shift_ellipse(text)
         text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
         text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text
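One caveat worth noting in shift_colon: (?<=[http|https]) is a lookbehind containing a character class, not an alternation, so it matches any single preceding character among h, t, p, s, and |. It still protects URLs in practice because a colon followed by // is normally preceded by p or s. A quick demonstration:

    regex = /(?<=[http|https]):(?=\/\/)/
    "https://x".gsub(regex, '♴')  # => "https♴//x"
    "ftp://x".gsub(regex, '♴')    # => "ftp♴//x" (also matches, via the preceding 'p')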
@@ -167,7 +164,7 @@ module PragmaticTokenizer
       end
 
       def convert_sym_to_punct(token)
-        symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token)
+        symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘]/.match(token)
         if symbol.nil?
           return token
         else
@@ -7,10 +7,10 @@ module PragmaticTokenizer
 
     attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase
     def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true)
-      unless punctuation.eql?('all') ||
-        punctuation.eql?('semi') ||
-        punctuation.eql?('none') ||
-        punctuation.eql?('only')
+      unless punctuation.to_s.eql?('all') ||
+        punctuation.to_s.eql?('semi') ||
+        punctuation.to_s.eql?('none') ||
+        punctuation.to_s.eql?('only')
         raise "Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'"
         # Punctuation 'all': Does not remove any punctuation from the result
 
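The .to_s coercion relaxes the guard so the punctuation option can be given as a symbol or a string. A minimal sketch of the new behavior:

    :none.to_s.eql?('none')   # => true; punctuation: :none no longer raises
    'none'.to_s.eql?('none')  # => true; strings keep working unchanged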
@@ -25,10 +25,10 @@ module PragmaticTokenizer
       # Punctuation 'only': Removes everything except punctuation. The
       # returned result is an array of only the punctuation.
       end
-      @text = CGI.unescapeHTML(text)
-      @language = language
-      @language_module = Languages.get_language_by_code(language)
-      @punctuation = punctuation
+      @text = CGI.unescapeHTML(text.to_s)
+      @language = language.to_s
+      @language_module = Languages.get_language_by_code(language.to_s)
+      @punctuation = punctuation.to_s
       @remove_stop_words = remove_stop_words
       @expand_contractions = expand_contractions
       @clean = clean
@@ -40,7 +40,21 @@ module PragmaticTokenizer
 
     def tokenize
       return [] unless text
-      downcase_tokens(
+      downcase_tokens(
+        cleaner(
+          remove_short_tokens(
+            delete_numbers(
+              delete_roman_numerals(
+                find_contractions(
+                  delete_stop_words(
+                    remove_punctuation(
+                      split_at_middle_period_1(
+                        split_at_middle_period_2(
+                          split_beginning_period(
+                            shift_no_spaces_between_sentences(
+                              split_at_forward_slash(
+                                processor.new(language: language_module).process(text: text)
+      ))))))))))))).reject { |t| t.empty? }
     end
 
     def domains
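The rewritten tokenize threads the processor output through the whole pipeline (period splitting, slashes, punctuation, stop words, contractions, cleaning) before rejecting empty tokens. A hedged usage sketch, relying only on the constructor signature shown in this diff; the sample input is invented and the exact output depends on the gem's language data:

    require 'pragmatic_tokenizer'

    tokenizer = PragmaticTokenizer::Tokenizer.new(
      'Hello ”world”. What a day!',
      language:    'en',
      punctuation: :none   # a symbol is accepted as of this version
    )
    tokenizer.tokenize     # => array of lowercased tokens with punctuation stripped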
@@ -80,6 +94,35 @@ module PragmaticTokenizer
       Processor
     end
 
+    def split_at_middle_period_1(tokens)
+      tokens.flat_map { |t| t.include?(".") &&
+        t !~ /(http|https|www)(\.|:)/ &&
+        t.length > 1 &&
+        t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
+        t !~ /\S+(@|@)\S+/ &&
+        language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) ? t.gsub!(/\./, '\1. \2').split(' ').flatten : t }
+    end
+
+    def split_at_middle_period_2(tokens)
+      tokens.flat_map { |t| t.include?(".") &&
+        t !~ /(http|https|www)(\.|:)/ &&
+        t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
+        t !~ /\.[a-z]{2}/ &&
+        t.length > 2 &&
+        t.count(".") == 1 &&
+        t !~ /\d+/ &&
+        !language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) &&
+        t !~ /\S+(@|@)\S+/ ? t.gsub!(/\./, '\1 . \2').split(' ').flatten : t }
+    end
+
+    def split_beginning_period(tokens)
+      tokens.flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub!(/\./, '\1 ').split(' ').flatten : t }
+    end
+
+    def shift_no_spaces_between_sentences(tokens)
+      tokens.flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub!(/\?/, '\1 \2').split(' ').flatten : t }
+    end
+
     def downcase_tokens(tokens)
      return tokens unless downcase
      tokens.map { |t| Unicode::downcase(t) }
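Note the replacement strings '\1. \2' and '\1 . \2' reference capture groups that the pattern /\./ never defines; Ruby expands unmatched group references in a replacement to empty strings, so these substitutions effectively just pad the period with spaces before the split. A quick demonstration:

    "etc.next".gsub(/\./, '\1 . \2')  # => "etc . next" ('\1' and '\2' expand to "")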
@@ -101,7 +144,13 @@ module PragmaticTokenizer
 
     def cleaner(tokens)
       return tokens unless clean
-      tokens.
+      tokens.flat_map { |t| t =~ /(\A|\s)\@/ ? t.gsub!(/\@/, '\1 ').split(' ').flatten : t }
+        .flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
+        .flat_map { |t| t =~ /\_+(?=\s)/ ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
+        .flat_map { |t| t =~ /(?<=\A)\_+/ ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
+        .flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
+        .flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
+        .delete_if { |t| t =~ /\A-+\z/ ||
         PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
         t =~ /\A\.{2,}\z/ || t.include?("\\") ||
         t.length > 50 ||
@@ -135,14 +184,16 @@ module PragmaticTokenizer
         end
       end
 
+    def split_at_forward_slash(tokens)
+      tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
+    end
+
     def find_contractions(tokens)
       return tokens unless expand_contractions && language_module::CONTRACTIONS
       if downcase
         tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t)) ? language_module::CONTRACTIONS[Unicode::downcase(t)].split(' ').flatten : t }
-          .flat_map { |t| t.include?("/") ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
       else
         tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t)) ? language_module::CONTRACTIONS[Unicode::downcase(t)].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
-          .flat_map { |t| t.include?("/") ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
       end
     end
   end
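With the slash handling moved out to split_at_forward_slash, find_contractions reduces to a lookup against the language module's CONTRACTIONS hash. A simplified sketch of the downcased path; the "don't" entry is assumed for illustration and is not shown in this diff:

    contractions = { "don't" => "do not" }   # stand-in for language_module::CONTRACTIONS
    tokens = ["don't", "stop"]
    tokens.flat_map { |t| contractions.key?(t) ? contractions[t].split(' ') : t }
    # => ["do", "not", "stop"]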
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.2.4
+  version: 0.3.0
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2016-01-
+date: 2016-01-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode