pragmatic_tokenizer 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e86a121879d806b58f855e311c14be249ba6ce95
+  data.tar.gz: 9be42b0a437ddaa0e03630d0fd6eee64f242bc9a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9f8fbf0b2de1674c557144568dc771fadcb882b892318eae04c7aae3f1ec53743f29dd208c971c45707ded50d88304aaa824ca2f5364fc65b96bd0b72d93e0d6
+  data.tar.gz: 39ee1f3e32cd243ef28c4f6b0823aa4cc523ca8a2adbb90cf6feca33fb966ccd452b70166f9ab7d2b64d859165b799c6ba3d45a410a2d5ff7116210170e77d02
```
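The 0.4.2 release shipped checksums.yaml with empty values; 0.5.0 records the SHA1 and SHA512 digests of the two archives inside the .gem package. A minimal sketch for checking a downloaded copy against the SHA512 values above (it assumes the .gem, which is a plain tar archive, has already been unpacked so that metadata.gz and data.tar.gz sit in the current directory):

```ruby
require 'digest'

# Print the SHA512 of each unpacked archive for comparison against
# the values recorded in checksums.yaml. The file paths are an
# assumption for illustration.
%w[metadata.gz data.tar.gz].each do |name|
  puts "#{name}: #{Digest::SHA512.file(name).hexdigest}"
end
```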
data/README.md CHANGED
```diff
@@ -50,6 +50,15 @@ Or install it yourself as:
 
 <hr>
 
+##### `remove_en_stop_words`
+**default** = `'false'`
+- `true`
+  Removes all English stop words (sometimes foreign language strings have English mixed in).
+- `false`
+  Does not remove English stop words.
+
+<hr>
+
 ##### `expand_contractions`
 **default** = `'false'`
 - `true`
```
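The new option plugs straight into the constructor shown later in this diff. A minimal usage sketch (the output comment is illustrative, inferred from the documented behaviour rather than a verified run):

```ruby
require 'pragmatic_tokenizer'

text = "this is the Dutch word gezellig"
PragmaticTokenizer::Tokenizer.new(text, remove_en_stop_words: true).tokenize
# Illustrative: English stop words such as "this", "is", and "the" are
# dropped; the exact result depends on the gem's English stop-word list.
```

The remaining hunks come from the gem's Ruby sources: the English contractions map, the trailing-punctuation splitter, and the `Tokenizer` class itself.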
```diff
@@ -26,6 +26,7 @@ module PragmaticTokenizer
       "'tis" => "it is",
       "it'll" => "it will",
       "it'd" => "it would",
+      "let's" => "let us",
       "we're" => "we are",
       "we'll" => "we will",
       "we'd" => "we would",
@@ -34,6 +35,11 @@ module PragmaticTokenizer
       "they'll" => "they will",
       "they'd" => "they would",
       "they've" => "they have",
+      "there'd" => "there would",
+      "there'll" => "there will",
+      "there're" => "there are",
+      "there's" => "there has",
+      "there've" => "there have",
       "that's" => "that is",
       "that'll" => "that will",
       "that'd" => "that would",
```
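Both hunks extend the English contractions map used by the existing `expand_contractions` option (note that `there's` is mapped to `there has`, not `there is`). A small sketch of the effect (the expected tokens are inferred from the mapping above, not a verified run):

```ruby
require 'pragmatic_tokenizer'

PragmaticTokenizer::Tokenizer.new("Let's hope there'll be time",
  expand_contractions: true
).tokenize
# Inferred from the mapping (with default downcasing):
# ["let", "us", "hope", "there", "will", "be", "time"]
```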
```diff
@@ -24,7 +24,7 @@ module PragmaticTokenizer
       tokens = separate_full_stop(text.squeeze(' ')
         .split
         .flat_map { |t| (t[0] == '‚' || t[0] == ',') && t.length > 1 ? t.split(/(,|‚)/).flatten : t }
-        .flat_map { |t| (t[-1] == '’' || t[-1] == "'") && t.length > 1 ? t.split(/(’|')/).flatten : t }
+        .flat_map { |t| (t[-1] == '’' || t[-1] == "'" || t[-1] == '‘' || t[-1] == '`') && t.length > 1 ? t.split(/(’|'|‘|`)/).flatten : t }
         .map { |t| convert_sym_to_punct(t) })
       separate_other_ending_punc(tokens)
     end
```
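This widens the trailing-quote check: previously only `’` and `'` were split off the end of a token; now `‘` and `` ` `` are handled as well, while the `t.length > 1` guard still leaves a bare quote token intact. A standalone reproduction of the same `flat_map` step, with a hypothetical token list:

```ruby
# Mimics the new trailing-quote split outside the gem (illustrative input).
tokens = ["word’", "word‘", "word`", "don't"]
split = tokens.flat_map do |t|
  if (t[-1] == '’' || t[-1] == "'" || t[-1] == '‘' || t[-1] == '`') && t.length > 1
    # split with a capture group keeps the quote as its own token
    t.split(/(’|'|‘|`)/).flatten
  else
    t
  end
end
# => ["word", "’", "word", "‘", "word", "`", "don't"]
# "don't" is untouched: only token-final quotes are split off.
```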
```diff
@@ -5,8 +5,8 @@ require 'unicode'
 module PragmaticTokenizer
   class Tokenizer
 
-    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase
-    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true)
+    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase, :remove_en_stop_words
+    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true, remove_en_stop_words: false)
       unless punctuation.to_s.eql?('all') ||
              punctuation.to_s.eql?('semi') ||
              punctuation.to_s.eql?('none') ||
@@ -37,13 +37,15 @@ module PragmaticTokenizer
       @minimum_length = minimum_length
       @remove_roman_numerals = remove_roman_numerals
       @downcase = downcase
+      @remove_en_stop_words = remove_en_stop_words
     end
 
     def tokenize
       return [] unless text
       tokens = []
       text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
-        tokens << delete_stop_words(
+        tokens << delete_en_stop_words(
+          delete_stop_words(
           downcase_tokens(
           cleaner(
           remove_short_tokens(
@@ -58,7 +60,7 @@ module PragmaticTokenizer
           shift_no_spaces_between_sentences(
           split_at_forward_slash(
           processor.new(language: language_module).process(text: segment)
-          )))))))))))))).reject { |t| t.empty? }
+          ))))))))))))))).reject { |t| t.empty? }
         end
         tokens.flatten
       end
```
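Two things change in `tokenize`: the new `delete_en_stop_words` call wraps the whole pipeline (hence the extra closing parenthesis), and it sits outside the existing language-specific `delete_stop_words`. Because the calls nest, the innermost step runs first and English stop-word removal is applied last, after cleaning and downcasing. A toy illustration of that ordering, with stand-in lambdas rather than the gem's private methods:

```ruby
# Toy stand-ins (not the gem's code) showing that nested calls run
# innermost-first, so the new outermost wrapper is the *last* step.
downcase_tokens      = ->(ts) { ts.map(&:downcase) }
delete_stop_words    = ->(ts) { ts }               # language-specific, no-op here
delete_en_stop_words = ->(ts) { ts - %w[the is] }  # stand-in stop-word list

delete_en_stop_words.(delete_stop_words.(downcase_tokens.(%w[The Sky IS blue])))
# => ["sky", "blue"]  — downcasing happens before stop words are removed
```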
```diff
@@ -190,6 +192,15 @@ module PragmaticTokenizer
       end
     end
 
+    def delete_en_stop_words(tokens)
+      return tokens unless remove_en_stop_words
+      if downcase
+        tokens.map { |t| Unicode::downcase(t) } - PragmaticTokenizer::Languages::English::STOP_WORDS
+      else
+        tokens.delete_if { |t| PragmaticTokenizer::Languages::English::STOP_WORDS.include?(Unicode::downcase(t)) }
+      end
+    end
+
     def split_at_forward_slash(tokens)
       tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
     end
```
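The new method picks a strategy based on the `downcase` option: when downcasing is on, it can lowercase everything and use plain array difference; when the original case must be preserved, it lowercases only for the membership test and removes matches with `delete_if`. A self-contained sketch of the same two strategies, using `String#downcase` in place of the gem's `Unicode::downcase` and a stand-in stop-word list:

```ruby
STOP = %w[the is].freeze  # stand-in for Languages::English::STOP_WORDS

def drop_stops(tokens, downcase:)
  if downcase
    # Lowercase first, then subtract: simple, but returns lowercased tokens.
    tokens.map(&:downcase) - STOP
  else
    # Preserve original case; lowercase only for the membership test
    # (a non-mutating reject, where the gem uses delete_if in place).
    tokens.reject { |t| STOP.include?(t.downcase) }
  end
end

drop_stops(%w[The Sky IS Blue], downcase: true)   # => ["sky", "blue"]
drop_stops(%w[The Sky IS Blue], downcase: false)  # => ["Sky", "Blue"]
```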