pragmatic_tokenizer 0.4.2 → 0.5.0

This diff shows the content changes between publicly released versions of the package, as they appear in their public registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e7448bf47a86bd3d1d0ce9c9de1d67b66cd990cf
-  data.tar.gz: a573e899144787db7f7695b53155c703bff88ddf
+  metadata.gz: e86a121879d806b58f855e311c14be249ba6ce95
+  data.tar.gz: 9be42b0a437ddaa0e03630d0fd6eee64f242bc9a
 SHA512:
-  metadata.gz: 1fec32cc52fbefbae153f88ab6ed74a0dc6efefa5f5163f4577cf4fe582fc5be7a63e8ca6074ceca24ee6945cd1f55cd24b6885c38d7b7e928746ab6f55826b5
-  data.tar.gz: 60635210d0d25af99fd5651de520200706675b669328c9cd5443a8a938cfd49ac3aec56a6ad61aa2b9eaceb97f7b51026e495d3245ee7ad0d54a8a6ebfb71395
+  metadata.gz: 9f8fbf0b2de1674c557144568dc771fadcb882b892318eae04c7aae3f1ec53743f29dd208c971c45707ded50d88304aaa824ca2f5364fc65b96bd0b72d93e0d6
+  data.tar.gz: 39ee1f3e32cd243ef28c4f6b0823aa4cc523ca8a2adbb90cf6feca33fb966ccd452b70166f9ab7d2b64d859165b799c6ba3d45a410a2d5ff7116210170e77d02
data/README.md CHANGED
@@ -50,6 +50,15 @@ Or install it yourself as:
 
 <hr>
 
+##### `remove_en_stop_words`
+**default** = `'false'`
+- `true`
+  Removes all English stop words (sometimes foreign language strings have English mixed in).
+- `false`
+  Does not remove English stop words.
+
+<hr>
+
 ##### `expand_contractions`
 **default** = `'false'`
 - `true`
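As a usage sketch (hedged: the constructor signature is taken from the tokenizer changes further down in this diff), the new option is passed like any other keyword argument:

```ruby
require 'pragmatic_tokenizer'

# Hypothetical input: a Spanish string with English filler mixed in.
text = "Hola, esto es una prueba of the new option"
PragmaticTokenizer::Tokenizer.new(text, language: 'es', remove_en_stop_words: true).tokenize
# English stop words such as "of" and "the" should be absent from the result;
# the remaining tokens depend on the other (default) options.
```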
@@ -26,6 +26,7 @@ module PragmaticTokenizer
         "'tis" => "it is",
         "it'll" => "it will",
         "it'd" => "it would",
+        "let's" => "let us",
         "we're" => "we are",
         "we'll" => "we will",
         "we'd" => "we would",
@@ -34,6 +35,11 @@ module PragmaticTokenizer
         "they'll" => "they will",
         "they'd" => "they would",
         "they've" => "they have",
+        "there'd" => "there would",
+        "there'll" => "there will",
+        "there're" => "there are",
+        "there's" => "there has",
+        "there've" => "there have",
         "that's" => "that is",
         "that'll" => "that will",
         "that'd" => "that would",
@@ -24,7 +24,7 @@ module PragmaticTokenizer
       tokens = separate_full_stop(text.squeeze(' ')
                 .split
                 .flat_map { |t| (t[0] == '‚' || t[0] == ',') && t.length > 1 ? t.split(/(,|‚)/).flatten : t }
-                .flat_map { |t| (t[-1] == '’' || t[-1] == "'") && t.length > 1 ? t.split(/(’|')/).flatten : t }
+                .flat_map { |t| (t[-1] == '’' || t[-1] == "'" || t[-1] == '‘' || t[-1] == '`') && t.length > 1 ? t.split(/(’|'|‘|`)/).flatten : t }
                 .map { |t| convert_sym_to_punct(t) })
       separate_other_ending_punc(tokens)
     end
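The widened condition means a token ending in a left single quotation mark (‘) or a backtick (`` ` ``) is now split apart the same way as one ending in an apostrophe. The mechanism is plain `String#split` with a capture group, which keeps the delimiter as its own element:

```ruby
"rock‘".split(/(’|'|‘|`)/) # => ["rock", "‘"]
"rock'".split(/(’|'|‘|`)/) # => ["rock", "'"]
```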
@@ -5,8 +5,8 @@ require 'unicode'
 module PragmaticTokenizer
   class Tokenizer
 
-    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase
-    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true)
+    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase, :remove_en_stop_words
+    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true, remove_en_stop_words: false)
       unless punctuation.to_s.eql?('all') ||
           punctuation.to_s.eql?('semi') ||
           punctuation.to_s.eql?('none') ||
@@ -37,13 +37,15 @@ module PragmaticTokenizer
       @minimum_length = minimum_length
       @remove_roman_numerals = remove_roman_numerals
       @downcase = downcase
+      @remove_en_stop_words = remove_en_stop_words
     end
 
     def tokenize
       return [] unless text
       tokens = []
       text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
-        tokens << delete_stop_words(
+        tokens << delete_en_stop_words(
+          delete_stop_words(
           downcase_tokens(
           cleaner(
           remove_short_tokens(
@@ -58,7 +60,7 @@ module PragmaticTokenizer
           shift_no_spaces_between_sentences(
           split_at_forward_slash(
           processor.new(language: language_module).process(text: segment)
-          )))))))))))))).reject { |t| t.empty? }
+          ))))))))))))))).reject { |t| t.empty? }
       end
       tokens.flatten
     end
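The new `delete_en_stop_words` call wraps the entire existing chain (hence the one extra closing parenthesis), so English stop-word removal runs last and can be combined with the language-specific `remove_stop_words` filter. A hedged sketch with hypothetical input:

```ruby
require 'pragmatic_tokenizer'

PragmaticTokenizer::Tokenizer.new(
  "Das ist ein Test with some English filler",
  language:             'de',
  remove_stop_words:    true,  # drops German stop words
  remove_en_stop_words: true   # additionally drops English stop words
).tokenize
```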
@@ -190,6 +192,15 @@ module PragmaticTokenizer
       end
     end
 
+    def delete_en_stop_words(tokens)
+      return tokens unless remove_en_stop_words
+      if downcase
+        tokens.map { |t| Unicode::downcase(t) } - PragmaticTokenizer::Languages::English::STOP_WORDS
+      else
+        tokens.delete_if { |t| PragmaticTokenizer::Languages::English::STOP_WORDS.include?(Unicode::downcase(t)) }
+      end
+    end
+
     def split_at_forward_slash(tokens)
       tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
     end
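Note the two code paths: with the default `downcase: true` the method lowercases every token and removes stop words by array subtraction, while with `downcase: false` it preserves casing and filters case-insensitively via `Unicode::downcase`. A hedged illustration:

```ruby
require 'pragmatic_tokenizer'

# Assuming "the" is in PragmaticTokenizer::Languages::English::STOP_WORDS:
PragmaticTokenizer::Tokenizer.new("The Cat", remove_en_stop_words: true).tokenize
# => expected ["cat"]   (tokens lowercased, "the" subtracted)

PragmaticTokenizer::Tokenizer.new("The Cat", downcase: false, remove_en_stop_words: true).tokenize
# => expected ["Cat"]   ("The" filtered case-insensitively, casing preserved)
```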
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "0.4.2"
+  VERSION = "0.5.0"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.4.2
+  version: 0.5.0
 platform: ruby
 authors:
 - Kevin S. Dias