pragmatic_tokenizer 0.4.2 → 0.5.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: e7448bf47a86bd3d1d0ce9c9de1d67b66cd990cf
-  data.tar.gz: a573e899144787db7f7695b53155c703bff88ddf
+  metadata.gz: e86a121879d806b58f855e311c14be249ba6ce95
+  data.tar.gz: 9be42b0a437ddaa0e03630d0fd6eee64f242bc9a
 SHA512:
-  metadata.gz: 1fec32cc52fbefbae153f88ab6ed74a0dc6efefa5f5163f4577cf4fe582fc5be7a63e8ca6074ceca24ee6945cd1f55cd24b6885c38d7b7e928746ab6f55826b5
-  data.tar.gz: 60635210d0d25af99fd5651de520200706675b669328c9cd5443a8a938cfd49ac3aec56a6ad61aa2b9eaceb97f7b51026e495d3245ee7ad0d54a8a6ebfb71395
+  metadata.gz: 9f8fbf0b2de1674c557144568dc771fadcb882b892318eae04c7aae3f1ec53743f29dd208c971c45707ded50d88304aaa824ca2f5364fc65b96bd0b72d93e0d6
+  data.tar.gz: 39ee1f3e32cd243ef28c4f6b0823aa4cc523ca8a2adbb90cf6feca33fb966ccd452b70166f9ab7d2b64d859165b799c6ba3d45a410a2d5ff7116210170e77d02
data/README.md CHANGED
@@ -50,6 +50,15 @@ Or install it yourself as:
 
 <hr>
 
+##### `remove_en_stop_words`
+**default** = `'false'`
+- `true`
+  Removes all English stop words (sometimes foreign language strings have English mixed in).
+- `false`
+  Does not remove English stop words.
+
+<hr>
+
 ##### `expand_contractions`
 **default** = `'false'`
 - `true`
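
The new option is passed to the constructor like any other, per the signature added to `Tokenizer#initialize` further down in this diff. A minimal usage sketch (the `language: 'es'` choice assumes a Spanish language module is available, and the output comment is illustrative, not a verified fixture):

```ruby
require 'pragmatic_tokenizer'

# remove_en_stop_words drops English stop words even when the tokenizer
# is configured for another language, which helps with foreign-language
# strings that have English filler words mixed in.
tokens = PragmaticTokenizer::Tokenizer.new(
  "Hola señor, this is a mixed string.",
  language:             'es',
  punctuation:          'none',
  remove_en_stop_words: true
).tokenize
# English stop words such as "this", "is", and "a" are removed;
# roughly: ["hola", "señor", "mixed", "string"]
```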
@@ -26,6 +26,7 @@ module PragmaticTokenizer
       "'tis" => "it is",
       "it'll" => "it will",
       "it'd" => "it would",
+      "let's" => "let us",
       "we're" => "we are",
       "we'll" => "we will",
       "we'd" => "we would",
@@ -34,6 +35,11 @@ module PragmaticTokenizer
       "they'll" => "they will",
       "they'd" => "they would",
       "they've" => "they have",
+      "there'd" => "there would",
+      "there'll" => "there will",
+      "there're" => "there are",
+      "there's" => "there has",
+      "there've" => "there have",
       "that's" => "that is",
       "that'll" => "that will",
       "that'd" => "that would",
@@ -24,7 +24,7 @@ module PragmaticTokenizer
       tokens = separate_full_stop(text.squeeze(' ')
         .split
         .flat_map { |t| (t[0] == '‚' || t[0] == ',') && t.length > 1 ? t.split(/(,|‚)/).flatten : t }
-        .flat_map { |t| (t[-1] == '’' || t[-1] == "'") && t.length > 1 ? t.split(/(’|')/).flatten : t }
+        .flat_map { |t| (t[-1] == '’' || t[-1] == "'" || t[-1] == '‘' || t[-1] == '`') && t.length > 1 ? t.split(/(’|'|‘|`)/).flatten : t }
         .map { |t| convert_sym_to_punct(t) })
       separate_other_ending_punc(tokens)
     end
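
The widened condition means a trailing left single quote (`‘`) or backtick is now split off a token, exactly as a trailing apostrophe already was. The step in isolation (a self-contained sketch of just this `flat_map`, with made-up input tokens):

```ruby
# Trailing ‘ and ` now get split off the token, in addition to the
# ’ and ' handled before. Split with a capture group keeps the quote
# character as its own element.
["rock`", "roll’", "plain"].flat_map do |t|
  if (t[-1] == '’' || t[-1] == "'" || t[-1] == '‘' || t[-1] == '`') && t.length > 1
    t.split(/(’|'|‘|`)/).flatten
  else
    t
  end
end
# => ["rock", "`", "roll", "’", "plain"]
```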
@@ -5,8 +5,8 @@ require 'unicode'
 module PragmaticTokenizer
   class Tokenizer
 
-    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase
-    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true)
+    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase, :remove_en_stop_words
+    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true, remove_en_stop_words: false)
       unless punctuation.to_s.eql?('all') ||
         punctuation.to_s.eql?('semi') ||
         punctuation.to_s.eql?('none') ||
@@ -37,13 +37,15 @@ module PragmaticTokenizer
       @minimum_length = minimum_length
       @remove_roman_numerals = remove_roman_numerals
       @downcase = downcase
+      @remove_en_stop_words = remove_en_stop_words
     end
 
     def tokenize
       return [] unless text
       tokens = []
       text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
-        tokens << delete_stop_words(
+        tokens << delete_en_stop_words(
+          delete_stop_words(
           downcase_tokens(
           cleaner(
           remove_short_tokens(
@@ -58,7 +60,7 @@ module PragmaticTokenizer
           shift_no_spaces_between_sentences(
           split_at_forward_slash(
           processor.new(language: language_module).process(text: segment)
-          )))))))))))))).reject { |t| t.empty? }
+          ))))))))))))))).reject { |t| t.empty? }
       end
       tokens.flatten
     end
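
The only change in this second hunk is one extra closing parenthesis: wrapping the chain in `delete_en_stop_words(` adds one level of nesting, so the run of `)` grows from fourteen to fifteen. The same composition could also be expressed iteratively (a self-contained sketch with stand-in stages, not the gem's code):

```ruby
# Each lambda plays the role of one tokenizer stage; reducing over the
# list applies them in order without any nested parentheses.
stages = [
  ->(ts) { ts.map(&:downcase) },  # stand-in for downcase_tokens
  ->(ts) { ts - %w[the is a] },   # stand-in for delete_en_stop_words
  ->(ts) { ts.reject(&:empty?) }  # the final .reject { |t| t.empty? }
]
stages.reduce(["The", "Cat", "", "is", "Here"]) { |acc, stage| stage.call(acc) }
# => ["cat", "here"]
```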
@@ -190,6 +192,15 @@ module PragmaticTokenizer
       end
     end
 
+    def delete_en_stop_words(tokens)
+      return tokens unless remove_en_stop_words
+      if downcase
+        tokens.map { |t| Unicode::downcase(t) } - PragmaticTokenizer::Languages::English::STOP_WORDS
+      else
+        tokens.delete_if { |t| PragmaticTokenizer::Languages::English::STOP_WORDS.include?(Unicode::downcase(t)) }
+      end
+    end
+
     def split_at_forward_slash(tokens)
       tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
     end
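
The helper's two branches differ only in whether the original casing survives: with `downcase` on, tokens are lowercased and stop words removed by array difference; with it off, casing is kept and membership is checked case-insensitively. A self-contained sketch of the same logic (`STOP_WORDS` here is a stand-in, not the gem's list, and plain `String#downcase` stands in for `Unicode::downcase`):

```ruby
STOP_WORDS = %w[the is a].freeze  # stand-in for Languages::English::STOP_WORDS

def delete_en_stop_words(tokens, downcase: true)
  if downcase
    # Lowercase everything, then strip stop words via array difference.
    tokens.map(&:downcase) - STOP_WORDS
  else
    # Keep original casing; compare case-insensitively. (reject is used
    # here instead of the gem's delete_if so the input is not mutated.)
    tokens.reject { |t| STOP_WORDS.include?(t.downcase) }
  end
end

delete_en_stop_words(%w[The Cat is Here])                  # => ["cat", "here"]
delete_en_stop_words(%w[The Cat is Here], downcase: false) # => ["Cat", "Here"]
```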
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "0.4.2"
+  VERSION = "0.5.0"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.4.2
+  version: 0.5.0
 platform: ruby
 authors:
 - Kevin S. Dias