pragmatic_tokenizer 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 802aa9ef0922e27bb0838c65d661e0918b3f48c7
-  data.tar.gz: e4ecd8525874a49fdfbe37494d48aec07fb285ab
+  metadata.gz: 77bf20ef9b491ecf72bd541712fda8338e4f318e
+  data.tar.gz: 6a8c8464d224532a05f80832142ef9a6b0b1c26b
 SHA512:
-  metadata.gz: 7999dd167559f7d49c2707dcf6d963ee63269bbb8d118236c59d9d67b1fb009ef54b1b8db872013f1c4ba73b9ddfae2482bff1b49d2227d197b73036695049bc
-  data.tar.gz: 7872a5e27d7aeb1332f4a00bffc5e3ea66fb09ece0b34419cc8f10f47c4b504928ded18eedcde9fa022bab9597e2ca8b3ba4ca28c89574425174a8f316d694fc
+  metadata.gz: 93ad61db52cb42f5ec17cfec971bb1505e7b8d2d9e2c40713be01f46fc0eebc2c8de302ad36c90bc9864696ba4b895050f9041d2bd0d2622b8b4f5d9d94118f5
+  data.tar.gz: f6ef9861babfc3e28ca90624e396f821cbeaa09da189f3ecec60a0724b15538d39afe920b1808cbb6a1f527044fe761745947af551d321cab79bdfe11b418f80
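The checksum changes above are the expected result of cutting a new release. For anyone who wants to verify a downloaded copy, a minimal sketch (file names assume the `.gem` archive has been unpacked locally, e.g. with `tar xf pragmatic_tokenizer-0.1.4.gem`; `Digest` is from Ruby's standard library):

```ruby
require 'digest'

# Recompute the digests that checksums.yaml records for the gem's
# two internal archives and compare them against the values above.
%w[metadata.gz data.tar.gz].each do |file|
  puts "#{file} SHA1:   #{Digest::SHA1.file(file).hexdigest}"
  puts "#{file} SHA512: #{Digest::SHA512.file(file).hexdigest}"
end
```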
data/README.md CHANGED
@@ -61,7 +61,7 @@ Or install it yourself as:
 ##### `clean`
 **default** = `'false'`
 - `true`
-  Removes tokens consisting of only hypens or underscores.
+  Removes tokens consisting of only hyphens or underscores, as well as some special characters (®, ©, ™).
 - `false`
   Leaves tokens as is.
 
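To make the expanded `clean` behavior concrete, a short sketch (the output is inferred from the description above, assuming each symbol is whitespace-separated into its own token; it is not taken from the gem's test suite):

```ruby
require 'pragmatic_tokenizer'

# Standalone ®, © and ™ tokens are now dropped, in addition to
# the hyphen-only and underscore-only tokens removed in 0.1.3.
text = "clean ™ this - up ©"
PragmaticTokenizer::Tokenizer.new(text, clean: true).tokenize
# expected: ["clean", "this", "up"]
```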
@@ -70,10 +70,16 @@ Or install it yourself as:
 ##### `remove_numbers`
 **default** = `'false'`
 - `true`
-  Removes any token that contains a number.
+  Removes any token that contains a number or Roman numeral.
 - `false`
   Leaves tokens as is.
 
+<hr>
+
+##### `minimum_length`
+**default** = `0`
+The minimum number of characters a token must have; any shorter token is removed.
+
 **Example Usage**
 ```ruby
 text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
@@ -110,6 +116,10 @@ PragmaticTokenizer::Tokenizer.new(text, remove_numbers: true).tokenize
 text = "Hello ______ ."
 PragmaticTokenizer::Tokenizer.new(text, clean: true).tokenize
 # => ["hello", "."]
+
+text = "Let's test the minimum length."
+PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
+# => ["minimum", "length"]
 ```
 
 ## Development
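Taken together, the two new behaviors documented above can be exercised like this (a sketch; the expected outputs are inferred from the implementation shown in the hunks below, not copied from the gem's tests):

```ruby
require 'pragmatic_tokenizer'

text = "Chapter xiv covers 3 topics."

# Roman numerals now count as numbers for remove_numbers: true.
PragmaticTokenizer::Tokenizer.new(text, remove_numbers: true).tokenize
# expected: ["chapter", "covers", "topics", "."]

# minimum_length drops every token shorter than the given length.
PragmaticTokenizer::Tokenizer.new(text, minimum_length: 7).tokenize
# expected: ["chapter"]
```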
@@ -4,6 +4,11 @@ module PragmaticTokenizer
       PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»']
       PUNCTUATION_MAP = ['♳', '♴', '♵', '♶', '♷', '♸', '♹', '♺', '⚀', '⚁', '⚂', '⚃', '⚄', '⚅', '☇', '☈', '☉', '☊', '☋', '☌', '☍', '☠', '☢', '☣', '☤', '☥', '☦', '☧', '☀', '☁', '☂', '☃', '☄', "☮", '♔', '♕', '♖', '♗', '♘', '♙', '♚']
       SEMI_PUNCTUATION = ['。', '.', '.']
+      ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
+      SPECIAL_CHARACTERS = ['®', '©', '™']
+      ABBREVIATIONS = []
+      STOP_WORDS = []
+      CONTRACTIONS = {}
     end
   end
 end
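The hand-written `ROMAN_NUMERALS` list enumerates the lowercase numerals for 1 through 99 (`'i'` to `'xcix'`), which keeps the membership test in the tokenizer a simple `include?`. As a cross-check, the same list can be generated programmatically (illustrative only, not part of the gem):

```ruby
# Subtractive-notation values sufficient for 1..99, in descending order.
ROMAN_VALUES = { 90 => 'xc', 50 => 'l', 40 => 'xl', 10 => 'x',
                 9 => 'ix', 5 => 'v', 4 => 'iv', 1 => 'i' }.freeze

def to_roman(n)
  ROMAN_VALUES.each_with_object(+'') do |(value, glyph), out|
    while n >= value
      out << glyph
      n -= value
    end
  end
end

numerals = (1..99).map { |n| to_roman(n) }
numerals.first(5) # => ["i", "ii", "iii", "iv", "v"]
numerals.last     # => "xcix"
```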
@@ -4,8 +4,8 @@ require 'pragmatic_tokenizer/languages'
 module PragmaticTokenizer
   class Tokenizer
 
-    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers
-    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false)
+    attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length
+    def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0)
       unless punctuation.eql?('all') ||
              punctuation.eql?('semi') ||
              punctuation.eql?('none') ||
@@ -32,11 +32,12 @@ module PragmaticTokenizer
       @expand_contractions = expand_contractions
       @clean = clean
       @remove_numbers = remove_numbers
+      @minimum_length = minimum_length
     end
 
     def tokenize
       return [] unless text
-      delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text))))))
+      remove_short_tokens(delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text)))))))
     end
 
     private
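Because the pipeline is written as nested calls, it executes inside-out: text processing first, then punctuation removal, contraction expansion, stop-word deletion, cleaning, number deletion, and finally the new short-token filter. An equivalent flattened view (an illustrative rewrite using `Object#then`, Ruby 2.6+; not the gem's actual code) makes that order easier to read:

```ruby
# Each step receives the previous step's token array;
# remove_short_tokens therefore runs last.
def tokenize
  return [] unless text
  processor.new(language: language_module)
           .process(text: text)
           .then { |tokens| remove_punctuation(tokens) }
           .then { |tokens| find_contractions(tokens) }
           .then { |tokens| delete_stop_words(tokens) }
           .then { |tokens| cleaner(tokens) }
           .then { |tokens| delete_numbers(tokens) }
           .then { |tokens| remove_short_tokens(tokens) }
end
```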
@@ -47,14 +48,18 @@ module PragmaticTokenizer
       Processor
     end
 
+    def remove_short_tokens(tokens)
+      tokens.delete_if { |t| t.length < minimum_length }
+    end
+
     def delete_numbers(tokens)
       return tokens unless remove_numbers
-      tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
+      tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") }
     end
 
     def cleaner(tokens)
       return tokens unless clean
-      tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ }
+      tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) }
     end
 
     def remove_punctuation(tokens)
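Note that `remove_short_tokens` runs unconditionally, but with the default `minimum_length` of `0` no token satisfies `t.length < 0`, so it is a no-op unless the caller opts in. The two extended filter predicates can also be checked in isolation with a quick standalone sketch (constants abbreviated and the token list hypothetical; `delete_if` mutates and returns the array):

```ruby
ROMAN_NUMERALS     = ['i', 'ii', 'iii', 'iv'] # abbreviated for the sketch
SPECIAL_CHARACTERS = ['®', '©', '™']

tokens = ['iv', 'r2d2', '™', '--', 'ok']

# delete_numbers predicate: digit-bearing tokens and Roman numerals go.
tokens.delete_if { |t| t =~ /\D*\d+\d*/ || ROMAN_NUMERALS.include?(t) }
# => ["™", "--", "ok"]

# cleaner predicate: underscore-only, hyphen-only and special tokens go.
tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || SPECIAL_CHARACTERS.include?(t) }
# => ["ok"]
```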
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "0.1.3"
+  VERSION = "0.1.4"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - Kevin S. Dias