pragmatic_tokenizer 0.1.3 → 0.1.4

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 802aa9ef0922e27bb0838c65d661e0918b3f48c7
-   data.tar.gz: e4ecd8525874a49fdfbe37494d48aec07fb285ab
+   metadata.gz: 77bf20ef9b491ecf72bd541712fda8338e4f318e
+   data.tar.gz: 6a8c8464d224532a05f80832142ef9a6b0b1c26b
  SHA512:
-   metadata.gz: 7999dd167559f7d49c2707dcf6d963ee63269bbb8d118236c59d9d67b1fb009ef54b1b8db872013f1c4ba73b9ddfae2482bff1b49d2227d197b73036695049bc
-   data.tar.gz: 7872a5e27d7aeb1332f4a00bffc5e3ea66fb09ece0b34419cc8f10f47c4b504928ded18eedcde9fa022bab9597e2ca8b3ba4ca28c89574425174a8f316d694fc
+   metadata.gz: 93ad61db52cb42f5ec17cfec971bb1505e7b8d2d9e2c40713be01f46fc0eebc2c8de302ad36c90bc9864696ba4b895050f9041d2bd0d2622b8b4f5d9d94118f5
+   data.tar.gz: f6ef9861babfc3e28ca90624e396f821cbeaa09da189f3ecec60a0724b15538d39afe920b1808cbb6a1f527044fe761745947af551d321cab79bdfe11b418f80
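These checksums cover the `metadata.gz` and `data.tar.gz` archives packaged inside the `.gem` file. A minimal sketch of recomputing them with Ruby's standard `digest` library, assuming the gem has already been unpacked (e.g. with `tar -xf pragmatic_tokenizer-0.1.4.gem`) into the current directory:

```ruby
require 'digest'

# Recompute the SHA1 and SHA512 digests listed in checksums.yaml.
# Assumes metadata.gz and data.tar.gz sit in the current directory.
%w[metadata.gz data.tar.gz].each do |file|
  puts "#{file} SHA1:   #{Digest::SHA1.file(file).hexdigest}"
  puts "#{file} SHA512: #{Digest::SHA512.file(file).hexdigest}"
end
```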
data/README.md CHANGED
@@ -61,7 +61,7 @@ Or install it yourself as:
  ##### `clean`
  **default** = `'false'`
  - `true`
-   Removes tokens consisting of only hyphens or underscores.
+   Removes tokens consisting of only hyphens or underscores, as well as some special characters (®, ©, ™).
  - `false`
    Leaves tokens as is.

@@ -70,10 +70,16 @@ Or install it yourself as:
  ##### `remove_numbers`
  **default** = `'false'`
  - `true`
-   Removes any token that contains a number.
+   Removes any token that contains a number or Roman numeral.
  - `false`
    Leaves tokens as is.

+ <hr>
+
+ ##### `minimum_length`
+ **default** = `0`
+ The minimum number of characters a token must have; shorter tokens are removed.
+
  **Example Usage**
  ```ruby
  text = "\"I said, 'what're you? Crazy?'\" said Sandowsky. \"I can't afford to do that.\""
@@ -110,6 +116,10 @@ PragmaticTokenizer::Tokenizer.new(text, remove_numbers: true).tokenize
  text = "Hello ______ ."
  PragmaticTokenizer::Tokenizer.new(text, clean: true).tokenize
  # => ["hello", "."]
+
+ text = "Let's test the minimum length."
+ PragmaticTokenizer::Tokenizer.new(text, minimum_length: 6).tokenize
+ # => ["minimum", "length"]
  ```

  ## Development
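The README's example usage does not yet show the new Roman-numeral behavior of `remove_numbers`; here is a hedged sketch, with the expected output inferred from the `ROMAN_NUMERALS` list below rather than taken from the gem's test suite:

```ruby
require 'pragmatic_tokenizer'

# "ix" matches the new ROMAN_NUMERALS list and "3" matches the
# existing digit check, so both tokens should be dropped.
text = "Chapter ix has 3 sections."
PragmaticTokenizer::Tokenizer.new(text, remove_numbers: true).tokenize
# Expected (assumption): ["chapter", "has", "sections", "."]
```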
@@ -4,6 +4,11 @@ module PragmaticTokenizer
  PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»']
  PUNCTUATION_MAP = ['♳', '♴', '♵', '♶', '♷', '♸', '♹', '♺', '⚀', '⚁', '⚂', '⚃', '⚄', '⚅', '☇', '☈', '☉', '☊', '☋', '☌', '☍', '☠', '☢', '☣', '☤', '☥', '☦', '☧', '☀', '☁', '☂', '☃', '☄', "☮", '♔', '♕', '♖', '♗', '♘', '♙', '♚']
  SEMI_PUNCTUATION = ['。', '.', '.']
+ ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
+ SPECIAL_CHARACTERS = ['®', '©', '™']
+ ABBREVIATIONS = []
+ STOP_WORDS = []
+ CONTRACTIONS = {}
  end
  end
  end
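The `ROMAN_NUMERALS` list above is written out by hand for i through xcix. As a design note, the same list could be generated; a minimal sketch with a hypothetical helper (not part of the gem):

```ruby
# Hypothetical helper: builds lowercase Roman numerals for 1..99,
# reproducing the hand-written ROMAN_NUMERALS list above.
def roman_numerals(limit = 99)
  ones = ['', 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix']
  tens = ['', 'x', 'xx', 'xxx', 'xl', 'l', 'lx', 'lxx', 'lxxx', 'xc']
  (1..limit).map { |n| tens[n / 10] + ones[n % 10] }
end

roman_numerals.first(5) # => ["i", "ii", "iii", "iv", "v"]
```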
@@ -4,8 +4,8 @@ require 'pragmatic_tokenizer/languages'
  module PragmaticTokenizer
  class Tokenizer

- attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers
- def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false)
+ attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length
+ def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0)
  unless punctuation.eql?('all') ||
  punctuation.eql?('semi') ||
  punctuation.eql?('none') ||
@@ -32,11 +32,12 @@ module PragmaticTokenizer
  @expand_contractions = expand_contractions
  @clean = clean
  @remove_numbers = remove_numbers
+ @minimum_length = minimum_length
  end

  def tokenize
  return [] unless text
- delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text))))))
+ remove_short_tokens(delete_numbers(cleaner(delete_stop_words(find_contractions(remove_punctuation(processor.new(language: language_module).process(text: text)))))))
  end

  private
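Because `remove_short_tokens` wraps the whole chain, `minimum_length` is applied last, after punctuation handling, contraction expansion, stop-word removal, cleaning, and number removal. A hedged sketch of the combined effect (the expected output is an assumption, not taken from the gem's tests):

```ruby
require 'pragmatic_tokenizer'

# remove_numbers drops "ix" and "3" first; minimum_length then prunes
# every remaining token shorter than four characters, "." included.
text = "On page ix we saw 3 dogs."
PragmaticTokenizer::Tokenizer.new(text, remove_numbers: true, minimum_length: 4).tokenize
# Expected (assumption): ["page", "dogs"]
```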
@@ -47,14 +48,18 @@ module PragmaticTokenizer
  Processor
  end

+ def remove_short_tokens(tokens)
+   tokens.delete_if { |t| t.length < minimum_length }
+ end
+
  def delete_numbers(tokens)
  return tokens unless remove_numbers
- tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
+ tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(t) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{t}.") }
  end

  def cleaner(tokens)
  return tokens unless clean
- tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ }
+ tokens.delete_if { |t| t =~ /\A_+\z/ || t =~ /\A-+\z/ || PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) }
  end

  def remove_punctuation(tokens)
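Each filter uses `Array#delete_if`, which mutates the token array in place and returns it, so the calls compose in `tokenize` without intermediate copies. A standalone sketch of the same idea, with a non-mutating alternative for comparison (illustrative names, not the gem's code):

```ruby
tokens  = ['hello', '™', '---', 'world']
special = ['®', '©', '™'] # mirrors SPECIAL_CHARACTERS above

# In-place, as the gem does: the receiver itself is filtered and returned.
tokens.delete_if { |t| t =~ /\A-+\z/ || special.include?(t) }
tokens # => ["hello", "world"]

# Non-mutating alternative: reject returns a new array, leaving the input intact.
['hello', '™'].reject { |t| special.include?(t) } # => ["hello"]
```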
@@ -1,3 +1,3 @@
  module PragmaticTokenizer
-   VERSION = "0.1.3"
+   VERSION = "0.1.4"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: pragmatic_tokenizer
  version: !ruby/object:Gem::Version
- version: 0.1.3
+ version: 0.1.4
  platform: ruby
  authors:
  - Kevin S. Dias