pragmatic_tokenizer 0.5.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +133 -151
  3. data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +31 -0
  4. data/lib/pragmatic_tokenizer/full_stop_separator.rb +38 -0
  5. data/lib/pragmatic_tokenizer/languages/arabic.rb +3 -3
  6. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
  7. data/lib/pragmatic_tokenizer/languages/catalan.rb +3 -3
  8. data/lib/pragmatic_tokenizer/languages/common.rb +14 -8
  9. data/lib/pragmatic_tokenizer/languages/czech.rb +3 -3
  10. data/lib/pragmatic_tokenizer/languages/danish.rb +3 -3
  11. data/lib/pragmatic_tokenizer/languages/deutsch.rb +2 -2
  12. data/lib/pragmatic_tokenizer/languages/dutch.rb +3 -3
  13. data/lib/pragmatic_tokenizer/languages/english.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/finnish.rb +3 -3
  15. data/lib/pragmatic_tokenizer/languages/french.rb +3 -3
  16. data/lib/pragmatic_tokenizer/languages/greek.rb +3 -3
  17. data/lib/pragmatic_tokenizer/languages/indonesian.rb +3 -3
  18. data/lib/pragmatic_tokenizer/languages/italian.rb +3 -3
  19. data/lib/pragmatic_tokenizer/languages/latvian.rb +3 -3
  20. data/lib/pragmatic_tokenizer/languages/norwegian.rb +3 -3
  21. data/lib/pragmatic_tokenizer/languages/persian.rb +3 -3
  22. data/lib/pragmatic_tokenizer/languages/polish.rb +3 -3
  23. data/lib/pragmatic_tokenizer/languages/portuguese.rb +3 -3
  24. data/lib/pragmatic_tokenizer/languages/romanian.rb +3 -3
  25. data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
  26. data/lib/pragmatic_tokenizer/languages/slovak.rb +3 -3
  27. data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
  28. data/lib/pragmatic_tokenizer/languages/swedish.rb +3 -3
  29. data/lib/pragmatic_tokenizer/languages/turkish.rb +3 -3
  30. data/lib/pragmatic_tokenizer/languages.rb +0 -2
  31. data/lib/pragmatic_tokenizer/post_processor.rb +49 -0
  32. data/lib/pragmatic_tokenizer/{processor.rb → pre_processor.rb} +35 -98
  33. data/lib/pragmatic_tokenizer/tokenizer.rb +186 -159
  34. data/lib/pragmatic_tokenizer/version.rb +1 -1
  35. metadata +6 -3
@@ -1,221 +1,248 @@
1
1
  # -*- encoding : utf-8 -*-
2
2
  require 'pragmatic_tokenizer/languages'
3
+ require 'pragmatic_tokenizer/pre_processor'
4
+ require 'pragmatic_tokenizer/post_processor'
5
+ require 'pragmatic_tokenizer/full_stop_separator'
6
+ require 'pragmatic_tokenizer/ending_punctuation_separator'
3
7
  require 'unicode'
4
8
 
5
9
  module PragmaticTokenizer
6
10
  class Tokenizer
7
11
 
8
- attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase, :remove_en_stop_words
9
- def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true, remove_en_stop_words: false)
12
+ attr_reader :text, :punctuation, :language_module, :expand_contractions, :numbers, :minimum_length, :downcase, :classic_filter, :filter_languages, :abbreviations, :contractions, :clean, :remove_stop_words, :stop_words, :remove_emoji, :remove_emails, :mentions, :hashtags, :remove_urls, :remove_domains, :long_word_split
13
+
14
+ # @param [String] text to be tokenized
15
+ # @param [Hash] opts optional arguments
16
+
17
+ # @option opts [Array] :filter_languages - user-supplied array of languages from which that language's stop words, abbreviations and contractions should be used when calculating the resulting tokens - array elements should be of the String class or can be symbols
18
+ # @option opts [String] :language - two character ISO 639-1 code - can be a String or symbol (i.e. :en or 'en')
19
+ # @option opts [Boolean] :expand_contractions - (default: false)
20
+ # @option opts [Boolean] :remove_stop_words - (default: false)
21
+ # @option opts [Array] :abbreviations - user-supplied array of abbreviations (each element should be downcased with final period removed) - array elements should be of the String class
22
+ # @option opts [Array] :stop_words - user-supplied array of stop words - array elements should be of the String class
23
+ # @option opts [Hash] :contractions - user-supplied hash of contractions (key is the contracted form; value is the expanded form - both the key and value should be downcased)
24
+ # @option opts [String] :punctuation - see description below - can be a String or symbol (i.e. :none or 'none')
25
+ # Punctuation 'all': Does not remove any punctuation from the result
26
+ # Punctuation 'semi': Removes common punctuation (such as full stops)
27
+ # and does not remove less common punctuation (such as question marks)
28
+ # This is useful for text alignment as less common punctuation can help
29
+ # identify a sentence (like a fingerprint) while common punctuation
30
+ # (like stop words) should be removed.
31
+ # Punctuation 'none': Removes all punctuation from the result
32
+ # Punctuation 'only': Removes everything except punctuation. The
33
+ # returned result is an array of only the punctuation.
34
+ # @option opts [String] :numbers - see description below - can be a String or symbol (i.e. :none or 'none')
35
+ # Numbers 'all': Does not remove any numbers from the result
36
+ # Numbers 'semi': Removes tokens that include only digits
37
+ # Numbers 'none': Removes all tokens that include a number from the result (including Roman numerals)
38
+ # Numbers 'only': Removes everything except tokens that include a number
39
+ # @option opts [Integer] :minimum_length - minimum length of the token in characters
40
+ # @option opts [Integer] :long_word_split - the specified length to split long words at any hyphen or underscore.
41
+ # @option opts [String] :mentions - :remove (will completely remove it), :keep_and_clean (will remove the prefix) and :keep_original (don't alter the token at all). - can be a String or symbol (i.e. :keep_and_clean or 'keep_and_clean')
42
+ # @option opts [String] :hashtags - :remove (will completely remove it), :keep_and_clean (will remove the prefix) and :keep_original (don't alter the token at all). - can be a String or symbol (i.e. :keep_and_clean or 'keep_and_clean')
43
+ # @option opts [Boolean] :downcase - (default: true)
44
+ # @option opts [Boolean] :classic_filter - removes dots from acronyms and 's from the end of tokens - (default: false)
45
+ # @option opts [Boolean] :remove_emoji - (default: false)
46
+ # @option opts [Boolean] :remove_emails - (default: false)
47
+ # @option opts [Boolean] :remove_urls - (default: false)
48
+ # @option opts [Boolean] :remove_domains - (default: false)
49
+
50
+ def initialize(text, opts = {})
51
+ @text = CGI.unescapeHTML(text)
52
+ @filter_languages = opts[:filter_languages] || []
53
+ @language = opts[:language] || 'en'
54
+ @language_module = Languages.get_language_by_code(@language.to_s)
55
+ @expand_contractions = opts[:expand_contractions] || false
56
+ @remove_stop_words = opts[:remove_stop_words] || false
57
+ if @filter_languages.empty?
58
+ @abbreviations = opts[:abbreviations] || @language_module::ABBREVIATIONS
59
+ @contractions = opts[:contractions] || @language_module::CONTRACTIONS
60
+ @stop_words = opts[:stop_words] || @language_module::STOP_WORDS
61
+ else
62
+ merged_abbreviations = []
63
+ @filter_languages.map { |l| merged_abbreviations << Languages.get_language_by_code(l.to_s)::ABBREVIATIONS.flatten }
64
+ merged_abbreviations << opts[:abbreviations].flatten unless opts[:abbreviations].nil?
65
+ @abbreviations = merged_abbreviations.flatten
66
+
67
+ merged_contractions = {}
68
+ @filter_languages.map { |l| merged_contractions = merged_contractions.merge(Languages.get_language_by_code(l.to_s)::CONTRACTIONS) }
69
+ merged_contractions = merged_contractions.merge(opts[:contractions]) unless opts[:contractions].nil?
70
+ @contractions = merged_contractions
71
+
72
+ merged_stop_words = []
73
+ @filter_languages.map { |l| merged_stop_words << Languages.get_language_by_code(l.to_s)::STOP_WORDS.flatten }
74
+ merged_stop_words << opts[:stop_words].flatten unless opts[:stop_words].nil?
75
+ @stop_words = merged_stop_words.flatten
76
+ end
77
+ @punctuation = opts[:punctuation] || 'all'
78
+ @numbers = opts[:numbers] || 'all'
79
+ @minimum_length = opts[:minimum_length] || 0
80
+ @long_word_split = opts[:long_word_split]
81
+ @mentions = opts[:mentions] || 'keep_original'
82
+ @hashtags = opts[:hashtags] || 'keep_original'
83
+ @downcase = opts[:downcase].nil? ? true : opts[:downcase]
84
+ @clean = opts[:clean] || false
85
+ @classic_filter = opts[:classic_filter] || false
86
+ @remove_emoji = opts[:remove_emoji] || false
87
+ @remove_emails = opts[:remove_emails] || false
88
+ @remove_urls = opts[:remove_urls] || false
89
+ @remove_domains = opts[:remove_domains] || false
90
+
10
91
  unless punctuation.to_s.eql?('all') ||
11
92
  punctuation.to_s.eql?('semi') ||
12
93
  punctuation.to_s.eql?('none') ||
13
94
  punctuation.to_s.eql?('only')
14
95
  raise "Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'"
15
- # Punctuation 'all': Does not remove any punctuation from the result
16
-
17
- # Punctuation 'semi': Removes common punctuation (such as full stops)
18
- # and does not remove less common punctuation (such as question marks)
19
- # This is useful for text alignment as less common punctuation can help
20
- # identify a sentence (like a fingerprint) while common punctuation
21
- # (like stop words) should be removed.
22
-
23
- # Punctuation 'none': Removes all punctuation from the result
24
-
25
- # Punctuation 'only': Removes everything except punctuation. The
26
- # returned result is an array of only the punctuation.
96
+ end
97
+ unless numbers.to_s.eql?('all') ||
98
+ numbers.to_s.eql?('semi') ||
99
+ numbers.to_s.eql?('none') ||
100
+ numbers.to_s.eql?('only')
101
+ raise "Numbers argument can be only be nil, 'all', 'semi', 'none', or 'only'"
102
+ end
103
+ unless mentions.to_s.eql?('keep_original') ||
104
+ mentions.to_s.eql?('keep_and_clean') ||
105
+ mentions.to_s.eql?('remove')
106
+ raise "Mentions argument can be only be nil, 'keep_original', 'keep_and_clean', or 'remove'"
27
107
  end
28
108
  raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
29
- @text = CGI.unescapeHTML(text)
30
- @language = language.to_s
31
- @language_module = Languages.get_language_by_code(language.to_s)
32
- @punctuation = punctuation.to_s
33
- @remove_stop_words = remove_stop_words
34
- @expand_contractions = expand_contractions
35
- @clean = clean
36
- @remove_numbers = remove_numbers
37
- @minimum_length = minimum_length
38
- @remove_roman_numerals = remove_roman_numerals
39
- @downcase = downcase
40
- @remove_en_stop_words = remove_en_stop_words
109
+ raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless minimum_length.class == Fixnum || minimum_length.nil?
110
+ raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless long_word_split.class == Fixnum || long_word_split.nil?
41
111
  end
42
112
 
43
113
  def tokenize
44
114
  return [] unless text
45
115
  tokens = []
46
116
  text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
47
- tokens << delete_en_stop_words(
48
- delete_stop_words(
49
- downcase_tokens(
50
- cleaner(
51
- remove_short_tokens(
52
- delete_numbers(
53
- delete_roman_numerals(
54
- find_contractions(
55
- remove_punctuation(
56
- split_at_middle_period_1(
57
- split_at_middle_period_2(
58
- split_beginning_period(
59
- split_at_plus_sign(
60
- shift_no_spaces_between_sentences(
61
- split_at_forward_slash(
62
- processor.new(language: language_module).process(text: segment)
63
- ))))))))))))))).reject { |t| t.empty? }
117
+ tokens << post_process(PreProcessor.new(language: language_module).pre_process(text: segment))
64
118
  end
65
119
  tokens.flatten
66
120
  end
67
121
 
68
- def domains
69
- text.split(' ').delete_if { |t| t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }.map { |t| t.chomp('.').chomp(',').chomp(';').chomp(':') }
70
- end
71
-
72
- def urls
73
- text.split(' ').delete_if { |t| t !~ /(http|https|www)(\.|:)/ }.map { |t| t.chomp('.') }
74
- end
75
-
76
- def emails
77
- text.split(' ').delete_if { |t| t !~ /\S+(@|@)\S+/ }.map { |t| t.chomp('.') }
78
- end
79
-
80
- def hashtags
81
- text.split(' ').delete_if { |t| t !~ /(#|#)/ }.map { |t| t.chomp('.') }
82
- end
83
-
84
- def mentions
85
- text.split(' ').delete_if { |t| t !~ /(@|@)/ }.map { |t| t.chomp('.') }
86
- end
87
-
88
- def emoticons
89
- text.scan(/(?::|;|=)(?:-)?(?:\)|D|P)/)
90
- end
91
-
92
- def emoji
93
- # https://github.com/franklsf95/ruby-emoji-regex
94
- text.scan(/[\u{203C}\u{2049}\u{20E3}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{23E9}-\u{23EC}\u{23F0}\u{23F3}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2601}\u{260E}\u{2611}\u{2614}-\u{2615}\u{261D}\u{263A}\u{2648}-\u{2653}\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267F}\u{2693}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26CE}\u{26D4}\u{26EA}\u{26F2}-\u{26F3}\u{26F5}\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270C}\u{270F}\u{2712}\u{2714}\u{2716}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E7}-\u{1F1EC}\u{1F1EE}-\u{1F1F0}\u{1F1F3}\u{1F1F5}\u{1F1F7}-\u{1F1FA}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F320}\u{1F330}-\u{1F335}\u{1F337}-\u{1F37C}\u{1F380}-\u{1F393}\u{1F3A0}-\u{1F3C4}\u{1F3C6}-\u{1F3CA}\u{1F3E0}-\u{1F3F0}\u{1F400}-\u{1F43E}\u{1F440}\u{1F442}-\u{1F4F7}\u{1F4F9}-\u{1F4FC}\u{1F500}-\u{1F507}\u{1F509}-\u{1F53D}\u{1F550}-\u{1F567}\u{1F5FB}-\u{1F640}\u{1F645}-\u{1F64F}\u{1F680}-\u{1F68A}]/)
95
- end
96
-
97
122
  private
98
123
 
99
- def processor
100
- language_module::Processor
101
- rescue
102
- Processor
103
- end
104
-
105
- def split_at_middle_period_1(tokens)
106
- tokens.flat_map { |t| t.include?(".") &&
107
- t !~ /(http|https|www)(\.|:)/ &&
108
- t.length > 1 &&
109
- t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
110
- t !~ /\S+(@|@)\S+/ &&
111
- language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) ? t.gsub!(/\./, '\1. \2').split(' ').flatten : t }
112
- end
113
-
114
- def split_at_middle_period_2(tokens)
115
- tokens.flat_map { |t| t.include?(".") &&
116
- t !~ /(http|https|www)(\.|:)/ &&
117
- t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
118
- t !~ /\.[a-z]{2}/ &&
119
- t.length > 2 &&
120
- t.count(".") == 1 &&
121
- t !~ /\d+/ &&
122
- !language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) &&
123
- t !~ /\S+(@|@)\S+/ ? t.gsub!(/\./, '\1 . \2').split(' ').flatten : t }
124
- end
125
-
126
- def split_beginning_period(tokens)
127
- tokens.flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub!(/\./, '\1 ').split(' ').flatten : t }
128
- end
129
-
130
- def shift_no_spaces_between_sentences(tokens)
131
- tokens.flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub!(/\?/, '\1 \2').split(' ').flatten : t }
132
- end
133
-
134
- def downcase_tokens(tokens)
135
- return tokens unless downcase
136
- tokens.map { |t| Unicode::downcase(t) }
137
- end
138
-
139
- def remove_short_tokens(tokens)
140
- tokens.delete_if { |t| t.length < minimum_length }
141
- end
142
-
143
- def delete_numbers(tokens)
144
- return tokens unless remove_numbers
145
- tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
146
- end
147
-
148
- def delete_roman_numerals(tokens)
149
- return tokens unless remove_roman_numerals
150
- tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") } if remove_roman_numerals
124
+ def post_process(text)
125
+ @tokens = PostProcessor.new(text: text, abbreviations: abbreviations).post_process
126
+ downcase! if downcase
127
+ expand_contractions!(contractions) if expand_contractions
128
+ clean! if clean
129
+ classic_filter! if classic_filter
130
+ process_numbers!
131
+ remove_short_tokens! if minimum_length > 0
132
+ process_punctuation!
133
+ remove_stop_words!(stop_words) if remove_stop_words
134
+ remove_emoji! if remove_emoji
135
+ remove_emails! if remove_emails
136
+ mentions! if mentions
137
+ hashtags! if hashtags
138
+ remove_urls! if remove_urls
139
+ remove_domains! if remove_domains
140
+ split_long_words! if long_word_split
141
+ @tokens.reject { |t| t.empty? }
142
+ end
143
+
144
+ def downcase!
145
+ @tokens.map! { |t| Unicode::downcase(t) }
146
+ end
147
+
148
+ def expand_contractions!(contractions)
149
+ if downcase
150
+ @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').flatten : t }
151
+ else
152
+ @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
153
+ end
151
154
  end
152
155
 
153
- def cleaner(tokens)
154
- return tokens unless clean
155
- tokens.flat_map { |t| t =~ /(\A|\s)\@/ ? t.gsub!(/\@/, '\1 ').split(' ').flatten : t }
156
- .flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
156
+ def clean!
157
+ @tokens = @tokens.flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
157
158
  .flat_map { |t| t =~ /\_+(?=\s)/ ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
158
159
  .flat_map { |t| t =~ /(?<=\A)\_+/ ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
159
160
  .flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
160
161
  .flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
162
+ .map { |t| t.gsub(/[[:cntrl:]]/, '') }
161
163
  .delete_if { |t| t =~ /\A-+\z/ ||
162
164
  PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
163
165
  t =~ /\A\.{2,}\z/ || t.include?("\\") ||
164
166
  t.length > 50 ||
165
- (t.length > 1 && t =~ /[#&*+<=>@^|~]/i)
167
+ (t.length > 1 && t =~ /[&*+<=>^|~]/i)
166
168
  }
167
169
  end
168
170
 
169
- def remove_punctuation(tokens)
170
- case punctuation
171
- when 'all'
172
- tokens
171
+ def classic_filter!
172
+ @tokens.map! { |t| abbreviations.include?(t.chomp(".")) ? t.gsub('.', '').chomp("'s").chomp("’s").chomp("`s") : t.chomp("'s").chomp("’s").chomp("`s") }
173
+ end
174
+
175
+ def process_numbers!
176
+ case numbers.to_s
173
177
  when 'semi'
174
- tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
178
+ @tokens.delete_if { |t| t =~ /\A\d+\z/ }
175
179
  when 'none'
176
- tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
180
+ @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") }
177
181
  when 'only'
178
- only_punctuation(tokens)
182
+ @tokens.delete_if { |t| t =~ /\A\D+\z/ }
179
183
  end
180
184
  end
181
185
 
182
- def only_punctuation(tokens)
183
- tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
186
+ def remove_short_tokens!
187
+ @tokens.delete_if { |t| t.length < minimum_length }
184
188
  end
185
189
 
186
- def delete_stop_words(tokens)
187
- return tokens unless remove_stop_words && language_module::STOP_WORDS
188
- if downcase
189
- tokens.map { |t| Unicode::downcase(t) } - language_module::STOP_WORDS
190
- else
191
- tokens.delete_if { |t| language_module::STOP_WORDS.include?(Unicode::downcase(t)) }
190
+ def process_punctuation!
191
+ case punctuation.to_s
192
+ when 'semi'
193
+ @tokens = @tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
194
+ when 'none'
195
+ @tokens = @tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
196
+ when 'only'
197
+ @tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
192
198
  end
193
199
  end
194
200
 
195
- def delete_en_stop_words(tokens)
196
- return tokens unless remove_en_stop_words
201
+ def remove_stop_words!(stop_words)
197
202
  if downcase
198
- tokens.map { |t| Unicode::downcase(t) } - PragmaticTokenizer::Languages::English::STOP_WORDS
203
+ @tokens = @tokens - stop_words
199
204
  else
200
- tokens.delete_if { |t| PragmaticTokenizer::Languages::English::STOP_WORDS.include?(Unicode::downcase(t)) }
205
+ @tokens.delete_if { |t| stop_words.include?(Unicode::downcase(t)) }
201
206
  end
202
207
  end
203
208
 
204
- def split_at_forward_slash(tokens)
205
- tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
209
+ def remove_emoji!
210
+ @tokens.delete_if { |t| t =~ PragmaticTokenizer::Languages::Common::EMOJI_REGEX }
206
211
  end
207
212
 
208
- def split_at_plus_sign(tokens)
209
- tokens.flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
213
+ def remove_emails!
214
+ @tokens.delete_if { |t| t =~ /\S+(@|@)\S+/ }.map { |t| t.chomp('.') }
210
215
  end
211
216
 
212
- def find_contractions(tokens)
213
- return tokens unless expand_contractions && language_module::CONTRACTIONS
214
- if downcase
215
- tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? language_module::CONTRACTIONS[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').flatten : t }
216
- else
217
- tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? language_module::CONTRACTIONS[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
217
+ def mentions!
218
+ case mentions.to_s
219
+ when 'remove'
220
+ @tokens.delete_if { |t| t =~ /\A(@|@)/ }
221
+ when 'keep_and_clean'
222
+ @tokens.map! { |t| t =~ /\A(@|@)/ ? t.gsub!(/(?<=\A)(@|@)/, '') : t }
223
+ end
224
+ end
225
+
226
+ def hashtags!
227
+ case hashtags.to_s
228
+ when 'remove'
229
+ @tokens.delete_if { |t| t =~ /\A(#|#)/ }
230
+ when 'keep_and_clean'
231
+ @tokens.map! { |t| t =~ /\A(#|#)/ ? t.gsub!(/(?<=\A)(#|#)/, '') : t }
218
232
  end
219
233
  end
234
+
235
+ def remove_urls!
236
+ @tokens.delete_if { |t| t =~ /(http|https)(\.|:)/ }
237
+ end
238
+
239
+ def remove_domains!
240
+ @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
241
+ end
242
+
243
+ def split_long_words!
244
+ @tokens.map! { |t| t.length > long_word_split ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
245
+ .map! { |t| t.length > long_word_split ? t.gsub(/\_/, '\1 \2').split(' ').flatten : t }
246
+ end
220
247
  end
221
- end
248
+ end
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.5.0"
2
+ VERSION = "1.0.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-01-15 00:00:00.000000000 Z
11
+ date: 2016-01-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: unicode
@@ -97,6 +97,8 @@ files:
97
97
  - bin/console
98
98
  - bin/setup
99
99
  - lib/pragmatic_tokenizer.rb
100
+ - lib/pragmatic_tokenizer/ending_punctuation_separator.rb
101
+ - lib/pragmatic_tokenizer/full_stop_separator.rb
100
102
  - lib/pragmatic_tokenizer/languages.rb
101
103
  - lib/pragmatic_tokenizer/languages/arabic.rb
102
104
  - lib/pragmatic_tokenizer/languages/bulgarian.rb
@@ -123,7 +125,8 @@ files:
123
125
  - lib/pragmatic_tokenizer/languages/spanish.rb
124
126
  - lib/pragmatic_tokenizer/languages/swedish.rb
125
127
  - lib/pragmatic_tokenizer/languages/turkish.rb
126
- - lib/pragmatic_tokenizer/processor.rb
128
+ - lib/pragmatic_tokenizer/post_processor.rb
129
+ - lib/pragmatic_tokenizer/pre_processor.rb
127
130
  - lib/pragmatic_tokenizer/tokenizer.rb
128
131
  - lib/pragmatic_tokenizer/version.rb
129
132
  - pragmatic_tokenizer.gemspec