pragmatic_tokenizer 0.5.0 → 1.0.0

Files changed (35)
  1. checksums.yaml +4 -4
  2. data/README.md +133 -151
  3. data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +31 -0
  4. data/lib/pragmatic_tokenizer/full_stop_separator.rb +38 -0
  5. data/lib/pragmatic_tokenizer/languages/arabic.rb +3 -3
  6. data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
  7. data/lib/pragmatic_tokenizer/languages/catalan.rb +3 -3
  8. data/lib/pragmatic_tokenizer/languages/common.rb +14 -8
  9. data/lib/pragmatic_tokenizer/languages/czech.rb +3 -3
  10. data/lib/pragmatic_tokenizer/languages/danish.rb +3 -3
  11. data/lib/pragmatic_tokenizer/languages/deutsch.rb +2 -2
  12. data/lib/pragmatic_tokenizer/languages/dutch.rb +3 -3
  13. data/lib/pragmatic_tokenizer/languages/english.rb +2 -2
  14. data/lib/pragmatic_tokenizer/languages/finnish.rb +3 -3
  15. data/lib/pragmatic_tokenizer/languages/french.rb +3 -3
  16. data/lib/pragmatic_tokenizer/languages/greek.rb +3 -3
  17. data/lib/pragmatic_tokenizer/languages/indonesian.rb +3 -3
  18. data/lib/pragmatic_tokenizer/languages/italian.rb +3 -3
  19. data/lib/pragmatic_tokenizer/languages/latvian.rb +3 -3
  20. data/lib/pragmatic_tokenizer/languages/norwegian.rb +3 -3
  21. data/lib/pragmatic_tokenizer/languages/persian.rb +3 -3
  22. data/lib/pragmatic_tokenizer/languages/polish.rb +3 -3
  23. data/lib/pragmatic_tokenizer/languages/portuguese.rb +3 -3
  24. data/lib/pragmatic_tokenizer/languages/romanian.rb +3 -3
  25. data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
  26. data/lib/pragmatic_tokenizer/languages/slovak.rb +3 -3
  27. data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
  28. data/lib/pragmatic_tokenizer/languages/swedish.rb +3 -3
  29. data/lib/pragmatic_tokenizer/languages/turkish.rb +3 -3
  30. data/lib/pragmatic_tokenizer/languages.rb +0 -2
  31. data/lib/pragmatic_tokenizer/post_processor.rb +49 -0
  32. data/lib/pragmatic_tokenizer/{processor.rb → pre_processor.rb} +35 -98
  33. data/lib/pragmatic_tokenizer/tokenizer.rb +186 -159
  34. data/lib/pragmatic_tokenizer/version.rb +1 -1
  35. metadata +6 -3
data/lib/pragmatic_tokenizer/tokenizer.rb CHANGED
@@ -1,221 +1,248 @@
  # -*- encoding : utf-8 -*-
  require 'pragmatic_tokenizer/languages'
+ require 'pragmatic_tokenizer/pre_processor'
+ require 'pragmatic_tokenizer/post_processor'
+ require 'pragmatic_tokenizer/full_stop_separator'
+ require 'pragmatic_tokenizer/ending_punctuation_separator'
  require 'unicode'

  module PragmaticTokenizer
    class Tokenizer

-     attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase, :remove_en_stop_words
-     def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true, remove_en_stop_words: false)
+     attr_reader :text, :punctuation, :language_module, :expand_contractions, :numbers, :minimum_length, :downcase, :classic_filter, :filter_languages, :abbreviations, :contractions, :clean, :remove_stop_words, :stop_words, :remove_emoji, :remove_emails, :mentions, :hashtags, :remove_urls, :remove_domains, :long_word_split
+
+     # @param [String] text to be tokenized
+     # @param [Hash] opts optional arguments
+
+     # @option opts [Array] :filter_languages - user-supplied array of languages whose stop words, abbreviations and contractions should be used when calculating the resulting tokens - array elements can be Strings or Symbols
+     # @option opts [String] :language - two-character ISO 639-1 code - can be a String or Symbol (e.g. :en or 'en')
+     # @option opts [Boolean] :expand_contractions - (default: false)
+     # @option opts [Boolean] :remove_stop_words - (default: false)
+     # @option opts [Array] :abbreviations - user-supplied array of abbreviations (each element should be downcased with the final period removed) - array elements should be Strings
+     # @option opts [Array] :stop_words - user-supplied array of stop words - array elements should be Strings
+     # @option opts [Hash] :contractions - user-supplied hash of contractions (the key is the contracted form; the value is the expanded form - both key and value should be downcased)
+     # @option opts [String] :punctuation - see description below - can be a String or Symbol (e.g. :none or 'none')
+     #   Punctuation 'all': Does not remove any punctuation from the result
+     #   Punctuation 'semi': Removes common punctuation (such as full stops)
+     #     and does not remove less common punctuation (such as question marks).
+     #     This is useful for text alignment as less common punctuation can help
+     #     identify a sentence (like a fingerprint) while common punctuation
+     #     (like stop words) should be removed.
+     #   Punctuation 'none': Removes all punctuation from the result
+     #   Punctuation 'only': Removes everything except punctuation. The
+     #     returned result is an array of only the punctuation.
+     # @option opts [String] :numbers - see description below - can be a String or Symbol (e.g. :none or 'none')
+     #   Numbers 'all': Does not remove any numbers from the result
+     #   Numbers 'semi': Removes tokens that consist of digits only
+     #   Numbers 'none': Removes all tokens that include a number from the result (including Roman numerals)
+     #   Numbers 'only': Removes everything except tokens that include a number
+     # @option opts [Integer] :minimum_length - minimum length of the token in characters
+     # @option opts [Integer] :long_word_split - tokens longer than this length are split at hyphens and underscores
+     # @option opts [String] :mentions - :remove (will completely remove the token), :keep_and_clean (will remove the @ prefix) or :keep_original (won't alter the token at all) - can be a String or Symbol (e.g. :keep_and_clean or 'keep_and_clean')
+     # @option opts [String] :hashtags - :remove (will completely remove the token), :keep_and_clean (will remove the # prefix) or :keep_original (won't alter the token at all) - can be a String or Symbol (e.g. :keep_and_clean or 'keep_and_clean')
+     # @option opts [Boolean] :downcase - (default: true)
+     # @option opts [Boolean] :classic_filter - removes dots from acronyms and 's from the end of tokens - (default: false)
+     # @option opts [Boolean] :remove_emoji - (default: false)
+     # @option opts [Boolean] :remove_emails - (default: false)
+     # @option opts [Boolean] :remove_urls - (default: false)
+     # @option opts [Boolean] :remove_domains - (default: false)
+
+     def initialize(text, opts = {})
+       @text = CGI.unescapeHTML(text)
+       @filter_languages = opts[:filter_languages] || []
+       @language = opts[:language] || 'en'
+       @language_module = Languages.get_language_by_code(@language.to_s)
+       @expand_contractions = opts[:expand_contractions] || false
+       @remove_stop_words = opts[:remove_stop_words] || false
+       if @filter_languages.empty?
+         @abbreviations = opts[:abbreviations] || @language_module::ABBREVIATIONS
+         @contractions = opts[:contractions] || @language_module::CONTRACTIONS
+         @stop_words = opts[:stop_words] || @language_module::STOP_WORDS
+       else
+         merged_abbreviations = []
+         @filter_languages.map { |l| merged_abbreviations << Languages.get_language_by_code(l.to_s)::ABBREVIATIONS.flatten }
+         merged_abbreviations << opts[:abbreviations].flatten unless opts[:abbreviations].nil?
+         @abbreviations = merged_abbreviations.flatten
+
+         merged_contractions = {}
+         @filter_languages.map { |l| merged_contractions = merged_contractions.merge(Languages.get_language_by_code(l.to_s)::CONTRACTIONS) }
+         merged_contractions = merged_contractions.merge(opts[:contractions]) unless opts[:contractions].nil?
+         @contractions = merged_contractions
+
+         merged_stop_words = []
+         @filter_languages.map { |l| merged_stop_words << Languages.get_language_by_code(l.to_s)::STOP_WORDS.flatten }
+         merged_stop_words << opts[:stop_words].flatten unless opts[:stop_words].nil?
+         @stop_words = merged_stop_words.flatten
+       end
+       @punctuation = opts[:punctuation] || 'all'
+       @numbers = opts[:numbers] || 'all'
+       @minimum_length = opts[:minimum_length] || 0
+       @long_word_split = opts[:long_word_split]
+       @mentions = opts[:mentions] || 'keep_original'
+       @hashtags = opts[:hashtags] || 'keep_original'
+       @downcase = opts[:downcase].nil? ? true : opts[:downcase]
+       @clean = opts[:clean] || false
+       @classic_filter = opts[:classic_filter] || false
+       @remove_emoji = opts[:remove_emoji] || false
+       @remove_emails = opts[:remove_emails] || false
+       @remove_urls = opts[:remove_urls] || false
+       @remove_domains = opts[:remove_domains] || false
+
        unless punctuation.to_s.eql?('all') ||
            punctuation.to_s.eql?('semi') ||
            punctuation.to_s.eql?('none') ||
            punctuation.to_s.eql?('only')
          raise "Punctuation argument can only be nil, 'all', 'semi', 'none', or 'only'"
-         # Punctuation 'all': Does not remove any punctuation from the result
-
-         # Punctuation 'semi': Removes common punctuation (such as full stops)
-         # and does not remove less common punctuation (such as question marks)
-         # This is useful for text alignment as less common punctuation can help
-         # identify a sentence (like a fingerprint) while common punctuation
-         # (like stop words) should be removed.
-
-         # Punctuation 'none': Removes all punctuation from the result
-
-         # Punctuation 'only': Removes everything except punctuation. The
-         # returned result is an array of only the punctuation.
+       end
+       unless numbers.to_s.eql?('all') ||
+           numbers.to_s.eql?('semi') ||
+           numbers.to_s.eql?('none') ||
+           numbers.to_s.eql?('only')
+         raise "Numbers argument can only be nil, 'all', 'semi', 'none', or 'only'"
+       end
+       unless mentions.to_s.eql?('keep_original') ||
+           mentions.to_s.eql?('keep_and_clean') ||
+           mentions.to_s.eql?('remove')
+         raise "Mentions argument can only be nil, 'keep_original', 'keep_and_clean', or 'remove'"
        end
        raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
-       @text = CGI.unescapeHTML(text)
-       @language = language.to_s
-       @language_module = Languages.get_language_by_code(language.to_s)
-       @punctuation = punctuation.to_s
-       @remove_stop_words = remove_stop_words
-       @expand_contractions = expand_contractions
-       @clean = clean
-       @remove_numbers = remove_numbers
-       @minimum_length = minimum_length
-       @remove_roman_numerals = remove_roman_numerals
-       @downcase = downcase
-       @remove_en_stop_words = remove_en_stop_words
+       raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless minimum_length.class == Fixnum || minimum_length.nil?
+       raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless long_word_split.class == Fixnum || long_word_split.nil?
      end

      def tokenize
        return [] unless text
        tokens = []
        text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
-         tokens << delete_en_stop_words(
-           delete_stop_words(
-             downcase_tokens(
-               cleaner(
-                 remove_short_tokens(
-                   delete_numbers(
-                     delete_roman_numerals(
-                       find_contractions(
-                         remove_punctuation(
-                           split_at_middle_period_1(
-                             split_at_middle_period_2(
-                               split_beginning_period(
-                                 split_at_plus_sign(
-                                   shift_no_spaces_between_sentences(
-                                     split_at_forward_slash(
-                                       processor.new(language: language_module).process(text: segment)
-         ))))))))))))))).reject { |t| t.empty? }
+         tokens << post_process(PreProcessor.new(language: language_module).pre_process(text: segment))
        end
        tokens.flatten
      end

-     def domains
-       text.split(' ').delete_if { |t| t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }.map { |t| t.chomp('.').chomp(',').chomp(';').chomp(':') }
-     end
-
-     def urls
-       text.split(' ').delete_if { |t| t !~ /(http|https|www)(\.|:)/ }.map { |t| t.chomp('.') }
-     end
-
-     def emails
-       text.split(' ').delete_if { |t| t !~ /\S+(@|@)\S+/ }.map { |t| t.chomp('.') }
-     end
-
-     def hashtags
-       text.split(' ').delete_if { |t| t !~ /(#|#)/ }.map { |t| t.chomp('.') }
-     end
-
-     def mentions
-       text.split(' ').delete_if { |t| t !~ /(@|@)/ }.map { |t| t.chomp('.') }
-     end
-
-     def emoticons
-       text.scan(/(?::|;|=)(?:-)?(?:\)|D|P)/)
-     end
-
-     def emoji
-       # https://github.com/franklsf95/ruby-emoji-regex
-       text.scan(/[\u{203C}\u{2049}\u{20E3}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{23E9}-\u{23EC}\u{23F0}\u{23F3}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2601}\u{260E}\u{2611}\u{2614}-\u{2615}\u{261D}\u{263A}\u{2648}-\u{2653}\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267F}\u{2693}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26CE}\u{26D4}\u{26EA}\u{26F2}-\u{26F3}\u{26F5}\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270C}\u{270F}\u{2712}\u{2714}\u{2716}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E7}-\u{1F1EC}\u{1F1EE}-\u{1F1F0}\u{1F1F3}\u{1F1F5}\u{1F1F7}-\u{1F1FA}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F320}\u{1F330}-\u{1F335}\u{1F337}-\u{1F37C}\u{1F380}-\u{1F393}\u{1F3A0}-\u{1F3C4}\u{1F3C6}-\u{1F3CA}\u{1F3E0}-\u{1F3F0}\u{1F400}-\u{1F43E}\u{1F440}\u{1F442}-\u{1F4F7}\u{1F4F9}-\u{1F4FC}\u{1F500}-\u{1F507}\u{1F509}-\u{1F53D}\u{1F550}-\u{1F567}\u{1F5FB}-\u{1F640}\u{1F645}-\u{1F64F}\u{1F680}-\u{1F68A}]/)
-     end
-
      private

-     def processor
-       language_module::Processor
-     rescue
-       Processor
-     end
-
-     def split_at_middle_period_1(tokens)
-       tokens.flat_map { |t| t.include?(".") &&
-         t !~ /(http|https|www)(\.|:)/ &&
-         t.length > 1 &&
-         t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
-         t !~ /\S+(@|@)\S+/ &&
-         language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) ? t.gsub!(/\./, '\1. \2').split(' ').flatten : t }
-     end
-
-     def split_at_middle_period_2(tokens)
-       tokens.flat_map { |t| t.include?(".") &&
-         t !~ /(http|https|www)(\.|:)/ &&
-         t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
-         t !~ /\.[a-z]{2}/ &&
-         t.length > 2 &&
-         t.count(".") == 1 &&
-         t !~ /\d+/ &&
-         !language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) &&
-         t !~ /\S+(@|@)\S+/ ? t.gsub!(/\./, '\1 . \2').split(' ').flatten : t }
-     end
-
-     def split_beginning_period(tokens)
-       tokens.flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub!(/\./, '\1 ').split(' ').flatten : t }
-     end
-
-     def shift_no_spaces_between_sentences(tokens)
-       tokens.flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub!(/\?/, '\1 \2').split(' ').flatten : t }
-     end
-
-     def downcase_tokens(tokens)
-       return tokens unless downcase
-       tokens.map { |t| Unicode::downcase(t) }
-     end
-
-     def remove_short_tokens(tokens)
-       tokens.delete_if { |t| t.length < minimum_length }
-     end
-
-     def delete_numbers(tokens)
-       return tokens unless remove_numbers
-       tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
-     end
-
-     def delete_roman_numerals(tokens)
-       return tokens unless remove_roman_numerals
-       tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") } if remove_roman_numerals
+     def post_process(text)
+       @tokens = PostProcessor.new(text: text, abbreviations: abbreviations).post_process
+       downcase! if downcase
+       expand_contractions!(contractions) if expand_contractions
+       clean! if clean
+       classic_filter! if classic_filter
+       process_numbers!
+       remove_short_tokens! if minimum_length > 0
+       process_punctuation!
+       remove_stop_words!(stop_words) if remove_stop_words
+       remove_emoji! if remove_emoji
+       remove_emails! if remove_emails
+       mentions! if mentions
+       hashtags! if hashtags
+       remove_urls! if remove_urls
+       remove_domains! if remove_domains
+       split_long_words! if long_word_split
+       @tokens.reject { |t| t.empty? }
+     end
+
+     def downcase!
+       @tokens.map! { |t| Unicode::downcase(t) }
+     end
+
+     def expand_contractions!(contractions)
+       if downcase
+         @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').flatten : t }
+       else
+         @tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
+       end
      end

-     def cleaner(tokens)
-       return tokens unless clean
-       tokens.flat_map { |t| t =~ /(\A|\s)\@/ ? t.gsub!(/\@/, '\1 ').split(' ').flatten : t }
-         .flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
+     def clean!
+       @tokens = @tokens.flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
          .flat_map { |t| t =~ /\_+(?=\s)/ ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
          .flat_map { |t| t =~ /(?<=\A)\_+/ ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
          .flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
          .flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
+         .map { |t| t.gsub(/[[:cntrl:]]/, '') }
          .delete_if { |t| t =~ /\A-+\z/ ||
            PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
            t =~ /\A\.{2,}\z/ || t.include?("\\") ||
            t.length > 50 ||
-           (t.length > 1 && t =~ /[#&*+<=>@^|~]/i)
+           (t.length > 1 && t =~ /[&*+<=>^|~]/i)
          }
      end

-     def remove_punctuation(tokens)
-       case punctuation
-       when 'all'
-         tokens
+     def classic_filter!
+       @tokens.map! { |t| abbreviations.include?(t.chomp(".")) ? t.gsub('.', '').chomp("'s").chomp("’s").chomp("`s") : t.chomp("'s").chomp("’s").chomp("`s") }
+     end
+
+     def process_numbers!
+       case numbers.to_s
        when 'semi'
-         tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
+         @tokens.delete_if { |t| t =~ /\A\d+\z/ }
        when 'none'
-         tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
+         @tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") }
        when 'only'
-         only_punctuation(tokens)
+         @tokens.delete_if { |t| t =~ /\A\D+\z/ }
        end
      end

-     def only_punctuation(tokens)
-       tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
+     def remove_short_tokens!
+       @tokens.delete_if { |t| t.length < minimum_length }
      end

-     def delete_stop_words(tokens)
-       return tokens unless remove_stop_words && language_module::STOP_WORDS
-       if downcase
-         tokens.map { |t| Unicode::downcase(t) } - language_module::STOP_WORDS
-       else
-         tokens.delete_if { |t| language_module::STOP_WORDS.include?(Unicode::downcase(t)) }
+     def process_punctuation!
+       case punctuation.to_s
+       when 'semi'
+         @tokens = @tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
+       when 'none'
+         @tokens = @tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
+       when 'only'
+         @tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
        end
      end

-     def delete_en_stop_words(tokens)
-       return tokens unless remove_en_stop_words
+     def remove_stop_words!(stop_words)
        if downcase
-         tokens.map { |t| Unicode::downcase(t) } - PragmaticTokenizer::Languages::English::STOP_WORDS
+         @tokens = @tokens - stop_words
        else
-         tokens.delete_if { |t| PragmaticTokenizer::Languages::English::STOP_WORDS.include?(Unicode::downcase(t)) }
+         @tokens.delete_if { |t| stop_words.include?(Unicode::downcase(t)) }
        end
      end

-     def split_at_forward_slash(tokens)
-       tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
+     def remove_emoji!
+       @tokens.delete_if { |t| t =~ PragmaticTokenizer::Languages::Common::EMOJI_REGEX }
      end

-     def split_at_plus_sign(tokens)
-       tokens.flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
+     def remove_emails!
+       @tokens.delete_if { |t| t =~ /\S+(@|@)\S+/ }.map { |t| t.chomp('.') }
      end

-     def find_contractions(tokens)
-       return tokens unless expand_contractions && language_module::CONTRACTIONS
-       if downcase
-         tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? language_module::CONTRACTIONS[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').flatten : t }
-       else
-         tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? language_module::CONTRACTIONS[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
+     def mentions!
+       case mentions.to_s
+       when 'remove'
+         @tokens.delete_if { |t| t =~ /\A(@|@)/ }
+       when 'keep_and_clean'
+         @tokens.map! { |t| t =~ /\A(@|@)/ ? t.gsub!(/(?<=\A)(@|@)/, '') : t }
+       end
+     end
+
+     def hashtags!
+       case hashtags.to_s
+       when 'remove'
+         @tokens.delete_if { |t| t =~ /\A(#|#)/ }
+       when 'keep_and_clean'
+         @tokens.map! { |t| t =~ /\A(#|#)/ ? t.gsub!(/(?<=\A)(#|#)/, '') : t }
        end
      end
+
+     def remove_urls!
+       @tokens.delete_if { |t| t =~ /(http|https)(\.|:)/ }
+     end
+
+     def remove_domains!
+       @tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
+     end
+
+     def split_long_words!
+       @tokens.map! { |t| t.length > long_word_split ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
+         .map! { |t| t.length > long_word_split ? t.gsub(/\_/, '\1 \2').split(' ').flatten : t }
+     end
    end
- end
+ end
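
The rewrite above replaces the 0.5.0 keyword arguments with a single options hash. A minimal usage sketch, with option names and defaults taken from the doc comments in the diff (the input string and the particular option values are invented for illustration):

    require 'pragmatic_tokenizer'

    text = "Hello @someone, see http://example.com! #ruby"

    # Defaults: punctuation 'all', numbers 'all', downcase true.
    PragmaticTokenizer::Tokenizer.new(text).tokenize

    # Strip punctuation and number tokens, drop mentions, keep hashtags
    # without the '#' prefix, and remove URL tokens.
    PragmaticTokenizer::Tokenizer.new(
      text,
      punctuation:    :none,           # 'all' | 'semi' | 'none' | 'only'
      numbers:        :none,           # same four modes as :punctuation
      mentions:       :remove,         # 'keep_original' | 'keep_and_clean' | 'remove'
      hashtags:       :keep_and_clean,
      remove_urls:    true,
      minimum_length: 3                # drop tokens shorter than 3 characters
    ).tokenize

Options may be passed as Strings or Symbols, since initialize normalizes them with to_s before validating.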
data/lib/pragmatic_tokenizer/version.rb CHANGED
@@ -1,3 +1,3 @@
  module PragmaticTokenizer
-   VERSION = "0.5.0"
+   VERSION = "1.0.0"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: pragmatic_tokenizer
  version: !ruby/object:Gem::Version
-   version: 0.5.0
+   version: 1.0.0
  platform: ruby
  authors:
  - Kevin S. Dias
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2016-01-15 00:00:00.000000000 Z
+ date: 2016-01-18 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: unicode
@@ -97,6 +97,8 @@ files:
  - bin/console
  - bin/setup
  - lib/pragmatic_tokenizer.rb
+ - lib/pragmatic_tokenizer/ending_punctuation_separator.rb
+ - lib/pragmatic_tokenizer/full_stop_separator.rb
  - lib/pragmatic_tokenizer/languages.rb
  - lib/pragmatic_tokenizer/languages/arabic.rb
  - lib/pragmatic_tokenizer/languages/bulgarian.rb
@@ -123,7 +125,8 @@ files:
  - lib/pragmatic_tokenizer/languages/spanish.rb
  - lib/pragmatic_tokenizer/languages/swedish.rb
  - lib/pragmatic_tokenizer/languages/turkish.rb
- - lib/pragmatic_tokenizer/processor.rb
+ - lib/pragmatic_tokenizer/post_processor.rb
+ - lib/pragmatic_tokenizer/pre_processor.rb
  - lib/pragmatic_tokenizer/tokenizer.rb
  - lib/pragmatic_tokenizer/version.rb
  - pragmatic_tokenizer.gemspec
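
Worth noting from the tokenizer.rb diff above: when :filter_languages is given, initialize merges the stop words, abbreviations and contractions of every listed language, and any user-supplied lists are folded in on top. A sketch of that behavior (assuming 'en' and 'de' are both supported language codes, as the languages/ file list suggests):

    PragmaticTokenizer::Tokenizer.new(
      "Hello Herr Müller",
      filter_languages:  [:en, :de],    # merge English and German word lists
      remove_stop_words: true,
      stop_words:        ["herr"]       # user list merged on top of both languages'
    ).tokenize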