pragmatic_tokenizer 0.5.0 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +133 -151
- data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +31 -0
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +38 -0
- data/lib/pragmatic_tokenizer/languages/arabic.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/catalan.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/common.rb +14 -8
- data/lib/pragmatic_tokenizer/languages/czech.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/danish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/dutch.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/english.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/finnish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/french.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/greek.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/italian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/latvian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/persian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/polish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/romanian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/slovak.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/swedish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/turkish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages.rb +0 -2
- data/lib/pragmatic_tokenizer/post_processor.rb +49 -0
- data/lib/pragmatic_tokenizer/{processor.rb → pre_processor.rb} +35 -98
- data/lib/pragmatic_tokenizer/tokenizer.rb +186 -159
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +6 -3
@@ -1,221 +1,248 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
2
|
require 'pragmatic_tokenizer/languages'
|
3
|
+
require 'pragmatic_tokenizer/pre_processor'
|
4
|
+
require 'pragmatic_tokenizer/post_processor'
|
5
|
+
require 'pragmatic_tokenizer/full_stop_separator'
|
6
|
+
require 'pragmatic_tokenizer/ending_punctuation_separator'
|
3
7
|
require 'unicode'
|
4
8
|
|
5
9
|
module PragmaticTokenizer
|
6
10
|
class Tokenizer
|
7
11
|
|
8
|
-
attr_reader :text, :
|
9
|
-
|
12
|
+
attr_reader :text, :punctuation, :language_module, :expand_contractions, :numbers, :minimum_length, :downcase, :classic_filter, :filter_languages, :abbreviations, :contractions, :clean, :remove_stop_words, :stop_words, :remove_emoji, :remove_emails, :mentions, :hashtags, :remove_urls, :remove_domains, :long_word_split
|
13
|
+
|
14
|
+
# @param [String] text to be tokenized
|
15
|
+
# @param [Hash] opts optional arguments
|
16
|
+
|
17
|
+
# @option opts [Array] :filter_languages - user-supplied array of language codes; the stop words, abbreviations and contractions of every listed language are used when calculating the resulting tokens - array elements can be Strings or Symbols
|
18
|
+
# @option opts [String] :language - two character ISO 639-1 code - can be a String or symbol (i.e. :en or 'en')
|
19
|
+
# @option opts [Boolean] :expand_contractions - (default: false)
|
20
|
+
# @option opts [Boolean] :remove_stop_words - (default: false)
|
21
|
+
# @option opts [Array] :abbreviations - user-supplied array of abbreviations (each element should be downcased with final period removed) - array elements should be of the String class
|
22
|
+
# @option opts [Array] :stop_words - user-supplied array of stop words - array elements should be of the String class
|
23
|
+
# @option opts [Hash] :contractions - user-supplied hash of contractions (key is the contracted form; value is the expanded form - both the key and value should be downcased)
|
24
|
+
# @option opts [String] :punctuation - see description below - can be a String or symbol (i.e. :none or 'none')
|
25
|
+
# Punctuation 'all': Does not remove any punctuation from the result
|
26
|
+
# Punctuation 'semi': Removes common punctuation (such as full stops)
|
27
|
+
# and does not remove less common punctuation (such as question marks)
|
28
|
+
# This is useful for text alignment as less common punctuation can help
|
29
|
+
# identify a sentence (like a fingerprint) while common punctuation
|
30
|
+
# (like stop words) should be removed.
|
31
|
+
# Punctuation 'none': Removes all punctuation from the result
|
32
|
+
# Punctuation 'only': Removes everything except punctuation. The
|
33
|
+
# returned result is an array of only the punctuation.
|
34
|
+
# @option opts [String] :numbers - see description below - can be a String or symbol (i.e. :none or 'none')
|
35
|
+
# Numbers 'all': Does not remove any numbers from the result
|
36
|
+
# Numbers 'semi': Removes tokens that include only digits
|
37
|
+
# Numbers 'none': Removes all tokens that include a number from the result (including Roman numerals)
|
38
|
+
# Numbers 'only': Removes everything except tokens that include a number
|
39
|
+
# @option opts [Integer] :minimum_length - minimum length of the token in characters
|
40
|
+
# @option opts [Integer] :long_word_split - tokens longer than this many characters are split at any hyphen or underscore
|
41
|
+
# @option opts [String] :mentions - :remove (will completely remove the token), :keep_and_clean (will keep the token but remove its leading @ prefix) and :keep_original (don't alter the token at all). - can be a String or symbol (i.e. :keep_and_clean or 'keep_and_clean')
|
42
|
+
# @option opts [String] :hashtags - :remove (will completely remove the token), :keep_and_clean (will keep the token but remove its leading # prefix) and :keep_original (don't alter the token at all). - can be a String or symbol (i.e. :keep_and_clean or 'keep_and_clean')
|
43
|
+
# @option opts [Boolean] :downcase - (default: true)
|
44
|
+
# @option opts [Boolean] :classic_filter - removes dots from acronyms and 's from the end of tokens - (default: false)
|
45
|
+
# @option opts [Boolean] :remove_emoji - (default: false)
|
46
|
+
# @option opts [Boolean] :remove_emails - (default: false)
|
47
|
+
# @option opts [Boolean] :remove_urls - (default: false)
|
48
|
+
# @option opts [Boolean] :remove_domains - (default: false)
|
49
|
+
|
50
|
+
def initialize(text, opts = {})
|
51
|
+
@text = CGI.unescapeHTML(text)
|
52
|
+
@filter_languages = opts[:filter_languages] || []
|
53
|
+
@language = opts[:language] || 'en'
|
54
|
+
@language_module = Languages.get_language_by_code(@language.to_s)
|
55
|
+
@expand_contractions = opts[:expand_contractions] || false
|
56
|
+
@remove_stop_words = opts[:remove_stop_words] || false
|
57
|
+
if @filter_languages.empty?
|
58
|
+
@abbreviations = opts[:abbreviations] || @language_module::ABBREVIATIONS
|
59
|
+
@contractions = opts[:contractions] || @language_module::CONTRACTIONS
|
60
|
+
@stop_words = opts[:stop_words] || @language_module::STOP_WORDS
|
61
|
+
else
|
62
|
+
merged_abbreviations = []
|
63
|
+
@filter_languages.map { |l| merged_abbreviations << Languages.get_language_by_code(l.to_s)::ABBREVIATIONS.flatten }
|
64
|
+
merged_abbreviations << opts[:abbreviations].flatten unless opts[:abbreviations].nil?
|
65
|
+
@abbreviations = merged_abbreviations.flatten
|
66
|
+
|
67
|
+
merged_contractions = {}
|
68
|
+
@filter_languages.map { |l| merged_contractions = merged_contractions.merge(Languages.get_language_by_code(l.to_s)::CONTRACTIONS) }
|
69
|
+
merged_contractions = merged_contractions.merge(opts[:contractions]) unless opts[:contractions].nil?
|
70
|
+
@contractions = merged_contractions
|
71
|
+
|
72
|
+
merged_stop_words = []
|
73
|
+
@filter_languages.map { |l| merged_stop_words << Languages.get_language_by_code(l.to_s)::STOP_WORDS.flatten }
|
74
|
+
merged_stop_words << opts[:stop_words].flatten unless opts[:stop_words].nil?
|
75
|
+
@stop_words = merged_stop_words.flatten
|
76
|
+
end
|
77
|
+
@punctuation = opts[:punctuation] || 'all'
|
78
|
+
@numbers = opts[:numbers] || 'all'
|
79
|
+
@minimum_length = opts[:minimum_length] || 0
|
80
|
+
@long_word_split = opts[:long_word_split]
|
81
|
+
@mentions = opts[:mentions] || 'keep_original'
|
82
|
+
@hashtags = opts[:hashtags] || 'keep_original'
|
83
|
+
@downcase = opts[:downcase].nil? ? true : opts[:downcase]
|
84
|
+
@clean = opts[:clean] || false
|
85
|
+
@classic_filter = opts[:classic_filter] || false
|
86
|
+
@remove_emoji = opts[:remove_emoji] || false
|
87
|
+
@remove_emails = opts[:remove_emails] || false
|
88
|
+
@remove_urls = opts[:remove_urls] || false
|
89
|
+
@remove_domains = opts[:remove_domains] || false
|
90
|
+
|
10
91
|
unless punctuation.to_s.eql?('all') ||
|
11
92
|
punctuation.to_s.eql?('semi') ||
|
12
93
|
punctuation.to_s.eql?('none') ||
|
13
94
|
punctuation.to_s.eql?('only')
|
14
95
|
raise "Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'"
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
# returned result is an array of only the punctuation.
|
96
|
+
end
|
97
|
+
unless numbers.to_s.eql?('all') ||
|
98
|
+
numbers.to_s.eql?('semi') ||
|
99
|
+
numbers.to_s.eql?('none') ||
|
100
|
+
numbers.to_s.eql?('only')
|
101
|
+
raise "Numbers argument can be only be nil, 'all', 'semi', 'none', or 'only'"
|
102
|
+
end
|
103
|
+
unless mentions.to_s.eql?('keep_original') ||
|
104
|
+
mentions.to_s.eql?('keep_and_clean') ||
|
105
|
+
mentions.to_s.eql?('remove')
|
106
|
+
raise "Mentions argument can be only be nil, 'keep_original', 'keep_and_clean', or 'remove'"
|
27
107
|
end
|
28
108
|
raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
|
29
|
-
|
30
|
-
|
31
|
-
@language_module = Languages.get_language_by_code(language.to_s)
|
32
|
-
@punctuation = punctuation.to_s
|
33
|
-
@remove_stop_words = remove_stop_words
|
34
|
-
@expand_contractions = expand_contractions
|
35
|
-
@clean = clean
|
36
|
-
@remove_numbers = remove_numbers
|
37
|
-
@minimum_length = minimum_length
|
38
|
-
@remove_roman_numerals = remove_roman_numerals
|
39
|
-
@downcase = downcase
|
40
|
-
@remove_en_stop_words = remove_en_stop_words
|
109
|
+
raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless minimum_length.class == Fixnum || minimum_length.nil?
|
110
|
+
raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless long_word_split.class == Fixnum || long_word_split.nil?
|
41
111
|
end
|
42
112
|
|
43
113
|
def tokenize
|
44
114
|
return [] unless text
|
45
115
|
tokens = []
|
46
116
|
text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
|
47
|
-
tokens <<
|
48
|
-
delete_stop_words(
|
49
|
-
downcase_tokens(
|
50
|
-
cleaner(
|
51
|
-
remove_short_tokens(
|
52
|
-
delete_numbers(
|
53
|
-
delete_roman_numerals(
|
54
|
-
find_contractions(
|
55
|
-
remove_punctuation(
|
56
|
-
split_at_middle_period_1(
|
57
|
-
split_at_middle_period_2(
|
58
|
-
split_beginning_period(
|
59
|
-
split_at_plus_sign(
|
60
|
-
shift_no_spaces_between_sentences(
|
61
|
-
split_at_forward_slash(
|
62
|
-
processor.new(language: language_module).process(text: segment)
|
63
|
-
))))))))))))))).reject { |t| t.empty? }
|
117
|
+
tokens << post_process(PreProcessor.new(language: language_module).pre_process(text: segment))
|
64
118
|
end
|
65
119
|
tokens.flatten
|
66
120
|
end
|
67
121
|
|
68
|
-
def domains
|
69
|
-
text.split(' ').delete_if { |t| t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }.map { |t| t.chomp('.').chomp(',').chomp(';').chomp(':') }
|
70
|
-
end
|
71
|
-
|
72
|
-
def urls
|
73
|
-
text.split(' ').delete_if { |t| t !~ /(http|https|www)(\.|:)/ }.map { |t| t.chomp('.') }
|
74
|
-
end
|
75
|
-
|
76
|
-
def emails
|
77
|
-
text.split(' ').delete_if { |t| t !~ /\S+(@|@)\S+/ }.map { |t| t.chomp('.') }
|
78
|
-
end
|
79
|
-
|
80
|
-
def hashtags
|
81
|
-
text.split(' ').delete_if { |t| t !~ /(#|#)/ }.map { |t| t.chomp('.') }
|
82
|
-
end
|
83
|
-
|
84
|
-
def mentions
|
85
|
-
text.split(' ').delete_if { |t| t !~ /(@|@)/ }.map { |t| t.chomp('.') }
|
86
|
-
end
|
87
|
-
|
88
|
-
def emoticons
|
89
|
-
text.scan(/(?::|;|=)(?:-)?(?:\)|D|P)/)
|
90
|
-
end
|
91
|
-
|
92
|
-
def emoji
|
93
|
-
# https://github.com/franklsf95/ruby-emoji-regex
|
94
|
-
text.scan(/[\u{203C}\u{2049}\u{20E3}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{23E9}-\u{23EC}\u{23F0}\u{23F3}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2601}\u{260E}\u{2611}\u{2614}-\u{2615}\u{261D}\u{263A}\u{2648}-\u{2653}\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267F}\u{2693}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26CE}\u{26D4}\u{26EA}\u{26F2}-\u{26F3}\u{26F5}\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270C}\u{270F}\u{2712}\u{2714}\u{2716}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E7}-\u{1F1EC}\u{1F1EE}-\u{1F1F0}\u{1F1F3}\u{1F1F5}\u{1F1F7}-\u{1F1FA}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F320}\u{1F330}-\u{1F335}\u{1F337}-\u{1F37C}\u{1F380}-\u{1F393}\u{1F3A0}-\u{1F3C4}\u{1F3C6}-\u{1F3CA}\u{1F3E0}-\u{1F3F0}\u{1F400}-\u{1F43E}\u{1F440}\u{1F442}-\u{1F4F7}\u{1F4F9}-\u{1F4FC}\u{1F500}-\u{1F507}\u{1F509}-\u{1F53D}\u{1F550}-\u{1F567}\u{1F5FB}-\u{1F640}\u{1F645}-\u{1F64F}\u{1F680}-\u{1F68A}]/)
|
95
|
-
end
|
96
|
-
|
97
122
|
private
|
98
123
|
|
99
|
-
def
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
def shift_no_spaces_between_sentences(tokens)
|
131
|
-
tokens.flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub!(/\?/, '\1 \2').split(' ').flatten : t }
|
132
|
-
end
|
133
|
-
|
134
|
-
def downcase_tokens(tokens)
|
135
|
-
return tokens unless downcase
|
136
|
-
tokens.map { |t| Unicode::downcase(t) }
|
137
|
-
end
|
138
|
-
|
139
|
-
def remove_short_tokens(tokens)
|
140
|
-
tokens.delete_if { |t| t.length < minimum_length }
|
141
|
-
end
|
142
|
-
|
143
|
-
def delete_numbers(tokens)
|
144
|
-
return tokens unless remove_numbers
|
145
|
-
tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
|
146
|
-
end
|
147
|
-
|
148
|
-
def delete_roman_numerals(tokens)
|
149
|
-
return tokens unless remove_roman_numerals
|
150
|
-
tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") } if remove_roman_numerals
|
124
|
+
def post_process(text)
|
125
|
+
@tokens = PostProcessor.new(text: text, abbreviations: abbreviations).post_process
|
126
|
+
downcase! if downcase
|
127
|
+
expand_contractions!(contractions) if expand_contractions
|
128
|
+
clean! if clean
|
129
|
+
classic_filter! if classic_filter
|
130
|
+
process_numbers!
|
131
|
+
remove_short_tokens! if minimum_length > 0
|
132
|
+
process_punctuation!
|
133
|
+
remove_stop_words!(stop_words) if remove_stop_words
|
134
|
+
remove_emoji! if remove_emoji
|
135
|
+
remove_emails! if remove_emails
|
136
|
+
mentions! if mentions
|
137
|
+
hashtags! if hashtags
|
138
|
+
remove_urls! if remove_urls
|
139
|
+
remove_domains! if remove_domains
|
140
|
+
split_long_words! if long_word_split
|
141
|
+
@tokens.reject { |t| t.empty? }
|
142
|
+
end
|
143
|
+
|
144
|
+
def downcase!
|
145
|
+
@tokens.map! { |t| Unicode::downcase(t) }
|
146
|
+
end
|
147
|
+
|
148
|
+
def expand_contractions!(contractions)
|
149
|
+
if downcase
|
150
|
+
@tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').flatten : t }
|
151
|
+
else
|
152
|
+
@tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
|
153
|
+
end
|
151
154
|
end
|
152
155
|
|
153
|
-
def
|
154
|
-
|
155
|
-
tokens.flat_map { |t| t =~ /(\A|\s)\@/ ? t.gsub!(/\@/, '\1 ').split(' ').flatten : t }
|
156
|
-
.flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
|
156
|
+
def clean!
|
157
|
+
@tokens = @tokens.flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
|
157
158
|
.flat_map { |t| t =~ /\_+(?=\s)/ ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
|
158
159
|
.flat_map { |t| t =~ /(?<=\A)\_+/ ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
|
159
160
|
.flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
|
160
161
|
.flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
|
162
|
+
.map { |t| t.gsub(/[[:cntrl:]]/, '') }
|
161
163
|
.delete_if { |t| t =~ /\A-+\z/ ||
|
162
164
|
PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
|
163
165
|
t =~ /\A\.{2,}\z/ || t.include?("\\") ||
|
164
166
|
t.length > 50 ||
|
165
|
-
(t.length > 1 && t =~ /[
|
167
|
+
(t.length > 1 && t =~ /[&*+<=>^|~]/i)
|
166
168
|
}
|
167
169
|
end
|
168
170
|
|
169
|
-
def
|
170
|
-
|
171
|
-
|
172
|
-
|
171
|
+
def classic_filter!
|
172
|
+
@tokens.map! { |t| abbreviations.include?(t.chomp(".")) ? t.gsub('.', '').chomp("'s").chomp("’s").chomp("`s") : t.chomp("'s").chomp("’s").chomp("`s") }
|
173
|
+
end
|
174
|
+
|
175
|
+
def process_numbers!
|
176
|
+
case numbers.to_s
|
173
177
|
when 'semi'
|
174
|
-
tokens
|
178
|
+
@tokens.delete_if { |t| t =~ /\A\d+\z/ }
|
175
179
|
when 'none'
|
176
|
-
tokens.delete_if { |t| t =~ /\
|
180
|
+
@tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") }
|
177
181
|
when 'only'
|
178
|
-
|
182
|
+
@tokens.delete_if { |t| t =~ /\A\D+\z/ }
|
179
183
|
end
|
180
184
|
end
|
181
185
|
|
182
|
-
def
|
183
|
-
tokens.delete_if { |t|
|
186
|
+
def remove_short_tokens!
|
187
|
+
@tokens.delete_if { |t| t.length < minimum_length }
|
184
188
|
end
|
185
189
|
|
186
|
-
def
|
187
|
-
|
188
|
-
|
189
|
-
tokens
|
190
|
-
|
191
|
-
tokens.delete_if { |t|
|
190
|
+
def process_punctuation!
|
191
|
+
case punctuation.to_s
|
192
|
+
when 'semi'
|
193
|
+
@tokens = @tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
|
194
|
+
when 'none'
|
195
|
+
@tokens = @tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
|
196
|
+
when 'only'
|
197
|
+
@tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
|
192
198
|
end
|
193
199
|
end
|
194
200
|
|
195
|
-
def
|
196
|
-
return tokens unless remove_en_stop_words
|
201
|
+
def remove_stop_words!(stop_words)
|
197
202
|
if downcase
|
198
|
-
tokens
|
203
|
+
@tokens = @tokens - stop_words
|
199
204
|
else
|
200
|
-
tokens.delete_if { |t|
|
205
|
+
@tokens.delete_if { |t| stop_words.include?(Unicode::downcase(t)) }
|
201
206
|
end
|
202
207
|
end
|
203
208
|
|
204
|
-
def
|
205
|
-
tokens.
|
209
|
+
def remove_emoji!
|
210
|
+
@tokens.delete_if { |t| t =~ PragmaticTokenizer::Languages::Common::EMOJI_REGEX }
|
206
211
|
end
|
207
212
|
|
208
|
-
def
|
209
|
-
tokens.
|
213
|
+
def remove_emails!
|
214
|
+
@tokens.delete_if { |t| t =~ /\S+(@|@)\S+/ }.map { |t| t.chomp('.') }
|
210
215
|
end
|
211
216
|
|
212
|
-
def
|
213
|
-
|
214
|
-
|
215
|
-
tokens.
|
216
|
-
|
217
|
-
tokens.
|
217
|
+
def mentions!
|
218
|
+
case mentions.to_s
|
219
|
+
when 'remove'
|
220
|
+
@tokens.delete_if { |t| t =~ /\A(@|@)/ }
|
221
|
+
when 'keep_and_clean'
|
222
|
+
@tokens.map! { |t| t =~ /\A(@|@)/ ? t.gsub!(/(?<=\A)(@|@)/, '') : t }
|
223
|
+
end
|
224
|
+
end
|
225
|
+
|
226
|
+
def hashtags!
|
227
|
+
case hashtags.to_s
|
228
|
+
when 'remove'
|
229
|
+
@tokens.delete_if { |t| t =~ /\A(#|#)/ }
|
230
|
+
when 'keep_and_clean'
|
231
|
+
@tokens.map! { |t| t =~ /\A(#|#)/ ? t.gsub!(/(?<=\A)(#|#)/, '') : t }
|
218
232
|
end
|
219
233
|
end
|
234
|
+
|
235
|
+
def remove_urls!
|
236
|
+
@tokens.delete_if { |t| t =~ /(http|https)(\.|:)/ }
|
237
|
+
end
|
238
|
+
|
239
|
+
def remove_domains!
|
240
|
+
@tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
|
241
|
+
end
|
242
|
+
|
243
|
+
def split_long_words!
|
244
|
+
@tokens.map! { |t| t.length > long_word_split ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
|
245
|
+
.map! { |t| t.length > long_word_split ? t.gsub(/\_/, '\1 \2').split(' ').flatten : t }
|
246
|
+
end
|
220
247
|
end
|
221
|
-
end
|
248
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-01-
|
11
|
+
date: 2016-01-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|
@@ -97,6 +97,8 @@ files:
|
|
97
97
|
- bin/console
|
98
98
|
- bin/setup
|
99
99
|
- lib/pragmatic_tokenizer.rb
|
100
|
+
- lib/pragmatic_tokenizer/ending_punctuation_separator.rb
|
101
|
+
- lib/pragmatic_tokenizer/full_stop_separator.rb
|
100
102
|
- lib/pragmatic_tokenizer/languages.rb
|
101
103
|
- lib/pragmatic_tokenizer/languages/arabic.rb
|
102
104
|
- lib/pragmatic_tokenizer/languages/bulgarian.rb
|
@@ -123,7 +125,8 @@ files:
|
|
123
125
|
- lib/pragmatic_tokenizer/languages/spanish.rb
|
124
126
|
- lib/pragmatic_tokenizer/languages/swedish.rb
|
125
127
|
- lib/pragmatic_tokenizer/languages/turkish.rb
|
126
|
-
- lib/pragmatic_tokenizer/
|
128
|
+
- lib/pragmatic_tokenizer/post_processor.rb
|
129
|
+
- lib/pragmatic_tokenizer/pre_processor.rb
|
127
130
|
- lib/pragmatic_tokenizer/tokenizer.rb
|
128
131
|
- lib/pragmatic_tokenizer/version.rb
|
129
132
|
- pragmatic_tokenizer.gemspec
|