pragmatic_tokenizer 0.5.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +133 -151
- data/lib/pragmatic_tokenizer/ending_punctuation_separator.rb +31 -0
- data/lib/pragmatic_tokenizer/full_stop_separator.rb +38 -0
- data/lib/pragmatic_tokenizer/languages/arabic.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/bulgarian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/catalan.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/common.rb +14 -8
- data/lib/pragmatic_tokenizer/languages/czech.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/danish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/deutsch.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/dutch.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/english.rb +2 -2
- data/lib/pragmatic_tokenizer/languages/finnish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/french.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/greek.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/indonesian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/italian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/latvian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/norwegian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/persian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/polish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/portuguese.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/romanian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/russian.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/slovak.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/spanish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/swedish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages/turkish.rb +3 -3
- data/lib/pragmatic_tokenizer/languages.rb +0 -2
- data/lib/pragmatic_tokenizer/post_processor.rb +49 -0
- data/lib/pragmatic_tokenizer/{processor.rb → pre_processor.rb} +35 -98
- data/lib/pragmatic_tokenizer/tokenizer.rb +186 -159
- data/lib/pragmatic_tokenizer/version.rb +1 -1
- metadata +6 -3
@@ -1,221 +1,248 @@
|
|
1
1
|
# -*- encoding : utf-8 -*-
|
2
2
|
require 'pragmatic_tokenizer/languages'
|
3
|
+
require 'pragmatic_tokenizer/pre_processor'
|
4
|
+
require 'pragmatic_tokenizer/post_processor'
|
5
|
+
require 'pragmatic_tokenizer/full_stop_separator'
|
6
|
+
require 'pragmatic_tokenizer/ending_punctuation_separator'
|
3
7
|
require 'unicode'
|
4
8
|
|
5
9
|
module PragmaticTokenizer
|
6
10
|
class Tokenizer
|
7
11
|
|
8
|
-
attr_reader :text, :
|
9
|
-
|
12
|
+
attr_reader :text, :punctuation, :language_module, :expand_contractions, :numbers, :minimum_length, :downcase, :classic_filter, :filter_languages, :abbreviations, :contractions, :clean, :remove_stop_words, :stop_words, :remove_emoji, :remove_emails, :mentions, :hashtags, :remove_urls, :remove_domains, :long_word_split
|
13
|
+
|
14
|
+
# @param [String] text to be tokenized
|
15
|
+
# @param [Hash] opts optional arguments
|
16
|
+
|
17
|
+
# @option opts [Array] :filter_languages - user-supplied array of languages whose stop words, abbreviations and contractions should be used when calculating the resulting tokens - array elements can be Strings or Symbols
|
18
|
+
# @option opts [String] :language - two character ISO 639-1 code - can be a String or symbol (i.e. :en or 'en')
|
19
|
+
# @option opts [Boolean] :expand_contractions - (default: false)
|
20
|
+
# @option opts [Boolean] :remove_stop_words - (default: false)
|
21
|
+
# @option opts [Array] :abbreviations - user-supplied array of abbreviations (each element should be downcased with final period removed) - array elements should be of the String class
|
22
|
+
# @option opts [Array] :stop_words - user-supplied array of stop words - array elements should be of the String class
|
23
|
+
# @option opts [Hash] :contractions - user-supplied hash of contractions (key is the contracted form; value is the expanded form - both the key and value should be downcased)
|
24
|
+
# @option opts [String] :punctuation - see description below - can be a String or symbol (i.e. :none or 'none')
|
25
|
+
# Punctuation 'all': Does not remove any punctuation from the result
|
26
|
+
# Punctuation 'semi': Removes common punctuation (such as full stops)
|
27
|
+
# and does not remove less common punctuation (such as question marks)
|
28
|
+
# This is useful for text alignment as less common punctuation can help
|
29
|
+
# identify a sentence (like a fingerprint) while common punctuation
|
30
|
+
# (like stop words) should be removed.
|
31
|
+
# Punctuation 'none': Removes all punctuation from the result
|
32
|
+
# Punctuation 'only': Removes everything except punctuation. The
|
33
|
+
# returned result is an array of only the punctuation.
|
34
|
+
# @option opts [String] :numbers - see description below - can be a String or symbol (i.e. :none or 'none')
|
35
|
+
# Numbers 'all': Does not remove any numbers from the result
|
36
|
+
# Numbers 'semi': Removes tokens that include only digits
|
37
|
+
# Numbers 'none': Removes all tokens that include a number from the result (including Roman numerals)
|
38
|
+
# Numbers 'only': Removes everything except tokens that include a number
|
39
|
+
# @option opts [Integer] :minimum_length - minimum length of the token in characters
|
40
|
+
# @option opts [Integer] :long_word_split - tokens longer than this number of characters are split at any hyphen or underscore.
|
41
|
+
# @option opts [String] :mentions - :remove (will completely remove it), :keep_and_clean (will strip the leading @ prefix) and :keep_original (don't alter the token at all). - can be a String or symbol (e.g. :keep_and_clean or 'keep_and_clean')
|
42
|
+
# @option opts [String] :hashtags - :remove (will completely remove it), :keep_and_clean (will strip the leading # prefix) and :keep_original (don't alter the token at all). - can be a String or symbol (e.g. :keep_and_clean or 'keep_and_clean')
|
43
|
+
# @option opts [Boolean] :downcase - (default: true)
|
44
|
+
# @option opts [Boolean] :classic_filter - removes dots from acronyms and 's from the end of tokens - (default: false)
|
45
|
+
# @option opts [Boolean] :remove_emoji - (default: false)
|
46
|
+
# @option opts [Boolean] :remove_emails - (default: false)
|
47
|
+
# @option opts [Boolean] :remove_urls - (default: false)
|
48
|
+
# @option opts [Boolean] :remove_domains - (default: false)
|
49
|
+
|
50
|
+
def initialize(text, opts = {})
|
51
|
+
@text = CGI.unescapeHTML(text)
|
52
|
+
@filter_languages = opts[:filter_languages] || []
|
53
|
+
@language = opts[:language] || 'en'
|
54
|
+
@language_module = Languages.get_language_by_code(@language.to_s)
|
55
|
+
@expand_contractions = opts[:expand_contractions] || false
|
56
|
+
@remove_stop_words = opts[:remove_stop_words] || false
|
57
|
+
if @filter_languages.empty?
|
58
|
+
@abbreviations = opts[:abbreviations] || @language_module::ABBREVIATIONS
|
59
|
+
@contractions = opts[:contractions] || @language_module::CONTRACTIONS
|
60
|
+
@stop_words = opts[:stop_words] || @language_module::STOP_WORDS
|
61
|
+
else
|
62
|
+
merged_abbreviations = []
|
63
|
+
@filter_languages.map { |l| merged_abbreviations << Languages.get_language_by_code(l.to_s)::ABBREVIATIONS.flatten }
|
64
|
+
merged_abbreviations << opts[:abbreviations].flatten unless opts[:abbreviations].nil?
|
65
|
+
@abbreviations = merged_abbreviations.flatten
|
66
|
+
|
67
|
+
merged_contractions = {}
|
68
|
+
@filter_languages.map { |l| merged_contractions = merged_contractions.merge(Languages.get_language_by_code(l.to_s)::CONTRACTIONS) }
|
69
|
+
merged_contractions = merged_contractions.merge(opts[:contractions]) unless opts[:contractions].nil?
|
70
|
+
@contractions = merged_contractions
|
71
|
+
|
72
|
+
merged_stop_words = []
|
73
|
+
@filter_languages.map { |l| merged_stop_words << Languages.get_language_by_code(l.to_s)::STOP_WORDS.flatten }
|
74
|
+
merged_stop_words << opts[:stop_words].flatten unless opts[:stop_words].nil?
|
75
|
+
@stop_words = merged_stop_words.flatten
|
76
|
+
end
|
77
|
+
@punctuation = opts[:punctuation] || 'all'
|
78
|
+
@numbers = opts[:numbers] || 'all'
|
79
|
+
@minimum_length = opts[:minimum_length] || 0
|
80
|
+
@long_word_split = opts[:long_word_split]
|
81
|
+
@mentions = opts[:mentions] || 'keep_original'
|
82
|
+
@hashtags = opts[:hashtags] || 'keep_original'
|
83
|
+
@downcase = opts[:downcase].nil? ? true : opts[:downcase]
|
84
|
+
@clean = opts[:clean] || false
|
85
|
+
@classic_filter = opts[:classic_filter] || false
|
86
|
+
@remove_emoji = opts[:remove_emoji] || false
|
87
|
+
@remove_emails = opts[:remove_emails] || false
|
88
|
+
@remove_urls = opts[:remove_urls] || false
|
89
|
+
@remove_domains = opts[:remove_domains] || false
|
90
|
+
|
10
91
|
unless punctuation.to_s.eql?('all') ||
|
11
92
|
punctuation.to_s.eql?('semi') ||
|
12
93
|
punctuation.to_s.eql?('none') ||
|
13
94
|
punctuation.to_s.eql?('only')
|
14
95
|
raise "Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'"
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
# returned result is an array of only the punctuation.
|
96
|
+
end
|
97
|
+
unless numbers.to_s.eql?('all') ||
|
98
|
+
numbers.to_s.eql?('semi') ||
|
99
|
+
numbers.to_s.eql?('none') ||
|
100
|
+
numbers.to_s.eql?('only')
|
101
|
+
raise "Numbers argument can be only be nil, 'all', 'semi', 'none', or 'only'"
|
102
|
+
end
|
103
|
+
unless mentions.to_s.eql?('keep_original') ||
|
104
|
+
mentions.to_s.eql?('keep_and_clean') ||
|
105
|
+
mentions.to_s.eql?('remove')
|
106
|
+
raise "Mentions argument can be only be nil, 'keep_original', 'keep_and_clean', or 'remove'"
|
27
107
|
end
|
28
108
|
raise "In Pragmatic Tokenizer text must be a String" unless text.class == String
|
29
|
-
|
30
|
-
|
31
|
-
@language_module = Languages.get_language_by_code(language.to_s)
|
32
|
-
@punctuation = punctuation.to_s
|
33
|
-
@remove_stop_words = remove_stop_words
|
34
|
-
@expand_contractions = expand_contractions
|
35
|
-
@clean = clean
|
36
|
-
@remove_numbers = remove_numbers
|
37
|
-
@minimum_length = minimum_length
|
38
|
-
@remove_roman_numerals = remove_roman_numerals
|
39
|
-
@downcase = downcase
|
40
|
-
@remove_en_stop_words = remove_en_stop_words
|
109
|
+
raise "In Pragmatic Tokenizer minimum_length must be an Integer" unless minimum_length.class == Fixnum || minimum_length.nil?
|
110
|
+
raise "In Pragmatic Tokenizer long_word_split must be an Integer" unless long_word_split.class == Fixnum || long_word_split.nil?
|
41
111
|
end
|
42
112
|
|
43
113
|
def tokenize
|
44
114
|
return [] unless text
|
45
115
|
tokens = []
|
46
116
|
text.scan(/.{,10000}(?=\s|\z)/m).each do |segment|
|
47
|
-
tokens <<
|
48
|
-
delete_stop_words(
|
49
|
-
downcase_tokens(
|
50
|
-
cleaner(
|
51
|
-
remove_short_tokens(
|
52
|
-
delete_numbers(
|
53
|
-
delete_roman_numerals(
|
54
|
-
find_contractions(
|
55
|
-
remove_punctuation(
|
56
|
-
split_at_middle_period_1(
|
57
|
-
split_at_middle_period_2(
|
58
|
-
split_beginning_period(
|
59
|
-
split_at_plus_sign(
|
60
|
-
shift_no_spaces_between_sentences(
|
61
|
-
split_at_forward_slash(
|
62
|
-
processor.new(language: language_module).process(text: segment)
|
63
|
-
))))))))))))))).reject { |t| t.empty? }
|
117
|
+
tokens << post_process(PreProcessor.new(language: language_module).pre_process(text: segment))
|
64
118
|
end
|
65
119
|
tokens.flatten
|
66
120
|
end
|
67
121
|
|
68
|
-
def domains
|
69
|
-
text.split(' ').delete_if { |t| t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }.map { |t| t.chomp('.').chomp(',').chomp(';').chomp(':') }
|
70
|
-
end
|
71
|
-
|
72
|
-
def urls
|
73
|
-
text.split(' ').delete_if { |t| t !~ /(http|https|www)(\.|:)/ }.map { |t| t.chomp('.') }
|
74
|
-
end
|
75
|
-
|
76
|
-
def emails
|
77
|
-
text.split(' ').delete_if { |t| t !~ /\S+(@|@)\S+/ }.map { |t| t.chomp('.') }
|
78
|
-
end
|
79
|
-
|
80
|
-
def hashtags
|
81
|
-
text.split(' ').delete_if { |t| t !~ /(#|#)/ }.map { |t| t.chomp('.') }
|
82
|
-
end
|
83
|
-
|
84
|
-
def mentions
|
85
|
-
text.split(' ').delete_if { |t| t !~ /(@|@)/ }.map { |t| t.chomp('.') }
|
86
|
-
end
|
87
|
-
|
88
|
-
def emoticons
|
89
|
-
text.scan(/(?::|;|=)(?:-)?(?:\)|D|P)/)
|
90
|
-
end
|
91
|
-
|
92
|
-
def emoji
|
93
|
-
# https://github.com/franklsf95/ruby-emoji-regex
|
94
|
-
text.scan(/[\u{203C}\u{2049}\u{20E3}\u{2122}\u{2139}\u{2194}-\u{2199}\u{21A9}-\u{21AA}\u{231A}-\u{231B}\u{23E9}-\u{23EC}\u{23F0}\u{23F3}\u{24C2}\u{25AA}-\u{25AB}\u{25B6}\u{25C0}\u{25FB}-\u{25FE}\u{2600}-\u{2601}\u{260E}\u{2611}\u{2614}-\u{2615}\u{261D}\u{263A}\u{2648}-\u{2653}\u{2660}\u{2663}\u{2665}-\u{2666}\u{2668}\u{267B}\u{267F}\u{2693}\u{26A0}-\u{26A1}\u{26AA}-\u{26AB}\u{26BD}-\u{26BE}\u{26C4}-\u{26C5}\u{26CE}\u{26D4}\u{26EA}\u{26F2}-\u{26F3}\u{26F5}\u{26FA}\u{26FD}\u{2702}\u{2705}\u{2708}-\u{270C}\u{270F}\u{2712}\u{2714}\u{2716}\u{2728}\u{2733}-\u{2734}\u{2744}\u{2747}\u{274C}\u{274E}\u{2753}-\u{2755}\u{2757}\u{2764}\u{2795}-\u{2797}\u{27A1}\u{27B0}\u{2934}-\u{2935}\u{2B05}-\u{2B07}\u{2B1B}-\u{2B1C}\u{2B50}\u{2B55}\u{3030}\u{303D}\u{3297}\u{3299}\u{1F004}\u{1F0CF}\u{1F170}-\u{1F171}\u{1F17E}-\u{1F17F}\u{1F18E}\u{1F191}-\u{1F19A}\u{1F1E7}-\u{1F1EC}\u{1F1EE}-\u{1F1F0}\u{1F1F3}\u{1F1F5}\u{1F1F7}-\u{1F1FA}\u{1F201}-\u{1F202}\u{1F21A}\u{1F22F}\u{1F232}-\u{1F23A}\u{1F250}-\u{1F251}\u{1F300}-\u{1F320}\u{1F330}-\u{1F335}\u{1F337}-\u{1F37C}\u{1F380}-\u{1F393}\u{1F3A0}-\u{1F3C4}\u{1F3C6}-\u{1F3CA}\u{1F3E0}-\u{1F3F0}\u{1F400}-\u{1F43E}\u{1F440}\u{1F442}-\u{1F4F7}\u{1F4F9}-\u{1F4FC}\u{1F500}-\u{1F507}\u{1F509}-\u{1F53D}\u{1F550}-\u{1F567}\u{1F5FB}-\u{1F640}\u{1F645}-\u{1F64F}\u{1F680}-\u{1F68A}]/)
|
95
|
-
end
|
96
|
-
|
97
122
|
private
|
98
123
|
|
99
|
-
def
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
def shift_no_spaces_between_sentences(tokens)
|
131
|
-
tokens.flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub!(/\?/, '\1 \2').split(' ').flatten : t }
|
132
|
-
end
|
133
|
-
|
134
|
-
def downcase_tokens(tokens)
|
135
|
-
return tokens unless downcase
|
136
|
-
tokens.map { |t| Unicode::downcase(t) }
|
137
|
-
end
|
138
|
-
|
139
|
-
def remove_short_tokens(tokens)
|
140
|
-
tokens.delete_if { |t| t.length < minimum_length }
|
141
|
-
end
|
142
|
-
|
143
|
-
def delete_numbers(tokens)
|
144
|
-
return tokens unless remove_numbers
|
145
|
-
tokens.delete_if { |t| t =~ /\D*\d+\d*/ }
|
146
|
-
end
|
147
|
-
|
148
|
-
def delete_roman_numerals(tokens)
|
149
|
-
return tokens unless remove_roman_numerals
|
150
|
-
tokens.delete_if { |t| PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") } if remove_roman_numerals
|
124
|
+
def post_process(text)
|
125
|
+
@tokens = PostProcessor.new(text: text, abbreviations: abbreviations).post_process
|
126
|
+
downcase! if downcase
|
127
|
+
expand_contractions!(contractions) if expand_contractions
|
128
|
+
clean! if clean
|
129
|
+
classic_filter! if classic_filter
|
130
|
+
process_numbers!
|
131
|
+
remove_short_tokens! if minimum_length > 0
|
132
|
+
process_punctuation!
|
133
|
+
remove_stop_words!(stop_words) if remove_stop_words
|
134
|
+
remove_emoji! if remove_emoji
|
135
|
+
remove_emails! if remove_emails
|
136
|
+
mentions! if mentions
|
137
|
+
hashtags! if hashtags
|
138
|
+
remove_urls! if remove_urls
|
139
|
+
remove_domains! if remove_domains
|
140
|
+
split_long_words! if long_word_split
|
141
|
+
@tokens.reject { |t| t.empty? }
|
142
|
+
end
|
143
|
+
|
144
|
+
def downcase!
|
145
|
+
@tokens.map! { |t| Unicode::downcase(t) }
|
146
|
+
end
|
147
|
+
|
148
|
+
def expand_contractions!(contractions)
|
149
|
+
if downcase
|
150
|
+
@tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').flatten : t }
|
151
|
+
else
|
152
|
+
@tokens = @tokens.flat_map { |t| contractions.has_key?(Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))) ? contractions[Unicode::downcase(t.gsub(/[‘’‚‛‹›'´`]/, "'"))].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
|
153
|
+
end
|
151
154
|
end
|
152
155
|
|
153
|
-
def
|
154
|
-
|
155
|
-
tokens.flat_map { |t| t =~ /(\A|\s)\@/ ? t.gsub!(/\@/, '\1 ').split(' ').flatten : t }
|
156
|
-
.flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
|
156
|
+
def clean!
|
157
|
+
@tokens = @tokens.flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
|
157
158
|
.flat_map { |t| t =~ /\_+(?=\s)/ ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
|
158
159
|
.flat_map { |t| t =~ /(?<=\A)\_+/ ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
|
159
160
|
.flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
|
160
161
|
.flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
|
162
|
+
.map { |t| t.gsub(/[[:cntrl:]]/, '') }
|
161
163
|
.delete_if { |t| t =~ /\A-+\z/ ||
|
162
164
|
PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
|
163
165
|
t =~ /\A\.{2,}\z/ || t.include?("\\") ||
|
164
166
|
t.length > 50 ||
|
165
|
-
(t.length > 1 && t =~ /[
|
167
|
+
(t.length > 1 && t =~ /[&*+<=>^|~]/i)
|
166
168
|
}
|
167
169
|
end
|
168
170
|
|
169
|
-
def
|
170
|
-
|
171
|
-
|
172
|
-
|
171
|
+
def classic_filter!
|
172
|
+
@tokens.map! { |t| abbreviations.include?(t.chomp(".")) ? t.gsub('.', '').chomp("'s").chomp("’s").chomp("`s") : t.chomp("'s").chomp("’s").chomp("`s") }
|
173
|
+
end
|
174
|
+
|
175
|
+
def process_numbers!
|
176
|
+
case numbers.to_s
|
173
177
|
when 'semi'
|
174
|
-
tokens
|
178
|
+
@tokens.delete_if { |t| t =~ /\A\d+\z/ }
|
175
179
|
when 'none'
|
176
|
-
tokens.delete_if { |t| t =~ /\
|
180
|
+
@tokens.delete_if { |t| t =~ /\D*\d+\d*/ || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?(Unicode::downcase(t)) || PragmaticTokenizer::Languages::Common::ROMAN_NUMERALS.include?("#{Unicode::downcase(t)}.") }
|
177
181
|
when 'only'
|
178
|
-
|
182
|
+
@tokens.delete_if { |t| t =~ /\A\D+\z/ }
|
179
183
|
end
|
180
184
|
end
|
181
185
|
|
182
|
-
def
|
183
|
-
tokens.delete_if { |t|
|
186
|
+
def remove_short_tokens!
|
187
|
+
@tokens.delete_if { |t| t.length < minimum_length }
|
184
188
|
end
|
185
189
|
|
186
|
-
def
|
187
|
-
|
188
|
-
|
189
|
-
tokens
|
190
|
-
|
191
|
-
tokens.delete_if { |t|
|
190
|
+
def process_punctuation!
|
191
|
+
case punctuation.to_s
|
192
|
+
when 'semi'
|
193
|
+
@tokens = @tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
|
194
|
+
when 'none'
|
195
|
+
@tokens = @tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
|
196
|
+
when 'only'
|
197
|
+
@tokens.delete_if { |t| !PragmaticTokenizer::Languages::Common::PUNCTUATION.include?(t) }
|
192
198
|
end
|
193
199
|
end
|
194
200
|
|
195
|
-
def
|
196
|
-
return tokens unless remove_en_stop_words
|
201
|
+
def remove_stop_words!(stop_words)
|
197
202
|
if downcase
|
198
|
-
tokens
|
203
|
+
@tokens = @tokens - stop_words
|
199
204
|
else
|
200
|
-
tokens.delete_if { |t|
|
205
|
+
@tokens.delete_if { |t| stop_words.include?(Unicode::downcase(t)) }
|
201
206
|
end
|
202
207
|
end
|
203
208
|
|
204
|
-
def
|
205
|
-
tokens.
|
209
|
+
def remove_emoji!
|
210
|
+
@tokens.delete_if { |t| t =~ PragmaticTokenizer::Languages::Common::EMOJI_REGEX }
|
206
211
|
end
|
207
212
|
|
208
|
-
def
|
209
|
-
tokens.
|
213
|
+
def remove_emails!
|
214
|
+
@tokens.delete_if { |t| t =~ /\S+(@|@)\S+/ }.map { |t| t.chomp('.') }
|
210
215
|
end
|
211
216
|
|
212
|
-
def
|
213
|
-
|
214
|
-
|
215
|
-
tokens.
|
216
|
-
|
217
|
-
tokens.
|
217
|
+
def mentions!
|
218
|
+
case mentions.to_s
|
219
|
+
when 'remove'
|
220
|
+
@tokens.delete_if { |t| t =~ /\A(@|@)/ }
|
221
|
+
when 'keep_and_clean'
|
222
|
+
@tokens.map! { |t| t =~ /\A(@|@)/ ? t.gsub!(/(?<=\A)(@|@)/, '') : t }
|
223
|
+
end
|
224
|
+
end
|
225
|
+
|
226
|
+
def hashtags!
|
227
|
+
case hashtags.to_s
|
228
|
+
when 'remove'
|
229
|
+
@tokens.delete_if { |t| t =~ /\A(#|#)/ }
|
230
|
+
when 'keep_and_clean'
|
231
|
+
@tokens.map! { |t| t =~ /\A(#|#)/ ? t.gsub!(/(?<=\A)(#|#)/, '') : t }
|
218
232
|
end
|
219
233
|
end
|
234
|
+
|
235
|
+
def remove_urls!
|
236
|
+
@tokens.delete_if { |t| t =~ /(http|https)(\.|:)/ }
|
237
|
+
end
|
238
|
+
|
239
|
+
def remove_domains!
|
240
|
+
@tokens.delete_if { |t| t =~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix }
|
241
|
+
end
|
242
|
+
|
243
|
+
def split_long_words!
|
244
|
+
@tokens.map! { |t| t.length > long_word_split ? t.gsub(/\-/, '\1 \2').split(' ').flatten : t }
|
245
|
+
.map! { |t| t.length > long_word_split ? t.gsub(/\_/, '\1 \2').split(' ').flatten : t }
|
246
|
+
end
|
220
247
|
end
|
221
|
-
end
|
248
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-01-
|
11
|
+
date: 2016-01-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: unicode
|
@@ -97,6 +97,8 @@ files:
|
|
97
97
|
- bin/console
|
98
98
|
- bin/setup
|
99
99
|
- lib/pragmatic_tokenizer.rb
|
100
|
+
- lib/pragmatic_tokenizer/ending_punctuation_separator.rb
|
101
|
+
- lib/pragmatic_tokenizer/full_stop_separator.rb
|
100
102
|
- lib/pragmatic_tokenizer/languages.rb
|
101
103
|
- lib/pragmatic_tokenizer/languages/arabic.rb
|
102
104
|
- lib/pragmatic_tokenizer/languages/bulgarian.rb
|
@@ -123,7 +125,8 @@ files:
|
|
123
125
|
- lib/pragmatic_tokenizer/languages/spanish.rb
|
124
126
|
- lib/pragmatic_tokenizer/languages/swedish.rb
|
125
127
|
- lib/pragmatic_tokenizer/languages/turkish.rb
|
126
|
-
- lib/pragmatic_tokenizer/
|
128
|
+
- lib/pragmatic_tokenizer/post_processor.rb
|
129
|
+
- lib/pragmatic_tokenizer/pre_processor.rb
|
127
130
|
- lib/pragmatic_tokenizer/tokenizer.rb
|
128
131
|
- lib/pragmatic_tokenizer/version.rb
|
129
132
|
- pragmatic_tokenizer.gemspec
|