pragmatic_tokenizer 0.2.4 → 0.3.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: db86f5c14e05efabcaedc5d86d730445db7aaed2
- data.tar.gz: 87e8cb36fffffef3424f540e5cb8c044b0acf2f9
+ metadata.gz: 414905d0126493ffc7224055dd2f79010061662c
+ data.tar.gz: 2b0995cc2b16cef7f7a521a65f90011118fd70f7
  SHA512:
- metadata.gz: 2d135b1814bd5385c699d2e6732dc51013bd5fe7e9a1d55e83ac34b37f91f0cfbc97855288c1ce29af3866cbf8a7f561c13c33376a73f929abebbe5f2392cc65
- data.tar.gz: 9ef4499aaf6d48df889c069f00d3d56fb3984bbf03f92e7cf83e64f62cce814ec0cecfda36cdffdf5e461d4a28c20176520454b3e998a216cbf6b5617727e0da
+ metadata.gz: 62772587ff880bd192c504f9f319e1dd9e11c89bac1e37e67ef95d3f42f451fb3a770f5c57eb7152919c301e49e4d5c06a71001b27781e8ca5228bf4ab29c082
+ data.tar.gz: 977cc2c5fd69d0bca8e619860ee78e1161cb509da734ad0290b7dd54401822f399315d7e10606f8f3750a53314167568af6beccd688870bdacc04832c150d606
@@ -1,8 +1,8 @@
  module PragmaticTokenizer
  module Languages
  module Common
- PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^']
- PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚" }
+ PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^', '”']
+ PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘" }
  SEMI_PUNCTUATION = ['。', '.', '.']
  ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
  SPECIAL_CHARACTERS = ['®', '©', '™']
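
The only functional change in this hunk is the right curly double quote: ” joins PUNCTUATION, and PUNCTUATION_MAP gains the placeholder ⚘ for it, so the tokenizer can park the character during processing and restore it afterwards. A minimal sketch of the round trip (constant names are from the hunk above; the sample lookup is illustrative):

    require 'pragmatic_tokenizer'

    map = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
    placeholder = map['”']   # => "⚘", the mapping added in this release
    map.key(placeholder)     # => "”", Hash#key reverses the lookup
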
@@ -15,9 +15,8 @@ module PragmaticTokenizer
  shift_colon(text)
  shift_bracket(text)
  shift_semicolon(text)
- shift_underscore(text)
- shift_asterisk(text)
- shift_at_symbol(text)
+ shift_caret(text)
+ shift_vertical_bar(text)
  convert_dbl_quotes(text)
  convert_sgl_quotes(text)
  shift_beginning_hyphen(text)
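
Here the text-level pipeline drops the underscore, asterisk, and @-sign shifts (that handling reappears token-by-token in the tokenizer's cleaner, later in this diff) and picks up two new shifts for carets and vertical bars. What the two new calls do to a sample string (illustrative input):

    text = '2^10|1024'.dup
    text.gsub!(/\^/, ' ^ ')  # shift_caret pads carets with spaces
    text.gsub!(/\|/, ' | ')  # shift_vertical_bar does the same for pipes
    text                     # => "2 ^ 10 | 1024"
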
@@ -35,8 +34,10 @@ module PragmaticTokenizer
  def convert_dbl_quotes(text)
  # Convert left double quotes to special character
  text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+ text.gsub!(/“(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['“'] + ' ') || text
  # Convert remaining quotes to special character
  text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+ text.gsub!(/”/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['”'] + ' ') || text
  end

  def convert_sgl_quotes(text)
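
The two added lines extend convert_dbl_quotes to curly quotes: an opening “ followed somewhere by a word character becomes the ⚃ placeholder, and any remaining ” becomes the new ⚘, each padded with spaces. Condensed (the /o flag and the full constant path from the hunk are shortened; the sample text is illustrative):

    map = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
    text = '“Hi”'.dup
    text.gsub!(/“(?=.*\w)/, " #{map['“']} ")
    text.gsub!(/”/, " #{map['”']} ")
    text # => " ⚃ Hi ⚘ "
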
@@ -51,6 +52,10 @@ module PragmaticTokenizer
  text.gsub!(/--+/o, ' - ') || text
  end

+ def shift_vertical_bar(text)
+ text.gsub!(/\|/, ' | ') || text
+ end
+
  def shift_comma(text)
  # Shift commas off everything but numbers
  text.gsub!(/,(?!\d)/o, ' , ') || text
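
The new shift_vertical_bar follows this file's gsub!-then-|| text idiom: String#gsub! returns nil when nothing matched, so the trailing || text keeps the return value a string either way. A quick check (illustrative inputs):

    def shift_vertical_bar(text)
      text.gsub!(/\|/, ' | ') || text
    end

    shift_vertical_bar('a|b'.dup) # => "a | b"
    shift_vertical_bar('ab'.dup)  # => "ab" (gsub! returned nil, so || text applies)
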
@@ -83,34 +88,26 @@ module PragmaticTokenizer
  text.gsub!(/([\(\[\{\}\]\)])/o) { ' ' + $1 + ' ' } || text
  end

- def shift_underscore(text)
- text.gsub!(/(?<=\s)\_+/, ' \1') || text
- text.gsub!(/\_+(?=\s)/, ' \1') || text
- text.gsub!(/(?<=\A)\_+/, '\1 ') || text
- text.gsub!(/\_+(?=\z)/, ' \1') || text
- end
-
- def shift_asterisk(text)
- text.gsub!(/\*+/, ' \1 ') || text
- end
-
- def shift_at_symbol(text)
- text.gsub!(/(\A|\s)\@/, '\1 ') || text
- end
-
  def shift_colon(text)
+ puts "Text: #{text}"
  return text unless text.include?(':') &&
- !(/\A\d+/ == text.partition(':').last[0]) &&
- !(/\A\d+/ == text.partition(':').first[-1])
+ text.partition(':').last[0] !~ /\A\d+/ &&
+ text.partition(':').first[-1] !~ /\A\d+/
+ puts "YOYOYO"
  # Ignore web addresses
  text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
  text.gsub!(/:/o, ' :') || text
+ text.gsub!(/(?<=\s):(?=\#)/, ': ') || text
  end

  def shift_semicolon(text)
  text.gsub!(/([;])/o) { ' ' + $1 + ' ' } || text
  end

+ def shift_caret(text)
+ text.gsub!(/\^/, ' ^ ') || text
+ end
+
  def shift_ellipse(text)
  text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
  text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text
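
The substantive fix in shift_colon: the old guard compared a Regexp to a string with ==, which is always false (Regexp#== tests pattern equality, not matching), so times like 5:30 were split anyway. The rewritten !~ guard actually matches. A trimmed sketch of the new behavior (the hunk's debug puts lines and web-address handling are omitted; sample strings are illustrative):

    def shift_colon(text)
      return text unless text.include?(':') &&
                         text.partition(':').last[0] !~ /\A\d+/ &&
                         text.partition(':').first[-1] !~ /\A\d+/
      text.gsub!(/:/, ' :') || text
    end

    shift_colon('5:30'.dup)        # => "5:30" (digit after the colon, guard returns early)
    shift_colon('ratio: high'.dup) # => "ratio : high"
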
@@ -167,7 +164,7 @@ module PragmaticTokenizer
  end

  def convert_sym_to_punct(token)
- symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token)
+ symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘]/.match(token)
  if symbol.nil?
  return token
  else
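
The else branch that restores the punctuation lies outside this hunk; presumably it reverses PUNCTUATION_MAP on the matched placeholder, along these lines (hypothetical reconstruction, not the gem's exact code):

    map = PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP
    token = 'hello⚘'.dup
    token.sub!('⚘', map.key('⚘')) # => "hello”"
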
@@ -7,10 +7,10 @@ module PragmaticTokenizer

  attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase
  def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true)
- unless punctuation.eql?('all') ||
- punctuation.eql?('semi') ||
- punctuation.eql?('none') ||
- punctuation.eql?('only')
+ unless punctuation.to_s.eql?('all') ||
+ punctuation.to_s.eql?('semi') ||
+ punctuation.to_s.eql?('none') ||
+ punctuation.to_s.eql?('only')
  raise "Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'"
  # Punctuation 'all': Does not remove any punctuation from the result

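
Validating punctuation.to_s instead of punctuation means a symbol now passes where only a string did before. For example (the option value comes from the hunk; sample text and output are illustrative):

    PragmaticTokenizer::Tokenizer.new('Hello, world.', punctuation: :none).tokenize
    # roughly => ["hello", "world"]
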
@@ -25,10 +25,10 @@ module PragmaticTokenizer
  # Punctuation 'only': Removes everything except punctuation. The
  # returned result is an array of only the punctuation.
  end
- @text = CGI.unescapeHTML(text)
- @language = language
- @language_module = Languages.get_language_by_code(language)
- @punctuation = punctuation
+ @text = CGI.unescapeHTML(text.to_s)
+ @language = language.to_s
+ @language_module = Languages.get_language_by_code(language.to_s)
+ @punctuation = punctuation.to_s
  @remove_stop_words = remove_stop_words
  @expand_contractions = expand_contractions
  @clean = clean
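
The same to_s treatment guards the instance variables: a nil text becomes the empty string instead of raising inside CGI.unescapeHTML, and language: :en resolves the same module as 'en'. A quick check (illustrative call):

    t = PragmaticTokenizer::Tokenizer.new(nil, language: :en)
    t.text # => "" (nil.to_s survives CGI.unescapeHTML)
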
@@ -40,7 +40,21 @@ module PragmaticTokenizer

  def tokenize
  return [] unless text
- downcase_tokens(cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text))))))))).reject { |t| t.empty? }
+ downcase_tokens(
+ cleaner(
+ remove_short_tokens(
+ delete_numbers(
+ delete_roman_numerals(
+ find_contractions(
+ delete_stop_words(
+ remove_punctuation(
+ split_at_middle_period_1(
+ split_at_middle_period_2(
+ split_beginning_period(
+ shift_no_spaces_between_sentences(
+ split_at_forward_slash(
+ processor.new(language: language_module).process(text: text)
+ ))))))))))))).reject { |t| t.empty? }
  end

  def domains
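
Besides reflowing the old one-liner, the chain gains five token-level passes that run before punctuation removal: forward-slash splitting, no-space sentence boundaries, and the period splitters defined below (inner calls run first, so read the order bottom-up). End-to-end usage is unchanged (illustrative input; output per the gem's usual defaults):

    PragmaticTokenizer::Tokenizer.new('Hello world.').tokenize
    # roughly => ["hello", "world", "."]
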
@@ -80,6 +94,35 @@ module PragmaticTokenizer
  Processor
  end

+ def split_at_middle_period_1(tokens)
+ tokens.flat_map { |t| t.include?(".") &&
+ t !~ /(http|https|www)(\.|:)/ &&
+ t.length > 1 &&
+ t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
+ t !~ /\S+(@|@)\S+/ &&
+ language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) ? t.gsub!(/\./, '\1. \2').split(' ').flatten : t }
+ end
+
+ def split_at_middle_period_2(tokens)
+ tokens.flat_map { |t| t.include?(".") &&
+ t !~ /(http|https|www)(\.|:)/ &&
+ t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
+ t !~ /\.[a-z]{2}/ &&
+ t.length > 2 &&
+ t.count(".") == 1 &&
+ t !~ /\d+/ &&
+ !language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) &&
+ t !~ /\S+(@|@)\S+/ ? t.gsub!(/\./, '\1 . \2').split(' ').flatten : t }
+ end
+
+ def split_beginning_period(tokens)
+ tokens.flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub!(/\./, '\1 ').split(' ').flatten : t }
+ end
+
+ def shift_no_spaces_between_sentences(tokens)
+ tokens.flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub!(/\?/, '\1 \2').split(' ').flatten : t }
+ end
+
  def downcase_tokens(tokens)
  return tokens unless downcase
  tokens.map { |t| Unicode::downcase(t) }
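
The two period passes complement each other: split_at_middle_period_1 fires when the text before the period is a known abbreviation and splits after it (the replacement '\1. \2' collapses to '. ' because the pattern has no capture groups), while split_at_middle_period_2 fires for non-abbreviations that look like neither URLs, emails, nor numbers and breaks the period out as its own token. The second pass's core move in isolation (illustrative token):

    t = 'there.Goodbye'.dup
    t.gsub!(/\./, ' . ').split(' ') # => ["there", ".", "Goodbye"]
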
@@ -101,7 +144,13 @@ module PragmaticTokenizer

  def cleaner(tokens)
  return tokens unless clean
- tokens.delete_if { |t| t =~ /\A-+\z/ ||
+ tokens.flat_map { |t| t =~ /(\A|\s)\@/ ? t.gsub!(/\@/, '\1 ').split(' ').flatten : t }
+ .flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
+ .flat_map { |t| t =~ /\_+(?=\s)/ ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
+ .flat_map { |t| t =~ /(?<=\A)\_+/ ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
+ .flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
+ .flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
+ .delete_if { |t| t =~ /\A-+\z/ ||
  PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
  t =~ /\A\.{2,}\z/ || t.include?("\\") ||
  t.length > 50 ||
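
The shifts removed from the processor earlier in this diff resurface here: cleaner now strips @-signs, stray underscores, and asterisks token-by-token, and only when the tokenizer was built with clean: true. For example (illustrative input):

    PragmaticTokenizer::Tokenizer.new('hello ___world___', clean: true).tokenize
    # roughly => ["hello", "world"]
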
@@ -135,14 +184,16 @@ module PragmaticTokenizer
  end
  end

+ def split_at_forward_slash(tokens)
+ tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
+ end
+
  def find_contractions(tokens)
  return tokens unless expand_contractions && language_module::CONTRACTIONS
  if downcase
  tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t)) ? language_module::CONTRACTIONS[Unicode::downcase(t)].split(' ').flatten : t }
- .flat_map { |t| t.include?("/") ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
  else
  tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t)) ? language_module::CONTRACTIONS[Unicode::downcase(t)].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
- .flat_map { |t| t.include?("/") ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
  end
  end
  end
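
Slash splitting used to live inside find_contractions, so it only ran when contractions were being expanded; it is now its own pass over every non-URL token. In isolation (the replacement '\1 \2' collapses to a single space because the pattern has no capture groups, so the slash itself is dropped; illustrative token):

    t = 'and/or'.dup
    t.gsub!(/\//, ' ').split(' ') # => ["and", "or"]
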
@@ -1,3 +1,3 @@
  module PragmaticTokenizer
- VERSION = "0.2.4"
+ VERSION = "0.3.0"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: pragmatic_tokenizer
  version: !ruby/object:Gem::Version
- version: 0.2.4
+ version: 0.3.0
  platform: ruby
  authors:
  - Kevin S. Dias
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2016-01-12 00:00:00.000000000 Z
+ date: 2016-01-13 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: unicode