pragmatic_tokenizer 0.2.4 → 0.3.0

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: db86f5c14e05efabcaedc5d86d730445db7aaed2
- data.tar.gz: 87e8cb36fffffef3424f540e5cb8c044b0acf2f9
+ metadata.gz: 414905d0126493ffc7224055dd2f79010061662c
+ data.tar.gz: 2b0995cc2b16cef7f7a521a65f90011118fd70f7
  SHA512:
- metadata.gz: 2d135b1814bd5385c699d2e6732dc51013bd5fe7e9a1d55e83ac34b37f91f0cfbc97855288c1ce29af3866cbf8a7f561c13c33376a73f929abebbe5f2392cc65
- data.tar.gz: 9ef4499aaf6d48df889c069f00d3d56fb3984bbf03f92e7cf83e64f62cce814ec0cecfda36cdffdf5e461d4a28c20176520454b3e998a216cbf6b5617727e0da
+ metadata.gz: 62772587ff880bd192c504f9f319e1dd9e11c89bac1e37e67ef95d3f42f451fb3a770f5c57eb7152919c301e49e4d5c06a71001b27781e8ca5228bf4ab29c082
+ data.tar.gz: 977cc2c5fd69d0bca8e619860ee78e1161cb509da734ad0290b7dd54401822f399315d7e10606f8f3750a53314167568af6beccd688870bdacc04832c150d606
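
Note: the SHA1 and SHA512 checksums above cover the metadata.gz and data.tar.gz archives packed inside the .gem file, not the .gem file itself. A minimal sketch of recomputing one, assuming data.tar.gz has first been extracted from the .gem archive (a plain tar, e.g. tar -xf pragmatic_tokenizer-0.3.0.gem):

    require 'digest'
    Digest::SHA512.file('data.tar.gz').hexdigest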
@@ -1,8 +1,8 @@
  module PragmaticTokenizer
  module Languages
  module Common
- PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^']
- PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚" }
+ PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^', '”']
+ PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚", "”" => "⚘" }
  SEMI_PUNCTUATION = ['。', '.', '.']
  ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
  SPECIAL_CHARACTERS = ['®', '©', '™']
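
Note: 0.3.0 adds the right curly double quote (”) to PUNCTUATION and maps it to the placeholder ⚘ in PUNCTUATION_MAP. The map masks punctuation as rare symbols while the text is being split, so the characters survive as their own tokens and can be restored later. A minimal sketch of that round-trip, using only the new pair:

    map = { '”' => '⚘' }
    masked = 'he said”'.gsub('”') { |c| map[c] }   # => "he said⚘"
    masked.gsub('⚘', map.invert['⚘'])              # => "he said”"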
@@ -15,9 +15,8 @@ module PragmaticTokenizer
  shift_colon(text)
  shift_bracket(text)
  shift_semicolon(text)
- shift_underscore(text)
- shift_asterisk(text)
- shift_at_symbol(text)
+ shift_caret(text)
+ shift_vertical_bar(text)
  convert_dbl_quotes(text)
  convert_sgl_quotes(text)
  shift_beginning_hyphen(text)
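
Note: this hunk rearranges the Processor's text-level pass. Underscore, asterisk, and @-symbol shifting move out (they reappear as token-level rules in the cleaner, later in this diff), while carets and vertical bars are now padded with spaces so they split into standalone tokens. In isolation:

    'a^b|c'.gsub(/\^/, ' ^ ').gsub(/\|/, ' | ').split(' ')
    # => ["a", "^", "b", "|", "c"]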
@@ -35,8 +34,10 @@ module PragmaticTokenizer
  def convert_dbl_quotes(text)
  # Convert left double quotes to special character
  text.gsub!(/"(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+ text.gsub!(/“(?=.*\w)/o, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['“'] + ' ') || text
  # Convert remaining quotes to special character
  text.gsub!(/"/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['"'] + ' ') || text
+ text.gsub!(/”/, ' ' + PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP['”'] + ' ') || text
  end

  def convert_sgl_quotes(text)
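
Note: convert_dbl_quotes previously handled only straight double quotes; the two added lines give curly quotes the same treatment via the map entries ⚃ (for “) and ⚘ (for ”). Roughly:

    'she said “hi”'.gsub(/“(?=.*\w)/, ' ⚃ ').gsub(/”/, ' ⚘ ')
    # => "she said  ⚃ hi ⚘ "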
@@ -51,6 +52,10 @@ module PragmaticTokenizer
  text.gsub!(/--+/o, ' - ') || text
  end

+ def shift_vertical_bar(text)
+ text.gsub!(/\|/, ' | ') || text
+ end
+
  def shift_comma(text)
  # Shift commas off everything but numbers
  text.gsub!(/,(?!\d)/o, ' , ') || text
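
Note: the negative lookahead (?!\d) in shift_comma keeps digit-grouped numbers intact while splitting ordinary commas off words:

    '1,000 apples, pears'.gsub(/,(?!\d)/, ' , ')
    # => "1,000 apples ,  pears"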
@@ -83,34 +88,26 @@ module PragmaticTokenizer
  text.gsub!(/([\(\[\{\}\]\)])/o) { ' ' + $1 + ' ' } || text
  end

- def shift_underscore(text)
- text.gsub!(/(?<=\s)\_+/, ' \1') || text
- text.gsub!(/\_+(?=\s)/, ' \1') || text
- text.gsub!(/(?<=\A)\_+/, '\1 ') || text
- text.gsub!(/\_+(?=\z)/, ' \1') || text
- end
-
- def shift_asterisk(text)
- text.gsub!(/\*+/, ' \1 ') || text
- end
-
- def shift_at_symbol(text)
- text.gsub!(/(\A|\s)\@/, '\1 ') || text
- end
-
  def shift_colon(text)
+ puts "Text: #{text}"
  return text unless text.include?(':') &&
- !(/\A\d+/ == text.partition(':').last[0]) &&
- !(/\A\d+/ == text.partition(':').first[-1])
+ text.partition(':').last[0] !~ /\A\d+/ &&
+ text.partition(':').first[-1] !~ /\A\d+/
+ puts "YOYOYO"
  # Ignore web addresses
  text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
  text.gsub!(/:/o, ' :') || text
+ text.gsub!(/(?<=\s):(?=\#)/, ': ') || text
  end

  def shift_semicolon(text)
  text.gsub!(/([;])/o) { ' ' + $1 + ' ' } || text
  end

+ def shift_caret(text)
+ text.gsub!(/\^/, ' ^ ') || text
+ end
+
  def shift_ellipse(text)
  text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
  text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text
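
Note: the rewritten guard in shift_colon fixes a real bug. Regexp#== compares two pattern objects, so /\A\d+/ == some_string is always false and the old guard never fired; String#!~ actually performs a match test. (The two puts calls added in this hunk look like leftover debug output that shipped with the release.) The difference in isolation:

    /\A\d+/ == '12'   # => false (compares a Regexp to a String; never a match test)
    '12' !~ /\A\d+/   # => false ('12' does match, so !~ is false)
    'ab' !~ /\A\d+/   # => true  (no leading digits)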
@@ -167,7 +164,7 @@ module PragmaticTokenizer
  end

  def convert_sym_to_punct(token)
- symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚]/.match(token)
+ symbol = /[♳ ♴ ♵ ♶ ♷ ♸ ♹ ♺ ⚀ ⚁ ⚂ ⚃ ⚄ ⚅ ☇ ☈ ☉ ☊ ☋ ☌ ☍ ☠ ☢ ☣ ☤ ☥ ☦ ☧ ☀ ☁ ☂ ☃ ☄ ☮ ♔ ♕ ♖ ♗ ♘ ♙ ♚ ⚘]/.match(token)
  if symbol.nil?
  return token
  else
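
Note: convert_sym_to_punct reverses the masking by matching a placeholder symbol in a token and substituting the original punctuation back; the changed line extends the match class with the new ⚘ placeholder, consistent with the map addition above. A minimal sketch of the restore step, assuming an inverted subset of PUNCTUATION_MAP:

    restore = { '⚃' => '“', '⚘' => '”' }   # subset of PUNCTUATION_MAP.invert
    'hi⚘'.sub(/[⚃⚘]/) { |s| restore[s] }   # => "hi”"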
@@ -7,10 +7,10 @@ module PragmaticTokenizer

  attr_reader :text, :language, :punctuation, :remove_stop_words, :expand_contractions, :language_module, :clean, :remove_numbers, :minimum_length, :remove_roman_numerals, :downcase
  def initialize(text, language: 'en', punctuation: 'all', remove_stop_words: false, expand_contractions: false, clean: false, remove_numbers: false, minimum_length: 0, remove_roman_numerals: false, downcase: true)
- unless punctuation.eql?('all') ||
- punctuation.eql?('semi') ||
- punctuation.eql?('none') ||
- punctuation.eql?('only')
+ unless punctuation.to_s.eql?('all') ||
+ punctuation.to_s.eql?('semi') ||
+ punctuation.to_s.eql?('none') ||
+ punctuation.to_s.eql?('only')
  raise "Punctuation argument can be only be nil, 'all', 'semi', 'none', or 'only'"
  # Punctuation 'all': Does not remove any punctuation from the result

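
Note: wrapping each comparison in .to_s loosens the punctuation argument check: a symbol such as :none now passes the same validation as the string 'none', since :none.to_s == 'none'. For example, both calls below should be accepted after this change (usage per the gem's public API):

    PragmaticTokenizer::Tokenizer.new('Hello, world!', punctuation: 'none')
    PragmaticTokenizer::Tokenizer.new('Hello, world!', punctuation: :none)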
@@ -25,10 +25,10 @@ module PragmaticTokenizer
  # Punctuation 'only': Removes everything except punctuation. The
  # returned result is an array of only the punctuation.
  end
- @text = CGI.unescapeHTML(text)
- @language = language
- @language_module = Languages.get_language_by_code(language)
- @punctuation = punctuation
+ @text = CGI.unescapeHTML(text.to_s)
+ @language = language.to_s
+ @language_module = Languages.get_language_by_code(language.to_s)
+ @punctuation = punctuation.to_s
  @remove_stop_words = remove_stop_words
  @expand_contractions = expand_contractions
  @clean = clean
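
Note: the same defensive coercion is applied to the constructor state. In particular, text.to_s turns a nil text into an empty string before CGI.unescapeHTML, which would otherwise raise:

    require 'cgi'
    CGI.unescapeHTML(nil.to_s)   # => ""
    # CGI.unescapeHTML(nil)      # NoMethodError: undefined method for nil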
@@ -40,7 +40,21 @@ module PragmaticTokenizer

  def tokenize
  return [] unless text
- downcase_tokens(cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text))))))))).reject { |t| t.empty? }
+ downcase_tokens(
+ cleaner(
+ remove_short_tokens(
+ delete_numbers(
+ delete_roman_numerals(
+ find_contractions(
+ delete_stop_words(
+ remove_punctuation(
+ split_at_middle_period_1(
+ split_at_middle_period_2(
+ split_beginning_period(
+ shift_no_spaces_between_sentences(
+ split_at_forward_slash(
+ processor.new(language: language_module).process(text: text)
+ ))))))))))))).reject { |t| t.empty? }
  end

  def domains
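
Note: tokenize is reformatted from one long line into a legible stack, and five new token-level stages join the pipeline: split_at_middle_period_1, split_at_middle_period_2, split_beginning_period, shift_no_spaces_between_sentences, and split_at_forward_slash (the last relocated from find_contractions). Typical use, with the expected shape of the result (downcase defaults to true):

    PragmaticTokenizer::Tokenizer.new('Hello world.', language: 'en').tokenize
    # => ["hello", "world", "."]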
@@ -80,6 +94,35 @@ module PragmaticTokenizer
  Processor
  end

+ def split_at_middle_period_1(tokens)
+ tokens.flat_map { |t| t.include?(".") &&
+ t !~ /(http|https|www)(\.|:)/ &&
+ t.length > 1 &&
+ t !~ /(\s+|\A)[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,6}(:[0-9]{1,5})?(\/.*)?/ix &&
+ t !~ /\S+(@|@)\S+/ &&
+ language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) ? t.gsub!(/\./, '\1. \2').split(' ').flatten : t }
+ end
+
+ def split_at_middle_period_2(tokens)
+ tokens.flat_map { |t| t.include?(".") &&
+ t !~ /(http|https|www)(\.|:)/ &&
+ t !~ /\.(com|net|org|edu|gov|mil|int)/ &&
+ t !~ /\.[a-z]{2}/ &&
+ t.length > 2 &&
+ t.count(".") == 1 &&
+ t !~ /\d+/ &&
+ !language_module::ABBREVIATIONS.include?(Unicode::downcase(t.split(".")[0] == nil ? '' : t.split(".")[0])) &&
+ t !~ /\S+(@|@)\S+/ ? t.gsub!(/\./, '\1 . \2').split(' ').flatten : t }
+ end
+
+ def split_beginning_period(tokens)
+ tokens.flat_map { |t| t =~ /\A\.[^\.]/ && t.length > 1 ? t.gsub!(/\./, '\1 ').split(' ').flatten : t }
+ end
+
+ def shift_no_spaces_between_sentences(tokens)
+ tokens.flat_map { |t| t.include?("?") && t !~ /(http|https|www)(\.|:)/ && t.length > 1 ? t.gsub!(/\?/, '\1 \2').split(' ').flatten : t }
+ end
+
  def downcase_tokens(tokens)
  return tokens unless downcase
  tokens.map { |t| Unicode::downcase(t) }
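
Note: the replacement strings in these helpers ('\1. \2', '\1 . \2', '\1 \2') reference capture groups the patterns never define, so \1 and \2 expand to empty strings; the net effect is simply inserting spaces around (or after) the matched character before re-splitting. In isolation:

    'end.Start'.gsub(/\./, '\1 . \2').split(' ')   # => ["end", ".", "Start"]
    'u.s.a'.gsub(/\./, '\1. \2')                   # => "u. s. a"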
@@ -101,7 +144,13 @@ module PragmaticTokenizer

  def cleaner(tokens)
  return tokens unless clean
- tokens.delete_if { |t| t =~ /\A-+\z/ ||
+ tokens.flat_map { |t| t =~ /(\A|\s)\@/ ? t.gsub!(/\@/, '\1 ').split(' ').flatten : t }
+ .flat_map { |t| t =~ /(?<=\s)\_+/ ? t.gsub!(/(?<=\s)\_+/, ' \1').split(' ').flatten : t }
+ .flat_map { |t| t =~ /\_+(?=\s)/ ? t.gsub!(/\_+(?=\s)/, ' \1').split(' ').flatten : t }
+ .flat_map { |t| t =~ /(?<=\A)\_+/ ? t.gsub!(/(?<=\A)\_+/, '\1 ').split(' ').flatten : t }
+ .flat_map { |t| t =~ /\_+(?=\z)/ ? t.gsub!(/\_+(?=\z)/, ' \1').split(' ').flatten : t }
+ .flat_map { |t| t =~ /\*+/ ? t.gsub!(/\*+/, '\1 ').split(' ').flatten : t }
+ .delete_if { |t| t =~ /\A-+\z/ ||
  PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
  t =~ /\A\.{2,}\z/ || t.include?("\\") ||
  t.length > 50 ||
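
Note: the cleaner now performs the underscore, asterisk, and @-symbol splitting that 0.2.4 did at the text level, but per token and only when clean: true. The asterisk rule in isolation ('\1' is empty here, so runs of asterisks become spaces):

    '*star*'.gsub(/\*+/, '\1 ').split(' ')   # => ["star"]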
@@ -135,14 +184,16 @@ module PragmaticTokenizer
  end
  end

+ def split_at_forward_slash(tokens)
+ tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
+ end
+
  def find_contractions(tokens)
  return tokens unless expand_contractions && language_module::CONTRACTIONS
  if downcase
  tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t)) ? language_module::CONTRACTIONS[Unicode::downcase(t)].split(' ').flatten : t }
- .flat_map { |t| t.include?("/") ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
  else
  tokens.flat_map { |t| language_module::CONTRACTIONS.has_key?(Unicode::downcase(t)) ? language_module::CONTRACTIONS[Unicode::downcase(t)].split(' ').each_with_index.map { |t, i| i.eql?(0) ? Unicode::capitalize(t) : t }.flatten : t }
- .flat_map { |t| t.include?("/") ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
  end
  end
  end
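
Note: slash splitting used to run only inside find_contractions, and therefore only when contractions were being expanded; split_at_forward_slash now runs for every token in the pipeline, skipping URL-like tokens. In isolation (the slash itself is dropped, since '\1 \2' expands to a single space):

    'either/or'.gsub(/\//, '\1 \2').split(' ')   # => ["either", "or"]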
@@ -1,3 +1,3 @@
  module PragmaticTokenizer
- VERSION = "0.2.4"
+ VERSION = "0.3.0"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: pragmatic_tokenizer
  version: !ruby/object:Gem::Version
- version: 0.2.4
+ version: 0.3.0
  platform: ruby
  authors:
  - Kevin S. Dias
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2016-01-12 00:00:00.000000000 Z
+ date: 2016-01-13 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: unicode