pragmatic_tokenizer 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bd6598d349d883708c73d3f51431cf4a516bafdb
4
- data.tar.gz: ba8ab51aa218c4d5405ec1db2a539e0fffa478ec
3
+ metadata.gz: c4b348d207073bd0812a58b29d1277951ffc63dc
4
+ data.tar.gz: b9c21c948e4164678fe87e666901d327a395c550
5
5
  SHA512:
6
- metadata.gz: 5e3aa17314b8130edf651229048c502e2834517f3f2d793705232248554511937d0157a9ff2dee57191aadeb66ad7dc5303634e9857f6c9fcf51dd9da43072d7
7
- data.tar.gz: 73af6e7bd5807dc7637d662abebb3a27dbf0e2194a51e30e1876d7c931dbaaeab6bc6ee0ddb3cf915a75385fc6613b2a6480d8686f0e2949b7461334453dff87
6
+ metadata.gz: f67084e2d3cfe8a34b431ba5d250b314d86138400cea242fb40bfe85d7d265df2f5e5ca4d54f5968f3cd56b6a9b21edb7d7368e25f5aef8faee5446bf014facf
7
+ data.tar.gz: a947af809bce16395da59a10e7f51393794885886528faad65ecc16a3b702874938f6fd02f006bc9045b432c633d67462b7ea41736fb7548e76e4e3de46762bb
@@ -1,7 +1,7 @@
1
1
  module PragmaticTokenizer
2
2
  module Languages
3
3
  module Common
4
- PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»']
4
+ PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^']
5
5
  PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚" }
6
6
  SEMI_PUNCTUATION = ['。', '.', '.']
7
7
  ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
@@ -15,6 +15,9 @@ module PragmaticTokenizer
15
15
  shift_colon(text)
16
16
  shift_bracket(text)
17
17
  shift_semicolon(text)
18
+ shift_underscore(text)
19
+ shift_asterisk(text)
20
+ shift_at_symbol(text)
18
21
  convert_dbl_quotes(text)
19
22
  convert_sgl_quotes(text)
20
23
  tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t.downcase) })
@@ -66,6 +69,21 @@ module PragmaticTokenizer
66
69
  text.gsub!(/([\(\[\{\}\]\)])/o) { ' ' + $1 + ' ' } || text
67
70
  end
68
71
 
72
+ def shift_underscore(text)
73
+ text.gsub!(/(?<=\s)\_+/, ' \1') || text
74
+ text.gsub!(/\_+(?=\s)/, ' \1') || text
75
+ text.gsub!(/(?<=\A)\_+/, '\1 ') || text
76
+ text.gsub!(/\_+(?=\z)/, ' \1') || text
77
+ end
78
+
79
+ def shift_asterisk(text)
80
+ text.gsub!(/\*+/, ' \1 ') || text
81
+ end
82
+
83
+ def shift_at_symbol(text)
84
+ text.gsub!(/(\A|\s)\@/, '\1 ') || text
85
+ end
86
+
69
87
  def shift_colon(text)
70
88
  return text unless text.include?(':') &&
71
89
  !(/\A\d+/ == text.partition(':').last[0]) &&
@@ -81,6 +99,8 @@ module PragmaticTokenizer
81
99
 
82
100
  def shift_ellipse(text)
83
101
  text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
102
+ text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text
103
+ text.gsub!(/(…+)/o) { ' ' + $1 + ' ' } || text
84
104
  end
85
105
 
86
106
  def separate_full_stop(tokens)
@@ -38,7 +38,7 @@ module PragmaticTokenizer
38
38
 
39
39
  def tokenize
40
40
  return [] unless text
41
- cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text))))))))
41
+ cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text)))))))).reject { |t| t.empty? }
42
42
  end
43
43
 
44
44
  private
@@ -65,8 +65,7 @@ module PragmaticTokenizer
65
65
 
66
66
  def cleaner(tokens)
67
67
  return tokens unless clean
68
- tokens.delete_if { |t| t =~ /\A_+\z/ ||
69
- t =~ /\A-+\z/ ||
68
+ tokens.delete_if { |t| t =~ /\A-+\z/ ||
70
69
  PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
71
70
  t =~ /\A\.{2,}\z/ || t.include?("\\") ||
72
71
  t.length > 50 ||
@@ -81,7 +80,7 @@ module PragmaticTokenizer
81
80
  when 'semi'
82
81
  tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
83
82
  when 'none'
84
- tokens - PragmaticTokenizer::Languages::Common::PUNCTUATION
83
+ tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
85
84
  when 'only'
86
85
  only_punctuation(tokens)
87
86
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.1.11"
2
+ VERSION = "0.1.12"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.11
4
+ version: 0.1.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-01-08 00:00:00.000000000 Z
11
+ date: 2016-01-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler