pragmatic_tokenizer 0.1.11 → 0.1.12

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: bd6598d349d883708c73d3f51431cf4a516bafdb
4
- data.tar.gz: ba8ab51aa218c4d5405ec1db2a539e0fffa478ec
3
+ metadata.gz: c4b348d207073bd0812a58b29d1277951ffc63dc
4
+ data.tar.gz: b9c21c948e4164678fe87e666901d327a395c550
5
5
  SHA512:
6
- metadata.gz: 5e3aa17314b8130edf651229048c502e2834517f3f2d793705232248554511937d0157a9ff2dee57191aadeb66ad7dc5303634e9857f6c9fcf51dd9da43072d7
7
- data.tar.gz: 73af6e7bd5807dc7637d662abebb3a27dbf0e2194a51e30e1876d7c931dbaaeab6bc6ee0ddb3cf915a75385fc6613b2a6480d8686f0e2949b7461334453dff87
6
+ metadata.gz: f67084e2d3cfe8a34b431ba5d250b314d86138400cea242fb40bfe85d7d265df2f5e5ca4d54f5968f3cd56b6a9b21edb7d7368e25f5aef8faee5446bf014facf
7
+ data.tar.gz: a947af809bce16395da59a10e7f51393794885886528faad65ecc16a3b702874938f6fd02f006bc9045b432c633d67462b7ea41736fb7548e76e4e3de46762bb
@@ -1,7 +1,7 @@
1
1
  module PragmaticTokenizer
2
2
  module Languages
3
3
  module Common
4
- PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»']
4
+ PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^']
5
5
  PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚" }
6
6
  SEMI_PUNCTUATION = ['。', '.', '.']
7
7
  ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
@@ -15,6 +15,9 @@ module PragmaticTokenizer
15
15
  shift_colon(text)
16
16
  shift_bracket(text)
17
17
  shift_semicolon(text)
18
+ shift_underscore(text)
19
+ shift_asterisk(text)
20
+ shift_at_symbol(text)
18
21
  convert_dbl_quotes(text)
19
22
  convert_sgl_quotes(text)
20
23
  tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t.downcase) })
@@ -66,6 +69,21 @@ module PragmaticTokenizer
66
69
  text.gsub!(/([\(\[\{\}\]\)])/o) { ' ' + $1 + ' ' } || text
67
70
  end
68
71
 
72
# Replaces runs of underscores that sit on the edge of a word with a space.
#
# A run counts as being on an edge when it directly follows whitespace,
# directly precedes whitespace, or starts or ends the whole string.
# Underscores with non-whitespace on both sides (e.g. "snake_case") are
# left untouched.
#
# Each matching run is replaced by a single plain space, i.e. the
# underscores themselves are dropped. This is exactly what the templates
# ' \1' and '\1 ' produce for these patterns: none of them defines a
# capture group, so "\1" always expands to the empty string. The literal
# ' ' states that result without the misleading placeholder.
# NOTE(review): if the underscores are meant to survive as tokens of their
# own, the replacement should use "\0" (the whole match) instead - confirm
# the intended behaviour against the specs before changing it.
#
# @param text [String] text to process; it is modified in place
# @return [String] the same (mutated) string object
def shift_underscore(text)
  edge_runs = [
    /(?<=\s)_+/, # run directly after whitespace
    /_+(?=\s)/,  # run directly before whitespace
    /\A_+/,      # run at the very start of the string
    /_+\z/       # run at the very end of the string
  ]
  # gsub! returns nil when nothing matched, so its result is ignored and
  # the receiver is returned explicitly below.
  edge_runs.each { |edge_run| text.gsub!(edge_run, ' ') }
  text
end
78
+
79
# Replaces every run of asterisks in the text with two spaces.
#
# The asterisks themselves are dropped. This is exactly what the template
# ' \1 ' produces for this pattern: the pattern defines no capture group,
# so "\1" expands to the empty string and only the two surrounding spaces
# remain. The literal below states that result directly.
# NOTE(review): if asterisks are meant to survive as separate tokens, the
# replacement should be ' \0 ' (whole match) - confirm against the specs.
#
# @param text [String] text to process; it is modified in place
# @return [String] the same (mutated) string object
def shift_asterisk(text)
  # gsub! returns nil when nothing matched, hence the explicit return value.
  text.gsub!(/\*+/, '  ')
  text
end
82
+
83
# Replaces an "@" that starts the string or directly follows whitespace
# with a space, keeping the whitespace that preceded it.
#
# An "@" that follows any other character (as in "user@example.com") is
# left alone. Only the first "@" of a leading "@@" is affected, because
# the second one no longer follows the start of the string or whitespace.
# NOTE(review): the "@" itself is removed rather than split off as its own
# token - confirm that this is the intended handling of mentions.
#
# @param text [String] text to process; it is modified in place
# @return [String] the same (mutated) string object
def shift_at_symbol(text)
  # gsub! yields nil when no substitution happened, so fall back to text.
  substituted = text.gsub!(/(\A|\s)\@/, '\1 ')
  substituted.nil? ? text : substituted
end
86
+
69
87
  def shift_colon(text)
70
88
  return text unless text.include?(':') &&
71
89
  !(/\A\d+/ == text.partition(':').last[0]) &&
@@ -81,6 +99,8 @@ module PragmaticTokenizer
81
99
 
82
100
  def shift_ellipse(text)
83
101
  text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
102
+ text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text
103
+ text.gsub!(/(…+)/o) { ' ' + $1 + ' ' } || text
84
104
  end
85
105
 
86
106
  def separate_full_stop(tokens)
@@ -38,7 +38,7 @@ module PragmaticTokenizer
38
38
 
39
39
  def tokenize
40
40
  return [] unless text
41
- cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text))))))))
41
+ cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text)))))))).reject { |t| t.empty? }
42
42
  end
43
43
 
44
44
  private
@@ -65,8 +65,7 @@ module PragmaticTokenizer
65
65
 
66
66
  def cleaner(tokens)
67
67
  return tokens unless clean
68
- tokens.delete_if { |t| t =~ /\A_+\z/ ||
69
- t =~ /\A-+\z/ ||
68
+ tokens.delete_if { |t| t =~ /\A-+\z/ ||
70
69
  PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
71
70
  t =~ /\A\.{2,}\z/ || t.include?("\\") ||
72
71
  t.length > 50 ||
@@ -81,7 +80,7 @@ module PragmaticTokenizer
81
80
  when 'semi'
82
81
  tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
83
82
  when 'none'
84
- tokens - PragmaticTokenizer::Languages::Common::PUNCTUATION
83
+ tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
85
84
  when 'only'
86
85
  only_punctuation(tokens)
87
86
  end
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.1.11"
2
+ VERSION = "0.1.12"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.11
4
+ version: 0.1.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-01-08 00:00:00.000000000 Z
11
+ date: 2016-01-09 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler