pragmatic_tokenizer 0.1.11 → 0.1.12
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c4b348d207073bd0812a58b29d1277951ffc63dc
|
4
|
+
data.tar.gz: b9c21c948e4164678fe87e666901d327a395c550
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f67084e2d3cfe8a34b431ba5d250b314d86138400cea242fb40bfe85d7d265df2f5e5ca4d54f5968f3cd56b6a9b21edb7d7368e25f5aef8faee5446bf014facf
|
7
|
+
data.tar.gz: a947af809bce16395da59a10e7f51393794885886528faad65ecc16a3b702874938f6fd02f006bc9045b432c633d67462b7ea41736fb7548e76e4e3de46762bb
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module PragmaticTokenizer
|
2
2
|
module Languages
|
3
3
|
module Common
|
4
|
-
PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»']
|
4
|
+
PUNCTUATION = ['。', '.', '.', '!', '!', '?', '?', '、', '¡', '¿', '„', '“', '[', ']', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', ':', ';', '<', '=', '>', '@', '^', '_', '`', "'", '{', '|', '}', '~', '-', '«', '»', '/', '›', '‹', '^']
|
5
5
|
PUNCTUATION_MAP = { "。" => "♳", "." => "♴", "." => "♵", "!" => "♶", "!" => "♷", "?" => "♸", "?" => "♹", "、" => "♺", "¡" => "⚀", "¿" => "⚁", "„" => "⚂", "“" => "⚃", "[" => "⚄", "]" => "⚅", "\"" => "☇", "#" => "☈", "$" => "☉", "%" => "☊", "&" => "☋", "(" => "☌", ")" => "☍", "*" => "☠", "+" => "☢", "," => "☣", ":" => "☤", ";" => "☥", "<" => "☦", "=" => "☧", ">" => "☀", "@" => "☁", "^" => "☂", "_" => "☃", "`" => "☄", "'" => "☮", "{" => "♔", "|" => "♕", "}" => "♖", "~" => "♗", "-" => "♘", "«" => "♙", "»" => "♚" }
|
6
6
|
SEMI_PUNCTUATION = ['。', '.', '.']
|
7
7
|
ROMAN_NUMERALS = ['i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'xiii', 'xiv', 'xv', 'xvi', 'xvii', 'xviii', 'xix', 'xx', 'xxi', 'xxii', 'xxiii', 'xxiv', 'xxv', 'xxvi', 'xxvii', 'xxviii', 'xxix', 'xxx', 'xxxi', 'xxxii', 'xxxiii', 'xxxiv', 'xxxv', 'xxxvi', 'xxxvii', 'xxxviii', 'xxxix', 'xl', 'xli', 'xlii', 'xliii', 'xliv', 'xlv', 'xlvi', 'xlvii', 'xlviii', 'xlix', 'l', 'li', 'lii', 'liii', 'liv', 'lv', 'lvi', 'lvii', 'lviii', 'lix', 'lx', 'lxi', 'lxii', 'lxiii', 'lxiv', 'lxv', 'lxvi', 'lxvii', 'lxviii', 'lxix', 'lxx', 'lxxi', 'lxxii', 'lxxiii', 'lxxiv', 'lxxv', 'lxxvi', 'lxxvii', 'lxxviii', 'lxxix', 'lxxx', 'lxxxi', 'lxxxii', 'lxxxiii', 'lxxxiv', 'lxxxv', 'lxxxvi', 'lxxxvii', 'lxxxviii', 'lxxxix', 'xc', 'xci', 'xcii', 'xciii', 'xciv', 'xcv', 'xcvi', 'xcvii', 'xcviii', 'xcix']
|
@@ -15,6 +15,9 @@ module PragmaticTokenizer
|
|
15
15
|
shift_colon(text)
|
16
16
|
shift_bracket(text)
|
17
17
|
shift_semicolon(text)
|
18
|
+
shift_underscore(text)
|
19
|
+
shift_asterisk(text)
|
20
|
+
shift_at_symbol(text)
|
18
21
|
convert_dbl_quotes(text)
|
19
22
|
convert_sgl_quotes(text)
|
20
23
|
tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t.downcase) })
|
@@ -66,6 +69,21 @@ module PragmaticTokenizer
|
|
66
69
|
text.gsub!(/([\(\[\{\}\]\)])/o) { ' ' + $1 + ' ' } || text
|
67
70
|
end
|
68
71
|
|
72
|
+
def shift_underscore(text)
|
73
|
+
text.gsub!(/(?<=\s)\_+/, ' \1') || text
|
74
|
+
text.gsub!(/\_+(?=\s)/, ' \1') || text
|
75
|
+
text.gsub!(/(?<=\A)\_+/, '\1 ') || text
|
76
|
+
text.gsub!(/\_+(?=\z)/, ' \1') || text
|
77
|
+
end
|
78
|
+
|
79
|
+
def shift_asterisk(text)
|
80
|
+
text.gsub!(/\*+/, ' \1 ') || text
|
81
|
+
end
|
82
|
+
|
83
|
+
def shift_at_symbol(text)
|
84
|
+
text.gsub!(/(\A|\s)\@/, '\1 ') || text
|
85
|
+
end
|
86
|
+
|
69
87
|
def shift_colon(text)
|
70
88
|
return text unless text.include?(':') &&
|
71
89
|
!(/\A\d+/ == text.partition(':').last[0]) &&
|
@@ -81,6 +99,8 @@ module PragmaticTokenizer
|
|
81
99
|
|
82
100
|
def shift_ellipse(text)
|
83
101
|
text.gsub!(/(\.\.\.+)/o) { ' ' + $1 + ' ' } || text
|
102
|
+
text.gsub!(/(\.\.+)/o) { ' ' + $1 + ' ' } || text
|
103
|
+
text.gsub!(/(…+)/o) { ' ' + $1 + ' ' } || text
|
84
104
|
end
|
85
105
|
|
86
106
|
def separate_full_stop(tokens)
|
@@ -38,7 +38,7 @@ module PragmaticTokenizer
|
|
38
38
|
|
39
39
|
def tokenize
|
40
40
|
return [] unless text
|
41
|
-
cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text))))))))
|
41
|
+
cleaner(remove_short_tokens(delete_numbers(delete_roman_numerals(find_contractions(delete_stop_words(remove_punctuation(processor.new(language: language_module).process(text: text)))))))).reject { |t| t.empty? }
|
42
42
|
end
|
43
43
|
|
44
44
|
private
|
@@ -65,8 +65,7 @@ module PragmaticTokenizer
|
|
65
65
|
|
66
66
|
def cleaner(tokens)
|
67
67
|
return tokens unless clean
|
68
|
-
tokens.delete_if { |t| t =~ /\
|
69
|
-
t =~ /\A-+\z/ ||
|
68
|
+
tokens.delete_if { |t| t =~ /\A-+\z/ ||
|
70
69
|
PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
|
71
70
|
t =~ /\A\.{2,}\z/ || t.include?("\\") ||
|
72
71
|
t.length > 50 ||
|
@@ -81,7 +80,7 @@ module PragmaticTokenizer
|
|
81
80
|
when 'semi'
|
82
81
|
tokens - PragmaticTokenizer::Languages::Common::SEMI_PUNCTUATION
|
83
82
|
when 'none'
|
84
|
-
tokens - PragmaticTokenizer::Languages::Common::PUNCTUATION
|
83
|
+
tokens.delete_if { |t| t =~ /\A[[:punct:]]+\z/ || t =~ /\A(‹+|\^+|›+|\++)\z/ } - PragmaticTokenizer::Languages::Common::PUNCTUATION
|
85
84
|
when 'only'
|
86
85
|
only_punctuation(tokens)
|
87
86
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pragmatic_tokenizer
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.11
|
4
|
+
version: 0.1.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kevin S. Dias
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-01-
|
11
|
+
date: 2016-01-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|