pragmatic_tokenizer 0.1.9 → 0.1.10
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4b2dd34906b6a89945200b65353a772089c2fdb2
|
4
|
+
data.tar.gz: dceed8cdbb9cefcd822ddf68cc99073b76aa9506
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 21d892cc075635b1d1ba42c122421bb237bbc01b35ab2068648fb8c4eb9725a2174f22b89a41976d9df06546bdf92e1d4f4f8520a409edf1d6c2c62df1faf8cc
|
7
|
+
data.tar.gz: 2750c610896eeb49f8218e666be12f6deeb5675fd6bef7678f826d07031fa0fba94fd5113e0473f263899f20d5e740c4117748d48621e2a51fe1c3f835275a1a
|
@@ -14,6 +14,7 @@ module PragmaticTokenizer
|
|
14
14
|
shift_special_quotes(text)
|
15
15
|
shift_colon(text)
|
16
16
|
shift_bracket(text)
|
17
|
+
shift_semicolon(text)
|
17
18
|
convert_dbl_quotes(text)
|
18
19
|
convert_sgl_quotes(text)
|
19
20
|
tokens = separate_full_stop(text.squeeze(' ').split.map { |t| convert_sym_to_punct(t.downcase) })
|
@@ -74,6 +75,10 @@ module PragmaticTokenizer
|
|
74
75
|
text.gsub!(/:/o, ' :') || text
|
75
76
|
end
|
76
77
|
|
78
|
+
# Pads each semicolon with a space on both sides so that it survives the
# later whitespace split as its own token.
#
# @param text [String] the text being pre-processed (mutated in place)
# @return [String] the text with semicolons space-padded
def shift_semicolon(text)
  # gsub! returns nil when no substitution occurred, so fall back to text.
  # The original capture group and /o flag were redundant: the regex has no
  # interpolation, and the whole match is the semicolon itself.
  text.gsub!(/;/) { " #{Regexp.last_match(0)} " } || text
end
|
81
|
+
|
77
82
|
# Pads an ellipsis (a run of three or more consecutive dots) with spaces so
# the whole run survives the later whitespace split as a single token.
#
# @param text [String] the text being pre-processed (mutated in place)
# @return [String] the text with ellipses space-padded
def shift_ellipse(text)
  # gsub! returns nil when nothing matched, so fall back to text.
  # Dropped the meaningless /o flag (no interpolation in the pattern) and
  # replaced cryptic $1 with Regexp.last_match, consistent with shift_semicolon.
  text.gsub!(/(\.\.\.+)/) { " #{Regexp.last_match(1)} " } || text
end
|
@@ -65,7 +65,13 @@ module PragmaticTokenizer
|
|
65
65
|
|
66
66
|
# Removes noise tokens from the token list when the `clean` option is
# enabled; otherwise returns the list untouched.
#
# NOTE(review): `clean` (presumably an option accessor) and
# PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS are defined
# elsewhere in the gem — verify both before relying on this in isolation.
#
# @param tokens [Array<String>] tokens produced by the splitting passes
# @return [Array<String>] the same array, mutated via delete_if
def cleaner(tokens)
  return tokens unless clean
  tokens.delete_if do |t|
    t =~ /\A_+\z/ ||                         # underscores only
      t =~ /\A-+\z/ ||                       # hyphens only
      PragmaticTokenizer::Languages::Common::SPECIAL_CHARACTERS.include?(t) ||
      t =~ /\A\.{2,}\z/ ||                   # two or more dots only
      t.include?("\\") ||                    # anything containing a backslash
      t.length > 50 ||                       # overly long junk tokens
      # Fixed: the original /i flag was meaningless — the character class
      # contains no letters, so case-insensitivity had no effect.
      (t.length > 1 && t =~ /[#&*+<=>@^|~]/) # multi-char tokens with symbol chars
  end
end
|
70
76
|
|
71
77
|
def remove_punctuation(tokens)
|