pragmatic_tokenizer 0.3.3 → 0.3.4
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7bfc15f5e19157e3d4f5aba7f781dc4ce11ab73a
|
4
|
+
data.tar.gz: 8416accd1749a6f51868bfc59060a7bc3f30c708
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4d3a59c05e2e526d619f70bfdcb12bc9bf5299897c4fff026883d5a23a849274fa9966d4cf47cff7aeb07639cf70a5ced4015e5974326b00f631c5815753088e
|
7
|
+
data.tar.gz: a82349e9152127ed195a800803e1ee828459e286a0910147d53e5331c8b16f66516521f4d53b6a6bc3b80afb8d6995f4ec1ff290d46d59d50728eb1f46391d19
|
@@ -90,8 +90,8 @@ module PragmaticTokenizer
|
|
90
90
|
|
91
91
|
def shift_colon(text)
|
92
92
|
return text unless text.include?(':') &&
|
93
|
-
text.partition(':').last[0] !~ /\A\d+/
|
94
|
-
text.partition(':').first[-1] !~ /\A\d+/
|
93
|
+
(text.partition(':').last[0] !~ /\A\d+/ ||
|
94
|
+
text.partition(':').first[-1] !~ /\A\d+/)
|
95
95
|
# Ignore web addresses
|
96
96
|
text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
|
97
97
|
text.gsub!(/:/o, ' :') || text
|
@@ -53,10 +53,11 @@ module PragmaticTokenizer
|
|
53
53
|
split_at_middle_period_1(
|
54
54
|
split_at_middle_period_2(
|
55
55
|
split_beginning_period(
|
56
|
+
split_at_plus_sign(
|
56
57
|
shift_no_spaces_between_sentences(
|
57
58
|
split_at_forward_slash(
|
58
59
|
processor.new(language: language_module).process(text: segment)
|
59
|
-
))))))))))))).reject { |t| t.empty? }
|
60
|
+
)))))))))))))).reject { |t| t.empty? }
|
60
61
|
end
|
61
62
|
tokens.flatten
|
62
63
|
end
|
@@ -192,6 +193,10 @@ module PragmaticTokenizer
|
|
192
193
|
tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
|
193
194
|
end
|
194
195
|
|
196
|
+
def split_at_plus_sign(tokens)
|
197
|
+
tokens.flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
|
198
|
+
end
|
199
|
+
|
195
200
|
def find_contractions(tokens)
|
196
201
|
return tokens unless expand_contractions && language_module::CONTRACTIONS
|
197
202
|
if downcase
|