pragmatic_tokenizer 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7bfc15f5e19157e3d4f5aba7f781dc4ce11ab73a
|
4
|
+
data.tar.gz: 8416accd1749a6f51868bfc59060a7bc3f30c708
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4d3a59c05e2e526d619f70bfdcb12bc9bf5299897c4fff026883d5a23a849274fa9966d4cf47cff7aeb07639cf70a5ced4015e5974326b00f631c5815753088e
|
7
|
+
data.tar.gz: a82349e9152127ed195a800803e1ee828459e286a0910147d53e5331c8b16f66516521f4d53b6a6bc3b80afb8d6995f4ec1ff290d46d59d50728eb1f46391d19
|
@@ -90,8 +90,8 @@ module PragmaticTokenizer
|
|
90
90
|
|
91
91
|
def shift_colon(text)
|
92
92
|
return text unless text.include?(':') &&
|
93
|
-
text.partition(':').last[0] !~ /\A\d+/
|
94
|
-
text.partition(':').first[-1] !~ /\A\d+/
|
93
|
+
(text.partition(':').last[0] !~ /\A\d+/ ||
|
94
|
+
text.partition(':').first[-1] !~ /\A\d+/)
|
95
95
|
# Ignore web addresses
|
96
96
|
text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
|
97
97
|
text.gsub!(/:/o, ' :') || text
|
@@ -53,10 +53,11 @@ module PragmaticTokenizer
|
|
53
53
|
split_at_middle_period_1(
|
54
54
|
split_at_middle_period_2(
|
55
55
|
split_beginning_period(
|
56
|
+
split_at_plus_sign(
|
56
57
|
shift_no_spaces_between_sentences(
|
57
58
|
split_at_forward_slash(
|
58
59
|
processor.new(language: language_module).process(text: segment)
|
59
|
-
))))))))))))).reject { |t| t.empty? }
|
60
|
+
)))))))))))))).reject { |t| t.empty? }
|
60
61
|
end
|
61
62
|
tokens.flatten
|
62
63
|
end
|
@@ -192,6 +193,10 @@ module PragmaticTokenizer
|
|
192
193
|
tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
|
193
194
|
end
|
194
195
|
|
196
|
+
def split_at_plus_sign(tokens)
|
197
|
+
tokens.flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
|
198
|
+
end
|
199
|
+
|
195
200
|
def find_contractions(tokens)
|
196
201
|
return tokens unless expand_contractions && language_module::CONTRACTIONS
|
197
202
|
if downcase
|