pragmatic_tokenizer 0.3.3 → 0.3.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 93070f0374fe808125850e652cb0f05cdd262c3e
4
- data.tar.gz: 0e69cfc518e9b734ab35e47d7eb0d656aaa22a76
3
+ metadata.gz: 7bfc15f5e19157e3d4f5aba7f781dc4ce11ab73a
4
+ data.tar.gz: 8416accd1749a6f51868bfc59060a7bc3f30c708
5
5
  SHA512:
6
- metadata.gz: e96d74097ee19c69c03952341be97f425200ea056dabd0b36dfae4387df78ca1920e521aaad1c28aef0e3d07b35d68abd21b54d7e21fcfc110c873b1ee319449
7
- data.tar.gz: 908a93161d227b7cdca4208157a339c12c075b416ceed0a59757a9227ebde77c1c87e9deae694ba2445cbd1a2dd00f8cb81a91051826de3b8596654272a4e9b4
6
+ metadata.gz: 4d3a59c05e2e526d619f70bfdcb12bc9bf5299897c4fff026883d5a23a849274fa9966d4cf47cff7aeb07639cf70a5ced4015e5974326b00f631c5815753088e
7
+ data.tar.gz: a82349e9152127ed195a800803e1ee828459e286a0910147d53e5331c8b16f66516521f4d53b6a6bc3b80afb8d6995f4ec1ff290d46d59d50728eb1f46391d19
@@ -90,8 +90,8 @@ module PragmaticTokenizer
90
90
 
91
91
  def shift_colon(text)
92
92
  return text unless text.include?(':') &&
93
- text.partition(':').last[0] !~ /\A\d+/ &&
94
- text.partition(':').first[-1] !~ /\A\d+/
93
+ (text.partition(':').last[0] !~ /\A\d+/ ||
94
+ text.partition(':').first[-1] !~ /\A\d+/)
95
95
  # Ignore web addresses
96
96
  text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
97
97
  text.gsub!(/:/o, ' :') || text
@@ -53,10 +53,11 @@ module PragmaticTokenizer
53
53
  split_at_middle_period_1(
54
54
  split_at_middle_period_2(
55
55
  split_beginning_period(
56
+ split_at_plus_sign(
56
57
  shift_no_spaces_between_sentences(
57
58
  split_at_forward_slash(
58
59
  processor.new(language: language_module).process(text: segment)
59
- ))))))))))))).reject { |t| t.empty? }
60
+ )))))))))))))).reject { |t| t.empty? }
60
61
  end
61
62
  tokens.flatten
62
63
  end
@@ -192,6 +193,10 @@ module PragmaticTokenizer
192
193
  tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
193
194
  end
194
195
 
196
+ def split_at_plus_sign(tokens)
197
+ tokens.flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
198
+ end
199
+
195
200
  def find_contractions(tokens)
196
201
  return tokens unless expand_contractions && language_module::CONTRACTIONS
197
202
  if downcase
@@ -1,3 +1,3 @@
1
1
  module PragmaticTokenizer
2
- VERSION = "0.3.3"
2
+ VERSION = "0.3.4"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pragmatic_tokenizer
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.3.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kevin S. Dias