pragmatic_tokenizer 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 93070f0374fe808125850e652cb0f05cdd262c3e
-  data.tar.gz: 0e69cfc518e9b734ab35e47d7eb0d656aaa22a76
+  metadata.gz: 7bfc15f5e19157e3d4f5aba7f781dc4ce11ab73a
+  data.tar.gz: 8416accd1749a6f51868bfc59060a7bc3f30c708
 SHA512:
-  metadata.gz: e96d74097ee19c69c03952341be97f425200ea056dabd0b36dfae4387df78ca1920e521aaad1c28aef0e3d07b35d68abd21b54d7e21fcfc110c873b1ee319449
-  data.tar.gz: 908a93161d227b7cdca4208157a339c12c075b416ceed0a59757a9227ebde77c1c87e9deae694ba2445cbd1a2dd00f8cb81a91051826de3b8596654272a4e9b4
+  metadata.gz: 4d3a59c05e2e526d619f70bfdcb12bc9bf5299897c4fff026883d5a23a849274fa9966d4cf47cff7aeb07639cf70a5ced4015e5974326b00f631c5815753088e
+  data.tar.gz: a82349e9152127ed195a800803e1ee828459e286a0910147d53e5331c8b16f66516521f4d53b6a6bc3b80afb8d6995f4ec1ff290d46d59d50728eb1f46391d19
@@ -90,8 +90,8 @@ module PragmaticTokenizer
 
     def shift_colon(text)
      return text unless text.include?(':') &&
-       text.partition(':').last[0] !~ /\A\d+/ &&
-       text.partition(':').first[-1] !~ /\A\d+/
+       (text.partition(':').last[0] !~ /\A\d+/ ||
+       text.partition(':').first[-1] !~ /\A\d+/)
      # Ignore web addresses
      text.gsub!(/(?<=[http|https]):(?=\/\/)/, PragmaticTokenizer::Languages::Common::PUNCTUATION_MAP[":"]) || text
      text.gsub!(/:/o, ' :') || text
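
Note on this hunk: with the original &&, the colon shift was skipped whenever a digit touched either side of the colon; with the parenthesized ||, it is skipped only when digits flank the colon on both sides, as in a time like "10:30". A minimal standalone sketch of the new guard, copied from the hunk above (shift_needed? is a hypothetical helper name for illustration, not part of the gem):

  # Returns true when the colon should be split off from the token,
  # false for time-like strings where digits sit on both sides of ':'.
  def shift_needed?(text)
    text.include?(':') &&
      (text.partition(':').last[0] !~ /\A\d+/ ||
       text.partition(':').first[-1] !~ /\A\d+/)
  end

  shift_needed?("10:30")    #=> false (digits on both sides, leave the time intact)
  shift_needed?("Amount:5") #=> true  (letter before the colon, so the colon is shifted)
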
@@ -53,10 +53,11 @@ module PragmaticTokenizer
         split_at_middle_period_1(
         split_at_middle_period_2(
         split_beginning_period(
+        split_at_plus_sign(
         shift_no_spaces_between_sentences(
         split_at_forward_slash(
         processor.new(language: language_module).process(text: segment)
-        ))))))))))))).reject { |t| t.empty? }
+        )))))))))))))).reject { |t| t.empty? }
       end
       tokens.flatten
     end
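
Note on this hunk: the new split_at_plus_sign call adds one more wrapper to the nested pre-processing chain, which is why the run of closing parentheses before .reject grows by one. The method itself is defined in the next hunk, with a short behavioral sketch after it.
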
@@ -192,6 +193,10 @@ module PragmaticTokenizer
       tokens.flat_map { |t| t.include?("/") && t !~ /(http|https|www)(\.|:)/ ? t.gsub!(/\//, '\1 \2').split(' ').flatten : t }
     end
 
+    def split_at_plus_sign(tokens)
+      tokens.flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
+    end
+
     def find_contractions(tokens)
       return tokens unless expand_contractions && language_module::CONTRACTIONS
       if downcase
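
Note on this hunk: a rough standalone illustration of the added method, copied from the hunk above (run in isolation here, not through the gem's public interface). Because the '\1 \2' replacement references capture groups that /\+/ does not define, each plus sign is effectively replaced by a single space before the token is re-split:

  # Splits any token containing '+' on the plus sign; other tokens pass through.
  def split_at_plus_sign(tokens)
    tokens.flat_map { |t| t.include?("+") ? t.gsub!(/\+/, '\1 \2').split(' ').flatten : t }
  end

  split_at_plus_sign(["2+2", "hello"]) #=> ["2", "2", "hello"]
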
@@ -1,3 +1,3 @@
 module PragmaticTokenizer
-  VERSION = "0.3.3"
+  VERSION = "0.3.4"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_tokenizer
 version: !ruby/object:Gem::Version
-  version: 0.3.3
+  version: 0.3.4
 platform: ruby
 authors:
 - Kevin S. Dias