RubyGems - pragmatic_segmenter - Versions diffs - 0.3.4 → 0.3.5 - Mend

pragmatic_segmenter 0.3.4 → 0.3.5

Files changed (9)

checksums.yaml +4 -4
data/lib/pragmatic_segmenter/abbreviation_replacer.rb +24 -21
data/lib/pragmatic_segmenter/languages/arabic.rb +2 -1
data/lib/pragmatic_segmenter/languages/deutsch.rb +2 -1
data/lib/pragmatic_segmenter/languages/persian.rb +2 -1
data/lib/pragmatic_segmenter/languages/russian.rb +4 -3
data/lib/pragmatic_segmenter/version.rb +1 -1
data/spec/pragmatic_segmenter_spec.rb +0 -1
metadata +3 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a9cb6133aca84f8c6ff233ec6fb34b276cf47964
-  data.tar.gz: 00c1f664707e86e5c2ae5740c53acde5c814ece8
+  metadata.gz: 805bb57215b38dc30f107be60c18c141d8dc1297
+  data.tar.gz: 389bf5f6700f44cb255279a4391c849d0ac67f69
 SHA512:
-  metadata.gz: d5726605fa78ec4067c79ed592a7983f2638b26a81fb88cf23bdffeb26d842c0eaed39a531181ecef6456218208f6d297e4316b36fc7f4a15f4deb2ebb7cb800
-  data.tar.gz: a1c99c7f3c73c1624a2b1d4792c8937dd27d3dfd4667dcb05844376d5ecad13ebe065f18fe3f40a61a4aa23baa3e8ab9b4dc3bc547821c6d9fc5700cd5a16f20
+  metadata.gz: 749c13c831913bcd606f0d08194d09ca982c7e033c59c8520bf1b20a4146934725ab8252bc54362815cd34f5ea45f8c5ed1cde2da80b8beb506093bd86138522
+  data.tar.gz: d5f05492994604c023fc357b7b030878eb2d8554f2d898efe196530a5777d394359acd4e8c600451cec511df4e2dfafa30cc34d2589a8d983ae3b11299aa9869

data/lib/pragmatic_segmenter/abbreviation_replacer.rb CHANGED Viewed

@@ -67,7 +67,7 @@ module PragmaticSegmenter
       # Some might say that the set of words that follow an
       # abbreviation such as U.S. (i.e. U.S. Government) is smaller than
       # the set of words that could start a sentence and
-      # never follow U.S. However, we  are being conservative
+      # never follow U.S. However, we are being conservative
       # and not splitting by default, so we need to look for places
       # where we definitely can split. Obviously SENTENCE_STARTERS
       # will never cover all cases, but as the gem is named
@@ -76,17 +76,17 @@ module PragmaticSegmenter
       # sentence but could never follow one of the abbreviations below.
       SENTENCE_STARTERS.each do |word|
-        txt = txt.gsub(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
-              .gsub(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
-              .gsub(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
-              .gsub(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
-              .gsub(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
-              .gsub(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
-              .gsub(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
-              .gsub(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
-              .gsub(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
-              .gsub(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
-              .gsub(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
       end
       txt
     end
@@ -95,29 +95,32 @@ module PragmaticSegmenter
       mpa = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX)
       return txt if mpa.empty?
       mpa.each do |r|
-        txt = txt.gsub(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
+        txt.gsub!(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
       end
       txt
     end
     def replace_pre_number_abbr(txt, abbr)
-      txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
-         .gsub(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
+      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
+      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
+      txt
     end
     def replace_prepositive_abbr(txt, abbr)
-      txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
-         .gsub(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
+      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
+      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
+      txt
     end
     def replace_period_of_abbr(txt, abbr)
-      txt.gsub(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
-         .gsub(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
+      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
+      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
+      txt
     end
     def replace_possessive_abbreviations(txt)
-      txt.gsub(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
+      txt.gsub!(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
+      txt
     end
   end
 end

data/lib/pragmatic_segmenter/languages/arabic.rb CHANGED Viewed

@@ -22,7 +22,8 @@ module PragmaticSegmenter
         private
         def scan_for_replacements(txt, am, index, character_array)
-          txt.gsub(/(?<=#{am})\./, '∯')
+          txt.gsub!(/(?<=#{am})\./, '∯')
+          txt
         end
       end
     end

data/lib/pragmatic_segmenter/languages/deutsch.rb CHANGED Viewed

@@ -75,7 +75,8 @@ module PragmaticSegmenter
         private
         def scan_for_replacements(txt, am, index, character_array)
-          txt.gsub(/(?<=#{am})\.(?=\s)/, '∯')
+          txt.gsub!(/(?<=#{am})\.(?=\s)/, '∯')
+          txt
         end
       end

data/lib/pragmatic_segmenter/languages/persian.rb CHANGED Viewed

@@ -13,7 +13,8 @@ module PragmaticSegmenter
         private
         def scan_for_replacements(txt, am, index, character_array)
-          txt.gsub(/(?<=#{am})\./, '∯')
+          txt.gsub!(/(?<=#{am})\./, '∯')
+          txt
         end
       end
     end

data/lib/pragmatic_segmenter/languages/russian.rb CHANGED Viewed

@@ -13,9 +13,10 @@ module PragmaticSegmenter
         private
         def replace_period_of_abbr(txt, abbr)
-          txt.gsub(/(?<=\s#{abbr.strip})\./, '∯')
-            .gsub(/(?<=\A#{abbr.strip})\./, '∯')
-            .gsub(/(?<=^#{abbr.strip})\./, '∯')
+          txt.gsub!(/(?<=\s#{abbr.strip})\./, '∯')
+          txt.gsub!(/(?<=\A#{abbr.strip})\./, '∯')
+          txt.gsub!(/(?<=^#{abbr.strip})\./, '∯')
+          txt
         end
       end
     end

data/lib/pragmatic_segmenter/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module PragmaticSegmenter
-  VERSION = "0.3.4"
+  VERSION = "0.3.5"
 end

data/spec/pragmatic_segmenter_spec.rb CHANGED Viewed

@@ -58,5 +58,4 @@ RSpec.describe PragmaticSegmenter::Segmenter do
       end
     end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_segmenter
 version: !ruby/object:Gem::Version
-  version: 0.3.4
+  version: 0.3.5
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-12-22 00:00:00.000000000 Z
+date: 2016-01-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -160,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.8
+rubygems_version: 2.4.1
 signing_key:
 specification_version: 4
 summary: A rule-based sentence boundary detection gem that works out-of-the-box across