RubyGems - pragmatic_segmenter - Versions diffs - 0.3.4 → 0.3.5 - Mend

pragmatic_segmenter 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/lib/pragmatic_segmenter/abbreviation_replacer.rb +24 -21
data/lib/pragmatic_segmenter/languages/arabic.rb +2 -1
data/lib/pragmatic_segmenter/languages/deutsch.rb +2 -1
data/lib/pragmatic_segmenter/languages/persian.rb +2 -1
data/lib/pragmatic_segmenter/languages/russian.rb +4 -3
data/lib/pragmatic_segmenter/version.rb +1 -1
data/spec/pragmatic_segmenter_spec.rb +0 -1
metadata +3 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a9cb6133aca84f8c6ff233ec6fb34b276cf47964
-  data.tar.gz: 00c1f664707e86e5c2ae5740c53acde5c814ece8
+  metadata.gz: 805bb57215b38dc30f107be60c18c141d8dc1297
+  data.tar.gz: 389bf5f6700f44cb255279a4391c849d0ac67f69
 SHA512:
-  metadata.gz: d5726605fa78ec4067c79ed592a7983f2638b26a81fb88cf23bdffeb26d842c0eaed39a531181ecef6456218208f6d297e4316b36fc7f4a15f4deb2ebb7cb800
-  data.tar.gz: a1c99c7f3c73c1624a2b1d4792c8937dd27d3dfd4667dcb05844376d5ecad13ebe065f18fe3f40a61a4aa23baa3e8ab9b4dc3bc547821c6d9fc5700cd5a16f20
+  metadata.gz: 749c13c831913bcd606f0d08194d09ca982c7e033c59c8520bf1b20a4146934725ab8252bc54362815cd34f5ea45f8c5ed1cde2da80b8beb506093bd86138522
+  data.tar.gz: d5f05492994604c023fc357b7b030878eb2d8554f2d898efe196530a5777d394359acd4e8c600451cec511df4e2dfafa30cc34d2589a8d983ae3b11299aa9869

data/lib/pragmatic_segmenter/abbreviation_replacer.rb CHANGED Viewed

@@ -67,7 +67,7 @@ module PragmaticSegmenter
       # Some might say that the set of words that follow an
       # abbreviation such as U.S. (i.e. U.S. Government) is smaller than
       # the set of words that could start a sentence and
-      # never follow U.S. However, we  are being conservative
+      # never follow U.S. However, we are being conservative
       # and not splitting by default, so we need to look for places
       # where we definitely can split. Obviously SENTENCE_STARTERS
       # will never cover all cases, but as the gem is named
@@ -76,17 +76,17 @@ module PragmaticSegmenter
       # sentence but could never follow one of the abbreviations below.
       SENTENCE_STARTERS.each do |word|
-        txt = txt.gsub(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
-              .gsub(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
-              .gsub(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
-              .gsub(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
-              .gsub(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
-              .gsub(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
-              .gsub(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
-              .gsub(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
-              .gsub(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
-              .gsub(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
-              .gsub(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/U∯S∯\s#{Regexp.escape(word)}\s/, "U∯S\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/U\.S∯\s#{Regexp.escape(word)}\s/, "U\.S\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/U∯K∯\s#{Regexp.escape(word)}\s/, "U∯K\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/U\.K∯\s#{Regexp.escape(word)}\s/, "U\.K\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/E∯U∯\s#{Regexp.escape(word)}\s/, "E∯U\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/E\.U∯\s#{Regexp.escape(word)}\s/, "E\.U\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/U∯S∯A∯\s#{Regexp.escape(word)}\s/, "U∯S∯A\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/U\.S\.A∯\s#{Regexp.escape(word)}\s/, "U\.S\.A\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/I∯\s#{Regexp.escape(word)}\s/, "I\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/i.v∯\s#{Regexp.escape(word)}\s/, "i\.v\.\s#{Regexp.escape(word)}\s")
+        txt.gsub!(/I.V∯\s#{Regexp.escape(word)}\s/, "I\.V\.\s#{Regexp.escape(word)}\s")
       end
       txt
     end
@@ -95,29 +95,32 @@ module PragmaticSegmenter
       mpa = txt.scan(@language::MULTI_PERIOD_ABBREVIATION_REGEX)
       return txt if mpa.empty?
       mpa.each do |r|
-        txt = txt.gsub(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
+        txt.gsub!(/#{Regexp.escape(r)}/, "#{r.gsub!('.', '∯')}")
       end
       txt
     end
     def replace_pre_number_abbr(txt, abbr)
-      txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
-         .gsub(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
+      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s\d)|(?<=^#{abbr.strip})\.(?=\s\d)/, '∯')
+      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s+\()|(?<=^#{abbr.strip})\.(?=\s+\()/, '∯')
+      txt
     end
     def replace_prepositive_abbr(txt, abbr)
-      txt.gsub(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
-         .gsub(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
+      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=\s)|(?<=^#{abbr.strip})\.(?=\s)/, '∯')
+      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=:\d+)|(?<=^#{abbr.strip})\.(?=:\d+)/, '∯')
+      txt
     end
     def replace_period_of_abbr(txt, abbr)
-      txt.gsub(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
-         .gsub(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
+      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))|(?<=^#{abbr.strip})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))/, '∯')
+      txt.gsub!(/(?<=\s#{abbr.strip})\.(?=,)|(?<=^#{abbr.strip})\.(?=,)/, '∯')
+      txt
     end
     def replace_possessive_abbreviations(txt)
-      txt.gsub(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
+      txt.gsub!(@language::POSSESSIVE_ABBREVIATION_REGEX, '∯')
+      txt
     end
   end
 end

data/lib/pragmatic_segmenter/languages/arabic.rb CHANGED Viewed

@@ -22,7 +22,8 @@ module PragmaticSegmenter
         private
         def scan_for_replacements(txt, am, index, character_array)
-          txt.gsub(/(?<=#{am})\./, '∯')
+          txt.gsub!(/(?<=#{am})\./, '∯')
+          txt
         end
       end
     end

data/lib/pragmatic_segmenter/languages/deutsch.rb CHANGED Viewed

@@ -75,7 +75,8 @@ module PragmaticSegmenter
         private
         def scan_for_replacements(txt, am, index, character_array)
-          txt.gsub(/(?<=#{am})\.(?=\s)/, '∯')
+          txt.gsub!(/(?<=#{am})\.(?=\s)/, '∯')
+          txt
         end
       end

data/lib/pragmatic_segmenter/languages/persian.rb CHANGED Viewed

@@ -13,7 +13,8 @@ module PragmaticSegmenter
         private
         def scan_for_replacements(txt, am, index, character_array)
-          txt.gsub(/(?<=#{am})\./, '∯')
+          txt.gsub!(/(?<=#{am})\./, '∯')
+          txt
         end
       end
     end

data/lib/pragmatic_segmenter/languages/russian.rb CHANGED Viewed

@@ -13,9 +13,10 @@ module PragmaticSegmenter
         private
         def replace_period_of_abbr(txt, abbr)
-          txt.gsub(/(?<=\s#{abbr.strip})\./, '∯')
-            .gsub(/(?<=\A#{abbr.strip})\./, '∯')
-            .gsub(/(?<=^#{abbr.strip})\./, '∯')
+          txt.gsub!(/(?<=\s#{abbr.strip})\./, '∯')
+          txt.gsub!(/(?<=\A#{abbr.strip})\./, '∯')
+          txt.gsub!(/(?<=^#{abbr.strip})\./, '∯')
+          txt
         end
       end
     end

data/lib/pragmatic_segmenter/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module PragmaticSegmenter
-  VERSION = "0.3.4"
+  VERSION = "0.3.5"
 end

data/spec/pragmatic_segmenter_spec.rb CHANGED Viewed

@@ -58,5 +58,4 @@ RSpec.describe PragmaticSegmenter::Segmenter do
       end
     end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_segmenter
 version: !ruby/object:Gem::Version
-  version: 0.3.4
+  version: 0.3.5
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-12-22 00:00:00.000000000 Z
+date: 2016-01-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -160,7 +160,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.8
+rubygems_version: 2.4.1
 signing_key:
 specification_version: 4
 summary: A rule-based sentence boundary detection gem that works out-of-the-box across