RubyGems - pragmatic_segmenter - Versions diffs - 0.3.20 → 0.3.21 - Mend

pragmatic_segmenter 0.3.20 → 0.3.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/NEWS +5 -0
data/README.md +4 -0
data/lib/pragmatic_segmenter/languages/common.rb +2 -0
data/lib/pragmatic_segmenter/languages/common/numbers.rb +2 -0
data/lib/pragmatic_segmenter/processor.rb +6 -0
data/lib/pragmatic_segmenter/version.rb +1 -1
data/spec/pragmatic_segmenter/languages/english_spec.rb +36 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 49b8f9ed555c4c3adba18700c7a910e3b67083aa
-  data.tar.gz: 0907aa732977255028708a162eece4e6225634da
+  metadata.gz: a13bc873b763cc68e4bfebcd0c291d3eab12a429
+  data.tar.gz: 85666c4d354ba1a0e5118c0e3e23b9c4d1978ae4
 SHA512:
-  metadata.gz: cfe1b18483d7fbe83d4bfe52971ab7ca95321e89c375e06d1c0e5686c7f2818930cf842a2963fb69bb08b213e6c5480a1fe1171610a8d08629089b028e5899f2
-  data.tar.gz: 4b0d8fdd97ce03c6e205e3c47312486e157ffa926e8ddb50f1e825ae75327379b3aee0dbbc61322bb15f1594535aecbf839a5842893f82d9a74caf2389a7f065
+  metadata.gz: f3c211daa3aaf71d4ff1d363ee15711c8979db6fa3cf9fb87d84108466f0d8fe9ee7b9694ad32a6439a79c471ab013e70f536601d951a7e9400addb954f35c76
+  data.tar.gz: de40f070a216d90cff094bb6cfdb83b1f1c108774ff67da18e58846fc69aab78decc234f0b0c7fda7931ffcc7e8ca1e427d28fd495922a28aa91df02981917e6

data/NEWS CHANGED Viewed

@@ -1,3 +1,8 @@
+0.3.21 (2018-08-30):
+* Improvement: Add support for file formats
+* Improvement: Add support for numeric references at the end of a sentence (i.e. Wikipedia references)
 0.3.20 (2018-08-28):
 * Improvement: Handle slanted single quotation as a single quote

data/README.md CHANGED Viewed

@@ -874,6 +874,10 @@ To test the relative performance of different segmentation tools and libraries I
 * Add support for Chinese caret brackets
 * Add viz as abbreviation
+**Version 0.3.21**
+* Add support for file formats
+* Add support for numeric references at the end of a sentence (i.e. Wikipedia references)
 ## Contributing
 If you find a text that is incorrectly segmented using this gem, please submit an issue.

data/lib/pragmatic_segmenter/languages/common.rb CHANGED Viewed

@@ -24,6 +24,8 @@ module PragmaticSegmenter
       # Rubular: http://rubular.com/r/G2opjedIm9
       GeoLocationRule = Rule.new(/(?<=[a-zA-z]°)\.(?=\s*\d+)/, '∯')
+      FileFormatRule = Rule.new(/(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)/, '∯')
       SingleNewLineRule = Rule.new(/\n/, 'ȹ')
       module DoublePunctuationRules

data/lib/pragmatic_segmenter/languages/common/numbers.rb CHANGED Viewed

@@ -47,6 +47,8 @@ module PragmaticSegmenter
       # Rubular: http://rubular.com/r/mQ8Es9bxtk
       CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
+      NUMBERED_REFERENCE_REGEX = /(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)*\d{1,3}))(\s)(?=[A-Z])/
       # Rubular: http://rubular.com/r/yqa4Rit8EY
       PossessiveAbbreviationRule = Rule.new(/\.(?='s\s)|\.(?='s$)|\.(?='s\z)/, '∯')

data/lib/pragmatic_segmenter/processor.rb CHANGED Viewed

@@ -23,8 +23,10 @@ module PragmaticSegmenter
       replace_abbreviations
       replace_numbers
       replace_continuous_punctuation
+      replace_periods_before_numeric_references
       @text.apply(@language::Abbreviations::WithMultiplePeriodsAndEmailRule)
       @text.apply(@language::GeoLocationRule)
+      @text.apply(@language::FileFormatRule)
       split_into_segments
     end
@@ -68,6 +70,10 @@ module PragmaticSegmenter
       end
     end
+    def replace_periods_before_numeric_references
+      @text.gsub!(@language::NUMBERED_REFERENCE_REGEX, "∯\\2\r\\7")
+    end
     def consecutive_underscore?(txt)
       # Rubular: http://rubular.com/r/fTF2Ff3WBL
       txt.gsub(/_{3,}/, '').length.eql?(0)

data/lib/pragmatic_segmenter/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module PragmaticSegmenter
-  VERSION = "0.3.20"
+  VERSION = "0.3.21"
 end

data/spec/pragmatic_segmenter/languages/english_spec.rb CHANGED Viewed

@@ -1426,5 +1426,41 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
       ps = PragmaticSegmenter::Segmenter.new(text: "Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia. Modeling Open Source Software Communities, ProQuest Dissertations and Theses.")
       expect(ps.segment).to eq(["Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia.", "Modeling Open Source Software Communities, ProQuest Dissertations and Theses."])
     end
+    it 'correctly segments text #122' do
+      ps = PragmaticSegmenter::Segmenter.new(text: "These include images of various modes of transport and members of the team, all available in .jpeg format. Images can be downloaded from our website. We also offer archives as .zip files.")
+      expect(ps.segment).to eq(["These include images of various modes of transport and members of the team, all available in .jpeg format.", "Images can be downloaded from our website.", "We also offer archives as .zip files."])
+    end
+    it 'correctly segments text #123' do
+      ps = PragmaticSegmenter::Segmenter.new(text: "Saint Maximus (died 250) is a Christian saint and martyr.[1] The emperor Decius published a decree ordering the veneration of busts of the deified emperors.")
+      expect(ps.segment).to eq(["Saint Maximus (died 250) is a Christian saint and martyr.[1]", "The emperor Decius published a decree ordering the veneration of busts of the deified emperors."])
+    end
+    it 'correctly segments text #124' do
+      ps = PragmaticSegmenter::Segmenter.new(text: "Differing agendas can potentially create an understanding gap in a consultation.11 12 Take the example of one of the most common presentations in ill health: the common cold.")
+      expect(ps.segment).to eq(["Differing agendas can potentially create an understanding gap in a consultation.11 12", "Take the example of one of the most common presentations in ill health: the common cold."])
+    end
+    it 'correctly segments text #125' do
+      ps = PragmaticSegmenter::Segmenter.new(text: "Daniel Kahneman popularised the concept of fast and slow thinking: the distinction between instinctive (type 1 thinking) and reflective, analytical cognition (type 2).10 This model relates to doctors achieving a balance between efficiency and effectiveness.")
+      expect(ps.segment).to eq(["Daniel Kahneman popularised the concept of fast and slow thinking: the distinction between instinctive (type 1 thinking) and reflective, analytical cognition (type 2).10", "This model relates to doctors achieving a balance between efficiency and effectiveness."])
+    end
+    it 'correctly segments text #126' do
+      ps = PragmaticSegmenter::Segmenter.new(text: "Its traditional use[1] is well documented in the ethnobotanical literature [2–11]. Leaves, buds, tar and essential oils are used to treat a wide spectrum of diseases.")
+      expect(ps.segment).to eq(["Its traditional use[1] is well documented in the ethnobotanical literature [2–11].", "Leaves, buds, tar and essential oils are used to treat a wide spectrum of diseases."])
+    end
+    it 'correctly segments text #127' do
+      ps = PragmaticSegmenter::Segmenter.new(text: "Thus increasing the desire for political reform both in Lancashire and in the country at large.[7][8] This was a serious misdemeanour,[16] encouraging them to declare the assembly illegal as soon as it was announced on 31 July.[17][18] The radicals sought a second opinion on the meeting's legality.")
+      expect(ps.segment).to eq(["Thus increasing the desire for political reform both in Lancashire and in the country at large.[7][8]", "This was a serious misdemeanour,[16] encouraging them to declare the assembly illegal as soon as it was announced on 31 July.[17][18]", "The radicals sought a second opinion on the meeting's legality."])
+    end
+    it 'correctly segments text #128' do
+      ps = PragmaticSegmenter::Segmenter.new(text: "The table in (4) is a sample from the Wall Street Journal (1987).1 According to the distribution all the pairs given in (4) count as candidates for abbreviations.")
+      expect(ps.segment).to eq([ "The table in (4) is a sample from the Wall Street Journal (1987).1", "According to the distribution all the pairs given in (4) count as candidates for abbreviations."])
+    end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_segmenter
 version: !ruby/object:Gem::Version
-  version: 0.3.20
+  version: 0.3.21
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-08-27 00:00:00.000000000 Z
+date: 2018-08-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode