RubyGems - pragmatic_segmenter - Versions diffs - 0.3.19 → 0.3.20 - Mend

pragmatic_segmenter 0.3.19 → 0.3.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

checksums.yaml +4 -4
data/NEWS +7 -0
data/README.md +6 -0
data/lib/pragmatic_segmenter/between_punctuation.rb +10 -0
data/lib/pragmatic_segmenter/languages/chinese.rb +26 -0
data/lib/pragmatic_segmenter/languages/common.rb +1 -1
data/lib/pragmatic_segmenter/languages/common/numbers.rb +2 -2
data/lib/pragmatic_segmenter/version.rb +1 -1
data/spec/pragmatic_segmenter/languages/chinese_spec.rb +5 -0
data/spec/pragmatic_segmenter/languages/english_spec.rb +15 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 3f03709232bf5b433b73de74a04b254228bc3ce6
-  data.tar.gz: 087860da234619d431849de1f85614b876e6d31a
+  metadata.gz: 49b8f9ed555c4c3adba18700c7a910e3b67083aa
+  data.tar.gz: 0907aa732977255028708a162eece4e6225634da
 SHA512:
-  metadata.gz: 6d2759f5a302f6acf40b34990df8fd20d352b0aef5dfdd1799e5f7303d065466964566c068c505460837e838a8fa296754031084ef42715aab709f5d7a2f882c
-  data.tar.gz: 0ad1c988cff72e818fe9fdd4de77831e792e20834f27c6a40b34f594e8ef5a9927fdf1da259cdcd7a7d990b592c53f09bba11c59432b74d3b6a6f46615186378
+  metadata.gz: cfe1b18483d7fbe83d4bfe52971ab7ca95321e89c375e06d1c0e5686c7f2818930cf842a2963fb69bb08b213e6c5480a1fe1171610a8d08629089b028e5899f2
+  data.tar.gz: 4b0d8fdd97ce03c6e205e3c47312486e157ffa926e8ddb50f1e825ae75327379b3aee0dbbc61322bb15f1594535aecbf839a5842893f82d9a74caf2389a7f065

data/NEWS CHANGED Viewed

@@ -1,3 +1,10 @@
+0.3.20 (2018-08-28):
+* Improvement: Handle slanted single quotation as a single quote
+* Bug Fix: The text contains a single character abbreviation as part of a list
+* Bug Fix: Chinese book quotes
+* Improvement: Add viz as abbreviation
 0.3.19 (2018-07-19):
 * Bug Fix: A parenthetical following an abbreviation is now included as part of the same segment. Example: "The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)." is now treated as one segment.

data/README.md CHANGED Viewed

@@ -868,6 +868,12 @@ To test the relative performance of different segmentation tools and libraries I
 **Version 0.3.19**
 * Treat a parenthetical following an abbreviation as part of the same segment
+**Version 0.3.20**
+* Handle slanted single quotation as a single quote
+* Handle a single character abbreviation as part of a list
+* Add support for Chinese caret brackets
+* Add viz as abbreviation
 ## Contributing
 If you find a text that is incorrectly segmented using this gem, please submit an issue.

data/lib/pragmatic_segmenter/between_punctuation.rb CHANGED Viewed

@@ -8,6 +8,8 @@ module PragmaticSegmenter
     # Rubular: http://rubular.com/r/2YFrKWQUYi
     BETWEEN_SINGLE_QUOTES_REGEX = /(?<=\s)'(?:[^']|'[a-zA-Z])*'/
+    BETWEEN_SINGLE_QUOTE_SLANTED_REGEX = /(?<=\s)‘(?:[^’]|’[a-zA-Z])*’/
     # Rubular: http://rubular.com/r/3Pw1QlXOjd
     BETWEEN_DOUBLE_QUOTES_REGEX = /"(?>[^"\\]+|\\{2}|\\.)*"/
@@ -42,6 +44,7 @@ module PragmaticSegmenter
     def sub_punctuation_between_quotes_and_parens(txt)
       sub_punctuation_between_single_quotes(txt)
+      sub_punctuation_between_single_quote_slanted(txt)
       sub_punctuation_between_double_quotes(txt)
       sub_punctuation_between_square_brackets(txt)
       sub_punctuation_between_parens(txt)
@@ -74,6 +77,13 @@ module PragmaticSegmenter
       end
     end
+    def sub_punctuation_between_single_quote_slanted(txt)
+      PragmaticSegmenter::PunctuationReplacer.new(
+        matches_array: txt.scan(BETWEEN_SINGLE_QUOTE_SLANTED_REGEX),
+        text: txt
+      ).replace
+    end
     def sub_punctuation_between_double_quotes(txt)
       PragmaticSegmenter::PunctuationReplacer.new(
         matches_array: btwn_dbl_quote(txt),

data/lib/pragmatic_segmenter/languages/chinese.rb CHANGED Viewed

@@ -8,6 +8,32 @@ module PragmaticSegmenter
       class AbbreviationReplacer < AbbreviationReplacer
         SENTENCE_STARTERS = [].freeze
       end
+      class BetweenPunctuation < PragmaticSegmenter::BetweenPunctuation
+        BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX = /《(?>[^》\\]+|\\{2}|\\.)*》/
+        BETWEEN_L_BRACKET_REGEX = /「(?>[^」\\]+|\\{2}|\\.)*」/
+        private
+        def sub_punctuation_between_quotes_and_parens(txt)
+          super
+          sub_punctuation_between_double_angled_quotation_marks(txt)
+          sub_punctuation_between_l_bracket(txt)
+        end
+        def sub_punctuation_between_double_angled_quotation_marks(txt)
+          PunctuationReplacer.new(
+            matches_array: txt.scan(BETWEEN_DOUBLE_ANGLE_QUOTATION_MARK_REGEX),
+            text: txt
+          ).replace
+        end
+        def sub_punctuation_between_l_bracket(txt)
+          PunctuationReplacer.new(
+            matches_array: txt.scan(BETWEEN_L_BRACKET_REGEX),
+            text: txt
+          ).replace
+        end
+      end
     end
   end
 end

data/lib/pragmatic_segmenter/languages/common.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module PragmaticSegmenter
       # Defines the abbreviations for each language (if available)
       module Abbreviation
-        ABBREVIATIONS = Set.new(['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']).freeze
+        ABBREVIATIONS = Set.new(['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']).freeze
         PREPOSITIVE_ABBREVIATIONS = Set.new(['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']).freeze
         NUMBER_ABBREVIATIONS = Set.new(['art', 'ext', 'no', 'nos', 'p', 'pp']).freeze
       end

data/lib/pragmatic_segmenter/languages/common/numbers.rb CHANGED Viewed

@@ -76,10 +76,10 @@ module PragmaticSegmenter
       # replaces the periods.
       module SingleLetterAbbreviationRules
         # Rubular: http://rubular.com/r/e3H6kwnr6H
-        SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=\s)/, '∯')
+        SingleUpperCaseLetterAtStartOfLineRule = Rule.new(/(?<=^[A-Z])\.(?=,?\s)/, '∯')
         # Rubular: http://rubular.com/r/gitvf0YWH4
-        SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=\s)/, '∯')
+        SingleUpperCaseLetterRule = Rule.new(/(?<=\s[A-Z])\.(?=,?\s)/, '∯')
         All = [
           SingleUpperCaseLetterAtStartOfLineRule,

data/lib/pragmatic_segmenter/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module PragmaticSegmenter
-  VERSION = "0.3.19"
+  VERSION = "0.3.20"
 end

data/spec/pragmatic_segmenter/languages/chinese_spec.rb CHANGED Viewed

@@ -7,5 +7,10 @@ RSpec.describe PragmaticSegmenter::Languages::Chinese, '(zh)' do
       ps = PragmaticSegmenter::Segmenter.new(text: "安永已聯繫周怡安親屬，協助辦理簽證相關事宜，周怡安家屬1月1日晚間搭乘東方航空班機抵達上海，他們步入入境大廳時神情落寞、不發一語。周怡安來自台中，去年剛從元智大學畢業，同年9月加入安永。", language: 'zh')
       expect(ps.segment).to eq(["安永已聯繫周怡安親屬，協助辦理簽證相關事宜，周怡安家屬1月1日晚間搭乘東方航空班機抵達上海，他們步入入境大廳時神情落寞、不發一語。", "周怡安來自台中，去年剛從元智大學畢業，同年9月加入安永。"])
     end
+    it 'correctly segments text #002' do
+      ps = PragmaticSegmenter::Segmenter.new(text: "我们明天一起去看《摔跤吧！爸爸》好吗？好！", language: 'zh')
+      expect(ps.segment).to eq(["我们明天一起去看《摔跤吧！爸爸》好吗？", "好！"])
+    end
   end
 end

data/spec/pragmatic_segmenter/languages/english_spec.rb CHANGED Viewed

@@ -1411,5 +1411,20 @@ RSpec.describe PragmaticSegmenter::Languages::English, "(en)" do
       ps = PragmaticSegmenter::Segmenter.new(text: text, clean: false)
       expect(ps.segment).to eq(["The parties to this Agreement are PragmaticSegmenterExampleCompanyA Inc. (“Company A”), and PragmaticSegmenterExampleCompanyB Inc. (“Company B”)."])
     end
+    it 'correctly segments text #119' do
+      ps = PragmaticSegmenter::Segmenter.new(text: "Unlike the abbreviations i.e. and e.g., viz. is used to indicate a detailed description of something stated before.")
+      expect(ps.segment).to eq(["Unlike the abbreviations i.e. and e.g., viz. is used to indicate a detailed description of something stated before."])
+    end
+    it 'correctly segments text #120' do
+      ps = PragmaticSegmenter::Segmenter.new(text: "For example, ‘dragonswort… is said that it should be grown in dragon’s blood. It grows at the tops of mountains where there are groves of trees, chiefly in holy places and in the country that is called Apulia’ (translated by Anne Van Arsdall, in Medieval Herbal Remedies: The Old English Herbarium and Anglo-Saxon Medicine p. 154). The Herbal also includes lore about other plants, such as the mandrake.")
+      expect(ps.segment).to eq(["For example, ‘dragonswort… is said that it should be grown in dragon’s blood. It grows at the tops of mountains where there are groves of trees, chiefly in holy places and in the country that is called Apulia’ (translated by Anne Van Arsdall, in Medieval Herbal Remedies: The Old English Herbarium and Anglo-Saxon Medicine p. 154).", "The Herbal also includes lore about other plants, such as the mandrake."])
+    end
+    it 'correctly segments text #121' do
+      ps = PragmaticSegmenter::Segmenter.new(text: "Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia. Modeling Open Source Software Communities, ProQuest Dissertations and Theses.")
+      expect(ps.segment).to eq(["Here’s the - ahem - official citation: Baker, C., Anderson, Kenneth, Martin, James, & Palen, Leysia.", "Modeling Open Source Software Communities, ProQuest Dissertations and Theses."])
+    end
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_segmenter
 version: !ruby/object:Gem::Version
-  version: 0.3.19
+  version: 0.3.20
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-07-19 00:00:00.000000000 Z
+date: 2018-08-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: unicode