RubyGems - pragmatic_segmenter - Versions diffs - 0.2.0 → 0.3.0 - Mend

pragmatic_segmenter 0.2.0 → 0.3.0

Files changed (12) hide show

checksums.yaml +4 -4
data/README.md +23 -2
data/lib/pragmatic_segmenter/abbreviation.rb +1 -1
data/lib/pragmatic_segmenter/between_punctuation.rb +11 -0
data/lib/pragmatic_segmenter/languages/deutsch.rb +1 -1
data/lib/pragmatic_segmenter/languages/japanese.rb +1 -1
data/lib/pragmatic_segmenter/list.rb +12 -1
data/lib/pragmatic_segmenter/process.rb +15 -3
data/lib/pragmatic_segmenter/rules.rb +3 -2
data/lib/pragmatic_segmenter/version.rb +1 -1
data/spec/pragmatic_segmenter_spec.rb +30 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 1ea9b87e9654b097486aa24301ec3da33c8b3b79
-  data.tar.gz: 0db4660da0b5ebd3a1536d2ad29ddc023e09a81d
+  metadata.gz: 1e6be4ac5dd27c491dbbed17b572b6503adb2707
+  data.tar.gz: 63d4c42adb13a83e5aa99702a7d7be0a162de0ad
 SHA512:
-  metadata.gz: 5737824cefe9a0c378e540857b0fc7122c7e89e738f1cb05722e7103346c8683be532b2c42ab21e199624d4f21f9a681511cb383cced542bfad30287d1c8893e
-  data.tar.gz: 0bfa44c11c70dc1af5b87b55499b6fb2ae306e541e5253335db45fbbc060316d2fa0c78477c36cf574e21ded594f2ccab9e0a6693154816217079df5d495ad16
+  metadata.gz: c21a055c652ffee7f819b79dbf49da0aeb2a068c16b14edbf995ba30f545824c359f61066490b8b5d15ef356df21edc890ea8cfc831fc246eeb293b038868250
+  data.tar.gz: 985d1121bc725a65d5b3ace00e1f03f1efa1e9e8edac9ae8657a9b4cb6c736720ed000fa825a3ab9b30fce928cd25f689c6068e3292df1c65c7f3b808fd103a2

data/README.md CHANGED Viewed

@@ -637,6 +637,20 @@ Hola Srta. Ledesma. Buenos días, soy el Lic. Naser Pastoriza, y él es mi padre
 => ["کیا حال ہے؟", "ميرا نام ___ ەے۔", "میں حالا تاوان دےدوں؟"]
 ```
+#### Golden Rules (Dutch)
+1.) **Sentence starting with a number**
+```
+Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen. 81 procent van de schoten was raak.
+=> ["Hij schoot op de JP8-brandstof toen de Surface-to-Air (sam)-missiles op hem af kwamen.", "81 procent van de schoten was raak."]
+```
+2.) **Sentence starting with an ellipsis**
+```
+81 procent van de schoten was raak. ...en toen barste de hel los.
+=> ["81 procent van de schoten was raak.", "...en toen barste de hel los."]
+```
 ## Comparison of Segmentation Tools, Libraries and Algorithms
 Name                                                                 | Programming Language | License                                             | GRS (English) | GRS (Other Languages)† | Speed‡
@@ -657,11 +671,12 @@ Other tools not yet tested:
 * [FreeLing](http://nlp.lsi.upc.edu/freeling/)
 * [Alpino](http://www.let.rug.nl/vannoord/alp/Alpino/)
 * [trtok](https://github.com/jirkamarsik/trainable-tokenizer)
-* [segtok](https://pypi.python.org/pypi/segtok/1.1.0)
+* [segtok](https://github.com/fnl/segtok)
 * [LingPipe](http://alias-i.com/lingpipe/demos/tutorial/sentences/read-me.html)
 * [Elephant](http://gmb.let.rug.nl/elephant/experiments.php)
 * [Ucto: Unicode Tokenizer](http://ilk.uvt.nl/ucto/)
 * [tokenizer](http://moin.delph-in.net/WeSearch/DocumentParsing)
+* [spaCy](http://honnibal.github.io/spaCy/)
 ## Speed Performance Benchmarks
@@ -779,11 +794,17 @@ To test the relative performance of different segmentation tools and libraries I
 * Fix bug in splitting new sentence after single quotes
 **Version 0.2.0**
-* Add Dutch Golden rules and abbreviations
+* Add Dutch Golden Rules and abbreviations
 * Update README with additional tools
 * Update segmentation test scores in README with results of new Golden Rule tests
 * Add Polish abbreviations
+**Version 0.3.0**
+* Add support for square brackets
+* Add support for continuous exclamation points or question marks or combinations of both
+* Fix Roman numeral support
+* Add English abbreviations
 ## Contributing
 If you find a text that is incorrectly segmented using this gem, please submit an issue.

data/lib/pragmatic_segmenter/abbreviation.rb CHANGED Viewed

@@ -3,7 +3,7 @@
 module PragmaticSegmenter
   # Defines the abbreviations for each language (if available)
   class Abbreviation
-    ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
+    ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk']
     PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs']
     NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']

data/lib/pragmatic_segmenter/between_punctuation.rb CHANGED Viewed

@@ -17,6 +17,9 @@ module PragmaticSegmenter
     # Rubular: http://rubular.com/r/JbAIpKdlSq
     BETWEEN_QUOTE_SLANTED_REGEX = /“(?>[^”\\]+|\\{2}|\\.)*”/
+    # Rubular: http://rubular.com/r/WX4AvnZvlX
+    BETWEEN_SQUARE_BRACKETS_REGEX = /\[(?>[^\]\\]+|\\{2}|\\.)*\]/
     # Rubular: http://rubular.com/r/6tTityPflI
     BETWEEN_PARENS_REGEX = /\((?>[^\(\)\\]+|\\{2}|\\.)*\)/
@@ -34,6 +37,7 @@ module PragmaticSegmenter
     def sub_punctuation_between_quotes_and_parens(txt)
       sub_punctuation_between_single_quotes(txt)
       sub_punctuation_between_double_quotes(txt)
+      sub_punctuation_between_square_brackets(txt)
       sub_punctuation_between_parens(txt)
       sub_punctuation_between_quotes_arrow(txt)
       sub_punctuation_between_quotes_slanted(txt)
@@ -46,6 +50,13 @@ module PragmaticSegmenter
       ).replace
     end
+    def sub_punctuation_between_square_brackets(txt)
+      PragmaticSegmenter::PunctuationReplacer.new(
+        matches_array: txt.scan(BETWEEN_SQUARE_BRACKETS_REGEX),
+        text: txt
+      ).replace
+    end
     def sub_punctuation_between_single_quotes(txt)
       PragmaticSegmenter::PunctuationReplacer.new(
         matches_array: txt.scan(BETWEEN_SINGLE_QUOTES_REGEX),

data/lib/pragmatic_segmenter/languages/deutsch.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module PragmaticSegmenter
       class Process < PragmaticSegmenter::Process
         private
-        def between_punctutation(txt)
+        def between_punctuation(txt)
           PragmaticSegmenter::Languages::Deutsch::BetweenPunctuation.new(text: txt).replace
         end

data/lib/pragmatic_segmenter/languages/japanese.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module PragmaticSegmenter
       class Process < PragmaticSegmenter::Process
         private
-        def between_punctutation(txt)
+        def between_punctuation(txt)
           PragmaticSegmenter::Languages::Japanese::BetweenPunctuation.new(text: txt).replace
         end
       end

data/lib/pragmatic_segmenter/list.rb CHANGED Viewed

@@ -4,6 +4,7 @@ module PragmaticSegmenter
   # This class searches for a list within a string and adds
   # newlines before each list item.
   class List
+    ROMAN_NUMERALS = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv x xi xii xiii xv xvi xvii xviii xix xx)
     # Rubular: http://rubular.com/r/XcpaJKH0sz
     ALPHABETICAL_LIST_WITH_PERIODS =
       /(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)/
@@ -50,6 +51,16 @@ module PragmaticSegmenter
       format_numbered_list_with_parens(formatted_text)
     end
+    def replace_parens
+      ROMAN_NUMERALS.each do |rm|
+        next unless text =~ /\(#{Regexp.escape(rm)}\)\s[A-Z]/
+        text.gsub!(/\(#{Regexp.escape(rm)}\)(?=\s[A-Z])/) do |match|
+          match.gsub!(/\(/, '&✂&').gsub!(/\)/, '&⌬&')
+        end
+      end
+      text
+    end
     private
     def format_numbered_list_with_parens(txt)
@@ -171,7 +182,7 @@ module PragmaticSegmenter
     def iterate_alphabet_array(regex, parens, txt, roman_numeral)
       list_array = txt.scan(regex).map(&:downcase)
       if roman_numeral
-        alphabet = %w(i ii iii iv v vi vii viii ix x xi xii xiii xiv x xi xii xiii xv xvi xvii xviii xix xx)
+        alphabet = ROMAN_NUMERALS
       else
         alphabet = ('a'..'z').to_a
       end

data/lib/pragmatic_segmenter/process.rb CHANGED Viewed

@@ -25,6 +25,9 @@ module PragmaticSegmenter
     # Rubular: http://rubular.com/r/JMjlZHAT4g
     SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = /(?<=[!?\.-][\"\'\u{201d}\u{201c}])\s{1}(?=[A-Z])/
+    # Rubular: http://rubular.com/r/mQ8Es9bxtk
+    CONTINUOUS_PUNCTUATION_REGEX = /(?<=\S)(!|\?){3,}(?=(\s|\z|$))/
     attr_reader :text, :doc_type
     def initialize(text:, doc_type:)
       @text = text
@@ -35,6 +38,7 @@ module PragmaticSegmenter
       reformatted_text = PragmaticSegmenter::List.new(text: text).add_line_break
       reformatted_text = replace_abbreviations(reformatted_text)
       reformatted_text = replace_numbers(reformatted_text)
+      reformatted_text = replace_continuous_punctuation(reformatted_text)
       reformatted_text.apply(AbbreviationsWithMultiplePeriodsAndEmailRule)
       reformatted_text.apply(GeoLocationRule)
       split_into_segments(reformatted_text)
@@ -69,6 +73,13 @@ module PragmaticSegmenter
       end
     end
+    def replace_continuous_punctuation(txt)
+      return txt unless txt =~ CONTINUOUS_PUNCTUATION_REGEX
+      txt.gsub!(CONTINUOUS_PUNCTUATION_REGEX) do |match|
+        match.gsub!(/!/, '&ᓴ&').gsub!(/\?/, '&ᓷ&')
+      end
+    end
     def consecutive_underscore?(txt)
       # Rubular: http://rubular.com/r/fTF2Ff3WBL
       txt.gsub(/_{3,}/, '').length.eql?(0)
@@ -85,12 +96,13 @@ module PragmaticSegmenter
     def process_text(txt)
       txt << 'ȸ' unless punctuation_array.any? { |p| txt[-1].include?(p) }
       PragmaticSegmenter::ExclamationWords.apply_rules(txt)
-      between_punctutation(txt)
+      between_punctuation(txt)
       txt = txt.apply(
-        DoublePuctationRules::All,
+        DoublePunctuationRules::All,
         QuestionMarkInQuotationRule,
         ExclamationPointRules::All
       )
+      txt = PragmaticSegmenter::List.new(text: txt).replace_parens
       sentence_boundary_punctuation(txt)
     end
@@ -106,7 +118,7 @@ module PragmaticSegmenter
       @punct_arr ||= PragmaticSegmenter::Punctuation.new.punct
     end
-    def between_punctutation(txt)
+    def between_punctuation(txt)
       PragmaticSegmenter::BetweenPunctuation.new(text: txt).replace
     end

data/lib/pragmatic_segmenter/rules.rb CHANGED Viewed

@@ -28,7 +28,7 @@ module PragmaticSegmenter
       All = [ InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule ]
     end
-    module DoublePuctationRules
+    module DoublePunctuationRules
       FirstRule = Rule.new(/\?!/, '☉')
       SecondRule = Rule.new(/!\?/, '☈')
       ThirdRule = Rule.new(/\?\?/, '☇')
@@ -64,6 +64,7 @@ module PragmaticSegmenter
       MixedDoubleEQ = Rule.new(/☈/, '!?')
       MixedDoubleEE = Rule.new(/☄/, '!!')
       LeftParens = Rule.new(/&✂&/, '(')
+      RightParens = Rule.new(/&⌬&/, ')')
       TemporaryEndingPunctutation = Rule.new('ȸ', '')
       Newline = Rule.new(/ȹ/, "\n")
@@ -74,7 +75,7 @@ module PragmaticSegmenter
               FullWidthQuestionMark, MixedDoubleQE,
               MixedDoubleQQ, MixedDoubleEQ,
               MixedDoubleEE, LeftParens,
-              TemporaryEndingPunctutation,
+              RightParens, TemporaryEndingPunctutation,
               Newline ]
     end

data/lib/pragmatic_segmenter/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module PragmaticSegmenter
-  VERSION = "0.2.0"
+  VERSION = "0.3.0"
 end

data/spec/pragmatic_segmenter_spec.rb CHANGED Viewed

@@ -1515,6 +1515,36 @@ RSpec.describe PragmaticSegmenter::Segmenter do
         ps = PragmaticSegmenter::Segmenter.new(text: "Hello. 'This is a test of single quotes.' A new sentence.")
         expect(ps.segment).to eq(["Hello.", "'This is a test of single quotes.'", "A new sentence."])
       end
+      it "correctly segments text #099" do
+        ps = PragmaticSegmenter::Segmenter.new(text: "[A sentence in square brackets.]")
+        expect(ps.segment).to eq(["[A sentence in square brackets.]"])
+      end
+      it "correctly segments text #100" do
+        ps = PragmaticSegmenter::Segmenter.new(text: "(iii) List item number 3.")
+        expect(ps.segment).to eq(["(iii) List item number 3."])
+      end
+      it "correctly segments text #101" do
+        ps = PragmaticSegmenter::Segmenter.new(text: "Unbelievable??!?!")
+        expect(ps.segment).to eq(["Unbelievable??!?!"])
+      end
+      it "correctly segments text #102" do
+        ps = PragmaticSegmenter::Segmenter.new(text: "This abbreviation f.e. means for example.")
+        expect(ps.segment).to eq(["This abbreviation f.e. means for example."])
+      end
+      it "correctly segments text #103" do
+        ps = PragmaticSegmenter::Segmenter.new(text: "The med. staff here is very kind.")
+        expect(ps.segment).to eq(["The med. staff here is very kind."])
+      end
+      it "correctly segments text #104" do
+        ps = PragmaticSegmenter::Segmenter.new(text: "What did you order btw., she wondered.")
+        expect(ps.segment).to eq(["What did you order btw., she wondered."])
+      end
     end
   end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: pragmatic_segmenter
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Kevin S. Dias
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-01-26 00:00:00.000000000 Z
+date: 2015-02-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler