RubyGems - llt-segmenter - Versions diffs - 0.0.3 → 0.0.4 - Mend

llt-segmenter 0.0.3 → 0.0.4

Files changed (5) hide show

checksums.yaml +4 -4
data/lib/llt/segmenter/version.rb +1 -1
data/lib/llt/segmenter.rb +64 -2
data/spec/lib/llt/segmenter_spec.rb +31 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 7fbc7cac188b6e8e63674495586e2bd89511f18a
-  data.tar.gz: 3766f1b4346d1fdaae057f6e75efaf78e4d1d318
+  metadata.gz: 201cbc98ca192041429641ef0e3c0fcf128a3654
+  data.tar.gz: 1f61cb110d27d4b60427f16b1c7eb58576c67eb0
 SHA512:
-  metadata.gz: dd674f244cb2b773fa3431fa88b6a9861e16d01e2e6b58d428f42f8d41f4c2117bca582481e9f28cce5a9166dd2715dd6d8279c048a322ffdfec92201fe9a097
-  data.tar.gz: 6adf68ff06205ce2c21b035b4a5a07cdad2f718296383a2de3ad3b194f22d8caee9b628d0ff3b8527ab49653833c78c0ce39170cbac3296c2b7ffaac5799feff
+  metadata.gz: 76017f1fc143d0d6d190b341218c3272f71ad230a1c81496ab9813a94a285262f3c15e73ef73570fc8a5b5d07ceb726b7e2ad9e046f9c54a8d61576017c0aac8
+  data.tar.gz: 0915e76fac1f68d23b9454c010fc5820b021457ff4b53cf8bb0bf98a05b73cffc21b888a0169d14d331a571812c5027e26e7dd56a07dc6936be7538160dff099

data/lib/llt/segmenter/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module LLT
   class Segmenter
-    VERSION = "0.0.3"
+    VERSION = "0.0.4"
   end
 end

data/lib/llt/segmenter.rb CHANGED Viewed

@@ -24,17 +24,21 @@ module LLT
     # so we have to change things as long as this is not fixed.
     #
     # (?<=\s|^) can be just \b in MRI 2.0 and upwards
-    AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
+    #
+    # Added > to the regex on Feb 11 2014 to treat a closing chevron as a kind
+    # of word boundary.
+    AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^|>)#{abbr}" }.join('|')
     # the xml escaped characters cannot be refactored to something along
     # &(?:amp|quot); - it's an invalid pattern in the look-behind
     SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[\?!:]|((?<!&amp|&quot|&apos|&lt|&gt);)/
     DIRECT_SPEECH_DELIMITER = /['"”]|&(?:apos|quot);/
-    TRAILERS = /\)|<\/.*?>/
+    TRAILERS = /\)|\s*<\/.*?>/
     def segment(string, add_to: nil, **options)
       setup(options)
       # dump whitespace at the beginning and end!
       string.strip!
+      string = normalize_whitespace(string)
       sentences = scan_through_string(StringScanner.new(string))
       add_to << sentences if add_to.respond_to?(:<<)
       sentences
@@ -51,6 +55,64 @@ module LLT
       @sentence_closer = Regexp.union(SENTENCE_CLOSER, /\n{#{nl_boundary}}/)
     end
+    # Used to normalize wonky whitespace in front of or behind direct speech
+    # delimiters like " (currently the only one supported).
+    def normalize_whitespace(string) # returns the given string itself when the guard below finds nothing
+      # in most cases there is nothing to do, then leave immediately
+      return string unless string.match(/\s"\s/)
+      scanner = StringScanner.new(string)
+      reset_direct_speech_status # every call starts with direct speech closed
+      string_with_normalized_whitespace(scanner) # otherwise a newly built string is returned
+    end
+    def string_with_normalized_whitespace(scanner) # rebuilds the scanned string one delimiter at a time
+      new_string = ''
+      until scanner.eos?
+        if match = scanner.scan_until(/"/)
+          new_string << normalized_match(scanner, match) # match: text consumed up to and including the delimiter
+          toggle_direct_speech_status # every delimiter flips between open and closed
+        else
+          new_string << scanner.rest # no delimiter left - the remainder is copied unchanged
+          break
+        end
+      end
+      new_string
+    end
+    def surrounded_by_whitespace?(scanner) # inspects the characters around the scanner's last match
+      pos_before = scanner.pre_match[-1] # last character in front of the match, nil when nothing precedes it
+      pos_behind = scanner.post_match[0] # first character behind the match, nil when nothing follows it
+      pos_before == ' ' && (pos_behind == ' ' || pos_behind == nil) # nil means end of string; only a literal space counts, not any \s
+    end
+    def normalized_match(scanner, match) # returns the text to append for the delimiter just scanned
+      if surrounded_by_whitespace?(scanner)
+        if direct_speech_open?
+          # eliminate the whitespace in front of the closing delimiter: cut the last two characters, then re-append the delimiter
+          match[0..-3] << '"'
+        else
+          # hop over the whitespace behind the opening delimiter
+          scanner.pos = scanner.pos + 1 # NOTE(review): if the delimiter is the last character, pos + 1 exceeds the string length; StringScanner#pos= presumably raises RangeError - confirm and guard
+          match
+        end
+      else
+        match # delimiter is not wrapped in spaces - pass the text through unchanged
+      end
+    end
+    def direct_speech_open? # truthy after an odd number of toggles since the last reset
+      @direct_speech
+    end
+    def reset_direct_speech_status # marks direct speech as closed
+      @direct_speech = false
+    end
+    def toggle_direct_speech_status # flips the flag and returns the new value
+      @direct_speech = (@direct_speech ? false : true) # a nil flag (never reset) becomes true
+    end
     def scan_through_string(scanner, sentences = [])
       while scanner.rest?
         sentence = scan_until_next_sentence(scanner, sentences)

data/spec/lib/llt/segmenter_spec.rb CHANGED Viewed

@@ -121,6 +121,25 @@ describe LLT::Segmenter do
         sentences.should have(2).items
         sentences[1].to_s.should == 'text 2.'
       end
+      it "doesn't break when a random newline leads the last tag" do
+        txt = "<grc> text.\n</grc>"
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(1).item # the newline and closing tag must not form a sentence of their own
+      end
+      it "handles abbreviation of Marcus (M.) at the beginning of a new paragraph" do
+        txt = "<p>qui facere poterat.</p>\n<p>\n<milestone/>\nM. Cicero inter Catilinas detestatur!"
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(2).items # the period of M. must not close a sentence, leaving exactly two
+      end
+      it "treats an xml tag like a word boundary" do
+        # M. would not be recognized as abbreviation otherwise
+        txt = "<p>M. Cicero est.</p>"
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(1).item # a split after M. would yield more than one item
+      end
     end
     context "with xml escaped characters" do
@@ -288,6 +307,18 @@ describe LLT::Segmenter do
       end
     end
+    context "with badly whitespaced direct speech delimiters" do
+      it "normalizes whitespace and knows to which sentence a \" belongs" do
+        txt = '"Marcus est. " Cicero est. " Iulius est. "'
+        sentences = segmenter.segment(txt)
+        #sentences.should have(3).items  NOTE(review): count check is disabled - confirm whether exactly 3 items come back
+        sentences.map!(&:to_s) # compare plain strings from here on
+        sentences[0].should == '"Marcus est."'
+        sentences[1].should == 'Cicero est.'
+        sentences[2].should == '"Iulius est."'
+      end
+    end
     describe "takes an optional keyword argument add_to" do
       class ParagraphDummy
         attr_reader :sentences

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llt-segmenter
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-02-05 00:00:00.000000000 Z
+date: 2014-02-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler