RubyGems - llt-segmenter - Versions diffs - 0.0.3 → 0.0.4 - Mend

llt-segmenter 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

checksums.yaml +4 -4
data/lib/llt/segmenter/version.rb +1 -1
data/lib/llt/segmenter.rb +64 -2
data/spec/lib/llt/segmenter_spec.rb +31 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 7fbc7cac188b6e8e63674495586e2bd89511f18a
-  data.tar.gz: 3766f1b4346d1fdaae057f6e75efaf78e4d1d318
+  metadata.gz: 201cbc98ca192041429641ef0e3c0fcf128a3654
+  data.tar.gz: 1f61cb110d27d4b60427f16b1c7eb58576c67eb0
 SHA512:
-  metadata.gz: dd674f244cb2b773fa3431fa88b6a9861e16d01e2e6b58d428f42f8d41f4c2117bca582481e9f28cce5a9166dd2715dd6d8279c048a322ffdfec92201fe9a097
-  data.tar.gz: 6adf68ff06205ce2c21b035b4a5a07cdad2f718296383a2de3ad3b194f22d8caee9b628d0ff3b8527ab49653833c78c0ce39170cbac3296c2b7ffaac5799feff
+  metadata.gz: 76017f1fc143d0d6d190b341218c3272f71ad230a1c81496ab9813a94a285262f3c15e73ef73570fc8a5b5d07ceb726b7e2ad9e046f9c54a8d61576017c0aac8
+  data.tar.gz: 0915e76fac1f68d23b9454c010fc5820b021457ff4b53cf8bb0bf98a05b73cffc21b888a0169d14d331a571812c5027e26e7dd56a07dc6936be7538160dff099

data/lib/llt/segmenter/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module LLT
   class Segmenter
-    VERSION = "0.0.3"
+    VERSION = "0.0.4"
   end
 end

data/lib/llt/segmenter.rb CHANGED Viewed

@@ -24,17 +24,21 @@ module LLT
     # so we have to change things as long as this is not fixed.
     #
     # (?<=\s|^) can be just \b in MRI 2.0 and upwards
-    AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
+    #
+    # Added > to the regex on Feb 11 2014 to treat a closing chevron as a kind
+    # of word boundary.
+    AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^|>)#{abbr}" }.join('|')
     # the xml escaped characters cannot be refactored to something along
     # &(?:amp|quot); - it's an invalid pattern in the look-behind
     SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[\?!:]|((?<!&amp|&quot|&apos|&lt|&gt);)/
     DIRECT_SPEECH_DELIMITER = /['"”]|&(?:apos|quot);/
-    TRAILERS = /\)|<\/.*?>/
+    TRAILERS = /\)|\s*<\/.*?>/
     def segment(string, add_to: nil, **options)
       setup(options)
       # dump whitespace at the beginning and end!
       string.strip!
+      string = normalize_whitespace(string)
       sentences = scan_through_string(StringScanner.new(string))
       add_to << sentences if add_to.respond_to?(:<<)
       sentences
@@ -51,6 +55,64 @@ module LLT
       @sentence_closer = Regexp.union(SENTENCE_CLOSER, /\n{#{nl_boundary}}/)
     end
+    # Used to normalize wonky whitespace in front of or behind direct speech
+    # delimiters like " (currently the only one supported).
+    def normalize_whitespace(string)
+      # in most cases there is nothing to do, then leave immediately
+      return string unless string.match(/\s"\s/)
+      scanner = StringScanner.new(string)
+      reset_direct_speech_status
+      string_with_normalized_whitespace(scanner)
+    end
+    def string_with_normalized_whitespace(scanner)
+      new_string = ''
+      until scanner.eos?
+        if match = scanner.scan_until(/"/)
+          new_string << normalized_match(scanner, match)
+          toggle_direct_speech_status
+        else
+          new_string << scanner.rest
+          break
+        end
+      end
+      new_string
+    end
+    def surrounded_by_whitespace?(scanner)
+      pos_before = scanner.pre_match[-1]
+      pos_behind = scanner.post_match[0]
+      pos_before == ' ' && (pos_behind == ' ' || pos_behind == nil) # end of string
+    end
+    def normalized_match(scanner, match)
+      if surrounded_by_whitespace?(scanner)
+        if direct_speech_open?
+          # eliminate the whitespace in front of "
+          match[0..-3] << '"'
+        else
+          # hop over the whitespace behind "
+          scanner.pos = scanner.pos + 1
+          match
+        end
+      else
+        match
+      end
+    end
+    def direct_speech_open?
+      @direct_speech
+    end
+    def reset_direct_speech_status
+      @direct_speech = false
+    end
+    def toggle_direct_speech_status
+      @direct_speech = (@direct_speech ? false : true)
+    end
     def scan_through_string(scanner, sentences = [])
       while scanner.rest?
         sentence = scan_until_next_sentence(scanner, sentences)

data/spec/lib/llt/segmenter_spec.rb CHANGED Viewed

@@ -121,6 +121,25 @@ describe LLT::Segmenter do
         sentences.should have(2).items
         sentences[1].to_s.should == 'text 2.'
       end
+      it "doesn't break when a random newline leads the last tag" do
+        txt = "<grc> text.\n</grc>"
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(1).item
+      end
+      it "handles abbreviation of Marcus (M.) at the beginning of a new paragraph" do
+        txt = "<p>qui facere poterat.</p>\n<p>\n<milestone/>\nM. Cicero inter Catilinas detestatur!"
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(2).items
+      end
+      it "treats an xml tag like a word boundary" do
+        # M. would not be recognized as abbreviation otherwise
+        txt = "<p>M. Cicero est.</p>"
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(1).item
+      end
     end
     context "with xml escaped characters" do
@@ -288,6 +307,18 @@ describe LLT::Segmenter do
       end
     end
+    context "with badly whitespaced direct speech delimiters" do
+      it "normalizes whitespace and knows to which sentence a \" belongs" do
+        txt = '"Marcus est. " Cicero est. " Iulius est. "'
+        sentences = segmenter.segment(txt)
+        #sentences.should have(3).items
+        sentences.map!(&:to_s)
+        sentences[0].should == '"Marcus est."'
+        sentences[1].should == 'Cicero est.'
+        sentences[2].should == '"Iulius est."'
+      end
+    end
     describe "takes an optional keyword argument add_to" do
       class ParagraphDummy
         attr_reader :sentences

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llt-segmenter
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.0.4
 platform: ruby
 authors:
 - Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-02-05 00:00:00.000000000 Z
+date: 2014-02-11 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler