RubyGems - llt-segmenter - Versions diffs - 0.0.2 → 0.0.3 - Mend

llt-segmenter 0.0.2 → 0.0.3

Files changed (7) hide show

checksums.yaml +4 -4
data/.travis.yml +1 -1
data/README.md +0 -10
data/lib/llt/segmenter/version.rb +1 -1
data/lib/llt/segmenter.rb +34 -5
data/spec/lib/llt/segmenter_spec.rb +53 -1
metadata +3 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 281e390815b02c98e23d91569c7fa99af1c7c0c4
-  data.tar.gz: fae604f9cd60978890c7bd2d7c87133e98d4b666
+  metadata.gz: 7fbc7cac188b6e8e63674495586e2bd89511f18a
+  data.tar.gz: 3766f1b4346d1fdaae057f6e75efaf78e4d1d318
 SHA512:
-  metadata.gz: a4273667e0f61109795552b9ab71c61fb5b9f5ec92497ce44c6ac471ed1d15babef116213e92851f9f270a60775329ae0308113deaba93c6b1022c03c188e0a9
-  data.tar.gz: d3d2c7720f9cb445f45f2b679b8bde1e487bf348fe80a3172f1239078e3caa9317ddcfd6ca7f1aa4e61c1a2dbf9cd0c3defe0a1295d203337c6ebcc3e1ea82c2
+  metadata.gz: dd674f244cb2b773fa3431fa88b6a9861e16d01e2e6b58d428f42f8d41f4c2117bca582481e9f28cce5a9166dd2715dd6d8279c048a322ffdfec92201fe9a097
+  data.tar.gz: 6adf68ff06205ce2c21b035b4a5a07cdad2f718296383a2de3ad3b194f22d8caee9b628d0ff3b8527ab49653833c78c0ce39170cbac3296c2b7ffaac5799feff

data/.travis.yml CHANGED Viewed

@@ -3,4 +3,4 @@ before_script:
   - "export JRUBY_OPTS=--2.0"
 rvm:
   - 2.0.0
-  - jruby-20mode
+  - jruby-1.7.8

data/README.md CHANGED Viewed

@@ -26,16 +26,6 @@ Or install it yourself as:
 TODO: Write usage instructions here
-## API
-This currently is a list of requirements and will transform into an API documentation.
-Input:
-- Text or (URI)
-- Black-/Whitelist for separators.
-Output:
-- XML (TEI) or JSON
 ## Contributing
 1. Fork it

data/lib/llt/segmenter/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module LLT
   class Segmenter
-    VERSION = "0.0.2"
+    VERSION = "0.0.3"
   end
 end

data/lib/llt/segmenter.rb CHANGED Viewed

@@ -13,7 +13,8 @@ module LLT
     def self.default_options
       {
         indexing: true,
-        newline_boundary: 2
+        newline_boundary: 2,
+        xml: false
       }
     end
@@ -24,8 +25,10 @@ module LLT
     #
     # (?<=\s|^) can be just \b in MRI 2.0 and upwards
     AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
-    SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[;\?!:]/
-    DIRECT_SPEECH_DELIMITER = /['"”]/
+    # the xml escaped characters cannot be refactored to something along
+    # &(?:amp|quot); - it's an invalid pattern in the look-behind
+    SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[\?!:]|((?<!&amp|&quot|&apos|&lt|&gt);)/
+    DIRECT_SPEECH_DELIMITER = /['"”]|&(?:apos|quot);/
     TRAILERS = /\)|<\/.*?>/
     def segment(string, add_to: nil, **options)
@@ -40,6 +43,7 @@ module LLT
     private
     def setup(options)
+      @xml = parse_option(:xml, options)
       @indexing = parse_option(:indexing, options)
       @id = 0 if @indexing
@@ -49,8 +53,9 @@ module LLT
     def scan_through_string(scanner, sentences = [])
       while scanner.rest?
-        sentence = scanner.scan_until(@sentence_closer) ||
-          rescue_no_delimiters(sentences, scanner)
+        sentence = scan_until_next_sentence(scanner, sentences)
+        rebuild_xml_tags(scanner, sentence, sentences) if @xml
         sentence << trailing_delimiters(scanner)
         sentence.strip!
@@ -63,12 +68,36 @@ module LLT
       sentences
     end
+    def scan_until_next_sentence(scanner, sentences)
+      scanner.scan_until(@sentence_closer) ||
+        rescue_no_delimiters(sentences, scanner)
+    end
     def id
       if @indexing
         @id += 1
       end
     end
+    # this is only needed when there is punctuation inside of xml tags
+    def rebuild_xml_tags(scanner, sentence, sentences)
+      if has_open_chevron?(sentence)
+        sentence << scanner.scan_until(/>/)
+        if inside_a_running_sentence?(sentence)
+          sentence << scan_until_next_sentence(scanner, sentences)
+        end
+        rebuild_xml_tags(scanner, sentence, sentences)
+      end
+    end
+    def has_open_chevron?(sentence)
+      sentence.count('<') > sentence.count('>')
+    end
+    def inside_a_running_sentence?(sentence)
+      ! sentence.match(/#{@sentence_closer}\s*<.*?>$/)
+    end
     def rescue_no_delimiters(sentences, scanner)
       if sentences.any?
         # broken off texts

data/spec/lib/llt/segmenter_spec.rb CHANGED Viewed

@@ -89,9 +89,61 @@ describe LLT::Segmenter do
     context "with embedded xml" do
       it "doesn't break up before xml closing tags" do
         txt = '<grc> text.</grc>'
-        sentences = segmenter.segment(txt)
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(1).item
+      end
+      it "doesn't break with punctuation in element names I" do
+        txt = '<grc.test>text.</grc.test>'
+        sentences = segmenter.segment(txt, xml: true)
         sentences.should have(1).item
       end
+      it "doesn't break with punctuation in element names II" do
+        txt = '<grc.test>text.</grc.test> text 2.'
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(2).items
+        sentences[0].to_s.should == '<grc.test>text.</grc.test>'
+        sentences[1].to_s.should == 'text 2.'
+      end
+      it "doesn't break with punctuation in element names III" do
+        txt = '<grc.test>text</grc.test> resumed. text 2.'
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(2).items
+        sentences[0].to_s.should == '<grc.test>text</grc.test> resumed.'
+        sentences[1].to_s.should == 'text 2.'
+      end
+      it "doesn't break with attribute values containing punctuation" do
+        txt = '<grc no="1.1"> text.</grc> text 2.'
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(2).items
+        sentences[1].to_s.should == 'text 2.'
+      end
+    end
+    context "with xml escaped characters" do
+      it "doesn't split when it shouldn't" do
+        txt = '&quot;text&quot; resumed. success.'
+        sentences = segmenter.segment(txt)
+        sentences.should have(2).item
+        sentences[1].to_s.should == 'success.'
+      end
+      it "acknowledges &quot; as potentially trailing delimiter" do
+        txt = '&quot;text.&quot; success.'
+        sentences = segmenter.segment(txt)
+        sentences.should have(2).item
+        sentences[1].to_s.should == 'success.'
+      end
+      it "acknowledges &apos; as potentially trailing delimiter" do
+        txt = '&apos;text.&apos; success.'
+        sentences = segmenter.segment(txt)
+        sentences.should have(2).item
+        sentences[1].to_s.should == 'success.'
+      end
     end
     context "newline (\\n) handling" do

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llt-segmenter
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-12-09 00:00:00.000000000 Z
+date: 2014-02-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -152,7 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.1.5
+rubygems_version: 2.2.0
 signing_key:
 specification_version: 4
 summary: Segments text into sentences