RubyGems - llt-segmenter - Versions diffs - 0.0.2 → 0.0.3 - Mend

llt-segmenter 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

checksums.yaml +4 -4
data/.travis.yml +1 -1
data/README.md +0 -10
data/lib/llt/segmenter/version.rb +1 -1
data/lib/llt/segmenter.rb +34 -5
data/spec/lib/llt/segmenter_spec.rb +53 -1
metadata +3 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 281e390815b02c98e23d91569c7fa99af1c7c0c4
-  data.tar.gz: fae604f9cd60978890c7bd2d7c87133e98d4b666
+  metadata.gz: 7fbc7cac188b6e8e63674495586e2bd89511f18a
+  data.tar.gz: 3766f1b4346d1fdaae057f6e75efaf78e4d1d318
 SHA512:
-  metadata.gz: a4273667e0f61109795552b9ab71c61fb5b9f5ec92497ce44c6ac471ed1d15babef116213e92851f9f270a60775329ae0308113deaba93c6b1022c03c188e0a9
-  data.tar.gz: d3d2c7720f9cb445f45f2b679b8bde1e487bf348fe80a3172f1239078e3caa9317ddcfd6ca7f1aa4e61c1a2dbf9cd0c3defe0a1295d203337c6ebcc3e1ea82c2
+  metadata.gz: dd674f244cb2b773fa3431fa88b6a9861e16d01e2e6b58d428f42f8d41f4c2117bca582481e9f28cce5a9166dd2715dd6d8279c048a322ffdfec92201fe9a097
+  data.tar.gz: 6adf68ff06205ce2c21b035b4a5a07cdad2f718296383a2de3ad3b194f22d8caee9b628d0ff3b8527ab49653833c78c0ce39170cbac3296c2b7ffaac5799feff

data/.travis.yml CHANGED Viewed

@@ -3,4 +3,4 @@ before_script:
   - "export JRUBY_OPTS=--2.0"
 rvm:
   - 2.0.0
-  - jruby-20mode
+  - jruby-1.7.8

data/README.md CHANGED Viewed

@@ -26,16 +26,6 @@ Or install it yourself as:
 TODO: Write usage instructions here
-## API
-This currently is a list of requirements and will transform into an API documentation.
-Input:
-- Text or (URI)
-- Black-/Whitelist for separators.
-Output:
-- XML (TEI) or JSON
 ## Contributing
 1. Fork it

data/lib/llt/segmenter/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 module LLT
   class Segmenter
-    VERSION = "0.0.2"
+    VERSION = "0.0.3"
   end
 end

data/lib/llt/segmenter.rb CHANGED Viewed

@@ -13,7 +13,8 @@ module LLT
     def self.default_options
       {
         indexing: true,
-        newline_boundary: 2
+        newline_boundary: 2,
+        xml: false
       }
     end
@@ -24,8 +25,10 @@ module LLT
     #
     # (?<=\s|^) can be just \b in MRI 2.0 and upwards
     AWB = ALL_ABBRS_PIPED.split('|').map { |abbr| "(?<=\\s|^)#{abbr}" }.join('|')
-    SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[;\?!:]/
-    DIRECT_SPEECH_DELIMITER = /['"”]/
+    # the xml escaped characters cannot be refactored to something along
+    # &(?:amp|quot); - it's an invalid pattern in the look-behind
+    SENTENCE_CLOSER = /(?<!#{AWB})\.(?!\.)|[\?!:]|((?<!&amp|&quot|&apos|&lt|&gt);)/
+    DIRECT_SPEECH_DELIMITER = /['"”]|&(?:apos|quot);/
     TRAILERS = /\)|<\/.*?>/
     def segment(string, add_to: nil, **options)
@@ -40,6 +43,7 @@ module LLT
     private
     def setup(options)
+      @xml = parse_option(:xml, options)
       @indexing = parse_option(:indexing, options)
       @id = 0 if @indexing
@@ -49,8 +53,9 @@ module LLT
     def scan_through_string(scanner, sentences = [])
       while scanner.rest?
-        sentence = scanner.scan_until(@sentence_closer) ||
-          rescue_no_delimiters(sentences, scanner)
+        sentence = scan_until_next_sentence(scanner, sentences)
+        rebuild_xml_tags(scanner, sentence, sentences) if @xml
         sentence << trailing_delimiters(scanner)
         sentence.strip!
@@ -63,12 +68,36 @@ module LLT
       sentences
     end
+    def scan_until_next_sentence(scanner, sentences)
+      scanner.scan_until(@sentence_closer) ||
+        rescue_no_delimiters(sentences, scanner)
+    end
     def id
       if @indexing
         @id += 1
       end
     end
+    # this is only needed when there is punctuation inside of xml tags
+    def rebuild_xml_tags(scanner, sentence, sentences)
+      if has_open_chevron?(sentence)
+        sentence << scanner.scan_until(/>/)
+        if inside_a_running_sentence?(sentence)
+          sentence << scan_until_next_sentence(scanner, sentences)
+        end
+        rebuild_xml_tags(scanner, sentence, sentences)
+      end
+    end
+    def has_open_chevron?(sentence)
+      sentence.count('<') > sentence.count('>')
+    end
+    def inside_a_running_sentence?(sentence)
+      ! sentence.match(/#{@sentence_closer}\s*<.*?>$/)
+    end
     def rescue_no_delimiters(sentences, scanner)
       if sentences.any?
         # broken off texts

data/spec/lib/llt/segmenter_spec.rb CHANGED Viewed

@@ -89,9 +89,61 @@ describe LLT::Segmenter do
     context "with embedded xml" do
       it "doesn't break up before xml closing tags" do
         txt = '<grc> text.</grc>'
-        sentences = segmenter.segment(txt)
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(1).item
+      end
+      it "doesn't break with punctuation in element names I" do
+        txt = '<grc.test>text.</grc.test>'
+        sentences = segmenter.segment(txt, xml: true)
         sentences.should have(1).item
       end
+      it "doesn't break with punctuation in element names II" do
+        txt = '<grc.test>text.</grc.test> text 2.'
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(2).items
+        sentences[0].to_s.should == '<grc.test>text.</grc.test>'
+        sentences[1].to_s.should == 'text 2.'
+      end
+      it "doesn't break with punctuation in element names III" do
+        txt = '<grc.test>text</grc.test> resumed. text 2.'
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(2).items
+        sentences[0].to_s.should == '<grc.test>text</grc.test> resumed.'
+        sentences[1].to_s.should == 'text 2.'
+      end
+      it "doesn't break with attribute values containing punctuation" do
+        txt = '<grc no="1.1"> text.</grc> text 2.'
+        sentences = segmenter.segment(txt, xml: true)
+        sentences.should have(2).items
+        sentences[1].to_s.should == 'text 2.'
+      end
+    end
+    context "with xml escaped characters" do
+      it "doesn't split when it shouldn't" do
+        txt = '&quot;text&quot; resumed. success.'
+        sentences = segmenter.segment(txt)
+        sentences.should have(2).item
+        sentences[1].to_s.should == 'success.'
+      end
+      it "acknowledges &quot; as potentially trailing delimiter" do
+        txt = '&quot;text.&quot; success.'
+        sentences = segmenter.segment(txt)
+        sentences.should have(2).item
+        sentences[1].to_s.should == 'success.'
+      end
+      it "acknowledges &apos; as potentially trailing delimiter" do
+        txt = '&apos;text.&apos; success.'
+        sentences = segmenter.segment(txt)
+        sentences.should have(2).item
+        sentences[1].to_s.should == 'success.'
+      end
     end
     context "newline (\\n) handling" do

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llt-segmenter
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - Gernot Höflechner, Robert Lichstensteiner, Christof Sirk
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-12-09 00:00:00.000000000 Z
+date: 2014-02-05 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -152,7 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.1.5
+rubygems_version: 2.2.0
 signing_key:
 specification_version: 4
 summary: Segments text into sentences