RubyGems - text_sentencer - Versions diffs - 0.2.1 → 1.0.0 - Mend

text_sentencer 0.2.1 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

checksums.yaml +4 -4
data/bin/text_sentencer +13 -16
data/lib/text_sentencer/string_scan_offset.rb +9 -0
data/lib/text_sentencer/text_sentencer.rb +43 -48
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 7b479744fc24ea87a1629ab886c39a16900d7ba8
-  data.tar.gz: 82a7d16ab9aff0a44fabfbc1b57a6367d9c7a979
+  metadata.gz: 0be05e5130a03bd0c189b112d1d26c610844a4c9
+  data.tar.gz: 16ea016fb63066dd617c7b4ac39887bbacba40af
 SHA512:
-  metadata.gz: 1573ea7f89f0a96d37de04724792c18f0f34f646f0e6fc005605c521903fb49cf6890dd9826fcf6e1a229ace0a8369fb2c36a61b0fe5444874d06fc1323a36b2
-  data.tar.gz: 7154af7ce5f9ce392b926b448f82e0b035345559c206cc0ca13d65204235df22e20c197668f71f61bf005b9a26d24f1697799842a577666dba87a6e41bd8c227
+  metadata.gz: c07b216029b059b7f9bc2dcff01278801933fb95d7a201f19d95951e52329f66c311ebf26bdbd910a2c57a2347adef6f366361f5e71643a735e8bc1775c4b61c
+  data.tar.gz: eeb161448cdd65860686248281c6438e0b9e096529956ae175faab0a41bfa40cd3a101017b9aa5367099d4ad39fd7bc972fde48beae88e3f3dcb2c500715ae22

data/bin/text_sentencer CHANGED Viewed

@@ -3,6 +3,7 @@ require 'json'
 require 'text_sentencer'
 config_filename = nil
+output_mode = :sentences
 ## command line option processing
 require 'optparse'
@@ -13,6 +14,10 @@ optparse = OptionParser.new do |opts|
     config_filename = c
   end
+  opts.on('-j', '--json_output', 'outputs the result in JSON.') do
+    output_mode = :json
+  end
   opts.on('-h', '--help', 'displays this screen.') do
     puts opts
     exit
@@ -28,21 +33,13 @@ end
 sentencer = TextSentencer.new(config)
 text = ARGF.read
-## Preprocessing
-# It should be removed later
-text.gsub!(/ +/, ' ')
-text.gsub!(/\n+/, "\n")
-text.gsub!(/\t+/, "\t")
-text.gsub!(/\n /, "\n")
-text.gsub!(/ \n/, "\n")
-text.gsub!(/\t /, "\t")
-text.gsub!(/ \t/, "\t")
-text.gsub!(/\n\t/, "\n")
-text.gsub!(/\t\n/, "\n")
 annotations = sentencer.annotate(text)
-annotations[:denotations].each do |d|
-	span = d[:span]
-	puts text[span[:begin]...span[:end]]
+if output_mode == :json
+  puts JSON.pretty_generate(annotations)
+else
+  annotations[:denotations].each do |d|
+    span = d[:span]
+    puts text[span[:begin]...span[:end]]
+  end
 end

data/lib/text_sentencer/string_scan_offset.rb ADDED Viewed

@@ -0,0 +1,9 @@
+class String
+  def scan_offset(regex)
+    Enumerator.new do |y|
+      self.scan(regex) do
+        y << Regexp.last_match
+      end
+    end
+  end
+end

data/lib/text_sentencer/text_sentencer.rb CHANGED Viewed

@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+require 'text_sentencer/string_scan_offset'
 require 'pp'
 class TextSentencer
@@ -6,14 +7,10 @@ class TextSentencer
   DEFAULT_RULES = {
     # All the positions of new line characters always take sentence break.
-    break_characters: [
-      "\n"
-    ],
+    break_pattern: "([ \t]*\n+)+[ \t]*",
     # All the positions of space and tab characters are candidates of sentence break.
-    break_candidates: [
-      " ", "\t"
-    ],
+    candidate_pattern: "[ \t]+",
     # First, positive rules are applied to the break candidates to make initial segmentations.
     positive_rules: [
@@ -49,75 +46,73 @@ class TextSentencer
   def initialize(rules = nil)
     rules ||= DEFAULT_RULES
     @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
-    @rules[:break_characters] ||= []
-    @rules[:break_candidates] ||= []
+    @rules[:break_pattern] ||= ""
+    @rules[:candidate_pattern] ||= ""
     @rules[:positive_rules] ||= []
     @rules[:negative_rules] ||= []
   end
   def annotate(text)
-    return nil if text.nil? || text.empty?
+    return nil if text.nil?
     sentences = segment(text)
     denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
-    denotations.empty? ? {text:text} : {text:text, denotations:denotations}
+    {text:text, denotations:denotations}
   end
   private
   def segment(text)
-    original_text = text
-    text = original_text.strip
-    start = original_text.index(text)
-    # sentence breaks
-    breaks = []
+    breaks = if @rules[:break_pattern].empty?
+      []
+    else
+      text.scan_offset(/#{@rules[:break_pattern]}/).map{|m| m.offset(0)}
+    end
-    # breaks by positive rules
-    pbreaks = []
+    candidates = if @rules[:candidate_pattern].empty?
+      []
+    else
+      text.scan_offset(/#{@rules[:candidate_pattern]}/).map{|m| m.offset(0)}
+    end
-    # canceled breaks by negative rules
-    nbreaks = []
+    # breaks take precedence
+    candidates -= breaks
-    for l in 0..text.length
+    candidates.each do |c|
+      last_end, next_begin = c
-      ## apply the positive rules to the places of break candidates
-      if @rules[:break_candidates].include?(text[l])
-        @rules[:positive_rules].each do |r|
-          if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
-            pbreaks << l
-            break
-          end
-        end
-      elsif @rules[:break_characters].include?(text[l])
-        breaks << l
+      if (last_end == 0) || (next_begin == text.length)
+        breaks << c
+        next
       end
-    end
-    ## apply the negative rules to the places of break candidates
-    pbreaks.each do |l|
-      @rules[:negative_rules].each do |r|
-        if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
-          nbreaks << l
+      last_text = text[0...last_end]
+      next_text = text[next_begin..-1]
+      @rules[:positive_rules].each do |p|
+        if (last_text =~ /#{p[0]}\Z/) && (next_text =~ /\A#{p[1]}/)
+          break_p = true
+          @rules[:negative_rules].each do |n|
+            if (last_text =~ /#{n[0]}\Z/) && (next_text =~ /\A#{n[1]}/)
+              break_p = false
+              break
+            end
+          end
+          breaks << c if break_p
           break
         end
       end
     end
-    breaks += pbreaks - nbreaks
     breaks.sort!
     sentences = []
-    lastbreak = -1
+    lastbreak = 0
     breaks.each do |b|
-      sentences.push([lastbreak+1, b])
-      lastbreak = b
+      sentences << [lastbreak, b[0]] if b[0] > lastbreak
+      lastbreak = b[1]
     end
-    sentences.push([lastbreak+1, text.length])
-    ## filter out empty segments
-    sentences.delete_if {|b, e| text[b...e] !~ /[a-zA-Z0-9]/}
-    ## adjust offsets for the in text
-    sentences.collect!{|b, e| [b + start, e + start]}
+    sentences << [lastbreak, text.length] if lastbreak < text.length
     sentences
   end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_sentencer
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 1.0.0
 platform: ruby
 authors:
 - Jin-Dong Kim
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-07-21 00:00:00.000000000 Z
+date: 2017-07-22 00:00:00.000000000 Z
 dependencies: []
 description: TextSentencer is a simple rule-based system for segmenting a text block
   into sentences.
@@ -20,6 +20,7 @@ extra_rdoc_files: []
 files:
 - bin/text_sentencer
 - lib/text_sentencer.rb
+- lib/text_sentencer/string_scan_offset.rb
 - lib/text_sentencer/text_sentencer.rb
 homepage: https://github.com/jdkim/text_sentencer
 licenses: