RubyGems - text_sentencer - Versions diffs - 0.1.0 → 0.2.0 - Mend

text_sentencer 0.1.0 → 0.2.0

Files changed (5) hide show

checksums.yaml +4 -4
data/bin/text_sentencer +47 -0
data/lib/text_sentencer/text_sentencer.rb +101 -20
metadata +9 -8
data/lib/text_sentencer/rules.rb +0 -33

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 1ec3b6b9d2b3596397c952d6e905c75f4667a73d
-  data.tar.gz: f094203047168fae9a682d1344904c301c9b62ae
+  metadata.gz: 85aab334688ecac5dc4b3307c58a2bf0058e0aad
+  data.tar.gz: 0255396f5d925c06023111bca178157b56765586
 SHA512:
-  metadata.gz: 1142ded7a5be0e72cb840f8f0f6b294196b4718f1e793935304068c6dd4763eedcd55db26ed70e24cb7477d5b3f3fcb24f4c0c57b9a6751a325e4a26c429d553
-  data.tar.gz: 44ef4ff3c18c18623b340a32331153da3f24e3e992f7b481d682d05ec1778e05f75f85ce1992182a8c6cc383d28a3e14e0d553d3f2df20853928400da73a13a0
+  metadata.gz: a014eb537f0902018a4a71991110cea6350e26dadc108bc076f7673cd7ace1ea46a8ae8e07b76c4a22d7e94bbd0d7d94e96be5123e2ddf98b3536daba2a69b4c
+  data.tar.gz: 54885c68f05f96de55bb94ad6375cee45fb4fd6d9c41000cb7fa7091f0e3c0792ac63494c6b1d788088bc41a80559d3c711ccd8a82511a47c24c9fb7126a58c8

data/bin/text_sentencer ADDED Viewed

@@ -0,0 +1,47 @@
+#!/usr/bin/env ruby
+require 'text_sentencer'
+config_filename = nil
+## command line option processing
+require 'optparse'
+optparse = OptionParser.new do |opts|
+  opts.banner = "Usage: text_sentencer [options]"
+  opts.on('-c', '--config', 'specifies the configuration JSON file.') do |f|
+    config_filename = f
+  end
+  opts.on('-h', '--help', 'displays this screen.') do
+    puts opts
+    exit
+  end
+end
+optparse.parse!
+config = if config_filename
+	JSON.parse File.read(config_filename) if File.file?(config_filename)
+end
+sentencer = TextSentencer.new(config)
+text = ARGF.read
+## Preprocessing
+# TODO: this preprocessing should be removed later
+text.gsub!(/ +/, ' ')
+text.gsub!(/\n+/, "\n")
+text.gsub!(/\t+/, "\t")
+text.gsub!(/\n /, "\n")
+text.gsub!(/ \n/, "\n")
+text.gsub!(/\t /, "\t")
+text.gsub!(/ \t/, "\t")
+text.gsub!(/\n\t/, "\n")
+text.gsub!(/\t\n/, "\n")
+annotations = sentencer.annotate(text)
+annotations[:denotations].each do |d|
+	span = d[:span]
+	puts text[span[:begin]...span[:end]]
+end

data/lib/text_sentencer/text_sentencer.rb CHANGED Viewed

@@ -1,44 +1,105 @@
 #!/usr/bin/env ruby
-require 'text_sentencer/rules'
-module TextSentencer; end unless defined? TextSentencer
+class TextSentencer
+  ## default rules
-module TextSentencer
-  def TextSentencer.segment(text)
+  # All positions of space and tab characters are candidates for a sentence break.
+  BREAK_CANDIDATES = [
+    " ", "\t"
+  ]
+  # Every position of a newline character is always a sentence break.
+  BREAK_CHARACTERS = [
+    "\n"
+  ]
+  # First, positive rules are applied to the break candidates to make initial segmentations.
+  POSITIVE_RULES = [
+    ['[\.!?]', '[0-9A-Z]'],
+    ['[:]', '[0-9]'],
+    ['[:]', '[A-Z][a-z]']
+  ]
+  # Then, negative rules are applied to cancel some initial segmentations.
+  NEGATIVE_RULES = [
+    # Titles before names
+    ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
+    # Titles usually before names, but ..
+    ['(Sr|Jr)\.', '[A-Z][a-z]'],
+    # Single-letter abbreviations, e.g. middle name
+    # ['\b[A-Z]\.', '[A-Z][a-z]'],
+    # Abbreviations, e.g. middle name
+    ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
+    # Frequent abbreviations that will never appear at the end of a sentence
+    ['(cf|vs)\.', ''],
+    ['e\.g\.', ''],
+    ['i\.e\.', ''],
+    # Others
+    ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
+  ]
+  def initialize(rules = {})
+    rules ||= {}
+    @break_candidates = rules[:break_candidates] || BREAK_CANDIDATES
+    @break_characters = rules[:break_characters] || BREAK_CHARACTERS
+    @positive_rules = rules[:positive_rules] || POSITIVE_RULES
+    @negative_rules = rules[:negative_rules] || NEGATIVE_RULES
+  end
+  def annotate(text)
+    return nil if text.nil? || text.empty?
+    sentences = segment(text)
+    denotations = sentences.inject([]){|c, s| c << {:span => {:begin => s[0], :end => s[1]}, :obj => 'Sentence'}}
+    denotations.empty? ? {text:text} : {text:text, denotations:denotations}
+  end
+  private
+  def segment(text)
     original_text = text
     text = original_text.strip
     start = original_text.index(text)
-    ## apply the positive rules to the places of space and newline characters
-    pbreaks = []                # breaks by positive rules
+    # sentence breaks
+    breaks = []
+    # breaks by positive rules
+    pbreaks = []
+    # canceled breaks by negative rules
+    nbreaks = []
     for l in 0..text.length
-      case text[l]
-      when ' '                   # space
+      ## apply the positive rules to the places of break candidates
+      if @break_candidates.include?(text[l])
         POSITIVE_RULES.each do |r|
           if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
             pbreaks << l
             break
           end
         end
-      when "\n"                   # newline
-        pbreaks << l
+      elsif @break_characters.include?(text[l])
+        breaks << l
       end
     end
-    ## apply the negative rules to the places of space characters
-    nbreaks = []                # breaks by negative rules
+    ## apply the negative rules to the places of break candidates
     pbreaks.each do |l|
-      if text[l] == ' '
-        NEGATIVE_RULES.each do |r|
-          if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
-            nbreaks << l
-            break
-          end
+      NEGATIVE_RULES.each do |r|
+        if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
+          nbreaks << l
+          break
         end
       end
     end
-    breaks = pbreaks - nbreaks
+    breaks += pbreaks - nbreaks
+    breaks.sort!
     sentences = []
     lastbreak = -1
@@ -59,11 +120,31 @@ module TextSentencer
 end
 if __FILE__ == $0
+  rules = {
+    break_candidates: [
+      " ", "\t"
+    ],
+    break_characters: [
+      "\n"
+    ],
+    positive_rules: [
+      ['[\.!?]', '[0-9A-Z]'],
+      ['[:]', '[0-9]'],
+      ['[:]', '[A-Z][a-z]']
+    ],
+    negative_rules: []
+  }
+  sentencer = TextSentencer.new
   text = ''
   ARGF.each do |line|
     text += line
   end
-  sen_so = TextSentencer.segment(text)
+  sen_so = sentencer.annotate(text)
   p(sen_so)
 end

metadata CHANGED Viewed

@@ -1,24 +1,25 @@
 --- !ruby/object:Gem::Specification
 name: text_sentencer
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 platform: ruby
 authors:
 - Jin-Dong Kim
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-04-11 00:00:00.000000000 Z
+date: 2017-07-20 00:00:00.000000000 Z
 dependencies: []
-description: Text sentencer finds sentence boundaries of a given text. It is a simple,
-  rule-based system.
+description: TextSentencer is a simple rule-based system for segmenting a text block
+  into sentences.
 email: jindong.kim@gmail.com
-executables: []
+executables:
+- text_sentencer
 extensions: []
 extra_rdoc_files: []
 files:
+- bin/text_sentencer
 - lib/text_sentencer.rb
-- lib/text_sentencer/rules.rb
 - lib/text_sentencer/text_sentencer.rb
 homepage: http://rubygems.org/gems/text_sentencer
 licenses:
@@ -40,8 +41,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.2.2
+rubygems_version: 2.4.8
 signing_key:
 specification_version: 4
-summary: To find sentences in text.
+summary: A simple, rule-based script to find sentence boundaries in text.
 test_files: []

data/lib/text_sentencer/rules.rb DELETED Viewed

@@ -1,33 +0,0 @@
-module TextSentencer; end unless defined? TextSentencer
-# All the positions of whitespace characters are candiate of sentence boundary.
-# First, positive rules are applied to find make initial segmantations.
-TextSentencer::POSITIVE_RULES = [
-  ['[\.!?]', '[0-9A-Z]'],
-  ['[:]', '[0-9]'],
-  ['[:]', '[A-Z][a-z]']
-]
-# Then, negative rules are applied to cancel some initial segmentations.
-TextSentencer::NEGATIVE_RULES = [
-  # Titles before names
-  ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
-  # Titles usually before names, but ..
-  ['(Sr|Jr)\.', '[A-Z][a-z]'],
-  # Single letter abbriveations, e.g. middle name
-  # ['\b[A-Z]\.', '[A-Z][a-z]'],
-  # Abbriveations, e.g. middle name
-  ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
-  # Frequent abbreviations that will never appear in the end of a sentence
-  ['(cf|vs)\.', ''],
-  ['e\.g\.', ''],
-  ['i\.e\.', ''],
-  # Others
-  ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
-]