RubyGems - text_sentencer - Versions diffs - 0.2.0 → 0.2.1 - Mend

text_sentencer 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

checksums.yaml +4 -4
data/bin/text_sentencer +5 -4
data/lib/text_sentencer/text_sentencer.rb +54 -80
metadata +3 -3

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 85aab334688ecac5dc4b3307c58a2bf0058e0aad
-  data.tar.gz: 0255396f5d925c06023111bca178157b56765586
+  metadata.gz: 7b479744fc24ea87a1629ab886c39a16900d7ba8
+  data.tar.gz: 82a7d16ab9aff0a44fabfbc1b57a6367d9c7a979
 SHA512:
-  metadata.gz: a014eb537f0902018a4a71991110cea6350e26dadc108bc076f7673cd7ace1ea46a8ae8e07b76c4a22d7e94bbd0d7d94e96be5123e2ddf98b3536daba2a69b4c
-  data.tar.gz: 54885c68f05f96de55bb94ad6375cee45fb4fd6d9c41000cb7fa7091f0e3c0792ac63494c6b1d788088bc41a80559d3c711ccd8a82511a47c24c9fb7126a58c8
+  metadata.gz: 1573ea7f89f0a96d37de04724792c18f0f34f646f0e6fc005605c521903fb49cf6890dd9826fcf6e1a229ace0a8369fb2c36a61b0fe5444874d06fc1323a36b2
+  data.tar.gz: 7154af7ce5f9ce392b926b448f82e0b035345559c206cc0ca13d65204235df22e20c197668f71f61bf005b9a26d24f1697799842a577666dba87a6e41bd8c227

data/bin/text_sentencer CHANGED

@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+require 'json'
 require 'text_sentencer'
 config_filename = nil
@@ -8,8 +9,8 @@ require 'optparse'
 optparse = OptionParser.new do |opts|
   opts.banner = "Usage: text_sentencer [options]"
-  opts.on('-c', '--config', 'specifies the configuration JSON file.') do |f|
-    config_filename = f
+  opts.on('-c', '--config=config_filename', 'specifies the configuration JSON file.') do |c|
+    config_filename = c
   end
   opts.on('-h', '--help', 'displays this screen.') do
@@ -20,8 +21,8 @@ end
 optparse.parse!
-config = if config_filename
-	JSON.parse File.read(config_filename) if File.file?(config_filename)
+config = if config_filename && File.file?(config_filename)
+	JSON.parse File.read(config_filename)
 end
 sentencer = TextSentencer.new(config)

data/lib/text_sentencer/text_sentencer.rb CHANGED

@@ -1,54 +1,58 @@
 #!/usr/bin/env ruby
+require 'pp'
 class TextSentencer
   ## default rules
-  # All the positions of space and tab characters are candiates of sentence break.
-  BREAK_CANDIDATES = [
-    " ", "\t"
-  ]
-  # All the positions of new line characters always take sentence break.
-  BREAK_CHARACTERS = [
-    "\n"
-  ]
-  # First, positive rules are applied to the break candidates to make initial segmantations.
-  POSITIVE_RULES = [
-    ['[\.!?]', '[0-9A-Z]'],
-    ['[:]', '[0-9]'],
-    ['[:]', '[A-Z][a-z]']
-  ]
-  # Then, negative rules are applied to cancel some initial segmentations.
-  NEGATIVE_RULES = [
-    # Titles before names
-    ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
-    # Titles usually before names, but ..
-    ['(Sr|Jr)\.', '[A-Z][a-z]'],
-    # Single letter abbriveations, e.g. middle name
-    # ['\b[A-Z]\.', '[A-Z][a-z]'],
-    # Abbriveations, e.g. middle name
-    ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
-    # Frequent abbreviations that will never appear in the end of a sentence
-    ['(cf|vs)\.', ''],
-    ['e\.g\.', ''],
-    ['i\.e\.', ''],
-    # Others
-    ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
-  ]
-  def initialize(rules = {})
-    rules ||= {}
-    @break_candidates = rules[:break_candidates] || BREAK_CANDIDATES
-    @break_characters = rules[:break_characters] || BREAK_CHARACTERS
-    @positive_rules = rules[:positive_rules] || POSITIVE_RULES
-    @negative_rules = rules[:negative_rules] || NEGATIVE_RULES
+  DEFAULT_RULES = {
+    # All the positions of new line characters always take sentence break.
+    break_characters: [
+      "\n"
+    ],
+    # All the positions of space and tab characters are candiates of sentence break.
+    break_candidates: [
+      " ", "\t"
+    ],
+    # First, positive rules are applied to the break candidates to make initial segmantations.
+    positive_rules: [
+      ['[.!?]', '[0-9A-Z]'],
+      ['[:]', '[0-9]'],
+      ['[:]', '[A-Z][a-z]']
+    ],
+    # Then, negative rules are applied to cancel some initial segmentations.
+    negative_rules: [
+      # Titles before names
+      ['(Mrs|Mmes|Mr|Messrs|Ms|Prof|Dr|Drs|Rev|Hon|Sen|St)\.', '[A-Z][a-z]'],
+      # Titles usually before names, but ..
+      ['(Sr|Jr)\.', '[A-Z][a-z]'],
+      # Single letter abbriveations, e.g. middle name
+      # ['\b[A-Z]\.', '[A-Z][a-z]'],
+      # Abbriveations, e.g. middle name
+      ['\b[A-Z][a-z]*\.', '[0-9A-Z]'],
+      # Frequent abbreviations that will never appear in the end of a sentence
+      ['(cf|vs)\.', ''],
+      ['e\.g\.', ''],
+      ['i\.e\.', ''],
+      # Others
+      ['(Sec|Chap|Fig|Eq)\.', '[0-9A-Z]']
+    ]
+  }
+  def initialize(rules = nil)
+    rules ||= DEFAULT_RULES
+    @rules = Hash[rules.map{|(k,v)| [k.to_sym,v]}]
+    @rules[:break_characters] ||= []
+    @rules[:break_candidates] ||= []
+    @rules[:positive_rules] ||= []
+    @rules[:negative_rules] ||= []
   end
   def annotate(text)
@@ -77,21 +81,21 @@ class TextSentencer
     for l in 0..text.length
       ## apply the positive rules to the places of break candidates
-      if @break_candidates.include?(text[l])
-        POSITIVE_RULES.each do |r|
+      if @rules[:break_candidates].include?(text[l])
+        @rules[:positive_rules].each do |r|
           if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
             pbreaks << l
             break
           end
         end
-      elsif @break_characters.include?(text[l])
+      elsif @rules[:break_characters].include?(text[l])
         breaks << l
       end
     end
     ## apply the negative rules to the places of break candidates
     pbreaks.each do |l|
-      NEGATIVE_RULES.each do |r|
+      @rules[:negative_rules].each do |r|
         if (text[0...l] =~ /#{r[0]}\Z/) && (text[l+1..-1] =~ /\A#{r[1]}/)
           nbreaks << l
           break
@@ -118,33 +122,3 @@ class TextSentencer
     sentences
   end
 end
-if __FILE__ == $0
-  rules = {
-    break_candidates: [
-      " ", "\t"
-    ],
-    break_characters: [
-      "\n"
-    ],
-    positive_rules: [
-      ['[\.!?]', '[0-9A-Z]'],
-      ['[:]', '[0-9]'],
-      ['[:]', '[A-Z][a-z]']
-    ],
-    negative_rules: []
-  }
-  sentencer = TextSentencer.new
-  text = ''
-  ARGF.each do |line|
-    text += line
-  end
-  sen_so = sentencer.annotate(text)
-  p(sen_so)
-end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: text_sentencer
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Jin-Dong Kim
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-07-20 00:00:00.000000000 Z
+date: 2017-07-21 00:00:00.000000000 Z
 dependencies: []
 description: TextSentencer is a simple rule-based system for segmenting a text block
   into sentences.
@@ -21,7 +21,7 @@ files:
 - bin/text_sentencer
 - lib/text_sentencer.rb
 - lib/text_sentencer/text_sentencer.rb
-homepage: http://rubygems.org/gems/text_sentencer
+homepage: https://github.com/jdkim/text_sentencer
 licenses:
 - MIT
 metadata: {}