RubyGems - srx-english - Versions diffs - 0.1.0 - Mend

srx-english 0.1.0

Files changed (11) hide show

data/.gitignore +3 -0
data/README.rdoc +75 -0
data/changelog.txt +2 -0
data/features/sentence_splitter.feature +34 -0
data/features/steps/sentence_splitter.rb +17 -0
data/features/steps/word_splitter.rb +26 -0
data/features/word_splitter.feature +17 -0
data/lib/srx/english/sentence_splitter.rb +96 -0
data/lib/srx/english/word_splitter.rb +57 -0
data/srx-english.gemspec +23 -0
metadata +77 -0

data/.gitignore ADDED Viewed

@@ -0,0 +1,3 @@
+.*.sw?
+work
+pkg

data/README.rdoc ADDED Viewed

@@ -0,0 +1,75 @@
+== srx-english
+* https://github.com/apohllo/srx-english
+= DESCRIPTION
+'srx-english' is a Ruby library containing English sentence and word segmentation rules.
+The sentence segmentation rules are based on rules defined by Marcin Miłkowski:
+http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
+= FEATURES/PROBLEMS
+* this library is generated by 'srx2ruby', which has some limitations and might
+  not be 100% compliant with the SRX standard.
+= INSTALL
+Standard rubygems installation:
+  $ gem install srx-english
+= BASIC USAGE
+The library defines the SRX::English::SentenceSplitter class, which allows iterating
+over the matched sentences:
+  require 'srx/english/sentence_splitter'
+  text =<<-END
+    This is e.g. Mr. Smith, who talks slowly... And this is another sentence.
+  END
+  splitter = SRX::English::SentenceSplitter.new(text)
+  splitter.each do |sentence|
+    puts sentence.gsub(/\n|\r/,"")
+  end
+  # This is e.g. Mr. Smith, who talks slowly...
+  # And this is another sentence.
+  require 'srx/english/word_splitter'
+  sentence = 'My home is my castle.'
+  splitter = SRX::English::WordSplitter.new(sentence)
+  splitter.each do |word,type|
+    puts "'#{word}' #{type}"
+  end
+  # 'My' word
+  # ' ' other
+  # 'home' word
+  # ' ' other
+  # 'is' word
+  # ' ' other
+  # 'my' word
+  # ' ' other
+  # 'castle' word
+  # '.' punct
+== LICENSE
+Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+== FEEDBACK
+* mailto:apohllo@o2.pl

data/changelog.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 0.1.0
2	+ - First release of sentence segmentation rules for English

data/features/sentence_splitter.feature ADDED Viewed

@@ -0,0 +1,34 @@
+Feature: sentence splitter
+  Scenario: splitting text
+    Given a text
+      """
+      It [really!] works.
+      """
+    When the text is split
+    Then the following sentences should be detected
+      | sentence            |
+      #-------------------- #
+      | It [really!] works. |
+    Given a text
+      """
+      This is e.g. Mr. Smith, who talks slowly... And this is another sentence.
+      """
+    When the text is split
+    Then the following sentences should be detected
+      | sentence                                    |
+      #---------------------------------------------#
+      | This is e.g. Mr. Smith, who talks slowly... |
+      | And this is another sentence.               |
+    Given a text
+      """
+      Leave me alone!, he yelled. I am in the U.S. Army. Charles (Ind.) said he.
+      """
+    When the text is split
+    Then the following sentences should be detected
+      | sentence                    |
+      #-----------------------------#
+      | Leave me alone!, he yelled. |
+      | I am in the U.S. Army.      |
+      | Charles (Ind.) said he.     |

data/features/steps/sentence_splitter.rb ADDED Viewed

@@ -0,0 +1,17 @@
+# encoding: utf-8
+$:.unshift "lib"  # put the relative "lib" directory first on the load path
+require 'srx/english/sentence_splitter'
+Given /^a text$/ do |text|  # the step's text argument is kept for the split step
+  @text = text
+end
+When /^the text is split$/ do  # only builds the splitter; iteration happens in the Then step
+  @splitter = SRX::English::SentenceSplitter.new(@text)
+end
+Then /^the following sentences should be detected$/ do |table|
+  table.hashes.zip(@splitter.to_a).each do |expected,returned|  # pairs rows with sentences by position; assuming hashes is an Array, surplus returned sentences are not checked
+    returned.gsub(/\s*\n/," ").strip.should == expected[:sentence]  # newline runs (with preceding whitespace) become one space, then the result is trimmed
+  end
+end

data/features/steps/word_splitter.rb ADDED Viewed

@@ -0,0 +1,26 @@
+# encoding: utf-8
+$:.unshift "lib"
+#require 'srx/english/word_splitter'
+Given /^a sentence '([^']+)'$/ do |sentence|
+  @sentence = sentence.force_encoding('utf-8')
+end
+When /^the sentence is split$/ do
+  @splitter = SRX::English::WordSplitter.new(@sentence)
+end
+Then /^the following segments should be detected$/ do |table|
+  table.hashes.zip(@splitter.to_a).each do |expected,returned|
+    returned[0].should == expected[:segment].gsub(/'/,"")
+    returned[1].should == expected[:type].to_sym
+  end
+end
+Then /^the following non-blank segments should be detected$/ do |table|
+  segments = @splitter.select{|s| s[1] != :other}
+  table.hashes.zip(segments).each do |expected,returned|
+    returned[0].should == expected[:segment].gsub(/'/,"")
+    returned[1].should == expected[:type].to_sym
+  end
+end

data/features/word_splitter.feature ADDED Viewed

@@ -0,0 +1,17 @@
+Feature: word splitter
+  Scenario: splitting a sentence
+    Given a sentence 'My home is my castle.'
+    When the sentence is split
+    Then the following segments should be detected
+      | segment | type  |
+      #-----------------#
+      | My      | word  |
+      | ' '     | other |
+      | home    | word  |
+      | ' '     | other |
+      | is      | word  |
+      | ' '     | other |
+      | my      | word  |
+      | ' '     | other |
+      | castle  | word  |
+      | .       | punct |

data/lib/srx/english/sentence_splitter.rb ADDED Viewed

@@ -0,0 +1,96 @@
+#encoding: utf-8
+require 'stringio'
+require 'term/ansicolor'
+module SRX
+  module English
+    # Segmentation rules as [before, after, break] triples:
+    # * before - regexp source matched against the end of the text that
+    #   precedes a candidate position,
+    # * after - regexp source matched against the start of the text that
+    #   follows it (nil puts no constraint on the following text),
+    # * break - true if the rule marks a sentence break, false if it
+    #   suppresses one.
+    # The after-pattern of the "etc." rule is "[^\\p{Lu}]" (anything but an
+    # uppercase letter); without the backslash it would be a class of the
+    # literal characters p, {, L, u and }.
+    # NOTE(review): "[Vvol]" in the abbreviation rule matches a single
+    # character; it looks like a typo for "[Vv]ol" - confirm against the
+    # source SRX file before changing it.
+    RULES =
+[["(?:['\"„][\\.!?…]['\"”]\\s)|(?:[^\\.]\\s[A-Z]\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s)|(?:\\b(?:St|Gen|Hon|Prof|Dr|Mr|Ms|Mrs|[JS]r|Col|Maj|Brig|Sgt|Capt|Cmnd|Sen|Rev|Rep|Revd)\\.\\s[A-Z]\\.\\s)|(?:\\bApr\\.\\s)|(?:\\bAug\\.\\s)|(?:\\bBros\\.\\s)|(?:\\bCo\\.\\s)|(?:\\bCorp\\.\\s)|(?:\\bDec\\.\\s)|(?:\\bDist\\.\\s)|(?:\\bFeb\\.\\s)|(?:\\bInc\\.\\s)|(?:\\bJan\\.\\s)|(?:\\bJul\\.\\s)|(?:\\bJun\\.\\s)|(?:\\bMar\\.\\s)|(?:\\bNov\\.\\s)|(?:\\bOct\\.\\s)|(?:\\bPh\\.?D\\.\\s)|(?:\\bSept?\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bcf\\.\\s)|(?:\\be\\.g\\.\\s)|(?:\\besp\\.\\s)|(?:\\bet\\b\\s\\bal\\.\\s)|(?:\\bvs\\.\\s)|(?:\\p{Ps}[!?]+\\p{Pe} )",
+  nil,
+  false],
+ ["(?:[\\.\\s]\\p{L}{1,2}\\.\\s)", "[\\p{N}\\p{Ll}]", false],
+ ["(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )", "[^\\p{Lu}]", false],
+ ["(?:\\b(?:pp|[Vv]iz|i\\.?\\s*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl|Pres|[Dd]ept|min|max|[Gg]ovt|lb|ft|c\\.?\\s*f|vs)\\.\\s)",
+  "[^\\p{Lu}]|I",
+  false],
+ ["(?:\\b[Ee]tc\\.\\s)", "[^\\p{Lu}]", false],
+ ["(?:[\\.!?…]+\\p{Pe} )|(?:[\\[\\(]*…[\\]\\)]* )", "\\p{Ll}", false],
+ ["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
+ ["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
+ ["(?:\\b[Ff]igs?\\.\\s)|(?:\\b[nN]o\\.\\s)", "\\p{N}", false],
+ ["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
+ ["(?:[\\.!?…][\\u00BB\\u2019\\u201D\\u203A\"'\\p{Pe}\\u0002]*\\s)|(?:\\r?\\n)",
+  nil,
+  true],
+ ["(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\p{Pe}\\u0002]*)",
+  "\\p{Lu}[^\\p{Lu}]",
+  true],
+ ["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]]
+    # Matches when the "before" pattern of any rule matches at the end of a string.
+    BEFORE_RE = /(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\Z/m
+    # For every rule: the "before" regexp anchored at the end of a string, the
+    # "after" regexp anchored at its start, and the rule's break flag.
+    REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
+    # Matches the first character of a string (a newline included).
+    FIRST_CHAR = /\A./m
+    class SentenceSplitter
+      include Enumerable
+      attr_accessor :input
+      attr_writer :debug
+      # The sentence splitter is initialized with the +text+ to split.
+      # This might be a String or a IO object.
+      def initialize(text=nil)
+        if text.is_a?(String)
+          @input = StringIO.new(text,"r:utf-8")
+        else
+          @input = text
+        end
+      end
+      # Iterate over the sentences in the text.
+      # If the text is nil, exception is raised.
+      def each
+        raise "Invalid argument - text is nil" if @input.nil?
+        buffer_length = 10
+        sentence = ""
+        before_buffer = ""
+        @input.pos = 0
+        after_buffer = buffer_length.times.map{|i| @input.readchar}.join("")
+        matched_rule = nil
+        while(!@input.eof?) do
+          matched_before = BEFORE_RE.match(before_buffer)
+          break_detected = false
+          if matched_before
+            start_index = (matched_before.size - 1).times.find do |index|
+              matched_before[index+1]
+            end
+            if @debug
+              puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
+            end
+            REGEXPS.each do |before_re,after_re,value|
+              # skip the whole match
+              if before_re.match(before_buffer) && after_re.match(after_buffer)
+                break_detected = true
+                color = value ? :red : :green
+                if @debug
+                  sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
+                end
+                if value
+                  yield sentence
+                  sentence = ""
+                end
+                break
+              end
+            end
+          end
+          next_after = @input.readchar
+          before_buffer.sub!(FIRST_CHAR,"") if before_buffer.size >= buffer_length
+          after_buffer.sub!(FIRST_CHAR,"")
+          before_buffer << $&
+          sentence << $&
+          after_buffer << next_after
+        end
+        yield sentence + after_buffer unless sentence.empty? || after_buffer.empty?
+      end
+    end
+  end
+end

data/lib/srx/english/word_splitter.rb ADDED Viewed

@@ -0,0 +1,57 @@
+# encoding: utf-8
+module SRX
+  module English
+    class WordSplitter
+      include Enumerable
+      attr_accessor :sentence
+      SPLIT_RULES = {  # segment type => regexp source; the entry order is the capture-group order of SPLIT_RE
+        :word => "\\p{Alpha}\\p{Word}*",
+        :number => "\\p{Digit}+(?:[:., _/-]\\p{Digit}+)*",
+        :punct => "\\p{Punct}",
+        :graph => "\\p{Graph}",
+        :other => "[^\\p{Word}\\p{Graph}]+"
+      }
+      SPLIT_RE = /#{SPLIT_RULES.values.map{|v| "(#{v})"}.join("|")}/m
+      # The initializer accepts a +sentence+; +each+ only requires it to
+      # respond to +scan+ (e.g. a String instance).
+      #
+      # The splitter might be initialized without the sentence,
+      # but it should then be set using the accessor before the first
+      # call to the +each+ method.
+      def initialize(sentence=nil)
+        @sentence = sentence
+      end
+      # This method iterates over the segments of the sentence.
+      # It yields the string representation of the segment and
+      # its type, which is one of:
+      # * +:word+ - a regular word (including words containing numbers, like A4)
+      # * +:number+ - a number (including number with spaces, dashes, slashes, etc.)
+      # * +:punct+ - single punctuation character (comma, semicolon, full stop, etc.)
+      # * +:graph+ - any single graphical (visible) character
+      # * +:other+ - anything which is not covered by the above types (non-visible
+      #   characters in particular)
+      def each
+        raise "Invalid argument - sentence is nil" if @sentence.nil?
+        @sentence.scan(SPLIT_RE) do |word,number,punct,graph,other|  # one capture per SPLIT_RULES entry; the non-nil one tells the type
+          if !word.nil?
+            yield word, :word
+          elsif !number.nil?
+            yield number, :number
+          elsif !punct.nil?
+            yield punct, :punct
+          elsif !graph.nil?
+            yield graph, :graph
+          else
+            yield other, :other
+          end
+        end
+      end
+    end
+  end
+end

data/srx-english.gemspec ADDED Viewed

@@ -0,0 +1,23 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)  # append the lib directory next to this gemspec to the load path
+Gem::Specification.new do |s|
+  s.name        = "srx-english"
+  s.version     = "0.1.0"
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ["Aleksander Pohl"]
+  s.email       = ["apohllo@o2.pl"]
+  s.homepage    = "http://github.com/apohllo/srx2ruby"  # NOTE(review): README.rdoc links to the srx-english repository instead - confirm which is intended
+  s.summary     = %q{English sentence and word segmentation rules.}
+  s.description = %q{English sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Miłkowski's SRX rules.}
+  s.rubyforge_project = "srx-english"  # NOTE(review): presumably ignored or rejected by recent RubyGems - confirm before reuse
+  s.has_rdoc = false  # NOTE(review): presumably deprecated in recent RubyGems - confirm before reuse
+  s.files         = `git ls-files`.split("\n")  # every file tracked by git, one path per output line
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")  # tracked files matched under test/, spec/ and features/
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }  # basenames of tracked files matched under bin/
+  s.require_paths = ["lib"]
+  s.add_dependency("term-ansicolor", ["~> 1.0.5"])  # pessimistic constraint: >= 1.0.5 and < 1.1
+end

metadata ADDED Viewed

@@ -0,0 +1,77 @@
+--- !ruby/object:Gem::Specification
+name: srx-english
+version: !ruby/object:Gem::Version
+  prerelease:
+  version: 0.1.0
+platform: ruby
+authors:
+- Aleksander Pohl
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-04-19 00:00:00 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: term-ansicolor
+  prerelease: false
+  requirement: &id001 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        version: 1.0.5
+  type: :runtime
+  version_requirements: *id001
+description: "English sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Mi\xC5\x82kowski's SRX rules."
+email:
+- apohllo@o2.pl
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- README.rdoc
+- changelog.txt
+- features/sentence_splitter.feature
+- features/steps/sentence_splitter.rb
+- features/steps/word_splitter.rb
+- features/word_splitter.feature
+- lib/srx/english/sentence_splitter.rb
+- lib/srx/english/word_splitter.rb
+- srx-english.gemspec
+homepage: http://github.com/apohllo/srx2ruby
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+requirements: []
+rubyforge_project: srx-english
+rubygems_version: 1.8.21
+signing_key:
+specification_version: 3
+summary: English sentence and word segmentation rules.
+test_files:
+- features/sentence_splitter.feature
+- features/steps/sentence_splitter.rb
+- features/steps/word_splitter.rb
+- features/word_splitter.feature