RubyGems - srx-polish - Versions diffs - 0.1.1 → 0.2.1 - Mend

srx-polish 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

data/{README.txt → README.rdoc} +30 -5
data/changelog.txt +10 -0
data/features/sentence_splitter.feature +34 -0
data/features/steps/sentence_splitter.rb +17 -0
data/features/steps/word_splitter.rb +26 -0
data/features/word_splitter.feature +75 -0
data/lib/srx/polish/{sentence.rb → sentence_splitter.rb} +3 -1
data/lib/srx/polish/word_splitter.rb +56 -0
data/srx-polish.gemspec +3 -3
metadata +18 -11

data/{README.txt → README.rdoc} RENAMED

@@ -4,8 +4,8 @@
 = DESCRIPTION
-'srx-polish' is a Ruby library containint Polish sentence segmentation rules
-based on SRX rules defined by Marcin Miłkowski:
+'srx-polish' is a Ruby library containing Polish sentence and word segmentation rules.
+The sentence segmentation rules are based on rules defined by Marcin Miłkowski:
 http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
 = FEATURES/PROBLEMS
@@ -24,20 +24,45 @@ Standard rubygems installation:
 The library defines the SRX::Polish::SentenceSplitter class, which allows iterating
 over the matched sentences:
-  require 'srx/polish/sentence'
+  require 'srx/polish/sentence_splitter'
   text =<<-END
     Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie. Na s. 10 książki
     sprawa jest szczegółowo opisana.
   END
-  sentences = SRX::Polish::Sentence.new(text)
-  sentences.each do |sentence|
+  splitter = SRX::Polish::SentenceSplitter.new(text)
+  splitter.each do |sentence|
     puts sentence.gsub(/\n|\r/,"")
   end
   # Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie.
   # Na s. 10 książki sprawa jest szczegółowo opisana.
+  require 'srx/polish/word_splitter'
+  sentence = "Ala ma kota za 5zł i 10$."
+  splitter = SRX::Polish::WordSplitter.new(sentence)
+  splitter.each do |word,type|
+    puts "'#{word}' #{type}"
+  end
+  # 'Ala' word
+  # ' ' other
+  # 'ma' word
+  # ' ' other
+  # 'kota' word
+  # ' ' other
+  # 'za' word
+  # ' ' other
+  # '5' number
+  # 'zł' word
+  # ' ' other
+  # 'i' word
+  # ' ' other
+  # '10' number
+  # '$' graph
+  # '.' punct
 == LICENSE
 Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski

data/changelog.txt ADDED

@@ -0,0 +1,10 @@
+0.2.1
+- fix extension of README (was md, should be rdoc)
+0.2.0
+- API change - Sentence changed into SentenceSplitter
+- Word segmentation rules added
+- Basic features added
+0.1.1
+- Dependency on Term::Ansicolor added
+0.1.0
+- First release of sentence segmentation rules

data/features/sentence_splitter.feature ADDED

@@ -0,0 +1,34 @@
+Feature: sentence splitter
+  Scenario: splitting text
+    Given a text
+      """
+      W październiku 1890 r. dwóch żołnierzy dokonało
+      rewolty, np. Andrzej i P. Woźny.
+      """
+    When the text is split
+    Then the following sentences should be detected
+      | sentence                                                                         |
+      #----------------------------------------------------------------------------------#
+      | W październiku 1890 r. dwóch żołnierzy dokonało rewolty, np. Andrzej i P. Woźny. |
+    Given a text
+      """
+      Wiosna, lato, itd. A wczoraj mieliśmy jesień.
+      """
+    When the text is split
+    Then the following sentences should be detected
+      | sentence                   |
+      #----------------------------#
+      | Wiosna, lato, itd.         |
+      | A wczoraj mieliśmy jesień. |
+    Given a text
+      """
+      Andrzej G. (20 l.). A z nim sześciu mężczyzn.
+      """
+    When the text is split
+    Then the following sentences should be detected
+      | sentence                   |
+      #----------------------------#
+      | Andrzej G. (20 l.).        |
+      | A z nim sześciu mężczyzn.  |

data/features/steps/sentence_splitter.rb ADDED

@@ -0,0 +1,17 @@
+# encoding: utf-8
+$:.unshift "lib"
+require 'srx/polish/sentence_splitter'
+Given /^a text$/ do |text|
+  @text = text
+end
+When /^the text is split$/ do
+  @splitter = SRX::Polish::SentenceSplitter.new(@text)
+end
+Then /^the following sentences should be detected$/ do |table|
+  table.hashes.zip(@splitter.to_a).each do |expected,returned|
+    returned.gsub(/\s*\n/," ").strip.should == expected[:sentence]
+  end
+end

data/features/steps/word_splitter.rb ADDED

@@ -0,0 +1,26 @@
+# encoding: utf-8
+$:.unshift "lib"
+require 'srx/polish/word_splitter'
+Given /^a sentence '([^']+)'$/ do |sentence|
+  @sentence = sentence.force_encoding('utf-8')
+end
+When /^the sentence is split$/ do
+  @splitter = SRX::Polish::WordSplitter.new(@sentence)
+end
+Then /^the following segments should be detected$/ do |table|
+  table.hashes.zip(@splitter.to_a).each do |expected,returned|
+    returned[0].should == expected[:segment].gsub(/'/,"")
+    returned[1].should == expected[:type].to_sym
+  end
+end
+Then /^the following non-blank segments should be detected$/ do |table|
+  segments = @splitter.select{|s| s[1] != :other}
+  table.hashes.zip(segments).each do |expected,returned|
+    returned[0].should == expected[:segment].gsub(/'/,"")
+    returned[1].should == expected[:type].to_sym
+  end
+end

data/features/word_splitter.feature ADDED

@@ -0,0 +1,75 @@
+Feature: word splitter
+  Scenario: splitting a sentence
+    Given a sentence 'My home is my castle.'
+    When the sentence is split
+    Then the following segments should be detected
+      | segment | type  |
+      #-----------------#
+      | My      | word  |
+      | ' '     | other |
+      | home    | word  |
+      | ' '     | other |
+      | is      | word  |
+      | ' '     | other |
+      | my      | word  |
+      | ' '     | other |
+      | castle  | word  |
+      | .       | punct |
+    Given a sentence 'W dniu 14/12/2011 nastąpił napad na bank!'
+    When the sentence is split
+    Then the following non-blank segments should be detected
+      | segment    | type   |
+      #---------------------#
+      | W          | word   |
+      | dniu       | word   |
+      | 14/12/2011 | number |
+      | nastąpił   | word   |
+      | napad      | word   |
+      | na         | word   |
+      | bank       | word   |
+      | !          | punct  |
+    Given a sentence 'Użytkownik o loginie ania8 zalogował się 7 listopada.'
+    When the sentence is split
+    Then the following non-blank segments should be detected
+      | segment    | type   |
+      #---------------------#
+      | Użytkownik | word   |
+      | o          | word   |
+      | loginie    | word   |
+      | ania8      | word   |
+      | zalogował  | word   |
+      | się        | word   |
+      | 7          | number |
+      | listopada  | word   |
+      | .          | punct  |
+    Given a sentence 'Czy wrona, kruk i gawron to polskie ptaki?'
+    When the sentence is split
+    Then the following non-blank segments should be detected
+      | segment    | type   |
+      #---------------------#
+      | Czy        | word   |
+      | wrona      | word   |
+      | ,          | punct  |
+      | kruk       | word   |
+      | i          | word   |
+      | gawron     | word   |
+      | to         | word   |
+      | polskie    | word   |
+      | ptaki      | word   |
+      | ?          | punct  |
+    Given a sentence 'Czy 10 000 000$ to duża kwota?'
+    When the sentence is split
+    Then the following non-blank segments should be detected
+      | segment    | type   |
+      #---------------------#
+      | Czy        | word   |
+      | 10 000 000 | number |
+      | $          | graph  |
+      | to         | word   |
+      | duża       | word   |
+      | kwota      | word   |
+      | ?          | punct  |

data/lib/srx/polish/{sentence.rb → sentence_splitter.rb} RENAMED

@@ -55,8 +55,9 @@ module SRX
     REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
     FIRST_CHAR = /\A./m
+    class SentenceSplitter
+      include Enumerable
-    class Sentence
       attr_accessor :input
       attr_writer :debug
@@ -69,6 +70,7 @@ module SRX
       end
       def each
+        raise "Invalid argument - text is nil" if @input.nil?
         buffer_length = 10
         sentence = ""
         before_buffer = ""

data/lib/srx/polish/word_splitter.rb ADDED

@@ -0,0 +1,56 @@
+# encoding: utf-8
+module SRX
+  module Polish
+    class WordSplitter
+      include Enumerable
+      attr_accessor :sentence
+      SPLIT_RULES = {
+        :word => "\\p{Alpha}\\p{Word}*",
+        :number => "\\p{Digit}+(?:[:., _/-]\\p{Digit}+)*",
+        :punct => "\\p{Punct}",
+        :graph => "\\p{Graph}",
+        :other => "[^\\p{Word}\\p{Graph}]+"
+      }
+      SPLIT_RE = /#{SPLIT_RULES.values.map{|v| "(#{v})"}.join("|")}/m
+      # The initializer accepts a +sentence+, which might be a
+      # Sentence instance or a String instance.
+      #
+      # The splitter may be initialized without a sentence, but the
+      # sentence must then be set using the accessor before the first
+      # call to the +each+ method.
+      def initialize(sentence=nil)
+        @sentence = sentence
+      end
+      # This method iterates over the words in the sentence.
+      # It yields the string representation of the word and
+      # its type, which is one of:
+      # * +:word+ - a regular word (including words containing numbers, like A4)
+      # * +:number+ - a number (including numbers with spaces, dashes, slashes, etc.)
+      # * +:punct+ - a single punctuation character (comma, semicolon, full stop, etc.)
+      # * +:graph+ - any single graphical (visible) character
+      # * +:other+ - anything which is not covered by the above types (non-visible
+      #   characters in particular)
+      def each
+        raise "Invalid argument - sentence is nil" if @sentence.nil?
+        @sentence.scan(SPLIT_RE) do |word,number,punct,graph,other|
+          if !word.nil?
+            yield word, :word
+          elsif !number.nil?
+            yield number, :number
+          elsif !punct.nil?
+            yield punct, :punct
+          elsif !graph.nil?
+            yield graph, :graph
+          else
+            yield other, :other
+          end
+        end
+      end
+    end
+  end
+end

data/srx-polish.gemspec CHANGED

@@ -3,13 +3,13 @@ $:.push File.expand_path("../lib", __FILE__)
 Gem::Specification.new do |s|
   s.name        = "srx-polish"
-  s.version     = "0.1.1"
+  s.version     = "0.2.1"
   s.platform    = Gem::Platform::RUBY
   s.authors     = ["Aleksander Pohl"]
   s.email       = ["apohllo@o2.pl"]
   s.homepage    = "http://github.com/apohllo/srx2ruby"
-  s.summary     = %q{Polish sentence segmentation rules.}
-  s.description = %q{Polish sentence segmentation rules based on Marcin Miłkowski's SRX rules.}
+  s.summary     = %q{Polish sentence and word segmentation rules.}
+  s.description = %q{Polish sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Miłkowski's SRX rules.}
   s.rubyforge_project = "srx-polish"
   s.has_rdoc = false

metadata CHANGED

@@ -2,7 +2,7 @@
 name: srx-polish
 version: !ruby/object:Gem::Version
   prerelease:
-  version: 0.1.1
+  version: 0.2.1
 platform: ruby
 authors:
 - Aleksander Pohl
@@ -10,8 +10,7 @@ autorequire:
 bindir: bin
 cert_chain: []
-date: 2011-06-02 00:00:00 +02:00
-default_executable:
+date: 2011-10-14 00:00:00 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: term-ansicolor
@@ -24,7 +23,7 @@ dependencies:
         version: 1.0.5
   type: :runtime
   version_requirements: *id001
-description: "Polish sentence segmentation rules based on Marcin Mi\xC5\x82kowski's SRX rules."
+description: "Polish sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Mi\xC5\x82kowski's SRX rules."
 email:
 - apohllo@o2.pl
 executables: []
@@ -35,10 +34,15 @@ extra_rdoc_files: []
 files:
 - .gitignore
-- README.txt
-- lib/srx/polish/sentence.rb
+- README.rdoc
+- changelog.txt
+- features/sentence_splitter.feature
+- features/steps/sentence_splitter.rb
+- features/steps/word_splitter.rb
+- features/word_splitter.feature
+- lib/srx/polish/sentence_splitter.rb
+- lib/srx/polish/word_splitter.rb
 - srx-polish.gemspec
-has_rdoc: true
 homepage: http://github.com/apohllo/srx2ruby
 licenses: []
@@ -62,9 +66,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 requirements: []
 rubyforge_project: srx-polish
-rubygems_version: 1.5.2
+rubygems_version: 1.8.5
 signing_key:
 specification_version: 3
-summary: Polish sentence segmentation rules.
-test_files: []
+summary: Polish sentence and word segmentation rules.
+test_files:
+- features/sentence_splitter.feature
+- features/steps/sentence_splitter.rb
+- features/steps/word_splitter.rb
+- features/word_splitter.feature