srx-polish 0.1.1 → 0.2.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,8 +4,8 @@
4
4
 
5
5
  = DESCRIPTION
6
6
 
7
- 'srx-polish' is a Ruby library containint Polish sentence segmentation rules
8
- based on SRX rules defined by Marcin Miłkowski:
7
+ 'srx-polish' is a Ruby library containing Polish sentence and word segmentation rules.
8
+ The sentence segmentation rules are based on rules defined by Marcin Miłkowski:
9
9
  http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
10
10
 
11
11
  = FEATURES/PROBLEMS
@@ -24,20 +24,45 @@ Standard rubygems installation:
24
24
  The library defines the SRX::Polish::Sentence class allowing to iterate
25
25
  over the matched sentences:
26
26
 
27
- require 'srx/polish/sentence'
27
+ require 'srx/polish/sentence_splitter'
28
28
 
29
29
  text =<<-END
30
30
  Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie. Na s. 10 książki
31
31
  sprawa jest szczegółowo opisana.
32
32
  END
33
33
 
34
- sentences = SRX::Polish::Sentence.new(text)
35
- sentences.each do |sentence|
34
+ splitter = SRX::Polish::SentenceSplitter.new(text)
35
+ splitter.each do |sentence|
36
36
  puts sentence.gsub(/\n|\r/,"")
37
37
  end
38
38
  # Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie.
39
39
  # Na s. 10 książki sprawa jest szczegółowo opisana.
40
40
 
41
+ require 'srx/polish/word_splitter'
42
+
43
+ sentence = "Ala ma kota za 5zł i 10$."
44
+ splitter = SRX::Polish::WordSplitter.new(sentence)
45
+ splitter.each do |word,type|
46
+ puts "'#{word}' #{type}"
47
+ end
48
+ # 'Ala' word
49
+ # ' ' other
50
+ # 'ma' word
51
+ # ' ' other
52
+ # 'kota' word
53
+ # ' ' other
54
+ # 'za' word
55
+ # ' ' other
56
+ # '5' number
57
+ # 'zł' word
58
+ # ' ' other
59
+ # 'i' word
60
+ # ' ' other
61
+ # '10' number
62
+ # '$' graph
63
+ # '.' punct
64
+
65
+
41
66
  == LICENSE
42
67
 
43
68
  Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski
@@ -0,0 +1,10 @@
1
+ 0.2.1
2
+ - fix extension of README (was md, should be rdoc)
3
+ 0.2.0
4
+ - API change - Sentence changed into SentenceSplitter
5
+ - Word segmentation rules added
6
+ - Basic features added
7
+ 0.1.1
8
+ - Dependency on Term::Ansicolor added
9
+ 0.1.0
10
+ - First release of sentence segmentation rules
@@ -0,0 +1,34 @@
1
+ Feature: sentence splitter
2
+ Scenario: splitting text
3
+ Given a text
4
+ """
5
+ W październiku 1890 r. dwóch żołnierzy dokonało
6
+ rewolty, np. Andrzej i P. Woźny.
7
+ """
8
+ When the text is split
9
+ Then the following sentences should be detected
10
+ | sentence |
11
+ #----------------------------------------------------------------------------------#
12
+ | W październiku 1890 r. dwóch żołnierzy dokonało rewolty, np. Andrzej i P. Woźny. |
13
+
14
+ Given a text
15
+ """
16
+ Wiosna, lato, itd. A wczoraj mieliśmy jesień.
17
+ """
18
+ When the text is split
19
+ Then the following sentences should be detected
20
+ | sentence |
21
+ #----------------------------#
22
+ | Wiosna, lato, itd. |
23
+ | A wczoraj mieliśmy jesień. |
24
+
25
+ Given a text
26
+ """
27
+ Andrzej G. (20 l.). A z nim sześciu mężczyzn.
28
+ """
29
+ When the text is split
30
+ Then the following sentences should be detected
31
+ | sentence |
32
+ #----------------------------#
33
+ | Andrzej G. (20 l.). |
34
+ | A z nim sześciu mężczyzn. |
@@ -0,0 +1,17 @@
1
+ # encoding: utf-8
2
+ $:.unshift "lib"
3
+ require 'srx/polish/sentence_splitter'
4
+
5
+ Given /^a text$/ do |text|
6
+ @text = text
7
+ end
8
+
9
+ When /^the text is split$/ do
10
+ @splitter = SRX::Polish::SentenceSplitter.new(@text)
11
+ end
12
+
13
+ Then /^the following sentences should be detected$/ do |table|
14
+ table.hashes.zip(@splitter.to_a).each do |expected,returned|
15
+ returned.gsub(/\s*\n/," ").strip.should == expected[:sentence]
16
+ end
17
+ end
@@ -0,0 +1,26 @@
1
+ # encoding: utf-8
2
+ $:.unshift "lib"
3
+ require 'srx/polish/word_splitter'
4
+
5
+ Given /^a sentence '([^']+)'$/ do |sentence|
6
+ @sentence = sentence.force_encoding('utf-8')
7
+ end
8
+
9
+ When /^the sentence is split$/ do
10
+ @splitter = SRX::Polish::WordSplitter.new(@sentence)
11
+ end
12
+
13
+ Then /^the following segments should be detected$/ do |table|
14
+ table.hashes.zip(@splitter.to_a).each do |expected,returned|
15
+ returned[0].should == expected[:segment].gsub(/'/,"")
16
+ returned[1].should == expected[:type].to_sym
17
+ end
18
+ end
19
+
20
+ Then /^the following non-blank segments should be detected$/ do |table|
21
+ segments = @splitter.select{|s| s[1] != :other}
22
+ table.hashes.zip(segments).each do |expected,returned|
23
+ returned[0].should == expected[:segment].gsub(/'/,"")
24
+ returned[1].should == expected[:type].to_sym
25
+ end
26
+ end
@@ -0,0 +1,75 @@
1
+ Feature: word splitter
2
+ Scenario: splitting a sentence
3
+ Given a sentence 'My home is my castle.'
4
+ When the sentence is split
5
+ Then the following segments should be detected
6
+ | segment | type |
7
+ #-----------------#
8
+ | My | word |
9
+ | ' ' | other |
10
+ | home | word |
11
+ | ' ' | other |
12
+ | is | word |
13
+ | ' ' | other |
14
+ | my | word |
15
+ | ' ' | other |
16
+ | castle | word |
17
+ | . | punct |
18
+
19
+ Given a sentence 'W dniu 14/12/2011 nastąpił napad na bank!'
20
+ When the sentence is split
21
+ Then the following non-blank segments should be detected
22
+ | segment | type |
23
+ #---------------------#
24
+ | W | word |
25
+ | dniu | word |
26
+ | 14/12/2011 | number |
27
+ | nastąpił | word |
28
+ | napad | word |
29
+ | na | word |
30
+ | bank | word |
31
+ | ! | punct |
32
+
33
+ Given a sentence 'Użytkownik o loginie ania8 zalogował się 7 listopada.'
34
+ When the sentence is split
35
+ Then the following non-blank segments should be detected
36
+ | segment | type |
37
+ #---------------------#
38
+ | Użytkownik | word |
39
+ | o | word |
40
+ | loginie | word |
41
+ | ania8 | word |
42
+ | zalogował | word |
43
+ | się | word |
44
+ | 7 | number |
45
+ | listopada | word |
46
+ | . | punct |
47
+
48
+ Given a sentence 'Czy wrona, kruk i gawron to polskie ptaki?'
49
+ When the sentence is split
50
+ Then the following non-blank segments should be detected
51
+ | segment | type |
52
+ #---------------------#
53
+ | Czy | word |
54
+ | wrona | word |
55
+ | , | punct |
56
+ | kruk | word |
57
+ | i | word |
58
+ | gawron | word |
59
+ | to | word |
60
+ | polskie | word |
61
+ | ptaki | word |
62
+ | ? | punct |
63
+
64
+ Given a sentence 'Czy 10 000 000$ to duża kwota?'
65
+ When the sentence is split
66
+ Then the following non-blank segments should be detected
67
+ | segment | type |
68
+ #---------------------#
69
+ | Czy | word |
70
+ | 10 000 000 | number |
71
+ | $ | graph |
72
+ | to | word |
73
+ | duża | word |
74
+ | kwota | word |
75
+ | ? | punct |
@@ -55,8 +55,9 @@ module SRX
55
55
  REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
56
56
  FIRST_CHAR = /\A./m
57
57
 
58
+ class SentenceSplitter
59
+ include Enumerable
58
60
 
59
- class Sentence
60
61
  attr_accessor :input
61
62
  attr_writer :debug
62
63
 
@@ -69,6 +70,7 @@ module SRX
69
70
  end
70
71
 
71
72
  def each
73
+ raise "Invalid argument - text is nil" if @input.nil?
72
74
  buffer_length = 10
73
75
  sentence = ""
74
76
  before_buffer = ""
@@ -0,0 +1,56 @@
1
+ # encoding: utf-8
2
+
3
+ module SRX
4
+ module Polish
5
+ class WordSplitter
6
+ include Enumerable
7
+
8
+ attr_accessor :sentence
9
+ SPLIT_RULES = {
10
+ :word => "\\p{Alpha}\\p{Word}*",
11
+ :number => "\\p{Digit}+(?:[:., _/-]\\p{Digit}+)*",
12
+ :punct => "\\p{Punct}",
13
+ :graph => "\\p{Graph}",
14
+ :other => "[^\\p{Word}\\p{Graph}]+"
15
+ }
16
+
17
+ SPLIT_RE = /#{SPLIT_RULES.values.map{|v| "(#{v})"}.join("|")}/m
18
+
19
+ # The initializer accepts a +sentence+, which might be a
20
+ # Sentence instance or a String instance.
21
+ #
22
+ # The splitter may be initialized without a sentence,
23
+ # but the sentence must then be set using the accessor before the first call to
24
+ # the +each+ method.
25
+ def initialize(sentence=nil)
26
+ @sentence = sentence
27
+ end
28
+
29
+ # This method iterates over the words in the sentence.
30
+ # It yields the string representation of the word and
31
+ # its type, which is one of:
32
+ # * +:word+ - a regular word (including words containing numbers, like A4)
33
+ # * +:number+ - a number (including number with spaces, dashes, slashes, etc.)
34
+ # * +:punct+ - single punctuation character (comma, semicolon, full stop, etc.)
35
+ # * +:graph+ - any single graphical (visible) character
36
+ # * +:other+ - anything which is not covered by the above types (non-visible
37
+ # characters in particular)
38
+ def each
39
+ raise "Invalid argument - sentence is nil" if @sentence.nil?
40
+ @sentence.scan(SPLIT_RE) do |word,number,punct,graph,other|
41
+ if !word.nil?
42
+ yield word, :word
43
+ elsif !number.nil?
44
+ yield number, :number
45
+ elsif !punct.nil?
46
+ yield punct, :punct
47
+ elsif !graph.nil?
48
+ yield graph, :graph
49
+ else
50
+ yield other, :other
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
56
+ end
@@ -3,13 +3,13 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "srx-polish"
6
- s.version = "0.1.1"
6
+ s.version = "0.2.1"
7
7
  s.platform = Gem::Platform::RUBY
8
8
  s.authors = ["Aleksander Pohl"]
9
9
  s.email = ["apohllo@o2.pl"]
10
10
  s.homepage = "http://github.com/apohllo/srx2ruby"
11
- s.summary = %q{Polish sentence segmentation rules.}
12
- s.description = %q{Polish sentence segmentation rules based on Marcin Miłkowski's SRX rules.}
11
+ s.summary = %q{Polish sentence and word segmentation rules.}
12
+ s.description = %q{Polish sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Miłkowski's SRX rules.}
13
13
 
14
14
  s.rubyforge_project = "srx-polish"
15
15
  s.has_rdoc = false
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: srx-polish
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.1.1
5
+ version: 0.2.1
6
6
  platform: ruby
7
7
  authors:
8
8
  - Aleksander Pohl
@@ -10,8 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-06-02 00:00:00 +02:00
14
- default_executable:
13
+ date: 2011-10-14 00:00:00 Z
15
14
  dependencies:
16
15
  - !ruby/object:Gem::Dependency
17
16
  name: term-ansicolor
@@ -24,7 +23,7 @@ dependencies:
24
23
  version: 1.0.5
25
24
  type: :runtime
26
25
  version_requirements: *id001
27
- description: "Polish sentence segmentation rules based on Marcin Mi\xC5\x82kowski's SRX rules."
26
+ description: "Polish sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Mi\xC5\x82kowski's SRX rules."
28
27
  email:
29
28
  - apohllo@o2.pl
30
29
  executables: []
@@ -35,10 +34,15 @@ extra_rdoc_files: []
35
34
 
36
35
  files:
37
36
  - .gitignore
38
- - README.txt
39
- - lib/srx/polish/sentence.rb
37
+ - README.rdoc
38
+ - changelog.txt
39
+ - features/sentence_splitter.feature
40
+ - features/steps/sentence_splitter.rb
41
+ - features/steps/word_splitter.rb
42
+ - features/word_splitter.feature
43
+ - lib/srx/polish/sentence_splitter.rb
44
+ - lib/srx/polish/word_splitter.rb
40
45
  - srx-polish.gemspec
41
- has_rdoc: true
42
46
  homepage: http://github.com/apohllo/srx2ruby
43
47
  licenses: []
44
48
 
@@ -62,9 +66,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
62
66
  requirements: []
63
67
 
64
68
  rubyforge_project: srx-polish
65
- rubygems_version: 1.5.2
69
+ rubygems_version: 1.8.5
66
70
  signing_key:
67
71
  specification_version: 3
68
- summary: Polish sentence segmentation rules.
69
- test_files: []
70
-
72
+ summary: Polish sentence and word segmentation rules.
73
+ test_files:
74
+ - features/sentence_splitter.feature
75
+ - features/steps/sentence_splitter.rb
76
+ - features/steps/word_splitter.rb
77
+ - features/word_splitter.feature