srx-polish 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,8 +4,8 @@
4
4
 
5
5
  = DESCRIPTION
6
6
 
7
- 'srx-polish' is a Ruby library containint Polish sentence segmentation rules
8
- based on SRX rules defined by Marcin Miłkowski:
7
+ 'srx-polish' is a Ruby library containing Polish sentence and word segmentation rules.
8
+ The sentence segmentation rules are based on rules defined by Marcin Miłkowski:
9
9
  http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
10
10
 
11
11
  = FEATURES/PROBLEMS
@@ -24,20 +24,45 @@ Standard rubygems installation:
24
24
  The library defines the SRX::Polish::SentenceSplitter class, which allows iterating
25
25
  over the matched sentences:
26
26
 
27
- require 'srx/polish/sentence'
27
+ require 'srx/polish/sentence_splitter'
28
28
 
29
29
  text =<<-END
30
30
  Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie. Na s. 10 książki
31
31
  sprawa jest szczegółowo opisana.
32
32
  END
33
33
 
34
- sentences = SRX::Polish::Sentence.new(text)
35
- sentences.each do |sentence|
34
+ splitter = SRX::Polish::SentenceSplitter.new(text)
35
+ splitter.each do |sentence|
36
36
  puts sentence.gsub(/\n|\r/,"")
37
37
  end
38
38
  # Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie.
39
39
  # Na s. 10 książki sprawa jest szczegółowo opisana.
40
40
 
41
+ require 'srx/polish/word_splitter'
42
+
43
+ sentence = "Ala ma kota za 5zł i 10$."
44
+ splitter = SRX::Polish::WordSplitter.new(sentence)
45
+ splitter.each do |word,type|
46
+ puts "'#{word}' #{type}"
47
+ end
48
+ # 'Ala' word
49
+ # ' ' other
50
+ # 'ma' word
51
+ # ' ' other
52
+ # 'kota' word
53
+ # ' ' other
54
+ # 'za' word
55
+ # ' ' other
56
+ # '5' number
57
+ # 'zł' word
58
+ # ' ' other
59
+ # 'i' word
60
+ # ' ' other
61
+ # '10' number
62
+ # '$' graph
63
+ # '.' punct
64
+
65
+
41
66
  == LICENSE
42
67
 
43
68
  Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski
@@ -0,0 +1,10 @@
1
+ 0.2.1
2
+ - fix extension of README (was md, should be rdoc)
3
+ 0.2.0
4
+ - API change - Sentence changed into SentenceSplitter
5
+ - Word segmentation rules added
6
+ - Basic features added
7
+ 0.1.1
8
+ - Dependency on Term::Ansicolor added
9
+ 0.1.0
10
+ - First release of sentence segmentation rules
@@ -0,0 +1,34 @@
1
+ Feature: sentence splitter
2
+ Scenario: splitting text
3
+ Given a text
4
+ """
5
+ W październiku 1890 r. dwóch żołnierzy dokonało
6
+ rewolty, np. Andrzej i P. Woźny.
7
+ """
8
+ When the text is split
9
+ Then the following sentences should be detected
10
+ | sentence |
11
+ #----------------------------------------------------------------------------------#
12
+ | W październiku 1890 r. dwóch żołnierzy dokonało rewolty, np. Andrzej i P. Woźny. |
13
+
14
+ Given a text
15
+ """
16
+ Wiosna, lato, itd. A wczoraj mieliśmy jesień.
17
+ """
18
+ When the text is split
19
+ Then the following sentences should be detected
20
+ | sentence |
21
+ #----------------------------#
22
+ | Wiosna, lato, itd. |
23
+ | A wczoraj mieliśmy jesień. |
24
+
25
+ Given a text
26
+ """
27
+ Andrzej G. (20 l.). A z nim sześciu mężczyzn.
28
+ """
29
+ When the text is split
30
+ Then the following sentences should be detected
31
+ | sentence |
32
+ #----------------------------#
33
+ | Andrzej G. (20 l.). |
34
+ | A z nim sześciu mężczyzn. |
@@ -0,0 +1,17 @@
1
+ # encoding: utf-8
2
+ $:.unshift "lib"
3
+ require 'srx/polish/sentence_splitter'
4
+
5
# Remembers the doc-string attached to the step as the text to be split.
Given(/^a text$/) { |text| @text = text }

# Builds the sentence splitter for the remembered text; the actual
# splitting happens when the splitter is enumerated.
When(/^the text is split$/) do
  @splitter = SRX::Polish::SentenceSplitter.new(@text)
end

# Compares the detected sentences, position by position, with the
# +sentence+ column of the table. Line breaks (and any whitespace directly
# preceding them) inside a detected sentence are collapsed into a single
# space and the result is stripped before the comparison.
Then(/^the following sentences should be detected$/) do |table|
  rows = table.hashes
  detected = @splitter.to_a
  rows.each_with_index do |row, position|
    flattened = detected[position].gsub(/\s*\n/," ").strip
    flattened.should == row[:sentence]
  end
end
@@ -0,0 +1,26 @@
1
+ # encoding: utf-8
2
+ $:.unshift "lib"
3
+ require 'srx/polish/word_splitter'
4
+
5
# Remembers the quoted sentence from the step name, tagged as UTF-8.
Given(/^a sentence '([^']+)'$/) do |sentence|
  @sentence = sentence.force_encoding('utf-8')
end

# Builds the word splitter for the remembered sentence; the actual
# splitting happens when the splitter is enumerated.
When(/^the sentence is split$/) do
  @splitter = SRX::Polish::WordSplitter.new(@sentence)
end

# Compares every detected [segment, type] pair, position by position, with
# the table rows. Single quotes in the +segment+ column are removed before
# the comparison (they are used in the table to protect blank segments).
Then(/^the following segments should be detected$/) do |table|
  rows = table.hashes
  detected = @splitter.to_a
  rows.each_with_index do |row, position|
    pair = detected[position]
    pair[0].should == row[:segment].gsub(/'/,"")
    pair[1].should == row[:type].to_sym
  end
end

# Same comparison as above, but segments of type +:other+ are dropped
# from the detected list first.
Then(/^the following non-blank segments should be detected$/) do |table|
  visible = @splitter.reject { |segment| segment[1] == :other }
  rows = table.hashes
  rows.each_with_index do |row, position|
    pair = visible[position]
    pair[0].should == row[:segment].gsub(/'/,"")
    pair[1].should == row[:type].to_sym
  end
end
@@ -0,0 +1,75 @@
1
+ Feature: word splitter
2
+ Scenario: splitting a sentence
3
+ Given a sentence 'My home is my castle.'
4
+ When the sentence is split
5
+ Then the following segments should be detected
6
+ | segment | type |
7
+ #-----------------#
8
+ | My | word |
9
+ | ' ' | other |
10
+ | home | word |
11
+ | ' ' | other |
12
+ | is | word |
13
+ | ' ' | other |
14
+ | my | word |
15
+ | ' ' | other |
16
+ | castle | word |
17
+ | . | punct |
18
+
19
+ Given a sentence 'W dniu 14/12/2011 nastąpił napad na bank!'
20
+ When the sentence is split
21
+ Then the following non-blank segments should be detected
22
+ | segment | type |
23
+ #---------------------#
24
+ | W | word |
25
+ | dniu | word |
26
+ | 14/12/2011 | number |
27
+ | nastąpił | word |
28
+ | napad | word |
29
+ | na | word |
30
+ | bank | word |
31
+ | ! | punct |
32
+
33
+ Given a sentence 'Użytkownik o loginie ania8 zalogował się 7 listopada.'
34
+ When the sentence is split
35
+ Then the following non-blank segments should be detected
36
+ | segment | type |
37
+ #---------------------#
38
+ | Użytkownik | word |
39
+ | o | word |
40
+ | loginie | word |
41
+ | ania8 | word |
42
+ | zalogował | word |
43
+ | się | word |
44
+ | 7 | number |
45
+ | listopada | word |
46
+ | . | punct |
47
+
48
+ Given a sentence 'Czy wrona, kruk i gawron to polskie ptaki?'
49
+ When the sentence is split
50
+ Then the following non-blank segments should be detected
51
+ | segment | type |
52
+ #---------------------#
53
+ | Czy | word |
54
+ | wrona | word |
55
+ | , | punct |
56
+ | kruk | word |
57
+ | i | word |
58
+ | gawron | word |
59
+ | to | word |
60
+ | polskie | word |
61
+ | ptaki | word |
62
+ | ? | punct |
63
+
64
+ Given a sentence 'Czy 10 000 000$ to duża kwota?'
65
+ When the sentence is split
66
+ Then the following non-blank segments should be detected
67
+ | segment | type |
68
+ #---------------------#
69
+ | Czy | word |
70
+ | 10 000 000 | number |
71
+ | $ | graph |
72
+ | to | word |
73
+ | duża | word |
74
+ | kwota | word |
75
+ | ? | punct |
@@ -55,8 +55,9 @@ module SRX
55
55
  REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
56
56
  FIRST_CHAR = /\A./m
57
57
 
58
+ class SentenceSplitter
59
+ include Enumerable
58
60
 
59
- class Sentence
60
61
  attr_accessor :input
61
62
  attr_writer :debug
62
63
 
@@ -69,6 +70,7 @@ module SRX
69
70
  end
70
71
 
71
72
  def each
73
+ raise "Invalid argument - text is nil" if @input.nil?
72
74
  buffer_length = 10
73
75
  sentence = ""
74
76
  before_buffer = ""
@@ -0,0 +1,56 @@
1
+ # encoding: utf-8
2
+
3
module SRX
  module Polish
    # Splits a sentence into consecutive, typed segments (words, numbers,
    # punctuation marks, other visible characters and the remaining
    # non-visible characters).
    #
    # The class is Enumerable: every element is a pair consisting of the
    # segment text and its type (a Symbol).
    class WordSplitter
      include Enumerable

      attr_accessor :sentence

      # Regular expression fragments describing each segment type.
      # The order is significant: the fragments are joined into one
      # alternation, so an earlier rule wins over a later one when both
      # could match at the same position.
      SPLIT_RULES = {
        :word => "\\p{Alpha}\\p{Word}*",
        :number => "\\p{Digit}+(?:[:., _/-]\\p{Digit}+)*",
        :punct => "\\p{Punct}",
        :graph => "\\p{Graph}",
        :other => "[^\\p{Word}\\p{Graph}]+"
      }.freeze

      # Segment types in the same order as the capture groups of SPLIT_RE.
      SEGMENT_TYPES = SPLIT_RULES.keys.freeze

      # One capture group per rule; exactly one group is non-nil per match.
      SPLIT_RE = /#{SPLIT_RULES.values.map{|v| "(#{v})"}.join("|")}/m

      # The initializer accepts a +sentence+, which might be a
      # Sentence instance or a String instance (it has to respond to +scan+).
      #
      # The splitter might be initialized without the sentence,
      # but it should be set using the accessor before first call to
      # +each+ method.
      def initialize(sentence=nil)
        @sentence = sentence
      end

      # This method iterates over the words in the sentence.
      # It yields the string representation of the word and
      # its type, which is one of:
      # * +:word+ - a regular word (including words containing numbers, like A4)
      # * +:number+ - a number (including number with spaces, dashes, slashes, etc.)
      # * +:punct+ - single punctuation character (comma, semicolon, full stop, etc.)
      # * +:graph+ - any single graphical (visible) character
      # * +:other+ - anything which is not covered by the above types (non-visible
      #   characters in particular)
      #
      # When called without a block, an Enumerator over the same pairs is
      # returned.
      #
      # Raises a RuntimeError if the sentence was not set.
      def each
        raise "Invalid argument - sentence is nil" if @sentence.nil?
        return enum_for(:each) unless block_given?

        @sentence.scan(SPLIT_RE) do |captures|
          # The position of the only matched group identifies the rule
          # (and hence the type) which produced the segment.
          position = captures.index { |capture| !capture.nil? }
          yield captures[position], SEGMENT_TYPES[position]
        end
      end
    end
  end
end
@@ -3,13 +3,13 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "srx-polish"
6
- s.version = "0.1.1"
6
+ s.version = "0.2.1"
7
7
  s.platform = Gem::Platform::RUBY
8
8
  s.authors = ["Aleksander Pohl"]
9
9
  s.email = ["apohllo@o2.pl"]
10
10
  s.homepage = "http://github.com/apohllo/srx2ruby"
11
- s.summary = %q{Polish sentence segmentation rules.}
12
- s.description = %q{Polish sentence segmentation rules based on Marcin Miłkowski's SRX rules.}
11
+ s.summary = %q{Polish sentence and word segmentation rules.}
12
+ s.description = %q{Polish sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Miłkowski's SRX rules.}
13
13
 
14
14
  s.rubyforge_project = "srx-polish"
15
15
  s.has_rdoc = false
metadata CHANGED
@@ -2,7 +2,7 @@
2
2
  name: srx-polish
3
3
  version: !ruby/object:Gem::Version
4
4
  prerelease:
5
- version: 0.1.1
5
+ version: 0.2.1
6
6
  platform: ruby
7
7
  authors:
8
8
  - Aleksander Pohl
@@ -10,8 +10,7 @@ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
12
 
13
- date: 2011-06-02 00:00:00 +02:00
14
- default_executable:
13
+ date: 2011-10-14 00:00:00 Z
15
14
  dependencies:
16
15
  - !ruby/object:Gem::Dependency
17
16
  name: term-ansicolor
@@ -24,7 +23,7 @@ dependencies:
24
23
  version: 1.0.5
25
24
  type: :runtime
26
25
  version_requirements: *id001
27
- description: "Polish sentence segmentation rules based on Marcin Mi\xC5\x82kowski's SRX rules."
26
+ description: "Polish sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Mi\xC5\x82kowski's SRX rules."
28
27
  email:
29
28
  - apohllo@o2.pl
30
29
  executables: []
@@ -35,10 +34,15 @@ extra_rdoc_files: []
35
34
 
36
35
  files:
37
36
  - .gitignore
38
- - README.txt
39
- - lib/srx/polish/sentence.rb
37
+ - README.rdoc
38
+ - changelog.txt
39
+ - features/sentence_splitter.feature
40
+ - features/steps/sentence_splitter.rb
41
+ - features/steps/word_splitter.rb
42
+ - features/word_splitter.feature
43
+ - lib/srx/polish/sentence_splitter.rb
44
+ - lib/srx/polish/word_splitter.rb
40
45
  - srx-polish.gemspec
41
- has_rdoc: true
42
46
  homepage: http://github.com/apohllo/srx2ruby
43
47
  licenses: []
44
48
 
@@ -62,9 +66,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
62
66
  requirements: []
63
67
 
64
68
  rubyforge_project: srx-polish
65
- rubygems_version: 1.5.2
69
+ rubygems_version: 1.8.5
66
70
  signing_key:
67
71
  specification_version: 3
68
- summary: Polish sentence segmentation rules.
69
- test_files: []
70
-
72
+ summary: Polish sentence and word segmentation rules.
73
+ test_files:
74
+ - features/sentence_splitter.feature
75
+ - features/steps/sentence_splitter.rb
76
+ - features/steps/word_splitter.rb
77
+ - features/word_splitter.feature