srx-polish 0.1.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/{README.txt → README.rdoc} +30 -5
- data/changelog.txt +10 -0
- data/features/sentence_splitter.feature +34 -0
- data/features/steps/sentence_splitter.rb +17 -0
- data/features/steps/word_splitter.rb +26 -0
- data/features/word_splitter.feature +75 -0
- data/lib/srx/polish/{sentence.rb → sentence_splitter.rb} +3 -1
- data/lib/srx/polish/word_splitter.rb +56 -0
- data/srx-polish.gemspec +3 -3
- metadata +18 -11
data/{README.txt → README.rdoc}
RENAMED
@@ -4,8 +4,8 @@
|
|
4
4
|
|
5
5
|
= DESCRIPTION
|
6
6
|
|
7
|
-
'srx-polish' is a Ruby library
|
8
|
-
based on
|
7
|
+
'srx-polish' is a Ruby library containing Polish sentence and word segmentation rules.
|
8
|
+
The sentence segmentation rules are based on rules defined by Marcin Miłkowski:
|
9
9
|
http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
|
10
10
|
|
11
11
|
= FEATURES/PROBLEMS
|
@@ -24,20 +24,45 @@ Standard rubygems installation:
|
|
24
24
|
The library defines the SRX::Polish::SentenceSplitter class, which allows you to iterate
|
25
25
|
over the matched sentences:
|
26
26
|
|
27
|
-
require 'srx/polish/sentence'
|
27
|
+
require 'srx/polish/sentence_splitter'
|
28
28
|
|
29
29
|
text =<<-END
|
30
30
|
Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie. Na s. 10 książki
|
31
31
|
sprawa jest szczegółowo opisana.
|
32
32
|
END
|
33
33
|
|
34
|
-
|
35
|
-
|
34
|
+
splitter = SRX::Polish::SentenceSplitter.new(text)
|
35
|
+
splitter.each do |sentence|
|
36
36
|
puts sentence.gsub(/\n|\r/,"")
|
37
37
|
end
|
38
38
|
# Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie.
|
39
39
|
# Na s. 10 książki sprawa jest szczegółowo opisana.
|
40
40
|
|
41
|
+
require 'srx/polish/word_splitter'
|
42
|
+
|
43
|
+
sentence = "Ala ma kota za 5zł i 10$."
|
44
|
+
splitter = SRX::Polish::WordSplitter.new(sentence)
|
45
|
+
splitter.each do |word,type|
|
46
|
+
puts "'#{word}' #{type}"
|
47
|
+
end
|
48
|
+
# 'Ala' word
|
49
|
+
# ' ' other
|
50
|
+
# 'ma' word
|
51
|
+
# ' ' other
|
52
|
+
# 'kota' word
|
53
|
+
# ' ' other
|
54
|
+
# 'za' word
|
55
|
+
# ' ' other
|
56
|
+
# '5' number
|
57
|
+
# 'zł' word
|
58
|
+
# ' ' other
|
59
|
+
# 'i' word
|
60
|
+
# ' ' other
|
61
|
+
# '10' number
|
62
|
+
# '$' graph
|
63
|
+
# '.' punct
|
64
|
+
|
65
|
+
|
41
66
|
== LICENSE
|
42
67
|
|
43
68
|
Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski
|
data/changelog.txt
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
0.2.1
|
2
|
+
- fix extension of README (was md, should be rdoc)
|
3
|
+
0.2.0
|
4
|
+
- API change - Sentence changed into SentenceSplitter
|
5
|
+
- Word segmentation rules added
|
6
|
+
- Basic features added
|
7
|
+
0.1.1
|
8
|
+
- Dependency on Term::Ansicolor added
|
9
|
+
0.1.0
|
10
|
+
- First release of sentence segmentation rules
|
@@ -0,0 +1,34 @@
|
|
1
|
+
Feature: sentence splitter
|
2
|
+
Scenario: splitting text
|
3
|
+
Given a text
|
4
|
+
"""
|
5
|
+
W październiku 1890 r. dwóch żołnierzy dokonało
|
6
|
+
rewolty, np. Andrzej i P. Woźny.
|
7
|
+
"""
|
8
|
+
When the text is split
|
9
|
+
Then the following sentences should be detected
|
10
|
+
| sentence |
|
11
|
+
#----------------------------------------------------------------------------------#
|
12
|
+
| W październiku 1890 r. dwóch żołnierzy dokonało rewolty, np. Andrzej i P. Woźny. |
|
13
|
+
|
14
|
+
Given a text
|
15
|
+
"""
|
16
|
+
Wiosna, lato, itd. A wczoraj mieliśmy jesień.
|
17
|
+
"""
|
18
|
+
When the text is split
|
19
|
+
Then the following sentences should be detected
|
20
|
+
| sentence |
|
21
|
+
#----------------------------#
|
22
|
+
| Wiosna, lato, itd. |
|
23
|
+
| A wczoraj mieliśmy jesień. |
|
24
|
+
|
25
|
+
Given a text
|
26
|
+
"""
|
27
|
+
Andrzej G. (20 l.). A z nim sześciu mężczyzn.
|
28
|
+
"""
|
29
|
+
When the text is split
|
30
|
+
Then the following sentences should be detected
|
31
|
+
| sentence |
|
32
|
+
#----------------------------#
|
33
|
+
| Andrzej G. (20 l.). |
|
34
|
+
| A z nim sześciu mężczyzn. |
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$:.unshift "lib"
|
3
|
+
require 'srx/polish/sentence_splitter'
|
4
|
+
|
5
|
+
Given /^a text$/ do |text|
|
6
|
+
@text = text
|
7
|
+
end
|
8
|
+
|
9
|
+
When /^the text is split$/ do
|
10
|
+
@splitter = SRX::Polish::SentenceSplitter.new(@text)
|
11
|
+
end
|
12
|
+
|
13
|
+
Then /^the following sentences should be detected$/ do |table|
|
14
|
+
table.hashes.zip(@splitter.to_a).each do |expected,returned|
|
15
|
+
returned.gsub(/\s*\n/," ").strip.should == expected[:sentence]
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$:.unshift "lib"
|
3
|
+
require 'srx/polish/word_splitter'
|
4
|
+
|
5
|
+
Given /^a sentence '([^']+)'$/ do |sentence|
|
6
|
+
@sentence = sentence.force_encoding('utf-8')
|
7
|
+
end
|
8
|
+
|
9
|
+
When /^the sentence is split$/ do
|
10
|
+
@splitter = SRX::Polish::WordSplitter.new(@sentence)
|
11
|
+
end
|
12
|
+
|
13
|
+
Then /^the following segments should be detected$/ do |table|
|
14
|
+
table.hashes.zip(@splitter.to_a).each do |expected,returned|
|
15
|
+
returned[0].should == expected[:segment].gsub(/'/,"")
|
16
|
+
returned[1].should == expected[:type].to_sym
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
Then /^the following non-blank segments should be detected$/ do |table|
|
21
|
+
segments = @splitter.select{|s| s[1] != :other}
|
22
|
+
table.hashes.zip(segments).each do |expected,returned|
|
23
|
+
returned[0].should == expected[:segment].gsub(/'/,"")
|
24
|
+
returned[1].should == expected[:type].to_sym
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
Feature: word splitter
|
2
|
+
Scenario: splitting a sentence
|
3
|
+
Given a sentence 'My home is my castle.'
|
4
|
+
When the sentence is split
|
5
|
+
Then the following segments should be detected
|
6
|
+
| segment | type |
|
7
|
+
#-----------------#
|
8
|
+
| My | word |
|
9
|
+
| ' ' | other |
|
10
|
+
| home | word |
|
11
|
+
| ' ' | other |
|
12
|
+
| is | word |
|
13
|
+
| ' ' | other |
|
14
|
+
| my | word |
|
15
|
+
| ' ' | other |
|
16
|
+
| castle | word |
|
17
|
+
| . | punct |
|
18
|
+
|
19
|
+
Given a sentence 'W dniu 14/12/2011 nastąpił napad na bank!'
|
20
|
+
When the sentence is split
|
21
|
+
Then the following non-blank segments should be detected
|
22
|
+
| segment | type |
|
23
|
+
#---------------------#
|
24
|
+
| W | word |
|
25
|
+
| dniu | word |
|
26
|
+
| 14/12/2011 | number |
|
27
|
+
| nastąpił | word |
|
28
|
+
| napad | word |
|
29
|
+
| na | word |
|
30
|
+
| bank | word |
|
31
|
+
| ! | punct |
|
32
|
+
|
33
|
+
Given a sentence 'Użytkownik o loginie ania8 zalogował się 7 listopada.'
|
34
|
+
When the sentence is split
|
35
|
+
Then the following non-blank segments should be detected
|
36
|
+
| segment | type |
|
37
|
+
#---------------------#
|
38
|
+
| Użytkownik | word |
|
39
|
+
| o | word |
|
40
|
+
| loginie | word |
|
41
|
+
| ania8 | word |
|
42
|
+
| zalogował | word |
|
43
|
+
| się | word |
|
44
|
+
| 7 | number |
|
45
|
+
| listopada | word |
|
46
|
+
| . | punct |
|
47
|
+
|
48
|
+
Given a sentence 'Czy wrona, kruk i gawron to polskie ptaki?'
|
49
|
+
When the sentence is split
|
50
|
+
Then the following non-blank segments should be detected
|
51
|
+
| segment | type |
|
52
|
+
#---------------------#
|
53
|
+
| Czy | word |
|
54
|
+
| wrona | word |
|
55
|
+
| , | punct |
|
56
|
+
| kruk | word |
|
57
|
+
| i | word |
|
58
|
+
| gawron | word |
|
59
|
+
| to | word |
|
60
|
+
| polskie | word |
|
61
|
+
| ptaki | word |
|
62
|
+
| ? | punct |
|
63
|
+
|
64
|
+
Given a sentence 'Czy 10 000 000$ to duża kwota?'
|
65
|
+
When the sentence is split
|
66
|
+
Then the following non-blank segments should be detected
|
67
|
+
| segment | type |
|
68
|
+
#---------------------#
|
69
|
+
| Czy | word |
|
70
|
+
| 10 000 000 | number |
|
71
|
+
| $ | graph |
|
72
|
+
| to | word |
|
73
|
+
| duża | word |
|
74
|
+
| kwota | word |
|
75
|
+
| ? | punct |
|
@@ -55,8 +55,9 @@ module SRX
|
|
55
55
|
REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
|
56
56
|
FIRST_CHAR = /\A./m
|
57
57
|
|
58
|
+
class SentenceSplitter
|
59
|
+
include Enumerable
|
58
60
|
|
59
|
-
class Sentence
|
60
61
|
attr_accessor :input
|
61
62
|
attr_writer :debug
|
62
63
|
|
@@ -69,6 +70,7 @@ module SRX
|
|
69
70
|
end
|
70
71
|
|
71
72
|
def each
|
73
|
+
raise "Invalid argument - text is nil" if @input.nil?
|
72
74
|
buffer_length = 10
|
73
75
|
sentence = ""
|
74
76
|
before_buffer = ""
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SRX
|
4
|
+
module Polish
|
5
|
+
class WordSplitter
|
6
|
+
include Enumerable
|
7
|
+
|
8
|
+
attr_accessor :sentence
|
9
|
+
SPLIT_RULES = {
|
10
|
+
:word => "\\p{Alpha}\\p{Word}*",
|
11
|
+
:number => "\\p{Digit}+(?:[:., _/-]\\p{Digit}+)*",
|
12
|
+
:punct => "\\p{Punct}",
|
13
|
+
:graph => "\\p{Graph}",
|
14
|
+
:other => "[^\\p{Word}\\p{Graph}]+"
|
15
|
+
}
|
16
|
+
|
17
|
+
SPLIT_RE = /#{SPLIT_RULES.values.map{|v| "(#{v})"}.join("|")}/m
|
18
|
+
|
19
|
+
# The initializer accepts a +sentence+, which might be a
|
20
|
+
# Sentence instance or a String instance.
|
21
|
+
#
|
22
|
+
# The splitter might be initialized without the sentence,
|
23
|
+
# but should be set using the accessor before first call to
|
24
|
+
# +each+ method.
|
25
|
+
def initialize(sentence=nil)
|
26
|
+
@sentence = sentence
|
27
|
+
end
|
28
|
+
|
29
|
+
# This method iterates over the words in the sentence.
|
30
|
+
# It yields the string representation of the word and
|
31
|
+
# its type, which is one of:
|
32
|
+
# * +:word+ - a regular word (including words containing numbers, like A4)
|
33
|
+
# * +:number+ - a number (including number with spaces, dashes, slashes, etc.)
|
34
|
+
# * +:punct+ - single punctuation character (comma, semicolon, full stop, etc.)
|
35
|
+
# * +:graph+ - any single graphical (visible) character
|
36
|
+
# * +:other+ - anything which is not covered by the above types (non-visible
|
37
|
+
# characters in particular)
|
38
|
+
def each
|
39
|
+
raise "Invalid argument - sentence is nil" if @sentence.nil?
|
40
|
+
@sentence.scan(SPLIT_RE) do |word,number,punct,graph,other|
|
41
|
+
if !word.nil?
|
42
|
+
yield word, :word
|
43
|
+
elsif !number.nil?
|
44
|
+
yield number, :number
|
45
|
+
elsif !punct.nil?
|
46
|
+
yield punct, :punct
|
47
|
+
elsif !graph.nil?
|
48
|
+
yield graph, :graph
|
49
|
+
else
|
50
|
+
yield other, :other
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
data/srx-polish.gemspec
CHANGED
@@ -3,13 +3,13 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "srx-polish"
|
6
|
-
s.version = "0.1.1"
|
6
|
+
s.version = "0.2.1"
|
7
7
|
s.platform = Gem::Platform::RUBY
|
8
8
|
s.authors = ["Aleksander Pohl"]
|
9
9
|
s.email = ["apohllo@o2.pl"]
|
10
10
|
s.homepage = "http://github.com/apohllo/srx2ruby"
|
11
|
-
s.summary = %q{Polish sentence segmentation rules.}
|
12
|
-
s.description = %q{Polish sentence segmentation rules based on Marcin Miłkowski's SRX rules.}
|
11
|
+
s.summary = %q{Polish sentence and word segmentation rules.}
|
12
|
+
s.description = %q{Polish sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Miłkowski's SRX rules.}
|
13
13
|
|
14
14
|
s.rubyforge_project = "srx-polish"
|
15
15
|
s.has_rdoc = false
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: srx-polish
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.1.1
|
5
|
+
version: 0.2.1
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Aleksander Pohl
|
@@ -10,8 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-
|
14
|
-
default_executable:
|
13
|
+
date: 2011-10-14 00:00:00 Z
|
15
14
|
dependencies:
|
16
15
|
- !ruby/object:Gem::Dependency
|
17
16
|
name: term-ansicolor
|
@@ -24,7 +23,7 @@ dependencies:
|
|
24
23
|
version: 1.0.5
|
25
24
|
type: :runtime
|
26
25
|
version_requirements: *id001
|
27
|
-
description: "Polish sentence segmentation rules based on Marcin Mi\xC5\x82kowski's SRX rules."
|
26
|
+
description: "Polish sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Mi\xC5\x82kowski's SRX rules."
|
28
27
|
email:
|
29
28
|
- apohllo@o2.pl
|
30
29
|
executables: []
|
@@ -35,10 +34,15 @@ extra_rdoc_files: []
|
|
35
34
|
|
36
35
|
files:
|
37
36
|
- .gitignore
|
38
|
-
- README.txt
|
39
|
-
- lib/srx/polish/sentence.rb
|
37
|
+
- README.rdoc
|
38
|
+
- changelog.txt
|
39
|
+
- features/sentence_splitter.feature
|
40
|
+
- features/steps/sentence_splitter.rb
|
41
|
+
- features/steps/word_splitter.rb
|
42
|
+
- features/word_splitter.feature
|
43
|
+
- lib/srx/polish/sentence_splitter.rb
|
44
|
+
- lib/srx/polish/word_splitter.rb
|
40
45
|
- srx-polish.gemspec
|
41
|
-
has_rdoc: true
|
42
46
|
homepage: http://github.com/apohllo/srx2ruby
|
43
47
|
licenses: []
|
44
48
|
|
@@ -62,9 +66,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
62
66
|
requirements: []
|
63
67
|
|
64
68
|
rubyforge_project: srx-polish
|
65
|
-
rubygems_version: 1.5
|
69
|
+
rubygems_version: 1.8.5
|
66
70
|
signing_key:
|
67
71
|
specification_version: 3
|
68
|
-
summary: Polish sentence segmentation rules.
|
69
|
-
test_files:
|
70
|
-
|
72
|
+
summary: Polish sentence and word segmentation rules.
|
73
|
+
test_files:
|
74
|
+
- features/sentence_splitter.feature
|
75
|
+
- features/steps/sentence_splitter.rb
|
76
|
+
- features/steps/word_splitter.rb
|
77
|
+
- features/word_splitter.feature
|