srx-polish 0.1.1 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- data/{README.txt → README.rdoc} +30 -5
- data/changelog.txt +10 -0
- data/features/sentence_splitter.feature +34 -0
- data/features/steps/sentence_splitter.rb +17 -0
- data/features/steps/word_splitter.rb +26 -0
- data/features/word_splitter.feature +75 -0
- data/lib/srx/polish/{sentence.rb → sentence_splitter.rb} +3 -1
- data/lib/srx/polish/word_splitter.rb +56 -0
- data/srx-polish.gemspec +3 -3
- metadata +18 -11
data/{README.txt → README.rdoc}
RENAMED
@@ -4,8 +4,8 @@
|
|
4
4
|
|
5
5
|
= DESCRIPTION
|
6
6
|
|
7
|
-
'srx-polish' is a Ruby library
|
8
|
-
based on
|
7
|
+
'srx-polish' is a Ruby library containing Polish sentence and word segmentation rules.
|
8
|
+
The sentence segmentation rules are based on rules defined by Marcin Miłkowski:
|
9
9
|
http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
|
10
10
|
|
11
11
|
= FEATURES/PROBLEMS
|
@@ -24,20 +24,45 @@ Standard rubygems installation:
|
|
24
24
|
The library defines the SRX::Polish::SentenceSplitter class, which allows you to iterate
|
25
25
|
over the matched sentences:
|
26
26
|
|
27
|
-
require 'srx/polish/sentence'
|
27
|
+
require 'srx/polish/sentence_splitter'
|
28
28
|
|
29
29
|
text =<<-END
|
30
30
|
Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie. Na s. 10 książki
|
31
31
|
sprawa jest szczegółowo opisana.
|
32
32
|
END
|
33
33
|
|
34
|
-
|
35
|
-
|
34
|
+
splitter = SRX::Polish::SentenceSplitter.new(text)
|
35
|
+
splitter.each do |sentence|
|
36
36
|
puts sentence.gsub(/\n|\r/,"")
|
37
37
|
end
|
38
38
|
# Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie.
|
39
39
|
# Na s. 10 książki sprawa jest szczegółowo opisana.
|
40
40
|
|
41
|
+
require 'srx/polish/word_splitter'
|
42
|
+
|
43
|
+
sentence = "Ala ma kota za 5zł i 10$."
|
44
|
+
splitter = SRX::Polish::WordSplitter.new(sentence)
|
45
|
+
splitter.each do |word,type|
|
46
|
+
puts "'#{word}' #{type}"
|
47
|
+
end
|
48
|
+
# 'Ala' word
|
49
|
+
# ' ' other
|
50
|
+
# 'ma' word
|
51
|
+
# ' ' other
|
52
|
+
# 'kota' word
|
53
|
+
# ' ' other
|
54
|
+
# 'za' word
|
55
|
+
# ' ' other
|
56
|
+
# '5' number
|
57
|
+
# 'zł' word
|
58
|
+
# ' ' other
|
59
|
+
# 'i' word
|
60
|
+
# ' ' other
|
61
|
+
# '10' number
|
62
|
+
# '$' graph
|
63
|
+
# '.' punct
|
64
|
+
|
65
|
+
|
41
66
|
== LICENSE
|
42
67
|
|
43
68
|
Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski
|
data/changelog.txt
ADDED
@@ -0,0 +1,10 @@
|
|
1
|
+
0.2.1
|
2
|
+
- fix extension of README (was md, should be rdoc)
|
3
|
+
0.2.0
|
4
|
+
- API change - Sentence changed into SentenceSplitter
|
5
|
+
- Word segmentation rules added
|
6
|
+
- Basic features added
|
7
|
+
0.1.1
|
8
|
+
- Dependency on Term::Ansicolor added
|
9
|
+
0.1.0
|
10
|
+
- First release of sentence segmentation rules
|
@@ -0,0 +1,34 @@
|
|
1
|
+
Feature: sentence splitter
|
2
|
+
Scenario: splitting text
|
3
|
+
Given a text
|
4
|
+
"""
|
5
|
+
W październiku 1890 r. dwóch żołnierzy dokonało
|
6
|
+
rewolty, np. Andrzej i P. Woźny.
|
7
|
+
"""
|
8
|
+
When the text is split
|
9
|
+
Then the following sentences should be detected
|
10
|
+
| sentence |
|
11
|
+
#----------------------------------------------------------------------------------#
|
12
|
+
| W październiku 1890 r. dwóch żołnierzy dokonało rewolty, np. Andrzej i P. Woźny. |
|
13
|
+
|
14
|
+
Given a text
|
15
|
+
"""
|
16
|
+
Wiosna, lato, itd. A wczoraj mieliśmy jesień.
|
17
|
+
"""
|
18
|
+
When the text is split
|
19
|
+
Then the following sentences should be detected
|
20
|
+
| sentence |
|
21
|
+
#----------------------------#
|
22
|
+
| Wiosna, lato, itd. |
|
23
|
+
| A wczoraj mieliśmy jesień. |
|
24
|
+
|
25
|
+
Given a text
|
26
|
+
"""
|
27
|
+
Andrzej G. (20 l.). A z nim sześciu mężczyzn.
|
28
|
+
"""
|
29
|
+
When the text is split
|
30
|
+
Then the following sentences should be detected
|
31
|
+
| sentence |
|
32
|
+
#----------------------------#
|
33
|
+
| Andrzej G. (20 l.). |
|
34
|
+
| A z nim sześciu mężczyzn. |
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$:.unshift "lib"
|
3
|
+
require 'srx/polish/sentence_splitter'
|
4
|
+
|
5
|
+
Given /^a text$/ do |text|
|
6
|
+
@text = text
|
7
|
+
end
|
8
|
+
|
9
|
+
When /^the text is split$/ do
|
10
|
+
@splitter = SRX::Polish::SentenceSplitter.new(@text)
|
11
|
+
end
|
12
|
+
|
13
|
+
Then /^the following sentences should be detected$/ do |table|
|
14
|
+
table.hashes.zip(@splitter.to_a).each do |expected,returned|
|
15
|
+
returned.gsub(/\s*\n/," ").strip.should == expected[:sentence]
|
16
|
+
end
|
17
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
$:.unshift "lib"
|
3
|
+
require 'srx/polish/word_splitter'
|
4
|
+
|
5
|
+
Given /^a sentence '([^']+)'$/ do |sentence|
|
6
|
+
@sentence = sentence.force_encoding('utf-8')
|
7
|
+
end
|
8
|
+
|
9
|
+
When /^the sentence is split$/ do
|
10
|
+
@splitter = SRX::Polish::WordSplitter.new(@sentence)
|
11
|
+
end
|
12
|
+
|
13
|
+
Then /^the following segments should be detected$/ do |table|
|
14
|
+
table.hashes.zip(@splitter.to_a).each do |expected,returned|
|
15
|
+
returned[0].should == expected[:segment].gsub(/'/,"")
|
16
|
+
returned[1].should == expected[:type].to_sym
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
Then /^the following non-blank segments should be detected$/ do |table|
|
21
|
+
segments = @splitter.select{|s| s[1] != :other}
|
22
|
+
table.hashes.zip(segments).each do |expected,returned|
|
23
|
+
returned[0].should == expected[:segment].gsub(/'/,"")
|
24
|
+
returned[1].should == expected[:type].to_sym
|
25
|
+
end
|
26
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
Feature: word splitter
|
2
|
+
Scenario: splitting a sentence
|
3
|
+
Given a sentence 'My home is my castle.'
|
4
|
+
When the sentence is split
|
5
|
+
Then the following segments should be detected
|
6
|
+
| segment | type |
|
7
|
+
#-----------------#
|
8
|
+
| My | word |
|
9
|
+
| ' ' | other |
|
10
|
+
| home | word |
|
11
|
+
| ' ' | other |
|
12
|
+
| is | word |
|
13
|
+
| ' ' | other |
|
14
|
+
| my | word |
|
15
|
+
| ' ' | other |
|
16
|
+
| castle | word |
|
17
|
+
| . | punct |
|
18
|
+
|
19
|
+
Given a sentence 'W dniu 14/12/2011 nastąpił napad na bank!'
|
20
|
+
When the sentence is split
|
21
|
+
Then the following non-blank segments should be detected
|
22
|
+
| segment | type |
|
23
|
+
#---------------------#
|
24
|
+
| W | word |
|
25
|
+
| dniu | word |
|
26
|
+
| 14/12/2011 | number |
|
27
|
+
| nastąpił | word |
|
28
|
+
| napad | word |
|
29
|
+
| na | word |
|
30
|
+
| bank | word |
|
31
|
+
| ! | punct |
|
32
|
+
|
33
|
+
Given a sentence 'Użytkownik o loginie ania8 zalogował się 7 listopada.'
|
34
|
+
When the sentence is split
|
35
|
+
Then the following non-blank segments should be detected
|
36
|
+
| segment | type |
|
37
|
+
#---------------------#
|
38
|
+
| Użytkownik | word |
|
39
|
+
| o | word |
|
40
|
+
| loginie | word |
|
41
|
+
| ania8 | word |
|
42
|
+
| zalogował | word |
|
43
|
+
| się | word |
|
44
|
+
| 7 | number |
|
45
|
+
| listopada | word |
|
46
|
+
| . | punct |
|
47
|
+
|
48
|
+
Given a sentence 'Czy wrona, kruk i gawron to polskie ptaki?'
|
49
|
+
When the sentence is split
|
50
|
+
Then the following non-blank segments should be detected
|
51
|
+
| segment | type |
|
52
|
+
#---------------------#
|
53
|
+
| Czy | word |
|
54
|
+
| wrona | word |
|
55
|
+
| , | punct |
|
56
|
+
| kruk | word |
|
57
|
+
| i | word |
|
58
|
+
| gawron | word |
|
59
|
+
| to | word |
|
60
|
+
| polskie | word |
|
61
|
+
| ptaki | word |
|
62
|
+
| ? | punct |
|
63
|
+
|
64
|
+
Given a sentence 'Czy 10 000 000$ to duża kwota?'
|
65
|
+
When the sentence is split
|
66
|
+
Then the following non-blank segments should be detected
|
67
|
+
| segment | type |
|
68
|
+
#---------------------#
|
69
|
+
| Czy | word |
|
70
|
+
| 10 000 000 | number |
|
71
|
+
| $ | graph |
|
72
|
+
| to | word |
|
73
|
+
| duża | word |
|
74
|
+
| kwota | word |
|
75
|
+
| ? | punct |
|
@@ -55,8 +55,9 @@ module SRX
|
|
55
55
|
REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
|
56
56
|
FIRST_CHAR = /\A./m
|
57
57
|
|
58
|
+
class SentenceSplitter
|
59
|
+
include Enumerable
|
58
60
|
|
59
|
-
class Sentence
|
60
61
|
attr_accessor :input
|
61
62
|
attr_writer :debug
|
62
63
|
|
@@ -69,6 +70,7 @@ module SRX
|
|
69
70
|
end
|
70
71
|
|
71
72
|
def each
|
73
|
+
raise "Invalid argument - text is nil" if @input.nil?
|
72
74
|
buffer_length = 10
|
73
75
|
sentence = ""
|
74
76
|
before_buffer = ""
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
module SRX
|
4
|
+
module Polish
|
5
|
+
class WordSplitter
|
6
|
+
include Enumerable
|
7
|
+
|
8
|
+
attr_accessor :sentence
|
9
|
+
SPLIT_RULES = {
|
10
|
+
:word => "\\p{Alpha}\\p{Word}*",
|
11
|
+
:number => "\\p{Digit}+(?:[:., _/-]\\p{Digit}+)*",
|
12
|
+
:punct => "\\p{Punct}",
|
13
|
+
:graph => "\\p{Graph}",
|
14
|
+
:other => "[^\\p{Word}\\p{Graph}]+"
|
15
|
+
}
|
16
|
+
|
17
|
+
SPLIT_RE = /#{SPLIT_RULES.values.map{|v| "(#{v})"}.join("|")}/m
|
18
|
+
|
19
|
+
# The initializer accepts a +sentence+, which might be a
|
20
|
+
# Sentence instance or a String instance.
|
21
|
+
#
|
22
|
+
# The splitter might be initialized without the sentence,
|
23
|
+
# but then it must be set using the accessor before the first call to the
|
24
|
+
# +each+ method.
|
25
|
+
def initialize(sentence=nil)
|
26
|
+
@sentence = sentence
|
27
|
+
end
|
28
|
+
|
29
|
+
# This method iterates over the words in the sentence.
|
30
|
+
# It yields the string representation of the word and
|
31
|
+
# its type, which is one of:
|
32
|
+
# * +:word+ - a regular word (including words containing numbers, like A4)
|
33
|
+
# * +:number+ - a number (including number with spaces, dashes, slashes, etc.)
|
34
|
+
# * +:punct+ - single punctuation character (comma, semicolon, full stop, etc.)
|
35
|
+
# * +:graph+ - any single graphical (visible) character
|
36
|
+
# * +:other+ - anything which is not covered by the above types (non-visible
|
37
|
+
# characters in particular)
|
38
|
+
def each
|
39
|
+
raise "Invalid argument - sentence is nil" if @sentence.nil?
|
40
|
+
@sentence.scan(SPLIT_RE) do |word,number,punct,graph,other|
|
41
|
+
if !word.nil?
|
42
|
+
yield word, :word
|
43
|
+
elsif !number.nil?
|
44
|
+
yield number, :number
|
45
|
+
elsif !punct.nil?
|
46
|
+
yield punct, :punct
|
47
|
+
elsif !graph.nil?
|
48
|
+
yield graph, :graph
|
49
|
+
else
|
50
|
+
yield other, :other
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|
data/srx-polish.gemspec
CHANGED
@@ -3,13 +3,13 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "srx-polish"
|
6
|
-
s.version = "0.1.1"
|
6
|
+
s.version = "0.2.1"
|
7
7
|
s.platform = Gem::Platform::RUBY
|
8
8
|
s.authors = ["Aleksander Pohl"]
|
9
9
|
s.email = ["apohllo@o2.pl"]
|
10
10
|
s.homepage = "http://github.com/apohllo/srx2ruby"
|
11
|
-
s.summary = %q{Polish sentence segmentation rules.}
|
12
|
-
s.description = %q{Polish sentence segmentation rules based on Marcin Miłkowski's SRX rules.}
|
11
|
+
s.summary = %q{Polish sentence and word segmentation rules.}
|
12
|
+
s.description = %q{Polish sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Miłkowski's SRX rules.}
|
13
13
|
|
14
14
|
s.rubyforge_project = "srx-polish"
|
15
15
|
s.has_rdoc = false
|
metadata
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
name: srx-polish
|
3
3
|
version: !ruby/object:Gem::Version
|
4
4
|
prerelease:
|
5
|
-
version: 0.1.1
|
5
|
+
version: 0.2.1
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
8
8
|
- Aleksander Pohl
|
@@ -10,8 +10,7 @@ autorequire:
|
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
12
|
|
13
|
-
date: 2011-
|
14
|
-
default_executable:
|
13
|
+
date: 2011-10-14 00:00:00 Z
|
15
14
|
dependencies:
|
16
15
|
- !ruby/object:Gem::Dependency
|
17
16
|
name: term-ansicolor
|
@@ -24,7 +23,7 @@ dependencies:
|
|
24
23
|
version: 1.0.5
|
25
24
|
type: :runtime
|
26
25
|
version_requirements: *id001
|
27
|
-
description: "Polish sentence segmentation rules based on Marcin Mi\xC5\x82kowski's SRX rules."
|
26
|
+
description: "Polish sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Mi\xC5\x82kowski's SRX rules."
|
28
27
|
email:
|
29
28
|
- apohllo@o2.pl
|
30
29
|
executables: []
|
@@ -35,10 +34,15 @@ extra_rdoc_files: []
|
|
35
34
|
|
36
35
|
files:
|
37
36
|
- .gitignore
|
38
|
-
- README.txt
|
39
|
-
-
|
37
|
+
- README.rdoc
|
38
|
+
- changelog.txt
|
39
|
+
- features/sentence_splitter.feature
|
40
|
+
- features/steps/sentence_splitter.rb
|
41
|
+
- features/steps/word_splitter.rb
|
42
|
+
- features/word_splitter.feature
|
43
|
+
- lib/srx/polish/sentence_splitter.rb
|
44
|
+
- lib/srx/polish/word_splitter.rb
|
40
45
|
- srx-polish.gemspec
|
41
|
-
has_rdoc: true
|
42
46
|
homepage: http://github.com/apohllo/srx2ruby
|
43
47
|
licenses: []
|
44
48
|
|
@@ -62,9 +66,12 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
62
66
|
requirements: []
|
63
67
|
|
64
68
|
rubyforge_project: srx-polish
|
65
|
-
rubygems_version: 1.5
|
69
|
+
rubygems_version: 1.8.5
|
66
70
|
signing_key:
|
67
71
|
specification_version: 3
|
68
|
-
summary: Polish sentence segmentation rules.
|
69
|
-
test_files:
|
70
|
-
|
72
|
+
summary: Polish sentence and word segmentation rules.
|
73
|
+
test_files:
|
74
|
+
- features/sentence_splitter.feature
|
75
|
+
- features/steps/sentence_splitter.rb
|
76
|
+
- features/steps/word_splitter.rb
|
77
|
+
- features/word_splitter.feature
|