srx-polish 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/README.txt +57 -0
- data/lib/srx/polish/sentence.rb +115 -0
- data/srx-polish.gemspec +23 -0
- metadata +70 -0
data/.gitignore
ADDED
data/README.txt
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
== srx-polish
|
2
|
+
|
3
|
+
* https://github.com/apohllo/srx-polish
|
4
|
+
|
5
|
+
= DESCRIPTION
|
6
|
+
|
7
|
+
'srx-polish' is a Ruby library containing Polish sentence segmentation rules
|
8
|
+
based on SRX rules defined by Marcin Miłkowski:
|
9
|
+
http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
|
10
|
+
|
11
|
+
= FEATURES/PROBLEMS
|
12
|
+
|
13
|
+
* this library is generated by 'srx2ruby' which has some limitations and might
|
14
|
+
not be 100% compliant with the SRX standard.
|
15
|
+
|
16
|
+
= INSTALL
|
17
|
+
|
18
|
+
Standard rubygems installation:
|
19
|
+
|
20
|
+
$ gem install srx-polish
|
21
|
+
|
22
|
+
= BASIC USAGE
|
23
|
+
|
24
|
+
The library defines the SRX::Polish::Sentence class, which lets you iterate
|
25
|
+
over the matched sentences:
|
26
|
+
|
27
|
+
require 'srx/polish/sentence'
|
28
|
+
|
29
|
+
text =<<-END
|
30
|
+
Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie. Na s. 10 książki
|
31
|
+
sprawa jest szczegółowo opisana.
|
32
|
+
END
|
33
|
+
|
34
|
+
sentences = SRX::Polish::Sentence.new(text)
|
35
|
+
sentences.each do |sentence|
|
36
|
+
puts sentence.gsub(/\n|\r/,"")
|
37
|
+
end
|
38
|
+
# Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie.
|
39
|
+
# Na s. 10 książki sprawa jest szczegółowo opisana.
|
40
|
+
|
41
|
+
== LICENSE
|
42
|
+
|
43
|
+
Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski
|
44
|
+
|
45
|
+
This program is free software: you can redistribute it and/or modify
|
46
|
+
it under the terms of the GNU General Public License as published by
|
47
|
+
the Free Software Foundation, either version 3 of the License, or
|
48
|
+
(at your option) any later version.
|
49
|
+
|
50
|
+
This program is distributed in the hope that it will be useful,
|
51
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
52
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
53
|
+
GNU General Public License for more details.
|
54
|
+
|
55
|
+
== FEEDBACK
|
56
|
+
|
57
|
+
* mailto:apohllo@o2.pl
|
@@ -0,0 +1,115 @@
|
|
1
|
+
#encoding: utf-8
|
2
|
+
require 'stringio'
|
3
|
+
require 'term/ansicolor'
|
4
|
+
module SRX
|
5
|
+
module Polish
|
6
|
+
# Segmentation rules, tried in the order listed; the first rule whose two
# patterns both match decides the outcome for a given position in the text.
# Each entry is a three-element array [before, after, break]:
#   before - regexp source that must match at the very end of the text
#            preceding the candidate position
#   after  - regexp source that must match at the very start of the text
#            following the candidate position; nil places no constraint
#   break  - true when the position is a sentence boundary, false when the
#            rule is an exception that suppresses a break
RULES =
|
7
|
+
[["(?:\\b[Aa]l\\.\\s)|(?:\\b[Aa]rt\\.\\s)|(?:\\b[Cc]d\\.\\s)|(?:\\b[Hh]r\\.\\s)|(?:\\b[Nn][bp]\\.\\s)|(?:\\b[Pp][ts]\\.\\s)|(?:\\b[Rr]y[cs]\\.\\s)|(?:\\b[Tt]zw\\.\\s)|(?:\\b[\\d,\\.]*ty[sś]\\.\\p{Pe}?\\s)|(?:\\b[bu]p\\.\\s)|(?:\\badw\\.\\s)|(?:\\bafr\\.\\s)|(?:\\bakad\\.\\s)|(?:\\bal\\.\\s)|(?:\\bam\\.\\s)|(?:\\bamer\\.\\s)|(?:\\bang\\.\\s)|(?:\\barch\\.\\s)|(?:\\bartyst\\.\\s)|(?:\\bastr\\.\\s)|(?:\\baustr\\.\\s)|(?:\\bbałt\\.\\s)|(?:\\bbdb\\.\\s)|(?:\\bbm\\.\\s)|(?:\\bbr\\.\\p{Pe}?\\s)|(?:\\bbry[gt]\\.\\s)|(?:\\bbł\\.\\s)|(?:\\bc\\.k\\.\\s)|(?:\\bc\\.o\\.\\s)|(?:\\bcdn\\.\\s)|(?:\\bcentr\\.\\s)|(?:\\bces\\.\\s)|(?:\\bchem\\.\\s)|(?:\\bchir\\.\\s)|(?:\\bchiń\\.\\s)|(?:\\bcyg\\.\\s)|(?:\\bcyt\\.\\s)|(?:\\bcyw\\.\\s)|(?:\\bczes\\.\\s)|(?:\\bczw\\.\\s)|(?:\\bczyt\\.\\s)|(?:\\bdaw\\.\\s)|(?:\\bdcn\\.\\s)|(?:\\bdekl\\.\\s)|(?:\\bdemokr\\.\\s)|(?:\\bdet\\.\\s)|(?:\\bdiec\\.\\s)|(?:\\bdn\\.\\s)|(?:\\bdo[tlp]\\.\\s)|(?:\\bdost\\.\\s)|(?:\\bdosł\\.\\s)|(?:\\bds\\.\\s)|(?:\\bdst\\.\\s)|(?:\\bduszp\\.\\s)|(?:\\bdypl\\.\\s)|(?:\\bdł\\.\\s)|(?:\\begz\\.\\s)|(?:\\bekol\\.\\s)|(?:\\bekon\\.\\s)|(?:\\belektr\\.\\s)|(?:\\bem\\.\\s)|(?:\\bew\\.\\s)|(?:\\bfab\\.\\s)|(?:\\bfarm\\.\\s)|(?:\\bfot\\.\\s)|(?:\\bfr\\.\\s)|(?:\\bgastr\\.\\s)|(?:\\bgat\\.\\s)|(?:\\bgeogr\\.\\s)|(?:\\bgeol\\.\\s)|(?:\\bgimn\\.\\s)|(?:\\bgm\\.\\s)|(?:\\bgodz\\.\\s)|(?:\\bgosp\\.\\s)|(?:\\bgr\\.\\s)|(?:\\bgram\\.\\s)|(?:\\bgórn\\.\\s)|(?:\\bgłęb\\.\\s)|(?:\\bh\\.c\\.\\s)|(?:\\bhist\\.\\s)|(?:\\bhiszp\\.\\s)|(?:\\bhot\\.\\s)|(?:\\bid\\.\\s)|(?:\\bim\\.\\s)|(?:\\bin\\.\\s)|(?:\\biron\\.\\s)|(?:\\bjn\\.\\s)|(?:\\bjw\\.\\s)|(?:\\bk\\.\\s)|(?:\\bk\\.k\\.\\s)|(?:\\bk\\.p\\.a\\.\\s)|(?:\\bk\\.p\\.c\\.\\s)|(?:\\bk\\.r\\.\\s)|(?:\\bk\\.r\\.o\\.\\s)|(?:\\bkard\\.\\s)|(?:\\bkat\\.\\s)|(?:\\bkatol\\.\\s)|(?:\\bkk\\.\\s)|(?:\\bko?l\\.\\s)|(?:\\bkpc\\.\\s)|(?:\\bkpt\\.\\s)|(?:\\bkr\\.\\s)|(?:\\bkrak\\.\\s)|(?:\\bkryt\\.\\s)|(?:\\bkult\\.\\s)|(?:\\blaic\\.\\s)|(?:\\bm\\.in\\.\\s)|(?:\\bn\\.\\s)|(?:\\bniem\\.\\s)|(?:\\bp[wnl]\
\.\\s)|(?:\\bpo[lw]\\.\\s)|(?:\\btj\\.\\s)|(?:\\btzn\\.\\s)|(?:\\bu[lb]\\.\\s?)|(?:\\bur\\.\\s)|(?:\\bw[wł]\\.\\s)|(?:\\bwoj\\.\\s)|(?:\\bwyst\\.\\s)|(?:\\bwył\\.\\s)|(?:\\bzm\\.\\s)|(?:\\bzob\\.\\s)|(?:\\bćw\\.\\s)|(?:\\bćwicz\\.\\s)|(?:\\błac\\.\\s)|(?:\\bżarg\\.\\s)|(?:\\bżyd\\.\\s)|(?:\\bżyw\\.\\s)",
|
8
|
+
nil,
|
9
|
+
false],
|
10
|
+
["(?:\\b[Oo]k\\.\\s)", "[\\p{Ll}\\d]", false],
|
11
|
+
["(?:\\b[Nn]r\\.\\s)", "\\d", false],
|
12
|
+
["(?:\\b(?i)mazeł\\stow\\.\\s)", nil, true],
|
13
|
+
["(?:['\"„][\\.!?…]['\"”]\\s)|(?:[\\p{Ps}][!?]+[\\p{Pe}] )|(?:\\b(?:[Nn]a|[Pp]o)dkom\\.\\s)|(?:\\b(?:[sS]p|st|[Ss]półdz|społ|spółgł|[Ss]to[łw])\\.\\s)|(?:\\b(?i)gen\\.\\s)|(?:\\b(?i)mgr\\.\\s)|(?:\\b(?i)por\\.\\s)|(?:\\b(?i)pro[fk]\\.\\s)|(?:\\bDz\\.\\s?U\\.\\s)|(?:\\b[,uoi]\\ss\\.\\s)|(?:\\b[Aa]dw\\.\\s)|(?:\\b[Dd]oc\\.\\s)|(?:\\b[Dd]y[wr]\\.\\s)|(?:\\b[Ii]nż\\.\\s)|(?:\\b[Ll]ek\\.\\s)|(?:\\b[Mm]ec\\.\\s)|(?:\\b[Nn]a\\sos\\.\\s)|(?:\\b[Nn]ajśw\\.\\s)|(?:\\b[Oo][Oo]\\.\\s)|(?:\\b[Pp]rzyp\\.\\s)|(?:\\b[Pp]łk\\.\\s)|(?:\\b[Rr]ed\\.\\)?\\s)|(?:\\b[Rr]eż\\.\\s)|(?:\\b[Ss]zer\\.\\s)|(?:\\b[Tt]ow\\.\\s)|(?:\\b[dD][hr]\\.\\s)|(?:\\b[kK]s\\.\\s)|(?:\\b[śŚ][pwW]\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bhab\\.\\s)|(?:\\bmed\\.\\s)|(?:\\bo\\.\\s)|(?:\\bpo[zk]\\.\\s)|(?:\\btel\\.\\s)|(?:\\bzach\\.\\s)|(?:\\bzagr\\.\\s)|(?:\\bzak\\.\\s)|(?:\\bzakł\\.\\s)|(?:\\bzal\\.\\s)|(?:\\bzam\\.\\s)|(?:\\bzast\\.\\s)|(?:\\bzaw\\.\\s)|(?:\\bzazw\\.\\s)|(?:\\bzał\\.\\s)|(?:\\bzdr\\.\\s)|(?:\\bzew\\.\\s)|(?:\\bzewn\\.\\s)|(?:\\bzn\\.\\s)",
|
14
|
+
nil,
|
15
|
+
false],
|
16
|
+
["(?:\\b[rls]\\.\\s)", "[1-9]+", false],
|
17
|
+
["(?:\\b[Tt]ab\\.\\s)|(?:\\bmin\\.\\s)|(?:\\bpkt\\.\\s)|(?:\\bstr\\.\\s)",
|
18
|
+
"[\\p{Ll}\\d]",
|
19
|
+
false],
|
20
|
+
["(?:\\bW\\s\\d{4}\\sr\\.\\s)", "[\\p{Lu}–—„\"-]", false],
|
21
|
+
["(?:[\\.!?…]+['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)",
|
22
|
+
"[\\p{Ps}–—-]\\s?\\p{Ll}",
|
23
|
+
false],
|
24
|
+
["(?:\\bit[dp]\\.\\s)", "[„”\"]?\\p{Ll}", false],
|
25
|
+
["(?:\\s[A-Z]\\.\\s)", "\\(\\d", false],
|
26
|
+
["(?:\\bpar\\.\\s)|(?:\\bust\\.\\s)", "\\d", false],
|
27
|
+
["(?:[!?]+\\p{Pe} )|(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\p{Pe}\\s)|(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )|(?:[\\[\\(]*…[\\]\\)]* )|(?:\\.\\p{L}+\\.\\s)|(?:\\b(?:[CDSR]z|Ch)\\.\\s)|(?:\\b[Gg][rR]\\.\\s)|(?:\\b[Pp]on\\.\\s)|(?:\\b[Ss]ob\\.\\s)|(?:\\b[Zz][Łł]\\.\\s)|(?:\\b[cd]?m\\.\\s)|(?:\\b[d]?kg\\.\\s)|(?:\\b[dm]l\\.\\s)|(?:\\b[ptw]g\\.\\s)|(?:\\b[rwn]\\.\\s)|(?:\\bba!\\s)|(?:\\bcdn\\.\\s)|(?:\\bcos\\.\\s)|(?:\\bcosec\\.\\s)|(?:\\bdag\\.\\s)|(?:\\betc\\.\\s)|(?:\\bit[dp]\\.\\s)|(?:\\bjw\\.\\s)|(?:\\bml[nd]\\.\\s)|(?:\\bn[tn]\\.\\s)|(?:\\bn\\.e\\.\\s)|(?:\\bnpl\\.\\s)|(?:\\brkm\\.\\s)|(?:\\bsec\\.\\s)|(?:\\bsin\\.\\s)|(?:\\bw\\.\\s)",
|
28
|
+
"\\p{Ll}",
|
29
|
+
false],
|
30
|
+
["(?:[\\p{Ll}&&[^aeouiyęąó]][\\p{Ll}&&[^aeouiyęąó]]+\\.\\s)",
|
31
|
+
"\\p{Ll}+",
|
32
|
+
false],
|
33
|
+
["(?:[\\s\\(\\[][\\p{L}&&[^rwn]]\\.\\s+)",
|
34
|
+
"\\p{Ll}\\p{Ll}|\\p{Lu}[\\p{Punct}\\p{Lu}]",
|
35
|
+
false],
|
36
|
+
["(?:\\blit\\.\\s)", "\\p{Ll}\\p{Pe}?\\p{P}?\\s", false],
|
37
|
+
["(?:\\s(?<!\\.)[„\"\\p{Ps}]?[A-Z]\\.\\s)|(?:^[„\"]?[A-ZŚĆŻŹ]\\.\\s)",
|
38
|
+
"\\p{Ll}|\\p{Lu}\\p{Ll}\\p{Ll}+",
|
39
|
+
false],
|
40
|
+
["(?:\\bproc\\.\\)?\\s)", "\\p{Ll}|\\p{Lu}\\p{Lu}+", false],
|
41
|
+
["(?:\\b\\d+\\.\\s)", "\\p{Ll}|\\p{Lu}{2,}", false],
|
42
|
+
["(?:\\.\\p{Lu}\\p{Ll}\\.\\s?)|(?:\\b\\p{Lu}\\p{Ll}\\.\\s?)",
|
43
|
+
"\\p{Lu}[^\\p{Lu}]",
|
44
|
+
false],
|
45
|
+
["(?:\\b[pP]\\.\\s)", "\\p{Lu}\\p{Ll}+", false],
|
46
|
+
["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
|
47
|
+
["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
|
48
|
+
["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
|
49
|
+
["(?:[\\.!?…]['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)|(?:\\r?\\n\\s*\\r?\\n[\\t]*)",
|
50
|
+
nil,
|
51
|
+
true],
|
52
|
+
["(?:[\\.!?…]['»\"”\\p{Pe}]*)", "\\p{Lu}[^\\p{Lu}\\.]", true],
|
53
|
+
["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]]
|
54
|
+
# Cheap pre-filter: one alternation of every rule's "before" pattern, anchored
# at the end of the string. Each alternative sits in its own capture group.
BEFORE_RE = Regexp.new(
  "(?:#{RULES.map { |rule| "(#{rule.first})" }.join("|")})\\Z",
  Regexp::MULTILINE
)

# One [before_regexp, after_regexp, break_flag] triple per rule. The "before"
# regexp is anchored at the end of its subject, the "after" regexp at the
# start. A nil "after" pattern interpolates to an empty group, which matches
# any text.
REGEXPS = RULES.map do |before_source, after_source, is_break|
  [
    Regexp.new("(#{before_source})\\Z", Regexp::MULTILINE),
    Regexp.new("\\A(#{after_source})", Regexp::MULTILINE),
    is_break
  ]
end

# Matches the first character of a string, newlines included.
FIRST_CHAR = /\A./m
|
57
|
+
|
58
|
+
|
59
|
+
# Splits text into sentences using the segmentation rules compiled into
# BEFORE_RE and REGEXPS.
#
# The text is scanned one character at a time. At every position the rules
# see up to ten characters of already-consumed text (the "before" buffer) and
# up to ten characters of upcoming text (the "after" buffer). The first rule
# whose two patterns match decides whether the position is a sentence
# boundary (break flag true) or an exception (break flag false).
#
# Usage:
#   SRX::Polish::Sentence.new(text).each { |sentence| puts sentence }
class Sentence
  include Enumerable

  # The character source. A String passed to the constructor is wrapped in a
  # StringIO; any other object is used as given and must respond to +pos=+,
  # +eof?+ and +readchar+.
  attr_accessor :input

  # When set to a true value, +each+ prints the buffers at every candidate
  # position and embeds the colourised matching rule in the yielded
  # sentences (requires Term::ANSIColor to be loaded).
  attr_writer :debug

  # text - a String to segment, an IO-like object, or nil when the input is
  #        assigned later through +input=+.
  def initialize(text=nil)
    @input = text.is_a?(String) ? StringIO.new(text,"r:utf-8") : text
    @debug = false
  end

  # Yields each sentence of the input in order, including its trailing
  # whitespace. The input is rewound first, so the method can be called
  # repeatedly.
  #
  # Returns an Enumerator when called without a block, otherwise self.
  # A nil input is treated as empty and yields nothing.
  def each
    return enum_for(:each) unless block_given?
    return self if @input.nil?

    buffer_length = 10
    sentence = "".dup
    before_buffer = "".dup
    after_buffer = "".dup
    @input.pos = 0

    # Prime the look-ahead, stopping early when the text is shorter than the
    # buffer instead of reading past the end of the input.
    buffer_length.times do
      char = read_char
      break if char.nil?
      after_buffer << char
    end

    # Keep scanning until the look-ahead is drained (not merely until the
    # input hits EOF) so that boundaries inside the last few characters of
    # the text are detected as well.
    until after_buffer.empty?
      # BEFORE_RE is a cheap pre-filter; the individual rules are consulted
      # only when at least one "before" pattern can match.
      if BEFORE_RE =~ before_buffer
        if @debug
          puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
        end
        REGEXPS.each do |before_re,after_re,value|
          next unless before_re =~ before_buffer && after_re =~ after_buffer
          if @debug
            color = value ? :red : :green
            sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
          end
          if value
            yield sentence
            sentence = "".dup
          end
          # The first matching rule wins, whether it breaks or not.
          break
        end
      end

      # Slide both windows one character forward.
      current = after_buffer.slice!(0)
      before_buffer.slice!(0) if before_buffer.size >= buffer_length
      before_buffer << current
      sentence << current
      upcoming = read_char
      after_buffer << upcoming if upcoming
    end

    yield sentence unless sentence.empty?
    self
  end

  private

  # Returns the next character of the input, or nil once it is exhausted.
  def read_char
    @input.eof? ? nil : @input.readchar
  end
end
|
114
|
+
end
|
115
|
+
end
|
data/srx-polish.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
|
4
|
+
# Gem specification for srx-polish: Polish sentence segmentation rules.
Gem::Specification.new do |s|
  s.name        = "srx-polish"
  s.version     = "0.1.1"
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Aleksander Pohl"]
  s.email       = ["apohllo@o2.pl"]
  # NOTE(review): the README links to https://github.com/apohllo/srx-polish,
  # while this points at the srx2ruby generator - confirm which is intended.
  s.homepage    = "http://github.com/apohllo/srx2ruby"
  s.summary     = %q{Polish sentence segmentation rules.}
  s.description = %q{Polish sentence segmentation rules based on Marcin Miłkowski's SRX rules.}
  # The README releases the code under the GNU GPL, version 3 or later.
  s.licenses    = ["GPL-3.0-or-later"]

  # These setters no longer exist in current RubyGems releases. Guarding them
  # keeps the old behaviour where they exist and lets the gemspec load where
  # they do not.
  s.rubyforge_project = "srx-polish" if s.respond_to?(:rubyforge_project=)
  s.has_rdoc = false if s.respond_to?(:has_rdoc=)

  # The package contents are whatever git tracks in the working copy.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Used by SRX::Polish::Sentence to colourise its debug output.
  s.add_dependency("term-ansicolor", ["~> 1.0.5"])
end
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: srx-polish
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Aleksander Pohl
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-06-02 00:00:00 +02:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: term-ansicolor
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ~>
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 1.0.5
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
description: "Polish sentence segmentation rules based on Marcin Mi\xC5\x82kowski's SRX rules."
|
28
|
+
email:
|
29
|
+
- apohllo@o2.pl
|
30
|
+
executables: []
|
31
|
+
|
32
|
+
extensions: []
|
33
|
+
|
34
|
+
extra_rdoc_files: []
|
35
|
+
|
36
|
+
files:
|
37
|
+
- .gitignore
|
38
|
+
- README.txt
|
39
|
+
- lib/srx/polish/sentence.rb
|
40
|
+
- srx-polish.gemspec
|
41
|
+
has_rdoc: true
|
42
|
+
homepage: http://github.com/apohllo/srx2ruby
|
43
|
+
licenses: []
|
44
|
+
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: "0"
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: "0"
|
62
|
+
requirements: []
|
63
|
+
|
64
|
+
rubyforge_project: srx-polish
|
65
|
+
rubygems_version: 1.5.2
|
66
|
+
signing_key:
|
67
|
+
specification_version: 3
|
68
|
+
summary: Polish sentence segmentation rules.
|
69
|
+
test_files: []
|
70
|
+
|