srx-polish 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/README.txt +57 -0
- data/lib/srx/polish/sentence.rb +115 -0
- data/srx-polish.gemspec +23 -0
- metadata +70 -0
data/.gitignore
ADDED
data/README.txt
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
== srx-polish
|
2
|
+
|
3
|
+
* https://github.com/apohllo/srx-polish
|
4
|
+
|
5
|
+
= DESCRIPTION
|
6
|
+
|
7
|
+
'srx-polish' is a Ruby library containing Polish sentence segmentation rules
|
8
|
+
based on SRX rules defined by Marcin Miłkowski:
|
9
|
+
http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
|
10
|
+
|
11
|
+
= FEATURES/PROBLEMS
|
12
|
+
|
13
|
+
* this library is generated by 'srx2ruby' which has some limitations and might
|
14
|
+
not be 100% SRX standard compliant.
|
15
|
+
|
16
|
+
= INSTALL
|
17
|
+
|
18
|
+
Standard rubygems installation:
|
19
|
+
|
20
|
+
$ gem install srx-polish
|
21
|
+
|
22
|
+
= BASIC USAGE
|
23
|
+
|
24
|
+
The library defines the SRX::Polish::Sentence class, which lets you iterate
|
25
|
+
over the matched sentences:
|
26
|
+
|
27
|
+
require 'srx/polish/sentence'
|
28
|
+
|
29
|
+
text =<<-END
|
30
|
+
Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie. Na s. 10 książki
|
31
|
+
sprawa jest szczegółowo opisana.
|
32
|
+
END
|
33
|
+
|
34
|
+
sentences = SRX::Polish::Sentence.new(text)
|
35
|
+
sentences.each do |sentence|
|
36
|
+
puts sentence.gsub(/\n|\r/,"")
|
37
|
+
end
|
38
|
+
# Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie.
|
39
|
+
# Na s. 10 książki sprawa jest szczegółowo opisana.
|
40
|
+
|
41
|
+
== LICENSE
|
42
|
+
|
43
|
+
Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski
|
44
|
+
|
45
|
+
This program is free software: you can redistribute it and/or modify
|
46
|
+
it under the terms of the GNU General Public License as published by
|
47
|
+
the Free Software Foundation, either version 3 of the License, or
|
48
|
+
(at your option) any later version.
|
49
|
+
|
50
|
+
This program is distributed in the hope that it will be useful,
|
51
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
52
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
53
|
+
GNU General Public License for more details.
|
54
|
+
|
55
|
+
== FEEDBACK
|
56
|
+
|
57
|
+
* mailto:apohllo@o2.pl
|
@@ -0,0 +1,115 @@
|
|
1
|
+
#encoding: utf-8
|
2
|
+
require 'stringio'
|
3
|
+
require 'term/ansicolor'
|
4
|
+
module SRX
|
5
|
+
module Polish
|
6
|
+
RULES =
|
7
|
+
[["(?:\\b[Aa]l\\.\\s)|(?:\\b[Aa]rt\\.\\s)|(?:\\b[Cc]d\\.\\s)|(?:\\b[Hh]r\\.\\s)|(?:\\b[Nn][bp]\\.\\s)|(?:\\b[Pp][ts]\\.\\s)|(?:\\b[Rr]y[cs]\\.\\s)|(?:\\b[Tt]zw\\.\\s)|(?:\\b[\\d,\\.]*ty[sś]\\.\\p{Pe}?\\s)|(?:\\b[bu]p\\.\\s)|(?:\\badw\\.\\s)|(?:\\bafr\\.\\s)|(?:\\bakad\\.\\s)|(?:\\bal\\.\\s)|(?:\\bam\\.\\s)|(?:\\bamer\\.\\s)|(?:\\bang\\.\\s)|(?:\\barch\\.\\s)|(?:\\bartyst\\.\\s)|(?:\\bastr\\.\\s)|(?:\\baustr\\.\\s)|(?:\\bbałt\\.\\s)|(?:\\bbdb\\.\\s)|(?:\\bbm\\.\\s)|(?:\\bbr\\.\\p{Pe}?\\s)|(?:\\bbry[gt]\\.\\s)|(?:\\bbł\\.\\s)|(?:\\bc\\.k\\.\\s)|(?:\\bc\\.o\\.\\s)|(?:\\bcdn\\.\\s)|(?:\\bcentr\\.\\s)|(?:\\bces\\.\\s)|(?:\\bchem\\.\\s)|(?:\\bchir\\.\\s)|(?:\\bchiń\\.\\s)|(?:\\bcyg\\.\\s)|(?:\\bcyt\\.\\s)|(?:\\bcyw\\.\\s)|(?:\\bczes\\.\\s)|(?:\\bczw\\.\\s)|(?:\\bczyt\\.\\s)|(?:\\bdaw\\.\\s)|(?:\\bdcn\\.\\s)|(?:\\bdekl\\.\\s)|(?:\\bdemokr\\.\\s)|(?:\\bdet\\.\\s)|(?:\\bdiec\\.\\s)|(?:\\bdn\\.\\s)|(?:\\bdo[tlp]\\.\\s)|(?:\\bdost\\.\\s)|(?:\\bdosł\\.\\s)|(?:\\bds\\.\\s)|(?:\\bdst\\.\\s)|(?:\\bduszp\\.\\s)|(?:\\bdypl\\.\\s)|(?:\\bdł\\.\\s)|(?:\\begz\\.\\s)|(?:\\bekol\\.\\s)|(?:\\bekon\\.\\s)|(?:\\belektr\\.\\s)|(?:\\bem\\.\\s)|(?:\\bew\\.\\s)|(?:\\bfab\\.\\s)|(?:\\bfarm\\.\\s)|(?:\\bfot\\.\\s)|(?:\\bfr\\.\\s)|(?:\\bgastr\\.\\s)|(?:\\bgat\\.\\s)|(?:\\bgeogr\\.\\s)|(?:\\bgeol\\.\\s)|(?:\\bgimn\\.\\s)|(?:\\bgm\\.\\s)|(?:\\bgodz\\.\\s)|(?:\\bgosp\\.\\s)|(?:\\bgr\\.\\s)|(?:\\bgram\\.\\s)|(?:\\bgórn\\.\\s)|(?:\\bgłęb\\.\\s)|(?:\\bh\\.c\\.\\s)|(?:\\bhist\\.\\s)|(?:\\bhiszp\\.\\s)|(?:\\bhot\\.\\s)|(?:\\bid\\.\\s)|(?:\\bim\\.\\s)|(?:\\bin\\.\\s)|(?:\\biron\\.\\s)|(?:\\bjn\\.\\s)|(?:\\bjw\\.\\s)|(?:\\bk\\.\\s)|(?:\\bk\\.k\\.\\s)|(?:\\bk\\.p\\.a\\.\\s)|(?:\\bk\\.p\\.c\\.\\s)|(?:\\bk\\.r\\.\\s)|(?:\\bk\\.r\\.o\\.\\s)|(?:\\bkard\\.\\s)|(?:\\bkat\\.\\s)|(?:\\bkatol\\.\\s)|(?:\\bkk\\.\\s)|(?:\\bko?l\\.\\s)|(?:\\bkpc\\.\\s)|(?:\\bkpt\\.\\s)|(?:\\bkr\\.\\s)|(?:\\bkrak\\.\\s)|(?:\\bkryt\\.\\s)|(?:\\bkult\\.\\s)|(?:\\blaic\\.\\s)|(?:\\bm\\.in\\.\\s)|(?:\\bn\\.\\s)|(?:\\bniem\\.\\s)|(?:\\bp[wnl]\
\.\\s)|(?:\\bpo[lw]\\.\\s)|(?:\\btj\\.\\s)|(?:\\btzn\\.\\s)|(?:\\bu[lb]\\.\\s?)|(?:\\bur\\.\\s)|(?:\\bw[wł]\\.\\s)|(?:\\bwoj\\.\\s)|(?:\\bwyst\\.\\s)|(?:\\bwył\\.\\s)|(?:\\bzm\\.\\s)|(?:\\bzob\\.\\s)|(?:\\bćw\\.\\s)|(?:\\bćwicz\\.\\s)|(?:\\błac\\.\\s)|(?:\\bżarg\\.\\s)|(?:\\bżyd\\.\\s)|(?:\\bżyw\\.\\s)",
|
8
|
+
nil,
|
9
|
+
false],
|
10
|
+
["(?:\\b[Oo]k\\.\\s)", "[\\p{Ll}\\d]", false],
|
11
|
+
["(?:\\b[Nn]r\\.\\s)", "\\d", false],
|
12
|
+
["(?:\\b(?i)mazeł\\stow\\.\\s)", nil, true],
|
13
|
+
["(?:['\"„][\\.!?…]['\"”]\\s)|(?:[\\p{Ps}][!?]+[\\p{Pe}] )|(?:\\b(?:[Nn]a|[Pp]o)dkom\\.\\s)|(?:\\b(?:[sS]p|st|[Ss]półdz|społ|spółgł|[Ss]to[łw])\\.\\s)|(?:\\b(?i)gen\\.\\s)|(?:\\b(?i)mgr\\.\\s)|(?:\\b(?i)por\\.\\s)|(?:\\b(?i)pro[fk]\\.\\s)|(?:\\bDz\\.\\s?U\\.\\s)|(?:\\b[,uoi]\\ss\\.\\s)|(?:\\b[Aa]dw\\.\\s)|(?:\\b[Dd]oc\\.\\s)|(?:\\b[Dd]y[wr]\\.\\s)|(?:\\b[Ii]nż\\.\\s)|(?:\\b[Ll]ek\\.\\s)|(?:\\b[Mm]ec\\.\\s)|(?:\\b[Nn]a\\sos\\.\\s)|(?:\\b[Nn]ajśw\\.\\s)|(?:\\b[Oo][Oo]\\.\\s)|(?:\\b[Pp]rzyp\\.\\s)|(?:\\b[Pp]łk\\.\\s)|(?:\\b[Rr]ed\\.\\)?\\s)|(?:\\b[Rr]eż\\.\\s)|(?:\\b[Ss]zer\\.\\s)|(?:\\b[Tt]ow\\.\\s)|(?:\\b[dD][hr]\\.\\s)|(?:\\b[kK]s\\.\\s)|(?:\\b[śŚ][pwW]\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bhab\\.\\s)|(?:\\bmed\\.\\s)|(?:\\bo\\.\\s)|(?:\\bpo[zk]\\.\\s)|(?:\\btel\\.\\s)|(?:\\bzach\\.\\s)|(?:\\bzagr\\.\\s)|(?:\\bzak\\.\\s)|(?:\\bzakł\\.\\s)|(?:\\bzal\\.\\s)|(?:\\bzam\\.\\s)|(?:\\bzast\\.\\s)|(?:\\bzaw\\.\\s)|(?:\\bzazw\\.\\s)|(?:\\bzał\\.\\s)|(?:\\bzdr\\.\\s)|(?:\\bzew\\.\\s)|(?:\\bzewn\\.\\s)|(?:\\bzn\\.\\s)",
|
14
|
+
nil,
|
15
|
+
false],
|
16
|
+
["(?:\\b[rls]\\.\\s)", "[1-9]+", false],
|
17
|
+
["(?:\\b[Tt]ab\\.\\s)|(?:\\bmin\\.\\s)|(?:\\bpkt\\.\\s)|(?:\\bstr\\.\\s)",
|
18
|
+
"[\\p{Ll}\\d]",
|
19
|
+
false],
|
20
|
+
["(?:\\bW\\s\\d{4}\\sr\\.\\s)", "[\\p{Lu}–—„\"-]", false],
|
21
|
+
["(?:[\\.!?…]+['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)",
|
22
|
+
"[\\p{Ps}–—-]\\s?\\p{Ll}",
|
23
|
+
false],
|
24
|
+
["(?:\\bit[dp]\\.\\s)", "[„”\"]?\\p{Ll}", false],
|
25
|
+
["(?:\\s[A-Z]\\.\\s)", "\\(\\d", false],
|
26
|
+
["(?:\\bpar\\.\\s)|(?:\\bust\\.\\s)", "\\d", false],
|
27
|
+
["(?:[!?]+\\p{Pe} )|(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\p{Pe}\\s)|(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )|(?:[\\[\\(]*…[\\]\\)]* )|(?:\\.\\p{L}+\\.\\s)|(?:\\b(?:[CDSR]z|Ch)\\.\\s)|(?:\\b[Gg][rR]\\.\\s)|(?:\\b[Pp]on\\.\\s)|(?:\\b[Ss]ob\\.\\s)|(?:\\b[Zz][Łł]\\.\\s)|(?:\\b[cd]?m\\.\\s)|(?:\\b[d]?kg\\.\\s)|(?:\\b[dm]l\\.\\s)|(?:\\b[ptw]g\\.\\s)|(?:\\b[rwn]\\.\\s)|(?:\\bba!\\s)|(?:\\bcdn\\.\\s)|(?:\\bcos\\.\\s)|(?:\\bcosec\\.\\s)|(?:\\bdag\\.\\s)|(?:\\betc\\.\\s)|(?:\\bit[dp]\\.\\s)|(?:\\bjw\\.\\s)|(?:\\bml[nd]\\.\\s)|(?:\\bn[tn]\\.\\s)|(?:\\bn\\.e\\.\\s)|(?:\\bnpl\\.\\s)|(?:\\brkm\\.\\s)|(?:\\bsec\\.\\s)|(?:\\bsin\\.\\s)|(?:\\bw\\.\\s)",
|
28
|
+
"\\p{Ll}",
|
29
|
+
false],
|
30
|
+
["(?:[\\p{Ll}&&[^aeouiyęąó]][\\p{Ll}&&[^aeouiyęąó]]+\\.\\s)",
|
31
|
+
"\\p{Ll}+",
|
32
|
+
false],
|
33
|
+
["(?:[\\s\\(\\[][\\p{L}&&[^rwn]]\\.\\s+)",
|
34
|
+
"\\p{Ll}\\p{Ll}|\\p{Lu}[\\p{Punct}\\p{Lu}]",
|
35
|
+
false],
|
36
|
+
["(?:\\blit\\.\\s)", "\\p{Ll}\\p{Pe}?\\p{P}?\\s", false],
|
37
|
+
["(?:\\s(?<!\\.)[„\"\\p{Ps}]?[A-Z]\\.\\s)|(?:^[„\"]?[A-ZŚĆŻŹ]\\.\\s)",
|
38
|
+
"\\p{Ll}|\\p{Lu}\\p{Ll}\\p{Ll}+",
|
39
|
+
false],
|
40
|
+
["(?:\\bproc\\.\\)?\\s)", "\\p{Ll}|\\p{Lu}\\p{Lu}+", false],
|
41
|
+
["(?:\\b\\d+\\.\\s)", "\\p{Ll}|\\p{Lu}{2,}", false],
|
42
|
+
["(?:\\.\\p{Lu}\\p{Ll}\\.\\s?)|(?:\\b\\p{Lu}\\p{Ll}\\.\\s?)",
|
43
|
+
"\\p{Lu}[^\\p{Lu}]",
|
44
|
+
false],
|
45
|
+
["(?:\\b[pP]\\.\\s)", "\\p{Lu}\\p{Ll}+", false],
|
46
|
+
["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
|
47
|
+
["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
|
48
|
+
["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
|
49
|
+
["(?:[\\.!?…]['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)|(?:\\r?\\n\\s*\\r?\\n[\\t]*)",
|
50
|
+
nil,
|
51
|
+
true],
|
52
|
+
["(?:[\\.!?…]['»\"”\\p{Pe}]*)", "\\p{Lu}[^\\p{Lu}\\.]", true],
|
53
|
+
["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]]
|
54
|
+
BEFORE_RE = /(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\Z/m
|
55
|
+
REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
|
56
|
+
FIRST_CHAR = /\A./m
|
57
|
+
|
58
|
+
|
59
|
+
# Splits a text into sentences using the segmentation rules compiled into
# BEFORE_RE and REGEXPS.
#
# The text is scanned one character at a time. At every position the
# characters already consumed (at most 10, the "before" buffer) and the
# characters still to come (at most 10, the "after" buffer) are matched
# against each rule; the first rule whose two patterns both match decides
# whether the position is a sentence break.
#
#   SRX::Polish::Sentence.new(text).each { |sentence| puts sentence }
class Sentence
  include Enumerable

  # The underlying character source: a StringIO built from the String passed
  # to #initialize, or whatever non-String object was passed in.
  attr_accessor :input
  # When set to a truthy value, #each prints the buffers for every candidate
  # position and embeds a colored marker naming the matched rule in the text.
  attr_writer :debug

  # text - a String with the text to split, or an IO-like object that
  #        responds to +pos=+, +eof?+ and +readchar+.
  def initialize(text=nil)
    if text.is_a?(String)
      @input = StringIO.new(text,"r:utf-8")
    else
      @input = text
    end
  end

  # Yields consecutive sentences of the input, each as a String that still
  # contains its trailing whitespace. The input is read from its beginning
  # on every call.
  #
  # Returns an Enumerator when called without a block. Yields nothing when
  # no input was given.
  def each
    return enum_for(:each) unless block_given?
    return if @input.nil?

    buffer_length = 10
    sentence = ""
    before_buffer = ""
    @input.pos = 0

    # Pre-fill the look-ahead window. The input may hold fewer characters
    # than the window, so stop quietly at end of input.
    after_buffer = ""
    buffer_length.times do
      char = next_input_char
      break if char.nil?
      after_buffer << char
    end

    # Keep scanning until the look-ahead window is drained (not merely until
    # the input hits EOF), so that positions inside the final
    # +buffer_length+ characters are checked against the rules as well.
    until after_buffer.empty?
      # BEFORE_RE is the union of all "before" patterns; it serves as a
      # cheap pre-filter so the rule list is only walked at candidates.
      if BEFORE_RE.match(before_buffer)
        if @debug
          puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
        end
        REGEXPS.each do |before_re,after_re,value|
          next unless before_re.match(before_buffer) && after_re.match(after_buffer)
          if @debug
            color = value ? :red : :green
            sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
          end
          # +value+ tells whether the rule is a break rule (true) or an
          # exception that suppresses a break at this position (false).
          if value
            yield sentence
            sentence = ""
          end
          # Only the first matching rule is applied.
          break
        end
      end

      # Slide both windows one character to the right.
      next_after = next_input_char
      before_buffer.slice!(0, 1) if before_buffer.size >= buffer_length
      current = after_buffer.slice!(0, 1)
      before_buffer << current
      sentence << current
      after_buffer << next_after if next_after
    end

    # Whatever follows the last detected break forms the final sentence.
    yield sentence unless sentence.empty?
  end

  private

  # Returns the next character of the input, or nil at end of input.
  def next_input_char
    @input.eof? ? nil : @input.readchar
  end
end
|
114
|
+
end
|
115
|
+
end
|
data/srx-polish.gemspec
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |s|
|
5
|
+
s.name = "srx-polish"
|
6
|
+
s.version = "0.1.1"
|
7
|
+
s.platform = Gem::Platform::RUBY
|
8
|
+
s.authors = ["Aleksander Pohl"]
|
9
|
+
s.email = ["apohllo@o2.pl"]
|
10
|
+
s.homepage = "http://github.com/apohllo/srx2ruby"
|
11
|
+
s.summary = %q{Polish sentence segmentation rules.}
|
12
|
+
s.description = %q{Polish sentence segmentation rules based on Marcin Miłkowski's SRX rules.}
|
13
|
+
|
14
|
+
s.rubyforge_project = "srx-polish"
|
15
|
+
s.has_rdoc = false
|
16
|
+
|
17
|
+
s.files = `git ls-files`.split("\n")
|
18
|
+
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
19
|
+
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
20
|
+
s.require_paths = ["lib"]
|
21
|
+
|
22
|
+
s.add_dependency("term-ansicolor", ["~> 1.0.5"])
|
23
|
+
end
|
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: srx-polish
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.1.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Aleksander Pohl
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2011-06-02 00:00:00 +02:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: term-ansicolor
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ~>
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: 1.0.5
|
25
|
+
type: :runtime
|
26
|
+
version_requirements: *id001
|
27
|
+
description: "Polish sentence segmentation rules based on Marcin Mi\xC5\x82kowski's SRX rules."
|
28
|
+
email:
|
29
|
+
- apohllo@o2.pl
|
30
|
+
executables: []
|
31
|
+
|
32
|
+
extensions: []
|
33
|
+
|
34
|
+
extra_rdoc_files: []
|
35
|
+
|
36
|
+
files:
|
37
|
+
- .gitignore
|
38
|
+
- README.txt
|
39
|
+
- lib/srx/polish/sentence.rb
|
40
|
+
- srx-polish.gemspec
|
41
|
+
has_rdoc: true
|
42
|
+
homepage: http://github.com/apohllo/srx2ruby
|
43
|
+
licenses: []
|
44
|
+
|
45
|
+
post_install_message:
|
46
|
+
rdoc_options: []
|
47
|
+
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ">="
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: "0"
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: "0"
|
62
|
+
requirements: []
|
63
|
+
|
64
|
+
rubyforge_project: srx-polish
|
65
|
+
rubygems_version: 1.5.2
|
66
|
+
signing_key:
|
67
|
+
specification_version: 3
|
68
|
+
summary: Polish sentence segmentation rules.
|
69
|
+
test_files: []
|
70
|
+
|