srx-polish 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ .*.sw?
2
+ *.gem
3
+ pkg
4
+ work
@@ -0,0 +1,57 @@
1
+ == srx-polish
2
+
3
+ * https://github.com/apohllo/srx-polish
4
+
5
+ = DESCRIPTION
6
+
7
+ 'srx-polish' is a Ruby library containing Polish sentence segmentation rules
8
+ based on SRX rules defined by Marcin Miłkowski:
9
+ http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
10
+
11
+ = FEATURES/PROBLEMS
12
+
13
+ * this library is generated by 'srx2ruby' which has some limitations and might
14
+ not be 100% SRX standard compliant.
15
+
16
+ = INSTALL
17
+
18
+ Standard rubygems installation:
19
+
20
+ $ gem install srx-polish
21
+
22
+ = BASIC USAGE
23
+
24
+ The library defines the SRX::Polish::Sentence class, which lets you iterate
25
+ over the matched sentences:
26
+
27
+ require 'srx/polish/sentence'
28
+
29
+ text =<<-END
30
+ Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie. Na s. 10 książki
31
+ sprawa jest szczegółowo opisana.
32
+ END
33
+
34
+ sentences = SRX::Polish::Sentence.new(text)
35
+ sentences.each do |sentence|
36
+ puts sentence.gsub(/\n|\r/,"")
37
+ end
38
+ # Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie.
39
+ # Na s. 10 książki sprawa jest szczegółowo opisana.
40
+
41
+ == LICENSE
42
+
43
+ Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski
44
+
45
+ This program is free software: you can redistribute it and/or modify
46
+ it under the terms of the GNU General Public License as published by
47
+ the Free Software Foundation, either version 3 of the License, or
48
+ (at your option) any later version.
49
+
50
+ This program is distributed in the hope that it will be useful,
51
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
52
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
53
+ GNU General Public License for more details.
54
+
55
+ == FEEDBACK
56
+
57
+ * mailto:apohllo@o2.pl
@@ -0,0 +1,115 @@
1
+ #encoding: utf-8
2
+ require 'stringio'
3
+ require 'term/ansicolor'
4
+ module SRX
5
+ module Polish
6
+ RULES =
7
+ [["(?:\\b[Aa]l\\.\\s)|(?:\\b[Aa]rt\\.\\s)|(?:\\b[Cc]d\\.\\s)|(?:\\b[Hh]r\\.\\s)|(?:\\b[Nn][bp]\\.\\s)|(?:\\b[Pp][ts]\\.\\s)|(?:\\b[Rr]y[cs]\\.\\s)|(?:\\b[Tt]zw\\.\\s)|(?:\\b[\\d,\\.]*ty[sś]\\.\\p{Pe}?\\s)|(?:\\b[bu]p\\.\\s)|(?:\\badw\\.\\s)|(?:\\bafr\\.\\s)|(?:\\bakad\\.\\s)|(?:\\bal\\.\\s)|(?:\\bam\\.\\s)|(?:\\bamer\\.\\s)|(?:\\bang\\.\\s)|(?:\\barch\\.\\s)|(?:\\bartyst\\.\\s)|(?:\\bastr\\.\\s)|(?:\\baustr\\.\\s)|(?:\\bbałt\\.\\s)|(?:\\bbdb\\.\\s)|(?:\\bbm\\.\\s)|(?:\\bbr\\.\\p{Pe}?\\s)|(?:\\bbry[gt]\\.\\s)|(?:\\bbł\\.\\s)|(?:\\bc\\.k\\.\\s)|(?:\\bc\\.o\\.\\s)|(?:\\bcdn\\.\\s)|(?:\\bcentr\\.\\s)|(?:\\bces\\.\\s)|(?:\\bchem\\.\\s)|(?:\\bchir\\.\\s)|(?:\\bchiń\\.\\s)|(?:\\bcyg\\.\\s)|(?:\\bcyt\\.\\s)|(?:\\bcyw\\.\\s)|(?:\\bczes\\.\\s)|(?:\\bczw\\.\\s)|(?:\\bczyt\\.\\s)|(?:\\bdaw\\.\\s)|(?:\\bdcn\\.\\s)|(?:\\bdekl\\.\\s)|(?:\\bdemokr\\.\\s)|(?:\\bdet\\.\\s)|(?:\\bdiec\\.\\s)|(?:\\bdn\\.\\s)|(?:\\bdo[tlp]\\.\\s)|(?:\\bdost\\.\\s)|(?:\\bdosł\\.\\s)|(?:\\bds\\.\\s)|(?:\\bdst\\.\\s)|(?:\\bduszp\\.\\s)|(?:\\bdypl\\.\\s)|(?:\\bdł\\.\\s)|(?:\\begz\\.\\s)|(?:\\bekol\\.\\s)|(?:\\bekon\\.\\s)|(?:\\belektr\\.\\s)|(?:\\bem\\.\\s)|(?:\\bew\\.\\s)|(?:\\bfab\\.\\s)|(?:\\bfarm\\.\\s)|(?:\\bfot\\.\\s)|(?:\\bfr\\.\\s)|(?:\\bgastr\\.\\s)|(?:\\bgat\\.\\s)|(?:\\bgeogr\\.\\s)|(?:\\bgeol\\.\\s)|(?:\\bgimn\\.\\s)|(?:\\bgm\\.\\s)|(?:\\bgodz\\.\\s)|(?:\\bgosp\\.\\s)|(?:\\bgr\\.\\s)|(?:\\bgram\\.\\s)|(?:\\bgórn\\.\\s)|(?:\\bgłęb\\.\\s)|(?:\\bh\\.c\\.\\s)|(?:\\bhist\\.\\s)|(?:\\bhiszp\\.\\s)|(?:\\bhot\\.\\s)|(?:\\bid\\.\\s)|(?:\\bim\\.\\s)|(?:\\bin\\.\\s)|(?:\\biron\\.\\s)|(?:\\bjn\\.\\s)|(?:\\bjw\\.\\s)|(?:\\bk\\.\\s)|(?:\\bk\\.k\\.\\s)|(?:\\bk\\.p\\.a\\.\\s)|(?:\\bk\\.p\\.c\\.\\s)|(?:\\bk\\.r\\.\\s)|(?:\\bk\\.r\\.o\\.\\s)|(?:\\bkard\\.\\s)|(?:\\bkat\\.\\s)|(?:\\bkatol\\.\\s)|(?:\\bkk\\.\\s)|(?:\\bko?l\\.\\s)|(?:\\bkpc\\.\\s)|(?:\\bkpt\\.\\s)|(?:\\bkr\\.\\s)|(?:\\bkrak\\.\\s)|(?:\\bkryt\\.\\s)|(?:\\bkult\\.\\s)|(?:\\blaic\\.\\s)|(?:\\bm\\.in\\.\\s)|(?:\\bn\\.\\s)|(?:\\bniem\\.\\s)|(?:\\bp[wnl
]\\.\\s)|(?:\\bpo[lw]\\.\\s)|(?:\\btj\\.\\s)|(?:\\btzn\\.\\s)|(?:\\bu[lb]\\.\\s?)|(?:\\bur\\.\\s)|(?:\\bw[wł]\\.\\s)|(?:\\bwoj\\.\\s)|(?:\\bwyst\\.\\s)|(?:\\bwył\\.\\s)|(?:\\bzm\\.\\s)|(?:\\bzob\\.\\s)|(?:\\bćw\\.\\s)|(?:\\bćwicz\\.\\s)|(?:\\błac\\.\\s)|(?:\\bżarg\\.\\s)|(?:\\bżyd\\.\\s)|(?:\\bżyw\\.\\s)",
8
+ nil,
9
+ false],
10
+ ["(?:\\b[Oo]k\\.\\s)", "[\\p{Ll}\\d]", false],
11
+ ["(?:\\b[Nn]r\\.\\s)", "\\d", false],
12
+ ["(?:\\b(?i)mazeł\\stow\\.\\s)", nil, true],
13
+ ["(?:['\"„][\\.!?…]['\"”]\\s)|(?:[\\p{Ps}][!?]+[\\p{Pe}] )|(?:\\b(?:[Nn]a|[Pp]o)dkom\\.\\s)|(?:\\b(?:[sS]p|st|[Ss]półdz|społ|spółgł|[Ss]to[łw])\\.\\s)|(?:\\b(?i)gen\\.\\s)|(?:\\b(?i)mgr\\.\\s)|(?:\\b(?i)por\\.\\s)|(?:\\b(?i)pro[fk]\\.\\s)|(?:\\bDz\\.\\s?U\\.\\s)|(?:\\b[,uoi]\\ss\\.\\s)|(?:\\b[Aa]dw\\.\\s)|(?:\\b[Dd]oc\\.\\s)|(?:\\b[Dd]y[wr]\\.\\s)|(?:\\b[Ii]nż\\.\\s)|(?:\\b[Ll]ek\\.\\s)|(?:\\b[Mm]ec\\.\\s)|(?:\\b[Nn]a\\sos\\.\\s)|(?:\\b[Nn]ajśw\\.\\s)|(?:\\b[Oo][Oo]\\.\\s)|(?:\\b[Pp]rzyp\\.\\s)|(?:\\b[Pp]łk\\.\\s)|(?:\\b[Rr]ed\\.\\)?\\s)|(?:\\b[Rr]eż\\.\\s)|(?:\\b[Ss]zer\\.\\s)|(?:\\b[Tt]ow\\.\\s)|(?:\\b[dD][hr]\\.\\s)|(?:\\b[kK]s\\.\\s)|(?:\\b[śŚ][pwW]\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bhab\\.\\s)|(?:\\bmed\\.\\s)|(?:\\bo\\.\\s)|(?:\\bpo[zk]\\.\\s)|(?:\\btel\\.\\s)|(?:\\bzach\\.\\s)|(?:\\bzagr\\.\\s)|(?:\\bzak\\.\\s)|(?:\\bzakł\\.\\s)|(?:\\bzal\\.\\s)|(?:\\bzam\\.\\s)|(?:\\bzast\\.\\s)|(?:\\bzaw\\.\\s)|(?:\\bzazw\\.\\s)|(?:\\bzał\\.\\s)|(?:\\bzdr\\.\\s)|(?:\\bzew\\.\\s)|(?:\\bzewn\\.\\s)|(?:\\bzn\\.\\s)",
14
+ nil,
15
+ false],
16
+ ["(?:\\b[rls]\\.\\s)", "[1-9]+", false],
17
+ ["(?:\\b[Tt]ab\\.\\s)|(?:\\bmin\\.\\s)|(?:\\bpkt\\.\\s)|(?:\\bstr\\.\\s)",
18
+ "[\\p{Ll}\\d]",
19
+ false],
20
+ ["(?:\\bW\\s\\d{4}\\sr\\.\\s)", "[\\p{Lu}–—„\"-]", false],
21
+ ["(?:[\\.!?…]+['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)",
22
+ "[\\p{Ps}–—-]\\s?\\p{Ll}",
23
+ false],
24
+ ["(?:\\bit[dp]\\.\\s)", "[„”\"]?\\p{Ll}", false],
25
+ ["(?:\\s[A-Z]\\.\\s)", "\\(\\d", false],
26
+ ["(?:\\bpar\\.\\s)|(?:\\bust\\.\\s)", "\\d", false],
27
+ ["(?:[!?]+\\p{Pe} )|(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\p{Pe}\\s)|(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )|(?:[\\[\\(]*…[\\]\\)]* )|(?:\\.\\p{L}+\\.\\s)|(?:\\b(?:[CDSR]z|Ch)\\.\\s)|(?:\\b[Gg][rR]\\.\\s)|(?:\\b[Pp]on\\.\\s)|(?:\\b[Ss]ob\\.\\s)|(?:\\b[Zz][Łł]\\.\\s)|(?:\\b[cd]?m\\.\\s)|(?:\\b[d]?kg\\.\\s)|(?:\\b[dm]l\\.\\s)|(?:\\b[ptw]g\\.\\s)|(?:\\b[rwn]\\.\\s)|(?:\\bba!\\s)|(?:\\bcdn\\.\\s)|(?:\\bcos\\.\\s)|(?:\\bcosec\\.\\s)|(?:\\bdag\\.\\s)|(?:\\betc\\.\\s)|(?:\\bit[dp]\\.\\s)|(?:\\bjw\\.\\s)|(?:\\bml[nd]\\.\\s)|(?:\\bn[tn]\\.\\s)|(?:\\bn\\.e\\.\\s)|(?:\\bnpl\\.\\s)|(?:\\brkm\\.\\s)|(?:\\bsec\\.\\s)|(?:\\bsin\\.\\s)|(?:\\bw\\.\\s)",
28
+ "\\p{Ll}",
29
+ false],
30
+ ["(?:[\\p{Ll}&&[^aeouiyęąó]][\\p{Ll}&&[^aeouiyęąó]]+\\.\\s)",
31
+ "\\p{Ll}+",
32
+ false],
33
+ ["(?:[\\s\\(\\[][\\p{L}&&[^rwn]]\\.\\s+)",
34
+ "\\p{Ll}\\p{Ll}|\\p{Lu}[\\p{Punct}\\p{Lu}]",
35
+ false],
36
+ ["(?:\\blit\\.\\s)", "\\p{Ll}\\p{Pe}?\\p{P}?\\s", false],
37
+ ["(?:\\s(?<!\\.)[„\"\\p{Ps}]?[A-Z]\\.\\s)|(?:^[„\"]?[A-ZŚĆŻŹ]\\.\\s)",
38
+ "\\p{Ll}|\\p{Lu}\\p{Ll}\\p{Ll}+",
39
+ false],
40
+ ["(?:\\bproc\\.\\)?\\s)", "\\p{Ll}|\\p{Lu}\\p{Lu}+", false],
41
+ ["(?:\\b\\d+\\.\\s)", "\\p{Ll}|\\p{Lu}{2,}", false],
42
+ ["(?:\\.\\p{Lu}\\p{Ll}\\.\\s?)|(?:\\b\\p{Lu}\\p{Ll}\\.\\s?)",
43
+ "\\p{Lu}[^\\p{Lu}]",
44
+ false],
45
+ ["(?:\\b[pP]\\.\\s)", "\\p{Lu}\\p{Ll}+", false],
46
+ ["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
47
+ ["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
48
+ ["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
49
+ ["(?:[\\.!?…]['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)|(?:\\r?\\n\\s*\\r?\\n[\\t]*)",
50
+ nil,
51
+ true],
52
+ ["(?:[\\.!?…]['»\"”\\p{Pe}]*)", "\\p{Lu}[^\\p{Lu}\\.]", true],
53
+ ["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]]
54
# Pre-compiled forms of RULES. Each rule is a triple of
# [pattern for the text before a position, pattern for the text after it
# (nil when absent), flag].
#
# BEFORE_RE is one alternation of every rule's "before" pattern, anchored at
# the end of the inspected text, with one capture group per rule.
BEFORE_RE = /(?:#{RULES.map { |before, _after, _flag| "(#{before})" }.join("|")})\Z/m

# One [before_regexp, after_regexp, flag] triple per rule, in rule order.
# A nil "after" pattern interpolates as an empty group, which matches any text.
REGEXPS = RULES.map do |before, after, flag|
  [/(#{before})\Z/m, /\A(#{after})/m, flag]
end

# Matches the first character of a string, newlines included.
FIRST_CHAR = /\A./m
57
+
58
+
59
# Splits text into sentences using the rule tables BEFORE_RE and REGEXPS.
#
# The text is scanned one character at a time. At every position the
# characters just consumed (at most BUFFER_LENGTH of them) and the
# BUFFER_LENGTH characters that follow are matched against the rules.
# The first rule whose "before" and "after" patterns both match decides:
# a rule flagged true ends the current sentence at that position, a rule
# flagged false is an exception that keeps the sentence going.
#
#   SRX::Polish::Sentence.new(text).each { |sentence| puts sentence }
class Sentence
  include Enumerable

  # Number of characters of context kept on each side of a candidate break.
  BUFFER_LENGTH = 10

  # The source being segmented. It must respond to +pos=+, +eof?+ and
  # +readchar+ (a StringIO is created automatically for String arguments).
  attr_accessor :input

  # When set to a true value, #each prints the context windows of every
  # candidate position to standard output and embeds a coloured
  # "<before:after>" marker (red for a break, green for an exception)
  # in the yielded text.
  attr_writer :debug

  # text - a String to segment, an IO-like object to read from, or nil
  #        when the input is assigned later through #input=.
  def initialize(text=nil)
    @input = text.is_a?(String) ? StringIO.new(text,"r:utf-8") : text
  end

  # Yields every sentence of the input in order; the yielded strings keep
  # their trailing whitespace. The input is rewound first, so the method
  # can be called repeatedly.
  #
  # Returns an Enumerator when no block is given. Yields nothing when no
  # input has been assigned or the input is empty.
  def each
    return enum_for(:each) unless block_given?
    return self if @input.nil?

    @input.pos = 0
    sentence = "".dup
    before_buffer = "".dup
    after_buffer = "".dup

    # Prime the look-ahead window; inputs shorter than the window are fine.
    BUFFER_LENGTH.times do
      break if @input.eof?
      after_buffer << @input.readchar
    end

    # Keep scanning until the look-ahead window is drained, so that breaks
    # located in the last BUFFER_LENGTH characters are detected as well.
    until after_buffer.empty?
      # Cheap pre-check: skip the per-rule scan unless some rule's "before"
      # pattern matches at this position.
      if BEFORE_RE.match(before_buffer)
        if @debug
          puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
        end
        REGEXPS.each do |before_re, after_re, breaks|
          next unless before_re.match(before_buffer) && after_re.match(after_buffer)

          if @debug
            color = breaks ? :red : :green
            sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
          end
          if breaks
            yield sentence
            sentence = "".dup
          end
          # Only the first matching rule is applied.
          break
        end
      end

      # Slide both windows one character forward.
      current = after_buffer.slice!(0, 1)
      before_buffer.slice!(0, 1) if before_buffer.size >= BUFFER_LENGTH
      before_buffer << current
      sentence << current
      after_buffer << @input.readchar unless @input.eof?
    end

    # Whatever follows the last detected break is the final sentence.
    yield sentence unless sentence.empty?
    self
  end
end
114
+ end
115
+ end
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+
4
# Gem specification for srx-polish: Polish sentence segmentation rules
# packaged as a Ruby library.
Gem::Specification.new do |s|
  s.name        = "srx-polish"
  s.version     = "0.1.1"
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Aleksander Pohl"]
  s.email       = ["apohllo@o2.pl"]
  # Project URL as given in the README (srx2ruby is the generator, a
  # separate project).
  s.homepage    = "https://github.com/apohllo/srx-polish"
  s.summary     = %q{Polish sentence segmentation rules.}
  s.description = %q{Polish sentence segmentation rules based on Marcin Miłkowski's SRX rules.}
  # The README licenses the code under the GNU GPL, version 3 or later.
  s.license     = "GPL-3.0-or-later"

  # Both setters are deprecated in RubyGems; call them only where they
  # still exist so the specification loads on every RubyGems version.
  s.rubyforge_project = "srx-polish" if s.respond_to?(:rubyforge_project=)
  s.has_rdoc = false if s.respond_to?(:has_rdoc=)

  # The packaged files are whatever git tracks in the working directory.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Used by the debug mode of SRX::Polish::Sentence to colour its markers.
  s.add_dependency("term-ansicolor", ["~> 1.0.5"])
end
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: srx-polish
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.1
6
+ platform: ruby
7
+ authors:
8
+ - Aleksander Pohl
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-06-02 00:00:00 +02:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: term-ansicolor
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: 1.0.5
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ description: "Polish sentence segmentation rules based on Marcin Mi\xC5\x82kowski's SRX rules."
28
+ email:
29
+ - apohllo@o2.pl
30
+ executables: []
31
+
32
+ extensions: []
33
+
34
+ extra_rdoc_files: []
35
+
36
+ files:
37
+ - .gitignore
38
+ - README.txt
39
+ - lib/srx/polish/sentence.rb
40
+ - srx-polish.gemspec
41
+ has_rdoc: true
42
+ homepage: http://github.com/apohllo/srx2ruby
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options: []
47
+
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: "0"
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: "0"
62
+ requirements: []
63
+
64
+ rubyforge_project: srx-polish
65
+ rubygems_version: 1.5.2
66
+ signing_key:
67
+ specification_version: 3
68
+ summary: Polish sentence segmentation rules.
69
+ test_files: []
70
+