srx-polish 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ .*.sw?
2
+ *.gem
3
+ pkg
4
+ work
@@ -0,0 +1,57 @@
1
+ == srx-polish
2
+
3
+ * https://github.com/apohllo/srx-polish
4
+
5
+ = DESCRIPTION
6
+
7
+ 'srx-polish' is a Ruby library containing Polish sentence segmentation rules
8
+ based on SRX rules defined by Marcin Miłkowski:
9
+ http://morfologik.blogspot.com/2009/11/talking-about-srx-in-lt-during-ltc.html
10
+
11
+ = FEATURES/PROBLEMS
12
+
13
+ * this library is generated by 'srx2ruby', which has some limitations and might
14
+ not be 100% SRX standard compliant.
15
+
16
+ = INSTALL
17
+
18
+ Standard rubygems installation:
19
+
20
+ $ gem install srx-polish
21
+
22
+ = BASIC USAGE
23
+
24
+ The library defines the SRX::Polish::Sentence class, which allows you to iterate
25
+ over the matched sentences:
26
+
27
+ require 'srx/polish/sentence'
28
+
29
+ text =<<-END
30
+ Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie. Na s. 10 książki
31
+ sprawa jest szczegółowo opisana.
32
+ END
33
+
34
+ sentences = SRX::Polish::Sentence.new(text)
35
+ sentences.each do |sentence|
36
+ puts sentence.gsub(/\n|\r/,"")
37
+ end
38
+ # Kiedy spotkałem p. Wojtka miał na sobie krótkie spodnie.
39
+ # Na s. 10 książki sprawa jest szczegółowo opisana.
40
+
41
+ == LICENSE
42
+
43
+ Copyright (C) 2011 Aleksander Pohl, Marcin Miłkowski, Jarosław Lipski
44
+
45
+ This program is free software: you can redistribute it and/or modify
46
+ it under the terms of the GNU General Public License as published by
47
+ the Free Software Foundation, either version 3 of the License, or
48
+ (at your option) any later version.
49
+
50
+ This program is distributed in the hope that it will be useful,
51
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
52
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
53
+ GNU General Public License for more details.
54
+
55
+ == FEEDBACK
56
+
57
+ * mailto:apohllo@o2.pl
@@ -0,0 +1,115 @@
1
+ #encoding: utf-8
2
+ require 'stringio'
3
+ require 'term/ansicolor'
4
+ module SRX
5
+ module Polish
6
+ RULES =
7
+ [["(?:\\b[Aa]l\\.\\s)|(?:\\b[Aa]rt\\.\\s)|(?:\\b[Cc]d\\.\\s)|(?:\\b[Hh]r\\.\\s)|(?:\\b[Nn][bp]\\.\\s)|(?:\\b[Pp][ts]\\.\\s)|(?:\\b[Rr]y[cs]\\.\\s)|(?:\\b[Tt]zw\\.\\s)|(?:\\b[\\d,\\.]*ty[sś]\\.\\p{Pe}?\\s)|(?:\\b[bu]p\\.\\s)|(?:\\badw\\.\\s)|(?:\\bafr\\.\\s)|(?:\\bakad\\.\\s)|(?:\\bal\\.\\s)|(?:\\bam\\.\\s)|(?:\\bamer\\.\\s)|(?:\\bang\\.\\s)|(?:\\barch\\.\\s)|(?:\\bartyst\\.\\s)|(?:\\bastr\\.\\s)|(?:\\baustr\\.\\s)|(?:\\bbałt\\.\\s)|(?:\\bbdb\\.\\s)|(?:\\bbm\\.\\s)|(?:\\bbr\\.\\p{Pe}?\\s)|(?:\\bbry[gt]\\.\\s)|(?:\\bbł\\.\\s)|(?:\\bc\\.k\\.\\s)|(?:\\bc\\.o\\.\\s)|(?:\\bcdn\\.\\s)|(?:\\bcentr\\.\\s)|(?:\\bces\\.\\s)|(?:\\bchem\\.\\s)|(?:\\bchir\\.\\s)|(?:\\bchiń\\.\\s)|(?:\\bcyg\\.\\s)|(?:\\bcyt\\.\\s)|(?:\\bcyw\\.\\s)|(?:\\bczes\\.\\s)|(?:\\bczw\\.\\s)|(?:\\bczyt\\.\\s)|(?:\\bdaw\\.\\s)|(?:\\bdcn\\.\\s)|(?:\\bdekl\\.\\s)|(?:\\bdemokr\\.\\s)|(?:\\bdet\\.\\s)|(?:\\bdiec\\.\\s)|(?:\\bdn\\.\\s)|(?:\\bdo[tlp]\\.\\s)|(?:\\bdost\\.\\s)|(?:\\bdosł\\.\\s)|(?:\\bds\\.\\s)|(?:\\bdst\\.\\s)|(?:\\bduszp\\.\\s)|(?:\\bdypl\\.\\s)|(?:\\bdł\\.\\s)|(?:\\begz\\.\\s)|(?:\\bekol\\.\\s)|(?:\\bekon\\.\\s)|(?:\\belektr\\.\\s)|(?:\\bem\\.\\s)|(?:\\bew\\.\\s)|(?:\\bfab\\.\\s)|(?:\\bfarm\\.\\s)|(?:\\bfot\\.\\s)|(?:\\bfr\\.\\s)|(?:\\bgastr\\.\\s)|(?:\\bgat\\.\\s)|(?:\\bgeogr\\.\\s)|(?:\\bgeol\\.\\s)|(?:\\bgimn\\.\\s)|(?:\\bgm\\.\\s)|(?:\\bgodz\\.\\s)|(?:\\bgosp\\.\\s)|(?:\\bgr\\.\\s)|(?:\\bgram\\.\\s)|(?:\\bgórn\\.\\s)|(?:\\bgłęb\\.\\s)|(?:\\bh\\.c\\.\\s)|(?:\\bhist\\.\\s)|(?:\\bhiszp\\.\\s)|(?:\\bhot\\.\\s)|(?:\\bid\\.\\s)|(?:\\bim\\.\\s)|(?:\\bin\\.\\s)|(?:\\biron\\.\\s)|(?:\\bjn\\.\\s)|(?:\\bjw\\.\\s)|(?:\\bk\\.\\s)|(?:\\bk\\.k\\.\\s)|(?:\\bk\\.p\\.a\\.\\s)|(?:\\bk\\.p\\.c\\.\\s)|(?:\\bk\\.r\\.\\s)|(?:\\bk\\.r\\.o\\.\\s)|(?:\\bkard\\.\\s)|(?:\\bkat\\.\\s)|(?:\\bkatol\\.\\s)|(?:\\bkk\\.\\s)|(?:\\bko?l\\.\\s)|(?:\\bkpc\\.\\s)|(?:\\bkpt\\.\\s)|(?:\\bkr\\.\\s)|(?:\\bkrak\\.\\s)|(?:\\bkryt\\.\\s)|(?:\\bkult\\.\\s)|(?:\\blaic\\.\\s)|(?:\\bm\\.in\\.\\s)|(?:\\bn\\.\\s)|(?:\\bniem\\.\\s)|(?:\\bp[wnl
]\\.\\s)|(?:\\bpo[lw]\\.\\s)|(?:\\btj\\.\\s)|(?:\\btzn\\.\\s)|(?:\\bu[lb]\\.\\s?)|(?:\\bur\\.\\s)|(?:\\bw[wł]\\.\\s)|(?:\\bwoj\\.\\s)|(?:\\bwyst\\.\\s)|(?:\\bwył\\.\\s)|(?:\\bzm\\.\\s)|(?:\\bzob\\.\\s)|(?:\\bćw\\.\\s)|(?:\\bćwicz\\.\\s)|(?:\\błac\\.\\s)|(?:\\bżarg\\.\\s)|(?:\\bżyd\\.\\s)|(?:\\bżyw\\.\\s)",
8
+ nil,
9
+ false],
10
+ ["(?:\\b[Oo]k\\.\\s)", "[\\p{Ll}\\d]", false],
11
+ ["(?:\\b[Nn]r\\.\\s)", "\\d", false],
12
+ ["(?:\\b(?i)mazeł\\stow\\.\\s)", nil, true],
13
+ ["(?:['\"„][\\.!?…]['\"”]\\s)|(?:[\\p{Ps}][!?]+[\\p{Pe}] )|(?:\\b(?:[Nn]a|[Pp]o)dkom\\.\\s)|(?:\\b(?:[sS]p|st|[Ss]półdz|społ|spółgł|[Ss]to[łw])\\.\\s)|(?:\\b(?i)gen\\.\\s)|(?:\\b(?i)mgr\\.\\s)|(?:\\b(?i)por\\.\\s)|(?:\\b(?i)pro[fk]\\.\\s)|(?:\\bDz\\.\\s?U\\.\\s)|(?:\\b[,uoi]\\ss\\.\\s)|(?:\\b[Aa]dw\\.\\s)|(?:\\b[Dd]oc\\.\\s)|(?:\\b[Dd]y[wr]\\.\\s)|(?:\\b[Ii]nż\\.\\s)|(?:\\b[Ll]ek\\.\\s)|(?:\\b[Mm]ec\\.\\s)|(?:\\b[Nn]a\\sos\\.\\s)|(?:\\b[Nn]ajśw\\.\\s)|(?:\\b[Oo][Oo]\\.\\s)|(?:\\b[Pp]rzyp\\.\\s)|(?:\\b[Pp]łk\\.\\s)|(?:\\b[Rr]ed\\.\\)?\\s)|(?:\\b[Rr]eż\\.\\s)|(?:\\b[Ss]zer\\.\\s)|(?:\\b[Tt]ow\\.\\s)|(?:\\b[dD][hr]\\.\\s)|(?:\\b[kK]s\\.\\s)|(?:\\b[śŚ][pwW]\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bhab\\.\\s)|(?:\\bmed\\.\\s)|(?:\\bo\\.\\s)|(?:\\bpo[zk]\\.\\s)|(?:\\btel\\.\\s)|(?:\\bzach\\.\\s)|(?:\\bzagr\\.\\s)|(?:\\bzak\\.\\s)|(?:\\bzakł\\.\\s)|(?:\\bzal\\.\\s)|(?:\\bzam\\.\\s)|(?:\\bzast\\.\\s)|(?:\\bzaw\\.\\s)|(?:\\bzazw\\.\\s)|(?:\\bzał\\.\\s)|(?:\\bzdr\\.\\s)|(?:\\bzew\\.\\s)|(?:\\bzewn\\.\\s)|(?:\\bzn\\.\\s)",
14
+ nil,
15
+ false],
16
+ ["(?:\\b[rls]\\.\\s)", "[1-9]+", false],
17
+ ["(?:\\b[Tt]ab\\.\\s)|(?:\\bmin\\.\\s)|(?:\\bpkt\\.\\s)|(?:\\bstr\\.\\s)",
18
+ "[\\p{Ll}\\d]",
19
+ false],
20
+ ["(?:\\bW\\s\\d{4}\\sr\\.\\s)", "[\\p{Lu}–—„\"-]", false],
21
+ ["(?:[\\.!?…]+['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)",
22
+ "[\\p{Ps}–—-]\\s?\\p{Ll}",
23
+ false],
24
+ ["(?:\\bit[dp]\\.\\s)", "[„”\"]?\\p{Ll}", false],
25
+ ["(?:\\s[A-Z]\\.\\s)", "\\(\\d", false],
26
+ ["(?:\\bpar\\.\\s)|(?:\\bust\\.\\s)", "\\d", false],
27
+ ["(?:[!?]+\\p{Pe} )|(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\p{Pe}\\s)|(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )|(?:[\\[\\(]*…[\\]\\)]* )|(?:\\.\\p{L}+\\.\\s)|(?:\\b(?:[CDSR]z|Ch)\\.\\s)|(?:\\b[Gg][rR]\\.\\s)|(?:\\b[Pp]on\\.\\s)|(?:\\b[Ss]ob\\.\\s)|(?:\\b[Zz][Łł]\\.\\s)|(?:\\b[cd]?m\\.\\s)|(?:\\b[d]?kg\\.\\s)|(?:\\b[dm]l\\.\\s)|(?:\\b[ptw]g\\.\\s)|(?:\\b[rwn]\\.\\s)|(?:\\bba!\\s)|(?:\\bcdn\\.\\s)|(?:\\bcos\\.\\s)|(?:\\bcosec\\.\\s)|(?:\\bdag\\.\\s)|(?:\\betc\\.\\s)|(?:\\bit[dp]\\.\\s)|(?:\\bjw\\.\\s)|(?:\\bml[nd]\\.\\s)|(?:\\bn[tn]\\.\\s)|(?:\\bn\\.e\\.\\s)|(?:\\bnpl\\.\\s)|(?:\\brkm\\.\\s)|(?:\\bsec\\.\\s)|(?:\\bsin\\.\\s)|(?:\\bw\\.\\s)",
28
+ "\\p{Ll}",
29
+ false],
30
+ ["(?:[\\p{Ll}&&[^aeouiyęąó]][\\p{Ll}&&[^aeouiyęąó]]+\\.\\s)",
31
+ "\\p{Ll}+",
32
+ false],
33
+ ["(?:[\\s\\(\\[][\\p{L}&&[^rwn]]\\.\\s+)",
34
+ "\\p{Ll}\\p{Ll}|\\p{Lu}[\\p{Punct}\\p{Lu}]",
35
+ false],
36
+ ["(?:\\blit\\.\\s)", "\\p{Ll}\\p{Pe}?\\p{P}?\\s", false],
37
+ ["(?:\\s(?<!\\.)[„\"\\p{Ps}]?[A-Z]\\.\\s)|(?:^[„\"]?[A-ZŚĆŻŹ]\\.\\s)",
38
+ "\\p{Ll}|\\p{Lu}\\p{Ll}\\p{Ll}+",
39
+ false],
40
+ ["(?:\\bproc\\.\\)?\\s)", "\\p{Ll}|\\p{Lu}\\p{Lu}+", false],
41
+ ["(?:\\b\\d+\\.\\s)", "\\p{Ll}|\\p{Lu}{2,}", false],
42
+ ["(?:\\.\\p{Lu}\\p{Ll}\\.\\s?)|(?:\\b\\p{Lu}\\p{Ll}\\.\\s?)",
43
+ "\\p{Lu}[^\\p{Lu}]",
44
+ false],
45
+ ["(?:\\b[pP]\\.\\s)", "\\p{Lu}\\p{Ll}+", false],
46
+ ["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
47
+ ["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
48
+ ["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
49
+ ["(?:[\\.!?…]['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)|(?:\\r?\\n\\s*\\r?\\n[\\t]*)",
50
+ nil,
51
+ true],
52
+ ["(?:[\\.!?…]['»\"”\\p{Pe}]*)", "\\p{Lu}[^\\p{Lu}\\.]", true],
53
+ ["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]]
54
# Alternation of every rule's "before break" pattern (one capture group per
# rule), anchored at the end of the text it is matched against.
BEFORE_RE = /(?:#{RULES.map { |before, _after, _is_break| "(#{before})" }.join("|")})\Z/m

# One [before_regexp, after_regexp, is_break] triple per entry of RULES.
# The "before" pattern is anchored at the end of its subject, the "after"
# pattern at the start; a rule whose "after" pattern is nil gets an empty
# group, which matches any text.
REGEXPS = RULES.map do |before, after, is_break|
  [/(#{before})\Z/m, /\A(#{after})/m, is_break]
end

# Matches the first character of a string, a newline included.
FIRST_CHAR = /\A./m
57
+
58
+
59
# Iterates over the sentences of a text, splitting it with the rules
# compiled into BEFORE_RE and REGEXPS.
#
# The input is scanned one character at a time. At every position the
# characters just consumed (the "before" buffer) and the characters coming
# next (the "after" buffer) are matched against the rules. The first rule
# whose two patterns both match decides whether the position is a sentence
# break (rule flag true) or an explicit non-break (rule flag false).
class Sentence
  include Enumerable

  # Maximum number of characters of context kept on each side of the
  # position being examined.
  BUFFER_LENGTH = 10

  # The character source. It must respond to +pos=+, +eof?+ and +readchar+.
  attr_accessor :input

  # When set to a true value, the buffers are printed at every candidate
  # position and a coloured "<before:after>" marker (red for a break rule,
  # green for a non-break rule) is embedded into the yielded text.
  # Colouring goes through Term::ANSIColor.
  attr_writer :debug

  # text:: a String (wrapped in a read-only StringIO), an already opened
  #        IO-like object, or nil when #input is assigned later.
  def initialize(text=nil)
    if text.is_a?(String)
      @input = StringIO.new(text,"r:utf-8")
    else
      @input = text
    end
  end

  # Yields every sentence of the input as a String, exactly as it appears
  # in the input (nothing is stripped). The input is rewound first, so the
  # method may be called repeatedly. Returns an Enumerator when called
  # without a block.
  def each
    return enum_for(:each) unless block_given?

    @input.pos = 0
    sentence = ""
    before_buffer = ""
    after_buffer = ""
    # Fill the look-ahead buffer, stopping early on inputs shorter than the
    # buffer instead of reading past the end of the input.
    BUFFER_LENGTH.times do
      break if @input.eof?
      after_buffer << @input.readchar
    end

    # Keep scanning until the look-ahead is drained, so that breaks located
    # in the last BUFFER_LENGTH characters of the input are detected too.
    until after_buffer.empty?
      before_re, after_re, is_break = matching_rule(before_buffer, after_buffer)
      if before_re
        if @debug
          color = is_break ? :red : :green
          sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
        end
        if is_break
          yield sentence
          sentence = ""
        end
      end

      # Move the window one character forward.
      before_buffer.slice!(0) if before_buffer.size >= BUFFER_LENGTH
      char = after_buffer.slice!(0)
      before_buffer << char
      sentence << char
      after_buffer << @input.readchar unless @input.eof?
    end

    # Whatever follows the last detected break is the final sentence.
    yield sentence unless sentence.empty?
  end

  private

  # Returns the first [before_regexp, after_regexp, is_break] triple of
  # REGEXPS matching at the current position, or nil when none applies.
  # BEFORE_RE is tried first as a cheap pre-filter over all "before" patterns.
  def matching_rule(before_buffer, after_buffer)
    return nil unless BEFORE_RE =~ before_buffer

    puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}" if @debug
    REGEXPS.find do |before_re, after_re, _is_break|
      before_re =~ before_buffer && after_re =~ after_buffer
    end
  end
end
114
+ end
115
+ end
@@ -0,0 +1,23 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+
4
# Gem specification for srx-polish. The packaged file lists are taken from
# the files tracked by git, so the gem has to be built inside a checkout.
Gem::Specification.new do |s|
  s.name        = "srx-polish"
  s.version     = "0.1.1"
  s.platform    = Gem::Platform::RUBY
  s.authors     = ["Aleksander Pohl"]
  s.email       = ["apohllo@o2.pl"]
  # Repository of this gem, as given in the README (the previous value
  # pointed at the srx2ruby generator project).
  s.homepage    = "https://github.com/apohllo/srx-polish"
  s.summary     = %q{Polish sentence segmentation rules.}
  s.description = %q{Polish sentence segmentation rules based on Marcin Miłkowski's SRX rules.}
  # The README licenses the code under the GNU GPL, version 3 or
  # (at your option) any later version.
  s.licenses    = ["GPL-3.0-or-later"]

  s.rubyforge_project = "srx-polish"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Used by SRX::Polish::Sentence to colour its debug output.
  s.add_dependency("term-ansicolor", ["~> 1.0.5"])
end
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: srx-polish
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.1.1
6
+ platform: ruby
7
+ authors:
8
+ - Aleksander Pohl
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2011-06-02 00:00:00 +02:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: term-ansicolor
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ~>
23
+ - !ruby/object:Gem::Version
24
+ version: 1.0.5
25
+ type: :runtime
26
+ version_requirements: *id001
27
+ description: "Polish sentence segmentation rules based on Marcin Mi\xC5\x82kowski's SRX rules."
28
+ email:
29
+ - apohllo@o2.pl
30
+ executables: []
31
+
32
+ extensions: []
33
+
34
+ extra_rdoc_files: []
35
+
36
+ files:
37
+ - .gitignore
38
+ - README.txt
39
+ - lib/srx/polish/sentence.rb
40
+ - srx-polish.gemspec
41
+ has_rdoc: true
42
+ homepage: http://github.com/apohllo/srx2ruby
43
+ licenses: []
44
+
45
+ post_install_message:
46
+ rdoc_options: []
47
+
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ">="
54
+ - !ruby/object:Gem::Version
55
+ version: "0"
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: "0"
62
+ requirements: []
63
+
64
+ rubyforge_project: srx-polish
65
+ rubygems_version: 1.5.2
66
+ signing_key:
67
+ specification_version: 3
68
+ summary: Polish sentence segmentation rules.
69
+ test_files: []
70
+