srx-polish 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +2 -1
- data/.ruby-version +1 -0
- data/Gemfile +2 -0
- data/changelog.txt +3 -0
- data/features/steps/sentence_splitter.rb +1 -1
- data/features/steps/word_splitter.rb +4 -4
- data/lib/srx/polish/polish-rules.rb +114 -0
- data/lib/srx/polish/sentence_splitter.rb +45 -86
- data/srx-polish.gemspec +3 -3
- metadata +45 -45
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: c1a5a585338642469bee4a679efc4f0d3f12228d
|
4
|
+
data.tar.gz: 5c5df57c6c78b5a89647e4f2ec21af5262dec6e8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 87d75afc302d0debe4aecf899184402eae26082925089d94f13c5a004beb3f3e7ac18b5bb9e8bc44689389165b2119432d27756e74dbace820ddd7b94cc7a7d7
|
7
|
+
data.tar.gz: 29a1b4c730e782b415bae8bb7b5f4d09fd9f126c01724916ce1a80b158ea5f087a608aa63d02f9d9bf831e802ef5e3d50ccb0b9ef281036123786f31ef86cea4
|
data/.gitignore
CHANGED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.1
|
data/Gemfile
ADDED
data/changelog.txt
CHANGED
data/features/steps/sentence_splitter.rb
CHANGED
@@ -12,6 +12,6 @@ end
|
|
12
12
|
|
13
13
|
Then /^the following sentences should be detected$/ do |table|
|
14
14
|
table.hashes.zip(@splitter.to_a).each do |expected,returned|
|
15
|
-
returned.gsub(/\s*\n/," ").strip.
|
15
|
+
expect(returned.gsub(/\s*\n/," ").strip).to eq expected[:sentence]
|
16
16
|
end
|
17
17
|
end
|
data/features/steps/word_splitter.rb
CHANGED
@@ -12,15 +12,15 @@ end
|
|
12
12
|
|
13
13
|
Then /^the following segments should be detected$/ do |table|
|
14
14
|
table.hashes.zip(@splitter.to_a).each do |expected,returned|
|
15
|
-
returned[0].
|
16
|
-
returned[1].
|
15
|
+
expect(returned[0]).to eq expected[:segment].gsub(/'/,"")
|
16
|
+
expect(returned[1]).to eq expected[:type].to_sym
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
20
|
Then /^the following non-blank segments should be detected$/ do |table|
|
21
21
|
segments = @splitter.select{|s| s[1] != :other}
|
22
22
|
table.hashes.zip(segments).each do |expected,returned|
|
23
|
-
returned[0].
|
24
|
-
returned[1].
|
23
|
+
expect(returned[0]).to eq expected[:segment].gsub(/'/,"")
|
24
|
+
expect(returned[1]).to eq expected[:type].to_sym
|
25
25
|
end
|
26
26
|
end
|
data/lib/srx/polish/polish-rules.rb
ADDED
@@ -0,0 +1,114 @@
|
|
1
|
+
module SRX
|
2
|
+
module Polish
|
3
|
+
module Rules
|
4
|
+
def rules
|
5
|
+
@@rules ||=
|
6
|
+
[["(?:\\b[Aa]dw?\\.\\s)|(?:\\bafr\\.\\s)|(?:\\bakad\\.\\s)|(?:\\b[Aa]l\\.\\s)|(?:\\bam\\.\\s)|(?:\\bamer\\.\\s)|(?:\\barch\\.\\s)|(?:\\b[Aa]rt\\.\\s)|(?:\\bartyst\\.\\s)|(?:\\bastr\\.\\s)|(?:\\baustr\\.\\s)|(?:\\bbałt\\.\\s)|(?:\\bbdb\\.\\s)|(?:\\bbł\\.\\s)|(?:\\bbm\\.\\s)",
|
7
|
+
nil,
|
8
|
+
false],
|
9
|
+
["(?:\\bbr\\.\\p{Pe}?\\s)", "[^\\p{Lu}]", false],
|
10
|
+
["(?:\\bbry[gt]\\.\\s)|(?:\\bcentr\\.\\s)|(?:\\bces\\.\\s)|(?:\\bchem\\.\\s)|(?:\\bchiń\\.\\s)|(?:\\bchir\\.\\s)|(?:\\bc\\.k\\.\\s)|(?:\\bc\\.o\\.\\s)|(?:\\bcyg\\.\\s)|(?:\\bcyw\\.\\s)|(?:\\bcyt\\.\\s)|(?:\\bczes\\.\\s)|(?:\\bczw?\\.\\s)|(?:\\b[Cc]d\\.\\s)|(?:\\bczyt\\.\\s)|(?:\\bćw\\.\\s)|(?:\\bćwicz\\.\\s)|(?:\\bdaw\\.\\s)|(?:\\bdcn\\.\\s)|(?:\\bdekl\\.\\s)|(?:\\bdemokr\\.\\s)|(?:\\bdet\\.\\s)|(?:\\bdiec\\.\\s)|(?:\\bdł\\.\\s)|(?:\\bdn\\.\\s)",
|
11
|
+
nil,
|
12
|
+
false],
|
13
|
+
["(?:\\bdo[tlp]\\.\\s)", "[^\\p{Lu}]", false],
|
14
|
+
["(?:\\bdost\\.\\s)|(?:\\bdosł\\.\\s)|(?:\\bh\\.c\\.\\s)|(?:\\bds\\.\\s)|(?:\\bdst\\.\\s)|(?:\\bduszp\\.\\s)|(?:\\bdypl\\.\\s)|(?:\\begz\\.\\s)|(?:\\bekol\\.\\s)|(?:\\bekon\\.\\s)|(?:\\belektr\\.\\s)|(?:\\bem\\.\\s)|(?:\\bew\\.\\s)|(?:\\bfab\\.\\s)|(?:\\bfarm\\.\\s)|(?:\\bfot\\.\\s)|(?:\\bfr\\.\\s)|(?:\\bgat\\.\\s)|(?:\\bgastr\\.\\s)|(?:\\bgeogr\\.\\s)|(?:\\bgeol\\.\\s)|(?:\\bgimn\\.\\s)|(?:\\bgłęb\\.\\s)|(?:\\bgm\\.\\s)|(?:\\bgodz\\.\\s)|(?:\\bgórn\\.\\s)|(?:\\bgosp\\.\\s)|(?:\\bgr\\.\\s)|(?:\\bgram\\.\\s)|(?:\\bhist\\.\\s)|(?:\\bhiszp\\.\\s)|(?:\\b[Hh]r\\.\\s)|(?:\\bhot\\.\\s)|(?:\\bid\\.\\s)|(?:\\bin\\.\\s)|(?:\\bim\\.\\s)|(?:\\biron\\.\\s)|(?:\\bjn\\.\\s)|(?:\\bkard\\.\\s)|(?:\\bkat\\.\\s)|(?:\\bkatol\\.\\s)|(?:\\bk\\.k\\.\\s)|(?:\\bkk\\.\\s)|(?:\\bko?l\\.\\s)|(?:\\bk\\.p\\.a\\.\\s)|(?:\\bkpc\\.\\s)|(?:\\bk\\.p\\.c\\.\\s)|(?:\\bkpt\\.\\s)|(?:\\bkr\\.\\s)|(?:\\bk\\.r\\.\\s)|(?:\\bkrak\\.\\s)|(?:\\bk\\.r\\.o\\.\\s)|(?:\\bkryt\\.\\s)|(?:\\bkult\\.\\s)|(?:\\blaic\\.\\s)|(?:\\błac\\.\\s)|(?:\\bniem\\.\\s)|(?:\\bwoj\\.\\s)|(?:\\b[Nn][bp]\\.\\s)|(?:\\bpo[lw]\\.\\s)|(?:\\bm\\.in\\.\\s)|(?:\\b[Pp][ts]\\.\\s)|(?:\\bcdn\\.\\s)|(?:\\bjw\\.\\s)|(?:\\b[Rr]y[cs]\\.\\s)|(?:\\btj\\.\\s)|(?:\\b[Tt]zw\\.\\s)|(?:\\btzn\\.\\s)|(?:\\b[Zz]ob\\.\\s)|(?:\\bsłow\\.\\s)",
|
15
|
+
nil,
|
16
|
+
false],
|
17
|
+
["(?:[^s]\\.pl\\.\\s)", "\\p{Lu}\\p{Ll}+", true],
|
18
|
+
["(?:\\bp[wnl]\\.\\s)|(?:\\bang\\.\\s)|(?:\\bu[lb]\\.\\s?)|(?:\\bal\\.\\s)|(?:\\bk\\.\\s)|(?:\\bn\\.\\s)",
|
19
|
+
nil,
|
20
|
+
false],
|
21
|
+
["(?:\\b[Oo]k\\.\\s)", "[\\p{Ll}\\d]", false],
|
22
|
+
["(?:\\b[\\d,\\.]*ty[sś]\\.\\p{Pe}?\\s)", "[\\p{Ll}\\d]+", false],
|
23
|
+
["(?:\\b[Nn]r\\.\\s)", "\\d", false],
|
24
|
+
["(?:\\bw[wł]\\.\\s)|(?:\\bur\\.\\s)|(?:\\bzm\\.\\s)|(?:\\bżyd\\.\\s)|(?:\\bżarg\\.\\s)|(?:\\bżyw\\.\\s)|(?:\\bwy[łdm]\\.\\s)|(?:\\b[bu]p\\.\\s)|(?:\\bwyst\\.\\s)",
|
25
|
+
nil,
|
26
|
+
false],
|
27
|
+
["(?:\\b(?i)mazeł\\stow\\.\\s)", nil, true],
|
28
|
+
["(?:\\b[Tt]ow\\.\\s)|(?:\\bo\\.\\s)|(?:\\b(?:[sS]p|st|[Ss]półdz|społ|spółgł|[Ss]to[łw])\\.\\s)|(?:\\bzn\\.\\s)|(?:\\bzew\\.\\s)|(?:\\bzewn\\.\\s)|(?:\\bzdr\\.\\s)|(?:\\bzazw\\.\\s)|(?:\\bzast\\.\\s)|(?:\\bzaw\\.\\s)|(?:\\bzał\\.\\s)|(?:\\bzal\\.\\s)|(?:\\bzam\\.\\s)|(?:\\bzak\\.\\s)|(?:\\bzakł\\.\\s)|(?:\\bzagr\\.\\s)|(?:\\bzach\\.\\s)|(?:\\b[Aa]dw\\.\\s)|(?:\\b[Ll]ek\\.\\s)|(?:\\bmed\\.\\s)|(?:\\b[Mm]ec\\.\\s)|(?:\\b[Dd]oc\\.\\s)|(?:\\b[Dd]y[wr]\\.\\s)|(?:\\b[Ii]nż\\.\\s)|(?:\\b(?i)mgr\\.\\s)|(?:\\b[dD][hr]\\.\\s)",
|
29
|
+
nil,
|
30
|
+
false],
|
31
|
+
["(?:\\b[pP]\\.\\s)", "\\p{Lu}\\p{Ll}+", false],
|
32
|
+
["(?:\\b[Rr]ed\\.\\)?\\s)|(?:\\b(?i)pro[fk]\\.\\s)|(?:\\bhab\\.\\s)|(?:\\b[Pp]łk\\.\\s)|(?:\\b(?:[Nn]a|[Pp]o)dkom\\.\\s)|(?:\\b[kK]s\\.\\s)|(?:\\b(?i)gen\\.\\s)|(?:\\b(?i)por\\.\\s)|(?:\\b[Rr]eż\\.\\s)|(?:\\b[Pp]rzyp\\.\\s)|(?:\\bp\\.n\\.e\\.\\s)|(?:\\bdyr\\.\\smuz\\.\\s)|(?:\\b[śŚ][pwW]\\.\\s)",
|
33
|
+
nil,
|
34
|
+
false],
|
35
|
+
["(?:\\bII?\\społ\\.\\s)", "[IVX]+", false],
|
36
|
+
["(?:\\betc\\.\\s)", "\\p{Ll}", false],
|
37
|
+
["(?:\\bit[dp]\\.\\s)", "[„”\"]?\\p{Ll}", false],
|
38
|
+
["(?:\\bproc\\.\\)?\\s)", "\\p{Ll}|\\p{Lu}\\p{Lu}+", false],
|
39
|
+
["(?:\\b[rwn]\\.\\s)|(?:\\bit[dp]\\.\\s)|(?:\\bcdn\\.\\s)", "\\p{Ll}", false],
|
40
|
+
["(?:\\b[Ss]zer\\.\\s)", nil, false],
|
41
|
+
["(?:\\bjw\\.\\s)|(?:\\bn\\.e\\.\\s)|(?:\\bw\\.\\s)|(?:\\bn[tn]\\.\\s)|(?:\\b[dm]l\\.\\s)|(?:\\bdag\\.\\s)|(?:\\b[cd]?m\\.\\s)|(?:\\b[Zz][Łł]\\.\\s)|(?:\\b[Gg][rR]\\.\\s)|(?:\\b[d]?kg\\.\\s)|(?:\\bml[nd]\\.\\s)|(?:\\bnpl\\.\\s)",
|
42
|
+
"\\p{Ll}",
|
43
|
+
false],
|
44
|
+
["(?:\\bpkt\\.\\s)|(?:\\bstr\\.\\s)|(?:\\b[Tt]ab\\.\\s)",
|
45
|
+
"[\\p{Ll}\\d]",
|
46
|
+
false],
|
47
|
+
["(?:\\btel\\.\\s)", nil, false],
|
48
|
+
["(?:\\b[ptw]g\\.\\s)|(?:\\bcos\\.\\s)|(?:\\bcosec\\.\\s)|(?:\\bsec\\.\\s)|(?:\\bsin\\.\\s)|(?:\\brkm\\.\\s)",
|
49
|
+
"\\p{Ll}",
|
50
|
+
false],
|
51
|
+
["(?:\\bust\\.\\s)|(?:\\bpar\\.\\s)", "\\d", false],
|
52
|
+
["(?:\\blit\\.\\s)", "\\p{Ll}\\p{Pe}?\\p{P}?\\s", false],
|
53
|
+
["(?:\\b[Pp]on\\.\\s)|(?:\\b[Ss]ob\\.\\s)|(?:\\bba!\\s)", "\\p{Ll}", false],
|
54
|
+
["(?:\\bpo[zk]\\.\\s)", nil, false],
|
55
|
+
["(?:\\bop\\.\\s)", "cit\\.", false],
|
56
|
+
["(?:\\b[Oo][Oo]\\.\\s)", nil, false],
|
57
|
+
["(?:\\b(?:[CDSR]z|Ch)\\.\\s)", "\\p{Ll}", false],
|
58
|
+
["(?:\\b[rls]\\.\\s)", "[1-9]+", false],
|
59
|
+
["(?:\\((?:od|do|w)\\s[1-9]\\d*\\sr\\.\\s)", nil, false],
|
60
|
+
["(?:\\d+\\sr\\.\\s)|(?:\\bn\\.e\\.\\s)", "[^\\p{Lu}]", false],
|
61
|
+
["(?:\\bt\\.\\s)", "\\d+", false],
|
62
|
+
["(?:\\b[,uoi]\\ss\\.\\s)|(?:\\b[Nn]ajśw\\.\\s)|(?:\\b[Nn]a\\sos\\.\\s)",
|
63
|
+
nil,
|
64
|
+
false],
|
65
|
+
["(?:\\bT\\.)", "Love\\b", false],
|
66
|
+
["(?:\\bpl\\.)", "Open[oO]ffice", false],
|
67
|
+
["(?:\\bha\\.\\s)", "[\\p{Ll}]", false],
|
68
|
+
["(?:\\bmin\\.\\s)", "[\\p{Ll}\\d]", false],
|
69
|
+
["(?:\\bW\\s\\d{4}\\sr\\.\\s)", "[\\p{Lu}–—„\"-]", false],
|
70
|
+
["(?:\\b\\d+\\.\\s)", "\\p{Ll}|\\p{Lu}{2,}", false],
|
71
|
+
["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
|
72
|
+
["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
|
73
|
+
["(?:\\.\\p{L}+\\.\\s)", "\\p{Ll}", false],
|
74
|
+
["(?:[\\s\\(\\[][\\p{L}&&[^rwn]]\\.\\s+)",
|
75
|
+
"\\p{Ll}\\p{Ll}|\\p{Lu}[\\p{Punct}\\p{Lu}]",
|
76
|
+
false],
|
77
|
+
["(?:['\"„][\\.!?…]['\"”]\\s)", nil, false],
|
78
|
+
["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
|
79
|
+
["(?:[!?]+\\p{Pe} )", "\\p{Ll}", false],
|
80
|
+
["(?:[\\p{Ps}][!?]+[\\p{Pe}] )", nil, false],
|
81
|
+
["(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )|(?:[\\[\\(]*…[\\]\\)]* )",
|
82
|
+
"\\p{Ll}",
|
83
|
+
false],
|
84
|
+
["(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)",
|
85
|
+
nil,
|
86
|
+
false],
|
87
|
+
["(?:\\b\\p{Lu}\\p{Ll}\\.\\s?)|(?:\\.\\p{Lu}\\p{Ll}\\.\\s?)",
|
88
|
+
"\\p{Lu}[^\\p{Lu}]",
|
89
|
+
false],
|
90
|
+
["(?:\\s(?<!\\.)[„\"\\p{Ps}]?[A-Z]\\.\\s)|(?:^[„\"]?[A-ZŚĆŻŹ]\\.\\s)",
|
91
|
+
"\\p{Ll}|\\p{Lu}\\p{Ll}\\p{Ll}+",
|
92
|
+
false],
|
93
|
+
["(?:\\(\\p{Lu}?\\p{Ll}+\\.\\s)", nil, false],
|
94
|
+
["(?:\\s[A-Z]\\.\\s)", "\\(\\d", false],
|
95
|
+
["(?:\\bDz\\.\\s?U\\.\\s)", nil, false],
|
96
|
+
["(?:[\\p{Ll}&&[^aeouiyęąó]][\\p{Ll}&&[^aeouiyęąó]]+\\.\\s)",
|
97
|
+
"\\p{Ll}+",
|
98
|
+
false],
|
99
|
+
["(?:[\\.!?…]+['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)",
|
100
|
+
"[\\p{Ps}–—-]\\s?\\p{Ll}",
|
101
|
+
false],
|
102
|
+
["(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\p{Pe}\\s)",
|
103
|
+
"\\p{Ll}",
|
104
|
+
false],
|
105
|
+
["(?:[\\.!?…]['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002¹²³]*\\s)",
|
106
|
+
nil,
|
107
|
+
true],
|
108
|
+
["(?:[\\.!?…]['»\"”\\p{Pe}]*)", "\\p{Lu}[^\\p{Lu}\\.]", true],
|
109
|
+
["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true],
|
110
|
+
["(?:: )", "[—\\-–] \\p{Lu}", true]]
|
111
|
+
end
|
112
|
+
end
|
113
|
+
end
|
114
|
+
end
|
data/lib/srx/polish/sentence_splitter.rb
CHANGED
@@ -1,58 +1,13 @@
|
|
1
1
|
#encoding: utf-8
|
2
2
|
require 'stringio'
|
3
3
|
require 'term/ansicolor'
|
4
|
+
require_relative 'polish-rules'
|
4
5
|
module SRX
|
5
6
|
module Polish
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
["(?:\\b[Oo]k\\.\\s)", "[\\p{Ll}\\d]", false],
|
11
|
-
["(?:\\b[Nn]r\\.\\s)", "\\d", false],
|
12
|
-
["(?:\\b(?i)mazeł\\stow\\.\\s)", nil, true],
|
13
|
-
["(?:['\"„][\\.!?…]['\"”]\\s)|(?:[\\p{Ps}][!?]+[\\p{Pe}] )|(?:\\b(?:[Nn]a|[Pp]o)dkom\\.\\s)|(?:\\b(?:[sS]p|st|[Ss]półdz|społ|spółgł|[Ss]to[łw])\\.\\s)|(?:\\b(?i)gen\\.\\s)|(?:\\b(?i)mgr\\.\\s)|(?:\\b(?i)por\\.\\s)|(?:\\b(?i)pro[fk]\\.\\s)|(?:\\bDz\\.\\s?U\\.\\s)|(?:\\b[,uoi]\\ss\\.\\s)|(?:\\b[Aa]dw\\.\\s)|(?:\\b[Dd]oc\\.\\s)|(?:\\b[Dd]y[wr]\\.\\s)|(?:\\b[Ii]nż\\.\\s)|(?:\\b[Ll]ek\\.\\s)|(?:\\b[Mm]ec\\.\\s)|(?:\\b[Nn]a\\sos\\.\\s)|(?:\\b[Nn]ajśw\\.\\s)|(?:\\b[Oo][Oo]\\.\\s)|(?:\\b[Pp]rzyp\\.\\s)|(?:\\b[Pp]łk\\.\\s)|(?:\\b[Rr]ed\\.\\)?\\s)|(?:\\b[Rr]eż\\.\\s)|(?:\\b[Ss]zer\\.\\s)|(?:\\b[Tt]ow\\.\\s)|(?:\\b[dD][hr]\\.\\s)|(?:\\b[kK]s\\.\\s)|(?:\\b[śŚ][pwW]\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bhab\\.\\s)|(?:\\bmed\\.\\s)|(?:\\bo\\.\\s)|(?:\\bpo[zk]\\.\\s)|(?:\\btel\\.\\s)|(?:\\bzach\\.\\s)|(?:\\bzagr\\.\\s)|(?:\\bzak\\.\\s)|(?:\\bzakł\\.\\s)|(?:\\bzal\\.\\s)|(?:\\bzam\\.\\s)|(?:\\bzast\\.\\s)|(?:\\bzaw\\.\\s)|(?:\\bzazw\\.\\s)|(?:\\bzał\\.\\s)|(?:\\bzdr\\.\\s)|(?:\\bzew\\.\\s)|(?:\\bzewn\\.\\s)|(?:\\bzn\\.\\s)",
|
14
|
-
nil,
|
15
|
-
false],
|
16
|
-
["(?:\\b[rls]\\.\\s)", "[1-9]+", false],
|
17
|
-
["(?:\\b[Tt]ab\\.\\s)|(?:\\bmin\\.\\s)|(?:\\bpkt\\.\\s)|(?:\\bstr\\.\\s)",
|
18
|
-
"[\\p{Ll}\\d]",
|
19
|
-
false],
|
20
|
-
["(?:\\bW\\s\\d{4}\\sr\\.\\s)", "[\\p{Lu}–—„\"-]", false],
|
21
|
-
["(?:[\\.!?…]+['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)",
|
22
|
-
"[\\p{Ps}–—-]\\s?\\p{Ll}",
|
23
|
-
false],
|
24
|
-
["(?:\\bit[dp]\\.\\s)", "[„”\"]?\\p{Ll}", false],
|
25
|
-
["(?:\\s[A-Z]\\.\\s)", "\\(\\d", false],
|
26
|
-
["(?:\\bpar\\.\\s)|(?:\\bust\\.\\s)", "\\d", false],
|
27
|
-
["(?:[!?]+\\p{Pe} )|(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\p{Pe}\\s)|(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )|(?:[\\[\\(]*…[\\]\\)]* )|(?:\\.\\p{L}+\\.\\s)|(?:\\b(?:[CDSR]z|Ch)\\.\\s)|(?:\\b[Gg][rR]\\.\\s)|(?:\\b[Pp]on\\.\\s)|(?:\\b[Ss]ob\\.\\s)|(?:\\b[Zz][Łł]\\.\\s)|(?:\\b[cd]?m\\.\\s)|(?:\\b[d]?kg\\.\\s)|(?:\\b[dm]l\\.\\s)|(?:\\b[ptw]g\\.\\s)|(?:\\b[rwn]\\.\\s)|(?:\\bba!\\s)|(?:\\bcdn\\.\\s)|(?:\\bcos\\.\\s)|(?:\\bcosec\\.\\s)|(?:\\bdag\\.\\s)|(?:\\betc\\.\\s)|(?:\\bit[dp]\\.\\s)|(?:\\bjw\\.\\s)|(?:\\bml[nd]\\.\\s)|(?:\\bn[tn]\\.\\s)|(?:\\bn\\.e\\.\\s)|(?:\\bnpl\\.\\s)|(?:\\brkm\\.\\s)|(?:\\bsec\\.\\s)|(?:\\bsin\\.\\s)|(?:\\bw\\.\\s)",
|
28
|
-
"\\p{Ll}",
|
29
|
-
false],
|
30
|
-
["(?:[\\p{Ll}&&[^aeouiyęąó]][\\p{Ll}&&[^aeouiyęąó]]+\\.\\s)",
|
31
|
-
"\\p{Ll}+",
|
32
|
-
false],
|
33
|
-
["(?:[\\s\\(\\[][\\p{L}&&[^rwn]]\\.\\s+)",
|
34
|
-
"\\p{Ll}\\p{Ll}|\\p{Lu}[\\p{Punct}\\p{Lu}]",
|
35
|
-
false],
|
36
|
-
["(?:\\blit\\.\\s)", "\\p{Ll}\\p{Pe}?\\p{P}?\\s", false],
|
37
|
-
["(?:\\s(?<!\\.)[„\"\\p{Ps}]?[A-Z]\\.\\s)|(?:^[„\"]?[A-ZŚĆŻŹ]\\.\\s)",
|
38
|
-
"\\p{Ll}|\\p{Lu}\\p{Ll}\\p{Ll}+",
|
39
|
-
false],
|
40
|
-
["(?:\\bproc\\.\\)?\\s)", "\\p{Ll}|\\p{Lu}\\p{Lu}+", false],
|
41
|
-
["(?:\\b\\d+\\.\\s)", "\\p{Ll}|\\p{Lu}{2,}", false],
|
42
|
-
["(?:\\.\\p{Lu}\\p{Ll}\\.\\s?)|(?:\\b\\p{Lu}\\p{Ll}\\.\\s?)",
|
43
|
-
"\\p{Lu}[^\\p{Lu}]",
|
44
|
-
false],
|
45
|
-
["(?:\\b[pP]\\.\\s)", "\\p{Lu}\\p{Ll}+", false],
|
46
|
-
["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
|
47
|
-
["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
|
48
|
-
["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
|
49
|
-
["(?:[\\.!?…]['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)|(?:\\r?\\n\\s*\\r?\\n[\\t]*)",
|
50
|
-
nil,
|
51
|
-
true],
|
52
|
-
["(?:[\\.!?…]['»\"”\\p{Pe}]*)", "\\p{Lu}[^\\p{Lu}\\.]", true],
|
53
|
-
["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]]
|
54
|
-
BEFORE_RE = /(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\Z/m
|
55
|
-
REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
|
7
|
+
extend Rules
|
8
|
+
|
9
|
+
BEFORE_RE = /(?:#{self.rules.map{|s,e,v| "(#{s})"}.join("|")})\Z/m
|
10
|
+
REGEXPS = self.rules.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
|
56
11
|
FIRST_CHAR = /\A./m
|
57
12
|
|
58
13
|
class SentenceSplitter
|
@@ -67,50 +22,54 @@ module SRX
|
|
67
22
|
else
|
68
23
|
@input = text
|
69
24
|
end
|
25
|
+
@debug = false
|
70
26
|
end
|
71
27
|
|
72
28
|
def each
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
29
|
+
if block_given?
|
30
|
+
raise "Invalid argument - text is nil" if @input.nil?
|
31
|
+
buffer_length = 10
|
32
|
+
sentence = ""
|
33
|
+
before_buffer = ""
|
34
|
+
@input.pos = 0
|
35
|
+
after_buffer = buffer_length.times.map{|i| @input.readchar }.join("")
|
36
|
+
while(!@input.eof?) do
|
37
|
+
matched_before = BEFORE_RE.match(before_buffer)
|
38
|
+
break_detected = false
|
39
|
+
if matched_before
|
40
|
+
(matched_before.size - 1).times.find do |index|
|
41
|
+
matched_before[index+1]
|
42
|
+
end
|
43
|
+
if @debug
|
44
|
+
puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
|
45
|
+
end
|
46
|
+
REGEXPS.each do |before_re,after_re,value|
|
47
|
+
# skip the whole match
|
48
|
+
if before_re.match(before_buffer) && after_re.match(after_buffer)
|
49
|
+
break_detected = true
|
50
|
+
if @debug
|
51
|
+
color = value ? :red : :green
|
52
|
+
sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
|
53
|
+
end
|
54
|
+
if value
|
55
|
+
yield sentence
|
56
|
+
sentence = ""
|
57
|
+
end
|
58
|
+
break
|
101
59
|
end
|
102
|
-
break
|
103
60
|
end
|
104
61
|
end
|
62
|
+
next_after = @input.readchar
|
63
|
+
before_buffer.sub!(FIRST_CHAR,"") if before_buffer.size >= buffer_length
|
64
|
+
after_buffer.sub!(FIRST_CHAR,"")
|
65
|
+
before_buffer << $&
|
66
|
+
sentence << $&
|
67
|
+
after_buffer << next_after
|
105
68
|
end
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
before_buffer << $&
|
110
|
-
sentence << $&
|
111
|
-
after_buffer << next_after
|
69
|
+
yield sentence + after_buffer unless sentence.empty? || after_buffer.empty?
|
70
|
+
else
|
71
|
+
enum_for(:each)
|
112
72
|
end
|
113
|
-
yield sentence + after_buffer unless sentence.empty? || after_buffer.empty?
|
114
73
|
end
|
115
74
|
end
|
116
75
|
end
|
data/srx-polish.gemspec
CHANGED
@@ -3,9 +3,9 @@ $:.push File.expand_path("../lib", __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |s|
|
5
5
|
s.name = "srx-polish"
|
6
|
-
s.version = "0.2.1"
|
6
|
+
s.version = "0.3.0"
|
7
7
|
s.platform = Gem::Platform::RUBY
|
8
|
-
s.authors = ["Aleksander Pohl"]
|
8
|
+
s.authors = ["Aleksander Smywiński-Pohl"]
|
9
9
|
s.email = ["apohllo@o2.pl"]
|
10
10
|
s.homepage = "http://github.com/apohllo/srx2ruby"
|
11
11
|
s.summary = %q{Polish sentence and word segmentation rules.}
|
@@ -19,5 +19,5 @@ Gem::Specification.new do |s|
|
|
19
19
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
20
20
|
s.require_paths = ["lib"]
|
21
21
|
|
22
|
-
s.add_dependency
|
22
|
+
s.add_dependency 'term-ansicolor', '~> 1.0', '>= 1.0.5'
|
23
23
|
end
|
metadata
CHANGED
@@ -1,77 +1,77 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: srx-polish
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
version: 0.2.1
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.0
|
6
5
|
platform: ruby
|
7
|
-
authors:
|
8
|
-
- Aleksander Pohl
|
6
|
+
authors:
|
7
|
+
- Aleksander Smywiński-Pohl
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
- !ruby/object:Gem::Dependency
|
11
|
+
date: 2018-07-15 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
16
14
|
name: term-ansicolor
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.0'
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
23
22
|
version: 1.0.5
|
24
23
|
type: :runtime
|
25
|
-
|
26
|
-
|
27
|
-
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.0'
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.0.5
|
33
|
+
description: Polish sentence and word segmentation rules. The sentence segmentation
|
34
|
+
rules are based on Marcin Miłkowski's SRX rules.
|
35
|
+
email:
|
28
36
|
- apohllo@o2.pl
|
29
37
|
executables: []
|
30
|
-
|
31
38
|
extensions: []
|
32
|
-
|
33
39
|
extra_rdoc_files: []
|
34
|
-
|
35
|
-
|
36
|
-
- .
|
40
|
+
files:
|
41
|
+
- ".gitignore"
|
42
|
+
- ".ruby-version"
|
43
|
+
- Gemfile
|
37
44
|
- README.rdoc
|
38
45
|
- changelog.txt
|
39
46
|
- features/sentence_splitter.feature
|
40
47
|
- features/steps/sentence_splitter.rb
|
41
48
|
- features/steps/word_splitter.rb
|
42
49
|
- features/word_splitter.feature
|
50
|
+
- lib/srx/polish/polish-rules.rb
|
43
51
|
- lib/srx/polish/sentence_splitter.rb
|
44
52
|
- lib/srx/polish/word_splitter.rb
|
45
53
|
- srx-polish.gemspec
|
46
54
|
homepage: http://github.com/apohllo/srx2ruby
|
47
55
|
licenses: []
|
48
|
-
|
56
|
+
metadata: {}
|
49
57
|
post_install_message:
|
50
58
|
rdoc_options: []
|
51
|
-
|
52
|
-
require_paths:
|
59
|
+
require_paths:
|
53
60
|
- lib
|
54
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
-
|
56
|
-
requirements:
|
61
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
57
63
|
- - ">="
|
58
|
-
- !ruby/object:Gem::Version
|
59
|
-
version:
|
60
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
61
|
-
|
62
|
-
requirements:
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
67
|
+
requirements:
|
63
68
|
- - ">="
|
64
|
-
- !ruby/object:Gem::Version
|
65
|
-
version:
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
version: '0'
|
66
71
|
requirements: []
|
67
|
-
|
68
72
|
rubyforge_project: srx-polish
|
69
|
-
rubygems_version:
|
73
|
+
rubygems_version: 2.6.14
|
70
74
|
signing_key:
|
71
|
-
specification_version:
|
75
|
+
specification_version: 4
|
72
76
|
summary: Polish sentence and word segmentation rules.
|
73
|
-
test_files:
|
74
|
-
- features/sentence_splitter.feature
|
75
|
-
- features/steps/sentence_splitter.rb
|
76
|
-
- features/steps/word_splitter.rb
|
77
|
-
- features/word_splitter.feature
|
77
|
+
test_files: []
|