srx-polish 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c1a5a585338642469bee4a679efc4f0d3f12228d
4
+ data.tar.gz: 5c5df57c6c78b5a89647e4f2ec21af5262dec6e8
5
+ SHA512:
6
+ metadata.gz: 87d75afc302d0debe4aecf899184402eae26082925089d94f13c5a004beb3f3e7ac18b5bb9e8bc44689389165b2119432d27756e74dbace820ddd7b94cc7a7d7
7
+ data.tar.gz: 29a1b4c730e782b415bae8bb7b5f4d09fd9f126c01724916ce1a80b158ea5f087a608aa63d02f9d9bf831e802ef5e3d50ccb0b9ef281036123786f31ef86cea4
data/.gitignore CHANGED
@@ -1,4 +1,5 @@
1
- .*.sw?
2
1
  *.gem
2
+ .*.sw?
3
+ Gemfile.lock
3
4
  pkg
4
5
  work
@@ -0,0 +1 @@
1
+ 2.1
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
@@ -1,3 +1,6 @@
1
+ 0.3.0
2
+ - rule update (as of 07/2018)
3
+ - all rules are in separate file to reduce code clutter
1
4
  0.2.1
2
5
  - fix extension of README (was md, should be rdoc)
3
6
  0.2.0
@@ -12,6 +12,6 @@ end
12
12
 
13
13
  Then /^the following sentences should be detected$/ do |table|
14
14
  table.hashes.zip(@splitter.to_a).each do |expected,returned|
15
- returned.gsub(/\s*\n/," ").strip.should == expected[:sentence]
15
+ expect(returned.gsub(/\s*\n/," ").strip).to eq expected[:sentence]
16
16
  end
17
17
  end
@@ -12,15 +12,15 @@ end
12
12
 
13
13
  Then /^the following segments should be detected$/ do |table|
14
14
  table.hashes.zip(@splitter.to_a).each do |expected,returned|
15
- returned[0].should == expected[:segment].gsub(/'/,"")
16
- returned[1].should == expected[:type].to_sym
15
+ expect(returned[0]).to eq expected[:segment].gsub(/'/,"")
16
+ expect(returned[1]).to eq expected[:type].to_sym
17
17
  end
18
18
  end
19
19
 
20
20
  Then /^the following non-blank segments should be detected$/ do |table|
21
21
  segments = @splitter.select{|s| s[1] != :other}
22
22
  table.hashes.zip(segments).each do |expected,returned|
23
- returned[0].should == expected[:segment].gsub(/'/,"")
24
- returned[1].should == expected[:type].to_sym
23
+ expect(returned[0]).to eq expected[:segment].gsub(/'/,"")
24
+ expect(returned[1]).to eq expected[:type].to_sym
25
25
  end
26
26
  end
@@ -0,0 +1,114 @@
1
+ module SRX
2
+ module Polish
3
+ module Rules
4
+ def rules
5
+ @@rules ||=
6
+ [["(?:\\b[Aa]dw?\\.\\s)|(?:\\bafr\\.\\s)|(?:\\bakad\\.\\s)|(?:\\b[Aa]l\\.\\s)|(?:\\bam\\.\\s)|(?:\\bamer\\.\\s)|(?:\\barch\\.\\s)|(?:\\b[Aa]rt\\.\\s)|(?:\\bartyst\\.\\s)|(?:\\bastr\\.\\s)|(?:\\baustr\\.\\s)|(?:\\bbałt\\.\\s)|(?:\\bbdb\\.\\s)|(?:\\bbł\\.\\s)|(?:\\bbm\\.\\s)",
7
+ nil,
8
+ false],
9
+ ["(?:\\bbr\\.\\p{Pe}?\\s)", "[^\\p{Lu}]", false],
10
+ ["(?:\\bbry[gt]\\.\\s)|(?:\\bcentr\\.\\s)|(?:\\bces\\.\\s)|(?:\\bchem\\.\\s)|(?:\\bchiń\\.\\s)|(?:\\bchir\\.\\s)|(?:\\bc\\.k\\.\\s)|(?:\\bc\\.o\\.\\s)|(?:\\bcyg\\.\\s)|(?:\\bcyw\\.\\s)|(?:\\bcyt\\.\\s)|(?:\\bczes\\.\\s)|(?:\\bczw?\\.\\s)|(?:\\b[Cc]d\\.\\s)|(?:\\bczyt\\.\\s)|(?:\\bćw\\.\\s)|(?:\\bćwicz\\.\\s)|(?:\\bdaw\\.\\s)|(?:\\bdcn\\.\\s)|(?:\\bdekl\\.\\s)|(?:\\bdemokr\\.\\s)|(?:\\bdet\\.\\s)|(?:\\bdiec\\.\\s)|(?:\\bdł\\.\\s)|(?:\\bdn\\.\\s)",
11
+ nil,
12
+ false],
13
+ ["(?:\\bdo[tlp]\\.\\s)", "[^\\p{Lu}]", false],
14
+ ["(?:\\bdost\\.\\s)|(?:\\bdosł\\.\\s)|(?:\\bh\\.c\\.\\s)|(?:\\bds\\.\\s)|(?:\\bdst\\.\\s)|(?:\\bduszp\\.\\s)|(?:\\bdypl\\.\\s)|(?:\\begz\\.\\s)|(?:\\bekol\\.\\s)|(?:\\bekon\\.\\s)|(?:\\belektr\\.\\s)|(?:\\bem\\.\\s)|(?:\\bew\\.\\s)|(?:\\bfab\\.\\s)|(?:\\bfarm\\.\\s)|(?:\\bfot\\.\\s)|(?:\\bfr\\.\\s)|(?:\\bgat\\.\\s)|(?:\\bgastr\\.\\s)|(?:\\bgeogr\\.\\s)|(?:\\bgeol\\.\\s)|(?:\\bgimn\\.\\s)|(?:\\bgłęb\\.\\s)|(?:\\bgm\\.\\s)|(?:\\bgodz\\.\\s)|(?:\\bgórn\\.\\s)|(?:\\bgosp\\.\\s)|(?:\\bgr\\.\\s)|(?:\\bgram\\.\\s)|(?:\\bhist\\.\\s)|(?:\\bhiszp\\.\\s)|(?:\\b[Hh]r\\.\\s)|(?:\\bhot\\.\\s)|(?:\\bid\\.\\s)|(?:\\bin\\.\\s)|(?:\\bim\\.\\s)|(?:\\biron\\.\\s)|(?:\\bjn\\.\\s)|(?:\\bkard\\.\\s)|(?:\\bkat\\.\\s)|(?:\\bkatol\\.\\s)|(?:\\bk\\.k\\.\\s)|(?:\\bkk\\.\\s)|(?:\\bko?l\\.\\s)|(?:\\bk\\.p\\.a\\.\\s)|(?:\\bkpc\\.\\s)|(?:\\bk\\.p\\.c\\.\\s)|(?:\\bkpt\\.\\s)|(?:\\bkr\\.\\s)|(?:\\bk\\.r\\.\\s)|(?:\\bkrak\\.\\s)|(?:\\bk\\.r\\.o\\.\\s)|(?:\\bkryt\\.\\s)|(?:\\bkult\\.\\s)|(?:\\blaic\\.\\s)|(?:\\błac\\.\\s)|(?:\\bniem\\.\\s)|(?:\\bwoj\\.\\s)|(?:\\b[Nn][bp]\\.\\s)|(?:\\bpo[lw]\\.\\s)|(?:\\bm\\.in\\.\\s)|(?:\\b[Pp][ts]\\.\\s)|(?:\\bcdn\\.\\s)|(?:\\bjw\\.\\s)|(?:\\b[Rr]y[cs]\\.\\s)|(?:\\btj\\.\\s)|(?:\\b[Tt]zw\\.\\s)|(?:\\btzn\\.\\s)|(?:\\b[Zz]ob\\.\\s)|(?:\\bsłow\\.\\s)",
15
+ nil,
16
+ false],
17
+ ["(?:[^s]\\.pl\\.\\s)", "\\p{Lu}\\p{Ll}+", true],
18
+ ["(?:\\bp[wnl]\\.\\s)|(?:\\bang\\.\\s)|(?:\\bu[lb]\\.\\s?)|(?:\\bal\\.\\s)|(?:\\bk\\.\\s)|(?:\\bn\\.\\s)",
19
+ nil,
20
+ false],
21
+ ["(?:\\b[Oo]k\\.\\s)", "[\\p{Ll}\\d]", false],
22
+ ["(?:\\b[\\d,\\.]*ty[sś]\\.\\p{Pe}?\\s)", "[\\p{Ll}\\d]+", false],
23
+ ["(?:\\b[Nn]r\\.\\s)", "\\d", false],
24
+ ["(?:\\bw[wł]\\.\\s)|(?:\\bur\\.\\s)|(?:\\bzm\\.\\s)|(?:\\bżyd\\.\\s)|(?:\\bżarg\\.\\s)|(?:\\bżyw\\.\\s)|(?:\\bwy[łdm]\\.\\s)|(?:\\b[bu]p\\.\\s)|(?:\\bwyst\\.\\s)",
25
+ nil,
26
+ false],
27
+ ["(?:\\b(?i)mazeł\\stow\\.\\s)", nil, true],
28
+ ["(?:\\b[Tt]ow\\.\\s)|(?:\\bo\\.\\s)|(?:\\b(?:[sS]p|st|[Ss]półdz|społ|spółgł|[Ss]to[łw])\\.\\s)|(?:\\bzn\\.\\s)|(?:\\bzew\\.\\s)|(?:\\bzewn\\.\\s)|(?:\\bzdr\\.\\s)|(?:\\bzazw\\.\\s)|(?:\\bzast\\.\\s)|(?:\\bzaw\\.\\s)|(?:\\bzał\\.\\s)|(?:\\bzal\\.\\s)|(?:\\bzam\\.\\s)|(?:\\bzak\\.\\s)|(?:\\bzakł\\.\\s)|(?:\\bzagr\\.\\s)|(?:\\bzach\\.\\s)|(?:\\b[Aa]dw\\.\\s)|(?:\\b[Ll]ek\\.\\s)|(?:\\bmed\\.\\s)|(?:\\b[Mm]ec\\.\\s)|(?:\\b[Dd]oc\\.\\s)|(?:\\b[Dd]y[wr]\\.\\s)|(?:\\b[Ii]nż\\.\\s)|(?:\\b(?i)mgr\\.\\s)|(?:\\b[dD][hr]\\.\\s)",
29
+ nil,
30
+ false],
31
+ ["(?:\\b[pP]\\.\\s)", "\\p{Lu}\\p{Ll}+", false],
32
+ ["(?:\\b[Rr]ed\\.\\)?\\s)|(?:\\b(?i)pro[fk]\\.\\s)|(?:\\bhab\\.\\s)|(?:\\b[Pp]łk\\.\\s)|(?:\\b(?:[Nn]a|[Pp]o)dkom\\.\\s)|(?:\\b[kK]s\\.\\s)|(?:\\b(?i)gen\\.\\s)|(?:\\b(?i)por\\.\\s)|(?:\\b[Rr]eż\\.\\s)|(?:\\b[Pp]rzyp\\.\\s)|(?:\\bp\\.n\\.e\\.\\s)|(?:\\bdyr\\.\\smuz\\.\\s)|(?:\\b[śŚ][pwW]\\.\\s)",
33
+ nil,
34
+ false],
35
+ ["(?:\\bII?\\społ\\.\\s)", "[IVX]+", false],
36
+ ["(?:\\betc\\.\\s)", "\\p{Ll}", false],
37
+ ["(?:\\bit[dp]\\.\\s)", "[„”\"]?\\p{Ll}", false],
38
+ ["(?:\\bproc\\.\\)?\\s)", "\\p{Ll}|\\p{Lu}\\p{Lu}+", false],
39
+ ["(?:\\b[rwn]\\.\\s)|(?:\\bit[dp]\\.\\s)|(?:\\bcdn\\.\\s)", "\\p{Ll}", false],
40
+ ["(?:\\b[Ss]zer\\.\\s)", nil, false],
41
+ ["(?:\\bjw\\.\\s)|(?:\\bn\\.e\\.\\s)|(?:\\bw\\.\\s)|(?:\\bn[tn]\\.\\s)|(?:\\b[dm]l\\.\\s)|(?:\\bdag\\.\\s)|(?:\\b[cd]?m\\.\\s)|(?:\\b[Zz][Łł]\\.\\s)|(?:\\b[Gg][rR]\\.\\s)|(?:\\b[d]?kg\\.\\s)|(?:\\bml[nd]\\.\\s)|(?:\\bnpl\\.\\s)",
42
+ "\\p{Ll}",
43
+ false],
44
+ ["(?:\\bpkt\\.\\s)|(?:\\bstr\\.\\s)|(?:\\b[Tt]ab\\.\\s)",
45
+ "[\\p{Ll}\\d]",
46
+ false],
47
+ ["(?:\\btel\\.\\s)", nil, false],
48
+ ["(?:\\b[ptw]g\\.\\s)|(?:\\bcos\\.\\s)|(?:\\bcosec\\.\\s)|(?:\\bsec\\.\\s)|(?:\\bsin\\.\\s)|(?:\\brkm\\.\\s)",
49
+ "\\p{Ll}",
50
+ false],
51
+ ["(?:\\bust\\.\\s)|(?:\\bpar\\.\\s)", "\\d", false],
52
+ ["(?:\\blit\\.\\s)", "\\p{Ll}\\p{Pe}?\\p{P}?\\s", false],
53
+ ["(?:\\b[Pp]on\\.\\s)|(?:\\b[Ss]ob\\.\\s)|(?:\\bba!\\s)", "\\p{Ll}", false],
54
+ ["(?:\\bpo[zk]\\.\\s)", nil, false],
55
+ ["(?:\\bop\\.\\s)", "cit\\.", false],
56
+ ["(?:\\b[Oo][Oo]\\.\\s)", nil, false],
57
+ ["(?:\\b(?:[CDSR]z|Ch)\\.\\s)", "\\p{Ll}", false],
58
+ ["(?:\\b[rls]\\.\\s)", "[1-9]+", false],
59
+ ["(?:\\((?:od|do|w)\\s[1-9]\\d*\\sr\\.\\s)", nil, false],
60
+ ["(?:\\d+\\sr\\.\\s)|(?:\\bn\\.e\\.\\s)", "[^\\p{Lu}]", false],
61
+ ["(?:\\bt\\.\\s)", "\\d+", false],
62
+ ["(?:\\b[,uoi]\\ss\\.\\s)|(?:\\b[Nn]ajśw\\.\\s)|(?:\\b[Nn]a\\sos\\.\\s)",
63
+ nil,
64
+ false],
65
+ ["(?:\\bT\\.)", "Love\\b", false],
66
+ ["(?:\\bpl\\.)", "Open[oO]ffice", false],
67
+ ["(?:\\bha\\.\\s)", "[\\p{Ll}]", false],
68
+ ["(?:\\bmin\\.\\s)", "[\\p{Ll}\\d]", false],
69
+ ["(?:\\bW\\s\\d{4}\\sr\\.\\s)", "[\\p{Lu}–—„\"-]", false],
70
+ ["(?:\\b\\d+\\.\\s)", "\\p{Ll}|\\p{Lu}{2,}", false],
71
+ ["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
72
+ ["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
73
+ ["(?:\\.\\p{L}+\\.\\s)", "\\p{Ll}", false],
74
+ ["(?:[\\s\\(\\[][\\p{L}&&[^rwn]]\\.\\s+)",
75
+ "\\p{Ll}\\p{Ll}|\\p{Lu}[\\p{Punct}\\p{Lu}]",
76
+ false],
77
+ ["(?:['\"„][\\.!?…]['\"”]\\s)", nil, false],
78
+ ["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
79
+ ["(?:[!?]+\\p{Pe} )", "\\p{Ll}", false],
80
+ ["(?:[\\p{Ps}][!?]+[\\p{Pe}] )", nil, false],
81
+ ["(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )|(?:[\\[\\(]*…[\\]\\)]* )",
82
+ "\\p{Ll}",
83
+ false],
84
+ ["(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)",
85
+ nil,
86
+ false],
87
+ ["(?:\\b\\p{Lu}\\p{Ll}\\.\\s?)|(?:\\.\\p{Lu}\\p{Ll}\\.\\s?)",
88
+ "\\p{Lu}[^\\p{Lu}]",
89
+ false],
90
+ ["(?:\\s(?<!\\.)[„\"\\p{Ps}]?[A-Z]\\.\\s)|(?:^[„\"]?[A-ZŚĆŻŹ]\\.\\s)",
91
+ "\\p{Ll}|\\p{Lu}\\p{Ll}\\p{Ll}+",
92
+ false],
93
+ ["(?:\\(\\p{Lu}?\\p{Ll}+\\.\\s)", nil, false],
94
+ ["(?:\\s[A-Z]\\.\\s)", "\\(\\d", false],
95
+ ["(?:\\bDz\\.\\s?U\\.\\s)", nil, false],
96
+ ["(?:[\\p{Ll}&&[^aeouiyęąó]][\\p{Ll}&&[^aeouiyęąó]]+\\.\\s)",
97
+ "\\p{Ll}+",
98
+ false],
99
+ ["(?:[\\.!?…]+['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)",
100
+ "[\\p{Ps}–—-]\\s?\\p{Ll}",
101
+ false],
102
+ ["(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\p{Pe}\\s)",
103
+ "\\p{Ll}",
104
+ false],
105
+ ["(?:[\\.!?…]['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002¹²³]*\\s)",
106
+ nil,
107
+ true],
108
+ ["(?:[\\.!?…]['»\"”\\p{Pe}]*)", "\\p{Lu}[^\\p{Lu}\\.]", true],
109
+ ["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true],
110
+ ["(?:: )", "[—\\-–] \\p{Lu}", true]]
111
+ end
112
+ end
113
+ end
114
+ end
@@ -1,58 +1,13 @@
1
1
  #encoding: utf-8
2
2
  require 'stringio'
3
3
  require 'term/ansicolor'
4
+ require_relative 'polish-rules'
4
5
  module SRX
5
6
  module Polish
6
- RULES =
7
- [["(?:\\b[Aa]l\\.\\s)|(?:\\b[Aa]rt\\.\\s)|(?:\\b[Cc]d\\.\\s)|(?:\\b[Hh]r\\.\\s)|(?:\\b[Nn][bp]\\.\\s)|(?:\\b[Pp][ts]\\.\\s)|(?:\\b[Rr]y[cs]\\.\\s)|(?:\\b[Tt]zw\\.\\s)|(?:\\b[\\d,\\.]*ty[sś]\\.\\p{Pe}?\\s)|(?:\\b[bu]p\\.\\s)|(?:\\badw\\.\\s)|(?:\\bafr\\.\\s)|(?:\\bakad\\.\\s)|(?:\\bal\\.\\s)|(?:\\bam\\.\\s)|(?:\\bamer\\.\\s)|(?:\\bang\\.\\s)|(?:\\barch\\.\\s)|(?:\\bartyst\\.\\s)|(?:\\bastr\\.\\s)|(?:\\baustr\\.\\s)|(?:\\bbałt\\.\\s)|(?:\\bbdb\\.\\s)|(?:\\bbm\\.\\s)|(?:\\bbr\\.\\p{Pe}?\\s)|(?:\\bbry[gt]\\.\\s)|(?:\\bbł\\.\\s)|(?:\\bc\\.k\\.\\s)|(?:\\bc\\.o\\.\\s)|(?:\\bcdn\\.\\s)|(?:\\bcentr\\.\\s)|(?:\\bces\\.\\s)|(?:\\bchem\\.\\s)|(?:\\bchir\\.\\s)|(?:\\bchiń\\.\\s)|(?:\\bcyg\\.\\s)|(?:\\bcyt\\.\\s)|(?:\\bcyw\\.\\s)|(?:\\bczes\\.\\s)|(?:\\bczw\\.\\s)|(?:\\bczyt\\.\\s)|(?:\\bdaw\\.\\s)|(?:\\bdcn\\.\\s)|(?:\\bdekl\\.\\s)|(?:\\bdemokr\\.\\s)|(?:\\bdet\\.\\s)|(?:\\bdiec\\.\\s)|(?:\\bdn\\.\\s)|(?:\\bdo[tlp]\\.\\s)|(?:\\bdost\\.\\s)|(?:\\bdosł\\.\\s)|(?:\\bds\\.\\s)|(?:\\bdst\\.\\s)|(?:\\bduszp\\.\\s)|(?:\\bdypl\\.\\s)|(?:\\bdł\\.\\s)|(?:\\begz\\.\\s)|(?:\\bekol\\.\\s)|(?:\\bekon\\.\\s)|(?:\\belektr\\.\\s)|(?:\\bem\\.\\s)|(?:\\bew\\.\\s)|(?:\\bfab\\.\\s)|(?:\\bfarm\\.\\s)|(?:\\bfot\\.\\s)|(?:\\bfr\\.\\s)|(?:\\bgastr\\.\\s)|(?:\\bgat\\.\\s)|(?:\\bgeogr\\.\\s)|(?:\\bgeol\\.\\s)|(?:\\bgimn\\.\\s)|(?:\\bgm\\.\\s)|(?:\\bgodz\\.\\s)|(?:\\bgosp\\.\\s)|(?:\\bgr\\.\\s)|(?:\\bgram\\.\\s)|(?:\\bgórn\\.\\s)|(?:\\bgłęb\\.\\s)|(?:\\bh\\.c\\.\\s)|(?:\\bhist\\.\\s)|(?:\\bhiszp\\.\\s)|(?:\\bhot\\.\\s)|(?:\\bid\\.\\s)|(?:\\bim\\.\\s)|(?:\\bin\\.\\s)|(?:\\biron\\.\\s)|(?:\\bjn\\.\\s)|(?:\\bjw\\.\\s)|(?:\\bk\\.\\s)|(?:\\bk\\.k\\.\\s)|(?:\\bk\\.p\\.a\\.\\s)|(?:\\bk\\.p\\.c\\.\\s)|(?:\\bk\\.r\\.\\s)|(?:\\bk\\.r\\.o\\.\\s)|(?:\\bkard\\.\\s)|(?:\\bkat\\.\\s)|(?:\\bkatol\\.\\s)|(?:\\bkk\\.\\s)|(?:\\bko?l\\.\\s)|(?:\\bkpc\\.\\s)|(?:\\bkpt\\.\\s)|(?:\\bkr\\.\\s)|(?:\\bkrak\\.\\s)|(?:\\bkryt\\.\\s)|(?:\\bkult\\.\\s)|(?:\\blaic\\.\\s)|(?:\\bm\\.in\\.\\s)|(?:\\bn\\.\\s)|(?:\\bniem\\.\\s)|(?:\\bp[wnl]\\.\\s)|(?:\\bpo[lw]\\.\\s)|(?:\\btj\\.\\s)|(?:\\btzn\\.\\s)|(?:\\bu[lb]\\.\\s?)|(?:\\bur\\.\\s)|(?:\\bw[wł]\\.\\s)|(?:\\bwoj\\.\\s)|(?:\\bwyst\\.\\s)|(?:\\bwył\\.\\s)|(?:\\bzm\\.\\s)|(?:\\bzob\\.\\s)|(?:\\bćw\\.\\s)|(?:\\bćwicz\\.\\s)|(?:\\błac\\.\\s)|(?:\\bżarg\\.\\s)|(?:\\bżyd\\.\\s)|(?:\\bżyw\\.\\s)",
8
- nil,
9
- false],
10
- ["(?:\\b[Oo]k\\.\\s)", "[\\p{Ll}\\d]", false],
11
- ["(?:\\b[Nn]r\\.\\s)", "\\d", false],
12
- ["(?:\\b(?i)mazeł\\stow\\.\\s)", nil, true],
13
- ["(?:['\"„][\\.!?…]['\"”]\\s)|(?:[\\p{Ps}][!?]+[\\p{Pe}] )|(?:\\b(?:[Nn]a|[Pp]o)dkom\\.\\s)|(?:\\b(?:[sS]p|st|[Ss]półdz|społ|spółgł|[Ss]to[łw])\\.\\s)|(?:\\b(?i)gen\\.\\s)|(?:\\b(?i)mgr\\.\\s)|(?:\\b(?i)por\\.\\s)|(?:\\b(?i)pro[fk]\\.\\s)|(?:\\bDz\\.\\s?U\\.\\s)|(?:\\b[,uoi]\\ss\\.\\s)|(?:\\b[Aa]dw\\.\\s)|(?:\\b[Dd]oc\\.\\s)|(?:\\b[Dd]y[wr]\\.\\s)|(?:\\b[Ii]nż\\.\\s)|(?:\\b[Ll]ek\\.\\s)|(?:\\b[Mm]ec\\.\\s)|(?:\\b[Nn]a\\sos\\.\\s)|(?:\\b[Nn]ajśw\\.\\s)|(?:\\b[Oo][Oo]\\.\\s)|(?:\\b[Pp]rzyp\\.\\s)|(?:\\b[Pp]łk\\.\\s)|(?:\\b[Rr]ed\\.\\)?\\s)|(?:\\b[Rr]eż\\.\\s)|(?:\\b[Ss]zer\\.\\s)|(?:\\b[Tt]ow\\.\\s)|(?:\\b[dD][hr]\\.\\s)|(?:\\b[kK]s\\.\\s)|(?:\\b[śŚ][pwW]\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bhab\\.\\s)|(?:\\bmed\\.\\s)|(?:\\bo\\.\\s)|(?:\\bpo[zk]\\.\\s)|(?:\\btel\\.\\s)|(?:\\bzach\\.\\s)|(?:\\bzagr\\.\\s)|(?:\\bzak\\.\\s)|(?:\\bzakł\\.\\s)|(?:\\bzal\\.\\s)|(?:\\bzam\\.\\s)|(?:\\bzast\\.\\s)|(?:\\bzaw\\.\\s)|(?:\\bzazw\\.\\s)|(?:\\bzał\\.\\s)|(?:\\bzdr\\.\\s)|(?:\\bzew\\.\\s)|(?:\\bzewn\\.\\s)|(?:\\bzn\\.\\s)",
14
- nil,
15
- false],
16
- ["(?:\\b[rls]\\.\\s)", "[1-9]+", false],
17
- ["(?:\\b[Tt]ab\\.\\s)|(?:\\bmin\\.\\s)|(?:\\bpkt\\.\\s)|(?:\\bstr\\.\\s)",
18
- "[\\p{Ll}\\d]",
19
- false],
20
- ["(?:\\bW\\s\\d{4}\\sr\\.\\s)", "[\\p{Lu}–—„\"-]", false],
21
- ["(?:[\\.!?…]+['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)",
22
- "[\\p{Ps}–—-]\\s?\\p{Ll}",
23
- false],
24
- ["(?:\\bit[dp]\\.\\s)", "[„”\"]?\\p{Ll}", false],
25
- ["(?:\\s[A-Z]\\.\\s)", "\\(\\d", false],
26
- ["(?:\\bpar\\.\\s)|(?:\\bust\\.\\s)", "\\d", false],
27
- ["(?:[!?]+\\p{Pe} )|(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\p{Pe}\\s)|(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )|(?:[\\[\\(]*…[\\]\\)]* )|(?:\\.\\p{L}+\\.\\s)|(?:\\b(?:[CDSR]z|Ch)\\.\\s)|(?:\\b[Gg][rR]\\.\\s)|(?:\\b[Pp]on\\.\\s)|(?:\\b[Ss]ob\\.\\s)|(?:\\b[Zz][Łł]\\.\\s)|(?:\\b[cd]?m\\.\\s)|(?:\\b[d]?kg\\.\\s)|(?:\\b[dm]l\\.\\s)|(?:\\b[ptw]g\\.\\s)|(?:\\b[rwn]\\.\\s)|(?:\\bba!\\s)|(?:\\bcdn\\.\\s)|(?:\\bcos\\.\\s)|(?:\\bcosec\\.\\s)|(?:\\bdag\\.\\s)|(?:\\betc\\.\\s)|(?:\\bit[dp]\\.\\s)|(?:\\bjw\\.\\s)|(?:\\bml[nd]\\.\\s)|(?:\\bn[tn]\\.\\s)|(?:\\bn\\.e\\.\\s)|(?:\\bnpl\\.\\s)|(?:\\brkm\\.\\s)|(?:\\bsec\\.\\s)|(?:\\bsin\\.\\s)|(?:\\bw\\.\\s)",
28
- "\\p{Ll}",
29
- false],
30
- ["(?:[\\p{Ll}&&[^aeouiyęąó]][\\p{Ll}&&[^aeouiyęąó]]+\\.\\s)",
31
- "\\p{Ll}+",
32
- false],
33
- ["(?:[\\s\\(\\[][\\p{L}&&[^rwn]]\\.\\s+)",
34
- "\\p{Ll}\\p{Ll}|\\p{Lu}[\\p{Punct}\\p{Lu}]",
35
- false],
36
- ["(?:\\blit\\.\\s)", "\\p{Ll}\\p{Pe}?\\p{P}?\\s", false],
37
- ["(?:\\s(?<!\\.)[„\"\\p{Ps}]?[A-Z]\\.\\s)|(?:^[„\"]?[A-ZŚĆŻŹ]\\.\\s)",
38
- "\\p{Ll}|\\p{Lu}\\p{Ll}\\p{Ll}+",
39
- false],
40
- ["(?:\\bproc\\.\\)?\\s)", "\\p{Ll}|\\p{Lu}\\p{Lu}+", false],
41
- ["(?:\\b\\d+\\.\\s)", "\\p{Ll}|\\p{Lu}{2,}", false],
42
- ["(?:\\.\\p{Lu}\\p{Ll}\\.\\s?)|(?:\\b\\p{Lu}\\p{Ll}\\.\\s?)",
43
- "\\p{Lu}[^\\p{Lu}]",
44
- false],
45
- ["(?:\\b[pP]\\.\\s)", "\\p{Lu}\\p{Ll}+", false],
46
- ["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
47
- ["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
48
- ["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
49
- ["(?:[\\.!?…]['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)|(?:\\r?\\n\\s*\\r?\\n[\\t]*)",
50
- nil,
51
- true],
52
- ["(?:[\\.!?…]['»\"”\\p{Pe}]*)", "\\p{Lu}[^\\p{Lu}\\.]", true],
53
- ["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]]
54
- BEFORE_RE = /(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\Z/m
55
- REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
7
+ extend Rules
8
+
9
+ BEFORE_RE = /(?:#{self.rules.map{|s,e,v| "(#{s})"}.join("|")})\Z/m
10
+ REGEXPS = self.rules.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
56
11
  FIRST_CHAR = /\A./m
57
12
 
58
13
  class SentenceSplitter
@@ -67,50 +22,54 @@ module SRX
67
22
  else
68
23
  @input = text
69
24
  end
25
+ @debug = false
70
26
  end
71
27
 
72
28
  def each
73
- raise "Invalid argument - text is nil" if @input.nil?
74
- buffer_length = 10
75
- sentence = ""
76
- before_buffer = ""
77
- @input.pos = 0
78
- after_buffer = buffer_length.times.map{|i| @input.readchar}.join("")
79
- matched_rule = nil
80
- while(!@input.eof?) do
81
- matched_before = BEFORE_RE.match(before_buffer)
82
- break_detected = false
83
- if matched_before
84
- start_index = (matched_before.size - 1).times.find do |index|
85
- matched_before[index+1]
86
- end
87
- if @debug
88
- puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
89
- end
90
- REGEXPS.each do |before_re,after_re,value|
91
- # skip the whole match
92
- if before_re.match(before_buffer) && after_re.match(after_buffer)
93
- break_detected = true
94
- color = value ? :red : :green
95
- if @debug
96
- sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
97
- end
98
- if value
99
- yield sentence
100
- sentence = ""
29
+ if block_given?
30
+ raise "Invalid argument - text is nil" if @input.nil?
31
+ buffer_length = 10
32
+ sentence = ""
33
+ before_buffer = ""
34
+ @input.pos = 0
35
+ after_buffer = buffer_length.times.map{|i| @input.readchar }.join("")
36
+ while(!@input.eof?) do
37
+ matched_before = BEFORE_RE.match(before_buffer)
38
+ break_detected = false
39
+ if matched_before
40
+ (matched_before.size - 1).times.find do |index|
41
+ matched_before[index+1]
42
+ end
43
+ if @debug
44
+ puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
45
+ end
46
+ REGEXPS.each do |before_re,after_re,value|
47
+ # skip the whole match
48
+ if before_re.match(before_buffer) && after_re.match(after_buffer)
49
+ break_detected = true
50
+ if @debug
51
+ color = value ? :red : :green
52
+ sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
53
+ end
54
+ if value
55
+ yield sentence
56
+ sentence = ""
57
+ end
58
+ break
101
59
  end
102
- break
103
60
  end
104
61
  end
62
+ next_after = @input.readchar
63
+ before_buffer.sub!(FIRST_CHAR,"") if before_buffer.size >= buffer_length
64
+ after_buffer.sub!(FIRST_CHAR,"")
65
+ before_buffer << $&
66
+ sentence << $&
67
+ after_buffer << next_after
105
68
  end
106
- next_after = @input.readchar
107
- before_buffer.sub!(FIRST_CHAR,"") if before_buffer.size >= buffer_length
108
- after_buffer.sub!(FIRST_CHAR,"")
109
- before_buffer << $&
110
- sentence << $&
111
- after_buffer << next_after
69
+ yield sentence + after_buffer unless sentence.empty? || after_buffer.empty?
70
+ else
71
+ enum_for(:each)
112
72
  end
113
- yield sentence + after_buffer unless sentence.empty? || after_buffer.empty?
114
73
  end
115
74
  end
116
75
  end
@@ -3,9 +3,9 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "srx-polish"
6
- s.version = "0.2.1"
6
+ s.version = "0.3.0"
7
7
  s.platform = Gem::Platform::RUBY
8
- s.authors = ["Aleksander Pohl"]
8
+ s.authors = ["Aleksander Smywiński-Pohl"]
9
9
  s.email = ["apohllo@o2.pl"]
10
10
  s.homepage = "http://github.com/apohllo/srx2ruby"
11
11
  s.summary = %q{Polish sentence and word segmentation rules.}
@@ -19,5 +19,5 @@ Gem::Specification.new do |s|
19
19
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
20
20
  s.require_paths = ["lib"]
21
21
 
22
- s.add_dependency("term-ansicolor", ["~> 1.0.5"])
22
+ s.add_dependency 'term-ansicolor', '~> 1.0', '>= 1.0.5'
23
23
  end
metadata CHANGED
@@ -1,77 +1,77 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: srx-polish
3
- version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 0.2.1
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
6
5
  platform: ruby
7
- authors:
8
- - Aleksander Pohl
6
+ authors:
7
+ - Aleksander Smywiński-Pohl
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
-
13
- date: 2011-10-14 00:00:00 Z
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
11
+ date: 2018-07-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
16
14
  name: term-ansicolor
17
- prerelease: false
18
- requirement: &id001 !ruby/object:Gem::Requirement
19
- none: false
20
- requirements:
21
- - - ~>
22
- - !ruby/object:Gem::Version
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
23
22
  version: 1.0.5
24
23
  type: :runtime
25
- version_requirements: *id001
26
- description: "Polish sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Mi\xC5\x82kowski's SRX rules."
27
- email:
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.0'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.0.5
33
+ description: Polish sentence and word segmentation rules. The sentence segmentation
34
+ rules are based on Marcin Miłkowski's SRX rules.
35
+ email:
28
36
  - apohllo@o2.pl
29
37
  executables: []
30
-
31
38
  extensions: []
32
-
33
39
  extra_rdoc_files: []
34
-
35
- files:
36
- - .gitignore
40
+ files:
41
+ - ".gitignore"
42
+ - ".ruby-version"
43
+ - Gemfile
37
44
  - README.rdoc
38
45
  - changelog.txt
39
46
  - features/sentence_splitter.feature
40
47
  - features/steps/sentence_splitter.rb
41
48
  - features/steps/word_splitter.rb
42
49
  - features/word_splitter.feature
50
+ - lib/srx/polish/polish-rules.rb
43
51
  - lib/srx/polish/sentence_splitter.rb
44
52
  - lib/srx/polish/word_splitter.rb
45
53
  - srx-polish.gemspec
46
54
  homepage: http://github.com/apohllo/srx2ruby
47
55
  licenses: []
48
-
56
+ metadata: {}
49
57
  post_install_message:
50
58
  rdoc_options: []
51
-
52
- require_paths:
59
+ require_paths:
53
60
  - lib
54
- required_ruby_version: !ruby/object:Gem::Requirement
55
- none: false
56
- requirements:
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ requirements:
57
63
  - - ">="
58
- - !ruby/object:Gem::Version
59
- version: "0"
60
- required_rubygems_version: !ruby/object:Gem::Requirement
61
- none: false
62
- requirements:
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ requirements:
63
68
  - - ">="
64
- - !ruby/object:Gem::Version
65
- version: "0"
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
66
71
  requirements: []
67
-
68
72
  rubyforge_project: srx-polish
69
- rubygems_version: 1.8.5
73
+ rubygems_version: 2.6.14
70
74
  signing_key:
71
- specification_version: 3
75
+ specification_version: 4
72
76
  summary: Polish sentence and word segmentation rules.
73
- test_files:
74
- - features/sentence_splitter.feature
75
- - features/steps/sentence_splitter.rb
76
- - features/steps/word_splitter.rb
77
- - features/word_splitter.feature
77
+ test_files: []