srx-polish 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c1a5a585338642469bee4a679efc4f0d3f12228d
4
+ data.tar.gz: 5c5df57c6c78b5a89647e4f2ec21af5262dec6e8
5
+ SHA512:
6
+ metadata.gz: 87d75afc302d0debe4aecf899184402eae26082925089d94f13c5a004beb3f3e7ac18b5bb9e8bc44689389165b2119432d27756e74dbace820ddd7b94cc7a7d7
7
+ data.tar.gz: 29a1b4c730e782b415bae8bb7b5f4d09fd9f126c01724916ce1a80b158ea5f087a608aa63d02f9d9bf831e802ef5e3d50ccb0b9ef281036123786f31ef86cea4
data/.gitignore CHANGED
@@ -1,4 +1,5 @@
1
- .*.sw?
2
1
  *.gem
2
+ .*.sw?
3
+ Gemfile.lock
3
4
  pkg
4
5
  work
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ 2.1
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org'
2
+ gemspec
data/changelog.txt CHANGED
@@ -1,3 +1,6 @@
1
+ 0.3.0
2
+ - rule update (as of 07/2018)
3
+ - all rules are in separate file to reduce code clutter
1
4
  0.2.1
2
5
  - fix extension of README (was md, should be rdoc)
3
6
  0.2.0
data/features/steps/sentence_splitter.rb CHANGED
@@ -12,6 +12,6 @@ end
12
12
 
13
13
  Then /^the following sentences should be detected$/ do |table|
14
14
  table.hashes.zip(@splitter.to_a).each do |expected,returned|
15
- returned.gsub(/\s*\n/," ").strip.should == expected[:sentence]
15
+ expect(returned.gsub(/\s*\n/," ").strip).to eq expected[:sentence]
16
16
  end
17
17
  end
data/features/steps/word_splitter.rb CHANGED
@@ -12,15 +12,15 @@ end
12
12
 
13
13
  Then /^the following segments should be detected$/ do |table|
14
14
  table.hashes.zip(@splitter.to_a).each do |expected,returned|
15
- returned[0].should == expected[:segment].gsub(/'/,"")
16
- returned[1].should == expected[:type].to_sym
15
+ expect(returned[0]).to eq expected[:segment].gsub(/'/,"")
16
+ expect(returned[1]).to eq expected[:type].to_sym
17
17
  end
18
18
  end
19
19
 
20
20
  Then /^the following non-blank segments should be detected$/ do |table|
21
21
  segments = @splitter.select{|s| s[1] != :other}
22
22
  table.hashes.zip(segments).each do |expected,returned|
23
- returned[0].should == expected[:segment].gsub(/'/,"")
24
- returned[1].should == expected[:type].to_sym
23
+ expect(returned[0]).to eq expected[:segment].gsub(/'/,"")
24
+ expect(returned[1]).to eq expected[:type].to_sym
25
25
  end
26
26
  end
data/lib/srx/polish/polish-rules.rb ADDED
@@ -0,0 +1,114 @@
1
+ module SRX
2
+ module Polish
3
+ module Rules
4
+ def rules
5
+ @@rules ||=
6
+ [["(?:\\b[Aa]dw?\\.\\s)|(?:\\bafr\\.\\s)|(?:\\bakad\\.\\s)|(?:\\b[Aa]l\\.\\s)|(?:\\bam\\.\\s)|(?:\\bamer\\.\\s)|(?:\\barch\\.\\s)|(?:\\b[Aa]rt\\.\\s)|(?:\\bartyst\\.\\s)|(?:\\bastr\\.\\s)|(?:\\baustr\\.\\s)|(?:\\bbałt\\.\\s)|(?:\\bbdb\\.\\s)|(?:\\bbł\\.\\s)|(?:\\bbm\\.\\s)",
7
+ nil,
8
+ false],
9
+ ["(?:\\bbr\\.\\p{Pe}?\\s)", "[^\\p{Lu}]", false],
10
+ ["(?:\\bbry[gt]\\.\\s)|(?:\\bcentr\\.\\s)|(?:\\bces\\.\\s)|(?:\\bchem\\.\\s)|(?:\\bchiń\\.\\s)|(?:\\bchir\\.\\s)|(?:\\bc\\.k\\.\\s)|(?:\\bc\\.o\\.\\s)|(?:\\bcyg\\.\\s)|(?:\\bcyw\\.\\s)|(?:\\bcyt\\.\\s)|(?:\\bczes\\.\\s)|(?:\\bczw?\\.\\s)|(?:\\b[Cc]d\\.\\s)|(?:\\bczyt\\.\\s)|(?:\\bćw\\.\\s)|(?:\\bćwicz\\.\\s)|(?:\\bdaw\\.\\s)|(?:\\bdcn\\.\\s)|(?:\\bdekl\\.\\s)|(?:\\bdemokr\\.\\s)|(?:\\bdet\\.\\s)|(?:\\bdiec\\.\\s)|(?:\\bdł\\.\\s)|(?:\\bdn\\.\\s)",
11
+ nil,
12
+ false],
13
+ ["(?:\\bdo[tlp]\\.\\s)", "[^\\p{Lu}]", false],
14
+ ["(?:\\bdost\\.\\s)|(?:\\bdosł\\.\\s)|(?:\\bh\\.c\\.\\s)|(?:\\bds\\.\\s)|(?:\\bdst\\.\\s)|(?:\\bduszp\\.\\s)|(?:\\bdypl\\.\\s)|(?:\\begz\\.\\s)|(?:\\bekol\\.\\s)|(?:\\bekon\\.\\s)|(?:\\belektr\\.\\s)|(?:\\bem\\.\\s)|(?:\\bew\\.\\s)|(?:\\bfab\\.\\s)|(?:\\bfarm\\.\\s)|(?:\\bfot\\.\\s)|(?:\\bfr\\.\\s)|(?:\\bgat\\.\\s)|(?:\\bgastr\\.\\s)|(?:\\bgeogr\\.\\s)|(?:\\bgeol\\.\\s)|(?:\\bgimn\\.\\s)|(?:\\bgłęb\\.\\s)|(?:\\bgm\\.\\s)|(?:\\bgodz\\.\\s)|(?:\\bgórn\\.\\s)|(?:\\bgosp\\.\\s)|(?:\\bgr\\.\\s)|(?:\\bgram\\.\\s)|(?:\\bhist\\.\\s)|(?:\\bhiszp\\.\\s)|(?:\\b[Hh]r\\.\\s)|(?:\\bhot\\.\\s)|(?:\\bid\\.\\s)|(?:\\bin\\.\\s)|(?:\\bim\\.\\s)|(?:\\biron\\.\\s)|(?:\\bjn\\.\\s)|(?:\\bkard\\.\\s)|(?:\\bkat\\.\\s)|(?:\\bkatol\\.\\s)|(?:\\bk\\.k\\.\\s)|(?:\\bkk\\.\\s)|(?:\\bko?l\\.\\s)|(?:\\bk\\.p\\.a\\.\\s)|(?:\\bkpc\\.\\s)|(?:\\bk\\.p\\.c\\.\\s)|(?:\\bkpt\\.\\s)|(?:\\bkr\\.\\s)|(?:\\bk\\.r\\.\\s)|(?:\\bkrak\\.\\s)|(?:\\bk\\.r\\.o\\.\\s)|(?:\\bkryt\\.\\s)|(?:\\bkult\\.\\s)|(?:\\blaic\\.\\s)|(?:\\błac\\.\\s)|(?:\\bniem\\.\\s)|(?:\\bwoj\\.\\s)|(?:\\b[Nn][bp]\\.\\s)|(?:\\bpo[lw]\\.\\s)|(?:\\bm\\.in\\.\\s)|(?:\\b[Pp][ts]\\.\\s)|(?:\\bcdn\\.\\s)|(?:\\bjw\\.\\s)|(?:\\b[Rr]y[cs]\\.\\s)|(?:\\btj\\.\\s)|(?:\\b[Tt]zw\\.\\s)|(?:\\btzn\\.\\s)|(?:\\b[Zz]ob\\.\\s)|(?:\\bsłow\\.\\s)",
15
+ nil,
16
+ false],
17
+ ["(?:[^s]\\.pl\\.\\s)", "\\p{Lu}\\p{Ll}+", true],
18
+ ["(?:\\bp[wnl]\\.\\s)|(?:\\bang\\.\\s)|(?:\\bu[lb]\\.\\s?)|(?:\\bal\\.\\s)|(?:\\bk\\.\\s)|(?:\\bn\\.\\s)",
19
+ nil,
20
+ false],
21
+ ["(?:\\b[Oo]k\\.\\s)", "[\\p{Ll}\\d]", false],
22
+ ["(?:\\b[\\d,\\.]*ty[sś]\\.\\p{Pe}?\\s)", "[\\p{Ll}\\d]+", false],
23
+ ["(?:\\b[Nn]r\\.\\s)", "\\d", false],
24
+ ["(?:\\bw[wł]\\.\\s)|(?:\\bur\\.\\s)|(?:\\bzm\\.\\s)|(?:\\bżyd\\.\\s)|(?:\\bżarg\\.\\s)|(?:\\bżyw\\.\\s)|(?:\\bwy[łdm]\\.\\s)|(?:\\b[bu]p\\.\\s)|(?:\\bwyst\\.\\s)",
25
+ nil,
26
+ false],
27
+ ["(?:\\b(?i)mazeł\\stow\\.\\s)", nil, true],
28
+ ["(?:\\b[Tt]ow\\.\\s)|(?:\\bo\\.\\s)|(?:\\b(?:[sS]p|st|[Ss]półdz|społ|spółgł|[Ss]to[łw])\\.\\s)|(?:\\bzn\\.\\s)|(?:\\bzew\\.\\s)|(?:\\bzewn\\.\\s)|(?:\\bzdr\\.\\s)|(?:\\bzazw\\.\\s)|(?:\\bzast\\.\\s)|(?:\\bzaw\\.\\s)|(?:\\bzał\\.\\s)|(?:\\bzal\\.\\s)|(?:\\bzam\\.\\s)|(?:\\bzak\\.\\s)|(?:\\bzakł\\.\\s)|(?:\\bzagr\\.\\s)|(?:\\bzach\\.\\s)|(?:\\b[Aa]dw\\.\\s)|(?:\\b[Ll]ek\\.\\s)|(?:\\bmed\\.\\s)|(?:\\b[Mm]ec\\.\\s)|(?:\\b[Dd]oc\\.\\s)|(?:\\b[Dd]y[wr]\\.\\s)|(?:\\b[Ii]nż\\.\\s)|(?:\\b(?i)mgr\\.\\s)|(?:\\b[dD][hr]\\.\\s)",
29
+ nil,
30
+ false],
31
+ ["(?:\\b[pP]\\.\\s)", "\\p{Lu}\\p{Ll}+", false],
32
+ ["(?:\\b[Rr]ed\\.\\)?\\s)|(?:\\b(?i)pro[fk]\\.\\s)|(?:\\bhab\\.\\s)|(?:\\b[Pp]łk\\.\\s)|(?:\\b(?:[Nn]a|[Pp]o)dkom\\.\\s)|(?:\\b[kK]s\\.\\s)|(?:\\b(?i)gen\\.\\s)|(?:\\b(?i)por\\.\\s)|(?:\\b[Rr]eż\\.\\s)|(?:\\b[Pp]rzyp\\.\\s)|(?:\\bp\\.n\\.e\\.\\s)|(?:\\bdyr\\.\\smuz\\.\\s)|(?:\\b[śŚ][pwW]\\.\\s)",
33
+ nil,
34
+ false],
35
+ ["(?:\\bII?\\społ\\.\\s)", "[IVX]+", false],
36
+ ["(?:\\betc\\.\\s)", "\\p{Ll}", false],
37
+ ["(?:\\bit[dp]\\.\\s)", "[„”\"]?\\p{Ll}", false],
38
+ ["(?:\\bproc\\.\\)?\\s)", "\\p{Ll}|\\p{Lu}\\p{Lu}+", false],
39
+ ["(?:\\b[rwn]\\.\\s)|(?:\\bit[dp]\\.\\s)|(?:\\bcdn\\.\\s)", "\\p{Ll}", false],
40
+ ["(?:\\b[Ss]zer\\.\\s)", nil, false],
41
+ ["(?:\\bjw\\.\\s)|(?:\\bn\\.e\\.\\s)|(?:\\bw\\.\\s)|(?:\\bn[tn]\\.\\s)|(?:\\b[dm]l\\.\\s)|(?:\\bdag\\.\\s)|(?:\\b[cd]?m\\.\\s)|(?:\\b[Zz][Łł]\\.\\s)|(?:\\b[Gg][rR]\\.\\s)|(?:\\b[d]?kg\\.\\s)|(?:\\bml[nd]\\.\\s)|(?:\\bnpl\\.\\s)",
42
+ "\\p{Ll}",
43
+ false],
44
+ ["(?:\\bpkt\\.\\s)|(?:\\bstr\\.\\s)|(?:\\b[Tt]ab\\.\\s)",
45
+ "[\\p{Ll}\\d]",
46
+ false],
47
+ ["(?:\\btel\\.\\s)", nil, false],
48
+ ["(?:\\b[ptw]g\\.\\s)|(?:\\bcos\\.\\s)|(?:\\bcosec\\.\\s)|(?:\\bsec\\.\\s)|(?:\\bsin\\.\\s)|(?:\\brkm\\.\\s)",
49
+ "\\p{Ll}",
50
+ false],
51
+ ["(?:\\bust\\.\\s)|(?:\\bpar\\.\\s)", "\\d", false],
52
+ ["(?:\\blit\\.\\s)", "\\p{Ll}\\p{Pe}?\\p{P}?\\s", false],
53
+ ["(?:\\b[Pp]on\\.\\s)|(?:\\b[Ss]ob\\.\\s)|(?:\\bba!\\s)", "\\p{Ll}", false],
54
+ ["(?:\\bpo[zk]\\.\\s)", nil, false],
55
+ ["(?:\\bop\\.\\s)", "cit\\.", false],
56
+ ["(?:\\b[Oo][Oo]\\.\\s)", nil, false],
57
+ ["(?:\\b(?:[CDSR]z|Ch)\\.\\s)", "\\p{Ll}", false],
58
+ ["(?:\\b[rls]\\.\\s)", "[1-9]+", false],
59
+ ["(?:\\((?:od|do|w)\\s[1-9]\\d*\\sr\\.\\s)", nil, false],
60
+ ["(?:\\d+\\sr\\.\\s)|(?:\\bn\\.e\\.\\s)", "[^\\p{Lu}]", false],
61
+ ["(?:\\bt\\.\\s)", "\\d+", false],
62
+ ["(?:\\b[,uoi]\\ss\\.\\s)|(?:\\b[Nn]ajśw\\.\\s)|(?:\\b[Nn]a\\sos\\.\\s)",
63
+ nil,
64
+ false],
65
+ ["(?:\\bT\\.)", "Love\\b", false],
66
+ ["(?:\\bpl\\.)", "Open[oO]ffice", false],
67
+ ["(?:\\bha\\.\\s)", "[\\p{Ll}]", false],
68
+ ["(?:\\bmin\\.\\s)", "[\\p{Ll}\\d]", false],
69
+ ["(?:\\bW\\s\\d{4}\\sr\\.\\s)", "[\\p{Lu}–—„\"-]", false],
70
+ ["(?:\\b\\d+\\.\\s)", "\\p{Ll}|\\p{Lu}{2,}", false],
71
+ ["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
72
+ ["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
73
+ ["(?:\\.\\p{L}+\\.\\s)", "\\p{Ll}", false],
74
+ ["(?:[\\s\\(\\[][\\p{L}&&[^rwn]]\\.\\s+)",
75
+ "\\p{Ll}\\p{Ll}|\\p{Lu}[\\p{Punct}\\p{Lu}]",
76
+ false],
77
+ ["(?:['\"„][\\.!?…]['\"”]\\s)", nil, false],
78
+ ["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
79
+ ["(?:[!?]+\\p{Pe} )", "\\p{Ll}", false],
80
+ ["(?:[\\p{Ps}][!?]+[\\p{Pe}] )", nil, false],
81
+ ["(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )|(?:[\\[\\(]*…[\\]\\)]* )",
82
+ "\\p{Ll}",
83
+ false],
84
+ ["(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)",
85
+ nil,
86
+ false],
87
+ ["(?:\\b\\p{Lu}\\p{Ll}\\.\\s?)|(?:\\.\\p{Lu}\\p{Ll}\\.\\s?)",
88
+ "\\p{Lu}[^\\p{Lu}]",
89
+ false],
90
+ ["(?:\\s(?<!\\.)[„\"\\p{Ps}]?[A-Z]\\.\\s)|(?:^[„\"]?[A-ZŚĆŻŹ]\\.\\s)",
91
+ "\\p{Ll}|\\p{Lu}\\p{Ll}\\p{Ll}+",
92
+ false],
93
+ ["(?:\\(\\p{Lu}?\\p{Ll}+\\.\\s)", nil, false],
94
+ ["(?:\\s[A-Z]\\.\\s)", "\\(\\d", false],
95
+ ["(?:\\bDz\\.\\s?U\\.\\s)", nil, false],
96
+ ["(?:[\\p{Ll}&&[^aeouiyęąó]][\\p{Ll}&&[^aeouiyęąó]]+\\.\\s)",
97
+ "\\p{Ll}+",
98
+ false],
99
+ ["(?:[\\.!?…]+['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)",
100
+ "[\\p{Ps}–—-]\\s?\\p{Ll}",
101
+ false],
102
+ ["(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\p{Pe}\\s)",
103
+ "\\p{Ll}",
104
+ false],
105
+ ["(?:[\\.!?…]['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002¹²³]*\\s)",
106
+ nil,
107
+ true],
108
+ ["(?:[\\.!?…]['»\"”\\p{Pe}]*)", "\\p{Lu}[^\\p{Lu}\\.]", true],
109
+ ["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true],
110
+ ["(?:: )", "[—\\-–] \\p{Lu}", true]]
111
+ end
112
+ end
113
+ end
114
+ end
data/lib/srx/polish/sentence_splitter.rb CHANGED
@@ -1,58 +1,13 @@
1
1
  #encoding: utf-8
2
2
  require 'stringio'
3
3
  require 'term/ansicolor'
4
+ require_relative 'polish-rules'
4
5
  module SRX
5
6
  module Polish
6
- RULES =
7
- [["(?:\\b[Aa]l\\.\\s)|(?:\\b[Aa]rt\\.\\s)|(?:\\b[Cc]d\\.\\s)|(?:\\b[Hh]r\\.\\s)|(?:\\b[Nn][bp]\\.\\s)|(?:\\b[Pp][ts]\\.\\s)|(?:\\b[Rr]y[cs]\\.\\s)|(?:\\b[Tt]zw\\.\\s)|(?:\\b[\\d,\\.]*ty[sś]\\.\\p{Pe}?\\s)|(?:\\b[bu]p\\.\\s)|(?:\\badw\\.\\s)|(?:\\bafr\\.\\s)|(?:\\bakad\\.\\s)|(?:\\bal\\.\\s)|(?:\\bam\\.\\s)|(?:\\bamer\\.\\s)|(?:\\bang\\.\\s)|(?:\\barch\\.\\s)|(?:\\bartyst\\.\\s)|(?:\\bastr\\.\\s)|(?:\\baustr\\.\\s)|(?:\\bbałt\\.\\s)|(?:\\bbdb\\.\\s)|(?:\\bbm\\.\\s)|(?:\\bbr\\.\\p{Pe}?\\s)|(?:\\bbry[gt]\\.\\s)|(?:\\bbł\\.\\s)|(?:\\bc\\.k\\.\\s)|(?:\\bc\\.o\\.\\s)|(?:\\bcdn\\.\\s)|(?:\\bcentr\\.\\s)|(?:\\bces\\.\\s)|(?:\\bchem\\.\\s)|(?:\\bchir\\.\\s)|(?:\\bchiń\\.\\s)|(?:\\bcyg\\.\\s)|(?:\\bcyt\\.\\s)|(?:\\bcyw\\.\\s)|(?:\\bczes\\.\\s)|(?:\\bczw\\.\\s)|(?:\\bczyt\\.\\s)|(?:\\bdaw\\.\\s)|(?:\\bdcn\\.\\s)|(?:\\bdekl\\.\\s)|(?:\\bdemokr\\.\\s)|(?:\\bdet\\.\\s)|(?:\\bdiec\\.\\s)|(?:\\bdn\\.\\s)|(?:\\bdo[tlp]\\.\\s)|(?:\\bdost\\.\\s)|(?:\\bdosł\\.\\s)|(?:\\bds\\.\\s)|(?:\\bdst\\.\\s)|(?:\\bduszp\\.\\s)|(?:\\bdypl\\.\\s)|(?:\\bdł\\.\\s)|(?:\\begz\\.\\s)|(?:\\bekol\\.\\s)|(?:\\bekon\\.\\s)|(?:\\belektr\\.\\s)|(?:\\bem\\.\\s)|(?:\\bew\\.\\s)|(?:\\bfab\\.\\s)|(?:\\bfarm\\.\\s)|(?:\\bfot\\.\\s)|(?:\\bfr\\.\\s)|(?:\\bgastr\\.\\s)|(?:\\bgat\\.\\s)|(?:\\bgeogr\\.\\s)|(?:\\bgeol\\.\\s)|(?:\\bgimn\\.\\s)|(?:\\bgm\\.\\s)|(?:\\bgodz\\.\\s)|(?:\\bgosp\\.\\s)|(?:\\bgr\\.\\s)|(?:\\bgram\\.\\s)|(?:\\bgórn\\.\\s)|(?:\\bgłęb\\.\\s)|(?:\\bh\\.c\\.\\s)|(?:\\bhist\\.\\s)|(?:\\bhiszp\\.\\s)|(?:\\bhot\\.\\s)|(?:\\bid\\.\\s)|(?:\\bim\\.\\s)|(?:\\bin\\.\\s)|(?:\\biron\\.\\s)|(?:\\bjn\\.\\s)|(?:\\bjw\\.\\s)|(?:\\bk\\.\\s)|(?:\\bk\\.k\\.\\s)|(?:\\bk\\.p\\.a\\.\\s)|(?:\\bk\\.p\\.c\\.\\s)|(?:\\bk\\.r\\.\\s)|(?:\\bk\\.r\\.o\\.\\s)|(?:\\bkard\\.\\s)|(?:\\bkat\\.\\s)|(?:\\bkatol\\.\\s)|(?:\\bkk\\.\\s)|(?:\\bko?l\\.\\s)|(?:\\bkpc\\.\\s)|(?:\\bkpt\\.\\s)|(?:\\bkr\\.\\s)|(?:\\bkrak\\.\\s)|(?:\\bkryt\\.\\s)|(?:\\bkult\\.\\s)|(?:\\blaic\\.\\s)|(?:\\bm\\.in\\.\\s)|(?:\\bn\\.\\s)|(?:\\bniem\\.\\s)|(?:\\bp[wnl]\\.\\s)|(?:\\bpo[lw]\\.\\s)|(?:\\btj\\.\\s)|(?:\\btzn\\.\\s)|(?:\\bu[lb]\\.\\s?)|(?:\\bur\\.\\s)|(?:\\bw[wł]\\.\\s)|(?:\\bwoj\\.\\s)|(?:\\bwyst\\.\\s)|(?:\\bwył\\.\\s)|(?:\\bzm\\.\\s)|(?:\\bzob\\.\\s)|(?:\\bćw\\.\\s)|(?:\\bćwicz\\.\\s)|(?:\\błac\\.\\s)|(?:\\bżarg\\.\\s)|(?:\\bżyd\\.\\s)|(?:\\bżyw\\.\\s)",
8
- nil,
9
- false],
10
- ["(?:\\b[Oo]k\\.\\s)", "[\\p{Ll}\\d]", false],
11
- ["(?:\\b[Nn]r\\.\\s)", "\\d", false],
12
- ["(?:\\b(?i)mazeł\\stow\\.\\s)", nil, true],
13
- ["(?:['\"„][\\.!?…]['\"”]\\s)|(?:[\\p{Ps}][!?]+[\\p{Pe}] )|(?:\\b(?:[Nn]a|[Pp]o)dkom\\.\\s)|(?:\\b(?:[sS]p|st|[Ss]półdz|społ|spółgł|[Ss]to[łw])\\.\\s)|(?:\\b(?i)gen\\.\\s)|(?:\\b(?i)mgr\\.\\s)|(?:\\b(?i)por\\.\\s)|(?:\\b(?i)pro[fk]\\.\\s)|(?:\\bDz\\.\\s?U\\.\\s)|(?:\\b[,uoi]\\ss\\.\\s)|(?:\\b[Aa]dw\\.\\s)|(?:\\b[Dd]oc\\.\\s)|(?:\\b[Dd]y[wr]\\.\\s)|(?:\\b[Ii]nż\\.\\s)|(?:\\b[Ll]ek\\.\\s)|(?:\\b[Mm]ec\\.\\s)|(?:\\b[Nn]a\\sos\\.\\s)|(?:\\b[Nn]ajśw\\.\\s)|(?:\\b[Oo][Oo]\\.\\s)|(?:\\b[Pp]rzyp\\.\\s)|(?:\\b[Pp]łk\\.\\s)|(?:\\b[Rr]ed\\.\\)?\\s)|(?:\\b[Rr]eż\\.\\s)|(?:\\b[Ss]zer\\.\\s)|(?:\\b[Tt]ow\\.\\s)|(?:\\b[dD][hr]\\.\\s)|(?:\\b[kK]s\\.\\s)|(?:\\b[śŚ][pwW]\\.\\s)|(?:\\b\\p{Lu}\\.\\p{Lu}\\.\\s)|(?:\\b\\p{Lu}\\.\\s\\p{Lu}\\.\\s)|(?:\\bhab\\.\\s)|(?:\\bmed\\.\\s)|(?:\\bo\\.\\s)|(?:\\bpo[zk]\\.\\s)|(?:\\btel\\.\\s)|(?:\\bzach\\.\\s)|(?:\\bzagr\\.\\s)|(?:\\bzak\\.\\s)|(?:\\bzakł\\.\\s)|(?:\\bzal\\.\\s)|(?:\\bzam\\.\\s)|(?:\\bzast\\.\\s)|(?:\\bzaw\\.\\s)|(?:\\bzazw\\.\\s)|(?:\\bzał\\.\\s)|(?:\\bzdr\\.\\s)|(?:\\bzew\\.\\s)|(?:\\bzewn\\.\\s)|(?:\\bzn\\.\\s)",
14
- nil,
15
- false],
16
- ["(?:\\b[rls]\\.\\s)", "[1-9]+", false],
17
- ["(?:\\b[Tt]ab\\.\\s)|(?:\\bmin\\.\\s)|(?:\\bpkt\\.\\s)|(?:\\bstr\\.\\s)",
18
- "[\\p{Ll}\\d]",
19
- false],
20
- ["(?:\\bW\\s\\d{4}\\sr\\.\\s)", "[\\p{Lu}–—„\"-]", false],
21
- ["(?:[\\.!?…]+['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)",
22
- "[\\p{Ps}–—-]\\s?\\p{Ll}",
23
- false],
24
- ["(?:\\bit[dp]\\.\\s)", "[„”\"]?\\p{Ll}", false],
25
- ["(?:\\s[A-Z]\\.\\s)", "\\(\\d", false],
26
- ["(?:\\bpar\\.\\s)|(?:\\bust\\.\\s)", "\\d", false],
27
- ["(?:[!?]+\\p{Pe} )|(?:[\\.!?…]['\"\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\p{Pe}\\s)|(?:[\\[\\(]*\\.\\.\\.[\\]\\)]* )|(?:[\\[\\(]*…[\\]\\)]* )|(?:\\.\\p{L}+\\.\\s)|(?:\\b(?:[CDSR]z|Ch)\\.\\s)|(?:\\b[Gg][rR]\\.\\s)|(?:\\b[Pp]on\\.\\s)|(?:\\b[Ss]ob\\.\\s)|(?:\\b[Zz][Łł]\\.\\s)|(?:\\b[cd]?m\\.\\s)|(?:\\b[d]?kg\\.\\s)|(?:\\b[dm]l\\.\\s)|(?:\\b[ptw]g\\.\\s)|(?:\\b[rwn]\\.\\s)|(?:\\bba!\\s)|(?:\\bcdn\\.\\s)|(?:\\bcos\\.\\s)|(?:\\bcosec\\.\\s)|(?:\\bdag\\.\\s)|(?:\\betc\\.\\s)|(?:\\bit[dp]\\.\\s)|(?:\\bjw\\.\\s)|(?:\\bml[nd]\\.\\s)|(?:\\bn[tn]\\.\\s)|(?:\\bn\\.e\\.\\s)|(?:\\bnpl\\.\\s)|(?:\\brkm\\.\\s)|(?:\\bsec\\.\\s)|(?:\\bsin\\.\\s)|(?:\\bw\\.\\s)",
28
- "\\p{Ll}",
29
- false],
30
- ["(?:[\\p{Ll}&&[^aeouiyęąó]][\\p{Ll}&&[^aeouiyęąó]]+\\.\\s)",
31
- "\\p{Ll}+",
32
- false],
33
- ["(?:[\\s\\(\\[][\\p{L}&&[^rwn]]\\.\\s+)",
34
- "\\p{Ll}\\p{Ll}|\\p{Lu}[\\p{Punct}\\p{Lu}]",
35
- false],
36
- ["(?:\\blit\\.\\s)", "\\p{Ll}\\p{Pe}?\\p{P}?\\s", false],
37
- ["(?:\\s(?<!\\.)[„\"\\p{Ps}]?[A-Z]\\.\\s)|(?:^[„\"]?[A-ZŚĆŻŹ]\\.\\s)",
38
- "\\p{Ll}|\\p{Lu}\\p{Ll}\\p{Ll}+",
39
- false],
40
- ["(?:\\bproc\\.\\)?\\s)", "\\p{Ll}|\\p{Lu}\\p{Lu}+", false],
41
- ["(?:\\b\\d+\\.\\s)", "\\p{Ll}|\\p{Lu}{2,}", false],
42
- ["(?:\\.\\p{Lu}\\p{Ll}\\.\\s?)|(?:\\b\\p{Lu}\\p{Ll}\\.\\s?)",
43
- "\\p{Lu}[^\\p{Lu}]",
44
- false],
45
- ["(?:\\b[pP]\\.\\s)", "\\p{Lu}\\p{Ll}+", false],
46
- ["(?:\\b\\p{L}\\.)", "\\p{L}\\.", false],
47
- ["(?:\\b\\p{L}\\.\\s)", "\\p{L}\\.\\s", false],
48
- ["(?:[\"”']\\s*)", "\\s*\\p{Ll}", false],
49
- ["(?:[\\.!?…]['\"\\p{Pe}\\u00BB\\u2019\\u201D\\u203A\\u0002]*\\s)|(?:\\r?\\n\\s*\\r?\\n[\\t]*)",
50
- nil,
51
- true],
52
- ["(?:[\\.!?…]['»\"”\\p{Pe}]*)", "\\p{Lu}[^\\p{Lu}\\.]", true],
53
- ["(?:\\s\\p{L}[\\.!?…]\\s)", "\\p{Lu}\\p{Ll}", true]]
54
- BEFORE_RE = /(?:#{RULES.map{|s,e,v| "(#{s})"}.join("|")})\Z/m
55
- REGEXPS = RULES.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
7
+ extend Rules
8
+
9
+ BEFORE_RE = /(?:#{self.rules.map{|s,e,v| "(#{s})"}.join("|")})\Z/m
10
+ REGEXPS = self.rules.map{|s,e,v| [/(#{s})\Z/m,/\A(#{e})/m,v] }
56
11
  FIRST_CHAR = /\A./m
57
12
 
58
13
  class SentenceSplitter
@@ -67,50 +22,54 @@ module SRX
67
22
  else
68
23
  @input = text
69
24
  end
25
+ @debug = false
70
26
  end
71
27
 
72
28
  def each
73
- raise "Invalid argument - text is nil" if @input.nil?
74
- buffer_length = 10
75
- sentence = ""
76
- before_buffer = ""
77
- @input.pos = 0
78
- after_buffer = buffer_length.times.map{|i| @input.readchar}.join("")
79
- matched_rule = nil
80
- while(!@input.eof?) do
81
- matched_before = BEFORE_RE.match(before_buffer)
82
- break_detected = false
83
- if matched_before
84
- start_index = (matched_before.size - 1).times.find do |index|
85
- matched_before[index+1]
86
- end
87
- if @debug
88
- puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
89
- end
90
- REGEXPS.each do |before_re,after_re,value|
91
- # skip the whole match
92
- if before_re.match(before_buffer) && after_re.match(after_buffer)
93
- break_detected = true
94
- color = value ? :red : :green
95
- if @debug
96
- sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
97
- end
98
- if value
99
- yield sentence
100
- sentence = ""
29
+ if block_given?
30
+ raise "Invalid argument - text is nil" if @input.nil?
31
+ buffer_length = 10
32
+ sentence = ""
33
+ before_buffer = ""
34
+ @input.pos = 0
35
+ after_buffer = buffer_length.times.map{|i| @input.readchar }.join("")
36
+ while(!@input.eof?) do
37
+ matched_before = BEFORE_RE.match(before_buffer)
38
+ break_detected = false
39
+ if matched_before
40
+ (matched_before.size - 1).times.find do |index|
41
+ matched_before[index+1]
42
+ end
43
+ if @debug
44
+ puts "#{before_buffer}|#{after_buffer.gsub(/\n/,"\\n")}"
45
+ end
46
+ REGEXPS.each do |before_re,after_re,value|
47
+ # skip the whole match
48
+ if before_re.match(before_buffer) && after_re.match(after_buffer)
49
+ break_detected = true
50
+ if @debug
51
+ color = value ? :red : :green
52
+ sentence << Term::ANSIColor.send(color,"<#{before_re}:#{after_re}>")
53
+ end
54
+ if value
55
+ yield sentence
56
+ sentence = ""
57
+ end
58
+ break
101
59
  end
102
- break
103
60
  end
104
61
  end
62
+ next_after = @input.readchar
63
+ before_buffer.sub!(FIRST_CHAR,"") if before_buffer.size >= buffer_length
64
+ after_buffer.sub!(FIRST_CHAR,"")
65
+ before_buffer << $&
66
+ sentence << $&
67
+ after_buffer << next_after
105
68
  end
106
- next_after = @input.readchar
107
- before_buffer.sub!(FIRST_CHAR,"") if before_buffer.size >= buffer_length
108
- after_buffer.sub!(FIRST_CHAR,"")
109
- before_buffer << $&
110
- sentence << $&
111
- after_buffer << next_after
69
+ yield sentence + after_buffer unless sentence.empty? || after_buffer.empty?
70
+ else
71
+ enum_for(:each)
112
72
  end
113
- yield sentence + after_buffer unless sentence.empty? || after_buffer.empty?
114
73
  end
115
74
  end
116
75
  end
data/srx-polish.gemspec CHANGED
@@ -3,9 +3,9 @@ $:.push File.expand_path("../lib", __FILE__)
3
3
 
4
4
  Gem::Specification.new do |s|
5
5
  s.name = "srx-polish"
6
- s.version = "0.2.1"
6
+ s.version = "0.3.0"
7
7
  s.platform = Gem::Platform::RUBY
8
- s.authors = ["Aleksander Pohl"]
8
+ s.authors = ["Aleksander Smywiński-Pohl"]
9
9
  s.email = ["apohllo@o2.pl"]
10
10
  s.homepage = "http://github.com/apohllo/srx2ruby"
11
11
  s.summary = %q{Polish sentence and word segmentation rules.}
@@ -19,5 +19,5 @@ Gem::Specification.new do |s|
19
19
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
20
20
  s.require_paths = ["lib"]
21
21
 
22
- s.add_dependency("term-ansicolor", ["~> 1.0.5"])
22
+ s.add_dependency 'term-ansicolor', '~> 1.0', '>= 1.0.5'
23
23
  end
metadata CHANGED
@@ -1,77 +1,77 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: srx-polish
3
- version: !ruby/object:Gem::Version
4
- prerelease:
5
- version: 0.2.1
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
6
5
  platform: ruby
7
- authors:
8
- - Aleksander Pohl
6
+ authors:
7
+ - Aleksander Smywiński-Pohl
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
-
13
- date: 2011-10-14 00:00:00 Z
14
- dependencies:
15
- - !ruby/object:Gem::Dependency
11
+ date: 2018-07-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
16
14
  name: term-ansicolor
17
- prerelease: false
18
- requirement: &id001 !ruby/object:Gem::Requirement
19
- none: false
20
- requirements:
21
- - - ~>
22
- - !ruby/object:Gem::Version
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.0'
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
23
22
  version: 1.0.5
24
23
  type: :runtime
25
- version_requirements: *id001
26
- description: "Polish sentence and word segmentation rules. The sentence segmentation rules are based on Marcin Mi\xC5\x82kowski's SRX rules."
27
- email:
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: '1.0'
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.0.5
33
+ description: Polish sentence and word segmentation rules. The sentence segmentation
34
+ rules are based on Marcin Miłkowski's SRX rules.
35
+ email:
28
36
  - apohllo@o2.pl
29
37
  executables: []
30
-
31
38
  extensions: []
32
-
33
39
  extra_rdoc_files: []
34
-
35
- files:
36
- - .gitignore
40
+ files:
41
+ - ".gitignore"
42
+ - ".ruby-version"
43
+ - Gemfile
37
44
  - README.rdoc
38
45
  - changelog.txt
39
46
  - features/sentence_splitter.feature
40
47
  - features/steps/sentence_splitter.rb
41
48
  - features/steps/word_splitter.rb
42
49
  - features/word_splitter.feature
50
+ - lib/srx/polish/polish-rules.rb
43
51
  - lib/srx/polish/sentence_splitter.rb
44
52
  - lib/srx/polish/word_splitter.rb
45
53
  - srx-polish.gemspec
46
54
  homepage: http://github.com/apohllo/srx2ruby
47
55
  licenses: []
48
-
56
+ metadata: {}
49
57
  post_install_message:
50
58
  rdoc_options: []
51
-
52
- require_paths:
59
+ require_paths:
53
60
  - lib
54
- required_ruby_version: !ruby/object:Gem::Requirement
55
- none: false
56
- requirements:
61
+ required_ruby_version: !ruby/object:Gem::Requirement
62
+ requirements:
57
63
  - - ">="
58
- - !ruby/object:Gem::Version
59
- version: "0"
60
- required_rubygems_version: !ruby/object:Gem::Requirement
61
- none: false
62
- requirements:
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ required_rubygems_version: !ruby/object:Gem::Requirement
67
+ requirements:
63
68
  - - ">="
64
- - !ruby/object:Gem::Version
65
- version: "0"
69
+ - !ruby/object:Gem::Version
70
+ version: '0'
66
71
  requirements: []
67
-
68
72
  rubyforge_project: srx-polish
69
- rubygems_version: 1.8.5
73
+ rubygems_version: 2.6.14
70
74
  signing_key:
71
- specification_version: 3
75
+ specification_version: 4
72
76
  summary: Polish sentence and word segmentation rules.
73
- test_files:
74
- - features/sentence_splitter.feature
75
- - features/steps/sentence_splitter.rb
76
- - features/steps/word_splitter.rb
77
- - features/word_splitter.feature
77
+ test_files: []