srx-languagetool 0.8.0 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f73cebe4cfa7e771e3250f1f1dc13694b6450e19a4f9adf81769c5a75baa76d5
4
- data.tar.gz: 35f29775c7d85150bc61551e9fb32adcb60cebc392d9d37e01d829b193a7464c
3
+ metadata.gz: 15d222238ca97d49bdcfd3ab55fed3ac9556cd1479ede47c6bcc126eca67db91
4
+ data.tar.gz: c67a1a5931a94b815d45147c091083ee776006e21003c95872b534158bdbd9fa
5
5
  SHA512:
6
- metadata.gz: 548319e33a292724739e81eb28433594dfea9ec8e9e1fd78366d1847e601a5c6e1521e1fa639bd760fd33ee80d45190a83e92fdcb64d17efae730f5ac7958e6d
7
- data.tar.gz: b55939faa805e0e5102a8610c4c89571f47c5002d3c7b7dfa26fa9f8785f83ce4083bec63abad845565091ddfad752c2ca6b9599a88be621e5618053ad5b0394
6
+ metadata.gz: bd4ad4a1740d7c1f880bcfcb32acfd6b636c1491716d59d256d60ae66b0611783f134e116c1c8c16e1acf147aced8484865492fc7293fdd8f0330bf22d622a59
7
+ data.tar.gz: 6c28a74647adcd1a87e26d6c4cba540fcd38f0f71c817a4ce2a59cbcbbc6e21932aee3b56e2ad316fe11c838f48a70547c518a169495f14409acac8ada194d9b
@@ -10,10 +10,10 @@ jobs:
10
10
  - name: Set up Ruby
11
11
  uses: ruby/setup-ruby@v1
12
12
  with:
13
- ruby-version: 2.7.6
13
+ ruby-version: 2.7.7
14
14
  - name: Install
15
15
  run: |
16
- gem install bundler -v 2.3.22
16
+ gem install bundler -v 2.4.1
17
17
  bundle install
18
18
  - name: Type check
19
19
  run: bundle exec solargraph typecheck --level typed
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 2.7.6
1
+ 2.7.7
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  ## [Unreleased]
2
2
 
3
+ ## [0.9.0] - 2022-12-30
4
+
5
+ - Update rules to LanguageTool 6.0
6
+
3
7
  ## [0.8.0] - 2022-09-29
4
8
 
5
9
  - Update rules to LanguageTool 5.9
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- srx-languagetool (0.8.0)
4
+ srx-languagetool (0.9.0)
5
5
  srx (< 1.0)
6
6
 
7
7
  GEM
@@ -9,47 +9,47 @@ GEM
9
9
  specs:
10
10
  ast (2.4.2)
11
11
  backport (1.2.0)
12
- benchmark (0.2.0)
12
+ benchmark (0.2.1)
13
13
  byebug (11.1.3)
14
14
  diff-lcs (1.5.0)
15
15
  e2mmap (0.1.0)
16
16
  jaro_winkler (1.5.4)
17
- json (2.6.2)
17
+ json (2.6.3)
18
18
  kramdown (2.4.0)
19
19
  rexml
20
20
  kramdown-parser-gfm (1.1.0)
21
21
  kramdown (~> 2.0)
22
22
  minitest (5.16.3)
23
- nokogiri (1.13.8-x86_64-darwin)
23
+ nokogiri (1.13.10-x86_64-darwin)
24
24
  racc (~> 1.4)
25
25
  parallel (1.22.1)
26
- parser (3.1.2.1)
26
+ parser (3.1.3.0)
27
27
  ast (~> 2.4.1)
28
- racc (1.6.0)
28
+ racc (1.6.2)
29
29
  rainbow (3.1.1)
30
30
  rake (13.0.6)
31
- regexp_parser (2.6.0)
31
+ regexp_parser (2.6.1)
32
32
  reverse_markdown (2.1.1)
33
33
  nokogiri
34
34
  rexml (3.2.5)
35
- rspec-expectations (3.11.1)
35
+ rspec-expectations (3.12.1)
36
36
  diff-lcs (>= 1.2.0, < 2.0)
37
- rspec-support (~> 3.11.0)
38
- rspec-support (3.11.1)
39
- rubocop (1.36.0)
37
+ rspec-support (~> 3.12.0)
38
+ rspec-support (3.12.0)
39
+ rubocop (1.41.1)
40
40
  json (~> 2.3)
41
41
  parallel (~> 1.10)
42
42
  parser (>= 3.1.2.1)
43
43
  rainbow (>= 2.2.2, < 4.0)
44
44
  regexp_parser (>= 1.8, < 3.0)
45
45
  rexml (>= 3.2.5, < 4.0)
46
- rubocop-ast (>= 1.20.1, < 2.0)
46
+ rubocop-ast (>= 1.23.0, < 2.0)
47
47
  ruby-progressbar (~> 1.7)
48
48
  unicode-display_width (>= 1.4.0, < 3.0)
49
- rubocop-ast (1.21.0)
49
+ rubocop-ast (1.24.1)
50
50
  parser (>= 3.1.1.0)
51
51
  ruby-progressbar (1.11.0)
52
- solargraph (0.47.1)
52
+ solargraph (0.48.0)
53
53
  backport (~> 1.2)
54
54
  benchmark
55
55
  bundler (>= 1.17.2)
@@ -76,6 +76,7 @@ GEM
76
76
  PLATFORMS
77
77
  x86_64-darwin-20
78
78
  x86_64-darwin-21
79
+ x86_64-darwin-22
79
80
 
80
81
  DEPENDENCIES
81
82
  byebug
@@ -87,4 +88,4 @@ DEPENDENCIES
87
88
  srx-languagetool!
88
89
 
89
90
  BUNDLED WITH
90
- 2.3.22
91
+ 2.4.1
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Srx
4
4
  module Languagetool
5
- VERSION = '0.8.0'
5
+ VERSION = '0.9.0'
6
6
  end
7
7
  end
data/lib/srx/segment.srx CHANGED
@@ -4,8 +4,10 @@
4
4
  <formathandle type="start" include="no"></formathandle>
5
5
  <formathandle type="end" include="yes"></formathandle>
6
6
  <formathandle type="isolated" include="no"></formathandle>
7
- <okpsrx:options oneSegmentIncludesAll="no" trimLeadingWhitespaces="no" trimTrailingWhitespaces="no" useJavaRegex="yes" useIcu4JBreakRules="no" treatIsolatedCodesAsWhitespace="no"></okpsrx:options>
8
- <okpsrx:sample language="sr" useMappedRules="yes">Поштовани господине одн. госпођо. Видео сам </okpsrx:sample>
7
+ <okpsrx:options oneSegmentIncludesAll="no" trimLeadingWhitespaces="no" trimTrailingWhitespaces="no" useJavaRegex="yes"></okpsrx:options>
8
+ <okpsrx:sample language="pl" useMappedRules="yes">Als een hoogleraar met emeritaat ('pensioen') is, mag hij de functieaanduiding prof. blijven gebruiken, maar hij heeft tevens het recht gekregen om het bijvoeglijk naamwoord emeritus (Latijn voor 'uitgediend') aan zijn functietitel toe te voegen: em. prof. dr.
9
+ Tussen de twee wereldoorlogen vestigde prof. ir. Messerschmitt zich in Augsburg waar hij met behulp van een oudere, rijke vriendin (met wie hij later trouwde) zijn eerste vliegtuigen bouwde, het waren passagierstoestellen.
10
+ 250 p. n.e.</okpsrx:sample>
9
11
  <okpsrx:rangeRule></okpsrx:rangeRule>
10
12
  </header>
11
13
  <body>
@@ -1084,6 +1086,11 @@
1084
1086
  <beforebreak>[\.!?…]['"\u00BB\u2019\u201D\u203A\u0002]*\p{Pe}\s</beforebreak>
1085
1087
  <afterbreak>\p{Ll}</afterbreak>
1086
1088
  </rule>
1089
+ <!--p. n.e. (błędny podział wiersza)-->
1090
+ <rule break="no">
1091
+ <beforebreak>p\.\s</beforebreak>
1092
+ <afterbreak>n\.\s?e\.</afterbreak>
1093
+ </rule>
1087
1094
  <rule break="yes">
1088
1095
  <beforebreak>[\.!?…]['"\p{Pe}\u00BB\u2019\u201D\u203A\u0002¹²³]*\s</beforebreak>
1089
1096
  <afterbreak></afterbreak>
@@ -1106,7 +1113,7 @@
1106
1113
  <beforebreak>[\u00A0\s]</beforebreak>
1107
1114
  <afterbreak>\n</afterbreak>
1108
1115
  </rule>
1109
- <rule break="no"><!-- Hello (Hi! ) my name is Chris -->
1116
+ <rule break="no">
1110
1117
  <beforebreak>[a-zA-Z][!\?][\s\u00A0]</beforebreak>
1111
1118
  <afterbreak>\)[\s\u00A0][a-zA-Z]</afterbreak>
1112
1119
  </rule>
@@ -1114,96 +1121,96 @@
1114
1121
  <beforebreak>Yahoo![\s\u00A0]</beforebreak>
1115
1122
  <afterbreak>\p{Ll}</afterbreak>
1116
1123
  </rule>
1117
- <rule break="no"><!-- U.S.A (no dot at end) -->
1124
+ <rule break="no">
1118
1125
  <beforebreak>[A-Z]\.[A-Z]\.</beforebreak>
1119
1126
  <afterbreak>[A-Z]\b</afterbreak>
1120
1127
  </rule>
1121
- <rule break="no"><!-- A.I (no dot at end) -->
1128
+ <rule break="no">
1122
1129
  <beforebreak>\bA\.</beforebreak>
1123
1130
  <afterbreak>I\b</afterbreak>
1124
1131
  </rule>
1125
- <rule break="no"><!-- S.I (no dot at end) -->
1132
+ <rule break="no">
1126
1133
  <beforebreak>\bS\.</beforebreak>
1127
1134
  <afterbreak>I\b</afterbreak>
1128
1135
  </rule>
1129
- <rule break="no"><!-- L.A (no dot at end) -->
1136
+ <rule break="no">
1130
1137
  <beforebreak>\bL\.</beforebreak>
1131
1138
  <afterbreak>A\b</afterbreak>
1132
1139
  </rule>
1133
- <rule break="no"><!-- U.S (no dot at end) -->
1140
+ <rule break="no">
1134
1141
  <beforebreak>\bU\.</beforebreak>
1135
1142
  <afterbreak>[SK]\b</afterbreak>
1136
1143
  </rule>
1137
- <rule break="no"><!-- I.S (no dot at end) -->
1144
+ <rule break="no">
1138
1145
  <beforebreak>\bI\.</beforebreak>
1139
1146
  <afterbreak>S\b</afterbreak>
1140
1147
  </rule>
1141
- <rule break="no"><!-- M.Z (no dot at end) -->
1148
+ <rule break="no">
1142
1149
  <beforebreak>\bM\.</beforebreak>
1143
1150
  <afterbreak>Z\b</afterbreak>
1144
1151
  </rule>
1145
- <rule break="no"><!-- URLs without "www."-->
1152
+ <rule break="no">
1146
1153
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
1147
1154
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
1148
1155
  </rule>
1149
- <rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
1156
+ <rule break="no">
1150
1157
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
1151
1158
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl|be|dev|co|fr|dk|se)(\.|\b)</afterbreak>
1152
1159
  </rule>
1153
- <rule break="no"><!-- No. 5 -->
1160
+ <rule break="no">
1154
1161
  <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
1155
1162
  <afterbreak>\p{N}</afterbreak>
1156
1163
  </rule>
1157
- <rule break="no"><!-- Ph.D. -->
1164
+ <rule break="no">
1158
1165
  <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
1159
1166
  <afterbreak>D\.?</afterbreak>
1160
1167
  </rule>
1161
- <rule break="no"><!-- min. -->
1168
+ <rule break="no">
1162
1169
  <beforebreak>\b([Aa]vg|[Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Ll]tda|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|Ing|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
1163
1170
  <afterbreak>[^\p{Lu}]|I</afterbreak>
1164
1171
  </rule>
1165
- <rule break="no"><!-- hr. -->
1172
+ <rule break="no">
1166
1173
  <beforebreak>\b(hr)\.[\s\u00A0]</beforebreak>
1167
1174
  <afterbreak>[^\p{Lu}]|I</afterbreak>
1168
1175
  </rule>
1169
- <rule break="no"><!-- Fig. 8 -->
1176
+ <rule break="no">
1170
1177
  <beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
1171
1178
  <afterbreak>\p{N}|[IXV]+</afterbreak>
1172
1179
  </rule>
1173
- <rule break="no"><!-- Fig. (8) -->
1180
+ <rule break="no">
1174
1181
  <beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
1175
1182
  <afterbreak>\(\p{N}\)</afterbreak>
1176
1183
  </rule>
1177
- <rule break="no"><!-- I'm (...) great! -->
1184
+ <rule break="no">
1178
1185
  <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
1179
1186
  <afterbreak>[^\p{P}]</afterbreak>
1180
1187
  </rule>
1181
- <rule break="no"><!-- I will work with someone (Chris or ...?). -->
1188
+ <rule break="no">
1182
1189
  <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
1183
1190
  <afterbreak>[^\p{P}]</afterbreak>
1184
1191
  </rule>
1185
- <rule break="no"><!-- e.g. -->
1192
+ <rule break="no">
1186
1193
  <beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
1187
1194
  <afterbreak></afterbreak>
1188
1195
  </rule>
1189
- <rule break="no"><!-- vs. -->
1196
+ <rule break="no">
1190
1197
  <beforebreak>\b[Vv]s\.[\s\u00A0]</beforebreak>
1191
1198
  <afterbreak></afterbreak>
1192
1199
  </rule>
1193
- <rule break="no"><!-- pp. -->
1200
+ <rule break="no">
1194
1201
  <beforebreak>\b(pp|PP)\.[\s\u00A0]</beforebreak>
1195
1202
  <afterbreak></afterbreak>
1196
1203
  </rule>
1197
- <rule break="no"><!-- esp. -->
1204
+ <rule break="no">
1198
1205
  <beforebreak>\be[sx]p\.[\s\u00A0]</beforebreak>
1199
1206
  <afterbreak></afterbreak>
1200
1207
  </rule>
1201
1208
  <!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
1202
- <rule break="no"><!-- Etc. -->
1209
+ <rule break="no">
1203
1210
  <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
1204
1211
  <afterbreak>[^\p{Lu}]</afterbreak>
1205
1212
  </rule>
1206
- <rule break="no"><!-- BTW (by the way) -->
1213
+ <rule break="no">
1207
1214
  <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
1208
1215
  <afterbreak></afterbreak>
1209
1216
  </rule>
@@ -1251,39 +1258,39 @@
1251
1258
  <beforebreak>(?i)FRITZ!</beforebreak>
1252
1259
  <afterbreak>(?i)Box</afterbreak>
1253
1260
  </rule>
1254
- <rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 -->
1261
+ <rule break="no">
1255
1262
  <beforebreak>ID.</beforebreak>
1256
1263
  <afterbreak>3|4|Buzz|Crozz</afterbreak>
1257
1264
  </rule>
1258
- <rule break="no"><!-- Ph.D. (see rule PH_D) -->
1265
+ <rule break="no">
1259
1266
  <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
1260
1267
  <afterbreak></afterbreak>
1261
1268
  </rule>
1262
- <rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
1269
+ <rule break="no">
1263
1270
  <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
1264
1271
  <afterbreak></afterbreak>
1265
1272
  </rule>
1266
- <rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
1273
+ <rule break="no">
1267
1274
  <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
1268
1275
  <afterbreak></afterbreak>
1269
1276
  </rule>
1270
- <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
1277
+ <rule break="no">
1271
1278
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1272
1279
  <afterbreak>Eng\.?</afterbreak>
1273
1280
  </rule>
1274
- <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
1281
+ <rule break="no">
1275
1282
  <beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
1276
1283
  <afterbreak>[BM]\.?</afterbreak>
1277
1284
  </rule>
1278
- <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
1285
+ <rule break="no">
1279
1286
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1280
1287
  <afterbreak>Sc\.?</afterbreak>
1281
1288
  </rule>
1282
- <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
1289
+ <rule break="no">
1283
1290
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1284
1291
  <afterbreak>Comp?\.?</afterbreak>
1285
1292
  </rule>
1286
- <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
1293
+ <rule break="no">
1287
1294
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1288
1295
  <afterbreak>Arch\.?</afterbreak>
1289
1296
  </rule>
@@ -1375,7 +1382,7 @@
1375
1382
  <beforebreak>\b\p{L}\.</beforebreak>
1376
1383
  <afterbreak>\p{L}\.</afterbreak>
1377
1384
  </rule>
1378
- <rule break="no"><!-- Jones v. Smith -->
1385
+ <rule break="no">
1379
1386
  <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
1380
1387
  <afterbreak>\p{Lu}\p{L}+</afterbreak>
1381
1388
  </rule>
@@ -1388,7 +1395,7 @@
1388
1395
  <afterbreak>\p{Ll}+</afterbreak>
1389
1396
  </rule>
1390
1397
  <rule break="no">
1391
- <beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak><!-- not 'no'/'in', these could be abbreviations-->
1398
+ <beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak>
1392
1399
  <afterbreak>[\p{N}\p{Ll}]</afterbreak>
1393
1400
  </rule>
1394
1401
  <rule break="no">
@@ -1419,8 +1426,8 @@
1419
1426
  <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
1420
1427
  <afterbreak></afterbreak>
1421
1428
  </rule>
1422
- <rule break="no"><!-- i.e. -->
1423
- <beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
1429
+ <rule break="no">
1430
+ <beforebreak>i\.e\.[\s\u00A0]</beforebreak>
1424
1431
  <afterbreak></afterbreak>
1425
1432
  </rule>
1426
1433
  <rule break="yes">
@@ -1532,37 +1539,44 @@
1532
1539
  </languagerule>
1533
1540
  <languagerule languagerulename="Dutch">
1534
1541
  <rule break="no">
1535
- <!-- sp.a -->
1536
1542
  <beforebreak>\b(sp|SP)</beforebreak>
1537
1543
  <afterbreak>\.[aA]\b</afterbreak>
1538
1544
  </rule>
1539
1545
  <rule break="no">
1540
- <!-- .Net -->
1541
1546
  <beforebreak>\s[.]</beforebreak>
1542
1547
  <afterbreak>[Nn][Ee][Tt](\b|-)</afterbreak>
1543
1548
  </rule>
1544
- <rule break="no"><!-- quoted sentence in sentence -->
1549
+ <rule break="no">
1545
1550
  <beforebreak>[.?!][’'"]</beforebreak>
1546
1551
  <afterbreak> [a-z]</afterbreak>
1547
1552
  </rule>
1548
- <rule break="no"><!-- URLs without "www."-->
1553
+ <rule break="no">
1549
1554
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
1550
1555
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
1551
1556
  </rule>
1552
- <rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
1557
+ <rule break="no">
1553
1558
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
1554
1559
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
1555
1560
  </rule>
1556
- <rule break="no"><!-- Abbreviated books of the Bible and biblical apocrypha-->
1561
+ <rule break="no">
1562
+ <beforebreak>\b(blz|pag|fig)\.\s</beforebreak>
1563
+ <afterbreak>[0-9]</afterbreak>
1564
+ </rule>
1565
+ <!--Abbrevs that can happen in sentence and at end-->
1566
+ <rule break="no">
1567
+ <beforebreak>\b(enz|etc|zat|ambt|al|ver|art|wed|lab|bv|Bros)\.\s</beforebreak>
1568
+ <afterbreak>\p{Ll}</afterbreak>
1569
+ </rule>
1570
+ <rule break="no">
1557
1571
  <beforebreak>\b(Ge?n|Ex|Le?v|Nu?m|D(eu)?t|Jo?z|Ri|R[ei]cht|Sa?m|Ko?n|Kr[on]{0,2}|Neh?|Est?|Jb|Ps|Spr?|Pr[ed]{0,2}|H(oog)?l|Je?s|Je?r|Kl(aagl)?|Ez(ech)?|Da?n|Ho?s|Jl|Am|Ob|Mc|Mi[ch]{0,2}|Nah?|Hk|Hab|Zf|[SZ]ef|Ha?g|Zc|Zach|Ma?l|Ma?t|Mk|Mar|Lk|Jh|H(an)?d|Ro?m|Kor|Ga?l|Ef|Fp|Fil|Ko|[CK]ol|Th|Th?e[s]{1,2}|Tm|Ti?t|Fm|Fil(em)?|Hb|Hebr?|Jk|Ja[ck]|Pe?tr?|Joh|Jud|Op(enb)?|Wijsh|Tob|Sir|Bar|Makk)\.\s</beforebreak>
1558
1572
  <afterbreak></afterbreak>
1559
1573
  </rule>
1560
1574
  <rule break="no">
1561
- <beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|([Pp]rof\.)?[Dd]r|Em|Fa|Kon|Bros|Stb)\.\s</beforebreak>
1562
- <afterbreak></afterbreak>
1575
+ <beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|(Prof\.)?[Dd]r|Em|Fa|Kon|Stb)\.\s</beforebreak>
1576
+ <afterbreak>\p{Lu}</afterbreak>
1563
1577
  </rule>
1564
1578
  <rule break="no">
1565
- <beforebreak>\b(Mej|Mevr|Mgr|Mw|Ndl|Ned|Nl|No|Prof|Secr|Chr|Jac)\.\s</beforebreak>
1579
+ <beforebreak>\b([Mm]ej|[Mm]evr|[Mm]rs|[Mm]s|[Mm]gr|[Mm]w|Ndl|Ned|Nl|No|Prof|[Ss]ecr|Chr|Jac|[Ww]ed)\.\s</beforebreak>
1566
1580
  <afterbreak></afterbreak>
1567
1581
  </rule>
1568
1582
  <rule break="no">
@@ -1570,23 +1584,27 @@
1570
1584
  <afterbreak></afterbreak>
1571
1585
  </rule>
1572
1586
  <rule break="no">
1573
- <beforebreak>\b(abs|abstr|adj|adm|afb|[Aa]fd|afk|afl|milj|zgn|plv|bvb|bv|afm|evt|exp)\.\s</beforebreak>
1587
+ <beforebreak>\b(abs|abstr|adj|adm|[Aa]fb|[Aa]fd|afk|afl|milj|zgn|plv|bvb|afm|evt|exp|vs)\.\s</beforebreak>
1574
1588
  <afterbreak></afterbreak>
1575
1589
  </rule>
1576
1590
  <rule break="no">
1577
- <beforebreak>\b(al|ald|alg|amb|ambt|anat|antrop|apoth)\.\s</beforebreak>
1591
+ <beforebreak>\b(ald|alg|amb|anat|antrop|apoth)\.\s</beforebreak>
1578
1592
  <afterbreak></afterbreak>
1579
1593
  </rule>
1594
+ <rule break="yes">
1595
+ <beforebreak>\seen\sprof\.\s</beforebreak>
1596
+ <afterbreak>\p{Lu}</afterbreak>
1597
+ </rule>
1580
1598
  <rule break="no">
1581
1599
  <beforebreak>\b(alc|bro|opm|acc)\.\s</beforebreak>
1582
1600
  <afterbreak></afterbreak>
1583
1601
  </rule>
1584
1602
  <rule break="no">
1585
- <beforebreak>\b(arch|archeol|art|bc|bep|betr|bez|bibl|bijl|[Bb]ijv)\.\s</beforebreak>
1603
+ <beforebreak>\b(arch|archeolbc|bep|betr|bez|bibl|bijl|[Bb]ijv)\.\s</beforebreak>
1586
1604
  <afterbreak></afterbreak>
1587
1605
  </rule>
1588
1606
  <rule break="no">
1589
- <beforebreak>\b(bijz|blz|bw|ca|cat|centr|cf|cfr|cmpl)\.\s</beforebreak>
1607
+ <beforebreak>\b(bijz|bw|ca|cat|centr|cf|cfr|cmpl)\.\s</beforebreak>
1590
1608
  <afterbreak></afterbreak>
1591
1609
  </rule>
1592
1610
  <rule break="no">
@@ -1594,7 +1612,7 @@
1594
1612
  <afterbreak></afterbreak>
1595
1613
  </rule>
1596
1614
  <rule break="no">
1597
- <beforebreak>\b(ed|em|enz|etc|ev|[Ee]xcl|fa|fam|fig|fin|fl|fr.)\.\s</beforebreak>
1615
+ <beforebreak>\b([Ee]d|em|ev|[Ee]xcl|[Ff]a|[Ff]am|[fF]ig|fin|fl|fr)\.\s</beforebreak>
1598
1616
  <afterbreak></afterbreak>
1599
1617
  </rule>
1600
1618
  <rule break="no">
@@ -1602,39 +1620,39 @@
1602
1620
  <afterbreak></afterbreak>
1603
1621
  </rule>
1604
1622
  <rule break="no">
1605
- <beforebreak>\b(jl|jr|kr|kt|lab|lic|ll|lt|lw|max|mevr|mi|[Mm]in|mld)\.\s</beforebreak>
1623
+ <beforebreak>\b(jl|jr|kr|kt|lic|ll|lt|lw|max|[Mm]evr|mi|[Mm]in|mld)\.\s</beforebreak>
1606
1624
  <afterbreak></afterbreak>
1607
1625
  </rule>
1608
1626
  <rule break="no">
1609
- <beforebreak>\b(mln|mr|mw|nl|no|nr|nrs|ob|obl|ong|onov|o.a)\.\s</beforebreak>
1627
+ <beforebreak>\b(mln|[Mm]r|[Mm]w|nl|no|nr|nrs|ob|obl|ong|onov)\.\s</beforebreak>
1610
1628
  <afterbreak></afterbreak>
1611
1629
  </rule>
1612
1630
  <rule break="no">
1613
- <beforebreak>\b(opm|org|ov|pag|par|penn|([1-3][\.e]?)[\s]?pers|plm|plv)\.\s</beforebreak>
1631
+ <beforebreak>\b(opm|org|ov|[Pp]ag|par|penn|([1-3][\.e]?)[\s]?pers|plm|plv)\.\s</beforebreak>
1614
1632
  <afterbreak></afterbreak>
1615
1633
  </rule>
1616
1634
  <rule break="no">
1617
- <beforebreak>\b(prov|pseud|psych|qty|red|ref|resp|soc|st|tab|tel|temp|tk)\.\s</beforebreak>
1635
+ <beforebreak>\b(prov|pseud|psych|qty|red|ref|resp|soc|st|tab|tel|temp|prof|tk)\.\s</beforebreak>
1618
1636
  <afterbreak></afterbreak>
1619
1637
  </rule>
1620
1638
  <rule break="no">
1621
1639
  <beforebreak>\b([A-Z]|Adr|Chr|Fr|Fred|IJ|Jac|Joh|Ph|St|Th|Tj|v|v\.(\s)?d)\.(\s)?</beforebreak>
1622
- <afterbreak>[A-Z]</afterbreak>
1640
+ <afterbreak>\p{Lu}</afterbreak>
1623
1641
  </rule>
1624
1642
  <rule break="no">
1625
1643
  <beforebreak>\b[vn]\.\s</beforebreak>
1626
1644
  <afterbreak>Chr</afterbreak>
1627
1645
  </rule>
1628
1646
  <rule break="no">
1629
- <beforebreak>\b(uitsl|ver|vgl|vnl|vnw|voorz|ww|zat|[Zz]elfst|zgn?)\.\s</beforebreak>
1630
- <afterbreak></afterbreak>
1647
+ <beforebreak>\b(uitsl|vgl|vnl|vnw|voorz|ww|zat|[Zz]elfst|zgn?)\.\s</beforebreak>
1648
+ <afterbreak>\p{Ll}</afterbreak>
1631
1649
  </rule>
1632
1650
  <rule break="no">
1633
- <beforebreak>\b(mm|cm|km|mg|kg|h|kW|mW)\.\s</beforebreak>
1651
+ <beforebreak>\b(mm|cm|km|ml|mg|kg|h|kW|kg|mW)\.\s</beforebreak>
1634
1652
  <afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
1635
1653
  </rule>
1636
1654
  <rule break="yes">
1637
- <beforebreak>\b(mm|cm|km|ml|kg|kW|h|mg)\.\s</beforebreak>
1655
+ <beforebreak>\b(mm|cm|km|ml|mg|kg|h|kW|kg|mW)\.\s</beforebreak>
1638
1656
  <afterbreak></afterbreak>
1639
1657
  </rule>
1640
1658
  <rule break="no">
@@ -1686,10 +1704,6 @@
1686
1704
  <afterbreak></afterbreak>
1687
1705
  </rule>
1688
1706
  <rule break="no">
1689
- <beforebreak>\b\p{Lu}\p{Ll}\.\s?</beforebreak>
1690
- <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
1691
- </rule>
1692
- <rule break="no">
1693
1707
  <beforebreak>\.\p{Lu}\p{Ll}\.\s?</beforebreak>
1694
1708
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
1695
1709
  </rule>
@@ -1698,14 +1712,6 @@
1698
1712
  <beforebreak>\b\d+\.\s</beforebreak>
1699
1713
  <afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
1700
1714
  </rule>
1701
- <rule break="yes">
1702
- <beforebreak>\been\sprof\.\s</beforebreak>
1703
- <afterbreak>[^\p{Ll}]</afterbreak>
1704
- </rule>
1705
- <rule break="no">
1706
- <beforebreak>\bprof\.\s</beforebreak>
1707
- <afterbreak></afterbreak>
1708
- </rule>
1709
1715
  <rule break="no">
1710
1716
  <beforebreak>[.!?…][’'"]\s</beforebreak>
1711
1717
  <afterbreak>[a-z]</afterbreak>
@@ -1723,11 +1729,11 @@
1723
1729
  <afterbreak>[a-z]</afterbreak>
1724
1730
  </rule>
1725
1731
  <rule break="yes">
1726
- <beforebreak>[\.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002¹²³]*\s</beforebreak>
1732
+ <beforebreak>[.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002¹²³]*\s</beforebreak>
1727
1733
  <afterbreak></afterbreak>
1728
1734
  </rule>
1729
1735
  <rule break="yes">
1730
- <beforebreak>[\.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]*</beforebreak>
1736
+ <beforebreak>[.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]*</beforebreak>
1731
1737
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
1732
1738
  </rule>
1733
1739
  <rule break="yes">
@@ -1768,31 +1774,29 @@
1768
1774
  <afterbreak>['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002][A-Z][a-z]</afterbreak>
1769
1775
  </rule>
1770
1776
  <rule break="no">
1771
- <!-- "E. coli etc. -->
1772
1777
  <beforebreak>"[A-Z][.]\s</beforebreak>
1773
1778
  <afterbreak>[a-z]</afterbreak>
1774
1779
  </rule>
1775
1780
  <rule break="no">
1776
- <!-- Cornelisz. -->
1777
1781
  <beforebreak>[A-Z][a-z].*sz[.]\s</beforebreak>
1778
1782
  <afterbreak>[a-z]</afterbreak>
1779
1783
  </rule>
1780
1784
  <rule break="no">
1781
- <!-- De n. XIV/vagus (nervus) -->
1782
1785
  <beforebreak>De n[.]\s</beforebreak>
1783
1786
  <afterbreak>[a-z]|[XIV]</afterbreak>
1784
1787
  </rule>
1785
1788
  <rule break="no">
1786
- <!-- MOL.E -->
1787
1789
  <beforebreak>[A-Z]{2,5}[.]</beforebreak>
1788
1790
  <afterbreak>[A-Z]</afterbreak>
1789
1791
  </rule>
1790
1792
  <rule break="no">
1791
- <!-- ..." betekent -->
1792
1793
  <beforebreak>\.\.</beforebreak>
1793
1794
  <afterbreak>" [a-z]</afterbreak>
1794
1795
  </rule>
1795
- <!-- ##### end of Dutch #### -->
1796
+ <rule break="no">
1797
+ <beforebreak>\sBTW\.</beforebreak>
1798
+ <afterbreak>\p{Ll}</afterbreak>
1799
+ </rule>
1796
1800
  </languagerule>
1797
1801
  <languagerule languagerulename="Slovak">
1798
1802
  <rule break="no">
@@ -4370,7 +4374,7 @@
4370
4374
  </rule>
4371
4375
  <rule break="no">
4372
4376
  <beforebreak>\b(р|ред|Рис|рус|с|сб|св|См|см|сов|соч|соц|спец|ср|ст|стр|т|тел|Тел|тех|тов|тт|туп)\.\s</beforebreak>
4373
- <afterbreak></afterbreak>
4377
+ <afterbreak>\p{Ll}</afterbreak>
4374
4378
  </rule>
4375
4379
  <rule break="no">
4376
4380
  <beforebreak>\b(руб|Руб|тыс|Тыс|трлн)\.\s</beforebreak>
@@ -4654,7 +4658,7 @@
4654
4658
  <afterbreak>[XIV\d]+\b</afterbreak>
4655
4659
  </rule>
4656
4660
  <rule break="no">
4657
- <beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4661
+ <beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|seg|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4658
4662
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4659
4663
  </rule>
4660
4664
  <!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
@@ -4721,12 +4725,10 @@
4721
4725
  </rule>
4722
4726
  </languagerule>
4723
4727
  <languagerule languagerulename="Spanish">
4724
-
4725
4728
  <rule break="no">
4726
4729
  <beforebreak>¿[^?]+:[\s\u00A0]</beforebreak>
4727
4730
  <afterbreak>.</afterbreak>
4728
4731
  </rule>
4729
-
4730
4732
  <rule break="no">
4731
4733
  <beforebreak>Yahoo![\s\u00A0]</beforebreak>
4732
4734
  <afterbreak>\p{Ll}</afterbreak>
@@ -4742,7 +4744,7 @@
4742
4744
  <!-- initials: A. C. Jones. Problem: [...] de Alfons I. Él era [...] -->
4743
4745
  <rule break="no">
4744
4746
  <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
4745
- <afterbreak/>
4747
+ <afterbreak></afterbreak>
4746
4748
  </rule>
4747
4749
  <!-- Ellipsis: ... lowercase -->
4748
4750
  <rule break="no">
@@ -4772,39 +4774,37 @@
4772
4774
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4773
4775
  </rule>
4774
4776
  <rule break="no">
4775
- <!-- URLs without "www."-->
4776
4777
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
4777
4778
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
4778
4779
  </rule>
4779
4780
  <rule break="no">
4780
- <!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
4781
4781
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
4782
4782
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
4783
4783
  </rule>
4784
4784
  <!-- Abbreviations that cannot finish sentences-->
4785
4785
  <rule break="no">
4786
4786
  <beforebreak>\b((?iu)(en|febr|mzo|abr|my|jun|jul|ag|agt|set|sept|setbre|oct|nov|novbre|dic|dicbre))\.[\s\u00A0]</beforebreak>
4787
- <afterbreak/>
4787
+ <afterbreak></afterbreak>
4788
4788
  </rule>
4789
4789
  <rule break="no">
4790
4790
  <beforebreak>\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
4791
- <afterbreak/>
4791
+ <afterbreak></afterbreak>
4792
4792
  </rule>
4793
4793
  <rule break="no">
4794
4794
  <beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0]</beforebreak>
4795
- <afterbreak/>
4795
+ <afterbreak></afterbreak>
4796
4796
  </rule>
4797
4797
  <rule break="no">
4798
4798
  <beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0]</beforebreak>
4799
- <afterbreak/>
4799
+ <afterbreak></afterbreak>
4800
4800
  </rule>
4801
4801
  <rule break="no">
4802
4802
  <beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0]</beforebreak>
4803
- <afterbreak/>
4803
+ <afterbreak></afterbreak>
4804
4804
  </rule>
4805
4805
  <rule break="no">
4806
4806
  <beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0]</beforebreak>
4807
- <afterbreak/>
4807
+ <afterbreak></afterbreak>
4808
4808
  </rule>
4809
4809
  <!-- Abbreviations that can finish sentences -->
4810
4810
  <rule break="no">
@@ -4837,7 +4837,7 @@
4837
4837
  <!-- Composed abbrev. -->
4838
4838
  <rule break="no">
4839
4839
  <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4840
- <afterbreak/>
4840
+ <afterbreak></afterbreak>
4841
4841
  </rule>
4842
4842
  <!-- Units -->
4843
4843
  <rule break="no">
@@ -4859,11 +4859,11 @@
4859
4859
  </rule>
4860
4860
  </languagerule>
4861
4861
  <languagerule languagerulename="German">
4862
- <rule break="no"><!-- URLs without "www."-->
4862
+ <rule break="no">
4863
4863
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
4864
4864
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
4865
4865
  </rule>
4866
- <rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
4866
+ <rule break="no">
4867
4867
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
4868
4868
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
4869
4869
  </rule>
@@ -4882,27 +4882,27 @@
4882
4882
  <beforebreak>[^-\p{L}'’/°]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
4883
4883
  <afterbreak></afterbreak>
4884
4884
  </rule>
4885
- <rule break="no"><!-- special case: "Das 1. Internationale Filmfestival findet nächste Woche statt." -->
4885
+ <rule break="no">
4886
4886
  <beforebreak>([Dd](as|er|ie|iese[rsmn]?|en|em)|[kmsd]?ein(e[rsnm]?)?|am|fürs|ins|zum|im|am|zur) \d+\.[\u00A0\s]+</beforebreak>
4887
4887
  <afterbreak>[A-ZÄÖÜ].*</afterbreak>
4888
4888
  </rule>
4889
4889
  <rule break="no">
4890
- <beforebreak>Ust.</beforebreak><!-- needed for German rule UST_ID -->
4890
+ <beforebreak>Ust.</beforebreak>
4891
4891
  <afterbreak>Id</afterbreak>
4892
4892
  </rule>
4893
4893
  <rule break="no">
4894
- <beforebreak>Prof.</beforebreak><!-- needed for German rule ABKUERZUNG_LEERZEICHEN -->
4894
+ <beforebreak>Prof.</beforebreak>
4895
4895
  <afterbreak>Dr</afterbreak>
4896
4896
  </rule>
4897
4897
  <rule break="no">
4898
- <beforebreak>Dr.</beforebreak><!-- needed for German rule ABKUERZUNG_LEERZEICHEN -->
4898
+ <beforebreak>Dr.</beforebreak>
4899
4899
  <afterbreak>iur|med|oec|phil|rer|theol</afterbreak>
4900
4900
  </rule>
4901
4901
  <rule break="no">
4902
4902
  <beforebreak>(?i)FRITZ!</beforebreak>
4903
4903
  <afterbreak>(?i)Box</afterbreak>
4904
4904
  </rule>
4905
- <rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 -->
4905
+ <rule break="no">
4906
4906
  <beforebreak>ID.</beforebreak>
4907
4907
  <afterbreak>3|4|Buzz|Crozz</afterbreak>
4908
4908
  </rule>
@@ -4972,7 +4972,7 @@
4972
4972
  </rule>
4973
4973
  <!-- don't split at cases like "Friedrich II. wird auch..." -->
4974
4974
  <rule break="no">
4975
- <beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]{1,2}</beforebreak>
4975
+ <beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]{1,2}</beforebreak>
4976
4976
  <afterbreak>[^\p{Lu}]+</afterbreak>
4977
4977
  </rule>
4978
4978
  <!-- don't split at cases like "im 13. oder 14. Jahrhundert" -->
@@ -5012,11 +5012,11 @@
5012
5012
  </rule>
5013
5013
  <!-- German abbreviations -->
5014
5014
  <rule break="no">
5015
- <beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|versch|[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Aa]bzü?gl|[Ll]tda|[Ee]inschl|[Vv]mtl|Ev|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]{1,2}</beforebreak>
5015
+ <beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|[Pp]arl|versch|[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Aa]bzü?gl|[Ll]tda|[Ee]inschl|[Vv]mtl|Ev|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]{1,2}</beforebreak>
5016
5016
  <afterbreak></afterbreak>
5017
5017
  </rule>
5018
5018
  <rule break="no">
5019
- <beforebreak>\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|exkl|Expl|Exz)\.[\u00A0\s]{1,2}</beforebreak>
5019
+ <beforebreak>\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|[Ee]xkl|Expl|Exz)\.[\u00A0\s]{1,2}</beforebreak>
5020
5020
  <afterbreak></afterbreak>
5021
5021
  </rule>
5022
5022
  <rule break="no">
@@ -5028,11 +5028,11 @@
5028
5028
  <afterbreak>\p{Ll}</afterbreak>
5029
5029
  </rule>
5030
5030
  <rule break="no">
5031
- <beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|frz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]{1,2}</beforebreak>
5031
+ <beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|[Ff]rz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]{1,2}</beforebreak>
5032
5032
  <afterbreak></afterbreak>
5033
5033
  </rule>
5034
5034
  <rule break="no">
5035
- <beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|inkl|incl|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]{1,2}</beforebreak>
5035
+ <beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|[Ii]nkl|[Ii]ncl|[Ee]hem|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]{1,2}</beforebreak>
5036
5036
  <afterbreak></afterbreak>
5037
5037
  </rule>
5038
5038
  <rule break="no">
@@ -5048,7 +5048,11 @@
5048
5048
  <afterbreak>\p{Ll}</afterbreak>
5049
5049
  </rule>
5050
5050
  <rule break="no">
5051
- <beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|[Aa][bn]schl|sw|kl|[Gg]r|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]{1,2}</beforebreak>
5051
+ <beforebreak>\d+\.\d+\.[\u00A0\s]</beforebreak>
5052
+ <afterbreak>[\-–][\u00A0\s]\d+</afterbreak>
5053
+ </rule>
5054
+ <rule break="no">
5055
+ <beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|[Aa][bn]schl|sw|kl|[Gg]r|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|[Rr]echts?staatl|[Ss]taatl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]{1,2}</beforebreak>
5052
5056
  <afterbreak></afterbreak>
5053
5057
  </rule>
5054
5058
  <!-- Break rules -->
@@ -5159,27 +5163,27 @@
5159
5163
  <beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sep|Sept|Oct|Okt|Nov|Dec|PhD|al|cf|Inc|Ms|Gen|Sen|Prof|Corp|Co)\.\s</beforebreak>
5160
5164
  <afterbreak></afterbreak>
5161
5165
  </rule>
5162
- <rule break="no"><!-- Ph.D. -->
5166
+ <rule break="no">
5163
5167
  <beforebreak>\bP[Hh]\.\s?</beforebreak>
5164
5168
  <afterbreak>D\.?</afterbreak>
5165
5169
  </rule>
5166
- <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
5170
+ <rule break="no">
5167
5171
  <beforebreak>\b[BM]\.\s?</beforebreak>
5168
5172
  <afterbreak>Eng\.?</afterbreak>
5169
5173
  </rule>
5170
- <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
5174
+ <rule break="no">
5171
5175
  <beforebreak>\bLL\.\s?</beforebreak>
5172
5176
  <afterbreak>[BM]\.?</afterbreak>
5173
5177
  </rule>
5174
- <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
5178
+ <rule break="no">
5175
5179
  <beforebreak>\b[BM]\.\s?</beforebreak>
5176
5180
  <afterbreak>Sc\.?</afterbreak>
5177
5181
  </rule>
5178
- <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
5182
+ <rule break="no">
5179
5183
  <beforebreak>\b[BM]\.\s?</beforebreak>
5180
5184
  <afterbreak>Comp?\.?</afterbreak>
5181
5185
  </rule>
5182
- <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
5186
+ <rule break="no">
5183
5187
  <beforebreak>\b[BM]\.\s?</beforebreak>
5184
5188
  <afterbreak>Arch\.?</afterbreak>
5185
5189
  </rule>
@@ -5309,16 +5313,15 @@
5309
5313
  <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
5310
5314
  <afterbreak></afterbreak>
5311
5315
  </rule>
5312
- <rule break="no"><!-- URLs without "www."-->
5316
+ <rule break="no">
5313
5317
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
5314
5318
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
5315
5319
  </rule>
5316
- <rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
5320
+ <rule break="no">
5317
5321
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
5318
5322
  <afterbreak>[A-Za-z0-9\-]+\.(fr|com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
5319
5323
  </rule>
5320
5324
  <rule break="no">
5321
- <!-- gaffa.org -->
5322
5325
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
5323
5326
  <afterbreak>[A-Za-z]{2,5}(\.|\b)</afterbreak>
5324
5327
  </rule>
@@ -5363,15 +5366,15 @@
5363
5366
  <beforebreak>\b\p{L}\.</beforebreak>
5364
5367
  <afterbreak>\p{L}\.</afterbreak>
5365
5368
  </rule>
5366
- <rule break="no"><!-- Je suis (...) Chris. -->
5369
+ <rule break="no">
5367
5370
  <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
5368
5371
  <afterbreak>[^\p{P}]</afterbreak>
5369
5372
  </rule>
5370
- <rule break="no"><!-- Je suis (...?) Chris. -->
5373
+ <rule break="no">
5371
5374
  <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
5372
5375
  <afterbreak>[^\p{P}]</afterbreak>
5373
5376
  </rule>
5374
- <rule break="no"><!-- Jones v. Smith -->
5377
+ <rule break="no">
5375
5378
  <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
5376
5379
  <afterbreak>\p{Lu}\p{L}+</afterbreak>
5377
5380
  </rule>
@@ -5411,44 +5414,44 @@
5411
5414
  <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
5412
5415
  <afterbreak></afterbreak>
5413
5416
  </rule>
5414
- <rule break="no"><!-- i.e. -->
5415
- <beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
5417
+ <rule break="no">
5418
+ <beforebreak>i\.e\.[\s\u00A0]</beforebreak>
5416
5419
  <afterbreak></afterbreak>
5417
5420
  </rule>
5418
- <rule break="no"><!-- U.S.A (no dot at end) -->
5421
+ <rule break="no">
5419
5422
  <beforebreak>[A-Z]\.[A-Z]\.</beforebreak>
5420
5423
  <afterbreak>[A-Z]\b</afterbreak>
5421
5424
  </rule>
5422
- <rule break="no"><!-- L.A (no dot at end) -->
5425
+ <rule break="no">
5423
5426
  <beforebreak>\bL\.</beforebreak>
5424
5427
  <afterbreak>A\b</afterbreak>
5425
5428
  </rule>
5426
- <rule break="no"><!-- U.S (no dot at end) -->
5429
+ <rule break="no">
5427
5430
  <beforebreak>\bU\.</beforebreak>
5428
5431
  <afterbreak>[SK]\b</afterbreak>
5429
5432
  </rule>
5430
- <rule break="no"><!-- No. 5 -->
5433
+ <rule break="no">
5431
5434
  <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
5432
5435
  <afterbreak>\p{N}</afterbreak>
5433
5436
  </rule>
5434
- <rule break="no"><!-- Ph.D. -->
5437
+ <rule break="no">
5435
5438
  <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
5436
5439
  <afterbreak>D\.?</afterbreak>
5437
5440
  </rule>
5438
- <rule break="no"><!-- e.g. -->
5441
+ <rule break="no">
5439
5442
  <beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
5440
5443
  <afterbreak></afterbreak>
5441
5444
  </rule>
5442
- <rule break="no"><!-- vs. -->
5445
+ <rule break="no">
5443
5446
  <beforebreak>\bvs\.[\s\u00A0]</beforebreak>
5444
5447
  <afterbreak></afterbreak>
5445
5448
  </rule>
5446
5449
  <!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
5447
- <rule break="no"><!-- Etc. -->
5450
+ <rule break="no">
5448
5451
  <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
5449
5452
  <afterbreak>[^\p{Lu}]</afterbreak>
5450
5453
  </rule>
5451
- <rule break="no"><!-- BTW (by the way) -->
5454
+ <rule break="no">
5452
5455
  <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
5453
5456
  <afterbreak></afterbreak>
5454
5457
  </rule>
@@ -5456,39 +5459,39 @@
5456
5459
  <beforebreak>(?i)FRITZ!</beforebreak>
5457
5460
  <afterbreak>(?i)Box</afterbreak>
5458
5461
  </rule>
5459
- <rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 -->
5462
+ <rule break="no">
5460
5463
  <beforebreak>ID.</beforebreak>
5461
5464
  <afterbreak>3|4|Buzz|Crozz</afterbreak>
5462
5465
  </rule>
5463
- <rule break="no"><!-- Ph.D. (see rule PH_D) -->
5466
+ <rule break="no">
5464
5467
  <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
5465
5468
  <afterbreak></afterbreak>
5466
5469
  </rule>
5467
- <rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
5470
+ <rule break="no">
5468
5471
  <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
5469
5472
  <afterbreak></afterbreak>
5470
5473
  </rule>
5471
- <rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
5474
+ <rule break="no">
5472
5475
  <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
5473
5476
  <afterbreak></afterbreak>
5474
5477
  </rule>
5475
- <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
5478
+ <rule break="no">
5476
5479
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5477
5480
  <afterbreak>Eng\.?</afterbreak>
5478
5481
  </rule>
5479
- <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
5482
+ <rule break="no">
5480
5483
  <beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
5481
5484
  <afterbreak>[BM]\.?</afterbreak>
5482
5485
  </rule>
5483
- <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
5486
+ <rule break="no">
5484
5487
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5485
5488
  <afterbreak>Sc\.?</afterbreak>
5486
5489
  </rule>
5487
- <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
5490
+ <rule break="no">
5488
5491
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5489
5492
  <afterbreak>Comp?\.?</afterbreak>
5490
5493
  </rule>
5491
- <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
5494
+ <rule break="no">
5492
5495
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5493
5496
  <afterbreak>Arch\.?</afterbreak>
5494
5497
  </rule>
@@ -5534,7 +5537,6 @@
5534
5537
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
5535
5538
  </rule>
5536
5539
  </languagerule>
5537
-
5538
5540
  <languagerule languagerulename="Ukrainian">
5539
5541
  <!-- when sentence starts with ellipsis: ...Мазій і Юхим теж. -->
5540
5542
  <rule break="no">
@@ -5547,8 +5549,8 @@
5547
5549
  <afterbreak>\p{Lu}</afterbreak>
5548
5550
  </rule>
5549
5551
  <rule break="no">
5550
- <beforebreak>[.!?…][\h]+</beforebreak>
5551
- <afterbreak>[\h]*([«"„“(]|[&#x2010;-&#x2015;-][\h])\p{Ll}</afterbreak>
5552
+ <beforebreak>[.!?…][»“]?[\h]+</beforebreak>
5553
+ <afterbreak>[\h]*([«"„“(]|[‐-―-][\h])\p{Ll}</afterbreak>
5552
5554
  </rule>
5553
5555
  <rule break="yes">
5554
5556
  <beforebreak>\v[\h]*</beforebreak>
@@ -5562,7 +5564,7 @@
5562
5564
  <!-- various punctuation between lowercase letters -->
5563
5565
  <rule break="no">
5564
5566
  <beforebreak>\b\p{Ll}+[.!?][\h\v]*</beforebreak>
5565
- <afterbreak>\h*(([\(«]|[\[&#x2010;-&#x2015;-][\h\v]*)?\p{Ll})</afterbreak>
5567
+ <afterbreak>\h*(([\(«]|[\[‐-―-][\h\v]*)?\p{Ll})</afterbreak>
5566
5568
  </rule>
5567
5569
  <rule break="no">
5568
5570
  <beforebreak>([\[\(]*[\]\)]*|\.\.\.|…)[\h\v]+</beforebreak>
@@ -5583,7 +5585,6 @@
5583
5585
  <beforebreak>(^[\h\v]*|\([\h\v]*|[«„"]|(\b[А-ЯІЇЄҐACEIHOPX]\.-))[А-ЯІЇЄҐA-Z]\.[\h\v]*</beforebreak>
5584
5586
  <afterbreak></afterbreak>
5585
5587
  </rule>
5586
- <!-- І. В. Коваль, Т. 2, C. 202 -->
5587
5588
  <!-- Іван Ч. (1914 р. н.) -->
5588
5589
  <rule break="no">
5589
5590
  <beforebreak>[\h\v][А-ЯІЇЄҐ]\.[\h\v]*</beforebreak>
@@ -5606,7 +5607,7 @@
5606
5607
  -->
5607
5608
  <rule break="no">
5608
5609
  <beforebreak>\b([0-9]{2}|[0-9]{4})[\h\v]+р\.[\h\v]+</beforebreak>
5609
- <afterbreak>[\h\v]*[№0-9&#x2010;-&#x2015;-]</afterbreak>
5610
+ <afterbreak>[\h\v]*[№0-9‐-―-]</afterbreak>
5610
5611
  </rule>
5611
5612
  <!-- річка - р. Дніпро -->
5612
5613
  <rule break="no">
@@ -5615,7 +5616,7 @@
5615
5616
  </rule>
5616
5617
  <!-- У травні 1949 р. Грушківський район -->
5617
5618
  <rule break="no">
5618
- <beforebreak>[А-ЯІЇЄҐ][а-яіїєґ'’-]*([\h]+[а-яіїєґ'’-]+)?[\h](\d{4}[&#x2010;-&#x2015;-])*\d{4}[\h]*р\.[\h\v]*</beforebreak>
5619
+ <beforebreak>[А-ЯІЇЄҐ][а-яіїєґ'’-]*([\h]+[а-яіїєґ'’-]+)?[\h](\d{4}[‐-―-])*\d{4}[\h]*р\.[\h\v]*</beforebreak>
5619
5620
  <afterbreak>[\v\h]*(?!(На|Але|Так?)[\h\v]+)[А-ЯІЇЄҐA-Z][^\h\v]</afterbreak>
5620
5621
  </rule>
5621
5622
  <!-- 15 вересня 1995 р. Україною було підписно -->
@@ -5635,22 +5636,27 @@
5635
5636
  </rule>
5636
5637
  <!-- усталені скорочення, що не збігаються з нескороченими словами -->
5637
5638
  <rule break="no">
5638
- <!-- unfortunately \b ignores \u0301 -->
5639
- <beforebreak>\b(укр|рос|англ|амер|італ|ісп|нім|фр(анц)?|лат|грец(ьк))\.[\h\v]*</beforebreak>
5639
+ <beforebreak>\b(укр|рос|англ?|амер|італ|ісп|нім|фр(анц)?|лат|грец(ьк)?)\.[\h\v]*</beforebreak>
5640
5640
  <afterbreak></afterbreak>
5641
5641
  </rule>
5642
5642
  <rule break="no">
5643
- <!-- unfortunately \b ignores \u0301 -->
5644
- <beforebreak>\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]]ел|ч|част)\.[\h\v]*</beforebreak>
5643
+ <beforebreak>\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нпр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]]ел|ч|част)\.[\h\v]*</beforebreak>
5645
5644
  <afterbreak></afterbreak>
5646
5645
  </rule>
5647
5646
  <rule break="no">
5648
- <!-- unfortunately \b ignores \u0301 -->
5649
5647
  <beforebreak>\b[сС]т\.[\h\v]</beforebreak>
5650
5648
  <afterbreak>[\h]*(?!([АВУОІЄ]|На|Але|Так?)[\h\v])</afterbreak>
5651
5649
  </rule>
5650
+ <!-- нар. 1945 р. | (1966 р. нар.) | 1975 — нар. Осипчук -->
5651
+ <rule break="no">
5652
+ <beforebreak>([0-9]|[-–—])[\h\v]+нар\.[\h\v]*</beforebreak>
5653
+ <afterbreak></afterbreak>
5654
+ </rule>
5655
+ <rule break="no">
5656
+ <beforebreak>\bнар\.[\h\v]*</beforebreak>
5657
+ <afterbreak>([0-9]|бл\.|арт\.)</afterbreak>
5658
+ </rule>
5652
5659
  <rule break="no">
5653
- <!-- no break only for дол. США -->
5654
5660
  <beforebreak>\bдол\.[\h\v]*</beforebreak>
5655
5661
  <afterbreak>США</afterbreak>
5656
5662
  </rule>
@@ -5666,7 +5672,7 @@
5666
5672
  </rule>
5667
5673
  <!-- Верховний орган, див. Африканський національний конгрес -->
5668
5674
  <rule break="no">
5669
- <beforebreak>[,&#x2010;-&#x2015;-][\h\v]*(див)\.[\h\v]*</beforebreak>
5675
+ <beforebreak>[,‐-―-][\h\v]*(див)\.[\h\v]*</beforebreak>
5670
5676
  <afterbreak></afterbreak>
5671
5677
  </rule>
5672
5678
  <!-- скорочення в дужках:
@@ -5678,10 +5684,14 @@
5678
5684
  </rule>
5679
5685
  <!-- abbreviation with proper noun: проф. Грицько, о. Лісове -->
5680
5686
  <rule break="no">
5681
- <beforebreak>\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|упоряд|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak>
5687
+ <beforebreak>\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|дир|тов|упоряд|тт|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak>
5682
5688
  <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
5683
5689
  </rule>
5684
5690
  <rule break="no">
5691
+ <beforebreak>(?&lt;![іи]\s+)\bдр\.[\h\v]*</beforebreak>
5692
+ <afterbreak>[\h\v]*[А-ЯІЇЄҐ]</afterbreak>
5693
+ </rule>
5694
+ <rule break="no">
5685
5695
  <beforebreak>\bМан\.[\h\v]*</beforebreak>
5686
5696
  <afterbreak>[\h\v]*([Сс]іті|[Юю]н)</afterbreak>
5687
5697
  </rule>
@@ -5690,18 +5700,16 @@
5690
5700
  <beforebreak>[^0-9][\h\v]+[Гг]р\.[\h\v]*</beforebreak>
5691
5701
  <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
5692
5702
  </rule>
5693
- <!-- арт. - артикул -->
5694
5703
  <!-- TODO: арт. - артист -->
5695
5704
  <rule break="no">
5696
5705
  <beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис)\.[\h\v]*</beforebreak>
5697
5706
  <afterbreak>[\h\v]*[0-9]</afterbreak>
5698
5707
  </rule>
5699
- <!-- ХІІ р., 3-6 арт. -->
5708
+ <!-- ХІІ р., 3-6 арт., 2-3 тт. -->
5700
5709
  <rule break="no">
5701
- <beforebreak>[0-9][\h\v]+арт\.[\h\v]*</beforebreak>
5710
+ <beforebreak>[0-9][\h\v]+(арт|тт)\.[\h\v]*</beforebreak>
5702
5711
  <afterbreak></afterbreak>
5703
5712
  </rule>
5704
- <!-- місто, але принаймні з парою літер в назві бо є ще метри (м) -->
5705
5713
  <!-- але розбиваємо «всього 20 м. Почалося» -->
5706
5714
  <rule break="no">
5707
5715
  <beforebreak>(?&lt;!\d[\h\v]*)\bм\.[\h\v]*</beforebreak>
@@ -5725,10 +5733,8 @@
5725
5733
  <!-- статус правових держав. — Авт.). -->
5726
5734
  <rule break="no">
5727
5735
  <beforebreak></beforebreak>
5728
- <afterbreak>[\h\v]*[&#x2010;-&#x2015;-][\h\v]*([Рр]ед|[Аа]вт)[\h\v]*\.[\)\]]</afterbreak>
5736
+ <afterbreak>[\h\v]*[‐-―-][\h\v]*([Рр]ед|[Аа]вт)[\h\v]*\.[\)\]]</afterbreak>
5729
5737
  </rule>
5730
- <!-- force the break -->
5731
- <!-- часто зустрічається крапка+U+202F+пробіл, який srx чомусь не розбиває на речення -->
5732
5738
  <!-- але лишаємо ініціали: С.\u202F Шелухин -->
5733
5739
  <rule break="yes">
5734
5740
  <beforebreak>(?&lt;!\h[А-ЯІЇЄҐ])[.!?…]{1,3}\u202F[\h\v]+</beforebreak>
@@ -5746,10 +5752,9 @@
5746
5752
  <!-- “Слон” (2008 р.) У минулому харків’янка -->
5747
5753
  <rule break="yes">
5748
5754
  <beforebreak>[.!?…]['»"„“”)\]›]?[\h\v]+</beforebreak>
5749
- <afterbreak>([&#x2010;-&#x2015;-][\h\v]*)?\p{Lu}[^\p{Lu}]</afterbreak>
5755
+ <afterbreak>([‐-―-][\h\v]*)?\p{Lu}[^\p{Lu}]</afterbreak>
5750
5756
  </rule>
5751
5757
  </languagerule>
5752
-
5753
5758
  <languagerule languagerulename="Belarusian">
5754
5759
  <rule break="no">
5755
5760
  <beforebreak>\b\d+\.\s</beforebreak>
@@ -6016,11 +6021,11 @@
6016
6021
  </rule>
6017
6022
  </languagerule>
6018
6023
  <languagerule languagerulename="Portuguese">
6019
- <rule break="no"><!-- URLs without "www."-->
6024
+ <rule break="no">
6020
6025
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
6021
6026
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
6022
6027
  </rule>
6023
- <rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
6028
+ <rule break="no">
6024
6029
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
6025
6030
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
6026
6031
  </rule>
@@ -6515,7 +6520,7 @@
6515
6520
  </rule>
6516
6521
  <!--Не раздвајај у случају као на пр.: "Петар I дошао је ..."-->
6517
6522
  <rule break="no">
6518
- <beforebreak>[\s ][IVX]+\s</beforebreak>
6523
+ <beforebreak>[\s ][IVX]+\s</beforebreak>
6519
6524
  <afterbreak>[^\p{Lu}]+</afterbreak>
6520
6525
  </rule>
6521
6526
  <!--Не раздвајај у случају као "од 13. до 14. века"-->
@@ -6654,83 +6659,83 @@
6654
6659
  </rule>
6655
6660
  </languagerule>
6656
6661
  <languagerule languagerulename="Arabic">
6657
- <rule break="no">
6658
- <beforebreak>\bwww\.</beforebreak>
6659
- <afterbreak>\w</afterbreak>
6660
- </rule>
6661
- <rule break="no">
6662
- <beforebreak>[\[\(]*…[\]\)]* </beforebreak>
6663
- <afterbreak>\p{Ll}</afterbreak>
6664
- </rule>
6665
- <rule break="no">
6666
- <beforebreak>\p{Ps}[!?؟]+\p{Pe} </beforebreak>
6667
- <afterbreak></afterbreak>
6668
- </rule>
6669
- <rule break="no">
6670
- <beforebreak>[\.!?؟…]+\p{Pe} </beforebreak>
6671
- <afterbreak>\p{Ll}</afterbreak>
6672
- </rule>
6673
- <rule break="no">
6674
- <beforebreak>[«»"”']\s*</beforebreak>
6675
- <afterbreak>\s*\p{Ll}</afterbreak>
6676
- </rule>
6677
- <rule break="no">
6678
- <beforebreak>[«'"„][\.!?؟…]['"”»]\s</beforebreak>
6679
- <afterbreak></afterbreak>
6680
- </rule>
6681
- <rule break="no">
6682
- <beforebreak>\b\p{L}\.\s</beforebreak>
6683
- <afterbreak>\p{L}\.\s</afterbreak>
6684
- </rule>
6685
- <rule break="no">
6686
- <beforebreak>\b\p{L}\.</beforebreak>
6687
- <afterbreak>\p{L}\.</afterbreak>
6688
- </rule>
6689
- <rule break="yes">
6690
- <beforebreak>[^,،][\s]\p{L}{2}\.\s</beforebreak>
6691
- <afterbreak>\p{N}+\)\s</afterbreak>
6692
- </rule>
6693
- <rule break="no">
6694
- <beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak>
6695
- <afterbreak>[\p{N}\p{Ll}]</afterbreak>
6696
- </rule>
6697
- <rule break="no">
6698
- <beforebreak>[\[\(]*\.\.\.[\]\)]* </beforebreak>
6699
- <afterbreak>[^\p{Lu}]</afterbreak>
6700
- </rule>
6701
- <rule break="no">
6702
- <beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
6703
- <afterbreak></afterbreak>
6704
- </rule>
6705
- <rule break="no">
6706
- <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
6707
- <afterbreak></afterbreak>
6708
- </rule>
6709
- <rule break="no">
6710
- <beforebreak>[^\.]\s[ابتقجحخدذصضعغفقكلمنهوىيءةأ١٢٣٤٥٦٧٨٩٠A-Z]\.\s</beforebreak>
6711
- <afterbreak></afterbreak>
6712
- </rule>
6713
- <rule break="no">
6714
- <beforebreak>[^\.]\s[\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u0653\u0654\u0655\u0656\u0640]\.\s</beforebreak>
6715
- <afterbreak></afterbreak>
6716
- </rule>
6717
- <rule break="no">
6718
- <beforebreak>\(\p{Ll}+\.\s</beforebreak>
6719
- <afterbreak></afterbreak>
6720
- </rule>
6721
- <rule break="yes">
6722
- <beforebreak>[\.!?؟…][«»\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*\s</beforebreak>
6723
- <afterbreak></afterbreak>
6724
- </rule>
6725
- <rule break="yes">
6726
- <beforebreak>[\.!?؟…][«»'"\u00BB\u2019\u201D\u203A\p{Pe}\u0002]*</beforebreak>
6727
- <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
6728
- </rule>
6729
- <rule break="yes">
6730
- <beforebreak>\s\p{L}[\.!?؟…]\s</beforebreak>
6731
- <afterbreak>\p{Lu}\p{Ll}</afterbreak>
6732
- </rule>
6733
- </languagerule>
6662
+ <rule break="no">
6663
+ <beforebreak>\bwww\.</beforebreak>
6664
+ <afterbreak>\w</afterbreak>
6665
+ </rule>
6666
+ <rule break="no">
6667
+ <beforebreak>[\[\(]*…[\]\)]* </beforebreak>
6668
+ <afterbreak>\p{Ll}</afterbreak>
6669
+ </rule>
6670
+ <rule break="no">
6671
+ <beforebreak>\p{Ps}[!?؟]+\p{Pe} </beforebreak>
6672
+ <afterbreak></afterbreak>
6673
+ </rule>
6674
+ <rule break="no">
6675
+ <beforebreak>[\.!?؟…]+\p{Pe} </beforebreak>
6676
+ <afterbreak>\p{Ll}</afterbreak>
6677
+ </rule>
6678
+ <rule break="no">
6679
+ <beforebreak>[«»"”']\s*</beforebreak>
6680
+ <afterbreak>\s*\p{Ll}</afterbreak>
6681
+ </rule>
6682
+ <rule break="no">
6683
+ <beforebreak>[«'"„][\.!?؟…]['"”»]\s</beforebreak>
6684
+ <afterbreak></afterbreak>
6685
+ </rule>
6686
+ <rule break="no">
6687
+ <beforebreak>\b\p{L}\.\s</beforebreak>
6688
+ <afterbreak>\p{L}\.\s</afterbreak>
6689
+ </rule>
6690
+ <rule break="no">
6691
+ <beforebreak>\b\p{L}\.</beforebreak>
6692
+ <afterbreak>\p{L}\.</afterbreak>
6693
+ </rule>
6694
+ <rule break="yes">
6695
+ <beforebreak>[^,،][\s]\p{L}{2}\.\s</beforebreak>
6696
+ <afterbreak>\p{N}+\)\s</afterbreak>
6697
+ </rule>
6698
+ <rule break="no">
6699
+ <beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak>
6700
+ <afterbreak>[\p{N}\p{Ll}]</afterbreak>
6701
+ </rule>
6702
+ <rule break="no">
6703
+ <beforebreak>[\[\(]*\.\.\.[\]\)]* </beforebreak>
6704
+ <afterbreak>[^\p{Lu}]</afterbreak>
6705
+ </rule>
6706
+ <rule break="no">
6707
+ <beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
6708
+ <afterbreak></afterbreak>
6709
+ </rule>
6710
+ <rule break="no">
6711
+ <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
6712
+ <afterbreak></afterbreak>
6713
+ </rule>
6714
+ <rule break="no">
6715
+ <beforebreak>[^\.]\s[ابتقجحخدذصضعغفقكلمنهوىيءةأ١٢٣٤٥٦٧٨٩٠A-Z]\.\s</beforebreak>
6716
+ <afterbreak></afterbreak>
6717
+ </rule>
6718
+ <rule break="no">
6719
+ <beforebreak>[^\.]\s[\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u0653\u0654\u0655\u0656\u0640]\.\s</beforebreak>
6720
+ <afterbreak></afterbreak>
6721
+ </rule>
6722
+ <rule break="no">
6723
+ <beforebreak>\(\p{Ll}+\.\s</beforebreak>
6724
+ <afterbreak></afterbreak>
6725
+ </rule>
6726
+ <rule break="yes">
6727
+ <beforebreak>[\.!?؟…][«»\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*\s</beforebreak>
6728
+ <afterbreak></afterbreak>
6729
+ </rule>
6730
+ <rule break="yes">
6731
+ <beforebreak>[\.!?؟…][«»'"\u00BB\u2019\u201D\u203A\p{Pe}\u0002]*</beforebreak>
6732
+ <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
6733
+ </rule>
6734
+ <rule break="yes">
6735
+ <beforebreak>\s\p{L}[\.!?؟…]\s</beforebreak>
6736
+ <afterbreak>\p{Lu}\p{Ll}</afterbreak>
6737
+ </rule>
6738
+ </languagerule>
6734
6739
  </languagerules>
6735
6740
  <maprules>
6736
6741
  <languagemap languagepattern=".*" languagerulename="GeneralImportant"></languagemap>
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: srx-languagetool
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.0
4
+ version: 0.9.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Aaron Madlon-Kay
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-09-29 00:00:00.000000000 Z
11
+ date: 2022-12-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: srx