srx-languagetool 0.8.0 → 0.10.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/srx/segment.srx CHANGED
@@ -5,7 +5,7 @@
5
5
  <formathandle type="end" include="yes"></formathandle>
6
6
  <formathandle type="isolated" include="no"></formathandle>
7
7
  <okpsrx:options oneSegmentIncludesAll="no" trimLeadingWhitespaces="no" trimTrailingWhitespaces="no" useJavaRegex="yes" useIcu4JBreakRules="no" treatIsolatedCodesAsWhitespace="no"></okpsrx:options>
8
- <okpsrx:sample language="sr" useMappedRules="yes">Поштовани господине одн. госпођо. Видео сам </okpsrx:sample>
8
+ <okpsrx:sample language="nl" useMappedRules="yes">De organisatie Doe! is een rare.</okpsrx:sample>
9
9
  <okpsrx:rangeRule></okpsrx:rangeRule>
10
10
  </header>
11
11
  <body>
@@ -1084,6 +1084,11 @@
1084
1084
  <beforebreak>[\.!?…]['"\u00BB\u2019\u201D\u203A\u0002]*\p{Pe}\s</beforebreak>
1085
1085
  <afterbreak>\p{Ll}</afterbreak>
1086
1086
  </rule>
1087
+ <!--p. n.e. (Polish abbreviation, "before our era"): avoid an erroneous break between "p." and "n.e."-->
1088
+ <rule break="no">
1089
+ <beforebreak>p\.\s</beforebreak>
1090
+ <afterbreak>n\.\s?e\.</afterbreak>
1091
+ </rule>
1087
1092
  <rule break="yes">
1088
1093
  <beforebreak>[\.!?…]['"\p{Pe}\u00BB\u2019\u201D\u203A\u0002¹²³]*\s</beforebreak>
1089
1094
  <afterbreak></afterbreak>
@@ -1106,7 +1111,7 @@
1106
1111
  <beforebreak>[\u00A0\s]</beforebreak>
1107
1112
  <afterbreak>\n</afterbreak>
1108
1113
  </rule>
1109
- <rule break="no"><!-- Hello (Hi! ) my name is Chris -->
1114
+ <rule break="no">
1110
1115
  <beforebreak>[a-zA-Z][!\?][\s\u00A0]</beforebreak>
1111
1116
  <afterbreak>\)[\s\u00A0][a-zA-Z]</afterbreak>
1112
1117
  </rule>
@@ -1114,96 +1119,100 @@
1114
1119
  <beforebreak>Yahoo![\s\u00A0]</beforebreak>
1115
1120
  <afterbreak>\p{Ll}</afterbreak>
1116
1121
  </rule>
1117
- <rule break="no"><!-- U.S.A (no dot at end) -->
1122
+ <rule break="no">
1118
1123
  <beforebreak>[A-Z]\.[A-Z]\.</beforebreak>
1119
1124
  <afterbreak>[A-Z]\b</afterbreak>
1120
1125
  </rule>
1121
- <rule break="no"><!-- A.I (no dot at end) -->
1126
+ <rule break="no">
1122
1127
  <beforebreak>\bA\.</beforebreak>
1123
1128
  <afterbreak>I\b</afterbreak>
1124
1129
  </rule>
1125
- <rule break="no"><!-- S.I (no dot at end) -->
1130
+ <rule break="no">
1126
1131
  <beforebreak>\bS\.</beforebreak>
1127
1132
  <afterbreak>I\b</afterbreak>
1128
1133
  </rule>
1129
- <rule break="no"><!-- L.A (no dot at end) -->
1134
+ <rule break="no">
1130
1135
  <beforebreak>\bL\.</beforebreak>
1131
1136
  <afterbreak>A\b</afterbreak>
1132
1137
  </rule>
1133
- <rule break="no"><!-- U.S (no dot at end) -->
1138
+ <rule break="no">
1134
1139
  <beforebreak>\bU\.</beforebreak>
1135
1140
  <afterbreak>[SK]\b</afterbreak>
1136
1141
  </rule>
1137
- <rule break="no"><!-- I.S (no dot at end) -->
1142
+ <rule break="no">
1138
1143
  <beforebreak>\bI\.</beforebreak>
1139
1144
  <afterbreak>S\b</afterbreak>
1140
1145
  </rule>
1141
- <rule break="no"><!-- M.Z (no dot at end) -->
1146
+ <rule break="no">
1142
1147
  <beforebreak>\bM\.</beforebreak>
1143
1148
  <afterbreak>Z\b</afterbreak>
1144
1149
  </rule>
1145
- <rule break="no"><!-- URLs without "www."-->
1150
+ <rule break="no">
1146
1151
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
1147
1152
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
1148
1153
  </rule>
1149
- <rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
1154
+ <rule break="no">
1150
1155
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
1151
1156
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl|be|dev|co|fr|dk|se)(\.|\b)</afterbreak>
1152
1157
  </rule>
1153
- <rule break="no"><!-- No. 5 -->
1158
+ <rule break="no">
1154
1159
  <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
1155
1160
  <afterbreak>\p{N}</afterbreak>
1156
1161
  </rule>
1157
- <rule break="no"><!-- Ph.D. -->
1162
+ <rule break="no">
1158
1163
  <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
1159
1164
  <afterbreak>D\.?</afterbreak>
1160
1165
  </rule>
1161
- <rule break="no"><!-- min. -->
1166
+ <rule break="no">
1162
1167
  <beforebreak>\b([Aa]vg|[Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Ll]tda|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|Ing|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
1163
1168
  <afterbreak>[^\p{Lu}]|I</afterbreak>
1164
1169
  </rule>
1165
- <rule break="no"><!-- hr. -->
1170
+ <rule break="no">
1166
1171
  <beforebreak>\b(hr)\.[\s\u00A0]</beforebreak>
1167
1172
  <afterbreak>[^\p{Lu}]|I</afterbreak>
1168
1173
  </rule>
1169
- <rule break="no"><!-- Fig. 8 -->
1174
+ <rule break="no">
1170
1175
  <beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
1171
1176
  <afterbreak>\p{N}|[IXV]+</afterbreak>
1172
1177
  </rule>
1173
- <rule break="no"><!-- Fig. (8) -->
1178
+ <rule break="no">
1174
1179
  <beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
1175
1180
  <afterbreak>\(\p{N}\)</afterbreak>
1176
1181
  </rule>
1177
- <rule break="no"><!-- I'm (...) great! -->
1182
+ <rule break="no">
1178
1183
  <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
1179
1184
  <afterbreak>[^\p{P}]</afterbreak>
1180
1185
  </rule>
1181
- <rule break="no"><!-- I will work with someone (Chris or ...?). -->
1186
+ <rule break="no">
1187
+ <beforebreak>[?!.…]["”]\)[\s\u00A0]</beforebreak>
1188
+ <afterbreak>[a-z].*</afterbreak>
1189
+ </rule>
1190
+ <rule break="no">
1182
1191
  <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
1183
1192
  <afterbreak>[^\p{P}]</afterbreak>
1184
1193
  </rule>
1185
- <rule break="no"><!-- e.g. -->
1194
+ <rule break="no">
1186
1195
  <beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
1187
1196
  <afterbreak></afterbreak>
1188
1197
  </rule>
1189
- <rule break="no"><!-- vs. -->
1198
+ <rule break="no">
1190
1199
  <beforebreak>\b[Vv]s\.[\s\u00A0]</beforebreak>
1191
1200
  <afterbreak></afterbreak>
1192
1201
  </rule>
1193
- <rule break="no"><!-- pp. -->
1202
+ <rule break="no">
1194
1203
  <beforebreak>\b(pp|PP)\.[\s\u00A0]</beforebreak>
1195
1204
  <afterbreak></afterbreak>
1196
1205
  </rule>
1197
- <rule break="no"><!-- esp. -->
1206
+ <rule break="no">
1198
1207
  <beforebreak>\be[sx]p\.[\s\u00A0]</beforebreak>
1199
1208
  <afterbreak></afterbreak>
1200
1209
  </rule>
1201
1210
  <!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
1202
- <rule break="no"><!-- Etc. -->
1211
+ <rule break="no">
1203
1212
  <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
1204
1213
  <afterbreak>[^\p{Lu}]</afterbreak>
1205
1214
  </rule>
1206
- <rule break="no"><!-- BTW (by the way) -->
1215
+ <rule break="no">
1207
1216
  <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
1208
1217
  <afterbreak></afterbreak>
1209
1218
  </rule>
@@ -1251,39 +1260,39 @@
1251
1260
  <beforebreak>(?i)FRITZ!</beforebreak>
1252
1261
  <afterbreak>(?i)Box</afterbreak>
1253
1262
  </rule>
1254
- <rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 -->
1263
+ <rule break="no">
1255
1264
  <beforebreak>ID.</beforebreak>
1256
1265
  <afterbreak>3|4|Buzz|Crozz</afterbreak>
1257
1266
  </rule>
1258
- <rule break="no"><!-- Ph.D. (see rule PH_D) -->
1267
+ <rule break="no">
1259
1268
  <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
1260
1269
  <afterbreak></afterbreak>
1261
1270
  </rule>
1262
- <rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
1271
+ <rule break="no">
1263
1272
  <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
1264
1273
  <afterbreak></afterbreak>
1265
1274
  </rule>
1266
- <rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
1275
+ <rule break="no">
1267
1276
  <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
1268
1277
  <afterbreak></afterbreak>
1269
1278
  </rule>
1270
- <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
1279
+ <rule break="no">
1271
1280
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1272
1281
  <afterbreak>Eng\.?</afterbreak>
1273
1282
  </rule>
1274
- <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
1283
+ <rule break="no">
1275
1284
  <beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
1276
1285
  <afterbreak>[BM]\.?</afterbreak>
1277
1286
  </rule>
1278
- <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
1287
+ <rule break="no">
1279
1288
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1280
1289
  <afterbreak>Sc\.?</afterbreak>
1281
1290
  </rule>
1282
- <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
1291
+ <rule break="no">
1283
1292
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1284
1293
  <afterbreak>Comp?\.?</afterbreak>
1285
1294
  </rule>
1286
- <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
1295
+ <rule break="no">
1287
1296
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1288
1297
  <afterbreak>Arch\.?</afterbreak>
1289
1298
  </rule>
@@ -1375,7 +1384,7 @@
1375
1384
  <beforebreak>\b\p{L}\.</beforebreak>
1376
1385
  <afterbreak>\p{L}\.</afterbreak>
1377
1386
  </rule>
1378
- <rule break="no"><!-- Jones v. Smith -->
1387
+ <rule break="no">
1379
1388
  <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
1380
1389
  <afterbreak>\p{Lu}\p{L}+</afterbreak>
1381
1390
  </rule>
@@ -1388,7 +1397,7 @@
1388
1397
  <afterbreak>\p{Ll}+</afterbreak>
1389
1398
  </rule>
1390
1399
  <rule break="no">
1391
- <beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak><!-- not 'no'/'in', these could be abbreviations-->
1400
+ <beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak>
1392
1401
  <afterbreak>[\p{N}\p{Ll}]</afterbreak>
1393
1402
  </rule>
1394
1403
  <rule break="no">
@@ -1419,8 +1428,8 @@
1419
1428
  <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
1420
1429
  <afterbreak></afterbreak>
1421
1430
  </rule>
1422
- <rule break="no"><!-- i.e. -->
1423
- <beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
1431
+ <rule break="no">
1432
+ <beforebreak>i\.e\.[\s\u00A0]</beforebreak>
1424
1433
  <afterbreak></afterbreak>
1425
1434
  </rule>
1426
1435
  <rule break="yes">
@@ -1532,37 +1541,52 @@
1532
1541
  </languagerule>
1533
1542
  <languagerule languagerulename="Dutch">
1534
1543
  <rule break="no">
1535
- <!-- sp.a -->
1536
1544
  <beforebreak>\b(sp|SP)</beforebreak>
1537
1545
  <afterbreak>\.[aA]\b</afterbreak>
1538
1546
  </rule>
1539
1547
  <rule break="no">
1540
- <!-- .Net -->
1541
1548
  <beforebreak>\s[.]</beforebreak>
1542
1549
  <afterbreak>[Nn][Ee][Tt](\b|-)</afterbreak>
1543
1550
  </rule>
1544
- <rule break="no"><!-- quoted sentence in sentence -->
1551
+ <rule break="no">
1545
1552
  <beforebreak>[.?!][’'"]</beforebreak>
1546
1553
  <afterbreak> [a-z]</afterbreak>
1547
1554
  </rule>
1548
- <rule break="no"><!-- URLs without "www."-->
1555
+ <rule break="no">
1549
1556
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
1550
1557
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
1551
1558
  </rule>
1552
- <rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
1559
+ <rule break="no">
1553
1560
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
1554
1561
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
1555
1562
  </rule>
1556
- <rule break="no"><!-- Abbreviated books of the Bible and biblical apocrypha-->
1557
- <beforebreak>\b(Ge?n|Ex|Le?v|Nu?m|D(eu)?t|Jo?z|Ri|R[ei]cht|Sa?m|Ko?n|Kr[on]{0,2}|Neh?|Est?|Jb|Ps|Spr?|Pr[ed]{0,2}|H(oog)?l|Je?s|Je?r|Kl(aagl)?|Ez(ech)?|Da?n|Ho?s|Jl|Am|Ob|Mc|Mi[ch]{0,2}|Nah?|Hk|Hab|Zf|[SZ]ef|Ha?g|Zc|Zach|Ma?l|Ma?t|Mk|Mar|Lk|Jh|H(an)?d|Ro?m|Kor|Ga?l|Ef|Fp|Fil|Ko|[CK]ol|Th|Th?e[s]{1,2}|Tm|Ti?t|Fm|Fil(em)?|Hb|Hebr?|Jk|Ja[ck]|Pe?tr?|Joh|Jud|Op(enb)?|Wijsh|Tob|Sir|Bar|Makk)\.\s</beforebreak>
1563
+ <rule break="no">
1564
+ <beforebreak>\b(blz|pag|fig)\.\s</beforebreak>
1565
+ <afterbreak>[0-9]</afterbreak>
1566
+ </rule>
1567
+ <!--Abbrevs that can happen in sentence and at end-->
1568
+ <rule break="no">
1569
+ <beforebreak>\b(enz|etc|zat|ambt|al|ver|art|wed|lab|bv|Bros)\.\s</beforebreak>
1570
+ <afterbreak>\p{Ll}</afterbreak>
1571
+ </rule>
1572
+ <rule break="yes">
1573
+ <beforebreak>\s(la|do|del)\sMar\.\s</beforebreak>
1558
1574
  <afterbreak></afterbreak>
1559
1575
  </rule>
1560
1576
  <rule break="no">
1561
- <beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|([Pp]rof\.)?[Dd]r|Em|Fa|Kon|Bros|Stb)\.\s</beforebreak>
1577
+ <beforebreak>\b(Ge?n|Ex|Le?v|Nu?m|D(eu)?t|Jo?z|Ri|R[ei]cht|Sa?m|Ko?n|Kr[on]{0,2}|Neh?|Est?|Jb|Ps|Spr?|Pr[ed]{0,2}|H(oog)?l|Je?s|Je?r|Kl(aagl)?|Ez(ech)?|Da?n|Ho?s|Jl|Am|Ob|Mc|Mi[ch]{0,2}|Nah?|Hk|Hab|Zf|[SZ]ef|Ha?g|Zc|Zach|Ma?l|Ma?t|Mk|Mar|Lk|Jh|H(an)?d|Ro?m|Kor|Ga?l|Ef|Fp|Fil|Ko|[CK]ol|Th|Th?e[s]{1,2}|Tm|Ti?t|Fm|Fil(em)?|Hb|Hebr?|Jk|Ja[ck]|Pe?tr?|Joh|Jud|Op(enb)?|Wijsh|Tob|Sir|Bar|Makk)\.\s</beforebreak>
1562
1578
  <afterbreak></afterbreak>
1563
1579
  </rule>
1564
1580
  <rule break="no">
1565
- <beforebreak>\b(Mej|Mevr|Mgr|Mw|Ndl|Ned|Nl|No|Prof|Secr|Chr|Jac)\.\s</beforebreak>
1581
+ <beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|(Prof\.)?[Dd]r|Em|Fa|Kon|Stb)\.\s</beforebreak>
1582
+ <afterbreak>\p{Lu}</afterbreak>
1583
+ </rule>
1584
+ <rule break="no">
1585
+ <beforebreak>\b(Stb)\.\s</beforebreak>
1586
+ <afterbreak>[0-9]</afterbreak>
1587
+ </rule>
1588
+ <rule break="no">
1589
+ <beforebreak>\b([Mm]ej|[Mm]evr|[Mm]rs|[Mm]s|[Mm]gr|[Mm]w|Ndl|Ned|Nl|No|Prof|[Ss]ecr|Chr|Jac|[Ww]ed|Zr)\.\s</beforebreak>
1566
1590
  <afterbreak></afterbreak>
1567
1591
  </rule>
1568
1592
  <rule break="no">
@@ -1570,23 +1594,27 @@
1570
1594
  <afterbreak></afterbreak>
1571
1595
  </rule>
1572
1596
  <rule break="no">
1573
- <beforebreak>\b(abs|abstr|adj|adm|afb|[Aa]fd|afk|afl|milj|zgn|plv|bvb|bv|afm|evt|exp)\.\s</beforebreak>
1597
+ <beforebreak>\b(abs|abstr|adj|adm|[Aa]fb|[Aa]fd|afk|afl|milj|zgn|plv|bvb|afm|evt|exp|vs)\.\s</beforebreak>
1574
1598
  <afterbreak></afterbreak>
1575
1599
  </rule>
1576
1600
  <rule break="no">
1577
- <beforebreak>\b(al|ald|alg|amb|ambt|anat|antrop|apoth)\.\s</beforebreak>
1601
+ <beforebreak>\b(ald|alg|amb|anat|antrop|apoth)\.\s</beforebreak>
1578
1602
  <afterbreak></afterbreak>
1579
1603
  </rule>
1604
+ <rule break="yes">
1605
+ <beforebreak>\seen\sprof\.\s</beforebreak>
1606
+ <afterbreak>\p{Lu}</afterbreak>
1607
+ </rule>
1580
1608
  <rule break="no">
1581
1609
  <beforebreak>\b(alc|bro|opm|acc)\.\s</beforebreak>
1582
1610
  <afterbreak></afterbreak>
1583
1611
  </rule>
1584
1612
  <rule break="no">
1585
- <beforebreak>\b(arch|archeol|art|bc|bep|betr|bez|bibl|bijl|[Bb]ijv)\.\s</beforebreak>
1613
+ <beforebreak>\b(arch|archeol|bc|bep|betr|bez|bibl|bijl|[Bb]ijv)\.\s</beforebreak><!-- "archeol." and "bc." are two separate abbreviations and need the "|" between them -->
1586
1614
  <afterbreak></afterbreak>
1587
1615
  </rule>
1588
1616
  <rule break="no">
1589
- <beforebreak>\b(bijz|blz|bw|ca|cat|centr|cf|cfr|cmpl)\.\s</beforebreak>
1617
+ <beforebreak>\b(bijz|bw|ca|cat|centr|cf|cfr|cmpl)\.\s</beforebreak>
1590
1618
  <afterbreak></afterbreak>
1591
1619
  </rule>
1592
1620
  <rule break="no">
@@ -1594,47 +1622,47 @@
1594
1622
  <afterbreak></afterbreak>
1595
1623
  </rule>
1596
1624
  <rule break="no">
1597
- <beforebreak>\b(ed|em|enz|etc|ev|[Ee]xcl|fa|fam|fig|fin|fl|fr.)\.\s</beforebreak>
1625
+ <beforebreak>\b([Ee]d|em|ev|[Ee]xcl|[Ff]a|[Ff]am|[fF]ig|fin|fl|fr)\.\s</beforebreak>
1598
1626
  <afterbreak></afterbreak>
1599
1627
  </rule>
1600
1628
  <rule break="no">
1601
- <beforebreak>\b(geb|[Gg]em|get|gld|id|[Ii]ncl|ind|inf|ing|intern|inz|ir|jhr|jkvr)\.\s</beforebreak>
1629
+ <beforebreak>\b(geb|[Gg]em|get|gld|id|[Ii]ncl|ind|inf|ing|intern|[Ss]ec|inz|ir|jhr|jkvr)\.\s</beforebreak>
1602
1630
  <afterbreak></afterbreak>
1603
1631
  </rule>
1604
1632
  <rule break="no">
1605
- <beforebreak>\b(jl|jr|kr|kt|lab|lic|ll|lt|lw|max|mevr|mi|[Mm]in|mld)\.\s</beforebreak>
1633
+ <beforebreak>\b(jl|jr|kr|kt|lic|ll|lt|lw|max|[Mm]evr|mi|[Mm]in|mld)\.\s</beforebreak>
1606
1634
  <afterbreak></afterbreak>
1607
1635
  </rule>
1608
1636
  <rule break="no">
1609
- <beforebreak>\b(mln|mr|mw|nl|no|nr|nrs|ob|obl|ong|onov|o.a)\.\s</beforebreak>
1637
+ <beforebreak>\b(mln|[Mm]r|[Mm]w|nl|no|nr|nrs|ob|obl|ong|onov)\.\s</beforebreak>
1610
1638
  <afterbreak></afterbreak>
1611
1639
  </rule>
1612
1640
  <rule break="no">
1613
- <beforebreak>\b(opm|org|ov|pag|par|penn|([1-3][\.e]?)[\s]?pers|plm|plv)\.\s</beforebreak>
1641
+ <beforebreak>\b(opm|org|ov|[Pp]ag|par|penn|([1-3][\.e]?)[\s]?pers|plm|plv)\.\s</beforebreak>
1614
1642
  <afterbreak></afterbreak>
1615
1643
  </rule>
1616
1644
  <rule break="no">
1617
- <beforebreak>\b(prov|pseud|psych|qty|red|ref|resp|soc|st|tab|tel|temp|tk)\.\s</beforebreak>
1645
+ <beforebreak>\b(prov|pseud|psych|qty|red|ref|resp|soc|st|tab|tel|temp|prof|tk)\.\s</beforebreak>
1618
1646
  <afterbreak></afterbreak>
1619
1647
  </rule>
1620
1648
  <rule break="no">
1621
1649
  <beforebreak>\b([A-Z]|Adr|Chr|Fr|Fred|IJ|Jac|Joh|Ph|St|Th|Tj|v|v\.(\s)?d)\.(\s)?</beforebreak>
1622
- <afterbreak>[A-Z]</afterbreak>
1650
+ <afterbreak>\p{Lu}</afterbreak>
1623
1651
  </rule>
1624
1652
  <rule break="no">
1625
1653
  <beforebreak>\b[vn]\.\s</beforebreak>
1626
1654
  <afterbreak>Chr</afterbreak>
1627
1655
  </rule>
1628
1656
  <rule break="no">
1629
- <beforebreak>\b(uitsl|ver|vgl|vnl|vnw|voorz|ww|zat|[Zz]elfst|zgn?)\.\s</beforebreak>
1630
- <afterbreak></afterbreak>
1657
+ <beforebreak>\b(uitsl|vgl|vnl|vnw|voorz|ww|zat|[Zz]elfst|zgn?)\.\s</beforebreak>
1658
+ <afterbreak>\p{Ll}</afterbreak>
1631
1659
  </rule>
1632
1660
  <rule break="no">
1633
- <beforebreak>\b(mm|cm|km|mg|kg|h|kW|mW)\.\s</beforebreak>
1661
+ <beforebreak>\b(mm|cm|km|ml|mg|kg|h|kW|mW)\.\s</beforebreak><!-- unit abbreviation + period + whitespace; each unit listed once -->
1634
1662
  <afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
1635
1663
  </rule>
1636
1664
  <rule break="yes">
1637
- <beforebreak>\b(mm|cm|km|ml|kg|kW|h|mg)\.\s</beforebreak>
1665
+ <beforebreak>\b(mm|cm|km|ml|mg|kg|h|kW|mW)\.\s</beforebreak><!-- unit abbreviation + period + whitespace; each unit listed once -->
1638
1666
  <afterbreak></afterbreak>
1639
1667
  </rule>
1640
1668
  <rule break="no">
@@ -1686,10 +1714,6 @@
1686
1714
  <afterbreak></afterbreak>
1687
1715
  </rule>
1688
1716
  <rule break="no">
1689
- <beforebreak>\b\p{Lu}\p{Ll}\.\s?</beforebreak>
1690
- <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
1691
- </rule>
1692
- <rule break="no">
1693
1717
  <beforebreak>\.\p{Lu}\p{Ll}\.\s?</beforebreak>
1694
1718
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
1695
1719
  </rule>
@@ -1698,14 +1722,6 @@
1698
1722
  <beforebreak>\b\d+\.\s</beforebreak>
1699
1723
  <afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
1700
1724
  </rule>
1701
- <rule break="yes">
1702
- <beforebreak>\been\sprof\.\s</beforebreak>
1703
- <afterbreak>[^\p{Ll}]</afterbreak>
1704
- </rule>
1705
- <rule break="no">
1706
- <beforebreak>\bprof\.\s</beforebreak>
1707
- <afterbreak></afterbreak>
1708
- </rule>
1709
1725
  <rule break="no">
1710
1726
  <beforebreak>[.!?…][’'"]\s</beforebreak>
1711
1727
  <afterbreak>[a-z]</afterbreak>
@@ -1723,11 +1739,19 @@
1723
1739
  <afterbreak>[a-z]</afterbreak>
1724
1740
  </rule>
1725
1741
  <rule break="yes">
1726
- <beforebreak>[\.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002¹²³]*\s</beforebreak>
1742
+ <beforebreak>\s'[2-9][.]\s</beforebreak>
1743
+ <afterbreak></afterbreak>
1744
+ </rule>
1745
+ <rule break="no">
1746
+ <beforebreak>\s[A-Z].+!\s</beforebreak>
1747
+ <afterbreak>[a-z]</afterbreak>
1748
+ </rule>
1749
+ <rule break="yes">
1750
+ <beforebreak>[.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002¹²³]*\s</beforebreak>
1727
1751
  <afterbreak></afterbreak>
1728
1752
  </rule>
1729
1753
  <rule break="yes">
1730
- <beforebreak>[\.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]*</beforebreak>
1754
+ <beforebreak>[.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]*</beforebreak>
1731
1755
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
1732
1756
  </rule>
1733
1757
  <rule break="yes">
@@ -1768,31 +1792,29 @@
1768
1792
  <afterbreak>['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002][A-Z][a-z]</afterbreak>
1769
1793
  </rule>
1770
1794
  <rule break="no">
1771
- <!-- "E. coli etc. -->
1772
1795
  <beforebreak>"[A-Z][.]\s</beforebreak>
1773
1796
  <afterbreak>[a-z]</afterbreak>
1774
1797
  </rule>
1775
1798
  <rule break="no">
1776
- <!-- Cornelisz. -->
1777
1799
  <beforebreak>[A-Z][a-z].*sz[.]\s</beforebreak>
1778
1800
  <afterbreak>[a-z]</afterbreak>
1779
1801
  </rule>
1780
1802
  <rule break="no">
1781
- <!-- De n. XIV/vagus (nervus) -->
1782
1803
  <beforebreak>De n[.]\s</beforebreak>
1783
1804
  <afterbreak>[a-z]|[XIV]</afterbreak>
1784
1805
  </rule>
1785
1806
  <rule break="no">
1786
- <!-- MOL.E -->
1787
1807
  <beforebreak>[A-Z]{2,5}[.]</beforebreak>
1788
1808
  <afterbreak>[A-Z]</afterbreak>
1789
1809
  </rule>
1790
1810
  <rule break="no">
1791
- <!-- ..." betekent -->
1792
1811
  <beforebreak>\.\.</beforebreak>
1793
1812
  <afterbreak>" [a-z]</afterbreak>
1794
1813
  </rule>
1795
- <!-- ##### end of Dutch #### -->
1814
+ <rule break="no">
1815
+ <beforebreak>\sBTW\.\s</beforebreak><!-- include the whitespace after the period, like the other Dutch abbreviation rules, so the lowercase afterbreak can match the next word -->
1816
+ <afterbreak>\p{Ll}</afterbreak>
1817
+ </rule>
1796
1818
  </languagerule>
1797
1819
  <languagerule languagerulename="Slovak">
1798
1820
  <rule break="no">
@@ -4370,7 +4392,7 @@
4370
4392
  </rule>
4371
4393
  <rule break="no">
4372
4394
  <beforebreak>\b(р|ред|Рис|рус|с|сб|св|См|см|сов|соч|соц|спец|ср|ст|стр|т|тел|Тел|тех|тов|тт|туп)\.\s</beforebreak>
4373
- <afterbreak></afterbreak>
4395
+ <afterbreak>\p{Ll}</afterbreak>
4374
4396
  </rule>
4375
4397
  <rule break="no">
4376
4398
  <beforebreak>\b(руб|Руб|тыс|Тыс|трлн)\.\s</beforebreak>
@@ -4654,7 +4676,7 @@
4654
4676
  <afterbreak>[XIV\d]+\b</afterbreak>
4655
4677
  </rule>
4656
4678
  <rule break="no">
4657
- <beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4679
+ <beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|seg|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4658
4680
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4659
4681
  </rule>
4660
4682
  <!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
@@ -4721,12 +4743,10 @@
4721
4743
  </rule>
4722
4744
  </languagerule>
4723
4745
  <languagerule languagerulename="Spanish">
4724
-
4725
4746
  <rule break="no">
4726
4747
  <beforebreak>¿[^?]+:[\s\u00A0]</beforebreak>
4727
4748
  <afterbreak>.</afterbreak>
4728
4749
  </rule>
4729
-
4730
4750
  <rule break="no">
4731
4751
  <beforebreak>Yahoo![\s\u00A0]</beforebreak>
4732
4752
  <afterbreak>\p{Ll}</afterbreak>
@@ -4742,7 +4762,7 @@
4742
4762
  <!-- initials: A. C. Jones. Problem: [...] de Alfons I. Él era [...] -->
4743
4763
  <rule break="no">
4744
4764
  <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
4745
- <afterbreak/>
4765
+ <afterbreak></afterbreak>
4746
4766
  </rule>
4747
4767
  <!-- Ellipsis: ... lowercase -->
4748
4768
  <rule break="no">
@@ -4772,39 +4792,37 @@
4772
4792
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4773
4793
  </rule>
4774
4794
  <rule break="no">
4775
- <!-- URLs without "www."-->
4776
4795
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
4777
4796
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
4778
4797
  </rule>
4779
4798
  <rule break="no">
4780
- <!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
4781
4799
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
4782
4800
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
4783
4801
  </rule>
4784
4802
  <!-- Abbreviations that cannot finish sentences-->
4785
4803
  <rule break="no">
4786
4804
  <beforebreak>\b((?iu)(en|febr|mzo|abr|my|jun|jul|ag|agt|set|sept|setbre|oct|nov|novbre|dic|dicbre))\.[\s\u00A0]</beforebreak>
4787
- <afterbreak/>
4805
+ <afterbreak></afterbreak>
4788
4806
  </rule>
4789
4807
  <rule break="no">
4790
4808
  <beforebreak>\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
4791
- <afterbreak/>
4809
+ <afterbreak></afterbreak>
4792
4810
  </rule>
4793
4811
  <rule break="no">
4794
4812
  <beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0]</beforebreak>
4795
- <afterbreak/>
4813
+ <afterbreak></afterbreak>
4796
4814
  </rule>
4797
4815
  <rule break="no">
4798
4816
  <beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0]</beforebreak>
4799
- <afterbreak/>
4817
+ <afterbreak></afterbreak>
4800
4818
  </rule>
4801
4819
  <rule break="no">
4802
4820
  <beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0]</beforebreak>
4803
- <afterbreak/>
4821
+ <afterbreak></afterbreak>
4804
4822
  </rule>
4805
4823
  <rule break="no">
4806
4824
  <beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0]</beforebreak><!-- NOTE(review): the top-level "|" splits this pattern in two, so only the "[V\.]gr" branch carries the "\.[\s\u00A0]" suffix and the first group matches the bare word without a period. Regrouping would also make the ordinary Spanish word "no." non-breaking; confirm the intended list before changing the pattern. -->
4807
- <afterbreak/>
4825
+ <afterbreak></afterbreak>
4808
4826
  </rule>
4809
4827
  <!-- Abbreviations that can finish sentences -->
4810
4828
  <rule break="no">
@@ -4837,7 +4855,7 @@
4837
4855
  <!-- Composed abbrev. -->
4838
4856
  <rule break="no">
4839
4857
  <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4840
- <afterbreak/>
4858
+ <afterbreak></afterbreak>
4841
4859
  </rule>
4842
4860
  <!-- Units -->
4843
4861
  <rule break="no">
@@ -4859,11 +4877,11 @@
4859
4877
  </rule>
4860
4878
  </languagerule>
4861
4879
  <languagerule languagerulename="German">
4862
- <rule break="no"><!-- URLs without "www."-->
4880
+ <rule break="no">
4863
4881
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
4864
4882
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
4865
4883
  </rule>
4866
- <rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
4884
+ <rule break="no">
4867
4885
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
4868
4886
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
4869
4887
  </rule>
@@ -4882,27 +4900,27 @@
4882
4900
  <beforebreak>[^-\p{L}'’/°]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
4883
4901
  <afterbreak></afterbreak>
4884
4902
  </rule>
4885
- <rule break="no"><!-- special case: "Das 1. Internationale Filmfestival findet nächste Woche statt." -->
4903
+ <rule break="no">
4886
4904
  <beforebreak>([Dd](as|er|ie|iese[rsmn]?|en|em)|[kmsd]?ein(e[rsnm]?)?|am|fürs|ins|zum|im|am|zur) \d+\.[\u00A0\s]+</beforebreak>
4887
4905
  <afterbreak>[A-ZÄÖÜ].*</afterbreak>
4888
4906
  </rule>
4889
4907
  <rule break="no">
4890
- <beforebreak>Ust.</beforebreak><!-- needed for German rule UST_ID -->
4908
+ <beforebreak>Ust.</beforebreak>
4891
4909
  <afterbreak>Id</afterbreak>
4892
4910
  </rule>
4893
4911
  <rule break="no">
4894
- <beforebreak>Prof.</beforebreak><!-- needed for German rule ABKUERZUNG_LEERZEICHEN -->
4912
+ <beforebreak>Prof.</beforebreak>
4895
4913
  <afterbreak>Dr</afterbreak>
4896
4914
  </rule>
4897
4915
  <rule break="no">
4898
- <beforebreak>Dr.</beforebreak><!-- needed for German rule ABKUERZUNG_LEERZEICHEN -->
4916
+ <beforebreak>Dr.</beforebreak>
4899
4917
  <afterbreak>iur|med|oec|phil|rer|theol</afterbreak>
4900
4918
  </rule>
4901
4919
  <rule break="no">
4902
4920
  <beforebreak>(?i)FRITZ!</beforebreak>
4903
4921
  <afterbreak>(?i)Box</afterbreak>
4904
4922
  </rule>
4905
- <rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 -->
4923
+ <rule break="no">
4906
4924
  <beforebreak>ID.</beforebreak>
4907
4925
  <afterbreak>3|4|Buzz|Crozz</afterbreak>
4908
4926
  </rule>
@@ -5012,11 +5030,11 @@
5012
5030
  </rule>
5013
5031
  <!-- German abbreviations -->
5014
5032
  <rule break="no">
5015
- <beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|versch|[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Aa]bzü?gl|[Ll]tda|[Ee]inschl|[Vv]mtl|Ev|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]{1,2}</beforebreak>
5033
+ <beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|[Pp]arl|Bsp|versch|[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Aa]bzü?gl|[Ll]tda|[Ee]inschl|[Vv]mtl|Ev|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|[Öö]ffentl|prof|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]{1,2}</beforebreak>
5016
5034
  <afterbreak></afterbreak>
5017
5035
  </rule>
5018
5036
  <rule break="no">
5019
- <beforebreak>\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|exkl|Expl|Exz)\.[\u00A0\s]{1,2}</beforebreak>
5037
+ <beforebreak>\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|[Ee]xkl|Expl|Exz)\.[\u00A0\s]{1,2}</beforebreak>
5020
5038
  <afterbreak></afterbreak>
5021
5039
  </rule>
5022
5040
  <rule break="no">
@@ -5028,11 +5046,11 @@
5028
5046
  <afterbreak>\p{Ll}</afterbreak>
5029
5047
  </rule>
5030
5048
  <rule break="no">
5031
- <beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|frz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]{1,2}</beforebreak>
5049
+ <beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|[Ff]rz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]{1,2}</beforebreak>
5032
5050
  <afterbreak></afterbreak>
5033
5051
  </rule>
5034
5052
  <rule break="no">
5035
- <beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|inkl|incl|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]{1,2}</beforebreak>
5053
+ <beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|[Ii]nkl|[Ii]ncl|[Ee]hem|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]{1,2}</beforebreak>
5036
5054
  <afterbreak></afterbreak>
5037
5055
  </rule>
5038
5056
  <rule break="no">
@@ -5048,7 +5066,11 @@
5048
5066
  <afterbreak>\p{Ll}</afterbreak>
5049
5067
  </rule>
5050
5068
  <rule break="no">
5051
- <beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|[Aa][bn]schl|sw|kl|[Gg]r|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]{1,2}</beforebreak>
5069
+ <beforebreak>\d+\.\d+\.[\u00A0\s]</beforebreak>
5070
+ <afterbreak>[\-–][\u00A0\s]\d+</afterbreak>
5071
+ </rule>
5072
+ <rule break="no">
5073
+ <beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|[Aa][bn]schl|sw|kl|[Gg]r|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|voraussichtl|[Rr]echts?staatl|[Ss]taatl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]{1,2}</beforebreak>
5052
5074
  <afterbreak></afterbreak>
5053
5075
  </rule>
5054
5076
  <!-- Break rules -->
@@ -5159,27 +5181,27 @@
5159
5181
  <beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sep|Sept|Oct|Okt|Nov|Dec|PhD|al|cf|Inc|Ms|Gen|Sen|Prof|Corp|Co)\.\s</beforebreak>
5160
5182
  <afterbreak></afterbreak>
5161
5183
  </rule>
5162
- <rule break="no"><!-- Ph.D. -->
5184
+ <rule break="no">
5163
5185
  <beforebreak>\bP[Hh]\.\s?</beforebreak>
5164
5186
  <afterbreak>D\.?</afterbreak>
5165
5187
  </rule>
5166
- <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
5188
+ <rule break="no">
5167
5189
  <beforebreak>\b[BM]\.\s?</beforebreak>
5168
5190
  <afterbreak>Eng\.?</afterbreak>
5169
5191
  </rule>
5170
- <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
5192
+ <rule break="no">
5171
5193
  <beforebreak>\bLL\.\s?</beforebreak>
5172
5194
  <afterbreak>[BM]\.?</afterbreak>
5173
5195
  </rule>
5174
- <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
5196
+ <rule break="no">
5175
5197
  <beforebreak>\b[BM]\.\s?</beforebreak>
5176
5198
  <afterbreak>Sc\.?</afterbreak>
5177
5199
  </rule>
5178
- <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
5200
+ <rule break="no">
5179
5201
  <beforebreak>\b[BM]\.\s?</beforebreak>
5180
5202
  <afterbreak>Comp?\.?</afterbreak>
5181
5203
  </rule>
5182
- <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
5204
+ <rule break="no">
5183
5205
  <beforebreak>\b[BM]\.\s?</beforebreak>
5184
5206
  <afterbreak>Arch\.?</afterbreak>
5185
5207
  </rule>
@@ -5309,16 +5331,15 @@
5309
5331
  <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
5310
5332
  <afterbreak></afterbreak>
5311
5333
  </rule>
5312
- <rule break="no"><!-- URLs without "www."-->
5334
+ <rule break="no">
5313
5335
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
5314
5336
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
5315
5337
  </rule>
5316
- <rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
5338
+ <rule break="no">
5317
5339
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
5318
5340
  <afterbreak>[A-Za-z0-9\-]+\.(fr|com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
5319
5341
  </rule>
5320
5342
  <rule break="no">
5321
- <!-- gaffa.org -->
5322
5343
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
5323
5344
  <afterbreak>[A-Za-z]{2,5}(\.|\b)</afterbreak>
5324
5345
  </rule>
@@ -5363,15 +5384,15 @@
5363
5384
  <beforebreak>\b\p{L}\.</beforebreak>
5364
5385
  <afterbreak>\p{L}\.</afterbreak>
5365
5386
  </rule>
5366
- <rule break="no"><!-- Je suis (...) Chris. -->
5387
+ <rule break="no">
5367
5388
  <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
5368
5389
  <afterbreak>[^\p{P}]</afterbreak>
5369
5390
  </rule>
5370
- <rule break="no"><!-- Je suis (...?) Chris. -->
5391
+ <rule break="no">
5371
5392
  <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
5372
5393
  <afterbreak>[^\p{P}]</afterbreak>
5373
5394
  </rule>
5374
- <rule break="no"><!-- Jones v. Smith -->
5395
+ <rule break="no">
5375
5396
  <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
5376
5397
  <afterbreak>\p{Lu}\p{L}+</afterbreak>
5377
5398
  </rule>
@@ -5411,44 +5432,44 @@
5411
5432
  <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
5412
5433
  <afterbreak></afterbreak>
5413
5434
  </rule>
5414
- <rule break="no"><!-- i.e. -->
5415
- <beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
5435
+ <rule break="no">
5436
+ <beforebreak>i\.e\.[\s\u00A0]</beforebreak>
5416
5437
  <afterbreak></afterbreak>
5417
5438
  </rule>
5418
- <rule break="no"><!-- U.S.A (no dot at end) -->
5439
+ <rule break="no">
5419
5440
  <beforebreak>[A-Z]\.[A-Z]\.</beforebreak>
5420
5441
  <afterbreak>[A-Z]\b</afterbreak>
5421
5442
  </rule>
5422
- <rule break="no"><!-- L.A (no dot at end) -->
5443
+ <rule break="no">
5423
5444
  <beforebreak>\bL\.</beforebreak>
5424
5445
  <afterbreak>A\b</afterbreak>
5425
5446
  </rule>
5426
- <rule break="no"><!-- U.S (no dot at end) -->
5447
+ <rule break="no">
5427
5448
  <beforebreak>\bU\.</beforebreak>
5428
5449
  <afterbreak>[SK]\b</afterbreak>
5429
5450
  </rule>
5430
- <rule break="no"><!-- No. 5 -->
5451
+ <rule break="no">
5431
5452
  <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
5432
5453
  <afterbreak>\p{N}</afterbreak>
5433
5454
  </rule>
5434
- <rule break="no"><!-- Ph.D. -->
5455
+ <rule break="no">
5435
5456
  <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
5436
5457
  <afterbreak>D\.?</afterbreak>
5437
5458
  </rule>
5438
- <rule break="no"><!-- e.g. -->
5459
+ <rule break="no">
5439
5460
  <beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
5440
5461
  <afterbreak></afterbreak>
5441
5462
  </rule>
5442
- <rule break="no"><!-- vs. -->
5463
+ <rule break="no">
5443
5464
  <beforebreak>\bvs\.[\s\u00A0]</beforebreak>
5444
5465
  <afterbreak></afterbreak>
5445
5466
  </rule>
5446
5467
  <!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
5447
- <rule break="no"><!-- Etc. -->
5468
+ <rule break="no">
5448
5469
  <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
5449
5470
  <afterbreak>[^\p{Lu}]</afterbreak>
5450
5471
  </rule>
5451
- <rule break="no"><!-- BTW (by the way) -->
5472
+ <rule break="no">
5452
5473
  <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
5453
5474
  <afterbreak></afterbreak>
5454
5475
  </rule>
@@ -5456,39 +5477,39 @@
5456
5477
  <beforebreak>(?i)FRITZ!</beforebreak>
5457
5478
  <afterbreak>(?i)Box</afterbreak>
5458
5479
  </rule>
5459
- <rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 -->
5480
+ <rule break="no">
5460
5481
  <beforebreak>ID.</beforebreak>
5461
5482
  <afterbreak>3|4|Buzz|Crozz</afterbreak>
5462
5483
  </rule>
5463
- <rule break="no"><!-- Ph.D. (see rule PH_D) -->
5484
+ <rule break="no">
5464
5485
  <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
5465
5486
  <afterbreak></afterbreak>
5466
5487
  </rule>
5467
- <rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
5488
+ <rule break="no">
5468
5489
  <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
5469
5490
  <afterbreak></afterbreak>
5470
5491
  </rule>
5471
- <rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
5492
+ <rule break="no">
5472
5493
  <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
5473
5494
  <afterbreak></afterbreak>
5474
5495
  </rule>
5475
- <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
5496
+ <rule break="no">
5476
5497
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5477
5498
  <afterbreak>Eng\.?</afterbreak>
5478
5499
  </rule>
5479
- <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
5500
+ <rule break="no">
5480
5501
  <beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
5481
5502
  <afterbreak>[BM]\.?</afterbreak>
5482
5503
  </rule>
5483
- <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
5504
+ <rule break="no">
5484
5505
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5485
5506
  <afterbreak>Sc\.?</afterbreak>
5486
5507
  </rule>
5487
- <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
5508
+ <rule break="no">
5488
5509
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5489
5510
  <afterbreak>Comp?\.?</afterbreak>
5490
5511
  </rule>
5491
- <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
5512
+ <rule break="no">
5492
5513
  <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5493
5514
  <afterbreak>Arch\.?</afterbreak>
5494
5515
  </rule>
@@ -5534,7 +5555,6 @@
5534
5555
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
5535
5556
  </rule>
5536
5557
  </languagerule>
5537
-
5538
5558
  <languagerule languagerulename="Ukrainian">
5539
5559
  <!-- when sentence starts with ellipsis: ...Мазій і Юхим теж. -->
5540
5560
  <rule break="no">
@@ -5547,8 +5567,8 @@
5547
5567
  <afterbreak>\p{Lu}</afterbreak>
5548
5568
  </rule>
5549
5569
  <rule break="no">
5550
- <beforebreak>[.!?…][\h]+</beforebreak>
5551
- <afterbreak>[\h]*([«"„“(]|[&#x2010;-&#x2015;-][\h])\p{Ll}</afterbreak>
5570
+ <beforebreak>[.!?…][»“]?[\h]+</beforebreak>
5571
+ <afterbreak>[\h]*([«"„“(]|[‐-―-][\h])\p{Ll}</afterbreak>
5552
5572
  </rule>
5553
5573
  <rule break="yes">
5554
5574
  <beforebreak>\v[\h]*</beforebreak>
@@ -5562,7 +5582,7 @@
5562
5582
  <!-- various punctuation between lowercase letters -->
5563
5583
  <rule break="no">
5564
5584
  <beforebreak>\b\p{Ll}+[.!?][\h\v]*</beforebreak>
5565
- <afterbreak>\h*(([\(«]|[\[&#x2010;-&#x2015;-][\h\v]*)?\p{Ll})</afterbreak>
5585
+ <afterbreak>\h*(([\(«]|[\[‐-―-][\h\v]*)?\p{Ll})</afterbreak>
5566
5586
  </rule>
5567
5587
  <rule break="no">
5568
5588
  <beforebreak>([\[\(]*[\]\)]*|\.\.\.|…)[\h\v]+</beforebreak>
@@ -5583,7 +5603,6 @@
5583
5603
  <beforebreak>(^[\h\v]*|\([\h\v]*|[«„"]|(\b[А-ЯІЇЄҐACEIHOPX]\.-))[А-ЯІЇЄҐA-Z]\.[\h\v]*</beforebreak>
5584
5604
  <afterbreak></afterbreak>
5585
5605
  </rule>
5586
- <!-- І. В. Коваль, Т. 2, C. 202 -->
5587
5606
  <!-- Іван Ч. (1914 р. н.) -->
5588
5607
  <rule break="no">
5589
5608
  <beforebreak>[\h\v][А-ЯІЇЄҐ]\.[\h\v]*</beforebreak>
@@ -5606,7 +5625,7 @@
5606
5625
  -->
5607
5626
  <rule break="no">
5608
5627
  <beforebreak>\b([0-9]{2}|[0-9]{4})[\h\v]+р\.[\h\v]+</beforebreak>
5609
- <afterbreak>[\h\v]*[№0-9&#x2010;-&#x2015;-]</afterbreak>
5628
+ <afterbreak>[\h\v]*[№0-9‐-―-]</afterbreak>
5610
5629
  </rule>
5611
5630
  <!-- річка - р. Дніпро -->
5612
5631
  <rule break="no">
@@ -5615,7 +5634,7 @@
5615
5634
  </rule>
5616
5635
  <!-- У травні 1949 р. Грушківський район -->
5617
5636
  <rule break="no">
5618
- <beforebreak>[А-ЯІЇЄҐ][а-яіїєґ'’-]*([\h]+[а-яіїєґ'’-]+)?[\h](\d{4}[&#x2010;-&#x2015;-])*\d{4}[\h]*р\.[\h\v]*</beforebreak>
5637
+ <beforebreak>[А-ЯІЇЄҐ][а-яіїєґ'’-]*([\h]+[а-яіїєґ'’-]+)?[\h](\d{4}[‐-―-])*\d{4}[\h]*р\.[\h\v]*</beforebreak>
5619
5638
  <afterbreak>[\v\h]*(?!(На|Але|Так?)[\h\v]+)[А-ЯІЇЄҐA-Z][^\h\v]</afterbreak>
5620
5639
  </rule>
5621
5640
  <!-- 15 вересня 1995 р. Україною було підписно -->
@@ -5635,22 +5654,31 @@
5635
5654
  </rule>
5636
5655
  <!-- усталені скорочення, що не збігаються з нескороченими словами -->
5637
5656
  <rule break="no">
5638
- <!-- unfortunately \b ignores \u0301 -->
5639
- <beforebreak>\b(укр|рос|англ|амер|італ|ісп|нім|фр(анц)?|лат|грец(ьк))\.[\h\v]*</beforebreak>
5657
+ <beforebreak>\b(укр|рос|англ?|амер|італ|ісп|нім|фр(анц)?|лат|грец(ьк)?)\.[\h\v]*</beforebreak>
5640
5658
  <afterbreak></afterbreak>
5641
5659
  </rule>
5642
5660
  <rule break="no">
5643
- <!-- unfortunately \b ignores \u0301 -->
5644
- <beforebreak>\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]]ел|ч|част)\.[\h\v]*</beforebreak>
5661
+ <beforebreak>\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нпр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]ел|ч|част)\.[\h\v]*</beforebreak>
5645
5662
  <afterbreak></afterbreak>
5646
5663
  </rule>
5647
5664
  <rule break="no">
5648
- <!-- unfortunately \b ignores \u0301 -->
5665
+ <beforebreak>\b(кін)\.[\h\v]*</beforebreak>
5666
+ <afterbreak>[а-яіїєґ0-9IXV]|[ІХ]+\b</afterbreak>
5667
+ </rule>
5668
+ <rule break="no">
5649
5669
  <beforebreak>\b[сС]т\.[\h\v]</beforebreak>
5650
5670
  <afterbreak>[\h]*(?!([АВУОІЄ]|На|Але|Так?)[\h\v])</afterbreak>
5651
5671
  </rule>
5672
+ <!-- нар. 1945 р. | (1966 р. нар.) | 1975 — нар. Осипчук -->
5673
+ <rule break="no">
5674
+ <beforebreak>([0-9]|[-–—])[\h\v]+нар\.[\h\v]*</beforebreak>
5675
+ <afterbreak></afterbreak>
5676
+ </rule>
5677
+ <rule break="no">
5678
+ <beforebreak>\bнар\.[\h\v]*</beforebreak>
5679
+ <afterbreak>([0-9]|бл\.|арт\.)</afterbreak>
5680
+ </rule>
5652
5681
  <rule break="no">
5653
- <!-- no break only for дол. США -->
5654
5682
  <beforebreak>\bдол\.[\h\v]*</beforebreak>
5655
5683
  <afterbreak>США</afterbreak>
5656
5684
  </rule>
@@ -5666,7 +5694,7 @@
5666
5694
  </rule>
5667
5695
  <!-- Верховний орган, див. Африканський національний конгрес -->
5668
5696
  <rule break="no">
5669
- <beforebreak>[,&#x2010;-&#x2015;-][\h\v]*(див)\.[\h\v]*</beforebreak>
5697
+ <beforebreak>[,‐-―-][\h\v]*(див)\.[\h\v]*</beforebreak>
5670
5698
  <afterbreak></afterbreak>
5671
5699
  </rule>
5672
5700
  <!-- скорочення в дужках:
@@ -5678,10 +5706,14 @@
5678
5706
  </rule>
5679
5707
  <!-- abbreviation with proper noun: проф. Грицько, о. Лісове -->
5680
5708
  <rule break="no">
5681
- <beforebreak>\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|упоряд|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak>
5709
+ <beforebreak>\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|дир|тов|упоряд|тт|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak>
5682
5710
  <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
5683
5711
  </rule>
5684
5712
  <rule break="no">
5713
+ <beforebreak>(?&lt;![іи]\s+)\bдр\.[\h\v]*</beforebreak>
5714
+ <afterbreak>[\h\v]*[А-ЯІЇЄҐ]</afterbreak>
5715
+ </rule>
5716
+ <rule break="no">
5685
5717
  <beforebreak>\bМан\.[\h\v]*</beforebreak>
5686
5718
  <afterbreak>[\h\v]*([Сс]іті|[Юю]н)</afterbreak>
5687
5719
  </rule>
@@ -5690,27 +5722,25 @@
5690
5722
  <beforebreak>[^0-9][\h\v]+[Гг]р\.[\h\v]*</beforebreak>
5691
5723
  <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
5692
5724
  </rule>
5693
- <!-- арт. - артикул -->
5694
5725
  <!-- TODO: арт. - артист -->
5695
5726
  <rule break="no">
5696
5727
  <beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис)\.[\h\v]*</beforebreak>
5697
5728
  <afterbreak>[\h\v]*[0-9]</afterbreak>
5698
5729
  </rule>
5699
- <!-- ХІІ р., 3-6 арт. -->
5730
+ <!-- ХІІ р., 3-6 арт., 2-3 тт. -->
5700
5731
  <rule break="no">
5701
- <beforebreak>[0-9][\h\v]+арт\.[\h\v]*</beforebreak>
5732
+ <beforebreak>[0-9][\h\v]+(арт|тт)\.[\h\v]*</beforebreak>
5702
5733
  <afterbreak></afterbreak>
5703
5734
  </rule>
5704
- <!-- місто, але принаймні з парою літер в назві бо є ще метри (м) -->
5705
5735
  <!-- але розбиваємо «всього 20 м. Почалося» -->
5706
5736
  <rule break="no">
5707
5737
  <beforebreak>(?&lt;!\d[\h\v]*)\bм\.[\h\v]*</beforebreak>
5708
- <afterbreak>[А-ЯІЇЄҐ][а-яіїєґ]</afterbreak>
5738
+ <afterbreak>[А-ЯІЇЄҐ][а-яіїєґ']</afterbreak>
5709
5739
  </rule>
5710
5740
  <!-- село/сторінка/місто, але щоб не збігалося з секундами/метрами -->
5711
5741
  <rule break="no">
5712
5742
  <beforebreak>([\h\v][«(][см]|[^0-9/. ][\h\v]+[см])\.[\h\v]+</beforebreak>
5713
- <afterbreak>[А-ЯІЇЄҐ]</afterbreak>
5743
+ <afterbreak>[А-ЯІЇЄҐ][а-яіїєґ']</afterbreak>
5714
5744
  </rule>
5715
5745
  <!-- (реж. Емманюель -->
5716
5746
  <rule break="no">
@@ -5725,10 +5755,13 @@
5725
5755
  <!-- статус правових держав. — Авт.). -->
5726
5756
  <rule break="no">
5727
5757
  <beforebreak></beforebreak>
5728
- <afterbreak>[\h\v]*[&#x2010;-&#x2015;-][\h\v]*([Рр]ед|[Аа]вт)[\h\v]*\.[\)\]]</afterbreak>
5758
+ <afterbreak>[\h\v]*[‐-―-][\h\v]*([Рр]ед|[Аа]вт)[\h\v]*\.[\)\]]</afterbreak>
5759
+ </rule>
5760
+ <!-- Цензор.НЕТ -->
5761
+ <rule break="no">
5762
+ <beforebreak>[а-яіїєґ]\.</beforebreak>
5763
+ <afterbreak>НЕТ|Інфо|Info|City|Life|UA|Ру</afterbreak>
5729
5764
  </rule>
5730
- <!-- force the break -->
5731
- <!-- часто зустрічається крапка+U+202F+пробіл, який srx чомусь не розбиває на речення -->
5732
5765
  <!-- але лишаємо ініціали: С.\u202F Шелухин -->
5733
5766
  <rule break="yes">
5734
5767
  <beforebreak>(?&lt;!\h[А-ЯІЇЄҐ])[.!?…]{1,3}\u202F[\h\v]+</beforebreak>
@@ -5746,10 +5779,9 @@
5746
5779
  <!-- “Слон” (2008 р.) У минулому харків’янка -->
5747
5780
  <rule break="yes">
5748
5781
  <beforebreak>[.!?…]['»"„“”)\]›]?[\h\v]+</beforebreak>
5749
- <afterbreak>([&#x2010;-&#x2015;-][\h\v]*)?\p{Lu}[^\p{Lu}]</afterbreak>
5782
+ <afterbreak>([‐-―-][\h\v]*)?\p{Lu}[^\p{Lu}]</afterbreak>
5750
5783
  </rule>
5751
5784
  </languagerule>
5752
-
5753
5785
  <languagerule languagerulename="Belarusian">
5754
5786
  <rule break="no">
5755
5787
  <beforebreak>\b\d+\.\s</beforebreak>
@@ -6016,11 +6048,11 @@
6016
6048
  </rule>
6017
6049
  </languagerule>
6018
6050
  <languagerule languagerulename="Portuguese">
6019
- <rule break="no"><!-- URLs without "www."-->
6051
+ <rule break="no">
6020
6052
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
6021
6053
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
6022
6054
  </rule>
6023
- <rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
6055
+ <rule break="no">
6024
6056
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
6025
6057
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
6026
6058
  </rule>
@@ -6654,83 +6686,83 @@
6654
6686
  </rule>
6655
6687
  </languagerule>
6656
6688
  <languagerule languagerulename="Arabic">
6657
- <rule break="no">
6658
- <beforebreak>\bwww\.</beforebreak>
6659
- <afterbreak>\w</afterbreak>
6660
- </rule>
6661
- <rule break="no">
6662
- <beforebreak>[\[\(]*…[\]\)]* </beforebreak>
6663
- <afterbreak>\p{Ll}</afterbreak>
6664
- </rule>
6665
- <rule break="no">
6666
- <beforebreak>\p{Ps}[!?؟]+\p{Pe} </beforebreak>
6667
- <afterbreak></afterbreak>
6668
- </rule>
6669
- <rule break="no">
6670
- <beforebreak>[\.!?؟…]+\p{Pe} </beforebreak>
6671
- <afterbreak>\p{Ll}</afterbreak>
6672
- </rule>
6673
- <rule break="no">
6674
- <beforebreak>[«»"”']\s*</beforebreak>
6675
- <afterbreak>\s*\p{Ll}</afterbreak>
6676
- </rule>
6677
- <rule break="no">
6678
- <beforebreak>[«'"„][\.!?؟…]['"”»]\s</beforebreak>
6679
- <afterbreak></afterbreak>
6680
- </rule>
6681
- <rule break="no">
6682
- <beforebreak>\b\p{L}\.\s</beforebreak>
6683
- <afterbreak>\p{L}\.\s</afterbreak>
6684
- </rule>
6685
- <rule break="no">
6686
- <beforebreak>\b\p{L}\.</beforebreak>
6687
- <afterbreak>\p{L}\.</afterbreak>
6688
- </rule>
6689
- <rule break="yes">
6690
- <beforebreak>[^,،][\s]\p{L}{2}\.\s</beforebreak>
6691
- <afterbreak>\p{N}+\)\s</afterbreak>
6692
- </rule>
6693
- <rule break="no">
6694
- <beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak>
6695
- <afterbreak>[\p{N}\p{Ll}]</afterbreak>
6696
- </rule>
6697
- <rule break="no">
6698
- <beforebreak>[\[\(]*\.\.\.[\]\)]* </beforebreak>
6699
- <afterbreak>[^\p{Lu}]</afterbreak>
6700
- </rule>
6701
- <rule break="no">
6702
- <beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
6703
- <afterbreak></afterbreak>
6704
- </rule>
6705
- <rule break="no">
6706
- <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
6707
- <afterbreak></afterbreak>
6708
- </rule>
6709
- <rule break="no">
6710
- <beforebreak>[^\.]\s[ابتقجحخدذصضعغفقكلمنهوىيءةأ١٢٣٤٥٦٧٨٩٠A-Z]\.\s</beforebreak>
6711
- <afterbreak></afterbreak>
6712
- </rule>
6713
- <rule break="no">
6714
- <beforebreak>[^\.]\s[\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u0653\u0654\u0655\u0656\u0640]\.\s</beforebreak>
6715
- <afterbreak></afterbreak>
6716
- </rule>
6717
- <rule break="no">
6718
- <beforebreak>\(\p{Ll}+\.\s</beforebreak>
6719
- <afterbreak></afterbreak>
6720
- </rule>
6721
- <rule break="yes">
6722
- <beforebreak>[\.!?؟…][«»\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*\s</beforebreak>
6723
- <afterbreak></afterbreak>
6724
- </rule>
6725
- <rule break="yes">
6726
- <beforebreak>[\.!?؟…][«»'"\u00BB\u2019\u201D\u203A\p{Pe}\u0002]*</beforebreak>
6727
- <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
6728
- </rule>
6729
- <rule break="yes">
6730
- <beforebreak>\s\p{L}[\.!?؟…]\s</beforebreak>
6731
- <afterbreak>\p{Lu}\p{Ll}</afterbreak>
6732
- </rule>
6733
- </languagerule>
6689
+ <rule break="no">
6690
+ <beforebreak>\bwww\.</beforebreak>
6691
+ <afterbreak>\w</afterbreak>
6692
+ </rule>
6693
+ <rule break="no">
6694
+ <beforebreak>[\[\(]*…[\]\)]* </beforebreak>
6695
+ <afterbreak>\p{Ll}</afterbreak>
6696
+ </rule>
6697
+ <rule break="no">
6698
+ <beforebreak>\p{Ps}[!?؟]+\p{Pe} </beforebreak>
6699
+ <afterbreak></afterbreak>
6700
+ </rule>
6701
+ <rule break="no">
6702
+ <beforebreak>[\.!?؟…]+\p{Pe} </beforebreak>
6703
+ <afterbreak>\p{Ll}</afterbreak>
6704
+ </rule>
6705
+ <rule break="no">
6706
+ <beforebreak>[«»"”']\s*</beforebreak>
6707
+ <afterbreak>\s*\p{Ll}</afterbreak>
6708
+ </rule>
6709
+ <rule break="no">
6710
+ <beforebreak>[«'"„][\.!?؟…]['"”»]\s</beforebreak>
6711
+ <afterbreak></afterbreak>
6712
+ </rule>
6713
+ <rule break="no">
6714
+ <beforebreak>\b\p{L}\.\s</beforebreak>
6715
+ <afterbreak>\p{L}\.\s</afterbreak>
6716
+ </rule>
6717
+ <rule break="no">
6718
+ <beforebreak>\b\p{L}\.</beforebreak>
6719
+ <afterbreak>\p{L}\.</afterbreak>
6720
+ </rule>
6721
+ <rule break="yes">
6722
+ <beforebreak>[^,،][\s]\p{L}{2}\.\s</beforebreak>
6723
+ <afterbreak>\p{N}+\)\s</afterbreak>
6724
+ </rule>
6725
+ <rule break="no">
6726
+ <beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak>
6727
+ <afterbreak>[\p{N}\p{Ll}]</afterbreak>
6728
+ </rule>
6729
+ <rule break="no">
6730
+ <beforebreak>[\[\(]*\.\.\.[\]\)]* </beforebreak>
6731
+ <afterbreak>[^\p{Lu}]</afterbreak>
6732
+ </rule>
6733
+ <rule break="no">
6734
+ <beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
6735
+ <afterbreak></afterbreak>
6736
+ </rule>
6737
+ <rule break="no">
6738
+ <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
6739
+ <afterbreak></afterbreak>
6740
+ </rule>
6741
+ <rule break="no">
6742
+ <beforebreak>[^\.]\s[ابتقجحخدذصضعغفقكلمنهوىيءةأ١٢٣٤٥٦٧٨٩٠A-Z]\.\s</beforebreak>
6743
+ <afterbreak></afterbreak>
6744
+ </rule>
6745
+ <rule break="no">
6746
+ <beforebreak>[^\.]\s[\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u0653\u0654\u0655\u0656\u0640]\.\s</beforebreak>
6747
+ <afterbreak></afterbreak>
6748
+ </rule>
6749
+ <rule break="no">
6750
+ <beforebreak>\(\p{Ll}+\.\s</beforebreak>
6751
+ <afterbreak></afterbreak>
6752
+ </rule>
6753
+ <rule break="yes">
6754
+ <beforebreak>[\.!?؟…][«»\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*\s</beforebreak>
6755
+ <afterbreak></afterbreak>
6756
+ </rule>
6757
+ <rule break="yes">
6758
+ <beforebreak>[\.!?؟…][«»'"\u00BB\u2019\u201D\u203A\p{Pe}\u0002]*</beforebreak>
6759
+ <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
6760
+ </rule>
6761
+ <rule break="yes">
6762
+ <beforebreak>\s\p{L}[\.!?؟…]\s</beforebreak>
6763
+ <afterbreak>\p{Lu}\p{Ll}</afterbreak>
6764
+ </rule>
6765
+ </languagerule>
6734
6766
  </languagerules>
6735
6767
  <maprules>
6736
6768
  <languagemap languagepattern=".*" languagerulename="GeneralImportant"></languagemap>