srx-languagetool 0.8.0 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/main.yml +2 -2
- data/.ruby-version +1 -1
- data/CHANGELOG.md +8 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +23 -22
- data/lib/srx/languagetool/version.rb +1 -1
- data/lib/srx/segment.srx +277 -245
- data/srx-languagetool.gemspec +0 -7
- metadata +2 -86
data/lib/srx/segment.srx
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
<formathandle type="end" include="yes"></formathandle>
|
6
6
|
<formathandle type="isolated" include="no"></formathandle>
|
7
7
|
<okpsrx:options oneSegmentIncludesAll="no" trimLeadingWhitespaces="no" trimTrailingWhitespaces="no" useJavaRegex="yes" useIcu4JBreakRules="no" treatIsolatedCodesAsWhitespace="no"></okpsrx:options>
|
8
|
-
<okpsrx:sample language="
|
8
|
+
<okpsrx:sample language="nl" useMappedRules="yes">De organisatie Doe! is een rare.</okpsrx:sample>
|
9
9
|
<okpsrx:rangeRule></okpsrx:rangeRule>
|
10
10
|
</header>
|
11
11
|
<body>
|
@@ -1084,6 +1084,11 @@
|
|
1084
1084
|
<beforebreak>[\.!?…]['"\u00BB\u2019\u201D\u203A\u0002]*\p{Pe}\s</beforebreak>
|
1085
1085
|
<afterbreak>\p{Ll}</afterbreak>
|
1086
1086
|
</rule>
|
1087
|
+
<!--p. n.e. (błędny podział wiersza)-->
|
1088
|
+
<rule break="no">
|
1089
|
+
<beforebreak>p\.\s</beforebreak>
|
1090
|
+
<afterbreak>n\.\s?e\.</afterbreak>
|
1091
|
+
</rule>
|
1087
1092
|
<rule break="yes">
|
1088
1093
|
<beforebreak>[\.!?…]['"\p{Pe}\u00BB\u2019\u201D\u203A\u0002¹²³]*\s</beforebreak>
|
1089
1094
|
<afterbreak></afterbreak>
|
@@ -1106,7 +1111,7 @@
|
|
1106
1111
|
<beforebreak>[\u00A0\s]</beforebreak>
|
1107
1112
|
<afterbreak>\n</afterbreak>
|
1108
1113
|
</rule>
|
1109
|
-
<rule break="no"
|
1114
|
+
<rule break="no">
|
1110
1115
|
<beforebreak>[a-zA-Z][!\?][\s\u00A0]</beforebreak>
|
1111
1116
|
<afterbreak>\)[\s\u00A0][a-zA-Z]</afterbreak>
|
1112
1117
|
</rule>
|
@@ -1114,96 +1119,100 @@
|
|
1114
1119
|
<beforebreak>Yahoo![\s\u00A0]</beforebreak>
|
1115
1120
|
<afterbreak>\p{Ll}</afterbreak>
|
1116
1121
|
</rule>
|
1117
|
-
<rule break="no"
|
1122
|
+
<rule break="no">
|
1118
1123
|
<beforebreak>[A-Z]\.[A-Z]\.</beforebreak>
|
1119
1124
|
<afterbreak>[A-Z]\b</afterbreak>
|
1120
1125
|
</rule>
|
1121
|
-
<rule break="no"
|
1126
|
+
<rule break="no">
|
1122
1127
|
<beforebreak>\bA\.</beforebreak>
|
1123
1128
|
<afterbreak>I\b</afterbreak>
|
1124
1129
|
</rule>
|
1125
|
-
<rule break="no"
|
1130
|
+
<rule break="no">
|
1126
1131
|
<beforebreak>\bS\.</beforebreak>
|
1127
1132
|
<afterbreak>I\b</afterbreak>
|
1128
1133
|
</rule>
|
1129
|
-
<rule break="no"
|
1134
|
+
<rule break="no">
|
1130
1135
|
<beforebreak>\bL\.</beforebreak>
|
1131
1136
|
<afterbreak>A\b</afterbreak>
|
1132
1137
|
</rule>
|
1133
|
-
<rule break="no"
|
1138
|
+
<rule break="no">
|
1134
1139
|
<beforebreak>\bU\.</beforebreak>
|
1135
1140
|
<afterbreak>[SK]\b</afterbreak>
|
1136
1141
|
</rule>
|
1137
|
-
<rule break="no"
|
1142
|
+
<rule break="no">
|
1138
1143
|
<beforebreak>\bI\.</beforebreak>
|
1139
1144
|
<afterbreak>S\b</afterbreak>
|
1140
1145
|
</rule>
|
1141
|
-
<rule break="no"
|
1146
|
+
<rule break="no">
|
1142
1147
|
<beforebreak>\bM\.</beforebreak>
|
1143
1148
|
<afterbreak>Z\b</afterbreak>
|
1144
1149
|
</rule>
|
1145
|
-
<rule break="no"
|
1150
|
+
<rule break="no">
|
1146
1151
|
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
|
1147
1152
|
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
|
1148
1153
|
</rule>
|
1149
|
-
<rule break="no"
|
1154
|
+
<rule break="no">
|
1150
1155
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
1151
1156
|
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl|be|dev|co|fr|dk|se)(\.|\b)</afterbreak>
|
1152
1157
|
</rule>
|
1153
|
-
<rule break="no"
|
1158
|
+
<rule break="no">
|
1154
1159
|
<beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
|
1155
1160
|
<afterbreak>\p{N}</afterbreak>
|
1156
1161
|
</rule>
|
1157
|
-
<rule break="no"
|
1162
|
+
<rule break="no">
|
1158
1163
|
<beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
|
1159
1164
|
<afterbreak>D\.?</afterbreak>
|
1160
1165
|
</rule>
|
1161
|
-
<rule break="no"
|
1166
|
+
<rule break="no">
|
1162
1167
|
<beforebreak>\b([Aa]vg|[Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Ll]tda|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|Ing|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
|
1163
1168
|
<afterbreak>[^\p{Lu}]|I</afterbreak>
|
1164
1169
|
</rule>
|
1165
|
-
<rule break="no"
|
1170
|
+
<rule break="no">
|
1166
1171
|
<beforebreak>\b(hr)\.[\s\u00A0]</beforebreak>
|
1167
1172
|
<afterbreak>[^\p{Lu}]|I</afterbreak>
|
1168
1173
|
</rule>
|
1169
|
-
<rule break="no"
|
1174
|
+
<rule break="no">
|
1170
1175
|
<beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
|
1171
1176
|
<afterbreak>\p{N}|[IXV]+</afterbreak>
|
1172
1177
|
</rule>
|
1173
|
-
<rule break="no"
|
1178
|
+
<rule break="no">
|
1174
1179
|
<beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
|
1175
1180
|
<afterbreak>\(\p{N}\)</afterbreak>
|
1176
1181
|
</rule>
|
1177
|
-
<rule break="no"
|
1182
|
+
<rule break="no">
|
1178
1183
|
<beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
|
1179
1184
|
<afterbreak>[^\p{P}]</afterbreak>
|
1180
1185
|
</rule>
|
1181
|
-
<rule break="no"
|
1186
|
+
<rule break="no">
|
1187
|
+
<beforebreak>[?!.…]["”]\)[\s\u00A0]</beforebreak>
|
1188
|
+
<afterbreak>[a-z].*</afterbreak>
|
1189
|
+
</rule>
|
1190
|
+
<rule break="no">
|
1182
1191
|
<beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
|
1183
1192
|
<afterbreak>[^\p{P}]</afterbreak>
|
1184
1193
|
</rule>
|
1185
|
-
<rule break="no"
|
1194
|
+
<rule break="no">
|
1186
1195
|
<beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
|
1187
1196
|
<afterbreak></afterbreak>
|
1188
1197
|
</rule>
|
1189
|
-
<rule break="no"
|
1198
|
+
<rule break="no">
|
1190
1199
|
<beforebreak>\b[Vv]s\.[\s\u00A0]</beforebreak>
|
1191
1200
|
<afterbreak></afterbreak>
|
1192
1201
|
</rule>
|
1193
|
-
<rule break="no"
|
1202
|
+
<rule break="no">
|
1194
1203
|
<beforebreak>\b(pp|PP)\.[\s\u00A0]</beforebreak>
|
1195
1204
|
<afterbreak></afterbreak>
|
1196
1205
|
</rule>
|
1197
|
-
<rule break="no"
|
1206
|
+
<rule break="no">
|
1198
1207
|
<beforebreak>\be[sx]p\.[\s\u00A0]</beforebreak>
|
1199
1208
|
<afterbreak></afterbreak>
|
1200
1209
|
</rule>
|
1201
1210
|
<!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
|
1202
|
-
<rule break="no"
|
1211
|
+
<rule break="no">
|
1203
1212
|
<beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
|
1204
1213
|
<afterbreak>[^\p{Lu}]</afterbreak>
|
1205
1214
|
</rule>
|
1206
|
-
<rule break="no"
|
1215
|
+
<rule break="no">
|
1207
1216
|
<beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
|
1208
1217
|
<afterbreak></afterbreak>
|
1209
1218
|
</rule>
|
@@ -1251,39 +1260,39 @@
|
|
1251
1260
|
<beforebreak>(?i)FRITZ!</beforebreak>
|
1252
1261
|
<afterbreak>(?i)Box</afterbreak>
|
1253
1262
|
</rule>
|
1254
|
-
<rule break="no"
|
1263
|
+
<rule break="no">
|
1255
1264
|
<beforebreak>ID.</beforebreak>
|
1256
1265
|
<afterbreak>3|4|Buzz|Crozz</afterbreak>
|
1257
1266
|
</rule>
|
1258
|
-
<rule break="no"
|
1267
|
+
<rule break="no">
|
1259
1268
|
<beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
|
1260
1269
|
<afterbreak></afterbreak>
|
1261
1270
|
</rule>
|
1262
|
-
<rule break="no"
|
1271
|
+
<rule break="no">
|
1263
1272
|
<beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
|
1264
1273
|
<afterbreak></afterbreak>
|
1265
1274
|
</rule>
|
1266
|
-
<rule break="no"
|
1275
|
+
<rule break="no">
|
1267
1276
|
<beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
|
1268
1277
|
<afterbreak></afterbreak>
|
1269
1278
|
</rule>
|
1270
|
-
<rule break="no"
|
1279
|
+
<rule break="no">
|
1271
1280
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
1272
1281
|
<afterbreak>Eng\.?</afterbreak>
|
1273
1282
|
</rule>
|
1274
|
-
<rule break="no"
|
1283
|
+
<rule break="no">
|
1275
1284
|
<beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
|
1276
1285
|
<afterbreak>[BM]\.?</afterbreak>
|
1277
1286
|
</rule>
|
1278
|
-
<rule break="no"
|
1287
|
+
<rule break="no">
|
1279
1288
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
1280
1289
|
<afterbreak>Sc\.?</afterbreak>
|
1281
1290
|
</rule>
|
1282
|
-
<rule break="no"
|
1291
|
+
<rule break="no">
|
1283
1292
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
1284
1293
|
<afterbreak>Comp?\.?</afterbreak>
|
1285
1294
|
</rule>
|
1286
|
-
<rule break="no"
|
1295
|
+
<rule break="no">
|
1287
1296
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
1288
1297
|
<afterbreak>Arch\.?</afterbreak>
|
1289
1298
|
</rule>
|
@@ -1375,7 +1384,7 @@
|
|
1375
1384
|
<beforebreak>\b\p{L}\.</beforebreak>
|
1376
1385
|
<afterbreak>\p{L}\.</afterbreak>
|
1377
1386
|
</rule>
|
1378
|
-
<rule break="no"
|
1387
|
+
<rule break="no">
|
1379
1388
|
<beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
|
1380
1389
|
<afterbreak>\p{Lu}\p{L}+</afterbreak>
|
1381
1390
|
</rule>
|
@@ -1388,7 +1397,7 @@
|
|
1388
1397
|
<afterbreak>\p{Ll}+</afterbreak>
|
1389
1398
|
</rule>
|
1390
1399
|
<rule break="no">
|
1391
|
-
<beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak
|
1400
|
+
<beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak>
|
1392
1401
|
<afterbreak>[\p{N}\p{Ll}]</afterbreak>
|
1393
1402
|
</rule>
|
1394
1403
|
<rule break="no">
|
@@ -1419,8 +1428,8 @@
|
|
1419
1428
|
<beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
|
1420
1429
|
<afterbreak></afterbreak>
|
1421
1430
|
</rule>
|
1422
|
-
<rule break="no"
|
1423
|
-
<beforebreak>i\.e\.[\s\u00A0]</beforebreak
|
1431
|
+
<rule break="no">
|
1432
|
+
<beforebreak>i\.e\.[\s\u00A0]</beforebreak>
|
1424
1433
|
<afterbreak></afterbreak>
|
1425
1434
|
</rule>
|
1426
1435
|
<rule break="yes">
|
@@ -1532,37 +1541,52 @@
|
|
1532
1541
|
</languagerule>
|
1533
1542
|
<languagerule languagerulename="Dutch">
|
1534
1543
|
<rule break="no">
|
1535
|
-
<!-- sp.a -->
|
1536
1544
|
<beforebreak>\b(sp|SP)</beforebreak>
|
1537
1545
|
<afterbreak>\.[aA]\b</afterbreak>
|
1538
1546
|
</rule>
|
1539
1547
|
<rule break="no">
|
1540
|
-
<!-- .Net -->
|
1541
1548
|
<beforebreak>\s[.]</beforebreak>
|
1542
1549
|
<afterbreak>[Nn][Ee][Tt](\b|-)</afterbreak>
|
1543
1550
|
</rule>
|
1544
|
-
<rule break="no"
|
1551
|
+
<rule break="no">
|
1545
1552
|
<beforebreak>[.?!][’'"]</beforebreak>
|
1546
1553
|
<afterbreak> [a-z]</afterbreak>
|
1547
1554
|
</rule>
|
1548
|
-
<rule break="no"
|
1555
|
+
<rule break="no">
|
1549
1556
|
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
|
1550
1557
|
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
|
1551
1558
|
</rule>
|
1552
|
-
<rule break="no"
|
1559
|
+
<rule break="no">
|
1553
1560
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
1554
1561
|
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
|
1555
1562
|
</rule>
|
1556
|
-
<rule break="no"
|
1557
|
-
<beforebreak>\b(
|
1563
|
+
<rule break="no">
|
1564
|
+
<beforebreak>\b(blz|pag|fig)\.\s</beforebreak>
|
1565
|
+
<afterbreak>[0-9]</afterbreak>
|
1566
|
+
</rule>
|
1567
|
+
<!--Abbrevs that can happen in sentence and at end-->
|
1568
|
+
<rule break="no">
|
1569
|
+
<beforebreak>\b(enz|etc|zat|ambt|al|ver|art|wed|lab|bv|Bros)\.\s</beforebreak>
|
1570
|
+
<afterbreak>\p{Ll}</afterbreak>
|
1571
|
+
</rule>
|
1572
|
+
<rule break="yes">
|
1573
|
+
<beforebreak>\s(la|do|del)\sMar\.\s</beforebreak>
|
1558
1574
|
<afterbreak></afterbreak>
|
1559
1575
|
</rule>
|
1560
1576
|
<rule break="no">
|
1561
|
-
<beforebreak>\b(
|
1577
|
+
<beforebreak>\b(Ge?n|Ex|Le?v|Nu?m|D(eu)?t|Jo?z|Ri|R[ei]cht|Sa?m|Ko?n|Kr[on]{0,2}|Neh?|Est?|Jb|Ps|Spr?|Pr[ed]{0,2}|H(oog)?l|Je?s|Je?r|Kl(aagl)?|Ez(ech)?|Da?n|Ho?s|Jl|Am|Ob|Mc|Mi[ch]{0,2}|Nah?|Hk|Hab|Zf|[SZ]ef|Ha?g|Zc|Zach|Ma?l|Ma?t|Mk|Mar|Lk|Jh|H(an)?d|Ro?m|Kor|Ga?l|Ef|Fp|Fil|Ko|[CK]ol|Th|Th?e[s]{1,2}|Tm|Ti?t|Fm|Fil(em)?|Hb|Hebr?|Jk|Ja[ck]|Pe?tr?|Joh|Jud|Op(enb)?|Wijsh|Tob|Sir|Bar|Makk)\.\s</beforebreak>
|
1562
1578
|
<afterbreak></afterbreak>
|
1563
1579
|
</rule>
|
1564
1580
|
<rule break="no">
|
1565
|
-
<beforebreak>\b(
|
1581
|
+
<beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|(Prof\.)?[Dd]r|Em|Fa|Kon|Stb)\.\s</beforebreak>
|
1582
|
+
<afterbreak>\p{Lu}</afterbreak>
|
1583
|
+
</rule>
|
1584
|
+
<rule break="no">
|
1585
|
+
<beforebreak>\b(Stb)\.\s</beforebreak>
|
1586
|
+
<afterbreak>[0-9]</afterbreak>
|
1587
|
+
</rule>
|
1588
|
+
<rule break="no">
|
1589
|
+
<beforebreak>\b([Mm]ej|[Mm]evr|[Mm]rs|[Mm]s|[Mm]gr|[Mm]w|Ndl|Ned|Nl|No|Prof|[Ss]ecr|Chr|Jac|[Ww]ed|Zr)\.\s</beforebreak>
|
1566
1590
|
<afterbreak></afterbreak>
|
1567
1591
|
</rule>
|
1568
1592
|
<rule break="no">
|
@@ -1570,23 +1594,27 @@
|
|
1570
1594
|
<afterbreak></afterbreak>
|
1571
1595
|
</rule>
|
1572
1596
|
<rule break="no">
|
1573
|
-
<beforebreak>\b(abs|abstr|adj|adm|
|
1597
|
+
<beforebreak>\b(abs|abstr|adj|adm|[Aa]fb|[Aa]fd|afk|afl|milj|zgn|plv|bvb|afm|evt|exp|vs)\.\s</beforebreak>
|
1574
1598
|
<afterbreak></afterbreak>
|
1575
1599
|
</rule>
|
1576
1600
|
<rule break="no">
|
1577
|
-
<beforebreak>\b(
|
1601
|
+
<beforebreak>\b(ald|alg|amb|anat|antrop|apoth)\.\s</beforebreak>
|
1578
1602
|
<afterbreak></afterbreak>
|
1579
1603
|
</rule>
|
1604
|
+
<rule break="yes">
|
1605
|
+
<beforebreak>\seen\sprof\.\s</beforebreak>
|
1606
|
+
<afterbreak>\p{Lu}</afterbreak>
|
1607
|
+
</rule>
|
1580
1608
|
<rule break="no">
|
1581
1609
|
<beforebreak>\b(alc|bro|opm|acc)\.\s</beforebreak>
|
1582
1610
|
<afterbreak></afterbreak>
|
1583
1611
|
</rule>
|
1584
1612
|
<rule break="no">
|
1585
|
-
<beforebreak>\b(arch|
|
1613
|
+
<beforebreak>\b(arch|archeolbc|bep|betr|bez|bibl|bijl|[Bb]ijv)\.\s</beforebreak>
|
1586
1614
|
<afterbreak></afterbreak>
|
1587
1615
|
</rule>
|
1588
1616
|
<rule break="no">
|
1589
|
-
<beforebreak>\b(bijz|
|
1617
|
+
<beforebreak>\b(bijz|bw|ca|cat|centr|cf|cfr|cmpl)\.\s</beforebreak>
|
1590
1618
|
<afterbreak></afterbreak>
|
1591
1619
|
</rule>
|
1592
1620
|
<rule break="no">
|
@@ -1594,47 +1622,47 @@
|
|
1594
1622
|
<afterbreak></afterbreak>
|
1595
1623
|
</rule>
|
1596
1624
|
<rule break="no">
|
1597
|
-
<beforebreak>\b(
|
1625
|
+
<beforebreak>\b([Ee]d|em|ev|[Ee]xcl|[Ff]a|[Ff]am|[fF]ig|fin|fl|fr)\.\s</beforebreak>
|
1598
1626
|
<afterbreak></afterbreak>
|
1599
1627
|
</rule>
|
1600
1628
|
<rule break="no">
|
1601
|
-
<beforebreak>\b(geb|[Gg]em|get|gld|id|[Ii]ncl|ind|inf|ing|intern|inz|ir|jhr|jkvr)\.\s</beforebreak>
|
1629
|
+
<beforebreak>\b(geb|[Gg]em|get|gld|id|[Ii]ncl|ind|inf|ing|intern|[Ss]ec|inz|ir|jhr|jkvr)\.\s</beforebreak>
|
1602
1630
|
<afterbreak></afterbreak>
|
1603
1631
|
</rule>
|
1604
1632
|
<rule break="no">
|
1605
|
-
<beforebreak>\b(jl|jr|kr|kt|
|
1633
|
+
<beforebreak>\b(jl|jr|kr|kt|lic|ll|lt|lw|max|[Mm]evr|mi|[Mm]in|mld)\.\s</beforebreak>
|
1606
1634
|
<afterbreak></afterbreak>
|
1607
1635
|
</rule>
|
1608
1636
|
<rule break="no">
|
1609
|
-
<beforebreak>\b(mln|
|
1637
|
+
<beforebreak>\b(mln|[Mm]r|[Mm]w|nl|no|nr|nrs|ob|obl|ong|onov)\.\s</beforebreak>
|
1610
1638
|
<afterbreak></afterbreak>
|
1611
1639
|
</rule>
|
1612
1640
|
<rule break="no">
|
1613
|
-
<beforebreak>\b(opm|org|ov|
|
1641
|
+
<beforebreak>\b(opm|org|ov|[Pp]ag|par|penn|([1-3][\.e]?)[\s]?pers|plm|plv)\.\s</beforebreak>
|
1614
1642
|
<afterbreak></afterbreak>
|
1615
1643
|
</rule>
|
1616
1644
|
<rule break="no">
|
1617
|
-
<beforebreak>\b(prov|pseud|psych|qty|red|ref|resp|soc|st|tab|tel|temp|tk)\.\s</beforebreak>
|
1645
|
+
<beforebreak>\b(prov|pseud|psych|qty|red|ref|resp|soc|st|tab|tel|temp|prof|tk)\.\s</beforebreak>
|
1618
1646
|
<afterbreak></afterbreak>
|
1619
1647
|
</rule>
|
1620
1648
|
<rule break="no">
|
1621
1649
|
<beforebreak>\b([A-Z]|Adr|Chr|Fr|Fred|IJ|Jac|Joh|Ph|St|Th|Tj|v|v\.(\s)?d)\.(\s)?</beforebreak>
|
1622
|
-
<afterbreak
|
1650
|
+
<afterbreak>\p{Lu}</afterbreak>
|
1623
1651
|
</rule>
|
1624
1652
|
<rule break="no">
|
1625
1653
|
<beforebreak>\b[vn]\.\s</beforebreak>
|
1626
1654
|
<afterbreak>Chr</afterbreak>
|
1627
1655
|
</rule>
|
1628
1656
|
<rule break="no">
|
1629
|
-
<beforebreak>\b(uitsl|
|
1630
|
-
<afterbreak
|
1657
|
+
<beforebreak>\b(uitsl|vgl|vnl|vnw|voorz|ww|zat|[Zz]elfst|zgn?)\.\s</beforebreak>
|
1658
|
+
<afterbreak>\p{Ll}</afterbreak>
|
1631
1659
|
</rule>
|
1632
1660
|
<rule break="no">
|
1633
|
-
<beforebreak>\b(mm|cm|km|mg|kg|h|kW|mW)\.\s</beforebreak>
|
1661
|
+
<beforebreak>\b(mm|cm|km|ml|mg|kg|h|kW|kg|mW)\.\s</beforebreak>
|
1634
1662
|
<afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
|
1635
1663
|
</rule>
|
1636
1664
|
<rule break="yes">
|
1637
|
-
<beforebreak>\b(mm|cm|km|ml|kg|kW|
|
1665
|
+
<beforebreak>\b(mm|cm|km|ml|mg|kg|h|kW|kg|mW)\.\s</beforebreak>
|
1638
1666
|
<afterbreak></afterbreak>
|
1639
1667
|
</rule>
|
1640
1668
|
<rule break="no">
|
@@ -1686,10 +1714,6 @@
|
|
1686
1714
|
<afterbreak></afterbreak>
|
1687
1715
|
</rule>
|
1688
1716
|
<rule break="no">
|
1689
|
-
<beforebreak>\b\p{Lu}\p{Ll}\.\s?</beforebreak>
|
1690
|
-
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
1691
|
-
</rule>
|
1692
|
-
<rule break="no">
|
1693
1717
|
<beforebreak>\.\p{Lu}\p{Ll}\.\s?</beforebreak>
|
1694
1718
|
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
1695
1719
|
</rule>
|
@@ -1698,14 +1722,6 @@
|
|
1698
1722
|
<beforebreak>\b\d+\.\s</beforebreak>
|
1699
1723
|
<afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
|
1700
1724
|
</rule>
|
1701
|
-
<rule break="yes">
|
1702
|
-
<beforebreak>\been\sprof\.\s</beforebreak>
|
1703
|
-
<afterbreak>[^\p{Ll}]</afterbreak>
|
1704
|
-
</rule>
|
1705
|
-
<rule break="no">
|
1706
|
-
<beforebreak>\bprof\.\s</beforebreak>
|
1707
|
-
<afterbreak></afterbreak>
|
1708
|
-
</rule>
|
1709
1725
|
<rule break="no">
|
1710
1726
|
<beforebreak>[.!?…][’'"]\s</beforebreak>
|
1711
1727
|
<afterbreak>[a-z]</afterbreak>
|
@@ -1723,11 +1739,19 @@
|
|
1723
1739
|
<afterbreak>[a-z]</afterbreak>
|
1724
1740
|
</rule>
|
1725
1741
|
<rule break="yes">
|
1726
|
-
<beforebreak
|
1742
|
+
<beforebreak>\s'[2-9][.]\s</beforebreak>
|
1743
|
+
<afterbreak></afterbreak>
|
1744
|
+
</rule>
|
1745
|
+
<rule break="no">
|
1746
|
+
<beforebreak>\s[A-Z].+!\s</beforebreak>
|
1747
|
+
<afterbreak>[a-z]</afterbreak>
|
1748
|
+
</rule>
|
1749
|
+
<rule break="yes">
|
1750
|
+
<beforebreak>[.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002¹²³]*\s</beforebreak>
|
1727
1751
|
<afterbreak></afterbreak>
|
1728
1752
|
</rule>
|
1729
1753
|
<rule break="yes">
|
1730
|
-
<beforebreak>[
|
1754
|
+
<beforebreak>[.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]*</beforebreak>
|
1731
1755
|
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
1732
1756
|
</rule>
|
1733
1757
|
<rule break="yes">
|
@@ -1768,31 +1792,29 @@
|
|
1768
1792
|
<afterbreak>['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002][A-Z][a-z]</afterbreak>
|
1769
1793
|
</rule>
|
1770
1794
|
<rule break="no">
|
1771
|
-
<!-- "E. coli etc. -->
|
1772
1795
|
<beforebreak>"[A-Z][.]\s</beforebreak>
|
1773
1796
|
<afterbreak>[a-z]</afterbreak>
|
1774
1797
|
</rule>
|
1775
1798
|
<rule break="no">
|
1776
|
-
<!-- Cornelisz. -->
|
1777
1799
|
<beforebreak>[A-Z][a-z].*sz[.]\s</beforebreak>
|
1778
1800
|
<afterbreak>[a-z]</afterbreak>
|
1779
1801
|
</rule>
|
1780
1802
|
<rule break="no">
|
1781
|
-
<!-- De n. XIV/vagus (nervus) -->
|
1782
1803
|
<beforebreak>De n[.]\s</beforebreak>
|
1783
1804
|
<afterbreak>[a-z]|[XIV]</afterbreak>
|
1784
1805
|
</rule>
|
1785
1806
|
<rule break="no">
|
1786
|
-
<!-- MOL.E -->
|
1787
1807
|
<beforebreak>[A-Z]{2,5}[.]</beforebreak>
|
1788
1808
|
<afterbreak>[A-Z]</afterbreak>
|
1789
1809
|
</rule>
|
1790
1810
|
<rule break="no">
|
1791
|
-
<!-- ..." betekent -->
|
1792
1811
|
<beforebreak>\.\.</beforebreak>
|
1793
1812
|
<afterbreak>" [a-z]</afterbreak>
|
1794
1813
|
</rule>
|
1795
|
-
|
1814
|
+
<rule break="no">
|
1815
|
+
<beforebreak>\sBTW\.</beforebreak>
|
1816
|
+
<afterbreak>\p{Ll}</afterbreak>
|
1817
|
+
</rule>
|
1796
1818
|
</languagerule>
|
1797
1819
|
<languagerule languagerulename="Slovak">
|
1798
1820
|
<rule break="no">
|
@@ -4370,7 +4392,7 @@
|
|
4370
4392
|
</rule>
|
4371
4393
|
<rule break="no">
|
4372
4394
|
<beforebreak>\b(р|ред|Рис|рус|с|сб|св|См|см|сов|соч|соц|спец|ср|ст|стр|т|тел|Тел|тех|тов|тт|туп)\.\s</beforebreak>
|
4373
|
-
<afterbreak
|
4395
|
+
<afterbreak>\p{Ll}</afterbreak>
|
4374
4396
|
</rule>
|
4375
4397
|
<rule break="no">
|
4376
4398
|
<beforebreak>\b(руб|Руб|тыс|Тыс|трлн)\.\s</beforebreak>
|
@@ -4654,7 +4676,7 @@
|
|
4654
4676
|
<afterbreak>[XIV\d]+\b</afterbreak>
|
4655
4677
|
</rule>
|
4656
4678
|
<rule break="no">
|
4657
|
-
<beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4679
|
+
<beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|seg|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4658
4680
|
<afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
|
4659
4681
|
</rule>
|
4660
4682
|
<!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
|
@@ -4721,12 +4743,10 @@
|
|
4721
4743
|
</rule>
|
4722
4744
|
</languagerule>
|
4723
4745
|
<languagerule languagerulename="Spanish">
|
4724
|
-
|
4725
4746
|
<rule break="no">
|
4726
4747
|
<beforebreak>¿[^?]+:[\s\u00A0]</beforebreak>
|
4727
4748
|
<afterbreak>.</afterbreak>
|
4728
4749
|
</rule>
|
4729
|
-
|
4730
4750
|
<rule break="no">
|
4731
4751
|
<beforebreak>Yahoo![\s\u00A0]</beforebreak>
|
4732
4752
|
<afterbreak>\p{Ll}</afterbreak>
|
@@ -4742,7 +4762,7 @@
|
|
4742
4762
|
<!-- initials: A. C. Jones. Problem: [...] de Alfons I. Él era [...] -->
|
4743
4763
|
<rule break="no">
|
4744
4764
|
<beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
|
4745
|
-
<afterbreak
|
4765
|
+
<afterbreak></afterbreak>
|
4746
4766
|
</rule>
|
4747
4767
|
<!-- Ellipsis: ... lowercase -->
|
4748
4768
|
<rule break="no">
|
@@ -4772,39 +4792,37 @@
|
|
4772
4792
|
<afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
|
4773
4793
|
</rule>
|
4774
4794
|
<rule break="no">
|
4775
|
-
<!-- URLs without "www."-->
|
4776
4795
|
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
|
4777
4796
|
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
|
4778
4797
|
</rule>
|
4779
4798
|
<rule break="no">
|
4780
|
-
<!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
|
4781
4799
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
4782
4800
|
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
|
4783
4801
|
</rule>
|
4784
4802
|
<!-- Abbreviations that cannot finish sentences-->
|
4785
4803
|
<rule break="no">
|
4786
4804
|
<beforebreak>\b((?iu)(en|febr|mzo|abr|my|jun|jul|ag|agt|set|sept|setbre|oct|nov|novbre|dic|dicbre))\.[\s\u00A0]</beforebreak>
|
4787
|
-
<afterbreak
|
4805
|
+
<afterbreak></afterbreak>
|
4788
4806
|
</rule>
|
4789
4807
|
<rule break="no">
|
4790
4808
|
<beforebreak>\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
|
4791
|
-
<afterbreak
|
4809
|
+
<afterbreak></afterbreak>
|
4792
4810
|
</rule>
|
4793
4811
|
<rule break="no">
|
4794
4812
|
<beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0]</beforebreak>
|
4795
|
-
<afterbreak
|
4813
|
+
<afterbreak></afterbreak>
|
4796
4814
|
</rule>
|
4797
4815
|
<rule break="no">
|
4798
4816
|
<beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0]</beforebreak>
|
4799
|
-
<afterbreak
|
4817
|
+
<afterbreak></afterbreak>
|
4800
4818
|
</rule>
|
4801
4819
|
<rule break="no">
|
4802
4820
|
<beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0]</beforebreak>
|
4803
|
-
<afterbreak
|
4821
|
+
<afterbreak></afterbreak>
|
4804
4822
|
</rule>
|
4805
4823
|
<rule break="no">
|
4806
4824
|
<beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0]</beforebreak>
|
4807
|
-
<afterbreak
|
4825
|
+
<afterbreak></afterbreak>
|
4808
4826
|
</rule>
|
4809
4827
|
<!-- Abbreviations that can finish sentences -->
|
4810
4828
|
<rule break="no">
|
@@ -4837,7 +4855,7 @@
|
|
4837
4855
|
<!-- Composed abbrev. -->
|
4838
4856
|
<rule break="no">
|
4839
4857
|
<beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4840
|
-
<afterbreak
|
4858
|
+
<afterbreak></afterbreak>
|
4841
4859
|
</rule>
|
4842
4860
|
<!-- Units -->
|
4843
4861
|
<rule break="no">
|
@@ -4859,11 +4877,11 @@
|
|
4859
4877
|
</rule>
|
4860
4878
|
</languagerule>
|
4861
4879
|
<languagerule languagerulename="German">
|
4862
|
-
<rule break="no"
|
4880
|
+
<rule break="no">
|
4863
4881
|
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
|
4864
4882
|
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
|
4865
4883
|
</rule>
|
4866
|
-
<rule break="no"
|
4884
|
+
<rule break="no">
|
4867
4885
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
4868
4886
|
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
|
4869
4887
|
</rule>
|
@@ -4882,27 +4900,27 @@
|
|
4882
4900
|
<beforebreak>[^-\p{L}'’/°]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
|
4883
4901
|
<afterbreak></afterbreak>
|
4884
4902
|
</rule>
|
4885
|
-
<rule break="no"
|
4903
|
+
<rule break="no">
|
4886
4904
|
<beforebreak>([Dd](as|er|ie|iese[rsmn]?|en|em)|[kmsd]?ein(e[rsnm]?)?|am|fürs|ins|zum|im|am|zur) \d+\.[\u00A0\s]+</beforebreak>
|
4887
4905
|
<afterbreak>[A-ZÄÖÜ].*</afterbreak>
|
4888
4906
|
</rule>
|
4889
4907
|
<rule break="no">
|
4890
|
-
<beforebreak>Ust.</beforebreak
|
4908
|
+
<beforebreak>Ust.</beforebreak>
|
4891
4909
|
<afterbreak>Id</afterbreak>
|
4892
4910
|
</rule>
|
4893
4911
|
<rule break="no">
|
4894
|
-
<beforebreak>Prof.</beforebreak
|
4912
|
+
<beforebreak>Prof.</beforebreak>
|
4895
4913
|
<afterbreak>Dr</afterbreak>
|
4896
4914
|
</rule>
|
4897
4915
|
<rule break="no">
|
4898
|
-
<beforebreak>Dr.</beforebreak
|
4916
|
+
<beforebreak>Dr.</beforebreak>
|
4899
4917
|
<afterbreak>iur|med|oec|phil|rer|theol</afterbreak>
|
4900
4918
|
</rule>
|
4901
4919
|
<rule break="no">
|
4902
4920
|
<beforebreak>(?i)FRITZ!</beforebreak>
|
4903
4921
|
<afterbreak>(?i)Box</afterbreak>
|
4904
4922
|
</rule>
|
4905
|
-
<rule break="no"
|
4923
|
+
<rule break="no">
|
4906
4924
|
<beforebreak>ID.</beforebreak>
|
4907
4925
|
<afterbreak>3|4|Buzz|Crozz</afterbreak>
|
4908
4926
|
</rule>
|
@@ -5012,11 +5030,11 @@
|
|
5012
5030
|
</rule>
|
5013
5031
|
<!-- German abbreviations -->
|
5014
5032
|
<rule break="no">
|
5015
|
-
<beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|versch|[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Aa]bzü?gl|[Ll]tda|[Ee]inschl|[Vv]mtl|Ev|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]{1,2}</beforebreak>
|
5033
|
+
<beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|[Pp]arl|Bsp|versch|[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Aa]bzü?gl|[Ll]tda|[Ee]inschl|[Vv]mtl|Ev|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|[Öö]ffentl|prof|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]{1,2}</beforebreak>
|
5016
5034
|
<afterbreak></afterbreak>
|
5017
5035
|
</rule>
|
5018
5036
|
<rule break="no">
|
5019
|
-
<beforebreak>\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|
|
5037
|
+
<beforebreak>\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|[Ee]xkl|Expl|Exz)\.[\u00A0\s]{1,2}</beforebreak>
|
5020
5038
|
<afterbreak></afterbreak>
|
5021
5039
|
</rule>
|
5022
5040
|
<rule break="no">
|
@@ -5028,11 +5046,11 @@
|
|
5028
5046
|
<afterbreak>\p{Ll}</afterbreak>
|
5029
5047
|
</rule>
|
5030
5048
|
<rule break="no">
|
5031
|
-
<beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|
|
5049
|
+
<beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|[Ff]rz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]{1,2}</beforebreak>
|
5032
5050
|
<afterbreak></afterbreak>
|
5033
5051
|
</rule>
|
5034
5052
|
<rule break="no">
|
5035
|
-
<beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|
|
5053
|
+
<beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|[Ii]nkl|[Ii]ncl|[Ee]hem|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]{1,2}</beforebreak>
|
5036
5054
|
<afterbreak></afterbreak>
|
5037
5055
|
</rule>
|
5038
5056
|
<rule break="no">
|
@@ -5048,7 +5066,11 @@
|
|
5048
5066
|
<afterbreak>\p{Ll}</afterbreak>
|
5049
5067
|
</rule>
|
5050
5068
|
<rule break="no">
|
5051
|
-
<beforebreak>\
|
5069
|
+
<beforebreak>\d+\.\d+\.[\u00A0\s]</beforebreak>
|
5070
|
+
<afterbreak>[\-–][\u00A0\s]\d+</afterbreak>
|
5071
|
+
</rule>
|
5072
|
+
<rule break="no">
|
5073
|
+
<beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|[Aa][bn]schl|sw|kl|[Gg]r|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|voraussichtl|[Rr]echts?staatl|[Ss]taatl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]{1,2}</beforebreak>
|
5052
5074
|
<afterbreak></afterbreak>
|
5053
5075
|
</rule>
|
5054
5076
|
<!-- Break rules -->
|
@@ -5159,27 +5181,27 @@
|
|
5159
5181
|
<beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sep|Sept|Oct|Okt|Nov|Dec|PhD|al|cf|Inc|Ms|Gen|Sen|Prof|Corp|Co)\.\s</beforebreak>
|
5160
5182
|
<afterbreak></afterbreak>
|
5161
5183
|
</rule>
|
5162
|
-
<rule break="no"
|
5184
|
+
<rule break="no">
|
5163
5185
|
<beforebreak>\bP[Hh]\.\s?</beforebreak>
|
5164
5186
|
<afterbreak>D\.?</afterbreak>
|
5165
5187
|
</rule>
|
5166
|
-
<rule break="no"
|
5188
|
+
<rule break="no">
|
5167
5189
|
<beforebreak>\b[BM]\.\s?</beforebreak>
|
5168
5190
|
<afterbreak>Eng\.?</afterbreak>
|
5169
5191
|
</rule>
|
5170
|
-
<rule break="no"
|
5192
|
+
<rule break="no">
|
5171
5193
|
<beforebreak>\bLL\.\s?</beforebreak>
|
5172
5194
|
<afterbreak>[BM]\.?</afterbreak>
|
5173
5195
|
</rule>
|
5174
|
-
<rule break="no"
|
5196
|
+
<rule break="no">
|
5175
5197
|
<beforebreak>\b[BM]\.\s?</beforebreak>
|
5176
5198
|
<afterbreak>Sc\.?</afterbreak>
|
5177
5199
|
</rule>
|
5178
|
-
<rule break="no"
|
5200
|
+
<rule break="no">
|
5179
5201
|
<beforebreak>\b[BM]\.\s?</beforebreak>
|
5180
5202
|
<afterbreak>Comp?\.?</afterbreak>
|
5181
5203
|
</rule>
|
5182
|
-
<rule break="no"
|
5204
|
+
<rule break="no">
|
5183
5205
|
<beforebreak>\b[BM]\.\s?</beforebreak>
|
5184
5206
|
<afterbreak>Arch\.?</afterbreak>
|
5185
5207
|
</rule>
|
@@ -5309,16 +5331,15 @@
|
|
5309
5331
|
<beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
|
5310
5332
|
<afterbreak></afterbreak>
|
5311
5333
|
</rule>
|
5312
|
-
<rule break="no"
|
5334
|
+
<rule break="no">
|
5313
5335
|
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
|
5314
5336
|
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
|
5315
5337
|
</rule>
|
5316
|
-
<rule break="no"
|
5338
|
+
<rule break="no">
|
5317
5339
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
5318
5340
|
<afterbreak>[A-Za-z0-9\-]+\.(fr|com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
|
5319
5341
|
</rule>
|
5320
5342
|
<rule break="no">
|
5321
|
-
<!-- gaffa.org -->
|
5322
5343
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
5323
5344
|
<afterbreak>[A-Za-z]{2,5}(\.|\b)</afterbreak>
|
5324
5345
|
</rule>
|
@@ -5363,15 +5384,15 @@
|
|
5363
5384
|
<beforebreak>\b\p{L}\.</beforebreak>
|
5364
5385
|
<afterbreak>\p{L}\.</afterbreak>
|
5365
5386
|
</rule>
|
5366
|
-
<rule break="no"
|
5387
|
+
<rule break="no">
|
5367
5388
|
<beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
|
5368
5389
|
<afterbreak>[^\p{P}]</afterbreak>
|
5369
5390
|
</rule>
|
5370
|
-
<rule break="no"
|
5391
|
+
<rule break="no">
|
5371
5392
|
<beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
|
5372
5393
|
<afterbreak>[^\p{P}]</afterbreak>
|
5373
5394
|
</rule>
|
5374
|
-
<rule break="no"
|
5395
|
+
<rule break="no">
|
5375
5396
|
<beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
|
5376
5397
|
<afterbreak>\p{Lu}\p{L}+</afterbreak>
|
5377
5398
|
</rule>
|
@@ -5411,44 +5432,44 @@
|
|
5411
5432
|
<beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
|
5412
5433
|
<afterbreak></afterbreak>
|
5413
5434
|
</rule>
|
5414
|
-
<rule break="no"
|
5415
|
-
<beforebreak>i\.e\.[\s\u00A0]</beforebreak
|
5435
|
+
<rule break="no">
|
5436
|
+
<beforebreak>i\.e\.[\s\u00A0]</beforebreak>
|
5416
5437
|
<afterbreak></afterbreak>
|
5417
5438
|
</rule>
|
5418
|
-
<rule break="no"
|
5439
|
+
<rule break="no">
|
5419
5440
|
<beforebreak>[A-Z]\.[A-Z]\.</beforebreak>
|
5420
5441
|
<afterbreak>[A-Z]\b</afterbreak>
|
5421
5442
|
</rule>
|
5422
|
-
<rule break="no"
|
5443
|
+
<rule break="no">
|
5423
5444
|
<beforebreak>\bL\.</beforebreak>
|
5424
5445
|
<afterbreak>A\b</afterbreak>
|
5425
5446
|
</rule>
|
5426
|
-
<rule break="no"
|
5447
|
+
<rule break="no">
|
5427
5448
|
<beforebreak>\bU\.</beforebreak>
|
5428
5449
|
<afterbreak>[SK]\b</afterbreak>
|
5429
5450
|
</rule>
|
5430
|
-
<rule break="no"
|
5451
|
+
<rule break="no">
|
5431
5452
|
<beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
|
5432
5453
|
<afterbreak>\p{N}</afterbreak>
|
5433
5454
|
</rule>
|
5434
|
-
<rule break="no"
|
5455
|
+
<rule break="no">
|
5435
5456
|
<beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
|
5436
5457
|
<afterbreak>D\.?</afterbreak>
|
5437
5458
|
</rule>
|
5438
|
-
<rule break="no"
|
5459
|
+
<rule break="no">
|
5439
5460
|
<beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
|
5440
5461
|
<afterbreak></afterbreak>
|
5441
5462
|
</rule>
|
5442
|
-
<rule break="no"
|
5463
|
+
<rule break="no">
|
5443
5464
|
<beforebreak>\bvs\.[\s\u00A0]</beforebreak>
|
5444
5465
|
<afterbreak></afterbreak>
|
5445
5466
|
</rule>
|
5446
5467
|
<!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
|
5447
|
-
<rule break="no"
|
5468
|
+
<rule break="no">
|
5448
5469
|
<beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
|
5449
5470
|
<afterbreak>[^\p{Lu}]</afterbreak>
|
5450
5471
|
</rule>
|
5451
|
-
<rule break="no"
|
5472
|
+
<rule break="no">
|
5452
5473
|
<beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
|
5453
5474
|
<afterbreak></afterbreak>
|
5454
5475
|
</rule>
|
@@ -5456,39 +5477,39 @@
|
|
5456
5477
|
<beforebreak>(?i)FRITZ!</beforebreak>
|
5457
5478
|
<afterbreak>(?i)Box</afterbreak>
|
5458
5479
|
</rule>
|
5459
|
-
<rule break="no"
|
5480
|
+
<rule break="no">
|
5460
5481
|
<beforebreak>ID\.</beforebreak>
|
5461
5482
|
<afterbreak>3|4|Buzz|Crozz</afterbreak>
|
5462
5483
|
</rule>
|
5463
|
-
<rule break="no"
|
5484
|
+
<rule break="no">
|
5464
5485
|
<beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
|
5465
5486
|
<afterbreak></afterbreak>
|
5466
5487
|
</rule>
|
5467
|
-
<rule break="no"
|
5488
|
+
<rule break="no">
|
5468
5489
|
<beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
|
5469
5490
|
<afterbreak></afterbreak>
|
5470
5491
|
</rule>
|
5471
|
-
<rule break="no"
|
5492
|
+
<rule break="no">
|
5472
5493
|
<beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
|
5473
5494
|
<afterbreak></afterbreak>
|
5474
5495
|
</rule>
|
5475
|
-
<rule break="no"
|
5496
|
+
<rule break="no">
|
5476
5497
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
5477
5498
|
<afterbreak>Eng\.?</afterbreak>
|
5478
5499
|
</rule>
|
5479
|
-
<rule break="no"
|
5500
|
+
<rule break="no">
|
5480
5501
|
<beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
|
5481
5502
|
<afterbreak>[BM]\.?</afterbreak>
|
5482
5503
|
</rule>
|
5483
|
-
<rule break="no"
|
5504
|
+
<rule break="no">
|
5484
5505
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
5485
5506
|
<afterbreak>Sc\.?</afterbreak>
|
5486
5507
|
</rule>
|
5487
|
-
<rule break="no"
|
5508
|
+
<rule break="no">
|
5488
5509
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
5489
5510
|
<afterbreak>Comp?\.?</afterbreak>
|
5490
5511
|
</rule>
|
5491
|
-
<rule break="no"
|
5512
|
+
<rule break="no">
|
5492
5513
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
5493
5514
|
<afterbreak>Arch\.?</afterbreak>
|
5494
5515
|
</rule>
|
@@ -5534,7 +5555,6 @@
|
|
5534
5555
|
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
|
5535
5556
|
</rule>
|
5536
5557
|
</languagerule>
|
5537
|
-
|
5538
5558
|
<languagerule languagerulename="Ukrainian">
|
5539
5559
|
<!-- when sentence starts with ellipsis: ...Мазій і Юхим теж. -->
|
5540
5560
|
<rule break="no">
|
@@ -5547,8 +5567,8 @@
|
|
5547
5567
|
<afterbreak>\p{Lu}</afterbreak>
|
5548
5568
|
</rule>
|
5549
5569
|
<rule break="no">
|
5550
|
-
<beforebreak>[.!?…][\h]+</beforebreak>
|
5551
|
-
<afterbreak>[\h]*([«"„“(]|[
|
5570
|
+
<beforebreak>[.!?…][»“]?[\h]+</beforebreak>
|
5571
|
+
<afterbreak>[\h]*([«"„“(]|[‐-―-][\h])\p{Ll}</afterbreak>
|
5552
5572
|
</rule>
|
5553
5573
|
<rule break="yes">
|
5554
5574
|
<beforebreak>\v[\h]*</beforebreak>
|
@@ -5562,7 +5582,7 @@
|
|
5562
5582
|
<!-- various punctuation between lowercase letters -->
|
5563
5583
|
<rule break="no">
|
5564
5584
|
<beforebreak>\b\p{Ll}+[.!?][\h\v]*</beforebreak>
|
5565
|
-
<afterbreak>\h*(([\(«]|[\[
|
5585
|
+
<afterbreak>\h*(([\(«]|[\[‐-―-][\h\v]*)?\p{Ll})</afterbreak>
|
5566
5586
|
</rule>
|
5567
5587
|
<rule break="no">
|
5568
5588
|
<beforebreak>([\[\(]*[\]\)]*|\.\.\.|…)[\h\v]+</beforebreak>
|
@@ -5583,7 +5603,6 @@
|
|
5583
5603
|
<beforebreak>(^[\h\v]*|\([\h\v]*|[«„"]|(\b[А-ЯІЇЄҐACEIHOPX]\.-))[А-ЯІЇЄҐA-Z]\.[\h\v]*</beforebreak>
|
5584
5604
|
<afterbreak></afterbreak>
|
5585
5605
|
</rule>
|
5586
|
-
<!-- І. В. Коваль, Т. 2, C. 202 -->
|
5587
5606
|
<!-- Іван Ч. (1914 р. н.) -->
|
5588
5607
|
<rule break="no">
|
5589
5608
|
<beforebreak>[\h\v][А-ЯІЇЄҐ]\.[\h\v]*</beforebreak>
|
@@ -5606,7 +5625,7 @@
|
|
5606
5625
|
-->
|
5607
5626
|
<rule break="no">
|
5608
5627
|
<beforebreak>\b([0-9]{2}|[0-9]{4})[\h\v]+р\.[\h\v]+</beforebreak>
|
5609
|
-
<afterbreak>[\h\v]*[№0-9
|
5628
|
+
<afterbreak>[\h\v]*[№0-9‐-―-]</afterbreak>
|
5610
5629
|
</rule>
|
5611
5630
|
<!-- річка - р. Дніпро -->
|
5612
5631
|
<rule break="no">
|
@@ -5615,7 +5634,7 @@
|
|
5615
5634
|
</rule>
|
5616
5635
|
<!-- У травні 1949 р. Грушківський район -->
|
5617
5636
|
<rule break="no">
|
5618
|
-
<beforebreak>[А-ЯІЇЄҐ][а-яіїєґ'’-]*([\h]+[а-яіїєґ'’-]+)?[\h](\d{4}[
|
5637
|
+
<beforebreak>[А-ЯІЇЄҐ][а-яіїєґ'’-]*([\h]+[а-яіїєґ'’-]+)?[\h](\d{4}[‐-―-])*\d{4}[\h]*р\.[\h\v]*</beforebreak>
|
5619
5638
|
<afterbreak>[\v\h]*(?!(На|Але|Так?)[\h\v]+)[А-ЯІЇЄҐA-Z][^\h\v]</afterbreak>
|
5620
5639
|
</rule>
|
5621
5640
|
<!-- 15 вересня 1995 р. Україною було підписно -->
|
@@ -5635,22 +5654,31 @@
|
|
5635
5654
|
</rule>
|
5636
5655
|
<!-- усталені скорочення, що не збігаються з нескороченими словами -->
|
5637
5656
|
<rule break="no">
|
5638
|
-
|
5639
|
-
<beforebreak>\b(укр|рос|англ|амер|італ|ісп|нім|фр(анц)?|лат|грец(ьк))\.[\h\v]*</beforebreak>
|
5657
|
+
<beforebreak>\b(укр|рос|англ?|амер|італ|ісп|нім|фр(анц)?|лат|грец(ьк)?)\.[\h\v]*</beforebreak>
|
5640
5658
|
<afterbreak></afterbreak>
|
5641
5659
|
</rule>
|
5642
5660
|
<rule break="no">
|
5643
|
-
|
5644
|
-
<beforebreak>\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]]ел|ч|част)\.[\h\v]*</beforebreak>
|
5661
|
+
<beforebreak>\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нпр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]ел|ч|част)\.[\h\v]*</beforebreak>
|
5645
5662
|
<afterbreak></afterbreak>
|
5646
5663
|
</rule>
|
5647
5664
|
<rule break="no">
|
5648
|
-
|
5665
|
+
<beforebreak>\b(кін)\.[\h\v]*</beforebreak>
|
5666
|
+
<afterbreak>[а-яіїєґ0-9IXV]|[ІХ]+\b</afterbreak>
|
5667
|
+
</rule>
|
5668
|
+
<rule break="no">
|
5649
5669
|
<beforebreak>\b[сС]т\.[\h\v]</beforebreak>
|
5650
5670
|
<afterbreak>[\h]*(?!([АВУОІЄ]|На|Але|Так?)[\h\v])</afterbreak>
|
5651
5671
|
</rule>
|
5672
|
+
<!-- нар. 1945 р. | (1966 р. нар.) | 1975 — нар. Осипчук -->
|
5673
|
+
<rule break="no">
|
5674
|
+
<beforebreak>([0-9]|[-–—])[\h\v]+нар\.[\h\v]*</beforebreak>
|
5675
|
+
<afterbreak></afterbreak>
|
5676
|
+
</rule>
|
5677
|
+
<rule break="no">
|
5678
|
+
<beforebreak>\bнар\.[\h\v]*</beforebreak>
|
5679
|
+
<afterbreak>([0-9]|бл\.|арт\.)</afterbreak>
|
5680
|
+
</rule>
|
5652
5681
|
<rule break="no">
|
5653
|
-
<!-- no break only for дол. США -->
|
5654
5682
|
<beforebreak>\bдол\.[\h\v]*</beforebreak>
|
5655
5683
|
<afterbreak>США</afterbreak>
|
5656
5684
|
</rule>
|
@@ -5666,7 +5694,7 @@
|
|
5666
5694
|
</rule>
|
5667
5695
|
<!-- Верховний орган, див. Африканський національний конгрес -->
|
5668
5696
|
<rule break="no">
|
5669
|
-
<beforebreak>[
|
5697
|
+
<beforebreak>[,‐-―-][\h\v]*(див)\.[\h\v]*</beforebreak>
|
5670
5698
|
<afterbreak></afterbreak>
|
5671
5699
|
</rule>
|
5672
5700
|
<!-- скорочення в дужках:
|
@@ -5678,10 +5706,14 @@
|
|
5678
5706
|
</rule>
|
5679
5707
|
<!-- abbreviation with proper noun: проф. Грицько, о. Лісове -->
|
5680
5708
|
<rule break="no">
|
5681
|
-
<beforebreak>\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]
|
5709
|
+
<beforebreak>\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|дир|тов|упоряд|тт|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak>
|
5682
5710
|
<afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
|
5683
5711
|
</rule>
|
5684
5712
|
<rule break="no">
|
5713
|
+
<beforebreak>(?<![іи]\s+)\bдр\.[\h\v]*</beforebreak>
|
5714
|
+
<afterbreak>[\h\v]*[А-ЯІЇЄҐ]</afterbreak>
|
5715
|
+
</rule>
|
5716
|
+
<rule break="no">
|
5685
5717
|
<beforebreak>\bМан\.[\h\v]*</beforebreak>
|
5686
5718
|
<afterbreak>[\h\v]*([Сс]іті|[Юю]н)</afterbreak>
|
5687
5719
|
</rule>
|
@@ -5690,27 +5722,25 @@
|
|
5690
5722
|
<beforebreak>[^0-9][\h\v]+[Гг]р\.[\h\v]*</beforebreak>
|
5691
5723
|
<afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
|
5692
5724
|
</rule>
|
5693
|
-
<!-- арт. - артикул -->
|
5694
5725
|
<!-- TODO: арт. - артист -->
|
5695
5726
|
<rule break="no">
|
5696
5727
|
<beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис)\.[\h\v]*</beforebreak>
|
5697
5728
|
<afterbreak>[\h\v]*[0-9]</afterbreak>
|
5698
5729
|
</rule>
|
5699
|
-
<!-- ХІІ р., 3-6
|
5730
|
+
<!-- ХІІ р., 3-6 арт., 2-3 тт. -->
|
5700
5731
|
<rule break="no">
|
5701
|
-
<beforebreak>[0-9][\h\v]
|
5732
|
+
<beforebreak>[0-9][\h\v]+(арт|тт)\.[\h\v]*</beforebreak>
|
5702
5733
|
<afterbreak></afterbreak>
|
5703
5734
|
</rule>
|
5704
|
-
<!-- місто, але принаймні з парою літер в назві бо є ще метри (м) -->
|
5705
5735
|
<!-- але розбиваємо «всього 20 м. Почалося» -->
|
5706
5736
|
<rule break="no">
|
5707
5737
|
<beforebreak>(?<!\d[\h\v]*)\bм\.[\h\v]*</beforebreak>
|
5708
|
-
<afterbreak>[А-ЯІЇЄҐ][а-яіїєґ]</afterbreak>
|
5738
|
+
<afterbreak>[А-ЯІЇЄҐ][а-яіїєґ']</afterbreak>
|
5709
5739
|
</rule>
|
5710
5740
|
<!-- село/сторінка/місто, але щоб не збігалося з секундами/метрами -->
|
5711
5741
|
<rule break="no">
|
5712
5742
|
<beforebreak>([\h\v][«(][см]|[^0-9/. ][\h\v]+[см])\.[\h\v]+</beforebreak>
|
5713
|
-
<afterbreak>[А-ЯІЇЄҐ]</afterbreak>
|
5743
|
+
<afterbreak>[А-ЯІЇЄҐ][а-яіїєґ']</afterbreak>
|
5714
5744
|
</rule>
|
5715
5745
|
<!-- (реж. Емманюель -->
|
5716
5746
|
<rule break="no">
|
@@ -5725,10 +5755,13 @@
|
|
5725
5755
|
<!-- статус правових держав. — Авт.). -->
|
5726
5756
|
<rule break="no">
|
5727
5757
|
<beforebreak></beforebreak>
|
5728
|
-
<afterbreak>[\h\v]*[
|
5758
|
+
<afterbreak>[\h\v]*[‐-―-][\h\v]*([Рр]ед|[Аа]вт)[\h\v]*\.[\)\]]</afterbreak>
|
5759
|
+
</rule>
|
5760
|
+
<!-- Цензор.НЕТ -->
|
5761
|
+
<rule break="no">
|
5762
|
+
<beforebreak>[а-яіїєґ]\.</beforebreak>
|
5763
|
+
<afterbreak>НЕТ|Інфо|Info|City|Life|UA|Ру</afterbreak>
|
5729
5764
|
</rule>
|
5730
|
-
<!-- force the break -->
|
5731
|
-
<!-- часто зустрічається крапка+U+202F+пробіл, який srx чомусь не розбиває на речення -->
|
5732
5765
|
<!-- але лишаємо ініціали: С.\u202F Шелухин -->
|
5733
5766
|
<rule break="yes">
|
5734
5767
|
<beforebreak>(?<!\h[А-ЯІЇЄҐ])[.!?…]{1,3}\u202F[\h\v]+</beforebreak>
|
@@ -5746,10 +5779,9 @@
|
|
5746
5779
|
<!-- “Слон” (2008 р.) У минулому харків’янка -->
|
5747
5780
|
<rule break="yes">
|
5748
5781
|
<beforebreak>[.!?…]['»"„“”)\]›]?[\h\v]+</beforebreak>
|
5749
|
-
<afterbreak>([
|
5782
|
+
<afterbreak>([‐-―-][\h\v]*)?\p{Lu}[^\p{Lu}]</afterbreak>
|
5750
5783
|
</rule>
|
5751
5784
|
</languagerule>
|
5752
|
-
|
5753
5785
|
<languagerule languagerulename="Belarusian">
|
5754
5786
|
<rule break="no">
|
5755
5787
|
<beforebreak>\b\d+\.\s</beforebreak>
|
@@ -6016,11 +6048,11 @@
|
|
6016
6048
|
</rule>
|
6017
6049
|
</languagerule>
|
6018
6050
|
<languagerule languagerulename="Portuguese">
|
6019
|
-
<rule break="no"
|
6051
|
+
<rule break="no">
|
6020
6052
|
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
|
6021
6053
|
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
|
6022
6054
|
</rule>
|
6023
|
-
<rule break="no"
|
6055
|
+
<rule break="no">
|
6024
6056
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
6025
6057
|
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
|
6026
6058
|
</rule>
|
@@ -6654,83 +6686,83 @@
|
|
6654
6686
|
</rule>
|
6655
6687
|
</languagerule>
|
6656
6688
|
<languagerule languagerulename="Arabic">
|
6657
|
-
|
6658
|
-
|
6659
|
-
|
6660
|
-
|
6661
|
-
|
6662
|
-
|
6663
|
-
|
6664
|
-
|
6665
|
-
|
6666
|
-
|
6667
|
-
|
6668
|
-
|
6669
|
-
|
6670
|
-
|
6671
|
-
|
6672
|
-
|
6673
|
-
|
6674
|
-
|
6675
|
-
|
6676
|
-
|
6677
|
-
|
6678
|
-
|
6679
|
-
|
6680
|
-
|
6681
|
-
|
6682
|
-
|
6683
|
-
|
6684
|
-
|
6685
|
-
|
6686
|
-
|
6687
|
-
|
6688
|
-
|
6689
|
-
|
6690
|
-
|
6691
|
-
|
6692
|
-
|
6693
|
-
|
6694
|
-
|
6695
|
-
|
6696
|
-
|
6697
|
-
|
6698
|
-
|
6699
|
-
|
6700
|
-
|
6701
|
-
|
6702
|
-
|
6703
|
-
|
6704
|
-
|
6705
|
-
|
6706
|
-
|
6707
|
-
|
6708
|
-
|
6709
|
-
|
6710
|
-
|
6711
|
-
|
6712
|
-
|
6713
|
-
|
6714
|
-
|
6715
|
-
|
6716
|
-
|
6717
|
-
|
6718
|
-
|
6719
|
-
|
6720
|
-
|
6721
|
-
|
6722
|
-
|
6723
|
-
|
6724
|
-
|
6725
|
-
|
6726
|
-
|
6727
|
-
|
6728
|
-
|
6729
|
-
|
6730
|
-
|
6731
|
-
|
6732
|
-
|
6733
|
-
|
6689
|
+
<rule break="no">
|
6690
|
+
<beforebreak>\bwww\.</beforebreak>
|
6691
|
+
<afterbreak>\w</afterbreak>
|
6692
|
+
</rule>
|
6693
|
+
<rule break="no">
|
6694
|
+
<beforebreak>[\[\(]*…[\]\)]* </beforebreak>
|
6695
|
+
<afterbreak>\p{Ll}</afterbreak>
|
6696
|
+
</rule>
|
6697
|
+
<rule break="no">
|
6698
|
+
<beforebreak>\p{Ps}[!?؟]+\p{Pe} </beforebreak>
|
6699
|
+
<afterbreak></afterbreak>
|
6700
|
+
</rule>
|
6701
|
+
<rule break="no">
|
6702
|
+
<beforebreak>[\.!?؟…]+\p{Pe} </beforebreak>
|
6703
|
+
<afterbreak>\p{Ll}</afterbreak>
|
6704
|
+
</rule>
|
6705
|
+
<rule break="no">
|
6706
|
+
<beforebreak>[«»"”']\s*</beforebreak>
|
6707
|
+
<afterbreak>\s*\p{Ll}</afterbreak>
|
6708
|
+
</rule>
|
6709
|
+
<rule break="no">
|
6710
|
+
<beforebreak>[«'"„][\.!?؟…]['"”»]\s</beforebreak>
|
6711
|
+
<afterbreak></afterbreak>
|
6712
|
+
</rule>
|
6713
|
+
<rule break="no">
|
6714
|
+
<beforebreak>\b\p{L}\.\s</beforebreak>
|
6715
|
+
<afterbreak>\p{L}\.\s</afterbreak>
|
6716
|
+
</rule>
|
6717
|
+
<rule break="no">
|
6718
|
+
<beforebreak>\b\p{L}\.</beforebreak>
|
6719
|
+
<afterbreak>\p{L}\.</afterbreak>
|
6720
|
+
</rule>
|
6721
|
+
<rule break="yes">
|
6722
|
+
<beforebreak>[^,،][\s]\p{L}{2}\.\s</beforebreak>
|
6723
|
+
<afterbreak>\p{N}+\)\s</afterbreak>
|
6724
|
+
</rule>
|
6725
|
+
<rule break="no">
|
6726
|
+
<beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak>
|
6727
|
+
<afterbreak>[\p{N}\p{Ll}]</afterbreak>
|
6728
|
+
</rule>
|
6729
|
+
<rule break="no">
|
6730
|
+
<beforebreak>[\[\(]*\.\.\.[\]\)]* </beforebreak>
|
6731
|
+
<afterbreak>[^\p{Lu}]</afterbreak>
|
6732
|
+
</rule>
|
6733
|
+
<rule break="no">
|
6734
|
+
<beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
|
6735
|
+
<afterbreak></afterbreak>
|
6736
|
+
</rule>
|
6737
|
+
<rule break="no">
|
6738
|
+
<beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
|
6739
|
+
<afterbreak></afterbreak>
|
6740
|
+
</rule>
|
6741
|
+
<rule break="no">
|
6742
|
+
<beforebreak>[^\.]\s[ابتقجحخدذصضعغفقكلمنهوىيءةأ١٢٣٤٥٦٧٨٩٠A-Z]\.\s</beforebreak>
|
6743
|
+
<afterbreak></afterbreak>
|
6744
|
+
</rule>
|
6745
|
+
<rule break="no">
|
6746
|
+
<beforebreak>[^\.]\s[\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u0653\u0654\u0655\u0656\u0640]\.\s</beforebreak>
|
6747
|
+
<afterbreak></afterbreak>
|
6748
|
+
</rule>
|
6749
|
+
<rule break="no">
|
6750
|
+
<beforebreak>\(\p{Ll}+\.\s</beforebreak>
|
6751
|
+
<afterbreak></afterbreak>
|
6752
|
+
</rule>
|
6753
|
+
<rule break="yes">
|
6754
|
+
<beforebreak>[\.!?؟…][«»\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*\s</beforebreak>
|
6755
|
+
<afterbreak></afterbreak>
|
6756
|
+
</rule>
|
6757
|
+
<rule break="yes">
|
6758
|
+
<beforebreak>[\.!?؟…][«»'"\u00BB\u2019\u201D\u203A\p{Pe}\u0002]*</beforebreak>
|
6759
|
+
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
6760
|
+
</rule>
|
6761
|
+
<rule break="yes">
|
6762
|
+
<beforebreak>\s\p{L}[\.!?؟…]\s</beforebreak>
|
6763
|
+
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
|
6764
|
+
</rule>
|
6765
|
+
</languagerule>
|
6734
6766
|
</languagerules>
|
6735
6767
|
<maprules>
|
6736
6768
|
<languagemap languagepattern=".*" languagerulename="GeneralImportant"></languagemap>
|