srx-languagetool 0.7.0 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/main.yml +2 -2
- data/.ruby-version +1 -1
- data/CHANGELOG.md +8 -0
- data/Gemfile.lock +20 -19
- data/lib/srx/languagetool/version.rb +1 -1
- data/lib/srx/segment.srx +312 -277
- metadata +2 -2
data/lib/srx/segment.srx
CHANGED
@@ -4,8 +4,10 @@
|
|
4
4
|
<formathandle type="start" include="no"></formathandle>
|
5
5
|
<formathandle type="end" include="yes"></formathandle>
|
6
6
|
<formathandle type="isolated" include="no"></formathandle>
|
7
|
-
<okpsrx:options oneSegmentIncludesAll="no" trimLeadingWhitespaces="no" trimTrailingWhitespaces="no" useJavaRegex="yes"
|
8
|
-
<okpsrx:sample language="
|
7
|
+
<okpsrx:options oneSegmentIncludesAll="no" trimLeadingWhitespaces="no" trimTrailingWhitespaces="no" useJavaRegex="yes"></okpsrx:options>
|
8
|
+
<okpsrx:sample language="pl" useMappedRules="yes">Als een hoogleraar met emeritaat ('pensioen') is, mag hij de functieaanduiding prof. blijven gebruiken, maar hij heeft tevens het recht gekregen om het bijvoeglijk naamwoord emeritus (Latijn voor 'uitgediend') aan zijn functietitel toe te voegen: em. prof. dr.
|
9
|
+
Tussen de twee wereldoorlogen vestigde prof. ir. Messerschmitt zich in Augsburg waar hij met behulp van een oudere, rijke vriendin (met wie hij later trouwde) zijn eerste vliegtuigen bouwde, het waren passagierstoestellen.
|
10
|
+
250 p. n.e.</okpsrx:sample>
|
9
11
|
<okpsrx:rangeRule></okpsrx:rangeRule>
|
10
12
|
</header>
|
11
13
|
<body>
|
@@ -1084,6 +1086,11 @@
|
|
1084
1086
|
<beforebreak>[\.!?…]['"\u00BB\u2019\u201D\u203A\u0002]*\p{Pe}\s</beforebreak>
|
1085
1087
|
<afterbreak>\p{Ll}</afterbreak>
|
1086
1088
|
</rule>
|
1089
|
+
<!--p. n.e. (błędny podział wiersza)-->
|
1090
|
+
<rule break="no">
|
1091
|
+
<beforebreak>p\.\s</beforebreak>
|
1092
|
+
<afterbreak>n\.\s?e\.</afterbreak>
|
1093
|
+
</rule>
|
1087
1094
|
<rule break="yes">
|
1088
1095
|
<beforebreak>[\.!?…]['"\p{Pe}\u00BB\u2019\u201D\u203A\u0002¹²³]*\s</beforebreak>
|
1089
1096
|
<afterbreak></afterbreak>
|
@@ -1106,7 +1113,7 @@
|
|
1106
1113
|
<beforebreak>[\u00A0\s]</beforebreak>
|
1107
1114
|
<afterbreak>\n</afterbreak>
|
1108
1115
|
</rule>
|
1109
|
-
<rule break="no"
|
1116
|
+
<rule break="no">
|
1110
1117
|
<beforebreak>[a-zA-Z][!\?][\s\u00A0]</beforebreak>
|
1111
1118
|
<afterbreak>\)[\s\u00A0][a-zA-Z]</afterbreak>
|
1112
1119
|
</rule>
|
@@ -1114,96 +1121,96 @@
|
|
1114
1121
|
<beforebreak>Yahoo![\s\u00A0]</beforebreak>
|
1115
1122
|
<afterbreak>\p{Ll}</afterbreak>
|
1116
1123
|
</rule>
|
1117
|
-
<rule break="no"
|
1124
|
+
<rule break="no">
|
1118
1125
|
<beforebreak>[A-Z]\.[A-Z]\.</beforebreak>
|
1119
1126
|
<afterbreak>[A-Z]\b</afterbreak>
|
1120
1127
|
</rule>
|
1121
|
-
<rule break="no"
|
1128
|
+
<rule break="no">
|
1122
1129
|
<beforebreak>\bA\.</beforebreak>
|
1123
1130
|
<afterbreak>I\b</afterbreak>
|
1124
1131
|
</rule>
|
1125
|
-
<rule break="no"
|
1132
|
+
<rule break="no">
|
1126
1133
|
<beforebreak>\bS\.</beforebreak>
|
1127
1134
|
<afterbreak>I\b</afterbreak>
|
1128
1135
|
</rule>
|
1129
|
-
<rule break="no"
|
1136
|
+
<rule break="no">
|
1130
1137
|
<beforebreak>\bL\.</beforebreak>
|
1131
1138
|
<afterbreak>A\b</afterbreak>
|
1132
1139
|
</rule>
|
1133
|
-
<rule break="no"
|
1140
|
+
<rule break="no">
|
1134
1141
|
<beforebreak>\bU\.</beforebreak>
|
1135
1142
|
<afterbreak>[SK]\b</afterbreak>
|
1136
1143
|
</rule>
|
1137
|
-
<rule break="no"
|
1144
|
+
<rule break="no">
|
1138
1145
|
<beforebreak>\bI\.</beforebreak>
|
1139
1146
|
<afterbreak>S\b</afterbreak>
|
1140
1147
|
</rule>
|
1141
|
-
<rule break="no"
|
1148
|
+
<rule break="no">
|
1142
1149
|
<beforebreak>\bM\.</beforebreak>
|
1143
1150
|
<afterbreak>Z\b</afterbreak>
|
1144
1151
|
</rule>
|
1145
|
-
<rule break="no"
|
1152
|
+
<rule break="no">
|
1146
1153
|
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
|
1147
1154
|
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
|
1148
1155
|
</rule>
|
1149
|
-
<rule break="no"
|
1156
|
+
<rule break="no">
|
1150
1157
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
1151
1158
|
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl|be|dev|co|fr|dk|se)(\.|\b)</afterbreak>
|
1152
1159
|
</rule>
|
1153
|
-
<rule break="no"
|
1160
|
+
<rule break="no">
|
1154
1161
|
<beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
|
1155
1162
|
<afterbreak>\p{N}</afterbreak>
|
1156
1163
|
</rule>
|
1157
|
-
<rule break="no"
|
1164
|
+
<rule break="no">
|
1158
1165
|
<beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
|
1159
1166
|
<afterbreak>D\.?</afterbreak>
|
1160
1167
|
</rule>
|
1161
|
-
<rule break="no"
|
1162
|
-
<beforebreak>\b([Aa]vg|[Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Ll]tda|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
|
1168
|
+
<rule break="no">
|
1169
|
+
<beforebreak>\b([Aa]vg|[Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Ll]tda|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|Ing|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
|
1163
1170
|
<afterbreak>[^\p{Lu}]|I</afterbreak>
|
1164
1171
|
</rule>
|
1165
|
-
<rule break="no"
|
1172
|
+
<rule break="no">
|
1166
1173
|
<beforebreak>\b(hr)\.[\s\u00A0]</beforebreak>
|
1167
1174
|
<afterbreak>[^\p{Lu}]|I</afterbreak>
|
1168
1175
|
</rule>
|
1169
|
-
<rule break="no"
|
1176
|
+
<rule break="no">
|
1170
1177
|
<beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
|
1171
1178
|
<afterbreak>\p{N}|[IXV]+</afterbreak>
|
1172
1179
|
</rule>
|
1173
|
-
<rule break="no"
|
1180
|
+
<rule break="no">
|
1174
1181
|
<beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
|
1175
1182
|
<afterbreak>\(\p{N}\)</afterbreak>
|
1176
1183
|
</rule>
|
1177
|
-
<rule break="no"
|
1184
|
+
<rule break="no">
|
1178
1185
|
<beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
|
1179
1186
|
<afterbreak>[^\p{P}]</afterbreak>
|
1180
1187
|
</rule>
|
1181
|
-
<rule break="no"
|
1188
|
+
<rule break="no">
|
1182
1189
|
<beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
|
1183
1190
|
<afterbreak>[^\p{P}]</afterbreak>
|
1184
1191
|
</rule>
|
1185
|
-
<rule break="no"
|
1192
|
+
<rule break="no">
|
1186
1193
|
<beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
|
1187
1194
|
<afterbreak></afterbreak>
|
1188
1195
|
</rule>
|
1189
|
-
<rule break="no"
|
1196
|
+
<rule break="no">
|
1190
1197
|
<beforebreak>\b[Vv]s\.[\s\u00A0]</beforebreak>
|
1191
1198
|
<afterbreak></afterbreak>
|
1192
1199
|
</rule>
|
1193
|
-
<rule break="no"
|
1200
|
+
<rule break="no">
|
1194
1201
|
<beforebreak>\b(pp|PP)\.[\s\u00A0]</beforebreak>
|
1195
1202
|
<afterbreak></afterbreak>
|
1196
1203
|
</rule>
|
1197
|
-
<rule break="no"
|
1204
|
+
<rule break="no">
|
1198
1205
|
<beforebreak>\be[sx]p\.[\s\u00A0]</beforebreak>
|
1199
1206
|
<afterbreak></afterbreak>
|
1200
1207
|
</rule>
|
1201
1208
|
<!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
|
1202
|
-
<rule break="no"
|
1209
|
+
<rule break="no">
|
1203
1210
|
<beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
|
1204
1211
|
<afterbreak>[^\p{Lu}]</afterbreak>
|
1205
1212
|
</rule>
|
1206
|
-
<rule break="no"
|
1213
|
+
<rule break="no">
|
1207
1214
|
<beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
|
1208
1215
|
<afterbreak></afterbreak>
|
1209
1216
|
</rule>
|
@@ -1251,39 +1258,39 @@
|
|
1251
1258
|
<beforebreak>(?i)FRITZ!</beforebreak>
|
1252
1259
|
<afterbreak>(?i)Box</afterbreak>
|
1253
1260
|
</rule>
|
1254
|
-
<rule break="no"
|
1261
|
+
<rule break="no">
|
1255
1262
|
<beforebreak>ID.</beforebreak>
|
1256
1263
|
<afterbreak>3|4|Buzz|Crozz</afterbreak>
|
1257
1264
|
</rule>
|
1258
|
-
<rule break="no"
|
1265
|
+
<rule break="no">
|
1259
1266
|
<beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
|
1260
1267
|
<afterbreak></afterbreak>
|
1261
1268
|
</rule>
|
1262
|
-
<rule break="no"
|
1269
|
+
<rule break="no">
|
1263
1270
|
<beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
|
1264
1271
|
<afterbreak></afterbreak>
|
1265
1272
|
</rule>
|
1266
|
-
<rule break="no"
|
1273
|
+
<rule break="no">
|
1267
1274
|
<beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
|
1268
1275
|
<afterbreak></afterbreak>
|
1269
1276
|
</rule>
|
1270
|
-
<rule break="no"
|
1277
|
+
<rule break="no">
|
1271
1278
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
1272
1279
|
<afterbreak>Eng\.?</afterbreak>
|
1273
1280
|
</rule>
|
1274
|
-
<rule break="no"
|
1281
|
+
<rule break="no">
|
1275
1282
|
<beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
|
1276
1283
|
<afterbreak>[BM]\.?</afterbreak>
|
1277
1284
|
</rule>
|
1278
|
-
<rule break="no"
|
1285
|
+
<rule break="no">
|
1279
1286
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
1280
1287
|
<afterbreak>Sc\.?</afterbreak>
|
1281
1288
|
</rule>
|
1282
|
-
<rule break="no"
|
1289
|
+
<rule break="no">
|
1283
1290
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
1284
1291
|
<afterbreak>Comp?\.?</afterbreak>
|
1285
1292
|
</rule>
|
1286
|
-
<rule break="no"
|
1293
|
+
<rule break="no">
|
1287
1294
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
1288
1295
|
<afterbreak>Arch\.?</afterbreak>
|
1289
1296
|
</rule>
|
@@ -1375,7 +1382,7 @@
|
|
1375
1382
|
<beforebreak>\b\p{L}\.</beforebreak>
|
1376
1383
|
<afterbreak>\p{L}\.</afterbreak>
|
1377
1384
|
</rule>
|
1378
|
-
<rule break="no"
|
1385
|
+
<rule break="no">
|
1379
1386
|
<beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
|
1380
1387
|
<afterbreak>\p{Lu}\p{L}+</afterbreak>
|
1381
1388
|
</rule>
|
@@ -1388,7 +1395,7 @@
|
|
1388
1395
|
<afterbreak>\p{Ll}+</afterbreak>
|
1389
1396
|
</rule>
|
1390
1397
|
<rule break="no">
|
1391
|
-
<beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak
|
1398
|
+
<beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak>
|
1392
1399
|
<afterbreak>[\p{N}\p{Ll}]</afterbreak>
|
1393
1400
|
</rule>
|
1394
1401
|
<rule break="no">
|
@@ -1419,8 +1426,8 @@
|
|
1419
1426
|
<beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
|
1420
1427
|
<afterbreak></afterbreak>
|
1421
1428
|
</rule>
|
1422
|
-
<rule break="no"
|
1423
|
-
<beforebreak>i\.e\.[\s\u00A0]</beforebreak
|
1429
|
+
<rule break="no">
|
1430
|
+
<beforebreak>i\.e\.[\s\u00A0]</beforebreak>
|
1424
1431
|
<afterbreak></afterbreak>
|
1425
1432
|
</rule>
|
1426
1433
|
<rule break="yes">
|
@@ -1532,33 +1539,44 @@
|
|
1532
1539
|
</languagerule>
|
1533
1540
|
<languagerule languagerulename="Dutch">
|
1534
1541
|
<rule break="no">
|
1535
|
-
<!-- sp.a -->
|
1536
1542
|
<beforebreak>\b(sp|SP)</beforebreak>
|
1537
1543
|
<afterbreak>\.[aA]\b</afterbreak>
|
1538
1544
|
</rule>
|
1539
1545
|
<rule break="no">
|
1540
|
-
<!-- .Net -->
|
1541
1546
|
<beforebreak>\s[.]</beforebreak>
|
1542
1547
|
<afterbreak>[Nn][Ee][Tt](\b|-)</afterbreak>
|
1543
1548
|
</rule>
|
1544
|
-
<rule break="no"
|
1549
|
+
<rule break="no">
|
1545
1550
|
<beforebreak>[.?!][’'"]</beforebreak>
|
1546
1551
|
<afterbreak> [a-z]</afterbreak>
|
1547
1552
|
</rule>
|
1548
|
-
<rule break="no"
|
1553
|
+
<rule break="no">
|
1549
1554
|
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
|
1550
1555
|
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
|
1551
1556
|
</rule>
|
1552
|
-
<rule break="no"
|
1557
|
+
<rule break="no">
|
1553
1558
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
1554
1559
|
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
|
1555
1560
|
</rule>
|
1556
1561
|
<rule break="no">
|
1557
|
-
<beforebreak>\b(
|
1562
|
+
<beforebreak>\b(blz|pag|fig)\.\s</beforebreak>
|
1563
|
+
<afterbreak>[0-9]</afterbreak>
|
1564
|
+
</rule>
|
1565
|
+
<!--Abbrevs that can happen in sentence and at end-->
|
1566
|
+
<rule break="no">
|
1567
|
+
<beforebreak>\b(enz|etc|zat|ambt|al|ver|art|wed|lab|bv|Bros)\.\s</beforebreak>
|
1568
|
+
<afterbreak>\p{Ll}</afterbreak>
|
1569
|
+
</rule>
|
1570
|
+
<rule break="no">
|
1571
|
+
<beforebreak>\b(Ge?n|Ex|Le?v|Nu?m|D(eu)?t|Jo?z|Ri|R[ei]cht|Sa?m|Ko?n|Kr[on]{0,2}|Neh?|Est?|Jb|Ps|Spr?|Pr[ed]{0,2}|H(oog)?l|Je?s|Je?r|Kl(aagl)?|Ez(ech)?|Da?n|Ho?s|Jl|Am|Ob|Mc|Mi[ch]{0,2}|Nah?|Hk|Hab|Zf|[SZ]ef|Ha?g|Zc|Zach|Ma?l|Ma?t|Mk|Mar|Lk|Jh|H(an)?d|Ro?m|Kor|Ga?l|Ef|Fp|Fil|Ko|[CK]ol|Th|Th?e[s]{1,2}|Tm|Ti?t|Fm|Fil(em)?|Hb|Hebr?|Jk|Ja[ck]|Pe?tr?|Joh|Jud|Op(enb)?|Wijsh|Tob|Sir|Bar|Makk)\.\s</beforebreak>
|
1558
1572
|
<afterbreak></afterbreak>
|
1559
1573
|
</rule>
|
1560
1574
|
<rule break="no">
|
1561
|
-
<beforebreak>\b(
|
1575
|
+
<beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|(Prof\.)?[Dd]r|Em|Fa|Kon|Stb)\.\s</beforebreak>
|
1576
|
+
<afterbreak>\p{Lu}</afterbreak>
|
1577
|
+
</rule>
|
1578
|
+
<rule break="no">
|
1579
|
+
<beforebreak>\b([Mm]ej|[Mm]evr|[Mm]rs|[Mm]s|[Mm]gr|[Mm]w|Ndl|Ned|Nl|No|Prof|[Ss]ecr|Chr|Jac|[Ww]ed)\.\s</beforebreak>
|
1562
1580
|
<afterbreak></afterbreak>
|
1563
1581
|
</rule>
|
1564
1582
|
<rule break="no">
|
@@ -1566,23 +1584,27 @@
|
|
1566
1584
|
<afterbreak></afterbreak>
|
1567
1585
|
</rule>
|
1568
1586
|
<rule break="no">
|
1569
|
-
<beforebreak>\b(abs|abstr|adj|adm|
|
1587
|
+
<beforebreak>\b(abs|abstr|adj|adm|[Aa]fb|[Aa]fd|afk|afl|milj|zgn|plv|bvb|afm|evt|exp|vs)\.\s</beforebreak>
|
1570
1588
|
<afterbreak></afterbreak>
|
1571
1589
|
</rule>
|
1572
1590
|
<rule break="no">
|
1573
|
-
<beforebreak>\b(
|
1591
|
+
<beforebreak>\b(ald|alg|amb|anat|antrop|apoth)\.\s</beforebreak>
|
1574
1592
|
<afterbreak></afterbreak>
|
1575
1593
|
</rule>
|
1594
|
+
<rule break="yes">
|
1595
|
+
<beforebreak>\seen\sprof\.\s</beforebreak>
|
1596
|
+
<afterbreak>\p{Lu}</afterbreak>
|
1597
|
+
</rule>
|
1576
1598
|
<rule break="no">
|
1577
1599
|
<beforebreak>\b(alc|bro|opm|acc)\.\s</beforebreak>
|
1578
1600
|
<afterbreak></afterbreak>
|
1579
1601
|
</rule>
|
1580
1602
|
<rule break="no">
|
1581
|
-
<beforebreak>\b(arch|
|
1603
|
+
<beforebreak>\b(arch|archeolbc|bep|betr|bez|bibl|bijl|[Bb]ijv)\.\s</beforebreak>
|
1582
1604
|
<afterbreak></afterbreak>
|
1583
1605
|
</rule>
|
1584
1606
|
<rule break="no">
|
1585
|
-
<beforebreak>\b(bijz|
|
1607
|
+
<beforebreak>\b(bijz|bw|ca|cat|centr|cf|cfr|cmpl)\.\s</beforebreak>
|
1586
1608
|
<afterbreak></afterbreak>
|
1587
1609
|
</rule>
|
1588
1610
|
<rule break="no">
|
@@ -1590,47 +1612,47 @@
|
|
1590
1612
|
<afterbreak></afterbreak>
|
1591
1613
|
</rule>
|
1592
1614
|
<rule break="no">
|
1593
|
-
<beforebreak>\b(
|
1615
|
+
<beforebreak>\b([Ee]d|em|ev|[Ee]xcl|[Ff]a|[Ff]am|[fF]ig|fin|fl|fr)\.\s</beforebreak>
|
1594
1616
|
<afterbreak></afterbreak>
|
1595
1617
|
</rule>
|
1596
1618
|
<rule break="no">
|
1597
|
-
<beforebreak>\b(geb|get|gld|id|
|
1619
|
+
<beforebreak>\b(geb|[Gg]em|get|gld|id|[Ii]ncl|ind|inf|ing|intern|inz|ir|jhr|jkvr)\.\s</beforebreak>
|
1598
1620
|
<afterbreak></afterbreak>
|
1599
1621
|
</rule>
|
1600
1622
|
<rule break="no">
|
1601
|
-
<beforebreak>\b(jl|jr|kr|kt|
|
1623
|
+
<beforebreak>\b(jl|jr|kr|kt|lic|ll|lt|lw|max|[Mm]evr|mi|[Mm]in|mld)\.\s</beforebreak>
|
1602
1624
|
<afterbreak></afterbreak>
|
1603
1625
|
</rule>
|
1604
1626
|
<rule break="no">
|
1605
|
-
<beforebreak>\b(mln|
|
1627
|
+
<beforebreak>\b(mln|[Mm]r|[Mm]w|nl|no|nr|nrs|ob|obl|ong|onov)\.\s</beforebreak>
|
1606
1628
|
<afterbreak></afterbreak>
|
1607
1629
|
</rule>
|
1608
1630
|
<rule break="no">
|
1609
|
-
<beforebreak>\b(opm|org|ov|
|
1631
|
+
<beforebreak>\b(opm|org|ov|[Pp]ag|par|penn|([1-3][\.e]?)[\s]?pers|plm|plv)\.\s</beforebreak>
|
1610
1632
|
<afterbreak></afterbreak>
|
1611
1633
|
</rule>
|
1612
1634
|
<rule break="no">
|
1613
|
-
<beforebreak>\b(prov|pseud|qty|red|ref|resp|soc|st|tab|tel|temp|tk)\.\s</beforebreak>
|
1635
|
+
<beforebreak>\b(prov|pseud|psych|qty|red|ref|resp|soc|st|tab|tel|temp|prof|tk)\.\s</beforebreak>
|
1614
1636
|
<afterbreak></afterbreak>
|
1615
1637
|
</rule>
|
1616
1638
|
<rule break="no">
|
1617
1639
|
<beforebreak>\b([A-Z]|Adr|Chr|Fr|Fred|IJ|Jac|Joh|Ph|St|Th|Tj|v|v\.(\s)?d)\.(\s)?</beforebreak>
|
1618
|
-
<afterbreak
|
1640
|
+
<afterbreak>\p{Lu}</afterbreak>
|
1619
1641
|
</rule>
|
1620
1642
|
<rule break="no">
|
1621
1643
|
<beforebreak>\b[vn]\.\s</beforebreak>
|
1622
1644
|
<afterbreak>Chr</afterbreak>
|
1623
1645
|
</rule>
|
1624
1646
|
<rule break="no">
|
1625
|
-
<beforebreak>\b(uitsl|
|
1626
|
-
<afterbreak
|
1647
|
+
<beforebreak>\b(uitsl|vgl|vnl|vnw|voorz|ww|zat|[Zz]elfst|zgn?)\.\s</beforebreak>
|
1648
|
+
<afterbreak>\p{Ll}</afterbreak>
|
1627
1649
|
</rule>
|
1628
1650
|
<rule break="no">
|
1629
|
-
<beforebreak>\b(mm|cm|km|mg|kg|h|kW|mW)\.\s</beforebreak>
|
1651
|
+
<beforebreak>\b(mm|cm|km|ml|mg|kg|h|kW|kg|mW)\.\s</beforebreak>
|
1630
1652
|
<afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
|
1631
1653
|
</rule>
|
1632
1654
|
<rule break="yes">
|
1633
|
-
<beforebreak>\b(mm|cm|km|ml|kg|kW|
|
1655
|
+
<beforebreak>\b(mm|cm|km|ml|mg|kg|h|kW|kg|mW)\.\s</beforebreak>
|
1634
1656
|
<afterbreak></afterbreak>
|
1635
1657
|
</rule>
|
1636
1658
|
<rule break="no">
|
@@ -1682,10 +1704,6 @@
|
|
1682
1704
|
<afterbreak></afterbreak>
|
1683
1705
|
</rule>
|
1684
1706
|
<rule break="no">
|
1685
|
-
<beforebreak>\b\p{Lu}\p{Ll}\.\s?</beforebreak>
|
1686
|
-
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
1687
|
-
</rule>
|
1688
|
-
<rule break="no">
|
1689
1707
|
<beforebreak>\.\p{Lu}\p{Ll}\.\s?</beforebreak>
|
1690
1708
|
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
1691
1709
|
</rule>
|
@@ -1694,14 +1712,6 @@
|
|
1694
1712
|
<beforebreak>\b\d+\.\s</beforebreak>
|
1695
1713
|
<afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
|
1696
1714
|
</rule>
|
1697
|
-
<rule break="yes">
|
1698
|
-
<beforebreak>\been\sprof\.\s</beforebreak>
|
1699
|
-
<afterbreak>[^\p{Ll}]</afterbreak>
|
1700
|
-
</rule>
|
1701
|
-
<rule break="no">
|
1702
|
-
<beforebreak>\bprof\.\s</beforebreak>
|
1703
|
-
<afterbreak></afterbreak>
|
1704
|
-
</rule>
|
1705
1715
|
<rule break="no">
|
1706
1716
|
<beforebreak>[.!?…][’'"]\s</beforebreak>
|
1707
1717
|
<afterbreak>[a-z]</afterbreak>
|
@@ -1719,11 +1729,11 @@
|
|
1719
1729
|
<afterbreak>[a-z]</afterbreak>
|
1720
1730
|
</rule>
|
1721
1731
|
<rule break="yes">
|
1722
|
-
<beforebreak>[
|
1732
|
+
<beforebreak>[.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002¹²³]*\s</beforebreak>
|
1723
1733
|
<afterbreak></afterbreak>
|
1724
1734
|
</rule>
|
1725
1735
|
<rule break="yes">
|
1726
|
-
<beforebreak>[
|
1736
|
+
<beforebreak>[.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]*</beforebreak>
|
1727
1737
|
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
1728
1738
|
</rule>
|
1729
1739
|
<rule break="yes">
|
@@ -1764,31 +1774,29 @@
|
|
1764
1774
|
<afterbreak>['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002][A-Z][a-z]</afterbreak>
|
1765
1775
|
</rule>
|
1766
1776
|
<rule break="no">
|
1767
|
-
<!-- "E. coli etc. -->
|
1768
1777
|
<beforebreak>"[A-Z][.]\s</beforebreak>
|
1769
1778
|
<afterbreak>[a-z]</afterbreak>
|
1770
1779
|
</rule>
|
1771
1780
|
<rule break="no">
|
1772
|
-
<!-- Cornelisz. -->
|
1773
1781
|
<beforebreak>[A-Z][a-z].*sz[.]\s</beforebreak>
|
1774
1782
|
<afterbreak>[a-z]</afterbreak>
|
1775
1783
|
</rule>
|
1776
1784
|
<rule break="no">
|
1777
|
-
<!-- De n. XIV/vagus (nervus) -->
|
1778
1785
|
<beforebreak>De n[.]\s</beforebreak>
|
1779
1786
|
<afterbreak>[a-z]|[XIV]</afterbreak>
|
1780
1787
|
</rule>
|
1781
1788
|
<rule break="no">
|
1782
|
-
<!-- MOL.E -->
|
1783
1789
|
<beforebreak>[A-Z]{2,5}[.]</beforebreak>
|
1784
1790
|
<afterbreak>[A-Z]</afterbreak>
|
1785
1791
|
</rule>
|
1786
1792
|
<rule break="no">
|
1787
|
-
<!-- ..." betekent -->
|
1788
1793
|
<beforebreak>\.\.</beforebreak>
|
1789
1794
|
<afterbreak>" [a-z]</afterbreak>
|
1790
1795
|
</rule>
|
1791
|
-
|
1796
|
+
<rule break="no">
|
1797
|
+
<beforebreak>\sBTW\.</beforebreak>
|
1798
|
+
<afterbreak>\p{Ll}</afterbreak>
|
1799
|
+
</rule>
|
1792
1800
|
</languagerule>
|
1793
1801
|
<languagerule languagerulename="Slovak">
|
1794
1802
|
<rule break="no">
|
@@ -4366,14 +4374,14 @@
|
|
4366
4374
|
</rule>
|
4367
4375
|
<rule break="no">
|
4368
4376
|
<beforebreak>\b(р|ред|Рис|рус|с|сб|св|См|см|сов|соч|соц|спец|ср|ст|стр|т|тел|Тел|тех|тов|тт|туп)\.\s</beforebreak>
|
4369
|
-
<afterbreak
|
4377
|
+
<afterbreak>\p{Ll}</afterbreak>
|
4370
4378
|
</rule>
|
4371
4379
|
<rule break="no">
|
4372
4380
|
<beforebreak>\b(руб|Руб|тыс|Тыс|трлн)\.\s</beforebreak>
|
4373
4381
|
<afterbreak>\p{Ll}</afterbreak>
|
4374
4382
|
</rule>
|
4375
4383
|
<rule break="no">
|
4376
|
-
<beforebreak>\b(
|
4384
|
+
<beforebreak>\b(уд|ул|уч|физ|х|хор|э|Эл|эл)\.\s</beforebreak>
|
4377
4385
|
<afterbreak></afterbreak>
|
4378
4386
|
</rule>
|
4379
4387
|
<rule break="no">
|
@@ -4650,7 +4658,7 @@
|
|
4650
4658
|
<afterbreak>[XIV\d]+\b</afterbreak>
|
4651
4659
|
</rule>
|
4652
4660
|
<rule break="no">
|
4653
|
-
<beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4661
|
+
<beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|seg|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4654
4662
|
<afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
|
4655
4663
|
</rule>
|
4656
4664
|
<!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
|
@@ -4718,6 +4726,10 @@
|
|
4718
4726
|
</languagerule>
|
4719
4727
|
<languagerule languagerulename="Spanish">
|
4720
4728
|
<rule break="no">
|
4729
|
+
<beforebreak>¿[^?]+:[\s\u00A0]</beforebreak>
|
4730
|
+
<afterbreak>.</afterbreak>
|
4731
|
+
</rule>
|
4732
|
+
<rule break="no">
|
4721
4733
|
<beforebreak>Yahoo![\s\u00A0]</beforebreak>
|
4722
4734
|
<afterbreak>\p{Ll}</afterbreak>
|
4723
4735
|
</rule>
|
@@ -4732,7 +4744,7 @@
|
|
4732
4744
|
<!-- initials: A. C. Jones. Problem: [...] de Alfons I. Él era [...] -->
|
4733
4745
|
<rule break="no">
|
4734
4746
|
<beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
|
4735
|
-
<afterbreak
|
4747
|
+
<afterbreak></afterbreak>
|
4736
4748
|
</rule>
|
4737
4749
|
<!-- Ellipsis: ... lowercase -->
|
4738
4750
|
<rule break="no">
|
@@ -4762,43 +4774,41 @@
|
|
4762
4774
|
<afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
|
4763
4775
|
</rule>
|
4764
4776
|
<rule break="no">
|
4765
|
-
<!-- URLs without "www."-->
|
4766
4777
|
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
|
4767
4778
|
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
|
4768
4779
|
</rule>
|
4769
4780
|
<rule break="no">
|
4770
|
-
<!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
|
4771
4781
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
4772
4782
|
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
|
4773
4783
|
</rule>
|
4774
4784
|
<!-- Abbreviations that cannot finish sentences-->
|
4775
4785
|
<rule break="no">
|
4776
4786
|
<beforebreak>\b((?iu)(en|febr|mzo|abr|my|jun|jul|ag|agt|set|sept|setbre|oct|nov|novbre|dic|dicbre))\.[\s\u00A0]</beforebreak>
|
4777
|
-
<afterbreak
|
4787
|
+
<afterbreak></afterbreak>
|
4778
4788
|
</rule>
|
4779
4789
|
<rule break="no">
|
4780
4790
|
<beforebreak>\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
|
4781
|
-
<afterbreak
|
4791
|
+
<afterbreak></afterbreak>
|
4782
4792
|
</rule>
|
4783
4793
|
<rule break="no">
|
4784
4794
|
<beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0]</beforebreak>
|
4785
|
-
<afterbreak
|
4795
|
+
<afterbreak></afterbreak>
|
4786
4796
|
</rule>
|
4787
4797
|
<rule break="no">
|
4788
4798
|
<beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0]</beforebreak>
|
4789
|
-
<afterbreak
|
4799
|
+
<afterbreak></afterbreak>
|
4790
4800
|
</rule>
|
4791
4801
|
<rule break="no">
|
4792
4802
|
<beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0]</beforebreak>
|
4793
|
-
<afterbreak
|
4803
|
+
<afterbreak></afterbreak>
|
4794
4804
|
</rule>
|
4795
4805
|
<rule break="no">
|
4796
4806
|
<beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0]</beforebreak>
|
4797
|
-
<afterbreak
|
4807
|
+
<afterbreak></afterbreak>
|
4798
4808
|
</rule>
|
4799
4809
|
<!-- Abbreviations that can finish sentences -->
|
4800
4810
|
<rule break="no">
|
4801
|
-
<beforebreak>\b([Ee]ds?|[Cc]oords?|grs?|Sr|Jr|Admón|Inc|Co|Hnos|Vda|[
|
4811
|
+
<beforebreak>\b([Ee]ds?|[Cc]oords?|grs?|Sr|Jr|Admón|Inc|Co|Hnos|Vda|[VUuv]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4802
4812
|
<afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
|
4803
4813
|
</rule>
|
4804
4814
|
<!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
|
@@ -4827,7 +4837,7 @@
|
|
4827
4837
|
<!-- Composed abbrev. -->
|
4828
4838
|
<rule break="no">
|
4829
4839
|
<beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4830
|
-
<afterbreak
|
4840
|
+
<afterbreak></afterbreak>
|
4831
4841
|
</rule>
|
4832
4842
|
<!-- Units -->
|
4833
4843
|
<rule break="no">
|
@@ -4849,11 +4859,11 @@
|
|
4849
4859
|
</rule>
|
4850
4860
|
</languagerule>
|
4851
4861
|
<languagerule languagerulename="German">
|
4852
|
-
<rule break="no"
|
4862
|
+
<rule break="no">
|
4853
4863
|
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
|
4854
4864
|
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
|
4855
4865
|
</rule>
|
4856
|
-
<rule break="no"
|
4866
|
+
<rule break="no">
|
4857
4867
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
4858
4868
|
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
|
4859
4869
|
</rule>
|
@@ -4864,40 +4874,44 @@
|
|
4864
4874
|
</rule>
|
4865
4875
|
<!-- Split at e.g. "1a. Und ..." -->
|
4866
4876
|
<rule break="yes">
|
4867
|
-
<beforebreak>\d+[a-z]\.[\u00A0\s]</beforebreak>
|
4877
|
+
<beforebreak>\d+[a-z]\.[\u00A0\s]{1,2}</beforebreak>
|
4868
4878
|
<afterbreak>\p{Lu}</afterbreak>
|
4869
4879
|
</rule>
|
4870
4880
|
<!-- Don't split at e.g. "d. h." -->
|
4871
4881
|
<rule break="no">
|
4872
|
-
<beforebreak>[^-\p{L}'
|
4882
|
+
<beforebreak>[^-\p{L}'’/°]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
|
4873
4883
|
<afterbreak></afterbreak>
|
4874
4884
|
</rule>
|
4875
4885
|
<rule break="no">
|
4876
|
-
<beforebreak>
|
4886
|
+
<beforebreak>([Dd](as|er|ie|iese[rsmn]?|en|em)|[kmsd]?ein(e[rsnm]?)?|am|fürs|ins|zum|im|am|zur) \d+\.[\u00A0\s]+</beforebreak>
|
4887
|
+
<afterbreak>[A-ZÄÖÜ].*</afterbreak>
|
4888
|
+
</rule>
|
4889
|
+
<rule break="no">
|
4890
|
+
<beforebreak>Ust.</beforebreak>
|
4877
4891
|
<afterbreak>Id</afterbreak>
|
4878
4892
|
</rule>
|
4879
4893
|
<rule break="no">
|
4880
|
-
<beforebreak>Prof.</beforebreak
|
4894
|
+
<beforebreak>Prof.</beforebreak>
|
4881
4895
|
<afterbreak>Dr</afterbreak>
|
4882
4896
|
</rule>
|
4883
4897
|
<rule break="no">
|
4884
|
-
<beforebreak>Dr.</beforebreak
|
4898
|
+
<beforebreak>Dr.</beforebreak>
|
4885
4899
|
<afterbreak>iur|med|oec|phil|rer|theol</afterbreak>
|
4886
4900
|
</rule>
|
4887
4901
|
<rule break="no">
|
4888
4902
|
<beforebreak>(?i)FRITZ!</beforebreak>
|
4889
4903
|
<afterbreak>(?i)Box</afterbreak>
|
4890
4904
|
</rule>
|
4891
|
-
<rule break="no"
|
4905
|
+
<rule break="no">
|
4892
4906
|
<beforebreak>ID.</beforebreak>
|
4893
4907
|
<afterbreak>3|4|Buzz|Crozz</afterbreak>
|
4894
4908
|
</rule>
|
4895
4909
|
<rule break="no">
|
4896
|
-
<beforebreak>[1-3]\.[\u00A0\s]</beforebreak>
|
4910
|
+
<beforebreak>[1-3]\.[\u00A0\s]{1,2}</beforebreak>
|
4897
4911
|
<afterbreak>Liga|Bundesliga|(Fußball|Handball|Basketball)(-B|b)undesliga</afterbreak>
|
4898
4912
|
</rule>
|
4899
4913
|
<rule break="no">
|
4900
|
-
<beforebreak>\d+\.[\u00A0\s]</beforebreak>
|
4914
|
+
<beforebreak>\d+\.[\u00A0\s]{1,2}</beforebreak>
|
4901
4915
|
<afterbreak>Klässler[sn]?</afterbreak>
|
4902
4916
|
</rule>
|
4903
4917
|
<rule break="no">
|
@@ -4912,43 +4926,43 @@
|
|
4912
4926
|
<!-- Don't split after a white-space followed by a single letter followed
|
4913
4927
|
by a dot followed by another whitespace. e.g. " p. " -->
|
4914
4928
|
<rule break="no">
|
4915
|
-
<beforebreak>[\u00A0\s]\p{L}\.[\u00A0\s]</beforebreak>
|
4929
|
+
<beforebreak>[\u00A0\s]\p{L}\.[\u00A0\s]{1,2}</beforebreak>
|
4916
4930
|
<afterbreak>\p{L}\.</afterbreak>
|
4917
4931
|
</rule>
|
4918
4932
|
<!-- Don't split at "bla bla... yada yada" -->
|
4919
4933
|
<rule break="no">
|
4920
|
-
<beforebreak>[\[\(]?\.\.\.[\]\)]?[\u00A0\s]</beforebreak>
|
4934
|
+
<beforebreak>[\[\(]?\.\.\.[\]\)]?[\u00A0\s]{1,2}</beforebreak>
|
4921
4935
|
<afterbreak>\p{Ll}</afterbreak>
|
4922
4936
|
</rule>
|
4923
4937
|
<!-- Don't split [.?!] when they're quoted -->
|
4924
4938
|
<rule break="no">
|
4925
|
-
<beforebreak>['"„][\.!?…]['"“«»][\u00A0\s]</beforebreak>
|
4939
|
+
<beforebreak>['"„][\.!?…]['"“«»][\u00A0\s]{1,2}</beforebreak>
|
4926
4940
|
<afterbreak></afterbreak>
|
4927
4941
|
</rule>
|
4928
4942
|
<!-- Don't break after quote unless there's a capital letter
|
4929
4943
|
e.g.: "That's right!" he said. -->
|
4930
4944
|
<rule break="no">
|
4931
|
-
<beforebreak>["'“«»][\u00A0\s]</beforebreak>
|
4945
|
+
<beforebreak>["'“«»][\u00A0\s]{1,2}</beforebreak>
|
4932
4946
|
<afterbreak>\p{Ll}</afterbreak>
|
4933
4947
|
</rule>
|
4934
4948
|
<!-- e.g. "Das ist . so." - assume one sentence. -->
|
4935
4949
|
<rule break="no">
|
4936
|
-
<beforebreak>[\u00A0\s]([\.!?]{1,3}|…)['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
|
4950
|
+
<beforebreak>[\u00A0\s]([\.!?]{1,3}|…)['|"|“|«|\)|\]|\}]?[\u00A0\s]{1,2}</beforebreak>
|
4937
4951
|
<afterbreak></afterbreak>
|
4938
4952
|
</rule>
|
4939
4953
|
<!-- Numbers, dates e.g. "3.10. datiert" -->
|
4940
4954
|
<rule break="no">
|
4941
|
-
<beforebreak>\b\d+\.[\u00A0\s]</beforebreak>
|
4955
|
+
<beforebreak>\b\d+\.[\u00A0\s]{1,2}</beforebreak>
|
4942
4956
|
<afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
|
4943
4957
|
</rule>
|
4944
4958
|
<!-- z.B. "Das hier ist ein(!) Satz." -->
|
4945
4959
|
<rule break="no">
|
4946
|
-
<beforebreak>[\(\[][!?]{1,3}[\]\)][\u00A0\s]</beforebreak>
|
4960
|
+
<beforebreak>[\(\[][!?]{1,3}[\]\)][\u00A0\s]{1,2}</beforebreak>
|
4947
4961
|
<afterbreak></afterbreak>
|
4948
4962
|
</rule>
|
4949
4963
|
<!-- z.B. "Das hier ist (genau!) ein Satz." -->
|
4950
4964
|
<rule break="no">
|
4951
|
-
<beforebreak>[!?]{1,3}[\)\]][\u00A0\s]</beforebreak>
|
4965
|
+
<beforebreak>[!?]{1,3}[\)\]][\u00A0\s]{1,2}</beforebreak>
|
4952
4966
|
<afterbreak></afterbreak>
|
4953
4967
|
</rule>
|
4954
4968
|
<!-- z.B. "bla (...) blubb" -> kein Satzende -->
|
@@ -4958,55 +4972,55 @@
|
|
4958
4972
|
</rule>
|
4959
4973
|
<!-- don't split at cases like "Friedrich II. wird auch..." -->
|
4960
4974
|
<rule break="no">
|
4961
|
-
<beforebreak>[\u00A0\s
|
4975
|
+
<beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]{1,2}</beforebreak>
|
4962
4976
|
<afterbreak>[^\p{Lu}]+</afterbreak>
|
4963
4977
|
</rule>
|
4964
4978
|
<!-- don't split at cases like "im 13. oder 14. Jahrhundert" -->
|
4965
4979
|
<rule break="no">
|
4966
|
-
<beforebreak>\d+\.[\u00A0\s]</beforebreak>
|
4980
|
+
<beforebreak>\d+\.[\u00A0\s]{1,2}</beforebreak>
|
4967
4981
|
<afterbreak>(und|oder|bis)[\u00A0\s]</afterbreak>
|
4968
4982
|
</rule>
|
4969
4983
|
<!-- einige deutsche Monate, vor denen eine Zahl erscheinen kann,
|
4970
4984
|
ohne dass eine Satzgrenze erkannt wird
|
4971
4985
|
(z.B. "am 13. Dezember" -> keine Satzgrenze) -->
|
4972
4986
|
<rule break="no">
|
4973
|
-
<beforebreak>\d+\.[\u00A0\s]</beforebreak>
|
4987
|
+
<beforebreak>\d+\.[\u00A0\s]{1,2}</beforebreak>
|
4974
4988
|
<afterbreak>Januar|Jänner|Februar|März|Merz|April|Mai|Ju[ln]i|August|September|Oktober|November|Dezember</afterbreak>
|
4975
4989
|
</rule>
|
4976
4990
|
<rule break="no">
|
4977
|
-
<beforebreak>\d+\.[\u00A0\s]</beforebreak>
|
4991
|
+
<beforebreak>\d+\.[\u00A0\s]{1,2}</beforebreak>
|
4978
4992
|
<afterbreak>J[aä]n|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez</afterbreak>
|
4979
4993
|
</rule>
|
4980
4994
|
<rule break="no">
|
4981
|
-
<beforebreak>(Jan|Jän|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez)\.[\u00A0\s]</beforebreak>
|
4995
|
+
<beforebreak>(Jan|Jän|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez)\.[\u00A0\s]{1,2}</beforebreak>
|
4982
4996
|
<afterbreak>\d\d(\d\d)?</afterbreak>
|
4983
4997
|
</rule>
|
4984
4998
|
<!-- ähnliche Fälle außerhalb der Monatsnamen -->
|
4985
4999
|
<rule break="no">
|
4986
|
-
<beforebreak>\d+\.[\u00A0\s]</beforebreak>
|
5000
|
+
<beforebreak>\d+\.[\u00A0\s]{1,2}</beforebreak>
|
4987
5001
|
<afterbreak>Amtsperiode|Breitengrads?|Breitengrades|Jubiläum|Jhd?|Jhdts?|Konferenz|(Jahres|Partei)(-K|k)onferenz|Längengrade?s?|Tags?|Tages|(Jahres|Spiel|Partei|Geburts)tag|(Jahres|Spiel|Partei|Geburts)tages|(Jahres|Spiel|Partei|Geburts)tags|Jahrhunderts?|Jahrtausend|Platz|Platzes|Lebensjahrs?|Lebensjahres|Lochs?|Loches|Grads|Grades|Obergeschoss|Stock(werk)?s?|Etage|Klasse|Runde|Bezirk|Etappe|Staffel|Sinfonie</afterbreak>
|
4988
5002
|
</rule>
|
4989
5003
|
<!-- English abbreviations - but these work globally for all languages -->
|
4990
5004
|
<rule break="no">
|
4991
|
-
<beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|[Ss]ek|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd|Buchst)\.[\u00A0\s]</beforebreak>
|
5005
|
+
<beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|[Ss]ek|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd|Buchst)\.[\u00A0\s]{1,2}</beforebreak>
|
4992
5006
|
<afterbreak></afterbreak>
|
4993
5007
|
</rule>
|
4994
5008
|
<!-- Latin abbreviations - but these work globally for all languages -->
|
4995
5009
|
<rule break="no">
|
4996
|
-
<beforebreak>\b(spp?)\.[\u00A0\s]</beforebreak>
|
5010
|
+
<beforebreak>\b(spp?)\.[\u00A0\s]{1,2}</beforebreak>
|
4997
5011
|
<afterbreak></afterbreak>
|
4998
5012
|
</rule>
|
4999
5013
|
<!-- German abbreviations -->
|
5000
5014
|
<rule break="no">
|
5001
|
-
<beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|versch|d|Übers|usw|
|
5015
|
+
<beforebreak>\b(betr|Geb|Stk|ggü|Mag|mtl|[Pp]arl|versch|[Ss]tellv|d|Übers|usw|[Bb]zw|Ab[hkst]|[Aa]bzü?gl|[Ll]tda|[Ee]inschl|[Vv]mtl|Ev|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|[Aa]utom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]{1,2}</beforebreak>
|
5002
5016
|
<afterbreak></afterbreak>
|
5003
5017
|
</rule>
|
5004
5018
|
<rule break="no">
|
5005
|
-
<beforebreak>\b(cts?|
|
5019
|
+
<beforebreak>\b(cts?|[Cc]a|chem|chin|Chr|cresc|[Dd]at|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|[Ee]igt?l|akt|[Ee]ngl|Erg|al|et[cw]|Etw|ev|[Ee]vtl?|[Ee]xkl|Expl|Exz)\.[\u00A0\s]{1,2}</beforebreak>
|
5006
5020
|
<afterbreak></afterbreak>
|
5007
5021
|
</rule>
|
5008
5022
|
<rule break="no">
|
5009
|
-
<beforebreak>\bDipl\.-[A-Z][a-z]{2,4}\.[\u00A0\s]</beforebreak>
|
5023
|
+
<beforebreak>\bDipl\.-[A-Z][a-z]{2,4}\.[\u00A0\s]{1,2}</beforebreak>
|
5010
5024
|
<afterbreak></afterbreak>
|
5011
5025
|
</rule>
|
5012
5026
|
<rule break="no">
|
@@ -5014,23 +5028,31 @@
|
|
5014
5028
|
<afterbreak>\p{Ll}</afterbreak>
|
5015
5029
|
</rule>
|
5016
5030
|
<rule break="no">
|
5017
|
-
<beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|
|
5031
|
+
<beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|[Ff]rz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]{1,2}</beforebreak>
|
5018
5032
|
<afterbreak></afterbreak>
|
5019
5033
|
</rule>
|
5020
5034
|
<rule break="no">
|
5021
|
-
<beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|
|
5035
|
+
<beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|[Ii]nkl|[Ii]ncl|[Ee]hem|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]{1,2}</beforebreak>
|
5022
5036
|
<afterbreak></afterbreak>
|
5023
5037
|
</rule>
|
5024
5038
|
<rule break="no">
|
5025
|
-
<beforebreak>\b([A-ZÖÄÜ][a-zöäüß]+nr|tel|
|
5039
|
+
<beforebreak>\b([A-ZÖÄÜ][a-zöäüß]+nr|tel|[Gg]em|Pat|prov|Betr|lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mio|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.[\u00A0\s]{1,2}</beforebreak>
|
5026
5040
|
<afterbreak></afterbreak>
|
5027
5041
|
</rule>
|
5028
5042
|
<rule break="no">
|
5029
|
-
<beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|Std?|stacc|Str|stud|Subst|sva|svw|sZ)\.[\u00A0\s]</beforebreak>
|
5043
|
+
<beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|Std?|stacc|Str|stud|Subst|sva|svw|sZ)\.[\u00A0\s]{1,2}</beforebreak>
|
5030
5044
|
<afterbreak></afterbreak>
|
5031
5045
|
</rule>
|
5032
5046
|
<rule break="no">
|
5033
|
-
<beforebreak
|
5047
|
+
<beforebreak>([A-ZÖÄÜ][a-zöäüß]+str)\.[\u00A0\s]{1,2}</beforebreak>
|
5048
|
+
<afterbreak>\p{Ll}</afterbreak>
|
5049
|
+
</rule>
|
5050
|
+
<rule break="no">
|
5051
|
+
<beforebreak>\d+\.\d+\.[\u00A0\s]</beforebreak>
|
5052
|
+
<afterbreak>[\-–][\u00A0\s]\d+</afterbreak>
|
5053
|
+
</rule>
|
5054
|
+
<rule break="no">
|
5055
|
+
<beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|[Aa][bn]schl|sw|kl|[Gg]r|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|[Rr]echts?staatl|[Ss]taatl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]{1,2}</beforebreak>
|
5034
5056
|
<afterbreak></afterbreak>
|
5035
5057
|
</rule>
|
5036
5058
|
<!-- Break rules -->
|
@@ -5043,7 +5065,7 @@
|
|
5043
5065
|
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
5044
5066
|
</rule>
|
5045
5067
|
<rule break="yes">
|
5046
|
-
<beforebreak>[\u00A0\s]\p{L}[\.!?…][\u00A0\s]</beforebreak>
|
5068
|
+
<beforebreak>[\u00A0\s]\p{L}[\.!?…][\u00A0\s]{1,2}</beforebreak>
|
5047
5069
|
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
|
5048
5070
|
</rule>
|
5049
5071
|
<!-- z.B. 2 sentences: “Liebst du mich?” “Ja!” -->
|
@@ -5141,27 +5163,27 @@
|
|
5141
5163
|
<beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sep|Sept|Oct|Okt|Nov|Dec|PhD|al|cf|Inc|Ms|Gen|Sen|Prof|Corp|Co)\.\s</beforebreak>
|
5142
5164
|
<afterbreak></afterbreak>
|
5143
5165
|
</rule>
|
5144
|
-
<rule break="no"
|
5166
|
+
<rule break="no">
|
5145
5167
|
<beforebreak>\bP[Hh]\.\s?</beforebreak>
|
5146
5168
|
<afterbreak>D\.?</afterbreak>
|
5147
5169
|
</rule>
|
5148
|
-
<rule break="no"
|
5170
|
+
<rule break="no">
|
5149
5171
|
<beforebreak>\b[BM]\.\s?</beforebreak>
|
5150
5172
|
<afterbreak>Eng\.?</afterbreak>
|
5151
5173
|
</rule>
|
5152
|
-
<rule break="no"
|
5174
|
+
<rule break="no">
|
5153
5175
|
<beforebreak>\bLL\.\s?</beforebreak>
|
5154
5176
|
<afterbreak>[BM]\.?</afterbreak>
|
5155
5177
|
</rule>
|
5156
|
-
<rule break="no"
|
5178
|
+
<rule break="no">
|
5157
5179
|
<beforebreak>\b[BM]\.\s?</beforebreak>
|
5158
5180
|
<afterbreak>Sc\.?</afterbreak>
|
5159
5181
|
</rule>
|
5160
|
-
<rule break="no"
|
5182
|
+
<rule break="no">
|
5161
5183
|
<beforebreak>\b[BM]\.\s?</beforebreak>
|
5162
5184
|
<afterbreak>Comp?\.?</afterbreak>
|
5163
5185
|
</rule>
|
5164
|
-
<rule break="no"
|
5186
|
+
<rule break="no">
|
5165
5187
|
<beforebreak>\b[BM]\.\s?</beforebreak>
|
5166
5188
|
<afterbreak>Arch\.?</afterbreak>
|
5167
5189
|
</rule>
|
@@ -5262,6 +5284,18 @@
|
|
5262
5284
|
<beforebreak>[\s\u00A0]</beforebreak>
|
5263
5285
|
<afterbreak>[»”’"'›]</afterbreak>
|
5264
5286
|
</rule>
|
5287
|
+
<rule break="no">
|
5288
|
+
<beforebreak>ambass|cuil|p|liv|assoc|bibl|ENREG|al|phot|circ|concl|deb|dest|dupl|éd|écon|incl?|ital|jur|juris|jurispr|larg|lex|législ|longit|(?-i)RR|(?-i)ÉÉm|(?-i)EExc|métr|méd|néol|obs|plur|préf|prog|publ|trib|trim|suiv|(?-i)LL|env|élem|ér|ét|hon|hypexp|conj|coop|ch|alph|anglic|app|pr|collab|paragr|sect|para|commiss|coord|dép|dir|gér|secour|sén|gén|abrév|adj|adr|anon|append|av|auj|bibl|bibliogr|bdc|boul|bull|bur|caar|cat|cell|chap|cir|compl|cf|corres|dest|dict|div|dom|dr|édif|éd|électr|élém|encycl|fig|fl|graph|hist|hyp|ill|imm|imp|impr|incl|inc|ind|in[gtvf]|jur|lat|litt|liq|loc|liv|livr|méd|mém|pl|réd|rel|sc|suiv|sup|suppl|trad|univ|mus|pharm|soc|pol|compt|urb|act|confect|exp|réal|prov|introd|inv|tial|enr|ép|équiv|esp|étym|excl|exc|ap|arr|arch|adv|al|anc|angl|ann|gest|gouv|prés|rect|représ|resp|scrut|vol|coll|réf|id|sqq?|janv|fév|avr|juill|oct|nov|déc|admin</beforebreak>
|
5289
|
+
<afterbreak>\p{Ll}.*</afterbreak>
|
5290
|
+
</rule>
|
5291
|
+
<rule break="no">
|
5292
|
+
<beforebreak>\p{Ll}.*</beforebreak>
|
5293
|
+
<afterbreak>ambass|cuil|p|liv|assoc|bibl|oct|déc|jan|fév|avr|juil|sept|nov|ENREG|al|circ|concl|deb|dest|dupl|éd|écon|incl?|ital|jur|juris|jurispr|larg|lex|législ|longit|(?-i)RR|(?-i)ÉÉm|(?-i)EExc|métr|méd|néol|obs|plur|préf|prog|publ|trib|trim|suiv|(?-i)LL|env|élem|ér|ét|hon|hypexp|conj|coop|ch|alph|anglic|app|pr|collab|paragr|sect|para|commiss|coord|dép|dir|gér|secour|sén|gén|abrév|adj|adr|anon|append|av|auj|bibl|bibliogr|bdc|boul|bull|bur|caar|cat|cell|chap|cir|compl|cf|corres|dest|dict|div|dom|dr|édif|éd|électr|élém|encycl|fig|fl|graph|hist|hyp|ill|imm|imp|impr|incl|inc|ind|in[gtvf]|jur|lat|litt|liq|loc|liv|livr|méd|mém|pl|réd|rel|sc|suiv|sup|suppl|trad|univ|mus|pharm|soc|pol|compt|urb|act|confect|exp|réal|prov|introd|inv|tial|enr|ép|équiv|esp|étym|excl|exc|ap|arr|arch|adv|al|anc|angl|ann|gest|gouv|prés|rect|représ|resp|scrut|vol|coll|réf|id|sqq?|janv|fév|avr|juill|oct|nov|déc|admin</afterbreak>
|
5294
|
+
</rule>
|
5295
|
+
<rule break="no">
|
5296
|
+
<beforebreak>.*°C</beforebreak>
|
5297
|
+
<afterbreak>de</afterbreak>
|
5298
|
+
</rule>
|
5265
5299
|
<rule break="yes">
|
5266
5300
|
<beforebreak>[\.!?][\s\u00A0][»”’"'›][\s\u00A0]</beforebreak>
|
5267
5301
|
<afterbreak>[«“‘‹"'\p{Lu}]</afterbreak>
|
@@ -5270,7 +5304,7 @@
|
|
5270
5304
|
<beforebreak>Yahoo![\s\u00A0]</beforebreak>
|
5271
5305
|
<afterbreak>\p{Ll}</afterbreak>
|
5272
5306
|
</rule>
|
5273
|
-
<!-- !? + lowercase -->
|
5307
|
+
<!-- !? + lowercase -->
|
5274
5308
|
<rule break="no">
|
5275
5309
|
<beforebreak>(\!|\?)[\s\u00A0]</beforebreak>
|
5276
5310
|
<afterbreak>\p{Ll}</afterbreak>
|
@@ -5279,16 +5313,15 @@
|
|
5279
5313
|
<beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
|
5280
5314
|
<afterbreak></afterbreak>
|
5281
5315
|
</rule>
|
5282
|
-
<rule break="no"
|
5316
|
+
<rule break="no">
|
5283
5317
|
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
|
5284
5318
|
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
|
5285
5319
|
</rule>
|
5286
|
-
<rule break="no"
|
5320
|
+
<rule break="no">
|
5287
5321
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
5288
5322
|
<afterbreak>[A-Za-z0-9\-]+\.(fr|com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
|
5289
5323
|
</rule>
|
5290
5324
|
<rule break="no">
|
5291
|
-
<!-- gaffa.org -->
|
5292
5325
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
5293
5326
|
<afterbreak>[A-Za-z]{2,5}(\.|\b)</afterbreak>
|
5294
5327
|
</rule>
|
@@ -5333,15 +5366,15 @@
|
|
5333
5366
|
<beforebreak>\b\p{L}\.</beforebreak>
|
5334
5367
|
<afterbreak>\p{L}\.</afterbreak>
|
5335
5368
|
</rule>
|
5336
|
-
<rule break="no"
|
5369
|
+
<rule break="no">
|
5337
5370
|
<beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
|
5338
5371
|
<afterbreak>[^\p{P}]</afterbreak>
|
5339
5372
|
</rule>
|
5340
|
-
<rule break="no"
|
5373
|
+
<rule break="no">
|
5341
5374
|
<beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
|
5342
5375
|
<afterbreak>[^\p{P}]</afterbreak>
|
5343
5376
|
</rule>
|
5344
|
-
<rule break="no"
|
5377
|
+
<rule break="no">
|
5345
5378
|
<beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
|
5346
5379
|
<afterbreak>\p{Lu}\p{L}+</afterbreak>
|
5347
5380
|
</rule>
|
@@ -5381,44 +5414,44 @@
|
|
5381
5414
|
<beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
|
5382
5415
|
<afterbreak></afterbreak>
|
5383
5416
|
</rule>
|
5384
|
-
<rule break="no"
|
5385
|
-
<beforebreak>i\.e\.[\s\u00A0]</beforebreak
|
5417
|
+
<rule break="no">
|
5418
|
+
<beforebreak>i\.e\.[\s\u00A0]</beforebreak>
|
5386
5419
|
<afterbreak></afterbreak>
|
5387
5420
|
</rule>
|
5388
|
-
<rule break="no"
|
5421
|
+
<rule break="no">
|
5389
5422
|
<beforebreak>[A-Z]\.[A-Z]\.</beforebreak>
|
5390
5423
|
<afterbreak>[A-Z]\b</afterbreak>
|
5391
5424
|
</rule>
|
5392
|
-
<rule break="no"
|
5425
|
+
<rule break="no">
|
5393
5426
|
<beforebreak>\bL\.</beforebreak>
|
5394
5427
|
<afterbreak>A\b</afterbreak>
|
5395
5428
|
</rule>
|
5396
|
-
<rule break="no"
|
5429
|
+
<rule break="no">
|
5397
5430
|
<beforebreak>\bU\.</beforebreak>
|
5398
5431
|
<afterbreak>[SK]\b</afterbreak>
|
5399
5432
|
</rule>
|
5400
|
-
<rule break="no"
|
5433
|
+
<rule break="no">
|
5401
5434
|
<beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
|
5402
5435
|
<afterbreak>\p{N}</afterbreak>
|
5403
5436
|
</rule>
|
5404
|
-
<rule break="no"
|
5437
|
+
<rule break="no">
|
5405
5438
|
<beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
|
5406
5439
|
<afterbreak>D\.?</afterbreak>
|
5407
5440
|
</rule>
|
5408
|
-
<rule break="no"
|
5441
|
+
<rule break="no">
|
5409
5442
|
<beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
|
5410
5443
|
<afterbreak></afterbreak>
|
5411
5444
|
</rule>
|
5412
|
-
<rule break="no"
|
5445
|
+
<rule break="no">
|
5413
5446
|
<beforebreak>\bvs\.[\s\u00A0]</beforebreak>
|
5414
5447
|
<afterbreak></afterbreak>
|
5415
5448
|
</rule>
|
5416
5449
|
<!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
|
5417
|
-
<rule break="no"
|
5450
|
+
<rule break="no">
|
5418
5451
|
<beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
|
5419
5452
|
<afterbreak>[^\p{Lu}]</afterbreak>
|
5420
5453
|
</rule>
|
5421
|
-
<rule break="no"
|
5454
|
+
<rule break="no">
|
5422
5455
|
<beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
|
5423
5456
|
<afterbreak></afterbreak>
|
5424
5457
|
</rule>
|
@@ -5426,39 +5459,39 @@
|
|
5426
5459
|
<beforebreak>(?i)FRITZ!</beforebreak>
|
5427
5460
|
<afterbreak>(?i)Box</afterbreak>
|
5428
5461
|
</rule>
|
5429
|
-
<rule break="no"
|
5462
|
+
<rule break="no">
|
5430
5463
|
<beforebreak>ID.</beforebreak>
|
5431
5464
|
<afterbreak>3|4|Buzz|Crozz</afterbreak>
|
5432
5465
|
</rule>
|
5433
|
-
<rule break="no"
|
5466
|
+
<rule break="no">
|
5434
5467
|
<beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
|
5435
5468
|
<afterbreak></afterbreak>
|
5436
5469
|
</rule>
|
5437
|
-
<rule break="no"
|
5470
|
+
<rule break="no">
|
5438
5471
|
<beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
|
5439
5472
|
<afterbreak></afterbreak>
|
5440
5473
|
</rule>
|
5441
|
-
<rule break="no"
|
5474
|
+
<rule break="no">
|
5442
5475
|
<beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
|
5443
5476
|
<afterbreak></afterbreak>
|
5444
5477
|
</rule>
|
5445
|
-
<rule break="no"
|
5478
|
+
<rule break="no">
|
5446
5479
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
5447
5480
|
<afterbreak>Eng\.?</afterbreak>
|
5448
5481
|
</rule>
|
5449
|
-
<rule break="no"
|
5482
|
+
<rule break="no">
|
5450
5483
|
<beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
|
5451
5484
|
<afterbreak>[BM]\.?</afterbreak>
|
5452
5485
|
</rule>
|
5453
|
-
<rule break="no"
|
5486
|
+
<rule break="no">
|
5454
5487
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
5455
5488
|
<afterbreak>Sc\.?</afterbreak>
|
5456
5489
|
</rule>
|
5457
|
-
<rule break="no"
|
5490
|
+
<rule break="no">
|
5458
5491
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
5459
5492
|
<afterbreak>Comp?\.?</afterbreak>
|
5460
5493
|
</rule>
|
5461
|
-
<rule break="no"
|
5494
|
+
<rule break="no">
|
5462
5495
|
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
5463
5496
|
<afterbreak>Arch\.?</afterbreak>
|
5464
5497
|
</rule>
|
@@ -5504,7 +5537,6 @@
|
|
5504
5537
|
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
|
5505
5538
|
</rule>
|
5506
5539
|
</languagerule>
|
5507
|
-
|
5508
5540
|
<languagerule languagerulename="Ukrainian">
|
5509
5541
|
<!-- when sentence starts with ellipsis: ...Мазій і Юхим теж. -->
|
5510
5542
|
<rule break="no">
|
@@ -5517,8 +5549,8 @@
|
|
5517
5549
|
<afterbreak>\p{Lu}</afterbreak>
|
5518
5550
|
</rule>
|
5519
5551
|
<rule break="no">
|
5520
|
-
<beforebreak>[.!?…][\h]+</beforebreak>
|
5521
|
-
<afterbreak>[\h]*([«"„“(]|[
|
5552
|
+
<beforebreak>[.!?…][»“]?[\h]+</beforebreak>
|
5553
|
+
<afterbreak>[\h]*([«"„“(]|[‐-―-][\h])\p{Ll}</afterbreak>
|
5522
5554
|
</rule>
|
5523
5555
|
<rule break="yes">
|
5524
5556
|
<beforebreak>\v[\h]*</beforebreak>
|
@@ -5532,7 +5564,7 @@
|
|
5532
5564
|
<!-- various punctuation between lowercase letters -->
|
5533
5565
|
<rule break="no">
|
5534
5566
|
<beforebreak>\b\p{Ll}+[.!?][\h\v]*</beforebreak>
|
5535
|
-
<afterbreak>\h*(([\(«]|[\[
|
5567
|
+
<afterbreak>\h*(([\(«]|[\[‐-―-][\h\v]*)?\p{Ll})</afterbreak>
|
5536
5568
|
</rule>
|
5537
5569
|
<rule break="no">
|
5538
5570
|
<beforebreak>([\[\(]*[\]\)]*|\.\.\.|…)[\h\v]+</beforebreak>
|
@@ -5553,7 +5585,6 @@
|
|
5553
5585
|
<beforebreak>(^[\h\v]*|\([\h\v]*|[«„"]|(\b[А-ЯІЇЄҐACEIHOPX]\.-))[А-ЯІЇЄҐA-Z]\.[\h\v]*</beforebreak>
|
5554
5586
|
<afterbreak></afterbreak>
|
5555
5587
|
</rule>
|
5556
|
-
<!-- І. В. Коваль, Т. 2, C. 202 -->
|
5557
5588
|
<!-- Іван Ч. (1914 р. н.) -->
|
5558
5589
|
<rule break="no">
|
5559
5590
|
<beforebreak>[\h\v][А-ЯІЇЄҐ]\.[\h\v]*</beforebreak>
|
@@ -5576,7 +5607,7 @@
|
|
5576
5607
|
-->
|
5577
5608
|
<rule break="no">
|
5578
5609
|
<beforebreak>\b([0-9]{2}|[0-9]{4})[\h\v]+р\.[\h\v]+</beforebreak>
|
5579
|
-
<afterbreak>[\h\v]*[№0-9
|
5610
|
+
<afterbreak>[\h\v]*[№0-9‐-―-]</afterbreak>
|
5580
5611
|
</rule>
|
5581
5612
|
<!-- річка - р. Дніпро -->
|
5582
5613
|
<rule break="no">
|
@@ -5585,7 +5616,7 @@
|
|
5585
5616
|
</rule>
|
5586
5617
|
<!-- У травні 1949 р. Грушківський район -->
|
5587
5618
|
<rule break="no">
|
5588
|
-
<beforebreak>[А-ЯІЇЄҐ][а-яіїєґ'’-]*([\h]+[а-яіїєґ'’-]+)?[\h](\d{4}[
|
5619
|
+
<beforebreak>[А-ЯІЇЄҐ][а-яіїєґ'’-]*([\h]+[а-яіїєґ'’-]+)?[\h](\d{4}[‐-―-])*\d{4}[\h]*р\.[\h\v]*</beforebreak>
|
5589
5620
|
<afterbreak>[\v\h]*(?!(На|Але|Так?)[\h\v]+)[А-ЯІЇЄҐA-Z][^\h\v]</afterbreak>
|
5590
5621
|
</rule>
|
5591
5622
|
<!-- 15 вересня 1995 р. Україною було підписно -->
|
@@ -5605,22 +5636,27 @@
|
|
5605
5636
|
</rule>
|
5606
5637
|
<!-- усталені скорочення, що не збігаються з нескороченими словами -->
|
5607
5638
|
<rule break="no">
|
5608
|
-
|
5609
|
-
<beforebreak>\b(укр|рос|англ|амер|італ|ісп|нім|фр(анц)?|лат|грец(ьк))\.[\h\v]*</beforebreak>
|
5639
|
+
<beforebreak>\b(укр|рос|англ?|амер|італ|ісп|нім|фр(анц)?|лат|грец(ьк)?)\.[\h\v]*</beforebreak>
|
5610
5640
|
<afterbreak></afterbreak>
|
5611
5641
|
</rule>
|
5612
5642
|
<rule break="no">
|
5613
|
-
|
5614
|
-
<beforebreak>\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]]ел|ч|част)\.[\h\v]*</beforebreak>
|
5643
|
+
<beforebreak>\b(абз|арк|ауд|бл|буд|бульв|вул|держ|дод|зав|зб|зв|зовн|екон|к|кв|канд|кн|напр|нпр|нац|обл|оп|пл|пол|поч|пп|пор|просп|розд|стор|табл|[Тт]]ел|ч|част)\.[\h\v]*</beforebreak>
|
5615
5644
|
<afterbreak></afterbreak>
|
5616
5645
|
</rule>
|
5617
5646
|
<rule break="no">
|
5618
|
-
<!-- unfortunately \b ignores \u0301 -->
|
5619
5647
|
<beforebreak>\b[сС]т\.[\h\v]</beforebreak>
|
5620
5648
|
<afterbreak>[\h]*(?!([АВУОІЄ]|На|Але|Так?)[\h\v])</afterbreak>
|
5621
5649
|
</rule>
|
5650
|
+
<!-- нар. 1945 р. | (1966 р. нар.) | 1975 — нар. Осипчук -->
|
5651
|
+
<rule break="no">
|
5652
|
+
<beforebreak>([0-9]|[-–—])[\h\v]+нар\.[\h\v]*</beforebreak>
|
5653
|
+
<afterbreak></afterbreak>
|
5654
|
+
</rule>
|
5655
|
+
<rule break="no">
|
5656
|
+
<beforebreak>\bнар\.[\h\v]*</beforebreak>
|
5657
|
+
<afterbreak>([0-9]|бл\.|арт\.)</afterbreak>
|
5658
|
+
</rule>
|
5622
5659
|
<rule break="no">
|
5623
|
-
<!-- no break only for дол. США -->
|
5624
5660
|
<beforebreak>\bдол\.[\h\v]*</beforebreak>
|
5625
5661
|
<afterbreak>США</afterbreak>
|
5626
5662
|
</rule>
|
@@ -5636,7 +5672,7 @@
|
|
5636
5672
|
</rule>
|
5637
5673
|
<!-- Верховний орган, див. Африканський національний конгрес -->
|
5638
5674
|
<rule break="no">
|
5639
|
-
<beforebreak>[
|
5675
|
+
<beforebreak>[,‐-―-][\h\v]*(див)\.[\h\v]*</beforebreak>
|
5640
5676
|
<afterbreak></afterbreak>
|
5641
5677
|
</rule>
|
5642
5678
|
<!-- скорочення в дужках:
|
@@ -5648,10 +5684,14 @@
|
|
5648
5684
|
</rule>
|
5649
5685
|
<!-- abbreviation with proper noun: проф. Грицько, о. Лісове -->
|
5650
5686
|
<rule break="no">
|
5651
|
-
<beforebreak>\b([Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]
|
5687
|
+
<beforebreak>\b(ап|[Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|оз|ім|інж|дир|тов|упоряд|тт|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak>
|
5652
5688
|
<afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
|
5653
5689
|
</rule>
|
5654
5690
|
<rule break="no">
|
5691
|
+
<beforebreak>(?<![іи]\s+)\bдр\.[\h\v]*</beforebreak>
|
5692
|
+
<afterbreak>[\h\v]*[А-ЯІЇЄҐ]</afterbreak>
|
5693
|
+
</rule>
|
5694
|
+
<rule break="no">
|
5655
5695
|
<beforebreak>\bМан\.[\h\v]*</beforebreak>
|
5656
5696
|
<afterbreak>[\h\v]*([Сс]іті|[Юю]н)</afterbreak>
|
5657
5697
|
</rule>
|
@@ -5660,18 +5700,16 @@
|
|
5660
5700
|
<beforebreak>[^0-9][\h\v]+[Гг]р\.[\h\v]*</beforebreak>
|
5661
5701
|
<afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
|
5662
5702
|
</rule>
|
5663
|
-
<!-- арт. - артикул -->
|
5664
5703
|
<!-- TODO: арт. - артист -->
|
5665
5704
|
<rule break="no">
|
5666
5705
|
<beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис)\.[\h\v]*</beforebreak>
|
5667
5706
|
<afterbreak>[\h\v]*[0-9]</afterbreak>
|
5668
5707
|
</rule>
|
5669
|
-
<!-- ХІІ р., 3-6
|
5708
|
+
<!-- ХІІ р., 3-6 арт., 2-3 тт. -->
|
5670
5709
|
<rule break="no">
|
5671
|
-
<beforebreak>[0-9][\h\v]
|
5710
|
+
<beforebreak>[0-9][\h\v]+(арт|тт)\.[\h\v]*</beforebreak>
|
5672
5711
|
<afterbreak></afterbreak>
|
5673
5712
|
</rule>
|
5674
|
-
<!-- місто, але принаймні з парою літер в назві бо є ще метри (м) -->
|
5675
5713
|
<!-- але розбиваємо «всього 20 м. Почалося» -->
|
5676
5714
|
<rule break="no">
|
5677
5715
|
<beforebreak>(?<!\d[\h\v]*)\bм\.[\h\v]*</beforebreak>
|
@@ -5695,10 +5733,8 @@
|
|
5695
5733
|
<!-- статус правових держав. — Авт.). -->
|
5696
5734
|
<rule break="no">
|
5697
5735
|
<beforebreak></beforebreak>
|
5698
|
-
<afterbreak>[\h\v]*[
|
5736
|
+
<afterbreak>[\h\v]*[‐-―-][\h\v]*([Рр]ед|[Аа]вт)[\h\v]*\.[\)\]]</afterbreak>
|
5699
5737
|
</rule>
|
5700
|
-
<!-- force the break -->
|
5701
|
-
<!-- часто зустрічається крапка+U+202F+пробіл, який srx чомусь не розбиває на речення -->
|
5702
5738
|
<!-- але лишаємо ініціали: С.\u202F Шелухин -->
|
5703
5739
|
<rule break="yes">
|
5704
5740
|
<beforebreak>(?<!\h[А-ЯІЇЄҐ])[.!?…]{1,3}\u202F[\h\v]+</beforebreak>
|
@@ -5716,10 +5752,9 @@
|
|
5716
5752
|
<!-- “Слон” (2008 р.) У минулому харків’янка -->
|
5717
5753
|
<rule break="yes">
|
5718
5754
|
<beforebreak>[.!?…]['»"„“”)\]›]?[\h\v]+</beforebreak>
|
5719
|
-
<afterbreak>([
|
5755
|
+
<afterbreak>([‐-―-][\h\v]*)?\p{Lu}[^\p{Lu}]</afterbreak>
|
5720
5756
|
</rule>
|
5721
5757
|
</languagerule>
|
5722
|
-
|
5723
5758
|
<languagerule languagerulename="Belarusian">
|
5724
5759
|
<rule break="no">
|
5725
5760
|
<beforebreak>\b\d+\.\s</beforebreak>
|
@@ -5986,17 +6021,17 @@
|
|
5986
6021
|
</rule>
|
5987
6022
|
</languagerule>
|
5988
6023
|
<languagerule languagerulename="Portuguese">
|
5989
|
-
<rule break="no"
|
6024
|
+
<rule break="no">
|
5990
6025
|
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
|
5991
6026
|
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
|
5992
6027
|
</rule>
|
5993
|
-
<rule break="no"
|
6028
|
+
<rule break="no">
|
5994
6029
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
5995
6030
|
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
|
5996
6031
|
</rule>
|
5997
6032
|
<!-- Abbreviations that cannot finish sentences-->
|
5998
6033
|
<rule break="no">
|
5999
|
-
<beforebreak>\b(a|Ab|abrev|absol|acad|Açor|A\. ?D|add|adj|adv|advers|Aeron|afér|Agric|Álg|aprox|[Aa]rts?|Artilh|auxil|av|Av)\.\s?</beforebreak>
|
6034
|
+
<beforebreak>\b(a|Ab|abr|abrev|absol|acad|Açor|A\. ?D|add|adj|adv|advers|Aeron|afér|Agric|ago|Álg|aprox|[Aa]rts?|Artilh|auxil|av|Av)\.\s?</beforebreak>
|
6000
6035
|
<afterbreak></afterbreak>
|
6001
6036
|
</rule>
|
6002
6037
|
<rule break="no">
|
@@ -6008,7 +6043,7 @@
|
|
6008
6043
|
<afterbreak></afterbreak>
|
6009
6044
|
</rule>
|
6010
6045
|
<rule break="no">
|
6011
|
-
<beforebreak>\b(D|def|dem|deprec|deriv|det|disj|[Dd]ra?s?)\.\s?</beforebreak>
|
6046
|
+
<beforebreak>\b(D|def|dem|deprec|deriv|det|dez|disj|[Dd]ra?s?)\.\s?</beforebreak>
|
6012
6047
|
<afterbreak></afterbreak>
|
6013
6048
|
</rule>
|
6014
6049
|
<rule break="no">
|
@@ -6020,7 +6055,7 @@
|
|
6020
6055
|
<afterbreak>\p{Ll}</afterbreak>
|
6021
6056
|
</rule>
|
6022
6057
|
<rule break="no">
|
6023
|
-
<beforebreak>\b(f|fam|Farm|fem|fig|fin|fl|fr|frac)\.\s?</beforebreak>
|
6058
|
+
<beforebreak>\b(f|fam|Farm|fem|fev|fig|fin|fl|fr|frac)\.\s?</beforebreak>
|
6024
6059
|
<afterbreak></afterbreak>
|
6025
6060
|
</rule>
|
6026
6061
|
<rule break="no">
|
@@ -6036,7 +6071,7 @@
|
|
6036
6071
|
<afterbreak></afterbreak>
|
6037
6072
|
</rule>
|
6038
6073
|
<rule break="no">
|
6039
|
-
<beforebreak>\b(Jorn|Jur)\.\s?</beforebreak>
|
6074
|
+
<beforebreak>\b(jan|jul|jun|Jorn|Jur)\.\s?</beforebreak>
|
6040
6075
|
<afterbreak></afterbreak>
|
6041
6076
|
</rule>
|
6042
6077
|
<rule break="no">
|
@@ -6044,15 +6079,15 @@
|
|
6044
6079
|
<afterbreak></afterbreak>
|
6045
6080
|
</rule>
|
6046
6081
|
<rule break="no">
|
6047
|
-
<beforebreak>\b(m|masc|Mat|máx|Mecân|[Mm]ed|Mil|mín|mult|Mús)\.\s?</beforebreak>
|
6082
|
+
<beforebreak>\b(m|mai|mar|masc|Mat|máx|Mecân|[Mm]ed|Mil|mín|mult|Mús)\.\s?</beforebreak>
|
6048
6083
|
<afterbreak></afterbreak>
|
6049
6084
|
</rule>
|
6050
6085
|
<rule break="no">
|
6051
|
-
<beforebreak>\b(n|N|Náut|N.B|neg|neol|num|núm)\.\s?</beforebreak>
|
6086
|
+
<beforebreak>\b(n|N|Náut|N.B|neg|neol|nov|num|núm)\.\s?</beforebreak>
|
6052
6087
|
<afterbreak></afterbreak>
|
6053
6088
|
</rule>
|
6054
6089
|
<rule break="no">
|
6055
|
-
<beforebreak>\b(ord)\.\s?</beforebreak>
|
6090
|
+
<beforebreak>\b(ord|out)\.\s?</beforebreak>
|
6056
6091
|
<afterbreak></afterbreak>
|
6057
6092
|
</rule>
|
6058
6093
|
<rule break="no">
|
@@ -6068,7 +6103,7 @@
|
|
6068
6103
|
<afterbreak></afterbreak>
|
6069
6104
|
</rule>
|
6070
6105
|
<rule break="no">
|
6071
|
-
<beforebreak>\b(S|S.A|símb|S. ?M|[Ss]ra?s?|[Ss]rta|suf|superl)\.\s?</beforebreak>
|
6106
|
+
<beforebreak>\b(S|S.A|set|símb|S. ?M|[Ss]ra?s?|[Ss]rta|suf|superl)\.\s?</beforebreak>
|
6072
6107
|
<afterbreak></afterbreak>
|
6073
6108
|
</rule>
|
6074
6109
|
<rule break="no">
|
@@ -6090,7 +6125,7 @@
|
|
6090
6125
|
<!-- s. XIX; s.IX; sec. XX; séc. XX -->
|
6091
6126
|
<rule break="no">
|
6092
6127
|
<beforebreak>\bs([eé]c)?\.\s?</beforebreak>
|
6093
|
-
<afterbreak>[
|
6128
|
+
<afterbreak>[IVXDMCL]+</afterbreak>
|
6094
6129
|
</rule>
|
6095
6130
|
<!-- English abbreviations - but these work globally for all languages -->
|
6096
6131
|
<rule break="no">
|
@@ -6485,7 +6520,7 @@
|
|
6485
6520
|
</rule>
|
6486
6521
|
<!--Не раздвајај у случају као на пр.: "Петар I дошао је ..."-->
|
6487
6522
|
<rule break="no">
|
6488
|
-
<beforebreak>[\s
|
6523
|
+
<beforebreak>[\s ][IVX]+\s</beforebreak>
|
6489
6524
|
<afterbreak>[^\p{Lu}]+</afterbreak>
|
6490
6525
|
</rule>
|
6491
6526
|
<!--Не раздвајај у случају као "од 13. до 14. века"-->
|
@@ -6624,83 +6659,83 @@
|
|
6624
6659
|
</rule>
|
6625
6660
|
</languagerule>
|
6626
6661
|
<languagerule languagerulename="Arabic">
|
6627
|
-
|
6628
|
-
|
6629
|
-
|
6630
|
-
|
6631
|
-
|
6632
|
-
|
6633
|
-
|
6634
|
-
|
6635
|
-
|
6636
|
-
|
6637
|
-
|
6638
|
-
|
6639
|
-
|
6640
|
-
|
6641
|
-
|
6642
|
-
|
6643
|
-
|
6644
|
-
|
6645
|
-
|
6646
|
-
|
6647
|
-
|
6648
|
-
|
6649
|
-
|
6650
|
-
|
6651
|
-
|
6652
|
-
|
6653
|
-
|
6654
|
-
|
6655
|
-
|
6656
|
-
|
6657
|
-
|
6658
|
-
|
6659
|
-
|
6660
|
-
|
6661
|
-
|
6662
|
-
|
6663
|
-
|
6664
|
-
|
6665
|
-
|
6666
|
-
|
6667
|
-
|
6668
|
-
|
6669
|
-
|
6670
|
-
|
6671
|
-
|
6672
|
-
|
6673
|
-
|
6674
|
-
|
6675
|
-
|
6676
|
-
|
6677
|
-
|
6678
|
-
|
6679
|
-
|
6680
|
-
|
6681
|
-
|
6682
|
-
|
6683
|
-
|
6684
|
-
|
6685
|
-
|
6686
|
-
|
6687
|
-
|
6688
|
-
|
6689
|
-
|
6690
|
-
|
6691
|
-
|
6692
|
-
|
6693
|
-
|
6694
|
-
|
6695
|
-
|
6696
|
-
|
6697
|
-
|
6698
|
-
|
6699
|
-
|
6700
|
-
|
6701
|
-
|
6702
|
-
|
6703
|
-
|
6662
|
+
<rule break="no">
|
6663
|
+
<beforebreak>\bwww\.</beforebreak>
|
6664
|
+
<afterbreak>\w</afterbreak>
|
6665
|
+
</rule>
|
6666
|
+
<rule break="no">
|
6667
|
+
<beforebreak>[\[\(]*…[\]\)]* </beforebreak>
|
6668
|
+
<afterbreak>\p{Ll}</afterbreak>
|
6669
|
+
</rule>
|
6670
|
+
<rule break="no">
|
6671
|
+
<beforebreak>\p{Ps}[!?؟]+\p{Pe} </beforebreak>
|
6672
|
+
<afterbreak></afterbreak>
|
6673
|
+
</rule>
|
6674
|
+
<rule break="no">
|
6675
|
+
<beforebreak>[\.!?؟…]+\p{Pe} </beforebreak>
|
6676
|
+
<afterbreak>\p{Ll}</afterbreak>
|
6677
|
+
</rule>
|
6678
|
+
<rule break="no">
|
6679
|
+
<beforebreak>[«»"”']\s*</beforebreak>
|
6680
|
+
<afterbreak>\s*\p{Ll}</afterbreak>
|
6681
|
+
</rule>
|
6682
|
+
<rule break="no">
|
6683
|
+
<beforebreak>[«'"„][\.!?؟…]['"”»]\s</beforebreak>
|
6684
|
+
<afterbreak></afterbreak>
|
6685
|
+
</rule>
|
6686
|
+
<rule break="no">
|
6687
|
+
<beforebreak>\b\p{L}\.\s</beforebreak>
|
6688
|
+
<afterbreak>\p{L}\.\s</afterbreak>
|
6689
|
+
</rule>
|
6690
|
+
<rule break="no">
|
6691
|
+
<beforebreak>\b\p{L}\.</beforebreak>
|
6692
|
+
<afterbreak>\p{L}\.</afterbreak>
|
6693
|
+
</rule>
|
6694
|
+
<rule break="yes">
|
6695
|
+
<beforebreak>[^,،][\s]\p{L}{2}\.\s</beforebreak>
|
6696
|
+
<afterbreak>\p{N}+\)\s</afterbreak>
|
6697
|
+
</rule>
|
6698
|
+
<rule break="no">
|
6699
|
+
<beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak>
|
6700
|
+
<afterbreak>[\p{N}\p{Ll}]</afterbreak>
|
6701
|
+
</rule>
|
6702
|
+
<rule break="no">
|
6703
|
+
<beforebreak>[\[\(]*\.\.\.[\]\)]* </beforebreak>
|
6704
|
+
<afterbreak>[^\p{Lu}]</afterbreak>
|
6705
|
+
</rule>
|
6706
|
+
<rule break="no">
|
6707
|
+
<beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
|
6708
|
+
<afterbreak></afterbreak>
|
6709
|
+
</rule>
|
6710
|
+
<rule break="no">
|
6711
|
+
<beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
|
6712
|
+
<afterbreak></afterbreak>
|
6713
|
+
</rule>
|
6714
|
+
<rule break="no">
|
6715
|
+
<beforebreak>[^\.]\s[ابتقجحخدذصضعغفقكلمنهوىيءةأ١٢٣٤٥٦٧٨٩٠A-Z]\.\s</beforebreak>
|
6716
|
+
<afterbreak></afterbreak>
|
6717
|
+
</rule>
|
6718
|
+
<rule break="no">
|
6719
|
+
<beforebreak>[^\.]\s[\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652\u0653\u0654\u0655\u0656\u0640]\.\s</beforebreak>
|
6720
|
+
<afterbreak></afterbreak>
|
6721
|
+
</rule>
|
6722
|
+
<rule break="no">
|
6723
|
+
<beforebreak>\(\p{Ll}+\.\s</beforebreak>
|
6724
|
+
<afterbreak></afterbreak>
|
6725
|
+
</rule>
|
6726
|
+
<rule break="yes">
|
6727
|
+
<beforebreak>[\.!?؟…][«»\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*\s</beforebreak>
|
6728
|
+
<afterbreak></afterbreak>
|
6729
|
+
</rule>
|
6730
|
+
<rule break="yes">
|
6731
|
+
<beforebreak>[\.!?؟…][«»'"\u00BB\u2019\u201D\u203A\p{Pe}\u0002]*</beforebreak>
|
6732
|
+
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
6733
|
+
</rule>
|
6734
|
+
<rule break="yes">
|
6735
|
+
<beforebreak>\s\p{L}[\.!?؟…]\s</beforebreak>
|
6736
|
+
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
|
6737
|
+
</rule>
|
6738
|
+
</languagerule>
|
6704
6739
|
</languagerules>
|
6705
6740
|
<maprules>
|
6706
6741
|
<languagemap languagepattern=".*" languagerulename="GeneralImportant"></languagemap>
|