srx-languagetool 0.3.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/srx/segment.srx CHANGED
@@ -1102,12 +1102,16 @@
1102
1102
  </rule>
1103
1103
  </languagerule>
1104
1104
  <languagerule languagerulename="English">
1105
+ <rule break="no">
1106
+ <beforebreak>[\u00A0\s]</beforebreak>
1107
+ <afterbreak>\n</afterbreak>
1108
+ </rule>
1105
1109
  <rule break="no"><!-- Hello (Hi! ) my name is Chris -->
1106
- <beforebreak>[a-zA-Z][!\?]\s</beforebreak>
1107
- <afterbreak>\)\s[a-zA-Z]</afterbreak>
1110
+ <beforebreak>[a-zA-Z][!\?][\s\u00A0]</beforebreak>
1111
+ <afterbreak>\)[\s\u00A0][a-zA-Z]</afterbreak>
1108
1112
  </rule>
1109
1113
  <rule break="no">
1110
- <beforebreak>Yahoo!\s</beforebreak>
1114
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
1111
1115
  <afterbreak>\p{Ll}</afterbreak>
1112
1116
  </rule>
1113
1117
  <rule break="no"><!-- U.S.A (no dot at end) -->
@@ -1118,6 +1122,10 @@
1118
1122
  <beforebreak>\bA\.</beforebreak>
1119
1123
  <afterbreak>I\b</afterbreak>
1120
1124
  </rule>
1125
+ <rule break="no"><!-- S.I (no dot at end) -->
1126
+ <beforebreak>\bS\.</beforebreak>
1127
+ <afterbreak>I\b</afterbreak>
1128
+ </rule>
1121
1129
  <rule break="no"><!-- L.A (no dot at end) -->
1122
1130
  <beforebreak>\bL\.</beforebreak>
1123
1131
  <afterbreak>A\b</afterbreak>
@@ -1126,6 +1134,14 @@
1126
1134
  <beforebreak>\bU\.</beforebreak>
1127
1135
  <afterbreak>[SK]\b</afterbreak>
1128
1136
  </rule>
1137
+ <rule break="no"><!-- I.S (no dot at end) -->
1138
+ <beforebreak>\bI\.</beforebreak>
1139
+ <afterbreak>S\b</afterbreak>
1140
+ </rule>
1141
+ <rule break="no"><!-- M.Z (no dot at end) -->
1142
+ <beforebreak>\bM\.</beforebreak>
1143
+ <afterbreak>Z\b</afterbreak>
1144
+ </rule>
1129
1145
  <rule break="no"><!-- URLs without "www."-->
1130
1146
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
1131
1147
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
@@ -1135,96 +1151,100 @@
1135
1151
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl|be|dev|co|fr|dk|se)(\.|\b)</afterbreak>
1136
1152
  </rule>
1137
1153
  <rule break="no"><!-- No. 5 -->
1138
- <beforebreak>\b[nN]o\.\s</beforebreak>
1154
+ <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
1139
1155
  <afterbreak>\p{N}</afterbreak>
1140
1156
  </rule>
1141
1157
  <rule break="no"><!-- Ph.D. -->
1142
- <beforebreak>\bP[Hh]\.\s?</beforebreak>
1158
+ <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
1143
1159
  <afterbreak>D\.?</afterbreak>
1144
1160
  </rule>
1145
1161
  <rule break="no"><!-- min. -->
1146
- <beforebreak>\b([Ee]d|pp|[Vv]iz|i\.?\s*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Mm]in|max|[Gg]ovt|lb|lbf|ft|c\.?\s*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.\s</beforebreak>
1162
+ <beforebreak>\b([Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Mm]in|[Mm]ax|[Gg]ovt|[Rr]etd|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
1147
1163
  <afterbreak>[^\p{Lu}]|I</afterbreak>
1148
1164
  </rule>
1149
1165
  <rule break="no"><!-- hr. -->
1150
- <beforebreak>\b(hr)\.\s</beforebreak>
1166
+ <beforebreak>\b(hr)\.[\s\u00A0]</beforebreak>
1151
1167
  <afterbreak>[^\p{Lu}]|I</afterbreak>
1152
1168
  </rule>
1153
1169
  <rule break="no"><!-- Fig. 8 -->
1154
- <beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.\s</beforebreak>
1170
+ <beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
1155
1171
  <afterbreak>\p{N}|[IXV]+</afterbreak>
1156
1172
  </rule>
1157
1173
  <rule break="no"><!-- Fig. (8) -->
1158
- <beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.\s</beforebreak>
1174
+ <beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
1159
1175
  <afterbreak>\(\p{N}\)</afterbreak>
1160
1176
  </rule>
1161
1177
  <rule break="no"><!-- I'm (...) great! -->
1162
- <beforebreak>(…|\.\.\.)\s?\)\s</beforebreak>
1178
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
1163
1179
  <afterbreak>[^\p{P}]</afterbreak>
1164
1180
  </rule>
1165
1181
  <rule break="no"><!-- I will work with someone (Chris or ...?). -->
1166
- <beforebreak>(…|\.\.\.)\s?\?\)\s</beforebreak>
1182
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
1167
1183
  <afterbreak>[^\p{P}]</afterbreak>
1168
1184
  </rule>
1169
1185
  <rule break="no"><!-- e.g. -->
1170
- <beforebreak>\be\.g\.\s</beforebreak>
1186
+ <beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
1171
1187
  <afterbreak></afterbreak>
1172
1188
  </rule>
1173
1189
  <rule break="no"><!-- vs. -->
1174
- <beforebreak>\bvs\.\s</beforebreak>
1190
+ <beforebreak>\b[Vv]s\.[\s\u00A0]</beforebreak>
1191
+ <afterbreak></afterbreak>
1192
+ </rule>
1193
+ <rule break="no"><!-- pp. -->
1194
+ <beforebreak>\b(pp|PP)\.[\s\u00A0]</beforebreak>
1175
1195
  <afterbreak></afterbreak>
1176
1196
  </rule>
1177
1197
  <rule break="no"><!-- esp. -->
1178
- <beforebreak>\be[sx]p\.\s</beforebreak>
1198
+ <beforebreak>\be[sx]p\.[\s\u00A0]</beforebreak>
1179
1199
  <afterbreak></afterbreak>
1180
1200
  </rule>
1181
1201
  <!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
1182
1202
  <rule break="no"><!-- Etc. -->
1183
- <beforebreak>\b[Ee]tc\.\s</beforebreak>
1203
+ <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
1184
1204
  <afterbreak>[^\p{Lu}]</afterbreak>
1185
1205
  </rule>
1186
1206
  <rule break="no"><!-- BTW (by the way) -->
1187
- <beforebreak>\b([Bb]tw|BTW)\.\s</beforebreak>
1207
+ <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
1188
1208
  <afterbreak></afterbreak>
1189
1209
  </rule>
1190
1210
  <rule break="no">
1191
- <beforebreak>\bJan\.\s</beforebreak>
1211
+ <beforebreak>\bJan\.[\s\u00A0]</beforebreak>
1192
1212
  <afterbreak></afterbreak>
1193
1213
  </rule>
1194
1214
  <rule break="no">
1195
- <beforebreak>\bFeb\.\s</beforebreak>
1215
+ <beforebreak>\bFeb\.[\s\u00A0]</beforebreak>
1196
1216
  <afterbreak></afterbreak>
1197
1217
  </rule>
1198
1218
  <rule break="no">
1199
- <beforebreak>\bMar\.\s</beforebreak>
1219
+ <beforebreak>\bMar\.[\s\u00A0]</beforebreak>
1200
1220
  <afterbreak></afterbreak>
1201
1221
  </rule>
1202
1222
  <rule break="no">
1203
- <beforebreak>\bApr\.\s</beforebreak>
1223
+ <beforebreak>\bApr\.[\s\u00A0]</beforebreak>
1204
1224
  <afterbreak></afterbreak>
1205
1225
  </rule>
1206
1226
  <rule break="no">
1207
- <beforebreak>\bJu[nl]\.\s</beforebreak>
1227
+ <beforebreak>\bJu[nl]\.[\s\u00A0]</beforebreak>
1208
1228
  <afterbreak></afterbreak>
1209
1229
  </rule>
1210
1230
  <rule break="no">
1211
- <beforebreak>\bAug\.\s</beforebreak>
1231
+ <beforebreak>\bAug\.[\s\u00A0]</beforebreak>
1212
1232
  <afterbreak></afterbreak>
1213
1233
  </rule>
1214
1234
  <rule break="no">
1215
- <beforebreak>\bSept?\.\s</beforebreak>
1235
+ <beforebreak>\bSept?\.[\s\u00A0]</beforebreak>
1216
1236
  <afterbreak></afterbreak>
1217
1237
  </rule>
1218
1238
  <rule break="no">
1219
- <beforebreak>\bOct\.\s</beforebreak>
1239
+ <beforebreak>\bOct\.[\s\u00A0]</beforebreak>
1220
1240
  <afterbreak></afterbreak>
1221
1241
  </rule>
1222
1242
  <rule break="no">
1223
- <beforebreak>\bNov\.\s</beforebreak>
1243
+ <beforebreak>\bNov\.[\s\u00A0]</beforebreak>
1224
1244
  <afterbreak></afterbreak>
1225
1245
  </rule>
1226
1246
  <rule break="no">
1227
- <beforebreak>\bDec\.\s</beforebreak>
1247
+ <beforebreak>\bDec\.[\s\u00A0]</beforebreak>
1228
1248
  <afterbreak></afterbreak>
1229
1249
  </rule>
1230
1250
  <rule break="no">
@@ -1233,46 +1253,46 @@
1233
1253
  </rule>
1234
1254
  <rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 -->
1235
1255
  <beforebreak>ID.</beforebreak>
1236
- <afterbreak>3|Buzz|Crozz</afterbreak>
1256
+ <afterbreak>3|4|Buzz|Crozz</afterbreak>
1237
1257
  </rule>
1238
1258
  <rule break="no"><!-- Ph.D. (see rule PH_D) -->
1239
- <beforebreak>\bP[Hh]\.?\s?[Dd]\.\s</beforebreak>
1259
+ <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
1240
1260
  <afterbreak></afterbreak>
1241
1261
  </rule>
1242
1262
  <rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
1243
- <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.\s</beforebreak>
1263
+ <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
1244
1264
  <afterbreak></afterbreak>
1245
1265
  </rule>
1246
1266
  <rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
1247
- <beforebreak>\bLL\.\s?[BM]\.\s</beforebreak>
1267
+ <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
1248
1268
  <afterbreak></afterbreak>
1249
1269
  </rule>
1250
1270
  <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
1251
- <beforebreak>\b[BM]\.\s?</beforebreak>
1271
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1252
1272
  <afterbreak>Eng\.?</afterbreak>
1253
1273
  </rule>
1254
1274
  <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
1255
- <beforebreak>\bLL\.\s?</beforebreak>
1275
+ <beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
1256
1276
  <afterbreak>[BM]\.?</afterbreak>
1257
1277
  </rule>
1258
1278
  <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
1259
- <beforebreak>\b[BM]\.\s?</beforebreak>
1279
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1260
1280
  <afterbreak>Sc\.?</afterbreak>
1261
1281
  </rule>
1262
1282
  <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
1263
- <beforebreak>\b[BM]\.\s?</beforebreak>
1283
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1264
1284
  <afterbreak>Comp?\.?</afterbreak>
1265
1285
  </rule>
1266
1286
  <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
1267
- <beforebreak>\b[BM]\.\s?</beforebreak>
1287
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1268
1288
  <afterbreak>Arch\.?</afterbreak>
1269
1289
  </rule>
1270
1290
  <rule break="no">
1271
- <beforebreak>\b[BM]\.?\s?(Sc|Eng|Comp|Arch)\.\s</beforebreak>
1291
+ <beforebreak>\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0]</beforebreak>
1272
1292
  <afterbreak></afterbreak>
1273
1293
  </rule>
1274
1294
  <rule break="no">
1275
- <beforebreak>\bet\b\s\bal\.\s</beforebreak>
1295
+ <beforebreak>\bet\b[\s\u00A0]\bal\.[\s\u00A0]</beforebreak>
1276
1296
  <afterbreak></afterbreak>
1277
1297
  </rule>
1278
1298
  <rule break="no">
@@ -1280,51 +1300,51 @@
1280
1300
  <afterbreak></afterbreak>
1281
1301
  </rule>
1282
1302
  <rule break="no">
1283
- <beforebreak>\b(Atty|Sg?t|[SG]en|Ft|Gov|Hon|Prof|Mr?s|Mt|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.\s</beforebreak>
1303
+ <beforebreak>\b(Atty|Sg?t|[SG]en|Ft|Gov|Hon|Prof|Mr?s|Mt|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.[[\s\u00A0]\u00A0]</beforebreak>
1284
1304
  <afterbreak></afterbreak>
1285
1305
  </rule>
1286
1306
  <rule break="no">
1287
- <beforebreak>\b(Atty|Sg?t|[SG]en|Gov|Hon|Prof|Mr?s|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.\s[A-Z]\.\s</beforebreak>
1307
+ <beforebreak>\b(Atty|Sg?t|[SG]en|Gov|Hon|Prof|Mr?s|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.[\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
1288
1308
  <afterbreak></afterbreak>
1289
1309
  </rule>
1290
1310
  <rule break="no">
1291
- <beforebreak>\b(Drs|Messrs|Mmes)\.\s</beforebreak>
1292
- <afterbreak>(and\s)|\p{Lu}\p{Ll}+</afterbreak>
1311
+ <beforebreak>\b(Drs|Messrs|Mmes)\.[\s\u00A0]</beforebreak>
1312
+ <afterbreak>(and[\s\u00A0])|\p{Lu}\p{Ll}+</afterbreak>
1293
1313
  </rule>
1294
1314
  <rule break="no">
1295
- <beforebreak>\bcf\.\s</beforebreak>
1315
+ <beforebreak>\bcf\.[\s\u00A0]</beforebreak>
1296
1316
  <afterbreak></afterbreak>
1297
1317
  </rule>
1298
1318
  <rule break="no">
1299
- <beforebreak>\bI(nc|NC)\.\s</beforebreak>
1319
+ <beforebreak>\bI(nc|NC)\.[\s\u00A0]</beforebreak>
1300
1320
  <afterbreak></afterbreak>
1301
1321
  </rule>
1302
1322
  <rule break="no">
1303
- <beforebreak>\bCorp\.\s</beforebreak>
1323
+ <beforebreak>\bCorp\.[\s\u00A0]</beforebreak>
1304
1324
  <afterbreak></afterbreak>
1305
1325
  </rule>
1306
1326
  <rule break="no">
1307
- <beforebreak>\bBros\.\s</beforebreak>
1327
+ <beforebreak>\bBros\.[\s\u00A0]</beforebreak>
1308
1328
  <afterbreak></afterbreak>
1309
1329
  </rule>
1310
1330
  <rule break="no">
1311
- <beforebreak>\bDist\.\s</beforebreak>
1331
+ <beforebreak>\bDist\.[\s\u00A0]</beforebreak>
1312
1332
  <afterbreak></afterbreak>
1313
1333
  </rule>
1314
1334
  <rule break="no">
1315
- <beforebreak>\bCo\.\s</beforebreak>
1335
+ <beforebreak>\bCo\.[\s\u00A0]</beforebreak>
1316
1336
  <afterbreak></afterbreak>
1317
1337
  </rule>
1318
1338
  <rule break="no">
1319
- <beforebreak>\bo'clock\s</beforebreak>
1339
+ <beforebreak>\bo'clock[\s\u00A0]</beforebreak>
1320
1340
  <afterbreak></afterbreak>
1321
1341
  </rule>
1322
1342
  <rule break="no">
1323
- <beforebreak>\bfo'c'sle\s</beforebreak>
1343
+ <beforebreak>\bfo'c'sle[\s\u00A0]</beforebreak>
1324
1344
  <afterbreak></afterbreak>
1325
1345
  </rule>
1326
1346
  <rule break="no">
1327
- <beforebreak>\bLtd\.\s</beforebreak>
1347
+ <beforebreak>\bLtd\.[\s\u00A0]</beforebreak>
1328
1348
  <afterbreak>\p{Ll}+</afterbreak>
1329
1349
  </rule>
1330
1350
  <rule break="no">
@@ -1340,35 +1360,35 @@
1340
1360
  <afterbreak>\p{Ll}</afterbreak>
1341
1361
  </rule>
1342
1362
  <rule break="no">
1343
- <beforebreak>["”'’]\s*</beforebreak>
1344
- <afterbreak>\s*\p{Ll}</afterbreak>
1363
+ <beforebreak>["”'’][\s\u00A0]*</beforebreak>
1364
+ <afterbreak>[\s\u00A0]*\p{Ll}</afterbreak>
1345
1365
  </rule>
1346
1366
  <rule break="no">
1347
- <beforebreak>['"„][\.!?…]['"”]\s</beforebreak>
1367
+ <beforebreak>['"„][\.!?…]['"”][\s\u00A0]</beforebreak>
1348
1368
  <afterbreak></afterbreak>
1349
1369
  </rule>
1350
1370
  <rule break="no">
1351
- <beforebreak>\b\p{L}\.\s</beforebreak>
1352
- <afterbreak>\p{L}\.\s</afterbreak>
1371
+ <beforebreak>\b\p{L}\.[\s\u00A0]</beforebreak>
1372
+ <afterbreak>\p{L}\.[\s\u00A0]</afterbreak>
1353
1373
  </rule>
1354
1374
  <rule break="no">
1355
1375
  <beforebreak>\b\p{L}\.</beforebreak>
1356
1376
  <afterbreak>\p{L}\.</afterbreak>
1357
1377
  </rule>
1358
1378
  <rule break="no"><!-- Jones v. Smith -->
1359
- <beforebreak>\p{Lu}\p{L}+\sv\.\s</beforebreak>
1379
+ <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
1360
1380
  <afterbreak>\p{Lu}\p{L}+</afterbreak>
1361
1381
  </rule>
1362
1382
  <rule break="yes">
1363
- <beforebreak>[^,][\s]\p{L}{2}\.\s</beforebreak>
1364
- <afterbreak>\p{N}+\)\s</afterbreak>
1383
+ <beforebreak>[^,][\s\u00A0]\p{L}{2}\.[\s\u00A0]</beforebreak>
1384
+ <afterbreak>\p{N}+\)[\s\u00A0]</afterbreak>
1365
1385
  </rule>
1366
1386
  <rule break="yes">
1367
- <beforebreak>\bOK\.\s</beforebreak>
1387
+ <beforebreak>\bOK\.[\s\u00A0]</beforebreak>
1368
1388
  <afterbreak>\p{Ll}+</afterbreak>
1369
1389
  </rule>
1370
1390
  <rule break="no">
1371
- <beforebreak>[\.\s](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.\s</beforebreak><!-- not 'no'/'in', these could be abbreviations-->
1391
+ <beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak><!-- not 'no'/'in', these could be abbreviations-->
1372
1392
  <afterbreak>[\p{N}\p{Ll}]</afterbreak>
1373
1393
  </rule>
1374
1394
  <rule break="no">
@@ -1376,35 +1396,35 @@
1376
1396
  <afterbreak>[^\p{Lu}]</afterbreak>
1377
1397
  </rule>
1378
1398
  <rule break="no">
1379
- <beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
1399
+ <beforebreak>\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0]</beforebreak>
1380
1400
  <afterbreak></afterbreak>
1381
1401
  </rule>
1382
1402
  <rule break="no">
1383
- <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
1403
+ <beforebreak>\b\p{Lu}\.\p{Lu}\.[\s\u00A0]</beforebreak>
1384
1404
  <afterbreak></afterbreak>
1385
1405
  </rule>
1386
1406
  <rule break="no">
1387
- <beforebreak>[^\.]\s[A-Z]\.\s</beforebreak>
1407
+ <beforebreak>[^\.][\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
1388
1408
  <afterbreak></afterbreak>
1389
1409
  </rule>
1390
1410
  <rule break="no">
1391
- <beforebreak>\b(:?Blvd|Ave|Mts?)\.\s</beforebreak>
1411
+ <beforebreak>\b(:?Blvd|Ave|Mts?)\.[\s\u00A0]</beforebreak>
1392
1412
  <afterbreak>\p{Ll}+</afterbreak>
1393
1413
  </rule>
1394
1414
  <rule break="no">
1395
- <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.\s</beforebreak>
1415
+ <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0]</beforebreak>
1396
1416
  <afterbreak>\p{Ll}+</afterbreak>
1397
1417
  </rule>
1398
1418
  <rule break="no">
1399
- <beforebreak>\(\p{Ll}+\.\s</beforebreak>
1419
+ <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
1400
1420
  <afterbreak></afterbreak>
1401
1421
  </rule>
1402
1422
  <rule break="no"><!-- i.e. -->
1403
- <beforebreak>i\.e\.\s</beforebreak><!-- "i.e." is never at end of sentence -->
1423
+ <beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
1404
1424
  <afterbreak></afterbreak>
1405
1425
  </rule>
1406
1426
  <rule break="yes">
1407
- <beforebreak>[\.!?…][\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*\s</beforebreak>
1427
+ <beforebreak>[\.!?…][\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*[\s\u00A0]</beforebreak>
1408
1428
  <afterbreak></afterbreak>
1409
1429
  </rule>
1410
1430
  <rule break="yes">
@@ -1412,7 +1432,7 @@
1412
1432
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
1413
1433
  </rule>
1414
1434
  <rule break="yes">
1415
- <beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
1435
+ <beforebreak>[\s\u00A0]\p{L}[\.!?…][\s\u00A0]</beforebreak>
1416
1436
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
1417
1437
  </rule>
1418
1438
  </languagerule>
@@ -1511,6 +1531,16 @@
1511
1531
  </rule>
1512
1532
  </languagerule>
1513
1533
  <languagerule languagerulename="Dutch">
1534
+ <rule break="no">
1535
+ <!-- sp.a -->
1536
+ <beforebreak>\b(sp|SP)</beforebreak>
1537
+ <afterbreak>\.[aA]\b</afterbreak>
1538
+ </rule>
1539
+ <rule break="no">
1540
+ <!-- .Net -->
1541
+ <beforebreak>\s[.]</beforebreak>
1542
+ <afterbreak>[Nn][Ee][Tt](\b|-)</afterbreak>
1543
+ </rule>
1514
1544
  <rule break="no"><!-- quoted sentence in sentence -->
1515
1545
  <beforebreak>[.?!][’'"]</beforebreak>
1516
1546
  <afterbreak> [a-z]</afterbreak>
@@ -1524,7 +1554,7 @@
1524
1554
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
1525
1555
  </rule>
1526
1556
  <rule break="no">
1527
- <beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|([Pp]rof\.)?[Dd]r|Em|Fa|Kon|Bros)\.\s</beforebreak>
1557
+ <beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|([Pp]rof\.)?[Dd]r|Em|Fa|Kon|Bros|Stb)\.\s</beforebreak>
1528
1558
  <afterbreak></afterbreak>
1529
1559
  </rule>
1530
1560
  <rule break="no">
@@ -1544,6 +1574,10 @@
1544
1574
  <afterbreak></afterbreak>
1545
1575
  </rule>
1546
1576
  <rule break="no">
1577
+ <beforebreak>\b(alc|bro|opm|acc)\.\s</beforebreak>
1578
+ <afterbreak></afterbreak>
1579
+ </rule>
1580
+ <rule break="no">
1547
1581
  <beforebreak>\b(arch|archeol|art|bc|betr|bez|bibl|bijl|bijv)\.\s</beforebreak>
1548
1582
  <afterbreak></afterbreak>
1549
1583
  </rule>
@@ -1729,6 +1763,31 @@
1729
1763
  <beforebreak>[?!.]\s</beforebreak>
1730
1764
  <afterbreak>['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002][A-Z][a-z]</afterbreak>
1731
1765
  </rule>
1766
+ <rule break="no">
1767
+ <!-- "E. coli etc. -->
1768
+ <beforebreak>"[A-Z][.]\s</beforebreak>
1769
+ <afterbreak>[a-z]</afterbreak>
1770
+ </rule>
1771
+ <rule break="no">
1772
+ <!-- Cornelisz. -->
1773
+ <beforebreak>[A-Z][a-z].*sz[.]\s</beforebreak>
1774
+ <afterbreak>[a-z]</afterbreak>
1775
+ </rule>
1776
+ <rule break="no">
1777
+ <!-- De n. XIV/vagus (nervus) -->
1778
+ <beforebreak>De n[.]\s</beforebreak>
1779
+ <afterbreak>[a-z]|[XIV]</afterbreak>
1780
+ </rule>
1781
+ <rule break="no">
1782
+ <!-- MOL.E -->
1783
+ <beforebreak>[A-Z]{2,5}[.]</beforebreak>
1784
+ <afterbreak>[A-Z]</afterbreak>
1785
+ </rule>
1786
+ <rule break="no">
1787
+ <!-- ..." betekent -->
1788
+ <beforebreak>\.\.</beforebreak>
1789
+ <afterbreak>" [a-z]</afterbreak>
1790
+ </rule>
1732
1791
  <!-- ##### end of Dutch #### -->
1733
1792
  </languagerule>
1734
1793
  <languagerule languagerulename="Slovak">
@@ -4556,146 +4615,150 @@
4556
4615
  </languagerule>
4557
4616
  <languagerule languagerulename="Catalan">
4558
4617
  <rule break="no">
4559
- <beforebreak>Yahoo!\s</beforebreak>
4618
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
4560
4619
  <afterbreak>\p{Ll}</afterbreak>
4561
4620
  </rule>
4562
4621
  <rule break="yes">
4563
- <beforebreak>\w['’][nNtT]\.\s</beforebreak>
4622
+ <beforebreak>\w['’][nNtT]\.[\s\u00A0]</beforebreak>
4564
4623
  <afterbreak></afterbreak>
4565
4624
  </rule>
4566
4625
  <rule break="yes">
4567
- <beforebreak>\.\[\d+\]\s</beforebreak>
4626
+ <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
4568
4627
  <afterbreak></afterbreak>
4569
4628
  </rule>
4570
4629
  <!-- initials: A. C. Jones. Problem: [...] d'Alfons I. Ell era [...] -->
4571
4630
  <rule break="no">
4572
- <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.\s</beforebreak>
4631
+ <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
4573
4632
  <afterbreak></afterbreak>
4574
4633
  </rule>
4575
4634
  <!-- Abbreviations that cannot finish sentences-->
4576
4635
  <rule break="no">
4577
- <beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.\s</beforebreak>
4636
+ <beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
4578
4637
  <afterbreak></afterbreak>
4579
4638
  </rule>
4580
4639
  <!-- Abbreviations that can finish sentences -->
4581
4640
  <rule break="no">
4582
- <beforebreak>\b(s|ca)\.\s</beforebreak>
4641
+ <beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak>
4583
4642
  <afterbreak>[XIV]+\b</afterbreak>
4584
4643
  </rule>
4585
4644
  <rule break="no">
4586
- <beforebreak>\b(min|m|ca)\.\s</beforebreak>
4645
+ <beforebreak>\b(min|m|ca)\.[\s\u00A0]</beforebreak>
4587
4646
  <afterbreak>[0-9]+\b</afterbreak>
4588
4647
  </rule>
4589
4648
  <rule break="no">
4590
- <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.\s</beforebreak>
4649
+ <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.[\s\u00A0]</beforebreak>
4591
4650
  <afterbreak>[XIV\d]+\b</afterbreak>
4592
4651
  </rule>
4593
4652
  <rule break="no">
4594
- <beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4653
+ <beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|a|rs|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4595
4654
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4596
4655
  </rule>
4597
4656
  <!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
4598
4657
  <rule break="no">
4599
- <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4658
+ <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4600
4659
  <afterbreak>\p{Ll}</afterbreak>
4601
4660
  </rule>
4602
4661
  <!-- Any word in acronyms like EE.UU. or BB. DD. -->
4603
4662
  <rule break="no">
4604
- <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4663
+ <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4605
4664
  <afterbreak>\p{Ll}</afterbreak>
4606
4665
  </rule>
4607
4666
  <rule break="no">
4608
- <beforebreak>\bEE\.\s?</beforebreak>
4609
- <afterbreak>UU</afterbreak>
4667
+ <beforebreak>\b\p{Lu}{2}\.[\s\u00A0]?</beforebreak>
4668
+ <afterbreak>\p{Lu}{2}</afterbreak>
4610
4669
  </rule>
4611
4670
  <rule break="no">
4612
- <beforebreak>EE\.\s?UU\.\s?</beforebreak>
4671
+ <beforebreak>EE\.[\s\u00A0]?UU\.[\s\u00A0]?</beforebreak>
4613
4672
  <afterbreak>\p{Ll}</afterbreak>
4614
4673
  </rule>
4615
4674
  <!-- max min etc -->
4616
4675
  <rule break="no">
4617
- <beforebreak>\b([Ee]tc|m[aáà]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4676
+ <beforebreak>\b([Ee]tc|m[aáà]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4618
4677
  <afterbreak>\p{Ll}</afterbreak>
4619
4678
  </rule>
4620
4679
  <!-- Composed abbrev. -->
4621
4680
  <rule break="no">
4622
- <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4681
+ <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4623
4682
  <afterbreak></afterbreak>
4624
4683
  </rule>
4625
4684
  <!-- Units -->
4626
4685
  <rule break="no">
4627
- <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4686
+ <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4628
4687
  <afterbreak>\p{Ll}</afterbreak>
4629
4688
  </rule>
4630
4689
  <!-- Ellipsis: ... lowercase -->
4631
4690
  <rule break="no">
4632
- <beforebreak>[^\s](\Q...\E|…)\s</beforebreak>
4691
+ <beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak>
4633
4692
  <afterbreak>\p{Ll}</afterbreak>
4634
4693
  </rule>
4635
4694
  <!-- (enum...) -->
4636
4695
  <rule break="no">
4637
- <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”]\s</beforebreak>
4696
+ <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
4638
4697
  <afterbreak>\p{Ll}</afterbreak>
4639
4698
  </rule>
4640
4699
  <!-- pero ¡ah! no estaba
4641
4700
  <rule break="no">
4642
- <beforebreak>\b¡\p{L}+!\s</beforebreak>
4701
+ <beforebreak>\b¡\p{L}+![\s\u00A0]</beforebreak>
4643
4702
  <afterbreak>\p{Ll}</afterbreak>
4644
4703
  </rule>
4645
4704
  -->
4646
4705
  <rule break="yes">
4647
- <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*\s</beforebreak>
4706
+ <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*[\s\u00A0]</beforebreak>
4648
4707
  <afterbreak></afterbreak>
4649
4708
  </rule>
4650
4709
  <rule break="yes">
4651
- <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+\s</beforebreak>
4710
+ <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0]</beforebreak>
4652
4711
  <afterbreak>[¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
4653
4712
  </rule>
4654
4713
  <!-- paragraphs with opening "»" in dialogs-->
4655
4714
  <rule break="yes">
4656
- <beforebreak>[\.:!?…»]+\s</beforebreak>
4657
- <afterbreak>»[^\s\.:!?…]</afterbreak>
4715
+ <beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak>
4716
+ <afterbreak>»[^\u00A0\s\.:!?…]</afterbreak>
4658
4717
  </rule>
4659
4718
  </languagerule>
4660
4719
  <languagerule languagerulename="Spanish">
4661
4720
  <rule break="no">
4662
- <beforebreak>Yahoo!\s</beforebreak>
4721
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
4722
+ <afterbreak>\p{Ll}</afterbreak>
4723
+ </rule>
4724
+ <rule break="no">
4725
+ <beforebreak>40dB.[\s\u00A0]</beforebreak>
4663
4726
  <afterbreak>\p{Ll}</afterbreak>
4664
4727
  </rule>
4665
4728
  <rule break="yes">
4666
- <beforebreak>\.\[\d+\]\s</beforebreak>
4729
+ <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
4667
4730
  <afterbreak></afterbreak>
4668
4731
  </rule>
4669
4732
  <!-- initials: A. C. Jones. Problem: [...] de Alfons I. Él era [...] -->
4670
4733
  <rule break="no">
4671
- <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.\s</beforebreak>
4734
+ <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
4672
4735
  <afterbreak/>
4673
4736
  </rule>
4674
4737
  <!-- Ellipsis: ... lowercase -->
4675
4738
  <rule break="no">
4676
- <beforebreak>[^\s](\Q...\E|…)\s</beforebreak>
4739
+ <beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak>
4677
4740
  <afterbreak>\p{Ll}</afterbreak>
4678
4741
  </rule>
4679
4742
  <!-- (enum...) -->
4680
4743
  <rule break="no">
4681
- <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”]\s</beforebreak>
4744
+ <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
4682
4745
  <afterbreak>\p{Ll}</afterbreak>
4683
4746
  </rule>
4684
4747
  <!-- Abbreviations that can finish sentences -->
4685
4748
  <rule break="no">
4686
- <beforebreak>\b(s|ca)\.\s</beforebreak>
4749
+ <beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak>
4687
4750
  <afterbreak>[XIV]+\b</afterbreak>
4688
4751
  </rule>
4689
4752
  <rule break="no">
4690
- <beforebreak>\b(min|m|ca)\.\s</beforebreak>
4753
+ <beforebreak>\b(min|m|ca)\.[\s\u00A0]</beforebreak>
4691
4754
  <afterbreak>[0-9]+\b</afterbreak>
4692
4755
  </rule>
4693
4756
  <rule break="no">
4694
- <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.\s</beforebreak>
4757
+ <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.[\s\u00A0]</beforebreak>
4695
4758
  <afterbreak>[XIV\d]+\b</afterbreak>
4696
4759
  </rule>
4697
4760
  <rule break="no">
4698
- <beforebreak>\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4761
+ <beforebreak>\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4699
4762
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4700
4763
  </rule>
4701
4764
  <rule break="no">
@@ -4710,75 +4773,75 @@
4710
4773
  </rule>
4711
4774
  <!-- Abbreviations that cannot finish sentences-->
4712
4775
  <rule break="no">
4713
- <beforebreak>\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.\s</beforebreak>
4776
+ <beforebreak>\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
4714
4777
  <afterbreak/>
4715
4778
  </rule>
4716
4779
  <rule break="no">
4717
- <beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.\s</beforebreak>
4780
+ <beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0]</beforebreak>
4718
4781
  <afterbreak/>
4719
4782
  </rule>
4720
4783
  <rule break="no">
4721
- <beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.\s</beforebreak>
4784
+ <beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0]</beforebreak>
4722
4785
  <afterbreak/>
4723
4786
  </rule>
4724
4787
  <rule break="no">
4725
- <beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.\s</beforebreak>
4788
+ <beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0]</beforebreak>
4726
4789
  <afterbreak/>
4727
4790
  </rule>
4728
4791
  <rule break="no">
4729
- <beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.\s</beforebreak>
4792
+ <beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0]</beforebreak>
4730
4793
  <afterbreak/>
4731
4794
  </rule>
4732
4795
  <!-- Abbreviations that can finish sentences -->
4733
4796
  <rule break="no">
4734
- <beforebreak>\b([Ee]ds?|[Cc]oords?|grs?|Sr|Jr|Admón|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4797
+ <beforebreak>\b([Ee]ds?|[Cc]oords?|grs?|Sr|Jr|Admón|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4735
4798
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4736
4799
  </rule>
4737
4800
  <!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
4738
4801
  <rule break="no">
4739
- <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4802
+ <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4740
4803
  <afterbreak>\p{Ll}</afterbreak>
4741
4804
  </rule>
4742
4805
  <!-- Any word in acronyms like EE.UU. or BB. DD. -->
4743
4806
  <rule break="no">
4744
- <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4807
+ <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4745
4808
  <afterbreak>\p{Ll}</afterbreak>
4746
4809
  </rule>
4747
4810
  <rule break="no">
4748
- <beforebreak>\bEE\.\s?</beforebreak>
4749
- <afterbreak>UU</afterbreak>
4811
+ <beforebreak>\b\p{Lu}{2}\.[\s\u00A0]?</beforebreak>
4812
+ <afterbreak>\p{Lu}{2}</afterbreak>
4750
4813
  </rule>
4751
4814
  <rule break="no">
4752
- <beforebreak>EE\.\s?UU\.\s?</beforebreak>
4815
+ <beforebreak>EE\.[\s\u00A0]?UU\.[\s\u00A0]?</beforebreak>
4753
4816
  <afterbreak>\p{Ll}</afterbreak>
4754
4817
  </rule>
4755
4818
  <!-- max min etc -->
4756
4819
  <rule break="no">
4757
- <beforebreak>\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4820
+ <beforebreak>\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4758
4821
  <afterbreak>\p{Ll}</afterbreak>
4759
4822
  </rule>
4760
4823
  <!-- Composed abbrev. -->
4761
4824
  <rule break="no">
4762
- <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4825
+ <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4763
4826
  <afterbreak/>
4764
4827
  </rule>
4765
4828
  <!-- Units -->
4766
4829
  <rule break="no">
4767
- <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4830
+ <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4768
4831
  <afterbreak>\p{Ll}</afterbreak>
4769
4832
  </rule>
4770
4833
  <rule break="yes">
4771
- <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*\s</beforebreak>
4834
+ <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*[\s\u00A0]</beforebreak>
4772
4835
  <afterbreak></afterbreak>
4773
4836
  </rule>
4774
4837
  <rule break="yes">
4775
- <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+\s</beforebreak>
4838
+ <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0]</beforebreak>
4776
4839
  <afterbreak>[¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
4777
4840
  </rule>
4778
4841
  <!-- paragraphs with opening "»" in dialogs-->
4779
4842
  <rule break="yes">
4780
- <beforebreak>[\.:!?…»]+\s</beforebreak>
4781
- <afterbreak>»[^\s\.:!?…]</afterbreak>
4843
+ <beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak>
4844
+ <afterbreak>»[^\u00A0\s\.:!?…]</afterbreak>
4782
4845
  </rule>
4783
4846
  </languagerule>
4784
4847
  <languagerule languagerulename="German">
@@ -4792,17 +4855,17 @@
4792
4855
  </rule>
4793
4856
  <!--support simple lists in markdown style-->
4794
4857
  <rule break="yes">
4795
- <beforebreak>\r?\n\s*[-*]+\s</beforebreak>
4858
+ <beforebreak>\r?\n[\u00A0\s]*[-*]+[\u00A0\s]</beforebreak>
4796
4859
  <afterbreak></afterbreak>
4797
4860
  </rule>
4798
4861
  <!-- Split at e.g. "1a. Und ..." -->
4799
4862
  <rule break="yes">
4800
- <beforebreak>\d+[a-z]\.\s</beforebreak>
4863
+ <beforebreak>\d+[a-z]\.[\u00A0\s]</beforebreak>
4801
4864
  <afterbreak>\p{Lu}</afterbreak>
4802
4865
  </rule>
4803
4866
  <!-- Don't split at e.g. "d. h." -->
4804
4867
  <rule break="no">
4805
- <beforebreak>[^-\p{L}'’/]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?\s</beforebreak>
4868
+ <beforebreak>[^-\p{L}'’/]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
4806
4869
  <afterbreak></afterbreak>
4807
4870
  </rule>
4808
4871
  <rule break="no">
@@ -4823,10 +4886,10 @@
4823
4886
  </rule>
4824
4887
  <rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 -->
4825
4888
  <beforebreak>ID.</beforebreak>
4826
- <afterbreak>3|Buzz|Crozz</afterbreak>
4889
+ <afterbreak>3|4|Buzz|Crozz</afterbreak>
4827
4890
  </rule>
4828
4891
  <rule break="no">
4829
- <beforebreak>[1-3]\.\s</beforebreak>
4892
+ <beforebreak>[1-3]\.[\u00A0\s]</beforebreak>
4830
4893
  <afterbreak>Liga|Bundesliga|Fußball(-B|b)undesliga</afterbreak>
4831
4894
  </rule>
4832
4895
  <rule break="no">
@@ -4841,126 +4904,126 @@
4841
4904
  <!-- Don't split after a white-space followed by a single letter followed
4842
4905
  by a dot followed by another whitespace. e.g. " p. " -->
4843
4906
  <rule break="no">
4844
- <beforebreak>\s\p{L}\.\s</beforebreak>
4907
+ <beforebreak>[\u00A0\s]\p{L}\.[\u00A0\s]</beforebreak>
4845
4908
  <afterbreak>\p{L}\.</afterbreak>
4846
4909
  </rule>
4847
4910
  <!-- Don't split at "bla bla... yada yada" -->
4848
4911
  <rule break="no">
4849
- <beforebreak>[\[\(]?\.\.\.[\]\)]?\s</beforebreak>
4912
+ <beforebreak>[\[\(]?\.\.\.[\]\)]?[\u00A0\s]</beforebreak>
4850
4913
  <afterbreak>\p{Ll}</afterbreak>
4851
4914
  </rule>
4852
4915
  <!-- Don't split [.?!] when they're quoted -->
4853
4916
  <rule break="no">
4854
- <beforebreak>['"„][\.!?…]['"“]\s</beforebreak>
4917
+ <beforebreak>['"„][\.!?…]['"“][\u00A0\s]</beforebreak>
4855
4918
  <afterbreak></afterbreak>
4856
4919
  </rule>
4857
4920
  <!-- Don't break after quote unless there's a capital letter
4858
4921
  e.g.: "That's right!" he said. -->
4859
4922
  <rule break="no">
4860
- <beforebreak>["'“]\s</beforebreak>
4923
+ <beforebreak>["'“][\u00A0\s]</beforebreak>
4861
4924
  <afterbreak>\p{Ll}</afterbreak>
4862
4925
  </rule>
4863
4926
  <!-- e.g. "Das ist . so." - assume one sentence. -->
4864
4927
  <rule break="no">
4865
- <beforebreak>\s([\.!?]{1,3}|…)['|"|“|«|\)|\]|\}]?\s</beforebreak>
4928
+ <beforebreak>[\u00A0\s]([\.!?]{1,3}|…)['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
4866
4929
  <afterbreak></afterbreak>
4867
4930
  </rule>
4868
4931
  <!-- Numbers, dates e.g. "3.10. datiert" -->
4869
4932
  <rule break="no">
4870
- <beforebreak>\b\d+\.\s</beforebreak>
4933
+ <beforebreak>\b\d+\.[\u00A0\s]</beforebreak>
4871
4934
  <afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
4872
4935
  </rule>
4873
4936
  <!-- z.B. "Das hier ist ein(!) Satz." -->
4874
4937
  <rule break="no">
4875
- <beforebreak>[\(\[][!?]{1,3}[\]\)]\s</beforebreak>
4938
+ <beforebreak>[\(\[][!?]{1,3}[\]\)][\u00A0\s]</beforebreak>
4876
4939
  <afterbreak></afterbreak>
4877
4940
  </rule>
4878
4941
  <!-- z.B. "Das hier ist (genau!) ein Satz." -->
4879
4942
  <rule break="no">
4880
- <beforebreak>[!?]{1,3}[\)\]]\s</beforebreak>
4943
+ <beforebreak>[!?]{1,3}[\)\]][\u00A0\s]</beforebreak>
4881
4944
  <afterbreak></afterbreak>
4882
4945
  </rule>
4883
4946
  <!-- z.B. "bla (...) blubb" -> kein Satzende -->
4884
4947
  <rule break="no">
4885
- <beforebreak>[\(\)\[\]]\s</beforebreak>
4948
+ <beforebreak>[\(\)\[\]][\u00A0\s]</beforebreak>
4886
4949
  <afterbreak></afterbreak>
4887
4950
  </rule>
4888
4951
  <!-- don't split at cases like "Friedrich II. wird auch..." -->
4889
4952
  <rule break="no">
4890
- <beforebreak>[\s ][IVX]+\.\s</beforebreak>
4953
+ <beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]</beforebreak>
4891
4954
  <afterbreak>[^\p{Lu}]+</afterbreak>
4892
4955
  </rule>
4893
4956
  <!-- don't split at cases like "im 13. oder 14. Jahrhundert" -->
4894
4957
  <rule break="no">
4895
- <beforebreak>\d+\.\s</beforebreak>
4896
- <afterbreak>(und|oder|bis)\s</afterbreak>
4958
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4959
+ <afterbreak>(und|oder|bis)[\u00A0\s]</afterbreak>
4897
4960
  </rule>
4898
4961
  <!-- einige deutsche Monate, vor denen eine Zahl erscheinen kann,
4899
4962
  ohne dass eine Satzgrenze erkannt wird
4900
4963
  (z.B. "am 13. Dezember" -> keine Satzgrenze) -->
4901
4964
  <rule break="no">
4902
- <beforebreak>\d+\.\s</beforebreak>
4965
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4903
4966
  <afterbreak>Januar|Jänner|Februar|März|Merz|April|Mai|Ju[ln]i|August|September|Oktober|November|Dezember</afterbreak>
4904
4967
  </rule>
4905
4968
  <rule break="no">
4906
- <beforebreak>\d+\.\s</beforebreak>
4969
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4907
4970
  <afterbreak>J[aä]n|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez</afterbreak>
4908
4971
  </rule>
4909
4972
  <rule break="no">
4910
- <beforebreak>(Jan|Jän|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez)\.\s</beforebreak>
4973
+ <beforebreak>(Jan|Jän|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez)\.[\u00A0\s]</beforebreak>
4911
4974
  <afterbreak>\d\d(\d\d)?</afterbreak>
4912
4975
  </rule>
4913
4976
  <!-- ähnliche Fälle außerhalb der Monatsnamen -->
4914
4977
  <rule break="no">
4915
- <beforebreak>\d+\.\s</beforebreak>
4978
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4916
4979
  <afterbreak>Amtsperiode|Breitengrads?|Breitengrades|Jubiläum|Jhd?|Jhdts?|Konferenz|(Jahres|Partei)(-K|k)onferenz|Längengrade?s?|Tags?|Tages|(Jahres|Spiel|Partei|Geburts)tag|(Jahres|Spiel|Partei|Geburts)tages|(Jahres|Spiel|Partei|Geburts)tags|Jahrhunderts?|Jahrtausend|Platz|Platzes|Lebensjahrs?|Lebensjahres|Lochs?|Loches|Grads|Grades|Obergeschoss|Stock(werk)?s?|Etage|Klasse|Runde|Bezirk|Etappe|Staffel|Sinfonie</afterbreak>
4917
4980
  </rule>
4918
4981
  <!-- English abbreviations - but these work globally for all languages -->
4919
4982
  <rule break="no">
4920
- <beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd)\.\s</beforebreak>
4983
+ <beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd|Buchst)\.[\u00A0\s]</beforebreak>
4921
4984
  <afterbreak></afterbreak>
4922
4985
  </rule>
4923
4986
  <!-- Latin abbreviations - but these work globally for all languages -->
4924
4987
  <rule break="no">
4925
- <beforebreak>\b(spp?)\.\s</beforebreak>
4988
+ <beforebreak>\b(spp?)\.[\u00A0\s]</beforebreak>
4926
4989
  <afterbreak></afterbreak>
4927
4990
  </rule>
4928
4991
  <!-- German abbreviations -->
4929
4992
  <rule break="no">
4930
- <beforebreak>\b(ggü|Mag|mtl|versch|d|Übers|usw|Bzw|bzw|Ab[hkst]|abzgl|Abzw|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|autom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw)\.\s</beforebreak>
4993
+ <beforebreak>\b(ggü|Mag|mtl|versch|d|Übers|usw|Bzw|bzw|Ab[hkst]|abzgl|[Ee]inschl|[Vv]mtl|Ev|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|autom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]</beforebreak>
4931
4994
  <afterbreak></afterbreak>
4932
4995
  </rule>
4933
4996
  <rule break="no">
4934
- <beforebreak>\b(cts?|Ca|ca|chem|chin|Chr|cresc|dat|Dat|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|eigtl|Eigtl|Engl|engl|Erg|al|et[cw]|Etw|ev(tl)?|Evtl|exkl|Expl|Exz)\.\s</beforebreak>
4997
+ <beforebreak>\b(cts?|Ca|ca|chem|chin|Chr|cresc|dat|Dat|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|eigtl|Eigtl|eigl|Eigl|akt|Engl|engl|Erg|al|et[cw]|Etw|ev(tl)?|Evtl|Evt|evt|exkl|Expl|Exz)\.[\u00A0\s]</beforebreak>
4935
4998
  <afterbreak></afterbreak>
4936
4999
  </rule>
4937
5000
  <rule break="no">
4938
- <beforebreak>\bDipl\.-[A-Z][a-z]{2,4}\.\s</beforebreak>
5001
+ <beforebreak>\bDipl\.-[A-Z][a-z]{2,4}\.[\u00A0\s]</beforebreak>
4939
5002
  <afterbreak></afterbreak>
4940
5003
  </rule>
4941
5004
  <rule break="no">
4942
- <beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|frz?|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.\s</beforebreak>
5005
+ <beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|frz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]</beforebreak>
4943
5006
  <afterbreak></afterbreak>
4944
5007
  </rule>
4945
5008
  <rule break="no">
4946
- <beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|inkl|incl|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.\s</beforebreak>
5009
+ <beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|inkl|incl|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]</beforebreak>
4947
5010
  <afterbreak></afterbreak>
4948
5011
  </rule>
4949
5012
  <rule break="no">
4950
- <beforebreak>\b(lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.\s</beforebreak>
5013
+ <beforebreak>\b(lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.[\u00A0\s]</beforebreak>
4951
5014
  <afterbreak></afterbreak>
4952
5015
  </rule>
4953
5016
  <rule break="no">
4954
- <beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|Std?|stacc|Str|stud|Subst|sva|svw|sZ)\.\s</beforebreak>
5017
+ <beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|Std?|stacc|Str|stud|Subst|sva|svw|sZ)\.[\u00A0\s]</beforebreak>
4955
5018
  <afterbreak></afterbreak>
4956
5019
  </rule>
4957
5020
  <rule break="no">
4958
- <beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|sw|vgl|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.\s</beforebreak>
5021
+ <beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|sw|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]</beforebreak>
4959
5022
  <afterbreak></afterbreak>
4960
5023
  </rule>
4961
5024
  <!-- Break rules -->
4962
5025
  <rule break="yes">
4963
- <beforebreak>[\.!?…][\u0002|'|"|“|«|‹|\)|\]|\}¹²³]?\s+</beforebreak>
5026
+ <beforebreak>[\.!?…][\u0002|'|"|“|«|‹|\)|\]|\}¹²³]?[\u00A0\s]+</beforebreak>
4964
5027
  <afterbreak></afterbreak>
4965
5028
  </rule>
4966
5029
  <rule break="yes">
@@ -4968,7 +5031,7 @@
4968
5031
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
4969
5032
  </rule>
4970
5033
  <rule break="yes">
4971
- <beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
5034
+ <beforebreak>[\u00A0\s]\p{L}[\.!?…][\u00A0\s]</beforebreak>
4972
5035
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
4973
5036
  </rule>
4974
5037
  <!-- z.B. 2 sentences: “Liebst du mich?” “Ja!” -->
@@ -5184,11 +5247,19 @@
5184
5247
  </languagerule>
5185
5248
  <languagerule languagerulename="French">
5186
5249
  <rule break="no">
5187
- <beforebreak>Yahoo!\s</beforebreak>
5250
+ <beforebreak>[\s\u00A0]</beforebreak>
5251
+ <afterbreak>[»”’"'›]</afterbreak>
5252
+ </rule>
5253
+ <rule break="yes">
5254
+ <beforebreak>[\.!?][\s\u00A0][»”’"'›][\s\u00A0]</beforebreak>
5255
+ <afterbreak>[«“‘‹"'\p{Lu}]</afterbreak>
5256
+ </rule>
5257
+ <rule break="no">
5258
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
5188
5259
  <afterbreak>\p{Ll}</afterbreak>
5189
5260
  </rule>
5190
5261
  <rule break="yes">
5191
- <beforebreak>\.\[\d+\]\s</beforebreak>
5262
+ <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
5192
5263
  <afterbreak></afterbreak>
5193
5264
  </rule>
5194
5265
  <rule break="no"><!-- URLs without "www."-->
@@ -5206,15 +5277,15 @@
5206
5277
  </rule>
5207
5278
  <!-- French abbreviations -->
5208
5279
  <rule break="no">
5209
- <beforebreak>\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op)\.\s</beforebreak>
5280
+ <beforebreak>\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op|ex)\.[\s\u00A0]</beforebreak>
5210
5281
  <afterbreak>\p{Ll}</afterbreak>
5211
5282
  </rule>
5212
5283
  <rule break="no">
5213
- <beforebreak>\b(etc)\.\)\s</beforebreak>
5284
+ <beforebreak>\b(etc)\.\)[\s\u00A0]</beforebreak>
5214
5285
  <afterbreak></afterbreak>
5215
5286
  </rule>
5216
5287
  <rule break="no">
5217
- <beforebreak>\b(apr|ave?|boul|Mr?|Mrs|MM?|Mlle)\.\s</beforebreak>
5288
+ <beforebreak>\b(apr|ave?|boul|Mr?|Mrs|MM?|Mlle)\.[\s\u00A0]</beforebreak>
5218
5289
  <afterbreak></afterbreak>
5219
5290
  </rule>
5220
5291
  <rule break="no">
@@ -5230,39 +5301,39 @@
5230
5301
  <afterbreak>\p{Ll}</afterbreak>
5231
5302
  </rule>
5232
5303
  <rule break="no">
5233
- <beforebreak>["”'’]\s*</beforebreak>
5234
- <afterbreak>\s*\p{Ll}</afterbreak>
5304
+ <beforebreak>["”'’][\s\u00A0]*</beforebreak>
5305
+ <afterbreak>[\s\u00A0]*\p{Ll}</afterbreak>
5235
5306
  </rule>
5236
5307
  <rule break="no">
5237
- <beforebreak>['"„][\.!?…]['"”]\s</beforebreak>
5308
+ <beforebreak>['"„][\.!?…]['"”][\s\u00A0]</beforebreak>
5238
5309
  <afterbreak></afterbreak>
5239
5310
  </rule>
5240
5311
  <rule break="no">
5241
- <beforebreak>\b\p{L}\.\s</beforebreak>
5242
- <afterbreak>\p{L}\.\s</afterbreak>
5312
+ <beforebreak>\b\p{L}\.[\s\u00A0]</beforebreak>
5313
+ <afterbreak>\p{L}\.[\s\u00A0]</afterbreak>
5243
5314
  </rule>
5244
5315
  <rule break="no">
5245
5316
  <beforebreak>\b\p{L}\.</beforebreak>
5246
5317
  <afterbreak>\p{L}\.</afterbreak>
5247
5318
  </rule>
5248
5319
  <rule break="no"><!-- Je suis (...) Chris. -->
5249
- <beforebreak>(…|\.\.\.)\s?\)\s</beforebreak>
5320
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
5250
5321
  <afterbreak>[^\p{P}]</afterbreak>
5251
5322
  </rule>
5252
5323
  <rule break="no"><!-- Je suis (...?) Chris. -->
5253
- <beforebreak>(…|\.\.\.)\s?\?\)\s</beforebreak>
5324
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
5254
5325
  <afterbreak>[^\p{P}]</afterbreak>
5255
5326
  </rule>
5256
5327
  <rule break="no"><!-- Jones v. Smith -->
5257
- <beforebreak>\p{Lu}\p{L}+\sv\.\s</beforebreak>
5328
+ <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
5258
5329
  <afterbreak>\p{Lu}\p{L}+</afterbreak>
5259
5330
  </rule>
5260
5331
  <rule break="yes">
5261
- <beforebreak>[^,][\s]\p{L}{2}\.\s</beforebreak>
5262
- <afterbreak>\p{N}+\)\s</afterbreak>
5332
+ <beforebreak>[^,][\s\u00A0]\p{L}{2}\.[\s\u00A0]</beforebreak>
5333
+ <afterbreak>\p{N}+\)[\s\u00A0]</afterbreak>
5263
5334
  </rule>
5264
5335
  <rule break="no">
5265
- <beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak>
5336
+ <beforebreak>[\.\s\u00A0]\p{L}{1,2}\.[\s\u00A0]</beforebreak>
5266
5337
  <afterbreak>[\p{N}\p{Ll}]</afterbreak>
5267
5338
  </rule>
5268
5339
  <rule break="no">
@@ -5270,31 +5341,31 @@
5270
5341
  <afterbreak>[^\p{Lu}]</afterbreak>
5271
5342
  </rule>
5272
5343
  <rule break="no">
5273
- <beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
5344
+ <beforebreak>\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0]</beforebreak>
5274
5345
  <afterbreak></afterbreak>
5275
5346
  </rule>
5276
5347
  <rule break="no">
5277
- <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
5348
+ <beforebreak>\b\p{Lu}\.\p{Lu}\.[\s\u00A0]</beforebreak>
5278
5349
  <afterbreak></afterbreak>
5279
5350
  </rule>
5280
5351
  <rule break="no">
5281
- <beforebreak>[^\.]\s[A-Z]\.\s</beforebreak>
5352
+ <beforebreak>[^\.][\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
5282
5353
  <afterbreak></afterbreak>
5283
5354
  </rule>
5284
5355
  <rule break="no">
5285
- <beforebreak>\b(:?Blvd|Ave|Mts?)\.\s</beforebreak>
5356
+ <beforebreak>\b(:?Blvd|Ave|Mts?)\.[\s\u00A0]</beforebreak>
5286
5357
  <afterbreak>\p{Ll}+</afterbreak>
5287
5358
  </rule>
5288
5359
  <rule break="no">
5289
- <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.\s</beforebreak>
5360
+ <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0]</beforebreak>
5290
5361
  <afterbreak>\p{Ll}+</afterbreak>
5291
5362
  </rule>
5292
5363
  <rule break="no">
5293
- <beforebreak>\(\p{Ll}+\.\s</beforebreak>
5364
+ <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
5294
5365
  <afterbreak></afterbreak>
5295
5366
  </rule>
5296
5367
  <rule break="no"><!-- i.e. -->
5297
- <beforebreak>i\.e\.\s</beforebreak><!-- "i.e." is never at end of sentence -->
5368
+ <beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
5298
5369
  <afterbreak></afterbreak>
5299
5370
  </rule>
5300
5371
  <rule break="no"><!-- U.S.A (no dot at end) -->
@@ -5310,28 +5381,28 @@
5310
5381
  <afterbreak>[SK]\b</afterbreak>
5311
5382
  </rule>
5312
5383
  <rule break="no"><!-- No. 5 -->
5313
- <beforebreak>\b[nN]o\.\s</beforebreak>
5384
+ <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
5314
5385
  <afterbreak>\p{N}</afterbreak>
5315
5386
  </rule>
5316
5387
  <rule break="no"><!-- Ph.D. -->
5317
- <beforebreak>\bP[Hh]\.\s?</beforebreak>
5388
+ <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
5318
5389
  <afterbreak>D\.?</afterbreak>
5319
5390
  </rule>
5320
5391
  <rule break="no"><!-- e.g. -->
5321
- <beforebreak>\be\.g\.\s</beforebreak>
5392
+ <beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
5322
5393
  <afterbreak></afterbreak>
5323
5394
  </rule>
5324
5395
  <rule break="no"><!-- vs. -->
5325
- <beforebreak>\bvs\.\s</beforebreak>
5396
+ <beforebreak>\bvs\.[\s\u00A0]</beforebreak>
5326
5397
  <afterbreak></afterbreak>
5327
5398
  </rule>
5328
5399
  <!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
5329
5400
  <rule break="no"><!-- Etc. -->
5330
- <beforebreak>\b[Ee]tc\.\s</beforebreak>
5401
+ <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
5331
5402
  <afterbreak>[^\p{Lu}]</afterbreak>
5332
5403
  </rule>
5333
5404
  <rule break="no"><!-- BTW (by the way) -->
5334
- <beforebreak>\b([Bb]tw|BTW)\.\s</beforebreak>
5405
+ <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
5335
5406
  <afterbreak></afterbreak>
5336
5407
  </rule>
5337
5408
  <rule break="no">
@@ -5340,67 +5411,71 @@
5340
5411
  </rule>
5341
5412
  <rule break="no"><!-- https://de.wikipedia.org/wiki/VW_ID.3 -->
5342
5413
  <beforebreak>ID.</beforebreak>
5343
- <afterbreak>3|Buzz|Crozz</afterbreak>
5414
+ <afterbreak>3|4|Buzz|Crozz</afterbreak>
5344
5415
  </rule>
5345
5416
  <rule break="no"><!-- Ph.D. (see rule PH_D) -->
5346
- <beforebreak>\bP[Hh]\.?\s?[Dd]\.\s</beforebreak>
5417
+ <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
5347
5418
  <afterbreak></afterbreak>
5348
5419
  </rule>
5349
5420
  <rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
5350
- <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.\s</beforebreak>
5421
+ <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
5351
5422
  <afterbreak></afterbreak>
5352
5423
  </rule>
5353
5424
  <rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
5354
- <beforebreak>\bLL\.\s?[BM]\.\s</beforebreak>
5425
+ <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
5355
5426
  <afterbreak></afterbreak>
5356
5427
  </rule>
5357
5428
  <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
5358
- <beforebreak>\b[BM]\.\s?</beforebreak>
5429
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5359
5430
  <afterbreak>Eng\.?</afterbreak>
5360
5431
  </rule>
5361
5432
  <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
5362
- <beforebreak>\bLL\.\s?</beforebreak>
5433
+ <beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
5363
5434
  <afterbreak>[BM]\.?</afterbreak>
5364
5435
  </rule>
5365
5436
  <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
5366
- <beforebreak>\b[BM]\.\s?</beforebreak>
5437
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5367
5438
  <afterbreak>Sc\.?</afterbreak>
5368
5439
  </rule>
5369
5440
  <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
5370
- <beforebreak>\b[BM]\.\s?</beforebreak>
5441
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5371
5442
  <afterbreak>Comp?\.?</afterbreak>
5372
5443
  </rule>
5373
5444
  <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
5374
- <beforebreak>\b[BM]\.\s?</beforebreak>
5445
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5375
5446
  <afterbreak>Arch\.?</afterbreak>
5376
5447
  </rule>
5377
5448
  <rule break="no">
5378
- <beforebreak>\b[BM]\.?\s?(Sc|Eng|Comp|Arch)\.\s</beforebreak>
5449
+ <beforebreak>\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0]</beforebreak>
5379
5450
  <afterbreak></afterbreak>
5380
5451
  </rule>
5381
5452
  <rule break="no">
5382
- <beforebreak>\bI(nc|NC)\.\s</beforebreak>
5453
+ <beforebreak>\bI(nc|NC)\.[\s\u00A0]</beforebreak>
5383
5454
  <afterbreak></afterbreak>
5384
5455
  </rule>
5385
5456
  <rule break="no">
5386
- <beforebreak>\bCorp\.\s</beforebreak>
5457
+ <beforebreak>\bCorp\.[\s\u00A0]</beforebreak>
5387
5458
  <afterbreak></afterbreak>
5388
5459
  </rule>
5389
5460
  <rule break="no">
5390
- <beforebreak>\bBros\.\s</beforebreak>
5461
+ <beforebreak>\bBros\.[\s\u00A0]</beforebreak>
5391
5462
  <afterbreak></afterbreak>
5392
5463
  </rule>
5393
5464
  <rule break="no">
5394
- <beforebreak>\bLtd\.\s</beforebreak>
5465
+ <beforebreak>\bLtd\.[\s\u00A0]</beforebreak>
5395
5466
  <afterbreak>\p{Ll}+</afterbreak>
5396
5467
  </rule>
5397
5468
  <rule break="no">
5398
- <beforebreak>\bCo\.\s</beforebreak>
5469
+ <beforebreak>\bCo\.[\s\u00A0]</beforebreak>
5399
5470
  <afterbreak></afterbreak>
5400
5471
  </rule>
5472
+ <rule break="no">
5473
+ <beforebreak>\bE\.[\s\u00A0]</beforebreak>
5474
+ <afterbreak>\b[Cc]oli\b</afterbreak>
5475
+ </rule>
5401
5476
  <!-- Break rules -->
5402
5477
  <rule break="yes">
5403
- <beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}¹²³]?\s+</beforebreak>
5478
+ <beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}¹²³]?[\s\u00A0]+</beforebreak>
5404
5479
  <afterbreak></afterbreak>
5405
5480
  </rule>
5406
5481
  <rule break="yes">
@@ -5408,7 +5483,7 @@
5408
5483
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
5409
5484
  </rule>
5410
5485
  <rule break="yes">
5411
- <beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
5486
+ <beforebreak>[\s\u00A0]\p{L}[\.!?…][\s\u00A0]</beforebreak>
5412
5487
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
5413
5488
  </rule>
5414
5489
  </languagerule>
@@ -5470,7 +5545,7 @@
5470
5545
  <!-- І. Коваль -->
5471
5546
  <rule break="no">
5472
5547
  <beforebreak>[\h\v.]([А-ЯІЇЄҐACEIHOPX]\.-)?(?&lt;!°)[А-ЯІЇЄҐABCEIHOPX](?&lt;!(Куан[\h]+Ю|(Петр|Олександр)([аоу]|ові|ом)?[\h]+[IІ]+))\.[\h\v]*</beforebreak>
5473
- <afterbreak>(?!Від|Але)[А-ЯІЇЄҐ][а-яіїєґА-ЯІЇЄҐ'’ʼ]{2}</afterbreak>
5548
+ <afterbreak>[А-ЯІЇЄҐ][а-яіїєґА-ЯІЇЄҐ'’ʼ]{3}</afterbreak>
5474
5549
  </rule>
5475
5550
  <!-- Ів. Франко (але Ів Бутільє) -->
5476
5551
  <rule break="no">
@@ -5532,9 +5607,9 @@
5532
5607
  <beforebreak>\bдол\.[\h\v]*</beforebreak>
5533
5608
  <afterbreak>США</afterbreak>
5534
5609
  </rule>
5535
- <!-- п. 10 від 11.10.1933 -->
5610
+ <!-- п. 10 від 11.10.1933, д. Василь -->
5536
5611
  <rule break="no">
5537
- <beforebreak>(?&lt;!т\.[\h\v]?)\bп\.[\h\v]*</beforebreak>
5612
+ <beforebreak>(?&lt;!т\.[\h\v]?)\b[пд]\.[\h\v]*</beforebreak>
5538
5613
  <afterbreak></afterbreak>
5539
5614
  </rule>
5540
5615
  <!-- усталені скорочення, що збігаються з нескороченими словами -->
@@ -5556,10 +5631,14 @@
5556
5631
  </rule>
5557
5632
  <!-- abbreviation with proper noun: проф. Грицько, о. Лісове -->
5558
5633
  <rule break="no">
5559
- <beforebreak>\b([Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|ім|упоряд|чл\.-кор)\.[\h\v]*</beforebreak>
5634
+ <beforebreak>\b([Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|ім|упоряд|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak>
5560
5635
  <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
5561
5636
  </rule>
5562
- <!-- смерть гр. Болтаровича -->
5637
+ <rule break="no">
5638
+ <beforebreak>\bМан\.[\h\v]*</beforebreak>
5639
+ <afterbreak>[\h\v]*([Сс]іті|[Юю]н)</afterbreak>
5640
+ </rule>
5641
+ <!-- смерть гр. Болтаровича, but not "9 гр." -->
5563
5642
  <rule break="no">
5564
5643
  <beforebreak>[^0-9][\h\v]+[Гг]р\.[\h\v]*</beforebreak>
5565
5644
  <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
@@ -5567,7 +5646,7 @@
5567
5646
  <!-- арт. - артикул -->
5568
5647
  <!-- TODO: арт. - артист -->
5569
5648
  <rule break="no">
5570
- <beforebreak>\bарт\.[\h\v]*</beforebreak>
5649
+ <beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис)\.[\h\v]*</beforebreak>
5571
5650
  <afterbreak>[\h\v]*[0-9]</afterbreak>
5572
5651
  </rule>
5573
5652
  <!-- ХІІ р., 3-6 арт. -->