srx-languagetool 0.1.0 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/srx/segment.srx CHANGED
@@ -1102,12 +1102,16 @@
1102
1102
  </rule>
1103
1103
  </languagerule>
1104
1104
  <languagerule languagerulename="English">
1105
+ <rule break="no">
1106
+ <beforebreak>[\u00A0\s]</beforebreak>
1107
+ <afterbreak>\n</afterbreak>
1108
+ </rule>
1105
1109
  <rule break="no"><!-- Hello (Hi! ) my name is Chris -->
1106
- <beforebreak>[a-zA-Z][!\?]\s</beforebreak>
1107
- <afterbreak>\)\s[a-zA-Z]</afterbreak>
1110
+ <beforebreak>[a-zA-Z][!\?][\s\u00A0]</beforebreak>
1111
+ <afterbreak>\)[\s\u00A0][a-zA-Z]</afterbreak>
1108
1112
  </rule>
1109
1113
  <rule break="no">
1110
- <beforebreak>Yahoo!\s</beforebreak>
1114
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
1111
1115
  <afterbreak>\p{Ll}</afterbreak>
1112
1116
  </rule>
1113
1117
  <rule break="no"><!-- U.S.A (no dot at end) -->
@@ -1118,6 +1122,10 @@
1118
1122
  <beforebreak>\bA\.</beforebreak>
1119
1123
  <afterbreak>I\b</afterbreak>
1120
1124
  </rule>
1125
+ <rule break="no"><!-- S.I (no dot at end) -->
1126
+ <beforebreak>\bS\.</beforebreak>
1127
+ <afterbreak>I\b</afterbreak>
1128
+ </rule>
1121
1129
  <rule break="no"><!-- L.A (no dot at end) -->
1122
1130
  <beforebreak>\bL\.</beforebreak>
1123
1131
  <afterbreak>A\b</afterbreak>
@@ -1126,6 +1134,14 @@
1126
1134
  <beforebreak>\bU\.</beforebreak>
1127
1135
  <afterbreak>[SK]\b</afterbreak>
1128
1136
  </rule>
1137
+ <rule break="no"><!-- I.S (no dot at end) -->
1138
+ <beforebreak>\bI\.</beforebreak>
1139
+ <afterbreak>S\b</afterbreak>
1140
+ </rule>
1141
+ <rule break="no"><!-- M.Z (no dot at end) -->
1142
+ <beforebreak>\bM\.</beforebreak>
1143
+ <afterbreak>Z\b</afterbreak>
1144
+ </rule>
1129
1145
  <rule break="no"><!-- URLs without "www."-->
1130
1146
  <beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
1131
1147
  <afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
@@ -1135,96 +1151,96 @@
1135
1151
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl|be|dev|co|fr|dk|se)(\.|\b)</afterbreak>
1136
1152
  </rule>
1137
1153
  <rule break="no"><!-- No. 5 -->
1138
- <beforebreak>\b[nN]o\.\s</beforebreak>
1154
+ <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
1139
1155
  <afterbreak>\p{N}</afterbreak>
1140
1156
  </rule>
1141
1157
  <rule break="no"><!-- Ph.D. -->
1142
- <beforebreak>\bP[Hh]\.\s?</beforebreak>
1158
+ <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
1143
1159
  <afterbreak>D\.?</afterbreak>
1144
1160
  </rule>
1145
1161
  <rule break="no"><!-- min. -->
1146
- <beforebreak>\b([Ee]d|pp|[Vv]iz|i\.?\s*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Mm]in|max|[Gg]ovt|lb|lbf|ft|c\.?\s*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.\s</beforebreak>
1162
+ <beforebreak>\b([Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Mm]in|max|[Gg]ovt|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(?:oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
1147
1163
  <afterbreak>[^\p{Lu}]|I</afterbreak>
1148
1164
  </rule>
1149
1165
  <rule break="no"><!-- hr. -->
1150
- <beforebreak>\b(hr)\.\s</beforebreak>
1166
+ <beforebreak>\b(hr)\.[\s\u00A0]</beforebreak>
1151
1167
  <afterbreak>[^\p{Lu}]|I</afterbreak>
1152
1168
  </rule>
1153
1169
  <rule break="no"><!-- Fig. 8 -->
1154
- <beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.\s</beforebreak>
1170
+ <beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
1155
1171
  <afterbreak>\p{N}|[IXV]+</afterbreak>
1156
1172
  </rule>
1157
1173
  <rule break="no"><!-- Fig. (8) -->
1158
- <beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.\s</beforebreak>
1174
+ <beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
1159
1175
  <afterbreak>\(\p{N}\)</afterbreak>
1160
1176
  </rule>
1161
1177
  <rule break="no"><!-- I'm (...) great! -->
1162
- <beforebreak>(…|\.\.\.)\s?\)\s</beforebreak>
1178
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
1163
1179
  <afterbreak>[^\p{P}]</afterbreak>
1164
1180
  </rule>
1165
1181
  <rule break="no"><!-- I will work with someone (Chris or ...?). -->
1166
- <beforebreak>(…|\.\.\.)\s?\?\)\s</beforebreak>
1182
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
1167
1183
  <afterbreak>[^\p{P}]</afterbreak>
1168
1184
  </rule>
1169
1185
  <rule break="no"><!-- e.g. -->
1170
- <beforebreak>\be\.g\.\s</beforebreak>
1186
+ <beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
1171
1187
  <afterbreak></afterbreak>
1172
1188
  </rule>
1173
1189
  <rule break="no"><!-- vs. -->
1174
- <beforebreak>\bvs\.\s</beforebreak>
1190
+ <beforebreak>\bvs\.[\s\u00A0]</beforebreak>
1175
1191
  <afterbreak></afterbreak>
1176
1192
  </rule>
1177
1193
  <rule break="no"><!-- esp. -->
1178
- <beforebreak>\be[sx]p\.\s</beforebreak>
1194
+ <beforebreak>\be[sx]p\.[\s\u00A0]</beforebreak>
1179
1195
  <afterbreak></afterbreak>
1180
1196
  </rule>
1181
1197
  <!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
1182
1198
  <rule break="no"><!-- Etc. -->
1183
- <beforebreak>\b[Ee]tc\.\s</beforebreak>
1199
+ <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
1184
1200
  <afterbreak>[^\p{Lu}]</afterbreak>
1185
1201
  </rule>
1186
1202
  <rule break="no"><!-- BTW (by the way) -->
1187
- <beforebreak>\b([Bb]tw|BTW)\.\s</beforebreak>
1203
+ <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
1188
1204
  <afterbreak></afterbreak>
1189
1205
  </rule>
1190
1206
  <rule break="no">
1191
- <beforebreak>\bJan\.\s</beforebreak>
1207
+ <beforebreak>\bJan\.[\s\u00A0]</beforebreak>
1192
1208
  <afterbreak></afterbreak>
1193
1209
  </rule>
1194
1210
  <rule break="no">
1195
- <beforebreak>\bFeb\.\s</beforebreak>
1211
+ <beforebreak>\bFeb\.[\s\u00A0]</beforebreak>
1196
1212
  <afterbreak></afterbreak>
1197
1213
  </rule>
1198
1214
  <rule break="no">
1199
- <beforebreak>\bMar\.\s</beforebreak>
1215
+ <beforebreak>\bMar\.[\s\u00A0]</beforebreak>
1200
1216
  <afterbreak></afterbreak>
1201
1217
  </rule>
1202
1218
  <rule break="no">
1203
- <beforebreak>\bApr\.\s</beforebreak>
1219
+ <beforebreak>\bApr\.[\s\u00A0]</beforebreak>
1204
1220
  <afterbreak></afterbreak>
1205
1221
  </rule>
1206
1222
  <rule break="no">
1207
- <beforebreak>\bJu[nl]\.\s</beforebreak>
1223
+ <beforebreak>\bJu[nl]\.[\s\u00A0]</beforebreak>
1208
1224
  <afterbreak></afterbreak>
1209
1225
  </rule>
1210
1226
  <rule break="no">
1211
- <beforebreak>\bAug\.\s</beforebreak>
1227
+ <beforebreak>\bAug\.[\s\u00A0]</beforebreak>
1212
1228
  <afterbreak></afterbreak>
1213
1229
  </rule>
1214
1230
  <rule break="no">
1215
- <beforebreak>\bSept?\.\s</beforebreak>
1231
+ <beforebreak>\bSept?\.[\s\u00A0]</beforebreak>
1216
1232
  <afterbreak></afterbreak>
1217
1233
  </rule>
1218
1234
  <rule break="no">
1219
- <beforebreak>\bOct\.\s</beforebreak>
1235
+ <beforebreak>\bOct\.[\s\u00A0]</beforebreak>
1220
1236
  <afterbreak></afterbreak>
1221
1237
  </rule>
1222
1238
  <rule break="no">
1223
- <beforebreak>\bNov\.\s</beforebreak>
1239
+ <beforebreak>\bNov\.[\s\u00A0]</beforebreak>
1224
1240
  <afterbreak></afterbreak>
1225
1241
  </rule>
1226
1242
  <rule break="no">
1227
- <beforebreak>\bDec\.\s</beforebreak>
1243
+ <beforebreak>\bDec\.[\s\u00A0]</beforebreak>
1228
1244
  <afterbreak></afterbreak>
1229
1245
  </rule>
1230
1246
  <rule break="no">
@@ -1236,43 +1252,43 @@
1236
1252
  <afterbreak>3|Buzz|Crozz</afterbreak>
1237
1253
  </rule>
1238
1254
  <rule break="no"><!-- Ph.D. (see rule PH_D) -->
1239
- <beforebreak>\bP[Hh]\.?\s?[Dd]\.\s</beforebreak>
1255
+ <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
1240
1256
  <afterbreak></afterbreak>
1241
1257
  </rule>
1242
1258
  <rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
1243
- <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.\s</beforebreak>
1259
+ <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
1244
1260
  <afterbreak></afterbreak>
1245
1261
  </rule>
1246
1262
  <rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
1247
- <beforebreak>\bLL\.\s?[BM]\.\s</beforebreak>
1263
+ <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
1248
1264
  <afterbreak></afterbreak>
1249
1265
  </rule>
1250
1266
  <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
1251
- <beforebreak>\b[BM]\.\s?</beforebreak>
1267
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1252
1268
  <afterbreak>Eng\.?</afterbreak>
1253
1269
  </rule>
1254
1270
  <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
1255
- <beforebreak>\bLL\.\s?</beforebreak>
1271
+ <beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
1256
1272
  <afterbreak>[BM]\.?</afterbreak>
1257
1273
  </rule>
1258
1274
  <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
1259
- <beforebreak>\b[BM]\.\s?</beforebreak>
1275
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1260
1276
  <afterbreak>Sc\.?</afterbreak>
1261
1277
  </rule>
1262
1278
  <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
1263
- <beforebreak>\b[BM]\.\s?</beforebreak>
1279
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1264
1280
  <afterbreak>Comp?\.?</afterbreak>
1265
1281
  </rule>
1266
1282
  <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
1267
- <beforebreak>\b[BM]\.\s?</beforebreak>
1283
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1268
1284
  <afterbreak>Arch\.?</afterbreak>
1269
1285
  </rule>
1270
1286
  <rule break="no">
1271
- <beforebreak>\b[BM]\.?\s?(Sc|Eng|Comp|Arch)\.\s</beforebreak>
1287
+ <beforebreak>\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0]</beforebreak>
1272
1288
  <afterbreak></afterbreak>
1273
1289
  </rule>
1274
1290
  <rule break="no">
1275
- <beforebreak>\bet\b\s\bal\.\s</beforebreak>
1291
+ <beforebreak>\bet\b[\s\u00A0]\bal\.[\s\u00A0]</beforebreak>
1276
1292
  <afterbreak></afterbreak>
1277
1293
  </rule>
1278
1294
  <rule break="no">
@@ -1280,51 +1296,51 @@
1280
1296
  <afterbreak></afterbreak>
1281
1297
  </rule>
1282
1298
  <rule break="no">
1283
- <beforebreak>\b(Atty|Sg?t|[SG]en|Ft|Gov|Hon|Prof|Mr?s|Mt|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.\s</beforebreak>
1299
+ <beforebreak>\b(Atty|Sg?t|[SG]en|Ft|Gov|Hon|Prof|Mr?s|Mt|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.[\s\u00A0]</beforebreak>
1284
1300
  <afterbreak></afterbreak>
1285
1301
  </rule>
1286
1302
  <rule break="no">
1287
- <beforebreak>\b(Atty|Sg?t|[SG]en|Gov|Hon|Prof|Mr?s|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.\s[A-Z]\.\s</beforebreak>
1303
+ <beforebreak>\b(Atty|Sg?t|[SG]en|Gov|Hon|Prof|Mr?s|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.[\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
1288
1304
  <afterbreak></afterbreak>
1289
1305
  </rule>
1290
1306
  <rule break="no">
1291
- <beforebreak>\b(Drs|Messrs|Mmes)\.\s</beforebreak>
1292
- <afterbreak>(and\s)|\p{Lu}\p{Ll}+</afterbreak>
1307
+ <beforebreak>\b(Drs|Messrs|Mmes)\.[\s\u00A0]</beforebreak>
1308
+ <afterbreak>(and[\s\u00A0])|\p{Lu}\p{Ll}+</afterbreak>
1293
1309
  </rule>
1294
1310
  <rule break="no">
1295
- <beforebreak>\bcf\.\s</beforebreak>
1311
+ <beforebreak>\bcf\.[\s\u00A0]</beforebreak>
1296
1312
  <afterbreak></afterbreak>
1297
1313
  </rule>
1298
1314
  <rule break="no">
1299
- <beforebreak>\bI(nc|NC)\.\s</beforebreak>
1315
+ <beforebreak>\bI(nc|NC)\.[\s\u00A0]</beforebreak>
1300
1316
  <afterbreak></afterbreak>
1301
1317
  </rule>
1302
1318
  <rule break="no">
1303
- <beforebreak>\bCorp\.\s</beforebreak>
1319
+ <beforebreak>\bCorp\.[\s\u00A0]</beforebreak>
1304
1320
  <afterbreak></afterbreak>
1305
1321
  </rule>
1306
1322
  <rule break="no">
1307
- <beforebreak>\bBros\.\s</beforebreak>
1323
+ <beforebreak>\bBros\.[\s\u00A0]</beforebreak>
1308
1324
  <afterbreak></afterbreak>
1309
1325
  </rule>
1310
1326
  <rule break="no">
1311
- <beforebreak>\bDist\.\s</beforebreak>
1327
+ <beforebreak>\bDist\.[\s\u00A0]</beforebreak>
1312
1328
  <afterbreak></afterbreak>
1313
1329
  </rule>
1314
1330
  <rule break="no">
1315
- <beforebreak>\bCo\.\s</beforebreak>
1331
+ <beforebreak>\bCo\.[\s\u00A0]</beforebreak>
1316
1332
  <afterbreak></afterbreak>
1317
1333
  </rule>
1318
1334
  <rule break="no">
1319
- <beforebreak>\bo'clock\s</beforebreak>
1335
+ <beforebreak>\bo'clock[\s\u00A0]</beforebreak>
1320
1336
  <afterbreak></afterbreak>
1321
1337
  </rule>
1322
1338
  <rule break="no">
1323
- <beforebreak>\bfo'c'sle\s</beforebreak>
1339
+ <beforebreak>\bfo'c'sle[\s\u00A0]</beforebreak>
1324
1340
  <afterbreak></afterbreak>
1325
1341
  </rule>
1326
1342
  <rule break="no">
1327
- <beforebreak>\bLtd\.\s</beforebreak>
1343
+ <beforebreak>\bLtd\.[\s\u00A0]</beforebreak>
1328
1344
  <afterbreak>\p{Ll}+</afterbreak>
1329
1345
  </rule>
1330
1346
  <rule break="no">
@@ -1340,35 +1356,35 @@
1340
1356
  <afterbreak>\p{Ll}</afterbreak>
1341
1357
  </rule>
1342
1358
  <rule break="no">
1343
- <beforebreak>["”'’]\s*</beforebreak>
1344
- <afterbreak>\s*\p{Ll}</afterbreak>
1359
+ <beforebreak>["”'’][\s\u00A0]*</beforebreak>
1360
+ <afterbreak>[\s\u00A0]*\p{Ll}</afterbreak>
1345
1361
  </rule>
1346
1362
  <rule break="no">
1347
- <beforebreak>['"„][\.!?…]['"”]\s</beforebreak>
1363
+ <beforebreak>['"„][\.!?…]['"”][\s\u00A0]</beforebreak>
1348
1364
  <afterbreak></afterbreak>
1349
1365
  </rule>
1350
1366
  <rule break="no">
1351
- <beforebreak>\b\p{L}\.\s</beforebreak>
1352
- <afterbreak>\p{L}\.\s</afterbreak>
1367
+ <beforebreak>\b\p{L}\.[\s\u00A0]</beforebreak>
1368
+ <afterbreak>\p{L}\.[\s\u00A0]</afterbreak>
1353
1369
  </rule>
1354
1370
  <rule break="no">
1355
1371
  <beforebreak>\b\p{L}\.</beforebreak>
1356
1372
  <afterbreak>\p{L}\.</afterbreak>
1357
1373
  </rule>
1358
1374
  <rule break="no"><!-- Jones v. Smith -->
1359
- <beforebreak>\p{Lu}\p{L}+\sv\.\s</beforebreak>
1375
+ <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
1360
1376
  <afterbreak>\p{Lu}\p{L}+</afterbreak>
1361
1377
  </rule>
1362
1378
  <rule break="yes">
1363
- <beforebreak>[^,][\s]\p{L}{2}\.\s</beforebreak>
1364
- <afterbreak>\p{N}+\)\s</afterbreak>
1379
+ <beforebreak>[^,][\s\u00A0]\p{L}{2}\.[\s\u00A0]</beforebreak>
1380
+ <afterbreak>\p{N}+\)[\s\u00A0]</afterbreak>
1365
1381
  </rule>
1366
1382
  <rule break="yes">
1367
- <beforebreak>\bOK\.\s</beforebreak>
1383
+ <beforebreak>\bOK\.[\s\u00A0]</beforebreak>
1368
1384
  <afterbreak>\p{Ll}+</afterbreak>
1369
1385
  </rule>
1370
1386
  <rule break="no">
1371
- <beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak>
1387
+ <beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak><!-- not 'no'/'in', these could be abbreviations-->
1372
1388
  <afterbreak>[\p{N}\p{Ll}]</afterbreak>
1373
1389
  </rule>
1374
1390
  <rule break="no">
@@ -1376,35 +1392,35 @@
1376
1392
  <afterbreak>[^\p{Lu}]</afterbreak>
1377
1393
  </rule>
1378
1394
  <rule break="no">
1379
- <beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
1395
+ <beforebreak>\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0]</beforebreak>
1380
1396
  <afterbreak></afterbreak>
1381
1397
  </rule>
1382
1398
  <rule break="no">
1383
- <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
1399
+ <beforebreak>\b\p{Lu}\.\p{Lu}\.[\s\u00A0]</beforebreak>
1384
1400
  <afterbreak></afterbreak>
1385
1401
  </rule>
1386
1402
  <rule break="no">
1387
- <beforebreak>[^\.]\s[A-Z]\.\s</beforebreak>
1403
+ <beforebreak>[^\.][\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
1388
1404
  <afterbreak></afterbreak>
1389
1405
  </rule>
1390
1406
  <rule break="no">
1391
- <beforebreak>\b(:?Blvd|Ave|Mts?)\.\s</beforebreak>
1407
+ <beforebreak>\b(?:Blvd|Ave|Mts?)\.[\s\u00A0]</beforebreak>
1392
1408
  <afterbreak>\p{Ll}+</afterbreak>
1393
1409
  </rule>
1394
1410
  <rule break="no">
1395
- <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.\s</beforebreak>
1411
+ <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0]</beforebreak>
1396
1412
  <afterbreak>\p{Ll}+</afterbreak>
1397
1413
  </rule>
1398
1414
  <rule break="no">
1399
- <beforebreak>\(\p{Ll}+\.\s</beforebreak>
1415
+ <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
1400
1416
  <afterbreak></afterbreak>
1401
1417
  </rule>
1402
1418
  <rule break="no"><!-- i.e. -->
1403
- <beforebreak>i\.e\.\s</beforebreak><!-- "i.e." is never at end of sentence -->
1419
+ <beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
1404
1420
  <afterbreak></afterbreak>
1405
1421
  </rule>
1406
1422
  <rule break="yes">
1407
- <beforebreak>[\.!?…][\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*\s</beforebreak>
1423
+ <beforebreak>[\.!?…][\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*[\s\u00A0]</beforebreak>
1408
1424
  <afterbreak></afterbreak>
1409
1425
  </rule>
1410
1426
  <rule break="yes">
@@ -1412,7 +1428,7 @@
1412
1428
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
1413
1429
  </rule>
1414
1430
  <rule break="yes">
1415
- <beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
1431
+ <beforebreak>[\s\u00A0]\p{L}[\.!?…][\s\u00A0]</beforebreak>
1416
1432
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
1417
1433
  </rule>
1418
1434
  </languagerule>
@@ -1511,6 +1527,16 @@
1511
1527
  </rule>
1512
1528
  </languagerule>
1513
1529
  <languagerule languagerulename="Dutch">
1530
+ <rule break="no">
1531
+ <!-- sp.a -->
1532
+ <beforebreak>\b(sp|SP)</beforebreak>
1533
+ <afterbreak>\.[aA]\b</afterbreak>
1534
+ </rule>
1535
+ <rule break="no">
1536
+ <!-- .Net -->
1537
+ <beforebreak>\s[.]</beforebreak>
1538
+ <afterbreak>[Nn][Ee][Tt](\b|-)</afterbreak>
1539
+ </rule>
1514
1540
  <rule break="no"><!-- quoted sentence in sentence -->
1515
1541
  <beforebreak>[.?!][’'"]</beforebreak>
1516
1542
  <afterbreak> [a-z]</afterbreak>
@@ -1524,11 +1550,11 @@
1524
1550
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
1525
1551
  </rule>
1526
1552
  <rule break="no">
1527
- <beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|([Pp]rof\.)?[Dd]r|Em|Fa|Kon)\.\s</beforebreak>
1553
+ <beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|([Pp]rof\.)?[Dd]r|Em|Fa|Kon|Bros)\.\s</beforebreak>
1528
1554
  <afterbreak></afterbreak>
1529
1555
  </rule>
1530
1556
  <rule break="no">
1531
- <beforebreak>\b(Mej|Mevr|Mgr|Mw|Ndl|Ned|Nl|No|Prof|Secr|Chr)\.\s</beforebreak>
1557
+ <beforebreak>\b(Mej|Mevr|Mgr|Mw|Ndl|Ned|Nl|No|Prof|Secr|Chr|Jac)\.\s</beforebreak>
1532
1558
  <afterbreak></afterbreak>
1533
1559
  </rule>
1534
1560
  <rule break="no">
@@ -1668,12 +1694,28 @@
1668
1694
  <beforebreak>\bprof\.\s</beforebreak>
1669
1695
  <afterbreak></afterbreak>
1670
1696
  </rule>
1697
+ <rule break="no">
1698
+ <beforebreak>[.!?…][’'"]\s</beforebreak>
1699
+ <afterbreak>[a-z]</afterbreak>
1700
+ </rule>
1701
+ <rule break="no">
1702
+ <beforebreak>[.][.]\s</beforebreak>
1703
+ <afterbreak>[a-z]</afterbreak>
1704
+ </rule>
1705
+ <rule break="no">
1706
+ <beforebreak>SP[.]</beforebreak>
1707
+ <afterbreak>A</afterbreak>
1708
+ </rule>
1709
+ <rule break="no">
1710
+ <beforebreak>Warner Bros\.</beforebreak>
1711
+ <afterbreak>[a-z]</afterbreak>
1712
+ </rule>
1671
1713
  <rule break="yes">
1672
- <beforebreak>[\.!?…][\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002¹²³]*\s</beforebreak>
1714
+ <beforebreak>[\.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002¹²³]*\s</beforebreak>
1673
1715
  <afterbreak></afterbreak>
1674
1716
  </rule>
1675
1717
  <rule break="yes">
1676
- <beforebreak>[\.!?…]['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]*</beforebreak>
1718
+ <beforebreak>[\.!?…]['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]*</beforebreak>
1677
1719
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
1678
1720
  </rule>
1679
1721
  <rule break="yes">
@@ -1705,6 +1747,40 @@
1705
1747
  <beforebreak>\bmax\.\s</beforebreak>
1706
1748
  <afterbreak>\p{Ll}</afterbreak>
1707
1749
  </rule>
1750
+ <rule break="yes">
1751
+ <beforebreak>[?!.]['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]\s</beforebreak>
1752
+ <afterbreak>[A-Z][a-z]</afterbreak>
1753
+ </rule>
1754
+ <rule break="yes">
1755
+ <beforebreak>[?!.]\s</beforebreak>
1756
+ <afterbreak>['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002][A-Z][a-z]</afterbreak>
1757
+ </rule>
1758
+ <rule break="no">
1759
+ <!-- "E. coli etc. -->
1760
+ <beforebreak>"[A-Z][.]\s</beforebreak>
1761
+ <afterbreak>[a-z]</afterbreak>
1762
+ </rule>
1763
+ <rule break="no">
1764
+ <!-- Cornelisz. -->
1765
+ <beforebreak>[A-Z][a-z].*sz[.]\s</beforebreak>
1766
+ <afterbreak>[a-z]</afterbreak>
1767
+ </rule>
1768
+ <rule break="no">
1769
+ <!-- De n. XIV/vagus (nervus) -->
1770
+ <beforebreak>De n[.]\s</beforebreak>
1771
+ <afterbreak>[a-z]|[XIV]</afterbreak>
1772
+ </rule>
1773
+ <rule break="no">
1774
+ <!-- MOL.E -->
1775
+ <beforebreak>[A-Z]{2,5}[.]</beforebreak>
1776
+ <afterbreak>[A-Z]</afterbreak>
1777
+ </rule>
1778
+ <rule break="no">
1779
+ <!-- ..." betekent -->
1780
+ <beforebreak>\.\.</beforebreak>
1781
+ <afterbreak>" [a-z]</afterbreak>
1782
+ </rule>
1783
+ <!-- ##### end of Dutch #### -->
1708
1784
  </languagerule>
1709
1785
  <languagerule languagerulename="Slovak">
1710
1786
  <rule break="no">
@@ -4263,7 +4339,7 @@
4263
4339
  <rule break="no">
4264
4340
  <beforebreak>\b(бульв|г|д|доп|др|е|зам|Зам|и|им|инд|исп|Исп)\.\s</beforebreak>
4265
4341
  <afterbreak></afterbreak>
4266
- </rule>
4342
+ </rule>
4267
4343
  <rule break="no">
4268
4344
  <beforebreak>\b(англ|в|вв|га|гг|гл|гос|грн|дм|долл|е|ед)\.\s</beforebreak>
4269
4345
  <afterbreak>\p{Ll}</afterbreak>
@@ -4531,146 +4607,146 @@
4531
4607
  </languagerule>
4532
4608
  <languagerule languagerulename="Catalan">
4533
4609
  <rule break="no">
4534
- <beforebreak>Yahoo!\s</beforebreak>
4610
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
4535
4611
  <afterbreak>\p{Ll}</afterbreak>
4536
4612
  </rule>
4537
4613
  <rule break="yes">
4538
- <beforebreak>\w['’][nNtT]\.\s</beforebreak>
4614
+ <beforebreak>\w['’][nNtT]\.[\s\u00A0]</beforebreak>
4539
4615
  <afterbreak></afterbreak>
4540
4616
  </rule>
4541
4617
  <rule break="yes">
4542
- <beforebreak>\.\[\d+\]\s</beforebreak>
4618
+ <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
4543
4619
  <afterbreak></afterbreak>
4544
4620
  </rule>
4545
4621
  <!-- initials: A. C. Jones. Problem: [...] d'Alfons I. Ell era [...] -->
4546
4622
  <rule break="no">
4547
- <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.\s</beforebreak>
4623
+ <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
4548
4624
  <afterbreak></afterbreak>
4549
4625
  </rule>
4550
4626
  <!-- Abbreviations that cannot finish sentences-->
4551
4627
  <rule break="no">
4552
- <beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.\s</beforebreak>
4628
+ <beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
4553
4629
  <afterbreak></afterbreak>
4554
4630
  </rule>
4555
4631
  <!-- Abbreviations that can finish sentences -->
4556
4632
  <rule break="no">
4557
- <beforebreak>\bs\.\s</beforebreak>
4633
+ <beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak>
4558
4634
  <afterbreak>[XIV]+\b</afterbreak>
4559
4635
  </rule>
4560
4636
  <rule break="no">
4561
- <beforebreak>\b(min|m)\.\s</beforebreak>
4637
+ <beforebreak>\b(min|m|ca)\.[\s\u00A0]</beforebreak>
4562
4638
  <afterbreak>[0-9]+\b</afterbreak>
4563
4639
  </rule>
4564
4640
  <rule break="no">
4565
- <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.\s</beforebreak>
4641
+ <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.[\s\u00A0]</beforebreak>
4566
4642
  <afterbreak>[XIV\d]+\b</afterbreak>
4567
4643
  </rule>
4568
4644
  <rule break="no">
4569
- <beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4645
+ <beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4570
4646
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4571
4647
  </rule>
4572
4648
  <!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
4573
4649
  <rule break="no">
4574
- <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4650
+ <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4575
4651
  <afterbreak>\p{Ll}</afterbreak>
4576
4652
  </rule>
4577
4653
  <!-- Any word in acronyms like EE.UU. or BB. DD. -->
4578
4654
  <rule break="no">
4579
- <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4655
+ <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4580
4656
  <afterbreak>\p{Ll}</afterbreak>
4581
4657
  </rule>
4582
4658
  <rule break="no">
4583
- <beforebreak>\bEE\.\s?</beforebreak>
4584
- <afterbreak>UU</afterbreak>
4659
+ <beforebreak>\b\p{Lu}{2}\.[\s\u00A0]?</beforebreak>
4660
+ <afterbreak>\p{Lu}{2}</afterbreak>
4585
4661
  </rule>
4586
4662
  <rule break="no">
4587
- <beforebreak>EE\.\s?UU\.\s?</beforebreak>
4663
+ <beforebreak>EE\.[\s\u00A0]?UU\.[\s\u00A0]?</beforebreak>
4588
4664
  <afterbreak>\p{Ll}</afterbreak>
4589
4665
  </rule>
4590
4666
  <!-- max min etc -->
4591
4667
  <rule break="no">
4592
- <beforebreak>\b([Ee]tc|m[aáà]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4668
+ <beforebreak>\b([Ee]tc|m[aáà]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4593
4669
  <afterbreak>\p{Ll}</afterbreak>
4594
4670
  </rule>
4595
4671
  <!-- Composed abbrev. -->
4596
4672
  <rule break="no">
4597
- <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4673
+ <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4598
4674
  <afterbreak></afterbreak>
4599
4675
  </rule>
4600
4676
  <!-- Units -->
4601
4677
  <rule break="no">
4602
- <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4678
+ <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4603
4679
  <afterbreak>\p{Ll}</afterbreak>
4604
4680
  </rule>
4605
4681
  <!-- Ellipsis: ... lowercase -->
4606
4682
  <rule break="no">
4607
- <beforebreak>[^\s](\Q...\E|…)\s</beforebreak>
4683
+ <beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak>
4608
4684
  <afterbreak>\p{Ll}</afterbreak>
4609
4685
  </rule>
4610
4686
  <!-- (enum...) -->
4611
4687
  <rule break="no">
4612
- <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”]\s</beforebreak>
4688
+ <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
4613
4689
  <afterbreak>\p{Ll}</afterbreak>
4614
4690
  </rule>
4615
4691
  <!-- pero ¡ah! no estaba
4616
4692
  <rule break="no">
4617
- <beforebreak>\b¡\p{L}+!\s</beforebreak>
4693
+ <beforebreak>\b¡\p{L}+![\s\u00A0]</beforebreak>
4618
4694
  <afterbreak>\p{Ll}</afterbreak>
4619
4695
  </rule>
4620
4696
  -->
4621
4697
  <rule break="yes">
4622
- <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*\s</beforebreak>
4698
+ <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*[\s\u00A0]</beforebreak>
4623
4699
  <afterbreak></afterbreak>
4624
4700
  </rule>
4625
4701
  <rule break="yes">
4626
- <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+\s</beforebreak>
4702
+ <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0]</beforebreak>
4627
4703
  <afterbreak>[¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
4628
4704
  </rule>
4629
4705
  <!-- paragraphs with opening "»" in dialogs-->
4630
4706
  <rule break="yes">
4631
- <beforebreak>[\.:!?…»]+\s</beforebreak>
4632
- <afterbreak>»[^\s\.:!?…]</afterbreak>
4707
+ <beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak>
4708
+ <afterbreak>»[^\u00A0\s\.:!?…]</afterbreak>
4633
4709
  </rule>
4634
4710
  </languagerule>
4635
4711
  <languagerule languagerulename="Spanish">
4636
4712
  <rule break="no">
4637
- <beforebreak>Yahoo!\s</beforebreak>
4713
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
4638
4714
  <afterbreak>\p{Ll}</afterbreak>
4639
4715
  </rule>
4640
4716
  <rule break="yes">
4641
- <beforebreak>\.\[\d+\]\s</beforebreak>
4717
+ <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
4642
4718
  <afterbreak></afterbreak>
4643
4719
  </rule>
4644
4720
  <!-- initials: A. C. Jones. Problem: [...] de Alfons I. Él era [...] -->
4645
4721
  <rule break="no">
4646
- <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.\s</beforebreak>
4722
+ <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
4647
4723
  <afterbreak/>
4648
4724
  </rule>
4649
4725
  <!-- Ellipsis: ... lowercase -->
4650
4726
  <rule break="no">
4651
- <beforebreak>[^\s](\Q...\E|…)\s</beforebreak>
4727
+ <beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak>
4652
4728
  <afterbreak>\p{Ll}</afterbreak>
4653
4729
  </rule>
4654
4730
  <!-- (enum...) -->
4655
4731
  <rule break="no">
4656
- <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”]\s</beforebreak>
4732
+ <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
4657
4733
  <afterbreak>\p{Ll}</afterbreak>
4658
4734
  </rule>
4659
4735
  <!-- Abbreviations that can finish sentences -->
4660
4736
  <rule break="no">
4661
- <beforebreak>\bs\.\s</beforebreak>
4737
+ <beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak>
4662
4738
  <afterbreak>[XIV]+\b</afterbreak>
4663
4739
  </rule>
4664
4740
  <rule break="no">
4665
- <beforebreak>\b(min|m)\.\s</beforebreak>
4741
+ <beforebreak>\b(min|m|ca)\.[\s\u00A0]</beforebreak>
4666
4742
  <afterbreak>[0-9]+\b</afterbreak>
4667
4743
  </rule>
4668
4744
  <rule break="no">
4669
- <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.\s</beforebreak>
4745
+ <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.[\s\u00A0]</beforebreak>
4670
4746
  <afterbreak>[XIV\d]+\b</afterbreak>
4671
4747
  </rule>
4672
4748
  <rule break="no">
4673
- <beforebreak>\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4749
+ <beforebreak>\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4674
4750
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4675
4751
  </rule>
4676
4752
  <rule break="no">
@@ -4685,75 +4761,75 @@
4685
4761
  </rule>
4686
4762
  <!-- Abbreviations that cannot finish sentences-->
4687
4763
  <rule break="no">
4688
- <beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.\s</beforebreak>
4764
+ <beforebreak>\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
4689
4765
  <afterbreak/>
4690
4766
  </rule>
4691
4767
  <rule break="no">
4692
- <beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.\s</beforebreak>
4768
+ <beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0]</beforebreak>
4693
4769
  <afterbreak/>
4694
4770
  </rule>
4695
4771
  <rule break="no">
4696
- <beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.\s</beforebreak>
4772
+ <beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0]</beforebreak>
4697
4773
  <afterbreak/>
4698
4774
  </rule>
4699
4775
  <rule break="no">
4700
- <beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.\s</beforebreak>
4776
+ <beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0]</beforebreak>
4701
4777
  <afterbreak/>
4702
4778
  </rule>
4703
4779
  <rule break="no">
4704
- <beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.\s</beforebreak>
4780
+ <beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0]</beforebreak><!-- NOTE(review): the top-level "|" splits this pattern into two alternatives, so the trailing \.[\s\u00A0] belongs only to the [V\.]gr branch and the first group matches without a following dot. Confirm whether (...)\. was intended before regrouping, because "no." can also end a sentence. -->
4705
4781
  <afterbreak/>
4706
4782
  </rule>
4707
4783
  <!-- Abbreviations that can finish sentences -->
4708
4784
  <rule break="no">
4709
- <beforebreak>\b([Ee]ds?|[Cc]oords?|grs?|Sr|Jr|Admón|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4785
+ <beforebreak>\b([Ee]ds?|[Cc]oords?|grs?|Sr|Jr|Admón|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4710
4786
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4711
4787
  </rule>
4712
4788
  <!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
4713
4789
  <rule break="no">
4714
- <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4790
+ <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4715
4791
  <afterbreak>\p{Ll}</afterbreak>
4716
4792
  </rule>
4717
4793
  <!-- Any word in acronyms like EE.UU. or BB. DD. -->
4718
4794
  <rule break="no">
4719
- <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4795
+ <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4720
4796
  <afterbreak>\p{Ll}</afterbreak>
4721
4797
  </rule>
4722
4798
  <rule break="no">
4723
- <beforebreak>\bEE\.\s?</beforebreak>
4724
- <afterbreak>UU</afterbreak>
4799
+ <beforebreak>\b\p{Lu}{2}\.[\s\u00A0]?</beforebreak>
4800
+ <afterbreak>\p{Lu}{2}</afterbreak>
4725
4801
  </rule>
4726
4802
  <rule break="no">
4727
- <beforebreak>EE\.\s?UU\.\s?</beforebreak>
4803
+ <beforebreak>EE\.[\s\u00A0]?UU\.[\s\u00A0]?</beforebreak>
4728
4804
  <afterbreak>\p{Ll}</afterbreak>
4729
4805
  </rule>
4730
4806
  <!-- max min etc -->
4731
4807
  <rule break="no">
4732
- <beforebreak>\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4808
+ <beforebreak>\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4733
4809
  <afterbreak>\p{Ll}</afterbreak>
4734
4810
  </rule>
4735
4811
  <!-- Composed abbrev. -->
4736
4812
  <rule break="no">
4737
- <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4813
+ <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4738
4814
  <afterbreak/>
4739
4815
  </rule>
4740
4816
  <!-- Units -->
4741
4817
  <rule break="no">
4742
- <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4818
+ <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4743
4819
  <afterbreak>\p{Ll}</afterbreak>
4744
4820
  </rule>
4745
4821
  <rule break="yes">
4746
- <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*\s</beforebreak>
4822
+ <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*[\s\u00A0]</beforebreak>
4747
4823
  <afterbreak></afterbreak>
4748
4824
  </rule>
4749
4825
  <rule break="yes">
4750
- <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+\s</beforebreak>
4826
+ <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0]</beforebreak>
4751
4827
  <afterbreak>[¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
4752
4828
  </rule>
4753
4829
  <!-- paragraphs with opening "»" in dialogs-->
4754
4830
  <rule break="yes">
4755
- <beforebreak>[\.:!?…»]+\s</beforebreak>
4756
- <afterbreak>»[^\s\.:!?…]</afterbreak>
4831
+ <beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak>
4832
+ <afterbreak>»[^\u00A0\s\.:!?…]</afterbreak>
4757
4833
  </rule>
4758
4834
  </languagerule>
4759
4835
  <languagerule languagerulename="German">
@@ -4767,17 +4843,17 @@
4767
4843
  </rule>
4768
4844
  <!--support simple lists in markdown style-->
4769
4845
  <rule break="yes">
4770
- <beforebreak>\r?\n\s*[-*]+\s</beforebreak>
4846
+ <beforebreak>\r?\n[\u00A0\s]*[-*]+[\u00A0\s]</beforebreak>
4771
4847
  <afterbreak></afterbreak>
4772
4848
  </rule>
4773
4849
  <!-- Split at e.g. "1a. Und ..." -->
4774
4850
  <rule break="yes">
4775
- <beforebreak>\d+[a-z]\.\s</beforebreak>
4851
+ <beforebreak>\d+[a-z]\.[\u00A0\s]</beforebreak>
4776
4852
  <afterbreak>\p{Lu}</afterbreak>
4777
4853
  </rule>
4778
4854
  <!-- Don't split at e.g. "d. h." -->
4779
4855
  <rule break="no">
4780
- <beforebreak>[^-\p{L}'’/]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?\s</beforebreak>
4856
+ <beforebreak>[^-\p{L}'’/]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak><!-- NOTE(review): inside [...] the "|" is a literal pipe, not alternation, so this optional class also accepts "|" after the punctuation mark; confirm whether that is intended. -->
4781
4857
  <afterbreak></afterbreak>
4782
4858
  </rule>
4783
4859
  <rule break="no">
@@ -4801,7 +4877,7 @@
4801
4877
  <afterbreak>3|Buzz|Crozz</afterbreak>
4802
4878
  </rule>
4803
4879
  <rule break="no">
4804
- <beforebreak>[1-3]\.\s</beforebreak>
4880
+ <beforebreak>[1-3]\.[\u00A0\s]</beforebreak>
4805
4881
  <afterbreak>Liga|Bundesliga|Fußball(-B|b)undesliga</afterbreak>
4806
4882
  </rule>
4807
4883
  <rule break="no">
@@ -4816,126 +4892,126 @@
4816
4892
  <!-- Don't split after a white-space followed by a single letter followed
4817
4893
  by a dot followed by another whitespace. e.g. " p. " -->
4818
4894
  <rule break="no">
4819
- <beforebreak>\s\p{L}\.\s</beforebreak>
4895
+ <beforebreak>[\u00A0\s]\p{L}\.[\u00A0\s]</beforebreak>
4820
4896
  <afterbreak>\p{L}\.</afterbreak>
4821
4897
  </rule>
4822
4898
  <!-- Don't split at "bla bla... yada yada" -->
4823
4899
  <rule break="no">
4824
- <beforebreak>[\[\(]?\.\.\.[\]\)]?\s</beforebreak>
4900
+ <beforebreak>[\[\(]?\.\.\.[\]\)]?[\u00A0\s]</beforebreak>
4825
4901
  <afterbreak>\p{Ll}</afterbreak>
4826
4902
  </rule>
4827
4903
  <!-- Don't split [.?!] when they're quoted -->
4828
4904
  <rule break="no">
4829
- <beforebreak>['"„][\.!?…]['"“]\s</beforebreak>
4905
+ <beforebreak>['"„][\.!?…]['"“][\u00A0\s]</beforebreak>
4830
4906
  <afterbreak></afterbreak>
4831
4907
  </rule>
4832
4908
  <!-- Don't break after quote unless there's a capital letter
4833
4909
  e.g.: "That's right!" he said. -->
4834
4910
  <rule break="no">
4835
- <beforebreak>["'“]\s</beforebreak>
4911
+ <beforebreak>["'“][\u00A0\s]</beforebreak>
4836
4912
  <afterbreak>\p{Ll}</afterbreak>
4837
4913
  </rule>
4838
4914
  <!-- e.g. "Das ist . so." - assume one sentence. -->
4839
4915
  <rule break="no">
4840
- <beforebreak>\s([\.!?]{1,3}|…)['|"|“|«|\)|\]|\}]?\s</beforebreak>
4916
+ <beforebreak>[\u00A0\s]([\.!?]{1,3}|…)['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
4841
4917
  <afterbreak></afterbreak>
4842
4918
  </rule>
4843
4919
  <!-- Numbers, dates e.g. "3.10. datiert" -->
4844
4920
  <rule break="no">
4845
- <beforebreak>\b\d+\.\s</beforebreak>
4921
+ <beforebreak>\b\d+\.[\u00A0\s]</beforebreak>
4846
4922
  <afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
4847
4923
  </rule>
4848
4924
  <!-- z.B. "Das hier ist ein(!) Satz." -->
4849
4925
  <rule break="no">
4850
- <beforebreak>[\(\[][!?]{1,3}[\]\)]\s</beforebreak>
4926
+ <beforebreak>[\(\[][!?]{1,3}[\]\)][\u00A0\s]</beforebreak>
4851
4927
  <afterbreak></afterbreak>
4852
4928
  </rule>
4853
4929
  <!-- z.B. "Das hier ist (genau!) ein Satz." -->
4854
4930
  <rule break="no">
4855
- <beforebreak>[!?]{1,3}[\)\]]\s</beforebreak>
4931
+ <beforebreak>[!?]{1,3}[\)\]][\u00A0\s]</beforebreak>
4856
4932
  <afterbreak></afterbreak>
4857
4933
  </rule>
4858
4934
  <!-- z.B. "bla (...) blubb" -> kein Satzende -->
4859
4935
  <rule break="no">
4860
- <beforebreak>[\(\)\[\]]\s</beforebreak>
4936
+ <beforebreak>[\(\)\[\]][\u00A0\s]</beforebreak>
4861
4937
  <afterbreak></afterbreak>
4862
4938
  </rule>
4863
4939
  <!-- don't split at cases like "Friedrich II. wird auch..." -->
4864
4940
  <rule break="no">
4865
- <beforebreak>[\s ][IVX]+\.\s</beforebreak>
4941
+ <beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]</beforebreak>
4866
4942
  <afterbreak>[^\p{Lu}]+</afterbreak>
4867
4943
  </rule>
4868
4944
  <!-- don't split at cases like "im 13. oder 14. Jahrhundert" -->
4869
4945
  <rule break="no">
4870
- <beforebreak>\d+\.\s</beforebreak>
4871
- <afterbreak>(und|oder|bis)\s</afterbreak>
4946
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4947
+ <afterbreak>(und|oder|bis)[\u00A0\s]</afterbreak>
4872
4948
  </rule>
4873
4949
  <!-- einige deutsche Monate, vor denen eine Zahl erscheinen kann,
4874
- ohne dass eine Satzgrenze erkannt wird
4950
+ ohne dass eine Satzgrenze erkannt wird
4875
4951
  (z.B. "am 13. Dezember" -> keine Satzgrenze) -->
4876
4952
  <rule break="no">
4877
- <beforebreak>\d+\.\s</beforebreak>
4953
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4878
4954
  <afterbreak>Januar|Jänner|Februar|März|Merz|April|Mai|Ju[ln]i|August|September|Oktober|November|Dezember</afterbreak>
4879
4955
  </rule>
4880
4956
  <rule break="no">
4881
- <beforebreak>\d+\.\s</beforebreak>
4957
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4882
4958
  <afterbreak>J[aä]n|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez</afterbreak>
4883
4959
  </rule>
4884
4960
  <rule break="no">
4885
- <beforebreak>(Jan|Jän|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez)\.\s</beforebreak>
4961
+ <beforebreak>(Jan|Jän|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez)\.[\u00A0\s]</beforebreak>
4886
4962
  <afterbreak>\d\d(\d\d)?</afterbreak>
4887
4963
  </rule>
4888
4964
  <!-- ähnliche Fälle außerhalb der Monatsnamen -->
4889
4965
  <rule break="no">
4890
- <beforebreak>\d+\.\s</beforebreak>
4966
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4891
4967
  <afterbreak>Amtsperiode|Breitengrads?|Breitengrades|Jubiläum|Jhd?|Jhdts?|Konferenz|(Jahres|Partei)(-K|k)onferenz|Längengrade?s?|Tags?|Tages|(Jahres|Spiel|Partei|Geburts)tag|(Jahres|Spiel|Partei|Geburts)tages|(Jahres|Spiel|Partei|Geburts)tags|Jahrhunderts?|Jahrtausend|Platz|Platzes|Lebensjahrs?|Lebensjahres|Lochs?|Loches|Grads|Grades|Obergeschoss|Stock(werk)?s?|Etage|Klasse|Runde|Bezirk|Etappe|Staffel|Sinfonie</afterbreak>
4892
4968
  </rule>
4893
4969
  <!-- English abbreviations - but these work globally for all languages -->
4894
4970
  <rule break="no">
4895
- <beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd)\.\s</beforebreak>
4971
+ <beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd|Buchst)\.[\u00A0\s]</beforebreak>
4896
4972
  <afterbreak></afterbreak>
4897
4973
  </rule>
4898
4974
  <!-- Latin abbreviations - but these work globally for all languages -->
4899
4975
  <rule break="no">
4900
- <beforebreak>\b(spp?)\.\s</beforebreak>
4976
+ <beforebreak>\b(spp?)\.[\u00A0\s]</beforebreak>
4901
4977
  <afterbreak></afterbreak>
4902
4978
  </rule>
4903
4979
  <!-- German abbreviations -->
4904
4980
  <rule break="no">
4905
- <beforebreak>\b(Mag|mtl|versch|d|Übers|usw|Bzw|bzw|Ab[hkst]|abzgl|Abzw|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|autom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw)\.\s</beforebreak>
4981
+ <beforebreak>\b(ggü|Mag|mtl|versch|d|Übers|usw|Bzw|bzw|Ab[hkst]|abzgl|[Ee]inschl|[Vv]mtl|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|autom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]</beforebreak>
4906
4982
  <afterbreak></afterbreak>
4907
4983
  </rule>
4908
4984
  <rule break="no">
4909
- <beforebreak>\b(cts?|Ca|ca|chem|chin|Chr|cresc|dat|Dat|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|eigtl|Eigtl|Engl|engl|Erg|al|et[cw]|Etw|ev(tl)?|Evtl|exkl|Expl|Exz)\.\s</beforebreak>
4985
+ <beforebreak>\b(cts?|Ca|ca|chem|chin|Chr|cresc|dat|Dat|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|eigtl|Eigtl|eigl|Eigl|akt|Engl|engl|Erg|al|et[cw]|Etw|ev(tl)?|Evtl|Evt|evt|exkl|Expl|Exz)\.[\u00A0\s]</beforebreak>
4910
4986
  <afterbreak></afterbreak>
4911
4987
  </rule>
4912
4988
  <rule break="no">
4913
- <beforebreak>\bDipl\.-[A-Z][a-z]{2,4}\.\s</beforebreak>
4989
+ <beforebreak>\bDipl\.-[A-Z][a-z]{2,4}\.[\u00A0\s]</beforebreak>
4914
4990
  <afterbreak></afterbreak>
4915
4991
  </rule>
4916
4992
  <rule break="no">
4917
- <beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|frz?|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.\s</beforebreak>
4993
+ <beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|frz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]</beforebreak>
4918
4994
  <afterbreak></afterbreak>
4919
4995
  </rule>
4920
4996
  <rule break="no">
4921
- <beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|inkl|incl|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.\s</beforebreak>
4997
+ <beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|inkl|incl|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]</beforebreak>
4922
4998
  <afterbreak></afterbreak>
4923
4999
  </rule>
4924
5000
  <rule break="no">
4925
- <beforebreak>\b(lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.\s</beforebreak>
5001
+ <beforebreak>\b(lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.[\u00A0\s]</beforebreak>
4926
5002
  <afterbreak></afterbreak>
4927
5003
  </rule>
4928
5004
  <rule break="no">
4929
- <beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|Std?|stacc|Str|stud|Subst|sva|svw|sZ)\.\s</beforebreak>
5005
+ <beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|Std?|stacc|Str|stud|Subst|sva|svw|sZ)\.[\u00A0\s]</beforebreak>
4930
5006
  <afterbreak></afterbreak>
4931
5007
  </rule>
4932
5008
  <rule break="no">
4933
- <beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|sw|vgl|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.\s</beforebreak>
5009
+ <beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|sw|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]</beforebreak>
4934
5010
  <afterbreak></afterbreak>
4935
5011
  </rule>
4936
5012
  <!-- Break rules -->
4937
5013
  <rule break="yes">
4938
- <beforebreak>[\.!?…][\u0002|'|"|“|«|‹|\)|\]|\}¹²³]?\s+</beforebreak>
5014
+ <beforebreak>[\.!?…][\u0002|'|"|“|«|‹|\)|\]|\}¹²³]?[\u00A0\s]+</beforebreak>
4939
5015
  <afterbreak></afterbreak>
4940
5016
  </rule>
4941
5017
  <rule break="yes">
@@ -4943,7 +5019,7 @@
4943
5019
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
4944
5020
  </rule>
4945
5021
  <rule break="yes">
4946
- <beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
5022
+ <beforebreak>[\u00A0\s]\p{L}[\.!?…][\u00A0\s]</beforebreak>
4947
5023
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
4948
5024
  </rule>
4949
5025
  <!-- z.B. 2 sentences: “Liebst du mich?” “Ja!” -->
@@ -5159,11 +5235,19 @@
5159
5235
  </languagerule>
5160
5236
  <languagerule languagerulename="French">
5161
5237
  <rule break="no">
5162
- <beforebreak>Yahoo!\s</beforebreak>
5238
+ <beforebreak>[\s\u00A0]</beforebreak>
5239
+ <afterbreak>[»”’"'›]</afterbreak>
5240
+ </rule>
5241
+ <rule break="yes">
5242
+ <beforebreak>[\.!?][\s\u00A0][»”’"'›][\s\u00A0]</beforebreak>
5243
+ <afterbreak>[«“‘‹"'\p{Lu}]</afterbreak>
5244
+ </rule>
5245
+ <rule break="no">
5246
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
5163
5247
  <afterbreak>\p{Ll}</afterbreak>
5164
5248
  </rule>
5165
5249
  <rule break="yes">
5166
- <beforebreak>\.\[\d+\]\s</beforebreak>
5250
+ <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
5167
5251
  <afterbreak></afterbreak>
5168
5252
  </rule>
5169
5253
  <rule break="no"><!-- URLs without "www."-->
@@ -5172,19 +5256,24 @@
5172
5256
  </rule>
5173
5257
  <rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
5174
5258
  <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
5175
- <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
5259
+ <afterbreak>[A-Za-z0-9\-]+\.(fr|com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
5260
+ </rule>
5261
+ <rule break="no">
5262
+ <!-- gaffa.org -->
5263
+ <beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
5264
+ <afterbreak>[A-Za-z]{2,5}(\.|\b)</afterbreak>
5176
5265
  </rule>
5177
5266
  <!-- French abbreviations -->
5178
5267
  <rule break="no">
5179
- <beforebreak>\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op)\.\s</beforebreak>
5268
+ <beforebreak>\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op|ex)\.[\s\u00A0]</beforebreak>
5180
5269
  <afterbreak>\p{Ll}</afterbreak>
5181
5270
  </rule>
5182
5271
  <rule break="no">
5183
- <beforebreak>\b(etc)\.\)\s</beforebreak>
5272
+ <beforebreak>\b(etc)\.\)[\s\u00A0]</beforebreak>
5184
5273
  <afterbreak></afterbreak>
5185
5274
  </rule>
5186
5275
  <rule break="no">
5187
- <beforebreak>\b(apr|ave?|boul|Mr?|Mrs|MM?)\.\s</beforebreak>
5276
+ <beforebreak>\b(apr|ave?|boul|Mr?|Mrs|MM?|Mlle)\.[\s\u00A0]</beforebreak>
5188
5277
  <afterbreak></afterbreak>
5189
5278
  </rule>
5190
5279
  <rule break="no">
@@ -5200,39 +5289,39 @@
5200
5289
  <afterbreak>\p{Ll}</afterbreak>
5201
5290
  </rule>
5202
5291
  <rule break="no">
5203
- <beforebreak>["”'’]\s*</beforebreak>
5204
- <afterbreak>\s*\p{Ll}</afterbreak>
5292
+ <beforebreak>["”'’][\s\u00A0]*</beforebreak>
5293
+ <afterbreak>[\s\u00A0]*\p{Ll}</afterbreak>
5205
5294
  </rule>
5206
5295
  <rule break="no">
5207
- <beforebreak>['"„][\.!?…]['"”]\s</beforebreak>
5296
+ <beforebreak>['"„][\.!?…]['"”][\s\u00A0]</beforebreak>
5208
5297
  <afterbreak></afterbreak>
5209
5298
  </rule>
5210
5299
  <rule break="no">
5211
- <beforebreak>\b\p{L}\.\s</beforebreak>
5212
- <afterbreak>\p{L}\.\s</afterbreak>
5300
+ <beforebreak>\b\p{L}\.[\s\u00A0]</beforebreak>
5301
+ <afterbreak>\p{L}\.[\s\u00A0]</afterbreak>
5213
5302
  </rule>
5214
5303
  <rule break="no">
5215
5304
  <beforebreak>\b\p{L}\.</beforebreak>
5216
5305
  <afterbreak>\p{L}\.</afterbreak>
5217
5306
  </rule>
5218
5307
  <rule break="no"><!-- Je suis (...) Chris. -->
5219
- <beforebreak>(…|\.\.\.)\s?\)\s</beforebreak>
5308
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
5220
5309
  <afterbreak>[^\p{P}]</afterbreak>
5221
5310
  </rule>
5222
5311
  <rule break="no"><!-- Je suis (...?) Chris. -->
5223
- <beforebreak>(…|\.\.\.)\s?\?\)\s</beforebreak>
5312
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
5224
5313
  <afterbreak>[^\p{P}]</afterbreak>
5225
5314
  </rule>
5226
5315
  <rule break="no"><!-- Jones v. Smith -->
5227
- <beforebreak>\p{Lu}\p{L}+\sv\.\s</beforebreak>
5316
+ <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
5228
5317
  <afterbreak>\p{Lu}\p{L}+</afterbreak>
5229
5318
  </rule>
5230
5319
  <rule break="yes">
5231
- <beforebreak>[^,][\s]\p{L}{2}\.\s</beforebreak>
5232
- <afterbreak>\p{N}+\)\s</afterbreak>
5320
+ <beforebreak>[^,][\s\u00A0]\p{L}{2}\.[\s\u00A0]</beforebreak>
5321
+ <afterbreak>\p{N}+\)[\s\u00A0]</afterbreak>
5233
5322
  </rule>
5234
5323
  <rule break="no">
5235
- <beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak>
5324
+ <beforebreak>[\.\s\u00A0]\p{L}{1,2}\.[\s\u00A0]</beforebreak>
5236
5325
  <afterbreak>[\p{N}\p{Ll}]</afterbreak>
5237
5326
  </rule>
5238
5327
  <rule break="no">
@@ -5240,31 +5329,31 @@
5240
5329
  <afterbreak>[^\p{Lu}]</afterbreak>
5241
5330
  </rule>
5242
5331
  <rule break="no">
5243
- <beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
5332
+ <beforebreak>\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0]</beforebreak>
5244
5333
  <afterbreak></afterbreak>
5245
5334
  </rule>
5246
5335
  <rule break="no">
5247
- <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
5336
+ <beforebreak>\b\p{Lu}\.\p{Lu}\.[\s\u00A0]</beforebreak>
5248
5337
  <afterbreak></afterbreak>
5249
5338
  </rule>
5250
5339
  <rule break="no">
5251
- <beforebreak>[^\.]\s[A-Z]\.\s</beforebreak>
5340
+ <beforebreak>[^\.][\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
5252
5341
  <afterbreak></afterbreak>
5253
5342
  </rule>
5254
5343
  <rule break="no">
5255
- <beforebreak>\b(:?Blvd|Ave|Mts?)\.\s</beforebreak>
5344
+ <beforebreak>\b(:?Blvd|Ave|Mts?)\.[\s\u00A0]</beforebreak>
5256
5345
  <afterbreak>\p{Ll}+</afterbreak>
5257
5346
  </rule>
5258
5347
  <rule break="no">
5259
- <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.\s</beforebreak>
5348
+ <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0]</beforebreak>
5260
5349
  <afterbreak>\p{Ll}+</afterbreak>
5261
5350
  </rule>
5262
5351
  <rule break="no">
5263
- <beforebreak>\(\p{Ll}+\.\s</beforebreak>
5352
+ <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
5264
5353
  <afterbreak></afterbreak>
5265
5354
  </rule>
5266
5355
  <rule break="no"><!-- i.e. -->
5267
- <beforebreak>i\.e\.\s</beforebreak><!-- "i.e." is never at end of sentence -->
5356
+ <beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
5268
5357
  <afterbreak></afterbreak>
5269
5358
  </rule>
5270
5359
  <rule break="no"><!-- U.S.A (no dot at end) -->
@@ -5280,28 +5369,28 @@
5280
5369
  <afterbreak>[SK]\b</afterbreak>
5281
5370
  </rule>
5282
5371
  <rule break="no"><!-- No. 5 -->
5283
- <beforebreak>\b[nN]o\.\s</beforebreak>
5372
+ <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
5284
5373
  <afterbreak>\p{N}</afterbreak>
5285
5374
  </rule>
5286
5375
  <rule break="no"><!-- Ph.D. -->
5287
- <beforebreak>\bP[Hh]\.\s?</beforebreak>
5376
+ <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
5288
5377
  <afterbreak>D\.?</afterbreak>
5289
5378
  </rule>
5290
5379
  <rule break="no"><!-- e.g. -->
5291
- <beforebreak>\be\.g\.\s</beforebreak>
5380
+ <beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
5292
5381
  <afterbreak></afterbreak>
5293
5382
  </rule>
5294
5383
  <rule break="no"><!-- vs. -->
5295
- <beforebreak>\bvs\.\s</beforebreak>
5384
+ <beforebreak>\bvs\.[\s\u00A0]</beforebreak>
5296
5385
  <afterbreak></afterbreak>
5297
5386
  </rule>
5298
5387
  <!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
5299
5388
  <rule break="no"><!-- Etc. -->
5300
- <beforebreak>\b[Ee]tc\.\s</beforebreak>
5389
+ <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
5301
5390
  <afterbreak>[^\p{Lu}]</afterbreak>
5302
5391
  </rule>
5303
5392
  <rule break="no"><!-- BTW (by the way) -->
5304
- <beforebreak>\b([Bb]tw|BTW)\.\s</beforebreak>
5393
+ <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
5305
5394
  <afterbreak></afterbreak>
5306
5395
  </rule>
5307
5396
  <rule break="no">
@@ -5313,64 +5402,68 @@
5313
5402
  <afterbreak>3|Buzz|Crozz</afterbreak>
5314
5403
  </rule>
5315
5404
  <rule break="no"><!-- Ph.D. (see rule PH_D) -->
5316
- <beforebreak>\bP[Hh]\.?\s?[Dd]\.\s</beforebreak>
5405
+ <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
5317
5406
  <afterbreak></afterbreak>
5318
5407
  </rule>
5319
5408
  <rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
5320
- <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.\s</beforebreak>
5409
+ <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
5321
5410
  <afterbreak></afterbreak>
5322
5411
  </rule>
5323
5412
  <rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
5324
- <beforebreak>\bLL\.\s?[BM]\.\s</beforebreak>
5413
+ <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
5325
5414
  <afterbreak></afterbreak>
5326
5415
  </rule>
5327
5416
  <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
5328
- <beforebreak>\b[BM]\.\s?</beforebreak>
5417
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5329
5418
  <afterbreak>Eng\.?</afterbreak>
5330
5419
  </rule>
5331
5420
  <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
5332
- <beforebreak>\bLL\.\s?</beforebreak>
5421
+ <beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
5333
5422
  <afterbreak>[BM]\.?</afterbreak>
5334
5423
  </rule>
5335
5424
  <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
5336
- <beforebreak>\b[BM]\.\s?</beforebreak>
5425
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5337
5426
  <afterbreak>Sc\.?</afterbreak>
5338
5427
  </rule>
5339
5428
  <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
5340
- <beforebreak>\b[BM]\.\s?</beforebreak>
5429
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5341
5430
  <afterbreak>Comp?\.?</afterbreak>
5342
5431
  </rule>
5343
5432
  <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
5344
- <beforebreak>\b[BM]\.\s?</beforebreak>
5433
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5345
5434
  <afterbreak>Arch\.?</afterbreak>
5346
5435
  </rule>
5347
5436
  <rule break="no">
5348
- <beforebreak>\b[BM]\.?\s?(Sc|Eng|Comp|Arch)\.\s</beforebreak>
5437
+ <beforebreak>\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0]</beforebreak>
5349
5438
  <afterbreak></afterbreak>
5350
5439
  </rule>
5351
5440
  <rule break="no">
5352
- <beforebreak>\bI(nc|NC)\.\s</beforebreak>
5441
+ <beforebreak>\bI(nc|NC)\.[\s\u00A0]</beforebreak>
5353
5442
  <afterbreak></afterbreak>
5354
5443
  </rule>
5355
5444
  <rule break="no">
5356
- <beforebreak>\bCorp\.\s</beforebreak>
5445
+ <beforebreak>\bCorp\.[\s\u00A0]</beforebreak>
5357
5446
  <afterbreak></afterbreak>
5358
5447
  </rule>
5359
5448
  <rule break="no">
5360
- <beforebreak>\bBros\.\s</beforebreak>
5449
+ <beforebreak>\bBros\.[\s\u00A0]</beforebreak>
5361
5450
  <afterbreak></afterbreak>
5362
5451
  </rule>
5363
5452
  <rule break="no">
5364
- <beforebreak>\bLtd\.\s</beforebreak>
5453
+ <beforebreak>\bLtd\.[\s\u00A0]</beforebreak>
5365
5454
  <afterbreak>\p{Ll}+</afterbreak>
5366
5455
  </rule>
5367
5456
  <rule break="no">
5368
- <beforebreak>\bCo\.\s</beforebreak>
5457
+ <beforebreak>\bCo\.[\s\u00A0]</beforebreak>
5369
5458
  <afterbreak></afterbreak>
5370
5459
  </rule>
5460
+ <rule break="no">
5461
+ <beforebreak>\bE\.[\s\u00A0]</beforebreak>
5462
+ <afterbreak>\b[Cc]oli\b</afterbreak>
5463
+ </rule>
5371
5464
  <!-- Break rules -->
5372
5465
  <rule break="yes">
5373
- <beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}¹²³]?\s+</beforebreak>
5466
+ <beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}¹²³]?[\s\u00A0]+</beforebreak>
5374
5467
  <afterbreak></afterbreak>
5375
5468
  </rule>
5376
5469
  <rule break="yes">
@@ -5378,7 +5471,7 @@
5378
5471
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
5379
5472
  </rule>
5380
5473
  <rule break="yes">
5381
- <beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
5474
+ <beforebreak>[\s\u00A0]\p{L}[\.!?…][\s\u00A0]</beforebreak>
5382
5475
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
5383
5476
  </rule>
5384
5477
  </languagerule>
@@ -5440,7 +5533,7 @@
5440
5533
  <!-- І. Коваль -->
5441
5534
  <rule break="no">
5442
5535
  <beforebreak>[\h\v.]([А-ЯІЇЄҐACEIHOPX]\.-)?(?&lt;!°)[А-ЯІЇЄҐABCEIHOPX](?&lt;!(Куан[\h]+Ю|(Петр|Олександр)([аоу]|ові|ом)?[\h]+[IІ]+))\.[\h\v]*</beforebreak>
5443
- <afterbreak>(?!Від|Але)[А-ЯІЇЄҐ][а-яіїєґА-ЯІЇЄҐ'’ʼ]{2}</afterbreak>
5536
+ <afterbreak>[А-ЯІЇЄҐ][а-яіїєґА-ЯІЇЄҐ'’ʼ]{3}</afterbreak>
5444
5537
  </rule>
5445
5538
  <!-- Ів. Франко (але Ів Бутільє) -->
5446
5539
  <rule break="no">
@@ -5526,10 +5619,14 @@
5526
5619
  </rule>
5527
5620
  <!-- abbreviation with proper noun: проф. Грицько, о. Лісове -->
5528
5621
  <rule break="no">
5529
- <beforebreak>\b([Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|ім|упоряд|чл\.-кор)\.[\h\v]*</beforebreak>
5622
+ <beforebreak>\b([Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|ім|упоряд|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak>
5530
5623
  <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
5531
5624
  </rule>
5532
- <!-- смерть гр. Болтаровича -->
5625
+ <rule break="no">
5626
+ <beforebreak>\bМан\.[\h\v]*</beforebreak>
5627
+ <afterbreak>[\h\v]*([Сс]іті|[Юю]н)</afterbreak>
5628
+ </rule>
5629
+ <!-- смерть гр. Болтаровича, but not "9 гр." -->
5533
5630
  <rule break="no">
5534
5631
  <beforebreak>[^0-9][\h\v]+[Гг]р\.[\h\v]*</beforebreak>
5535
5632
  <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
@@ -5537,7 +5634,7 @@
5537
5634
  <!-- арт. - артикул -->
5538
5635
  <!-- TODO: арт. - артист -->
5539
5636
  <rule break="no">
5540
- <beforebreak>\bарт\.[\h\v]*</beforebreak>
5637
+ <beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис)\.[\h\v]*</beforebreak>
5541
5638
  <afterbreak>[\h\v]*[0-9]</afterbreak>
5542
5639
  </rule>
5543
5640
  <!-- ХІІ р., 3-6 арт. -->
@@ -5802,7 +5899,7 @@
5802
5899
  <afterbreak>['"«¡¿\p{Ps}\p{Pi}]?\p{Lu}\p{Ll}*</afterbreak>
5803
5900
  </rule>
5804
5901
  </languagerule>
5805
- <languagerule languagerulename="Japanese">
5902
+ <languagerule languagerulename="Ideographic">
5806
5903
  <rule break="no">
5807
5904
  <beforebreak>[:]+[\p{Pe}\p{Pf}\p{Po}"-[\u002C\u003A\u003B\u055D\u060C\u061B\u0703\u0704\u0705\u0706\u0707\u0708\u0709\u07F8\u1363\u1364\u1365\u1366\u1802\u1804\u1808\u204F\u205D\u3001\uA60D\uFE10\uFE11\uFE13\uFE14\uFE50\uFE51\uFE54\uFE55\uFF0C\uFF1A\uFF1B\uFF64]]*</beforebreak>
5808
5905
  <afterbreak>\s+\P{Lu}</afterbreak>
@@ -5870,7 +5967,7 @@
5870
5967
  </rule>
5871
5968
  <!-- Abbreviations that cannot end a sentence -->
5872
5969
  <rule break="no">
5873
- <beforebreak>\b(a|Ab|abrev|absol|acad|Açor|A\. ?D|add|adj|adv|advers|Aeron|afér|Agric|Álg|aprox|art|Artilh|auxil|av|Av)\.\s?</beforebreak>
5970
+ <beforebreak>\b(a|Ab|abrev|absol|acad|Açor|A\. ?D|add|adj|adv|advers|Aeron|afér|Agric|Álg|aprox|[Aa]rts?|Artilh|auxil|av|Av)\.\s?</beforebreak>
5874
5971
  <afterbreak></afterbreak>
5875
5972
  </rule>
5876
5973
  <rule break="no">
@@ -6279,7 +6376,7 @@
6279
6376
  <afterbreak></afterbreak>
6280
6377
  </rule>
6281
6378
  <rule break="no">
6282
- <beforebreak>[^\.]\s[ضصثقفغعهخحجچشسیبلاتنمکگ\ظطزرذدپوًٌٍَُِّْA-Z]\.\s</beforebreak>
6379
+ <beforebreak>[^\.]\s[ضصثقفغعهخحجچشسیبلاتنمکگ\ظطزرذدپوًٌٍَُِّْA-Z]\.\s</beforebreak>
6283
6380
  <afterbreak></afterbreak>
6284
6381
  </rule>
6285
6382
  <rule break="no">
@@ -6359,7 +6456,7 @@
6359
6456
  </rule>
6360
6457
  <!-- Do not break in cases such as: "Петар I дошао је ..." -->
6361
6458
  <rule break="no">
6362
- <beforebreak>[\s ][IVX]+\s</beforebreak>
6459
+ <beforebreak>[\s ][IVX]+\s</beforebreak>
6363
6460
  <afterbreak>[^\p{Lu}]+</afterbreak>
6364
6461
  </rule>
6365
6462
  <!-- Do not break in cases such as "од 13. до 14. века" -->
@@ -6598,10 +6695,8 @@
6598
6695
  <languagemap languagepattern="(UK|uk).*" languagerulename="Ukrainian"></languagemap>
6599
6696
  <languagemap languagepattern="(BE|be).*" languagerulename="Belarusian"></languagemap>
6600
6697
  <languagemap languagepattern="(GL|gl).*" languagerulename="Galician"></languagemap>
6601
- <languagemap languagepattern="(JA|ja).*" languagerulename="Japanese"></languagemap>
6602
- <!-- Modification: Japanese rules work well for Chinese; keep original name to
6603
- minimize diff -->
6604
- <languagemap languagepattern="(ZH|zh).*" languagerulename="Japanese"></languagemap>
6698
+ <languagemap languagepattern="(JA|ja).*" languagerulename="Ideographic"></languagemap>
6699
+ <languagemap languagepattern="(ZH|zh).*" languagerulename="Ideographic"></languagemap>
6605
6700
  <languagemap languagepattern="(BR|br).*" languagerulename="Breton"></languagemap>
6606
6701
  <languagemap languagepattern="(PT|pt).*" languagerulename="Portuguese"></languagemap>
6607
6702
  <languagemap languagepattern="(IT|it).*" languagerulename="Italian"></languagemap>