srx-languagetool 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/srx/segment.srx CHANGED
@@ -1102,12 +1102,16 @@
1102
1102
  </rule>
1103
1103
  </languagerule>
1104
1104
  <languagerule languagerulename="English">
1105
+ <rule break="no">
1106
+ <beforebreak>[\u00A0\s]</beforebreak>
1107
+ <afterbreak>\n</afterbreak>
1108
+ </rule>
1105
1109
  <rule break="no"><!-- Hello (Hi! ) my name is Chris -->
1106
- <beforebreak>[a-zA-Z][!\?]\s</beforebreak>
1107
- <afterbreak>\)\s[a-zA-Z]</afterbreak>
1110
+ <beforebreak>[a-zA-Z][!\?][\s\u00A0]</beforebreak>
1111
+ <afterbreak>\)[\s\u00A0][a-zA-Z]</afterbreak>
1108
1112
  </rule>
1109
1113
  <rule break="no">
1110
- <beforebreak>Yahoo!\s</beforebreak>
1114
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
1111
1115
  <afterbreak>\p{Ll}</afterbreak>
1112
1116
  </rule>
1113
1117
  <rule break="no"><!-- U.S.A (no dot at end) -->
@@ -1118,6 +1122,10 @@
1118
1122
  <beforebreak>\bA\.</beforebreak>
1119
1123
  <afterbreak>I\b</afterbreak>
1120
1124
  </rule>
1125
+ <rule break="no"><!-- S.I (no dot at end) -->
1126
+ <beforebreak>\bS\.</beforebreak>
1127
+ <afterbreak>I\b</afterbreak>
1128
+ </rule>
1121
1129
  <rule break="no"><!-- L.A (no dot at end) -->
1122
1130
  <beforebreak>\bL\.</beforebreak>
1123
1131
  <afterbreak>A\b</afterbreak>
@@ -1135,96 +1143,96 @@
1135
1143
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl|be|dev|co|fr|dk|se)(\.|\b)</afterbreak>
1136
1144
  </rule>
1137
1145
  <rule break="no"><!-- No. 5 -->
1138
- <beforebreak>\b[nN]o\.\s</beforebreak>
1146
+ <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
1139
1147
  <afterbreak>\p{N}</afterbreak>
1140
1148
  </rule>
1141
1149
  <rule break="no"><!-- Ph.D. -->
1142
- <beforebreak>\bP[Hh]\.\s?</beforebreak>
1150
+ <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
1143
1151
  <afterbreak>D\.?</afterbreak>
1144
1152
  </rule>
1145
1153
  <rule break="no"><!-- min. -->
1146
- <beforebreak>\b([Ee]d|pp|[Vv]iz|i\.?\s*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Mm]in|max|[Gg]ovt|lb|lbf|ft|c\.?\s*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.\s</beforebreak>
1154
+ <beforebreak>\b([Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vv]ol|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Mm]in|max|[Gg]ovt|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(?:oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
1147
1155
  <afterbreak>[^\p{Lu}]|I</afterbreak>
1148
1156
  </rule>
1149
1157
  <rule break="no"><!-- hr. -->
1150
- <beforebreak>\b(hr)\.\s</beforebreak>
1158
+ <beforebreak>\b(hr)\.[\s\u00A0]</beforebreak>
1151
1159
  <afterbreak>[^\p{Lu}]|I</afterbreak>
1152
1160
  </rule>
1153
1161
  <rule break="no"><!-- Fig. 8 -->
1154
- <beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.\s</beforebreak>
1162
+ <beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
1155
1163
  <afterbreak>\p{N}|[IXV]+</afterbreak>
1156
1164
  </rule>
1157
1165
  <rule break="no"><!-- Fig. (8) -->
1158
- <beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.\s</beforebreak>
1166
+ <beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
1159
1167
  <afterbreak>\(\p{N}\)</afterbreak>
1160
1168
  </rule>
1161
1169
  <rule break="no"><!-- I'm (...) great! -->
1162
- <beforebreak>(…|\.\.\.)\s?\)\s</beforebreak>
1170
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
1163
1171
  <afterbreak>[^\p{P}]</afterbreak>
1164
1172
  </rule>
1165
1173
  <rule break="no"><!-- I will work with someone (Chris or ...?). -->
1166
- <beforebreak>(…|\.\.\.)\s?\?\)\s</beforebreak>
1174
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
1167
1175
  <afterbreak>[^\p{P}]</afterbreak>
1168
1176
  </rule>
1169
1177
  <rule break="no"><!-- e.g. -->
1170
- <beforebreak>\be\.g\.\s</beforebreak>
1178
+ <beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
1171
1179
  <afterbreak></afterbreak>
1172
1180
  </rule>
1173
1181
  <rule break="no"><!-- vs. -->
1174
- <beforebreak>\bvs\.\s</beforebreak>
1182
+ <beforebreak>\bvs\.[\s\u00A0]</beforebreak>
1175
1183
  <afterbreak></afterbreak>
1176
1184
  </rule>
1177
1185
  <rule break="no"><!-- esp. -->
1178
- <beforebreak>\be[sx]p\.\s</beforebreak>
1186
+ <beforebreak>\be[sx]p\.[\s\u00A0]</beforebreak>
1179
1187
  <afterbreak></afterbreak>
1180
1188
  </rule>
1181
1189
  <!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
1182
1190
  <rule break="no"><!-- Etc. -->
1183
- <beforebreak>\b[Ee]tc\.\s</beforebreak>
1191
+ <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
1184
1192
  <afterbreak>[^\p{Lu}]</afterbreak>
1185
1193
  </rule>
1186
1194
  <rule break="no"><!-- BTW (by the way) -->
1187
- <beforebreak>\b([Bb]tw|BTW)\.\s</beforebreak>
1195
+ <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
1188
1196
  <afterbreak></afterbreak>
1189
1197
  </rule>
1190
1198
  <rule break="no">
1191
- <beforebreak>\bJan\.\s</beforebreak>
1199
+ <beforebreak>\bJan\.[\s\u00A0]</beforebreak>
1192
1200
  <afterbreak></afterbreak>
1193
1201
  </rule>
1194
1202
  <rule break="no">
1195
- <beforebreak>\bFeb\.\s</beforebreak>
1203
+ <beforebreak>\bFeb\.[\s\u00A0]</beforebreak>
1196
1204
  <afterbreak></afterbreak>
1197
1205
  </rule>
1198
1206
  <rule break="no">
1199
- <beforebreak>\bMar\.\s</beforebreak>
1207
+ <beforebreak>\bMar\.[\s\u00A0]</beforebreak>
1200
1208
  <afterbreak></afterbreak>
1201
1209
  </rule>
1202
1210
  <rule break="no">
1203
- <beforebreak>\bApr\.\s</beforebreak>
1211
+ <beforebreak>\bApr\.[\s\u00A0]</beforebreak>
1204
1212
  <afterbreak></afterbreak>
1205
1213
  </rule>
1206
1214
  <rule break="no">
1207
- <beforebreak>\bJu[nl]\.\s</beforebreak>
1215
+ <beforebreak>\bJu[nl]\.[\s\u00A0]</beforebreak>
1208
1216
  <afterbreak></afterbreak>
1209
1217
  </rule>
1210
1218
  <rule break="no">
1211
- <beforebreak>\bAug\.\s</beforebreak>
1219
+ <beforebreak>\bAug\.[\s\u00A0]</beforebreak>
1212
1220
  <afterbreak></afterbreak>
1213
1221
  </rule>
1214
1222
  <rule break="no">
1215
- <beforebreak>\bSept?\.\s</beforebreak>
1223
+ <beforebreak>\bSept?\.[\s\u00A0]</beforebreak>
1216
1224
  <afterbreak></afterbreak>
1217
1225
  </rule>
1218
1226
  <rule break="no">
1219
- <beforebreak>\bOct\.\s</beforebreak>
1227
+ <beforebreak>\bOct\.[\s\u00A0]</beforebreak>
1220
1228
  <afterbreak></afterbreak>
1221
1229
  </rule>
1222
1230
  <rule break="no">
1223
- <beforebreak>\bNov\.\s</beforebreak>
1231
+ <beforebreak>\bNov\.[\s\u00A0]</beforebreak>
1224
1232
  <afterbreak></afterbreak>
1225
1233
  </rule>
1226
1234
  <rule break="no">
1227
- <beforebreak>\bDec\.\s</beforebreak>
1235
+ <beforebreak>\bDec\.[\s\u00A0]</beforebreak>
1228
1236
  <afterbreak></afterbreak>
1229
1237
  </rule>
1230
1238
  <rule break="no">
@@ -1236,43 +1244,43 @@
1236
1244
  <afterbreak>3|Buzz|Crozz</afterbreak>
1237
1245
  </rule>
1238
1246
  <rule break="no"><!-- Ph.D. (see rule PH_D) -->
1239
- <beforebreak>\bP[Hh]\.?\s?[Dd]\.\s</beforebreak>
1247
+ <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
1240
1248
  <afterbreak></afterbreak>
1241
1249
  </rule>
1242
1250
  <rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
1243
- <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.\s</beforebreak>
1251
+ <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
1244
1252
  <afterbreak></afterbreak>
1245
1253
  </rule>
1246
1254
  <rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
1247
- <beforebreak>\bLL\.\s?[BM]\.\s</beforebreak>
1255
+ <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
1248
1256
  <afterbreak></afterbreak>
1249
1257
  </rule>
1250
1258
  <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
1251
- <beforebreak>\b[BM]\.\s?</beforebreak>
1259
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1252
1260
  <afterbreak>Eng\.?</afterbreak>
1253
1261
  </rule>
1254
1262
  <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
1255
- <beforebreak>\bLL\.\s?</beforebreak>
1263
+ <beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
1256
1264
  <afterbreak>[BM]\.?</afterbreak>
1257
1265
  </rule>
1258
1266
  <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
1259
- <beforebreak>\b[BM]\.\s?</beforebreak>
1267
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1260
1268
  <afterbreak>Sc\.?</afterbreak>
1261
1269
  </rule>
1262
1270
  <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
1263
- <beforebreak>\b[BM]\.\s?</beforebreak>
1271
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1264
1272
  <afterbreak>Comp?\.?</afterbreak>
1265
1273
  </rule>
1266
1274
  <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
1267
- <beforebreak>\b[BM]\.\s?</beforebreak>
1275
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1268
1276
  <afterbreak>Arch\.?</afterbreak>
1269
1277
  </rule>
1270
1278
  <rule break="no">
1271
- <beforebreak>\b[BM]\.?\s?(Sc|Eng|Comp|Arch)\.\s</beforebreak>
1279
+ <beforebreak>\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0]</beforebreak>
1272
1280
  <afterbreak></afterbreak>
1273
1281
  </rule>
1274
1282
  <rule break="no">
1275
- <beforebreak>\bet\b\s\bal\.\s</beforebreak>
1283
+ <beforebreak>\bet\b[\s\u00A0]\bal\.[\s\u00A0]</beforebreak>
1276
1284
  <afterbreak></afterbreak>
1277
1285
  </rule>
1278
1286
  <rule break="no">
@@ -1280,51 +1288,51 @@
1280
1288
  <afterbreak></afterbreak>
1281
1289
  </rule>
1282
1290
  <rule break="no">
1283
- <beforebreak>\b(Atty|Sg?t|[SG]en|Ft|Gov|Hon|Prof|Mr?s|Mt|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.\s</beforebreak>
1291
+ <beforebreak>\b(Atty|Sg?t|[SG]en|Ft|Gov|Hon|Prof|Mr?s|Mt|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.[\s\u00A0]</beforebreak>
1284
1292
  <afterbreak></afterbreak>
1285
1293
  </rule>
1286
1294
  <rule break="no">
1287
- <beforebreak>\b(Atty|Sg?t|[SG]en|Gov|Hon|Prof|Mr?s|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.\s[A-Z]\.\s</beforebreak>
1295
+ <beforebreak>\b(Atty|Sg?t|[SG]en|Gov|Hon|Prof|Mr?s|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.[\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
1288
1296
  <afterbreak></afterbreak>
1289
1297
  </rule>
1290
1298
  <rule break="no">
1291
- <beforebreak>\b(Drs|Messrs|Mmes)\.\s</beforebreak>
1292
- <afterbreak>(and\s)|\p{Lu}\p{Ll}+</afterbreak>
1299
+ <beforebreak>\b(Drs|Messrs|Mmes)\.[\s\u00A0]</beforebreak>
1300
+ <afterbreak>(and[\s\u00A0])|\p{Lu}\p{Ll}+</afterbreak>
1293
1301
  </rule>
1294
1302
  <rule break="no">
1295
- <beforebreak>\bcf\.\s</beforebreak>
1303
+ <beforebreak>\bcf\.[\s\u00A0]</beforebreak>
1296
1304
  <afterbreak></afterbreak>
1297
1305
  </rule>
1298
1306
  <rule break="no">
1299
- <beforebreak>\bI(nc|NC)\.\s</beforebreak>
1307
+ <beforebreak>\bI(nc|NC)\.[\s\u00A0]</beforebreak>
1300
1308
  <afterbreak></afterbreak>
1301
1309
  </rule>
1302
1310
  <rule break="no">
1303
- <beforebreak>\bCorp\.\s</beforebreak>
1311
+ <beforebreak>\bCorp\.[\s\u00A0]</beforebreak>
1304
1312
  <afterbreak></afterbreak>
1305
1313
  </rule>
1306
1314
  <rule break="no">
1307
- <beforebreak>\bBros\.\s</beforebreak>
1315
+ <beforebreak>\bBros\.[\s\u00A0]</beforebreak>
1308
1316
  <afterbreak></afterbreak>
1309
1317
  </rule>
1310
1318
  <rule break="no">
1311
- <beforebreak>\bDist\.\s</beforebreak>
1319
+ <beforebreak>\bDist\.[\s\u00A0]</beforebreak>
1312
1320
  <afterbreak></afterbreak>
1313
1321
  </rule>
1314
1322
  <rule break="no">
1315
- <beforebreak>\bCo\.\s</beforebreak>
1323
+ <beforebreak>\bCo\.[\s\u00A0]</beforebreak>
1316
1324
  <afterbreak></afterbreak>
1317
1325
  </rule>
1318
1326
  <rule break="no">
1319
- <beforebreak>\bo'clock\s</beforebreak>
1327
+ <beforebreak>\bo'clock[\s\u00A0]</beforebreak>
1320
1328
  <afterbreak></afterbreak>
1321
1329
  </rule>
1322
1330
  <rule break="no">
1323
- <beforebreak>\bfo'c'sle\s</beforebreak>
1331
+ <beforebreak>\bfo'c'sle[\s\u00A0]</beforebreak>
1324
1332
  <afterbreak></afterbreak>
1325
1333
  </rule>
1326
1334
  <rule break="no">
1327
- <beforebreak>\bLtd\.\s</beforebreak>
1335
+ <beforebreak>\bLtd\.[\s\u00A0]</beforebreak>
1328
1336
  <afterbreak>\p{Ll}+</afterbreak>
1329
1337
  </rule>
1330
1338
  <rule break="no">
@@ -1340,35 +1348,35 @@
1340
1348
  <afterbreak>\p{Ll}</afterbreak>
1341
1349
  </rule>
1342
1350
  <rule break="no">
1343
- <beforebreak>["”'’]\s*</beforebreak>
1344
- <afterbreak>\s*\p{Ll}</afterbreak>
1351
+ <beforebreak>["”'’][\s\u00A0]*</beforebreak>
1352
+ <afterbreak>[\s\u00A0]*\p{Ll}</afterbreak>
1345
1353
  </rule>
1346
1354
  <rule break="no">
1347
- <beforebreak>['"„][\.!?…]['"”]\s</beforebreak>
1355
+ <beforebreak>['"„][\.!?…]['"”][\s\u00A0]</beforebreak>
1348
1356
  <afterbreak></afterbreak>
1349
1357
  </rule>
1350
1358
  <rule break="no">
1351
- <beforebreak>\b\p{L}\.\s</beforebreak>
1352
- <afterbreak>\p{L}\.\s</afterbreak>
1359
+ <beforebreak>\b\p{L}\.[\s\u00A0]</beforebreak>
1360
+ <afterbreak>\p{L}\.[\s\u00A0]</afterbreak>
1353
1361
  </rule>
1354
1362
  <rule break="no">
1355
1363
  <beforebreak>\b\p{L}\.</beforebreak>
1356
1364
  <afterbreak>\p{L}\.</afterbreak>
1357
1365
  </rule>
1358
1366
  <rule break="no"><!-- Jones v. Smith -->
1359
- <beforebreak>\p{Lu}\p{L}+\sv\.\s</beforebreak>
1367
+ <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
1360
1368
  <afterbreak>\p{Lu}\p{L}+</afterbreak>
1361
1369
  </rule>
1362
1370
  <rule break="yes">
1363
- <beforebreak>[^,][\s]\p{L}{2}\.\s</beforebreak>
1364
- <afterbreak>\p{N}+\)\s</afterbreak>
1371
+ <beforebreak>[^,][\s\u00A0]\p{L}{2}\.[\s\u00A0]</beforebreak>
1372
+ <afterbreak>\p{N}+\)[\s\u00A0]</afterbreak>
1365
1373
  </rule>
1366
1374
  <rule break="yes">
1367
- <beforebreak>\bOK\.\s</beforebreak>
1375
+ <beforebreak>\bOK\.[\s\u00A0]</beforebreak>
1368
1376
  <afterbreak>\p{Ll}+</afterbreak>
1369
1377
  </rule>
1370
1378
  <rule break="no">
1371
- <beforebreak>[\.\s](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.\s</beforebreak><!-- not 'no'/'in', these could be abbreviations-->
1379
+ <beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak><!-- not 'no'/'in', these could be abbreviations-->
1372
1380
  <afterbreak>[\p{N}\p{Ll}]</afterbreak>
1373
1381
  </rule>
1374
1382
  <rule break="no">
@@ -1376,35 +1384,35 @@
1376
1384
  <afterbreak>[^\p{Lu}]</afterbreak>
1377
1385
  </rule>
1378
1386
  <rule break="no">
1379
- <beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
1387
+ <beforebreak>\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0]</beforebreak>
1380
1388
  <afterbreak></afterbreak>
1381
1389
  </rule>
1382
1390
  <rule break="no">
1383
- <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
1391
+ <beforebreak>\b\p{Lu}\.\p{Lu}\.[\s\u00A0]</beforebreak>
1384
1392
  <afterbreak></afterbreak>
1385
1393
  </rule>
1386
1394
  <rule break="no">
1387
- <beforebreak>[^\.]\s[A-Z]\.\s</beforebreak>
1395
+ <beforebreak>[^\.][\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
1388
1396
  <afterbreak></afterbreak>
1389
1397
  </rule>
1390
1398
  <rule break="no">
1391
- <beforebreak>\b(:?Blvd|Ave|Mts?)\.\s</beforebreak>
1399
+ <beforebreak>\b(?:Blvd|Ave|Mts?)\.[\s\u00A0]</beforebreak>
1392
1400
  <afterbreak>\p{Ll}+</afterbreak>
1393
1401
  </rule>
1394
1402
  <rule break="no">
1395
- <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.\s</beforebreak>
1403
+ <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0]</beforebreak>
1396
1404
  <afterbreak>\p{Ll}+</afterbreak>
1397
1405
  </rule>
1398
1406
  <rule break="no">
1399
- <beforebreak>\(\p{Ll}+\.\s</beforebreak>
1407
+ <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
1400
1408
  <afterbreak></afterbreak>
1401
1409
  </rule>
1402
1410
  <rule break="no"><!-- i.e. -->
1403
- <beforebreak>i\.e\.\s</beforebreak><!-- "i.e." is never at end of sentence -->
1411
+ <beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
1404
1412
  <afterbreak></afterbreak>
1405
1413
  </rule>
1406
1414
  <rule break="yes">
1407
- <beforebreak>[\.!?…][\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*\s</beforebreak>
1415
+ <beforebreak>[\.!?…][\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*[\s\u00A0]</beforebreak>
1408
1416
  <afterbreak></afterbreak>
1409
1417
  </rule>
1410
1418
  <rule break="yes">
@@ -1412,7 +1420,7 @@
1412
1420
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
1413
1421
  </rule>
1414
1422
  <rule break="yes">
1415
- <beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
1423
+ <beforebreak>[\s\u00A0]\p{L}[\.!?…][\s\u00A0]</beforebreak>
1416
1424
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
1417
1425
  </rule>
1418
1426
  </languagerule>
@@ -1511,6 +1519,11 @@
1511
1519
  </rule>
1512
1520
  </languagerule>
1513
1521
  <languagerule languagerulename="Dutch">
1522
+ <rule break="no">
1523
+ <!-- .Net -->
1524
+ <beforebreak>\s[.]</beforebreak>
1525
+ <afterbreak>[Nn][Ee][Tt](\b|-)</afterbreak>
1526
+ </rule>
1514
1527
  <rule break="no"><!-- quoted sentence in sentence -->
1515
1528
  <beforebreak>[.?!][’'"]</beforebreak>
1516
1529
  <afterbreak> [a-z]</afterbreak>
@@ -1729,6 +1742,31 @@
1729
1742
  <beforebreak>[?!.]\s</beforebreak>
1730
1743
  <afterbreak>['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002][A-Z][a-z]</afterbreak>
1731
1744
  </rule>
1745
+ <rule break="no">
1746
+ <!-- "E. coli etc. -->
1747
+ <beforebreak>"[A-Z][.]\s</beforebreak>
1748
+ <afterbreak>[a-z]</afterbreak>
1749
+ </rule>
1750
+ <rule break="no">
1751
+ <!-- Cornelisz. -->
1752
+ <beforebreak>[A-Z][a-z]\p{Ll}*sz[.]\s</beforebreak>
1753
+ <afterbreak>[a-z]</afterbreak>
1754
+ </rule>
1755
+ <rule break="no">
1756
+ <!-- De n. XIV/vagus (nervus) -->
1757
+ <beforebreak>De n[.]\s</beforebreak>
1758
+ <afterbreak>[a-z]|[XIV]</afterbreak>
1759
+ </rule>
1760
+ <rule break="no">
1761
+ <!-- MOL.E -->
1762
+ <beforebreak>[A-Z]{2,5}[.]</beforebreak>
1763
+ <afterbreak>[A-Z]</afterbreak>
1764
+ </rule>
1765
+ <rule break="no">
1766
+ <!-- ..." betekent -->
1767
+ <beforebreak>\.\.</beforebreak>
1768
+ <afterbreak>" [a-z]</afterbreak>
1769
+ </rule>
1732
1770
  <!-- ##### end of Dutch #### -->
1733
1771
  </languagerule>
1734
1772
  <languagerule languagerulename="Slovak">
@@ -4556,146 +4594,146 @@
4556
4594
  </languagerule>
4557
4595
  <languagerule languagerulename="Catalan">
4558
4596
  <rule break="no">
4559
- <beforebreak>Yahoo!\s</beforebreak>
4597
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
4560
4598
  <afterbreak>\p{Ll}</afterbreak>
4561
4599
  </rule>
4562
4600
  <rule break="yes">
4563
- <beforebreak>\w['’][nNtT]\.\s</beforebreak>
4601
+ <beforebreak>\w['’][nNtT]\.[\s\u00A0]</beforebreak>
4564
4602
  <afterbreak></afterbreak>
4565
4603
  </rule>
4566
4604
  <rule break="yes">
4567
- <beforebreak>\.\[\d+\]\s</beforebreak>
4605
+ <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
4568
4606
  <afterbreak></afterbreak>
4569
4607
  </rule>
4570
4608
  <!-- initials: A. C. Jones. Problem: [...] d'Alfons I. Ell era [...] -->
4571
4609
  <rule break="no">
4572
- <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.\s</beforebreak>
4610
+ <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
4573
4611
  <afterbreak></afterbreak>
4574
4612
  </rule>
4575
4613
  <!-- Abbreviations that cannot finish sentences-->
4576
4614
  <rule break="no">
4577
- <beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.\s</beforebreak>
4615
+ <beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
4578
4616
  <afterbreak></afterbreak>
4579
4617
  </rule>
4580
4618
  <!-- Abbreviations that can finish sentences -->
4581
4619
  <rule break="no">
4582
- <beforebreak>\b(s|ca)\.\s</beforebreak>
4620
+ <beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak>
4583
4621
  <afterbreak>[XIV]+\b</afterbreak>
4584
4622
  </rule>
4585
4623
  <rule break="no">
4586
- <beforebreak>\b(min|m|ca)\.\s</beforebreak>
4624
+ <beforebreak>\b(min|m|ca)\.[\s\u00A0]</beforebreak>
4587
4625
  <afterbreak>[0-9]+\b</afterbreak>
4588
4626
  </rule>
4589
4627
  <rule break="no">
4590
- <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.\s</beforebreak>
4628
+ <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.[\s\u00A0]</beforebreak>
4591
4629
  <afterbreak>[XIV\d]+\b</afterbreak>
4592
4630
  </rule>
4593
4631
  <rule break="no">
4594
- <beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4632
+ <beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4595
4633
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4596
4634
  </rule>
4597
4635
  <!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
4598
4636
  <rule break="no">
4599
- <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4637
+ <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4600
4638
  <afterbreak>\p{Ll}</afterbreak>
4601
4639
  </rule>
4602
4640
  <!-- Any word in acronyms like EE.UU. or BB. DD. -->
4603
4641
  <rule break="no">
4604
- <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4642
+ <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4605
4643
  <afterbreak>\p{Ll}</afterbreak>
4606
4644
  </rule>
4607
4645
  <rule break="no">
4608
- <beforebreak>\bEE\.\s?</beforebreak>
4609
- <afterbreak>UU</afterbreak>
4646
+ <beforebreak>\b\p{Lu}{2}\.[\s\u00A0]?</beforebreak>
4647
+ <afterbreak>\p{Lu}{2}</afterbreak>
4610
4648
  </rule>
4611
4649
  <rule break="no">
4612
- <beforebreak>EE\.\s?UU\.\s?</beforebreak>
4650
+ <beforebreak>EE\.[\s\u00A0]?UU\.[\s\u00A0]?</beforebreak>
4613
4651
  <afterbreak>\p{Ll}</afterbreak>
4614
4652
  </rule>
4615
4653
  <!-- max min etc -->
4616
4654
  <rule break="no">
4617
- <beforebreak>\b([Ee]tc|m[aáà]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4655
+ <beforebreak>\b([Ee]tc|m[aáà]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4618
4656
  <afterbreak>\p{Ll}</afterbreak>
4619
4657
  </rule>
4620
4658
  <!-- Composed abbrev. -->
4621
4659
  <rule break="no">
4622
- <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4660
+ <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4623
4661
  <afterbreak></afterbreak>
4624
4662
  </rule>
4625
4663
  <!-- Units -->
4626
4664
  <rule break="no">
4627
- <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4665
+ <beforebreak>\b([Pp]ta[s]?|K[gm][s]?|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4628
4666
  <afterbreak>\p{Ll}</afterbreak>
4629
4667
  </rule>
4630
4668
  <!-- Ellipsis: ... lowercase -->
4631
4669
  <rule break="no">
4632
- <beforebreak>[^\s](\Q...\E|…)\s</beforebreak>
4670
+ <beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak>
4633
4671
  <afterbreak>\p{Ll}</afterbreak>
4634
4672
  </rule>
4635
4673
  <!-- (enum...) -->
4636
4674
  <rule break="no">
4637
- <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”]\s</beforebreak>
4675
+ <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
4638
4676
  <afterbreak>\p{Ll}</afterbreak>
4639
4677
  </rule>
4640
4678
  <!-- pero ¡ah! no estaba
4641
4679
  <rule break="no">
4642
- <beforebreak>\b¡\p{L}+!\s</beforebreak>
4680
+ <beforebreak>\b¡\p{L}+![\s\u00A0]</beforebreak>
4643
4681
  <afterbreak>\p{Ll}</afterbreak>
4644
4682
  </rule>
4645
4683
  -->
4646
4684
  <rule break="yes">
4647
- <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*\s</beforebreak>
4685
+ <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*[\s\u00A0]</beforebreak>
4648
4686
  <afterbreak></afterbreak>
4649
4687
  </rule>
4650
4688
  <rule break="yes">
4651
- <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+\s</beforebreak>
4689
+ <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0]</beforebreak>
4652
4690
  <afterbreak>[¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
4653
4691
  </rule>
4654
4692
  <!-- paragraphs with opening "»" in dialogs-->
4655
4693
  <rule break="yes">
4656
- <beforebreak>[\.:!?…»]+\s</beforebreak>
4657
- <afterbreak>»[^\s\.:!?…]</afterbreak>
4694
+ <beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak>
4695
+ <afterbreak>»[^\u00A0\s\.:!?…]</afterbreak>
4658
4696
  </rule>
4659
4697
  </languagerule>
4660
4698
  <languagerule languagerulename="Spanish">
4661
4699
  <rule break="no">
4662
- <beforebreak>Yahoo!\s</beforebreak>
4700
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
4663
4701
  <afterbreak>\p{Ll}</afterbreak>
4664
4702
  </rule>
4665
4703
  <rule break="yes">
4666
- <beforebreak>\.\[\d+\]\s</beforebreak>
4704
+ <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
4667
4705
  <afterbreak></afterbreak>
4668
4706
  </rule>
4669
4707
  <!-- initials: A. C. Jones. Problem: [...] de Alfons I. Él era [...] -->
4670
4708
  <rule break="no">
4671
- <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.\s</beforebreak>
4709
+ <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
4672
4710
  <afterbreak/>
4673
4711
  </rule>
4674
4712
  <!-- Ellipsis: ... lowercase -->
4675
4713
  <rule break="no">
4676
- <beforebreak>[^\s](\Q...\E|…)\s</beforebreak>
4714
+ <beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak>
4677
4715
  <afterbreak>\p{Ll}</afterbreak>
4678
4716
  </rule>
4679
4717
  <!-- (enum...) -->
4680
4718
  <rule break="no">
4681
- <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”]\s</beforebreak>
4719
+ <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
4682
4720
  <afterbreak>\p{Ll}</afterbreak>
4683
4721
  </rule>
4684
4722
  <!-- Abbreviations that can finish sentences -->
4685
4723
  <rule break="no">
4686
- <beforebreak>\b(s|ca)\.\s</beforebreak>
4724
+ <beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak>
4687
4725
  <afterbreak>[XIV]+\b</afterbreak>
4688
4726
  </rule>
4689
4727
  <rule break="no">
4690
- <beforebreak>\b(min|m|ca)\.\s</beforebreak>
4728
+ <beforebreak>\b(min|m|ca)\.[\s\u00A0]</beforebreak>
4691
4729
  <afterbreak>[0-9]+\b</afterbreak>
4692
4730
  </rule>
4693
4731
  <rule break="no">
4694
- <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.\s</beforebreak>
4732
+ <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.[\s\u00A0]</beforebreak>
4695
4733
  <afterbreak>[XIV\d]+\b</afterbreak>
4696
4734
  </rule>
4697
4735
  <rule break="no">
4698
- <beforebreak>\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4736
+ <beforebreak>\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4699
4737
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4700
4738
  </rule>
4701
4739
  <rule break="no">
@@ -4710,75 +4748,75 @@
4710
4748
  </rule>
4711
4749
  <!-- Abbreviations that cannot finish sentences-->
4712
4750
  <rule break="no">
4713
- <beforebreak>\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.\s</beforebreak>
4751
+ <beforebreak>\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
4714
4752
  <afterbreak/>
4715
4753
  </rule>
4716
4754
  <rule break="no">
4717
- <beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.\s</beforebreak>
4755
+ <beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0]</beforebreak>
4718
4756
  <afterbreak/>
4719
4757
  </rule>
4720
4758
  <rule break="no">
4721
- <beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.\s</beforebreak>
4759
+ <beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0]</beforebreak>
4722
4760
  <afterbreak/>
4723
4761
  </rule>
4724
4762
  <rule break="no">
4725
- <beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.\s</beforebreak>
4763
+ <beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0]</beforebreak>
4726
4764
  <afterbreak/>
4727
4765
  </rule>
4728
4766
  <!-- NOTE(review): the "|" in front of [V\.]gr is outside the parenthesised group, so the
       trailing \.[\s\u00A0] binds only to the [V\.]gr alternative; the first alternative is
       just \b(...) with no dot/whitespace suffix. Confirm whether
       (?:\b(...)|[V\.]gr)\.[\s\u00A0] was intended before regrouping: with an empty afterbreak,
       [Nn]o would then also suppress every break after the ordinary word "no." -->
  <rule break="no">
4729
- <beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.\s</beforebreak>
4767
+ <beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0]</beforebreak>
4730
4768
  <afterbreak/>
4731
4769
  </rule>
4732
4770
  <!-- Abbreviations that can finish sentences -->
4733
4771
  <rule break="no">
4734
- <beforebreak>\b([Ee]ds?|[Cc]oords?|grs?|Sr|Jr|Admón|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4772
+ <beforebreak>\b([Ee]ds?|[Cc]oords?|grs?|Sr|Jr|Admón|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4735
4773
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4736
4774
  </rule>
4737
4775
  <!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
4738
4776
  <rule break="no">
4739
- <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4777
+ <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4740
4778
  <afterbreak>\p{Ll}</afterbreak>
4741
4779
  </rule>
4742
4780
  <!-- Any word in acronyms like EE.UU. or BB. DD. -->
4743
4781
  <rule break="no">
4744
- <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4782
+ <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4745
4783
  <afterbreak>\p{Ll}</afterbreak>
4746
4784
  </rule>
4747
4785
  <rule break="no">
4748
- <beforebreak>\bEE\.\s?</beforebreak>
4749
- <afterbreak>UU</afterbreak>
4786
+ <beforebreak>\b\p{Lu}{2}\.[\s\u00A0]?</beforebreak>
4787
+ <afterbreak>\p{Lu}{2}</afterbreak>
4750
4788
  </rule>
4751
4789
  <rule break="no">
4752
- <beforebreak>EE\.\s?UU\.\s?</beforebreak>
4790
+ <beforebreak>EE\.[\s\u00A0]?UU\.[\s\u00A0]?</beforebreak>
4753
4791
  <afterbreak>\p{Ll}</afterbreak>
4754
4792
  </rule>
4755
4793
  <!-- max min etc -->
4756
4794
  <rule break="no">
4757
- <beforebreak>\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4795
+ <beforebreak>\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4758
4796
  <afterbreak>\p{Ll}</afterbreak>
4759
4797
  </rule>
4760
4798
  <!-- Composed abbrev. -->
4761
4799
  <rule break="no">
4762
- <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4800
+ <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4763
4801
  <afterbreak/>
4764
4802
  </rule>
4765
4803
  <!-- Units -->
4766
4804
  <rule break="no">
4767
- <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4805
+ <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4768
4806
  <afterbreak>\p{Ll}</afterbreak>
4769
4807
  </rule>
4770
4808
  <rule break="yes">
4771
- <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*\s</beforebreak>
4809
+ <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*[\s\u00A0]</beforebreak>
4772
4810
  <afterbreak></afterbreak>
4773
4811
  </rule>
4774
4812
  <rule break="yes">
4775
- <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+\s</beforebreak>
4813
+ <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0]</beforebreak>
4776
4814
  <afterbreak>[¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
4777
4815
  </rule>
4778
4816
  <!-- paragraphs with opening "»" in dialogs-->
4779
4817
  <rule break="yes">
4780
- <beforebreak>[\.:!?…»]+\s</beforebreak>
4781
- <afterbreak>»[^\s\.:!?…]</afterbreak>
4818
+ <beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak>
4819
+ <afterbreak>»[^\u00A0\s\.:!?…]</afterbreak>
4782
4820
  </rule>
4783
4821
  </languagerule>
4784
4822
  <languagerule languagerulename="German">
@@ -4792,17 +4830,17 @@
4792
4830
  </rule>
4793
4831
  <!-- Support simple lists in Markdown style -->
4794
4832
  <rule break="yes">
4795
- <beforebreak>\r?\n\s*[-*]+\s</beforebreak>
4833
+ <beforebreak>\r?\n[\u00A0\s]*[-*]+[\u00A0\s]</beforebreak>
4796
4834
  <afterbreak></afterbreak>
4797
4835
  </rule>
4798
4836
  <!-- Split at e.g. "1a. Und ..." -->
4799
4837
  <rule break="yes">
4800
- <beforebreak>\d+[a-z]\.\s</beforebreak>
4838
+ <beforebreak>\d+[a-z]\.[\u00A0\s]</beforebreak>
4801
4839
  <afterbreak>\p{Lu}</afterbreak>
4802
4840
  </rule>
4803
4841
  <!-- Don't split at e.g. "d. h." -->
4804
4842
  <rule break="no">
4805
- <beforebreak>[^-\p{L}'’/]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?\s</beforebreak>
4843
+ <beforebreak>[^-\p{L}'’/]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
4806
4844
  <afterbreak></afterbreak>
4807
4845
  </rule>
4808
4846
  <rule break="no">
@@ -4826,7 +4864,7 @@
4826
4864
  <afterbreak>3|Buzz|Crozz</afterbreak>
4827
4865
  </rule>
4828
4866
  <rule break="no">
4829
- <beforebreak>[1-3]\.\s</beforebreak>
4867
+ <beforebreak>[1-3]\.[\u00A0\s]</beforebreak>
4830
4868
  <afterbreak>Liga|Bundesliga|Fußball(-B|b)undesliga</afterbreak>
4831
4869
  </rule>
4832
4870
  <rule break="no">
@@ -4841,126 +4879,126 @@
4841
4879
  <!-- Don't split after a white-space followed by a single letter followed
4842
4880
  by a dot followed by another whitespace. e.g. " p. " -->
4843
4881
  <rule break="no">
4844
- <beforebreak>\s\p{L}\.\s</beforebreak>
4882
+ <beforebreak>[\u00A0\s]\p{L}\.[\u00A0\s]</beforebreak>
4845
4883
  <afterbreak>\p{L}\.</afterbreak>
4846
4884
  </rule>
4847
4885
  <!-- Don't split at "bla bla... yada yada" -->
4848
4886
  <rule break="no">
4849
- <beforebreak>[\[\(]?\.\.\.[\]\)]?\s</beforebreak>
4887
+ <beforebreak>[\[\(]?\.\.\.[\]\)]?[\u00A0\s]</beforebreak>
4850
4888
  <afterbreak>\p{Ll}</afterbreak>
4851
4889
  </rule>
4852
4890
  <!-- Don't split [.?!] when they're quoted -->
4853
4891
  <rule break="no">
4854
- <beforebreak>['"„][\.!?…]['"“]\s</beforebreak>
4892
+ <beforebreak>['"„][\.!?…]['"“][\u00A0\s]</beforebreak>
4855
4893
  <afterbreak></afterbreak>
4856
4894
  </rule>
4857
4895
  <!-- Don't break after quote unless there's a capital letter
4858
4896
  e.g.: "That's right!" he said. -->
4859
4897
  <rule break="no">
4860
- <beforebreak>["'“]\s</beforebreak>
4898
+ <beforebreak>["'“][\u00A0\s]</beforebreak>
4861
4899
  <afterbreak>\p{Ll}</afterbreak>
4862
4900
  </rule>
4863
4901
  <!-- e.g. "Das ist . so." - assume one sentence. -->
4864
4902
  <rule break="no">
4865
- <beforebreak>\s([\.!?]{1,3}|…)['|"|“|«|\)|\]|\}]?\s</beforebreak>
4903
+ <beforebreak>[\u00A0\s]([\.!?]{1,3}|…)['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
4866
4904
  <afterbreak></afterbreak>
4867
4905
  </rule>
4868
4906
  <!-- Numbers, dates e.g. "3.10. datiert" -->
4869
4907
  <rule break="no">
4870
- <beforebreak>\b\d+\.\s</beforebreak>
4908
+ <beforebreak>\b\d+\.[\u00A0\s]</beforebreak>
4871
4909
  <afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
4872
4910
  </rule>
4873
4911
  <!-- e.g. "Das hier ist ein(!) Satz." -->
4874
4912
  <rule break="no">
4875
- <beforebreak>[\(\[][!?]{1,3}[\]\)]\s</beforebreak>
4913
+ <beforebreak>[\(\[][!?]{1,3}[\]\)][\u00A0\s]</beforebreak>
4876
4914
  <afterbreak></afterbreak>
4877
4915
  </rule>
4878
4916
  <!-- e.g. "Das hier ist (genau!) ein Satz." -->
4879
4917
  <rule break="no">
4880
- <beforebreak>[!?]{1,3}[\)\]]\s</beforebreak>
4918
+ <beforebreak>[!?]{1,3}[\)\]][\u00A0\s]</beforebreak>
4881
4919
  <afterbreak></afterbreak>
4882
4920
  </rule>
4883
4921
  <!-- e.g. "bla (...) blubb" -> not the end of a sentence -->
4884
4922
  <rule break="no">
4885
- <beforebreak>[\(\)\[\]]\s</beforebreak>
4923
+ <beforebreak>[\(\)\[\]][\u00A0\s]</beforebreak>
4886
4924
  <afterbreak></afterbreak>
4887
4925
  </rule>
4888
4926
  <!-- don't split at cases like "Friedrich II. wird auch..." -->
4889
4927
  <rule break="no">
4890
- <beforebreak>[\s ][IVX]+\.\s</beforebreak>
4928
+ <beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]</beforebreak>
4891
4929
  <afterbreak>[^\p{Lu}]+</afterbreak>
4892
4930
  </rule>
4893
4931
  <!-- don't split at cases like "im 13. oder 14. Jahrhundert" -->
4894
4932
  <rule break="no">
4895
- <beforebreak>\d+\.\s</beforebreak>
4896
- <afterbreak>(und|oder|bis)\s</afterbreak>
4933
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4934
+ <afterbreak>(und|oder|bis)[\u00A0\s]</afterbreak>
4897
4935
  </rule>
4898
4936
  <!-- some German month names that may be preceded by a number
4899
4937
  without a sentence boundary being detected
4900
4938
  (e.g. "am 13. Dezember" -> no sentence boundary) -->
4901
4939
  <rule break="no">
4902
- <beforebreak>\d+\.\s</beforebreak>
4940
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4903
4941
  <afterbreak>Januar|Jänner|Februar|März|Merz|April|Mai|Ju[ln]i|August|September|Oktober|November|Dezember</afterbreak>
4904
4942
  </rule>
4905
4943
  <rule break="no">
4906
- <beforebreak>\d+\.\s</beforebreak>
4944
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4907
4945
  <afterbreak>J[aä]n|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez</afterbreak>
4908
4946
  </rule>
4909
4947
  <rule break="no">
4910
- <beforebreak>(Jan|Jän|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez)\.\s</beforebreak>
4948
+ <beforebreak>(Jan|Jän|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez)\.[\u00A0\s]</beforebreak>
4911
4949
  <afterbreak>\d\d(\d\d)?</afterbreak>
4912
4950
  </rule>
4913
4951
  <!-- similar cases other than month names -->
4914
4952
  <rule break="no">
4915
- <beforebreak>\d+\.\s</beforebreak>
4953
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4916
4954
  <afterbreak>Amtsperiode|Breitengrads?|Breitengrades|Jubiläum|Jhd?|Jhdts?|Konferenz|(Jahres|Partei)(-K|k)onferenz|Längengrade?s?|Tags?|Tages|(Jahres|Spiel|Partei|Geburts)tag|(Jahres|Spiel|Partei|Geburts)tages|(Jahres|Spiel|Partei|Geburts)tags|Jahrhunderts?|Jahrtausend|Platz|Platzes|Lebensjahrs?|Lebensjahres|Lochs?|Loches|Grads|Grades|Obergeschoss|Stock(werk)?s?|Etage|Klasse|Runde|Bezirk|Etappe|Staffel|Sinfonie</afterbreak>
4917
4955
  </rule>
4918
4956
  <!-- English abbreviations - but these work globally for all languages -->
4919
4957
  <rule break="no">
4920
- <beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd)\.\s</beforebreak>
4958
+ <beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd)\.[\u00A0\s]</beforebreak>
4921
4959
  <afterbreak></afterbreak>
4922
4960
  </rule>
4923
4961
  <!-- Latin abbreviations - but these work globally for all languages -->
4924
4962
  <rule break="no">
4925
- <beforebreak>\b(spp?)\.\s</beforebreak>
4963
+ <beforebreak>\b(spp?)\.[\u00A0\s]</beforebreak>
4926
4964
  <afterbreak></afterbreak>
4927
4965
  </rule>
4928
4966
  <!-- German abbreviations -->
4929
4967
  <rule break="no">
4930
- <beforebreak>\b(ggü|Mag|mtl|versch|d|Übers|usw|Bzw|bzw|Ab[hkst]|abzgl|Abzw|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|autom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw)\.\s</beforebreak>
4968
+ <beforebreak>\b(ggü|Mag|mtl|versch|d|Übers|usw|Bzw|bzw|Ab[hkst]|abzgl|bezgl|Abzw|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|autom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw)\.[\u00A0\s]</beforebreak>
4931
4969
  <afterbreak></afterbreak>
4932
4970
  </rule>
4933
4971
  <rule break="no">
4934
- <beforebreak>\b(cts?|Ca|ca|chem|chin|Chr|cresc|dat|Dat|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|eigtl|Eigtl|Engl|engl|Erg|al|et[cw]|Etw|ev(tl)?|Evtl|exkl|Expl|Exz)\.\s</beforebreak>
4972
+ <beforebreak>\b(cts?|Ca|ca|chem|chin|Chr|cresc|dat|Dat|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|eigtl|Eigtl|eigl|Eigl|akt|Engl|engl|Erg|al|et[cw]|Etw|ev(tl)?|Evtl|exkl|Expl|Exz)\.[\u00A0\s]</beforebreak>
4935
4973
  <afterbreak></afterbreak>
4936
4974
  </rule>
4937
4975
  <rule break="no">
4938
- <beforebreak>\bDipl\.-[A-Z][a-z]{2,4}\.\s</beforebreak>
4976
+ <beforebreak>\bDipl\.-[A-Z][a-z]{2,4}\.[\u00A0\s]</beforebreak>
4939
4977
  <afterbreak></afterbreak>
4940
4978
  </rule>
4941
4979
  <rule break="no">
4942
- <beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|frz?|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.\s</beforebreak>
4980
+ <beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|frz?|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]</beforebreak>
4943
4981
  <afterbreak></afterbreak>
4944
4982
  </rule>
4945
4983
  <rule break="no">
4946
- <beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|inkl|incl|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.\s</beforebreak>
4984
+ <beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|inkl|incl|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]</beforebreak>
4947
4985
  <afterbreak></afterbreak>
4948
4986
  </rule>
4949
4987
  <rule break="no">
4950
- <beforebreak>\b(lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.\s</beforebreak>
4988
+ <beforebreak>\b(lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.[\u00A0\s]</beforebreak>
4951
4989
  <afterbreak></afterbreak>
4952
4990
  </rule>
4953
4991
  <rule break="no">
4954
- <beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|Std?|stacc|Str|stud|Subst|sva|svw|sZ)\.\s</beforebreak>
4992
+ <beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|Std?|stacc|Str|stud|Subst|sva|svw|sZ)\.[\u00A0\s]</beforebreak>
4955
4993
  <afterbreak></afterbreak>
4956
4994
  </rule>
4957
4995
  <rule break="no">
4958
- <beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|sw|vgl|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.\s</beforebreak>
4996
+ <beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|sw|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]</beforebreak>
4959
4997
  <afterbreak></afterbreak>
4960
4998
  </rule>
4961
4999
  <!-- Break rules -->
4962
5000
  <rule break="yes">
4963
- <beforebreak>[\.!?…][\u0002|'|"|“|«|‹|\)|\]|\}¹²³]?\s+</beforebreak>
5001
+ <beforebreak>[\.!?…][\u0002|'|"|“|«|‹|\)|\]|\}¹²³]?[\u00A0\s]+</beforebreak>
4964
5002
  <afterbreak></afterbreak>
4965
5003
  </rule>
4966
5004
  <rule break="yes">
@@ -4968,7 +5006,7 @@
4968
5006
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
4969
5007
  </rule>
4970
5008
  <rule break="yes">
4971
- <beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
5009
+ <beforebreak>[\u00A0\s]\p{L}[\.!?…][\u00A0\s]</beforebreak>
4972
5010
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
4973
5011
  </rule>
4974
5012
  <!-- z.B. 2 sentences: “Liebst du mich?” “Ja!” -->
@@ -5184,11 +5222,19 @@
5184
5222
  </languagerule>
5185
5223
  <languagerule languagerulename="French">
5186
5224
  <rule break="no">
5187
- <beforebreak>Yahoo!\s</beforebreak>
5225
+ <beforebreak>[\s\u00A0]</beforebreak>
5226
+ <afterbreak>[»”’"'›]</afterbreak>
5227
+ </rule>
5228
+ <rule break="yes">
5229
+ <beforebreak>[\.!?][\s\u00A0][»”’"'›][\s\u00A0]</beforebreak>
5230
+ <afterbreak>[«“‘‹"'\p{Lu}]</afterbreak>
5231
+ </rule>
5232
+ <rule break="no">
5233
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
5188
5234
  <afterbreak>\p{Ll}</afterbreak>
5189
5235
  </rule>
5190
5236
  <rule break="yes">
5191
- <beforebreak>\.\[\d+\]\s</beforebreak>
5237
+ <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
5192
5238
  <afterbreak></afterbreak>
5193
5239
  </rule>
5194
5240
  <rule break="no"><!-- URLs without "www."-->
@@ -5206,15 +5252,15 @@
5206
5252
  </rule>
5207
5253
  <!-- French abbreviations -->
5208
5254
  <rule break="no">
5209
- <beforebreak>\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op)\.\s</beforebreak>
5255
+ <beforebreak>\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op|ex)\.[\s\u00A0]</beforebreak>
5210
5256
  <afterbreak>\p{Ll}</afterbreak>
5211
5257
  </rule>
5212
5258
  <rule break="no">
5213
- <beforebreak>\b(etc)\.\)\s</beforebreak>
5259
+ <beforebreak>\b(etc)\.\)[\s\u00A0]</beforebreak>
5214
5260
  <afterbreak></afterbreak>
5215
5261
  </rule>
5216
5262
  <rule break="no">
5217
- <beforebreak>\b(apr|ave?|boul|Mr?|Mrs|MM?|Mlle)\.\s</beforebreak>
5263
+ <beforebreak>\b(apr|ave?|boul|Mr?|Mrs|MM?|Mlle)\.[\s\u00A0]</beforebreak>
5218
5264
  <afterbreak></afterbreak>
5219
5265
  </rule>
5220
5266
  <rule break="no">
@@ -5230,39 +5276,39 @@
5230
5276
  <afterbreak>\p{Ll}</afterbreak>
5231
5277
  </rule>
5232
5278
  <rule break="no">
5233
- <beforebreak>["”'’]\s*</beforebreak>
5234
- <afterbreak>\s*\p{Ll}</afterbreak>
5279
+ <beforebreak>["”'’][\s\u00A0]*</beforebreak>
5280
+ <afterbreak>[\s\u00A0]*\p{Ll}</afterbreak>
5235
5281
  </rule>
5236
5282
  <rule break="no">
5237
- <beforebreak>['"„][\.!?…]['"”]\s</beforebreak>
5283
+ <beforebreak>['"„][\.!?…]['"”][\s\u00A0]</beforebreak>
5238
5284
  <afterbreak></afterbreak>
5239
5285
  </rule>
5240
5286
  <rule break="no">
5241
- <beforebreak>\b\p{L}\.\s</beforebreak>
5242
- <afterbreak>\p{L}\.\s</afterbreak>
5287
+ <beforebreak>\b\p{L}\.[\s\u00A0]</beforebreak>
5288
+ <afterbreak>\p{L}\.[\s\u00A0]</afterbreak>
5243
5289
  </rule>
5244
5290
  <rule break="no">
5245
5291
  <beforebreak>\b\p{L}\.</beforebreak>
5246
5292
  <afterbreak>\p{L}\.</afterbreak>
5247
5293
  </rule>
5248
5294
  <rule break="no"><!-- Je suis (...) Chris. -->
5249
- <beforebreak>(…|\.\.\.)\s?\)\s</beforebreak>
5295
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
5250
5296
  <afterbreak>[^\p{P}]</afterbreak>
5251
5297
  </rule>
5252
5298
  <rule break="no"><!-- Je suis (...?) Chris. -->
5253
- <beforebreak>(…|\.\.\.)\s?\?\)\s</beforebreak>
5299
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
5254
5300
  <afterbreak>[^\p{P}]</afterbreak>
5255
5301
  </rule>
5256
5302
  <rule break="no"><!-- Jones v. Smith -->
5257
- <beforebreak>\p{Lu}\p{L}+\sv\.\s</beforebreak>
5303
+ <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
5258
5304
  <afterbreak>\p{Lu}\p{L}+</afterbreak>
5259
5305
  </rule>
5260
5306
  <rule break="yes">
5261
- <beforebreak>[^,][\s]\p{L}{2}\.\s</beforebreak>
5262
- <afterbreak>\p{N}+\)\s</afterbreak>
5307
+ <beforebreak>[^,][\s\u00A0]\p{L}{2}\.[\s\u00A0]</beforebreak>
5308
+ <afterbreak>\p{N}+\)[\s\u00A0]</afterbreak>
5263
5309
  </rule>
5264
5310
  <rule break="no">
5265
- <beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak>
5311
+ <beforebreak>[\.\s\u00A0]\p{L}{1,2}\.[\s\u00A0]</beforebreak>
5266
5312
  <afterbreak>[\p{N}\p{Ll}]</afterbreak>
5267
5313
  </rule>
5268
5314
  <rule break="no">
@@ -5270,31 +5316,31 @@
5270
5316
  <afterbreak>[^\p{Lu}]</afterbreak>
5271
5317
  </rule>
5272
5318
  <rule break="no">
5273
- <beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
5319
+ <beforebreak>\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0]</beforebreak>
5274
5320
  <afterbreak></afterbreak>
5275
5321
  </rule>
5276
5322
  <rule break="no">
5277
- <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
5323
+ <beforebreak>\b\p{Lu}\.\p{Lu}\.[\s\u00A0]</beforebreak>
5278
5324
  <afterbreak></afterbreak>
5279
5325
  </rule>
5280
5326
  <rule break="no">
5281
- <beforebreak>[^\.]\s[A-Z]\.\s</beforebreak>
5327
+ <beforebreak>[^\.][\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
5282
5328
  <afterbreak></afterbreak>
5283
5329
  </rule>
5284
5330
  <rule break="no">
5285
- <beforebreak>\b(:?Blvd|Ave|Mts?)\.\s</beforebreak>
5331
+ <beforebreak>\b(:?Blvd|Ave|Mts?)\.[\s\u00A0]</beforebreak>
5286
5332
  <afterbreak>\p{Ll}+</afterbreak>
5287
5333
  </rule>
5288
5334
  <rule break="no">
5289
- <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.\s</beforebreak>
5335
+ <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0]</beforebreak>
5290
5336
  <afterbreak>\p{Ll}+</afterbreak>
5291
5337
  </rule>
5292
5338
  <rule break="no">
5293
- <beforebreak>\(\p{Ll}+\.\s</beforebreak>
5339
+ <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
5294
5340
  <afterbreak></afterbreak>
5295
5341
  </rule>
5296
5342
  <rule break="no"><!-- i.e. -->
5297
- <beforebreak>i\.e\.\s</beforebreak><!-- "i.e." is never at end of sentence -->
5343
+ <beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
5298
5344
  <afterbreak></afterbreak>
5299
5345
  </rule>
5300
5346
  <rule break="no"><!-- U.S.A (no dot at end) -->
@@ -5310,28 +5356,28 @@
5310
5356
  <afterbreak>[SK]\b</afterbreak>
5311
5357
  </rule>
5312
5358
  <rule break="no"><!-- No. 5 -->
5313
- <beforebreak>\b[nN]o\.\s</beforebreak>
5359
+ <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
5314
5360
  <afterbreak>\p{N}</afterbreak>
5315
5361
  </rule>
5316
5362
  <rule break="no"><!-- Ph.D. -->
5317
- <beforebreak>\bP[Hh]\.\s?</beforebreak>
5363
+ <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
5318
5364
  <afterbreak>D\.?</afterbreak>
5319
5365
  </rule>
5320
5366
  <rule break="no"><!-- e.g. -->
5321
- <beforebreak>\be\.g\.\s</beforebreak>
5367
+ <beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
5322
5368
  <afterbreak></afterbreak>
5323
5369
  </rule>
5324
5370
  <rule break="no"><!-- vs. -->
5325
- <beforebreak>\bvs\.\s</beforebreak>
5371
+ <beforebreak>\bvs\.[\s\u00A0]</beforebreak>
5326
5372
  <afterbreak></afterbreak>
5327
5373
  </rule>
5328
5374
  <!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
5329
5375
  <rule break="no"><!-- Etc. -->
5330
- <beforebreak>\b[Ee]tc\.\s</beforebreak>
5376
+ <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
5331
5377
  <afterbreak>[^\p{Lu}]</afterbreak>
5332
5378
  </rule>
5333
5379
  <rule break="no"><!-- BTW (by the way) -->
5334
- <beforebreak>\b([Bb]tw|BTW)\.\s</beforebreak>
5380
+ <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
5335
5381
  <afterbreak></afterbreak>
5336
5382
  </rule>
5337
5383
  <rule break="no">
@@ -5343,64 +5389,64 @@
5343
5389
  <afterbreak>3|Buzz|Crozz</afterbreak>
5344
5390
  </rule>
5345
5391
  <rule break="no"><!-- Ph.D. (see rule PH_D) -->
5346
- <beforebreak>\bP[Hh]\.?\s?[Dd]\.\s</beforebreak>
5392
+ <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
5347
5393
  <afterbreak></afterbreak>
5348
5394
  </rule>
5349
5395
  <rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
5350
- <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.\s</beforebreak>
5396
+ <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
5351
5397
  <afterbreak></afterbreak>
5352
5398
  </rule>
5353
5399
  <rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
5354
- <beforebreak>\bLL\.\s?[BM]\.\s</beforebreak>
5400
+ <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
5355
5401
  <afterbreak></afterbreak>
5356
5402
  </rule>
5357
5403
  <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
5358
- <beforebreak>\b[BM]\.\s?</beforebreak>
5404
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5359
5405
  <afterbreak>Eng\.?</afterbreak>
5360
5406
  </rule>
5361
5407
  <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
5362
- <beforebreak>\bLL\.\s?</beforebreak>
5408
+ <beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
5363
5409
  <afterbreak>[BM]\.?</afterbreak>
5364
5410
  </rule>
5365
5411
  <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
5366
- <beforebreak>\b[BM]\.\s?</beforebreak>
5412
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5367
5413
  <afterbreak>Sc\.?</afterbreak>
5368
5414
  </rule>
5369
5415
  <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
5370
- <beforebreak>\b[BM]\.\s?</beforebreak>
5416
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5371
5417
  <afterbreak>Comp?\.?</afterbreak>
5372
5418
  </rule>
5373
5419
  <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
5374
- <beforebreak>\b[BM]\.\s?</beforebreak>
5420
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5375
5421
  <afterbreak>Arch\.?</afterbreak>
5376
5422
  </rule>
5377
5423
  <rule break="no">
5378
- <beforebreak>\b[BM]\.?\s?(Sc|Eng|Comp|Arch)\.\s</beforebreak>
5424
+ <beforebreak>\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0]</beforebreak>
5379
5425
  <afterbreak></afterbreak>
5380
5426
  </rule>
5381
5427
  <rule break="no">
5382
- <beforebreak>\bI(nc|NC)\.\s</beforebreak>
5428
+ <beforebreak>\bI(nc|NC)\.[\s\u00A0]</beforebreak>
5383
5429
  <afterbreak></afterbreak>
5384
5430
  </rule>
5385
5431
  <rule break="no">
5386
- <beforebreak>\bCorp\.\s</beforebreak>
5432
+ <beforebreak>\bCorp\.[\s\u00A0]</beforebreak>
5387
5433
  <afterbreak></afterbreak>
5388
5434
  </rule>
5389
5435
  <rule break="no">
5390
- <beforebreak>\bBros\.\s</beforebreak>
5436
+ <beforebreak>\bBros\.[\s\u00A0]</beforebreak>
5391
5437
  <afterbreak></afterbreak>
5392
5438
  </rule>
5393
5439
  <rule break="no">
5394
- <beforebreak>\bLtd\.\s</beforebreak>
5440
+ <beforebreak>\bLtd\.[\s\u00A0]</beforebreak>
5395
5441
  <afterbreak>\p{Ll}+</afterbreak>
5396
5442
  </rule>
5397
5443
  <rule break="no">
5398
- <beforebreak>\bCo\.\s</beforebreak>
5444
+ <beforebreak>\bCo\.[\s\u00A0]</beforebreak>
5399
5445
  <afterbreak></afterbreak>
5400
5446
  </rule>
5401
5447
  <!-- Break rules -->
5402
5448
  <rule break="yes">
5403
- <beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}¹²³]?\s+</beforebreak>
5449
+ <beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}¹²³]?[\s\u00A0]+</beforebreak>
5404
5450
  <afterbreak></afterbreak>
5405
5451
  </rule>
5406
5452
  <rule break="yes">
@@ -5408,7 +5454,7 @@
5408
5454
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
5409
5455
  </rule>
5410
5456
  <rule break="yes">
5411
- <beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
5457
+ <beforebreak>[\s\u00A0]\p{L}[\.!?…][\s\u00A0]</beforebreak>
5412
5458
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
5413
5459
  </rule>
5414
5460
  </languagerule>
@@ -5556,10 +5602,14 @@
5556
5602
  </rule>
5557
5603
  <!-- abbreviation with proper noun: проф. Грицько, о. Лісове -->
5558
5604
  <rule break="no">
5559
- <beforebreak>\b([Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|ім|упоряд|чл\.-кор)\.[\h\v]*</beforebreak>
5605
+ <beforebreak>\b([Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|ім|упоряд|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak>
5560
5606
  <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
5561
5607
  </rule>
5562
- <!-- смерть гр. Болтаровича -->
5608
+ <rule break="no">
5609
+ <beforebreak>\bМан\.[\h\v]*</beforebreak>
5610
+ <afterbreak>[\h\v]*([Сс]іті|[Юю]н)</afterbreak>
5611
+ </rule>
5612
+ <!-- смерть гр. Болтаровича, but not "9 гр." -->
5563
5613
  <rule break="no">
5564
5614
  <beforebreak>[^0-9][\h\v]+[Гг]р\.[\h\v]*</beforebreak>
5565
5615
  <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
@@ -5567,7 +5617,7 @@
5567
5617
  <!-- арт. - артикул; мал. - малюнок; рис. - рисунок (each followed by a number) -->
5568
5618
  <!-- TODO: арт. - артист -->
5569
5619
  <rule break="no">
5570
- <beforebreak>\bарт\.[\h\v]*</beforebreak>
5620
+ <beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис)\.[\h\v]*</beforebreak>
5571
5621
  <afterbreak>[\h\v]*[0-9]</afterbreak>
5572
5622
  </rule>
5573
5623
  <!-- ХІІ р., 3-6 арт. -->