srx-languagetool 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/srx/segment.srx CHANGED
@@ -1102,12 +1102,16 @@
1102
1102
  </rule>
1103
1103
  </languagerule>
1104
1104
  <languagerule languagerulename="English">
1105
+ <rule break="no">
1106
+ <beforebreak>[\u00A0\s]</beforebreak>
1107
+ <afterbreak>\n</afterbreak>
1108
+ </rule>
1105
1109
  <rule break="no"><!-- Hello (Hi! ) my name is Chris -->
1106
- <beforebreak>[a-zA-Z][!\?]\s</beforebreak>
1107
- <afterbreak>\)\s[a-zA-Z]</afterbreak>
1110
+ <beforebreak>[a-zA-Z][!\?][\s\u00A0]</beforebreak>
1111
+ <afterbreak>\)[\s\u00A0][a-zA-Z]</afterbreak>
1108
1112
  </rule>
1109
1113
  <rule break="no">
1110
- <beforebreak>Yahoo!\s</beforebreak>
1114
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
1111
1115
  <afterbreak>\p{Ll}</afterbreak>
1112
1116
  </rule>
1113
1117
  <rule break="no"><!-- U.S.A (no dot at end) -->
@@ -1118,6 +1122,10 @@
1118
1122
  <beforebreak>\bA\.</beforebreak>
1119
1123
  <afterbreak>I\b</afterbreak>
1120
1124
  </rule>
1125
+ <rule break="no"><!-- S.I (no dot at end) -->
1126
+ <beforebreak>\bS\.</beforebreak>
1127
+ <afterbreak>I\b</afterbreak>
1128
+ </rule>
1121
1129
  <rule break="no"><!-- L.A (no dot at end) -->
1122
1130
  <beforebreak>\bL\.</beforebreak>
1123
1131
  <afterbreak>A\b</afterbreak>
@@ -1135,96 +1143,96 @@
1135
1143
  <afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl|be|dev|co|fr|dk|se)(\.|\b)</afterbreak>
1136
1144
  </rule>
1137
1145
  <rule break="no"><!-- No. 5 -->
1138
- <beforebreak>\b[nN]o\.\s</beforebreak>
1146
+ <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
1139
1147
  <afterbreak>\p{N}</afterbreak>
1140
1148
  </rule>
1141
1149
  <rule break="no"><!-- Ph.D. -->
1142
- <beforebreak>\bP[Hh]\.\s?</beforebreak>
1150
+ <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
1143
1151
  <afterbreak>D\.?</afterbreak>
1144
1152
  </rule>
1145
1153
  <rule break="no"><!-- min. -->
1146
- <beforebreak>\b([Ee]d|pp|[Vv]iz|i\.?\s*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Mm]in|max|[Gg]ovt|lb|lbf|ft|c\.?\s*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.\s</beforebreak>
1154
+ <beforebreak>\b([Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Mm]in|max|[Gg]ovt|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(?:oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
1147
1155
  <afterbreak>[^\p{Lu}]|I</afterbreak>
1148
1156
  </rule>
1149
1157
  <rule break="no"><!-- hr. -->
1150
- <beforebreak>\b(hr)\.\s</beforebreak>
1158
+ <beforebreak>\b(hr)\.[\s\u00A0]</beforebreak>
1151
1159
  <afterbreak>[^\p{Lu}]|I</afterbreak>
1152
1160
  </rule>
1153
1161
  <rule break="no"><!-- Fig. 8 -->
1154
- <beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.\s</beforebreak>
1162
+ <beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
1155
1163
  <afterbreak>\p{N}|[IXV]+</afterbreak>
1156
1164
  </rule>
1157
1165
  <rule break="no"><!-- Fig. (8) -->
1158
- <beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.\s</beforebreak>
1166
+ <beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
1159
1167
  <afterbreak>\(\p{N}\)</afterbreak>
1160
1168
  </rule>
1161
1169
  <rule break="no"><!-- I'm (...) great! -->
1162
- <beforebreak>(…|\.\.\.)\s?\)\s</beforebreak>
1170
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
1163
1171
  <afterbreak>[^\p{P}]</afterbreak>
1164
1172
  </rule>
1165
1173
  <rule break="no"><!-- I will work with someone (Chris or ...?). -->
1166
- <beforebreak>(…|\.\.\.)\s?\?\)\s</beforebreak>
1174
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
1167
1175
  <afterbreak>[^\p{P}]</afterbreak>
1168
1176
  </rule>
1169
1177
  <rule break="no"><!-- e.g. -->
1170
- <beforebreak>\be\.g\.\s</beforebreak>
1178
+ <beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
1171
1179
  <afterbreak></afterbreak>
1172
1180
  </rule>
1173
1181
  <rule break="no"><!-- vs. -->
1174
- <beforebreak>\bvs\.\s</beforebreak>
1182
+ <beforebreak>\bvs\.[\s\u00A0]</beforebreak>
1175
1183
  <afterbreak></afterbreak>
1176
1184
  </rule>
1177
1185
  <rule break="no"><!-- esp. -->
1178
- <beforebreak>\be[sx]p\.\s</beforebreak>
1186
+ <beforebreak>\be[sx]p\.[\s\u00A0]</beforebreak>
1179
1187
  <afterbreak></afterbreak>
1180
1188
  </rule>
1181
1189
  <!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
1182
1190
  <rule break="no"><!-- Etc. -->
1183
- <beforebreak>\b[Ee]tc\.\s</beforebreak>
1191
+ <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
1184
1192
  <afterbreak>[^\p{Lu}]</afterbreak>
1185
1193
  </rule>
1186
1194
  <rule break="no"><!-- BTW (by the way) -->
1187
- <beforebreak>\b([Bb]tw|BTW)\.\s</beforebreak>
1195
+ <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
1188
1196
  <afterbreak></afterbreak>
1189
1197
  </rule>
1190
1198
  <rule break="no">
1191
- <beforebreak>\bJan\.\s</beforebreak>
1199
+ <beforebreak>\bJan\.[\s\u00A0]</beforebreak>
1192
1200
  <afterbreak></afterbreak>
1193
1201
  </rule>
1194
1202
  <rule break="no">
1195
- <beforebreak>\bFeb\.\s</beforebreak>
1203
+ <beforebreak>\bFeb\.[\s\u00A0]</beforebreak>
1196
1204
  <afterbreak></afterbreak>
1197
1205
  </rule>
1198
1206
  <rule break="no">
1199
- <beforebreak>\bMar\.\s</beforebreak>
1207
+ <beforebreak>\bMar\.[\s\u00A0]</beforebreak>
1200
1208
  <afterbreak></afterbreak>
1201
1209
  </rule>
1202
1210
  <rule break="no">
1203
- <beforebreak>\bApr\.\s</beforebreak>
1211
+ <beforebreak>\bApr\.[\s\u00A0]</beforebreak>
1204
1212
  <afterbreak></afterbreak>
1205
1213
  </rule>
1206
1214
  <rule break="no">
1207
- <beforebreak>\bJu[nl]\.\s</beforebreak>
1215
+ <beforebreak>\bJu[nl]\.[\s\u00A0]</beforebreak>
1208
1216
  <afterbreak></afterbreak>
1209
1217
  </rule>
1210
1218
  <rule break="no">
1211
- <beforebreak>\bAug\.\s</beforebreak>
1219
+ <beforebreak>\bAug\.[\s\u00A0]</beforebreak>
1212
1220
  <afterbreak></afterbreak>
1213
1221
  </rule>
1214
1222
  <rule break="no">
1215
- <beforebreak>\bSept?\.\s</beforebreak>
1223
+ <beforebreak>\bSept?\.[\s\u00A0]</beforebreak>
1216
1224
  <afterbreak></afterbreak>
1217
1225
  </rule>
1218
1226
  <rule break="no">
1219
- <beforebreak>\bOct\.\s</beforebreak>
1227
+ <beforebreak>\bOct\.[\s\u00A0]</beforebreak>
1220
1228
  <afterbreak></afterbreak>
1221
1229
  </rule>
1222
1230
  <rule break="no">
1223
- <beforebreak>\bNov\.\s</beforebreak>
1231
+ <beforebreak>\bNov\.[\s\u00A0]</beforebreak>
1224
1232
  <afterbreak></afterbreak>
1225
1233
  </rule>
1226
1234
  <rule break="no">
1227
- <beforebreak>\bDec\.\s</beforebreak>
1235
+ <beforebreak>\bDec\.[\s\u00A0]</beforebreak>
1228
1236
  <afterbreak></afterbreak>
1229
1237
  </rule>
1230
1238
  <rule break="no">
@@ -1236,43 +1244,43 @@
1236
1244
  <afterbreak>3|Buzz|Crozz</afterbreak>
1237
1245
  </rule>
1238
1246
  <rule break="no"><!-- Ph.D. (see rule PH_D) -->
1239
- <beforebreak>\bP[Hh]\.?\s?[Dd]\.\s</beforebreak>
1247
+ <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
1240
1248
  <afterbreak></afterbreak>
1241
1249
  </rule>
1242
1250
  <rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
1243
- <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.\s</beforebreak>
1251
+ <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
1244
1252
  <afterbreak></afterbreak>
1245
1253
  </rule>
1246
1254
  <rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
1247
- <beforebreak>\bLL\.\s?[BM]\.\s</beforebreak>
1255
+ <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
1248
1256
  <afterbreak></afterbreak>
1249
1257
  </rule>
1250
1258
  <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
1251
- <beforebreak>\b[BM]\.\s?</beforebreak>
1259
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1252
1260
  <afterbreak>Eng\.?</afterbreak>
1253
1261
  </rule>
1254
1262
  <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
1255
- <beforebreak>\bLL\.\s?</beforebreak>
1263
+ <beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
1256
1264
  <afterbreak>[BM]\.?</afterbreak>
1257
1265
  </rule>
1258
1266
  <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
1259
- <beforebreak>\b[BM]\.\s?</beforebreak>
1267
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1260
1268
  <afterbreak>Sc\.?</afterbreak>
1261
1269
  </rule>
1262
1270
  <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
1263
- <beforebreak>\b[BM]\.\s?</beforebreak>
1271
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1264
1272
  <afterbreak>Comp?\.?</afterbreak>
1265
1273
  </rule>
1266
1274
  <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
1267
- <beforebreak>\b[BM]\.\s?</beforebreak>
1275
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
1268
1276
  <afterbreak>Arch\.?</afterbreak>
1269
1277
  </rule>
1270
1278
  <rule break="no">
1271
- <beforebreak>\b[BM]\.?\s?(Sc|Eng|Comp|Arch)\.\s</beforebreak>
1279
+ <beforebreak>\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0]</beforebreak>
1272
1280
  <afterbreak></afterbreak>
1273
1281
  </rule>
1274
1282
  <rule break="no">
1275
- <beforebreak>\bet\b\s\bal\.\s</beforebreak>
1283
+ <beforebreak>\bet\b[\s\u00A0]\bal\.[\s\u00A0]</beforebreak>
1276
1284
  <afterbreak></afterbreak>
1277
1285
  </rule>
1278
1286
  <rule break="no">
@@ -1280,51 +1288,51 @@
1280
1288
  <afterbreak></afterbreak>
1281
1289
  </rule>
1282
1290
  <rule break="no">
1283
- <beforebreak>\b(Atty|Sg?t|[SG]en|Ft|Gov|Hon|Prof|Mr?s|Mt|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.\s</beforebreak>
1291
+ <beforebreak>\b(Atty|Sg?t|[SG]en|Ft|Gov|Hon|Prof|Mr?s|Mt|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.[\s\u00A0]</beforebreak>
1284
1292
  <afterbreak></afterbreak>
1285
1293
  </rule>
1286
1294
  <rule break="no">
1287
- <beforebreak>\b(Atty|Sg?t|[SG]en|Gov|Hon|Prof|Mr?s|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.\s[A-Z]\.\s</beforebreak>
1295
+ <beforebreak>\b(Atty|Sg?t|[SG]en|Gov|Hon|Prof|Mr?s|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.[\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
1288
1296
  <afterbreak></afterbreak>
1289
1297
  </rule>
1290
1298
  <rule break="no">
1291
- <beforebreak>\b(Drs|Messrs|Mmes)\.\s</beforebreak>
1292
- <afterbreak>(and\s)|\p{Lu}\p{Ll}+</afterbreak>
1299
+ <beforebreak>\b(Drs|Messrs|Mmes)\.[\s\u00A0]</beforebreak>
1300
+ <afterbreak>(and[\s\u00A0])|\p{Lu}\p{Ll}+</afterbreak>
1293
1301
  </rule>
1294
1302
  <rule break="no">
1295
- <beforebreak>\bcf\.\s</beforebreak>
1303
+ <beforebreak>\bcf\.[\s\u00A0]</beforebreak>
1296
1304
  <afterbreak></afterbreak>
1297
1305
  </rule>
1298
1306
  <rule break="no">
1299
- <beforebreak>\bI(nc|NC)\.\s</beforebreak>
1307
+ <beforebreak>\bI(nc|NC)\.[\s\u00A0]</beforebreak>
1300
1308
  <afterbreak></afterbreak>
1301
1309
  </rule>
1302
1310
  <rule break="no">
1303
- <beforebreak>\bCorp\.\s</beforebreak>
1311
+ <beforebreak>\bCorp\.[\s\u00A0]</beforebreak>
1304
1312
  <afterbreak></afterbreak>
1305
1313
  </rule>
1306
1314
  <rule break="no">
1307
- <beforebreak>\bBros\.\s</beforebreak>
1315
+ <beforebreak>\bBros\.[\s\u00A0]</beforebreak>
1308
1316
  <afterbreak></afterbreak>
1309
1317
  </rule>
1310
1318
  <rule break="no">
1311
- <beforebreak>\bDist\.\s</beforebreak>
1319
+ <beforebreak>\bDist\.[\s\u00A0]</beforebreak>
1312
1320
  <afterbreak></afterbreak>
1313
1321
  </rule>
1314
1322
  <rule break="no">
1315
- <beforebreak>\bCo\.\s</beforebreak>
1323
+ <beforebreak>\bCo\.[\s\u00A0]</beforebreak>
1316
1324
  <afterbreak></afterbreak>
1317
1325
  </rule>
1318
1326
  <rule break="no">
1319
- <beforebreak>\bo'clock\s</beforebreak>
1327
+ <beforebreak>\bo'clock[\s\u00A0]</beforebreak>
1320
1328
  <afterbreak></afterbreak>
1321
1329
  </rule>
1322
1330
  <rule break="no">
1323
- <beforebreak>\bfo'c'sle\s</beforebreak>
1331
+ <beforebreak>\bfo'c'sle[\s\u00A0]</beforebreak>
1324
1332
  <afterbreak></afterbreak>
1325
1333
  </rule>
1326
1334
  <rule break="no">
1327
- <beforebreak>\bLtd\.\s</beforebreak>
1335
+ <beforebreak>\bLtd\.[\s\u00A0]</beforebreak>
1328
1336
  <afterbreak>\p{Ll}+</afterbreak>
1329
1337
  </rule>
1330
1338
  <rule break="no">
@@ -1340,35 +1348,35 @@
1340
1348
  <afterbreak>\p{Ll}</afterbreak>
1341
1349
  </rule>
1342
1350
  <rule break="no">
1343
- <beforebreak>["”'’]\s*</beforebreak>
1344
- <afterbreak>\s*\p{Ll}</afterbreak>
1351
+ <beforebreak>["”'’][\s\u00A0]*</beforebreak>
1352
+ <afterbreak>[\s\u00A0]*\p{Ll}</afterbreak>
1345
1353
  </rule>
1346
1354
  <rule break="no">
1347
- <beforebreak>['"„][\.!?…]['"”]\s</beforebreak>
1355
+ <beforebreak>['"„][\.!?…]['"”][\s\u00A0]</beforebreak>
1348
1356
  <afterbreak></afterbreak>
1349
1357
  </rule>
1350
1358
  <rule break="no">
1351
- <beforebreak>\b\p{L}\.\s</beforebreak>
1352
- <afterbreak>\p{L}\.\s</afterbreak>
1359
+ <beforebreak>\b\p{L}\.[\s\u00A0]</beforebreak>
1360
+ <afterbreak>\p{L}\.[\s\u00A0]</afterbreak>
1353
1361
  </rule>
1354
1362
  <rule break="no">
1355
1363
  <beforebreak>\b\p{L}\.</beforebreak>
1356
1364
  <afterbreak>\p{L}\.</afterbreak>
1357
1365
  </rule>
1358
1366
  <rule break="no"><!-- Jones v. Smith -->
1359
- <beforebreak>\p{Lu}\p{L}+\sv\.\s</beforebreak>
1367
+ <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
1360
1368
  <afterbreak>\p{Lu}\p{L}+</afterbreak>
1361
1369
  </rule>
1362
1370
  <rule break="yes">
1363
- <beforebreak>[^,][\s]\p{L}{2}\.\s</beforebreak>
1364
- <afterbreak>\p{N}+\)\s</afterbreak>
1371
+ <beforebreak>[^,][\s\u00A0]\p{L}{2}\.[\s\u00A0]</beforebreak>
1372
+ <afterbreak>\p{N}+\)[\s\u00A0]</afterbreak>
1365
1373
  </rule>
1366
1374
  <rule break="yes">
1367
- <beforebreak>\bOK\.\s</beforebreak>
1375
+ <beforebreak>\bOK\.[\s\u00A0]</beforebreak>
1368
1376
  <afterbreak>\p{Ll}+</afterbreak>
1369
1377
  </rule>
1370
1378
  <rule break="no">
1371
- <beforebreak>[\.\s](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.\s</beforebreak><!-- not 'no'/'in', these could be abbreviations-->
1379
+ <beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak><!-- not 'no'/'in', these could be abbreviations-->
1372
1380
  <afterbreak>[\p{N}\p{Ll}]</afterbreak>
1373
1381
  </rule>
1374
1382
  <rule break="no">
@@ -1376,35 +1384,35 @@
1376
1384
  <afterbreak>[^\p{Lu}]</afterbreak>
1377
1385
  </rule>
1378
1386
  <rule break="no">
1379
- <beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
1387
+ <beforebreak>\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0]</beforebreak>
1380
1388
  <afterbreak></afterbreak>
1381
1389
  </rule>
1382
1390
  <rule break="no">
1383
- <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
1391
+ <beforebreak>\b\p{Lu}\.\p{Lu}\.[\s\u00A0]</beforebreak>
1384
1392
  <afterbreak></afterbreak>
1385
1393
  </rule>
1386
1394
  <rule break="no">
1387
- <beforebreak>[^\.]\s[A-Z]\.\s</beforebreak>
1395
+ <beforebreak>[^\.][\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
1388
1396
  <afterbreak></afterbreak>
1389
1397
  </rule>
1390
1398
  <rule break="no">
1391
- <beforebreak>\b(:?Blvd|Ave|Mts?)\.\s</beforebreak>
1399
+ <beforebreak>\b(?:Blvd|Ave|Mts?)\.[\s\u00A0]</beforebreak>
1392
1400
  <afterbreak>\p{Ll}+</afterbreak>
1393
1401
  </rule>
1394
1402
  <rule break="no">
1395
- <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.\s</beforebreak>
1403
+ <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0]</beforebreak>
1396
1404
  <afterbreak>\p{Ll}+</afterbreak>
1397
1405
  </rule>
1398
1406
  <rule break="no">
1399
- <beforebreak>\(\p{Ll}+\.\s</beforebreak>
1407
+ <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
1400
1408
  <afterbreak></afterbreak>
1401
1409
  </rule>
1402
1410
  <rule break="no"><!-- i.e. -->
1403
- <beforebreak>i\.e\.\s</beforebreak><!-- "i.e." is never at end of sentence -->
1411
+ <beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
1404
1412
  <afterbreak></afterbreak>
1405
1413
  </rule>
1406
1414
  <rule break="yes">
1407
- <beforebreak>[\.!?…][\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*\s</beforebreak>
1415
+ <beforebreak>[\.!?…][\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*[\s\u00A0]</beforebreak>
1408
1416
  <afterbreak></afterbreak>
1409
1417
  </rule>
1410
1418
  <rule break="yes">
@@ -1412,7 +1420,7 @@
1412
1420
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
1413
1421
  </rule>
1414
1422
  <rule break="yes">
1415
- <beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
1423
+ <beforebreak>[\s\u00A0]\p{L}[\.!?…][\s\u00A0]</beforebreak>
1416
1424
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
1417
1425
  </rule>
1418
1426
  </languagerule>
@@ -1511,6 +1519,11 @@
1511
1519
  </rule>
1512
1520
  </languagerule>
1513
1521
  <languagerule languagerulename="Dutch">
1522
+ <rule break="no">
1523
+ <!-- .Net -->
1524
+ <beforebreak>\s[.]</beforebreak>
1525
+ <afterbreak>[Nn][Ee][Tt](\b|-)</afterbreak>
1526
+ </rule>
1514
1527
  <rule break="no"><!-- quoted sentence in sentence -->
1515
1528
  <beforebreak>[.?!][’'"]</beforebreak>
1516
1529
  <afterbreak> [a-z]</afterbreak>
@@ -1729,6 +1742,31 @@
1729
1742
  <beforebreak>[?!.]\s</beforebreak>
1730
1743
  <afterbreak>['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002][A-Z][a-z]</afterbreak>
1731
1744
  </rule>
1745
+ <rule break="no">
1746
+ <!-- "E. coli etc. -->
1747
+ <beforebreak>"[A-Z][.]\s</beforebreak>
1748
+ <afterbreak>[a-z]</afterbreak>
1749
+ </rule>
1750
+ <rule break="no">
1751
+ <!-- Cornelisz. -->
1752
+ <beforebreak>[A-Z][a-z].*sz[.]\s</beforebreak>
1753
+ <afterbreak>[a-z]</afterbreak>
1754
+ </rule>
1755
+ <rule break="no">
1756
+ <!-- De n. XIV/vagus (nervus) -->
1757
+ <beforebreak>De n[.]\s</beforebreak>
1758
+ <afterbreak>[a-z]|[XIV]</afterbreak>
1759
+ </rule>
1760
+ <rule break="no">
1761
+ <!-- MOL.E -->
1762
+ <beforebreak>[A-Z]{2,5}[.]</beforebreak>
1763
+ <afterbreak>[A-Z]</afterbreak>
1764
+ </rule>
1765
+ <rule break="no">
1766
+ <!-- ..." betekent -->
1767
+ <beforebreak>\.\.</beforebreak>
1768
+ <afterbreak>" [a-z]</afterbreak>
1769
+ </rule>
1732
1770
  <!-- ##### end of Dutch #### -->
1733
1771
  </languagerule>
1734
1772
  <languagerule languagerulename="Slovak">
@@ -4556,146 +4594,146 @@
4556
4594
  </languagerule>
4557
4595
  <languagerule languagerulename="Catalan">
4558
4596
  <rule break="no">
4559
- <beforebreak>Yahoo!\s</beforebreak>
4597
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
4560
4598
  <afterbreak>\p{Ll}</afterbreak>
4561
4599
  </rule>
4562
4600
  <rule break="yes">
4563
- <beforebreak>\w['’][nNtT]\.\s</beforebreak>
4601
+ <beforebreak>\w['’][nNtT]\.[\s\u00A0]</beforebreak>
4564
4602
  <afterbreak></afterbreak>
4565
4603
  </rule>
4566
4604
  <rule break="yes">
4567
- <beforebreak>\.\[\d+\]\s</beforebreak>
4605
+ <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
4568
4606
  <afterbreak></afterbreak>
4569
4607
  </rule>
4570
4608
  <!-- initials: A. C. Jones. Problem: [...] d'Alfons I. Ell era [...] -->
4571
4609
  <rule break="no">
4572
- <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.\s</beforebreak>
4610
+ <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
4573
4611
  <afterbreak></afterbreak>
4574
4612
  </rule>
4575
4613
  <!-- Abbreviations that cannot finish sentences-->
4576
4614
  <rule break="no">
4577
- <beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.\s</beforebreak>
4615
+ <beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
4578
4616
  <afterbreak></afterbreak>
4579
4617
  </rule>
4580
4618
  <!-- Abbreviations that can finish sentences -->
4581
4619
  <rule break="no">
4582
- <beforebreak>\b(s|ca)\.\s</beforebreak>
4620
+ <beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak>
4583
4621
  <afterbreak>[XIV]+\b</afterbreak>
4584
4622
  </rule>
4585
4623
  <rule break="no">
4586
- <beforebreak>\b(min|m|ca)\.\s</beforebreak>
4624
+ <beforebreak>\b(min|m|ca)\.[\s\u00A0]</beforebreak>
4587
4625
  <afterbreak>[0-9]+\b</afterbreak>
4588
4626
  </rule>
4589
4627
  <rule break="no">
4590
- <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.\s</beforebreak>
4628
+ <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.[\s\u00A0]</beforebreak>
4591
4629
  <afterbreak>[XIV\d]+\b</afterbreak>
4592
4630
  </rule>
4593
4631
  <rule break="no">
4594
- <beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4632
+ <beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4595
4633
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4596
4634
  </rule>
4597
4635
  <!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
4598
4636
  <rule break="no">
4599
- <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4637
+ <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4600
4638
  <afterbreak>\p{Ll}</afterbreak>
4601
4639
  </rule>
4602
4640
  <!-- Any word in acronyms like EE.UU. or BB. DD. -->
4603
4641
  <rule break="no">
4604
- <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4642
+ <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4605
4643
  <afterbreak>\p{Ll}</afterbreak>
4606
4644
  </rule>
4607
4645
  <rule break="no">
4608
- <beforebreak>\bEE\.\s?</beforebreak>
4609
- <afterbreak>UU</afterbreak>
4646
+ <beforebreak>\b\p{Lu}{2}\.[\s\u00A0]?</beforebreak>
4647
+ <afterbreak>\p{Lu}{2}</afterbreak>
4610
4648
  </rule>
4611
4649
  <rule break="no">
4612
- <beforebreak>EE\.\s?UU\.\s?</beforebreak>
4650
+ <beforebreak>EE\.[\s\u00A0]?UU\.[\s\u00A0]?</beforebreak>
4613
4651
  <afterbreak>\p{Ll}</afterbreak>
4614
4652
  </rule>
4615
4653
  <!-- max min etc -->
4616
4654
  <rule break="no">
4617
- <beforebreak>\b([Ee]tc|m[aáà]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4655
+ <beforebreak>\b([Ee]tc|m[aáà]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4618
4656
  <afterbreak>\p{Ll}</afterbreak>
4619
4657
  </rule>
4620
4658
  <!-- Composed abbrev. -->
4621
4659
  <rule break="no">
4622
- <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4660
+ <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4623
4661
  <afterbreak></afterbreak>
4624
4662
  </rule>
4625
4663
  <!-- Units -->
4626
4664
  <rule break="no">
4627
- <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4665
+ <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4628
4666
  <afterbreak>\p{Ll}</afterbreak>
4629
4667
  </rule>
4630
4668
  <!-- Ellipsis: ... lowercase -->
4631
4669
  <rule break="no">
4632
- <beforebreak>[^\s](\Q...\E|…)\s</beforebreak>
4670
+ <beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak>
4633
4671
  <afterbreak>\p{Ll}</afterbreak>
4634
4672
  </rule>
4635
4673
  <!-- (enum...) -->
4636
4674
  <rule break="no">
4637
- <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”]\s</beforebreak>
4675
+ <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
4638
4676
  <afterbreak>\p{Ll}</afterbreak>
4639
4677
  </rule>
4640
4678
  <!-- pero ¡ah! no estaba
4641
4679
  <rule break="no">
4642
- <beforebreak>\b¡\p{L}+!\s</beforebreak>
4680
+ <beforebreak>\b¡\p{L}+![\s\u00A0]</beforebreak>
4643
4681
  <afterbreak>\p{Ll}</afterbreak>
4644
4682
  </rule>
4645
4683
  -->
4646
4684
  <rule break="yes">
4647
- <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*\s</beforebreak>
4685
+ <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*[\s\u00A0]</beforebreak>
4648
4686
  <afterbreak></afterbreak>
4649
4687
  </rule>
4650
4688
  <rule break="yes">
4651
- <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+\s</beforebreak>
4689
+ <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0]</beforebreak>
4652
4690
  <afterbreak>[¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
4653
4691
  </rule>
4654
4692
  <!-- paragraphs with opening "»" in dialogs-->
4655
4693
  <rule break="yes">
4656
- <beforebreak>[\.:!?…»]+\s</beforebreak>
4657
- <afterbreak>»[^\s\.:!?…]</afterbreak>
4694
+ <beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak>
4695
+ <afterbreak>»[^\u00A0\s\.:!?…]</afterbreak>
4658
4696
  </rule>
4659
4697
  </languagerule>
4660
4698
  <languagerule languagerulename="Spanish">
4661
4699
  <rule break="no">
4662
- <beforebreak>Yahoo!\s</beforebreak>
4700
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
4663
4701
  <afterbreak>\p{Ll}</afterbreak>
4664
4702
  </rule>
4665
4703
  <rule break="yes">
4666
- <beforebreak>\.\[\d+\]\s</beforebreak>
4704
+ <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
4667
4705
  <afterbreak></afterbreak>
4668
4706
  </rule>
4669
4707
  <!-- initials: A. C. Jones. Problem: [...] de Alfons I. Él era [...] -->
4670
4708
  <rule break="no">
4671
- <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.\s</beforebreak>
4709
+ <beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
4672
4710
  <afterbreak/>
4673
4711
  </rule>
4674
4712
  <!-- Ellipsis: ... lowercase -->
4675
4713
  <rule break="no">
4676
- <beforebreak>[^\s](\Q...\E|…)\s</beforebreak>
4714
+ <beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak>
4677
4715
  <afterbreak>\p{Ll}</afterbreak>
4678
4716
  </rule>
4679
4717
  <!-- (enum...) -->
4680
4718
  <rule break="no">
4681
- <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”]\s</beforebreak>
4719
+ <beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
4682
4720
  <afterbreak>\p{Ll}</afterbreak>
4683
4721
  </rule>
4684
4722
  <!-- Abbreviations that can finish sentences -->
4685
4723
  <rule break="no">
4686
- <beforebreak>\b(s|ca)\.\s</beforebreak>
4724
+ <beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak>
4687
4725
  <afterbreak>[XIV]+\b</afterbreak>
4688
4726
  </rule>
4689
4727
  <rule break="no">
4690
- <beforebreak>\b(min|m|ca)\.\s</beforebreak>
4728
+ <beforebreak>\b(min|m|ca)\.[\s\u00A0]</beforebreak>
4691
4729
  <afterbreak>[0-9]+\b</afterbreak>
4692
4730
  </rule>
4693
4731
  <rule break="no">
4694
- <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.\s</beforebreak>
4732
+ <beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.[\s\u00A0]</beforebreak>
4695
4733
  <afterbreak>[XIV\d]+\b</afterbreak>
4696
4734
  </rule>
4697
4735
  <rule break="no">
4698
- <beforebreak>\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4736
+ <beforebreak>\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4699
4737
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4700
4738
  </rule>
4701
4739
  <rule break="no">
@@ -4710,75 +4748,75 @@
4710
4748
  </rule>
4711
4749
  <!-- Abbreviations that cannot finish sentences -->
4712
4750
  <rule break="no">
4713
- <beforebreak>\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.\s</beforebreak>
4751
+ <beforebreak>\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
4714
4752
  <afterbreak/>
4715
4753
  </rule>
4716
4754
  <rule break="no">
4717
- <beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.\s</beforebreak>
4755
+ <beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0]</beforebreak>
4718
4756
  <afterbreak/>
4719
4757
  </rule>
4720
4758
  <rule break="no">
4721
- <beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.\s</beforebreak>
4759
+ <beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0]</beforebreak>
4722
4760
  <afterbreak/>
4723
4761
  </rule>
4724
4762
  <rule break="no">
4725
- <beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.\s</beforebreak>
4763
+ <beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0]</beforebreak>
4726
4764
  <afterbreak/>
4727
4765
  </rule>
4728
4766
  <rule break="no">
4729
- <beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.\s</beforebreak>
4767
+ <beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0]</beforebreak>
4730
4768
  <afterbreak/>
4731
4769
  </rule>
4732
4770
  <!-- Abbreviations that can finish sentences -->
4733
4771
  <rule break="no">
4734
- <beforebreak>\b([Ee]ds?|[Cc]oords?|grs?|Sr|Jr|Admón|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4772
+ <beforebreak>\b([Ee]ds?|[Cc]oords?|grs?|Sr|Jr|Admón|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4735
4773
  <afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
4736
4774
  </rule>
4737
4775
  <!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
4738
4776
  <rule break="no">
4739
- <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4777
+ <beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4740
4778
  <afterbreak>\p{Ll}</afterbreak>
4741
4779
  </rule>
4742
4780
  <!-- Any word in acronyms like EE.UU. or BB. DD. -->
4743
4781
  <rule break="no">
4744
- <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4782
+ <beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4745
4783
  <afterbreak>\p{Ll}</afterbreak>
4746
4784
  </rule>
4747
4785
  <rule break="no"><!-- No break between two consecutive two-letter uppercase groups when the first ends in a dot and an optional (possibly non-breaking) space follows, e.g. "EE. UU." -->
4748
- <beforebreak>\bEE\.\s?</beforebreak>
4749
- <afterbreak>UU</afterbreak>
4786
+ <beforebreak>\b\p{Lu}{2}\.[\s\u00A0]?</beforebreak>
4787
+ <afterbreak>\p{Lu}{2}</afterbreak>
4750
4788
  </rule>
4751
4789
  <rule break="no">
4752
- <beforebreak>EE\.\s?UU\.\s?</beforebreak>
4790
+ <beforebreak>EE\.[\s\u00A0]?UU\.[\s\u00A0]?</beforebreak>
4753
4791
  <afterbreak>\p{Ll}</afterbreak>
4754
4792
  </rule>
4755
4793
  <!-- max min etc -->
4756
4794
  <rule break="no">
4757
- <beforebreak>\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4795
+ <beforebreak>\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4758
4796
  <afterbreak>\p{Ll}</afterbreak>
4759
4797
  </rule>
4760
4798
  <!-- Composed abbrev. -->
4761
4799
  <rule break="no">
4762
- <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4800
+ <beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4763
4801
  <afterbreak/>
4764
4802
  </rule>
4765
4803
  <!-- Units -->
4766
4804
  <rule break="no">
4767
- <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
4805
+ <beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
4768
4806
  <afterbreak>\p{Ll}</afterbreak>
4769
4807
  </rule>
4770
4808
  <rule break="yes">
4771
- <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*\s</beforebreak>
4809
+ <beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*[\s\u00A0]</beforebreak>
4772
4810
  <afterbreak></afterbreak>
4773
4811
  </rule>
4774
4812
  <rule break="yes">
4775
- <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+\s</beforebreak>
4813
+ <beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0]</beforebreak>
4776
4814
  <afterbreak>[¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
4777
4815
  </rule>
4778
4816
  <!-- Paragraphs with an opening "»" in dialogues -->
4779
4817
  <rule break="yes">
4780
- <beforebreak>[\.:!?…»]+\s</beforebreak>
4781
- <afterbreak>»[^\s\.:!?…]</afterbreak>
4818
+ <beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak>
4819
+ <afterbreak>»[^\u00A0\s\.:!?…]</afterbreak>
4782
4820
  </rule>
4783
4821
  </languagerule>
4784
4822
  <languagerule languagerulename="German">
@@ -4792,17 +4830,17 @@
4792
4830
  </rule>
4793
4831
  <!-- Support simple lists in Markdown style -->
4794
4832
  <rule break="yes">
4795
- <beforebreak>\r?\n\s*[-*]+\s</beforebreak>
4833
+ <beforebreak>\r?\n[\u00A0\s]*[-*]+[\u00A0\s]</beforebreak>
4796
4834
  <afterbreak></afterbreak>
4797
4835
  </rule>
4798
4836
  <!-- Split at e.g. "1a. Und ..." -->
4799
4837
  <rule break="yes">
4800
- <beforebreak>\d+[a-z]\.\s</beforebreak>
4838
+ <beforebreak>\d+[a-z]\.[\u00A0\s]</beforebreak>
4801
4839
  <afterbreak>\p{Lu}</afterbreak>
4802
4840
  </rule>
4803
4841
  <!-- Don't split at e.g. "d. h." -->
4804
4842
  <rule break="no">
4805
- <beforebreak>[^-\p{L}'’/]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?\s</beforebreak>
4843
+ <beforebreak>[^-\p{L}'’/]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
4806
4844
  <afterbreak></afterbreak>
4807
4845
  </rule>
4808
4846
  <rule break="no">
@@ -4826,7 +4864,7 @@
4826
4864
  <afterbreak>3|Buzz|Crozz</afterbreak>
4827
4865
  </rule>
4828
4866
  <rule break="no">
4829
- <beforebreak>[1-3]\.\s</beforebreak>
4867
+ <beforebreak>[1-3]\.[\u00A0\s]</beforebreak>
4830
4868
  <afterbreak>Liga|Bundesliga|Fußball(-B|b)undesliga</afterbreak>
4831
4869
  </rule>
4832
4870
  <rule break="no">
@@ -4841,126 +4879,126 @@
4841
4879
  <!-- Don't split after a white-space followed by a single letter followed
4842
4880
  by a dot followed by another whitespace. e.g. " p. " -->
4843
4881
  <rule break="no">
4844
- <beforebreak>\s\p{L}\.\s</beforebreak>
4882
+ <beforebreak>[\u00A0\s]\p{L}\.[\u00A0\s]</beforebreak>
4845
4883
  <afterbreak>\p{L}\.</afterbreak>
4846
4884
  </rule>
4847
4885
  <!-- Don't split at "bla bla... yada yada" -->
4848
4886
  <rule break="no">
4849
- <beforebreak>[\[\(]?\.\.\.[\]\)]?\s</beforebreak>
4887
+ <beforebreak>[\[\(]?\.\.\.[\]\)]?[\u00A0\s]</beforebreak>
4850
4888
  <afterbreak>\p{Ll}</afterbreak>
4851
4889
  </rule>
4852
4890
  <!-- Don't split [.?!] when they're quoted -->
4853
4891
  <rule break="no">
4854
- <beforebreak>['"„][\.!?…]['"“]\s</beforebreak>
4892
+ <beforebreak>['"„][\.!?…]['"“][\u00A0\s]</beforebreak>
4855
4893
  <afterbreak></afterbreak>
4856
4894
  </rule>
4857
4895
  <!-- Don't break after quote unless there's a capital letter
4858
4896
  e.g.: "That's right!" he said. -->
4859
4897
  <rule break="no">
4860
- <beforebreak>["'“]\s</beforebreak>
4898
+ <beforebreak>["'“][\u00A0\s]</beforebreak>
4861
4899
  <afterbreak>\p{Ll}</afterbreak>
4862
4900
  </rule>
4863
4901
  <!-- e.g. "Das ist . so." - assume one sentence. -->
4864
4902
  <rule break="no">
4865
- <beforebreak>\s([\.!?]{1,3}|…)['|"|“|«|\)|\]|\}]?\s</beforebreak>
4903
+ <beforebreak>[\u00A0\s]([\.!?]{1,3}|…)['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
4866
4904
  <afterbreak></afterbreak>
4867
4905
  </rule>
4868
4906
  <!-- Numbers, dates e.g. "3.10. datiert" -->
4869
4907
  <rule break="no">
4870
- <beforebreak>\b\d+\.\s</beforebreak>
4908
+ <beforebreak>\b\d+\.[\u00A0\s]</beforebreak>
4871
4909
  <afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
4872
4910
  </rule>
4873
4911
  <!-- z.B. "Das hier ist ein(!) Satz." -->
4874
4912
  <rule break="no">
4875
- <beforebreak>[\(\[][!?]{1,3}[\]\)]\s</beforebreak>
4913
+ <beforebreak>[\(\[][!?]{1,3}[\]\)][\u00A0\s]</beforebreak>
4876
4914
  <afterbreak></afterbreak>
4877
4915
  </rule>
4878
4916
  <!-- z.B. "Das hier ist (genau!) ein Satz." -->
4879
4917
  <rule break="no">
4880
- <beforebreak>[!?]{1,3}[\)\]]\s</beforebreak>
4918
+ <beforebreak>[!?]{1,3}[\)\]][\u00A0\s]</beforebreak>
4881
4919
  <afterbreak></afterbreak>
4882
4920
  </rule>
4883
4921
  <!-- e.g. "bla (...) blubb" -> not a sentence end -->
4884
4922
  <rule break="no">
4885
- <beforebreak>[\(\)\[\]]\s</beforebreak>
4923
+ <beforebreak>[\(\)\[\]][\u00A0\s]</beforebreak>
4886
4924
  <afterbreak></afterbreak>
4887
4925
  </rule>
4888
4926
  <!-- don't split at cases like "Friedrich II. wird auch..." -->
4889
4927
  <rule break="no">
4890
- <beforebreak>[\s ][IVX]+\.\s</beforebreak>
4928
+ <beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]</beforebreak>
4891
4929
  <afterbreak>[^\p{Lu}]+</afterbreak>
4892
4930
  </rule>
4893
4931
  <!-- don't split at cases like "im 13. oder 14. Jahrhundert" -->
4894
4932
  <rule break="no">
4895
- <beforebreak>\d+\.\s</beforebreak>
4896
- <afterbreak>(und|oder|bis)\s</afterbreak>
4933
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4934
+ <afterbreak>(und|oder|bis)[\u00A0\s]</afterbreak>
4897
4935
  </rule>
4898
4936
  <!-- Some German month names that may be preceded by a number
4899
4937
  without a sentence boundary being detected
4900
4938
  (e.g. "am 13. Dezember" -> no sentence boundary) -->
4901
4939
  <rule break="no">
4902
- <beforebreak>\d+\.\s</beforebreak>
4940
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4903
4941
  <afterbreak>Januar|Jänner|Februar|März|Merz|April|Mai|Ju[ln]i|August|September|Oktober|November|Dezember</afterbreak>
4904
4942
  </rule>
4905
4943
  <rule break="no">
4906
- <beforebreak>\d+\.\s</beforebreak>
4944
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4907
4945
  <afterbreak>J[aä]n|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez</afterbreak>
4908
4946
  </rule>
4909
4947
  <rule break="no">
4910
- <beforebreak>(Jan|Jän|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez)\.\s</beforebreak>
4948
+ <beforebreak>(Jan|Jän|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez)\.[\u00A0\s]</beforebreak>
4911
4949
  <afterbreak>\d\d(\d\d)?</afterbreak>
4912
4950
  </rule>
4913
4951
  <!-- Similar cases other than month names -->
4914
4952
  <rule break="no">
4915
- <beforebreak>\d+\.\s</beforebreak>
4953
+ <beforebreak>\d+\.[\u00A0\s]</beforebreak>
4916
4954
  <afterbreak>Amtsperiode|Breitengrads?|Breitengrades|Jubiläum|Jhd?|Jhdts?|Konferenz|(Jahres|Partei)(-K|k)onferenz|Längengrade?s?|Tags?|Tages|(Jahres|Spiel|Partei|Geburts)tag|(Jahres|Spiel|Partei|Geburts)tages|(Jahres|Spiel|Partei|Geburts)tags|Jahrhunderts?|Jahrtausend|Platz|Platzes|Lebensjahrs?|Lebensjahres|Lochs?|Loches|Grads|Grades|Obergeschoss|Stock(werk)?s?|Etage|Klasse|Runde|Bezirk|Etappe|Staffel|Sinfonie</afterbreak>
4917
4955
  </rule>
4918
4956
  <!-- English abbreviations - but these work globally for all languages -->
4919
4957
  <rule break="no">
4920
- <beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd)\.\s</beforebreak>
4958
+ <beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd)\.[\u00A0\s]</beforebreak>
4921
4959
  <afterbreak></afterbreak>
4922
4960
  </rule>
4923
4961
  <!-- Latin abbreviations - but these work globally for all languages -->
4924
4962
  <rule break="no">
4925
- <beforebreak>\b(spp?)\.\s</beforebreak>
4963
+ <beforebreak>\b(spp?)\.[\u00A0\s]</beforebreak>
4926
4964
  <afterbreak></afterbreak>
4927
4965
  </rule>
4928
4966
  <!-- German abbreviations -->
4929
4967
  <rule break="no">
4930
- <beforebreak>\b(ggü|Mag|mtl|versch|d|Übers|usw|Bzw|bzw|Ab[hkst]|abzgl|Abzw|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|autom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw)\.\s</beforebreak>
4968
+ <beforebreak>\b(ggü|Mag|mtl|versch|d|Übers|usw|Bzw|bzw|Ab[hkst]|abzgl|bezgl|Abzw|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|autom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw)\.[\u00A0\s]</beforebreak>
4931
4969
  <afterbreak></afterbreak>
4932
4970
  </rule>
4933
4971
  <rule break="no">
4934
- <beforebreak>\b(cts?|Ca|ca|chem|chin|Chr|cresc|dat|Dat|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|eigtl|Eigtl|Engl|engl|Erg|al|et[cw]|Etw|ev(tl)?|Evtl|exkl|Expl|Exz)\.\s</beforebreak>
4972
+ <beforebreak>\b(cts?|Ca|ca|chem|chin|Chr|cresc|dat|Dat|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|eigtl|Eigtl|eigl|Eigl|akt|Engl|engl|Erg|al|et[cw]|Etw|ev(tl)?|Evtl|exkl|Expl|Exz)\.[\u00A0\s]</beforebreak>
4935
4973
  <afterbreak></afterbreak>
4936
4974
  </rule>
4937
4975
  <rule break="no">
4938
- <beforebreak>\bDipl\.-[A-Z][a-z]{2,4}\.\s</beforebreak>
4976
+ <beforebreak>\bDipl\.-[A-Z][a-z]{2,4}\.[\u00A0\s]</beforebreak>
4939
4977
  <afterbreak></afterbreak>
4940
4978
  </rule>
4941
4979
  <rule break="no">
4942
- <beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|frz?|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.\s</beforebreak>
4980
+ <beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|frz?|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]</beforebreak>
4943
4981
  <afterbreak></afterbreak>
4944
4982
  </rule>
4945
4983
  <rule break="no">
4946
- <beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|inkl|incl|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.\s</beforebreak>
4984
+ <beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|inkl|incl|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]</beforebreak>
4947
4985
  <afterbreak></afterbreak>
4948
4986
  </rule>
4949
4987
  <rule break="no">
4950
- <beforebreak>\b(lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.\s</beforebreak>
4988
+ <beforebreak>\b(lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.[\u00A0\s]</beforebreak>
4951
4989
  <afterbreak></afterbreak>
4952
4990
  </rule>
4953
4991
  <rule break="no">
4954
- <beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|Std?|stacc|Str|stud|Subst|sva|svw|sZ)\.\s</beforebreak>
4992
+ <beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|Std?|stacc|Str|stud|Subst|sva|svw|sZ)\.[\u00A0\s]</beforebreak>
4955
4993
  <afterbreak></afterbreak>
4956
4994
  </rule>
4957
4995
  <rule break="no">
4958
- <beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|sw|vgl|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.\s</beforebreak>
4996
+ <beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|sw|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]</beforebreak>
4959
4997
  <afterbreak></afterbreak>
4960
4998
  </rule>
4961
4999
  <!-- Break rules -->
4962
5000
  <rule break="yes">
4963
- <beforebreak>[\.!?…][\u0002|'|"|“|«|‹|\)|\]|\}¹²³]?\s+</beforebreak>
5001
+ <beforebreak>[\.!?…][\u0002|'|"|“|«|‹|\)|\]|\}¹²³]?[\u00A0\s]+</beforebreak>
4964
5002
  <afterbreak></afterbreak>
4965
5003
  </rule>
4966
5004
  <rule break="yes">
@@ -4968,7 +5006,7 @@
4968
5006
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
4969
5007
  </rule>
4970
5008
  <rule break="yes">
4971
- <beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
5009
+ <beforebreak>[\u00A0\s]\p{L}[\.!?…][\u00A0\s]</beforebreak>
4972
5010
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
4973
5011
  </rule>
4974
5012
  <!-- z.B. 2 sentences: “Liebst du mich?” “Ja!” -->
@@ -5184,11 +5222,19 @@
5184
5222
  </languagerule>
5185
5223
  <languagerule languagerulename="French">
5186
5224
  <rule break="no">
5187
- <beforebreak>Yahoo!\s</beforebreak>
5225
+ <beforebreak>[\s\u00A0]</beforebreak>
5226
+ <afterbreak>[»”’"'›]</afterbreak>
5227
+ </rule>
5228
+ <rule break="yes">
5229
+ <beforebreak>[\.!?][\s\u00A0][»”’"'›][\s\u00A0]</beforebreak>
5230
+ <afterbreak>[«“‘‹"'\p{Lu}]</afterbreak>
5231
+ </rule>
5232
+ <rule break="no">
5233
+ <beforebreak>Yahoo![\s\u00A0]</beforebreak>
5188
5234
  <afterbreak>\p{Ll}</afterbreak>
5189
5235
  </rule>
5190
5236
  <rule break="yes">
5191
- <beforebreak>\.\[\d+\]\s</beforebreak>
5237
+ <beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
5192
5238
  <afterbreak></afterbreak>
5193
5239
  </rule>
5194
5240
  <rule break="no"><!-- URLs without "www."-->
@@ -5206,15 +5252,15 @@
5206
5252
  </rule>
5207
5253
  <!-- French abbreviations -->
5208
5254
  <rule break="no">
5209
- <beforebreak>\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op)\.\s</beforebreak>
5255
+ <beforebreak>\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op|ex)\.[\s\u00A0]</beforebreak>
5210
5256
  <afterbreak>\p{Ll}</afterbreak>
5211
5257
  </rule>
5212
5258
  <rule break="no">
5213
- <beforebreak>\b(etc)\.\)\s</beforebreak>
5259
+ <beforebreak>\b(etc)\.\)[\s\u00A0]</beforebreak>
5214
5260
  <afterbreak></afterbreak>
5215
5261
  </rule>
5216
5262
  <rule break="no">
5217
- <beforebreak>\b(apr|ave?|boul|Mr?|Mrs|MM?|Mlle)\.\s</beforebreak>
5263
+ <beforebreak>\b(apr|ave?|boul|Mr?|Mrs|MM?|Mlle)\.[\s\u00A0]</beforebreak>
5218
5264
  <afterbreak></afterbreak>
5219
5265
  </rule>
5220
5266
  <rule break="no">
@@ -5230,39 +5276,39 @@
5230
5276
  <afterbreak>\p{Ll}</afterbreak>
5231
5277
  </rule>
5232
5278
  <rule break="no">
5233
- <beforebreak>["”'’]\s*</beforebreak>
5234
- <afterbreak>\s*\p{Ll}</afterbreak>
5279
+ <beforebreak>["”'’][\s\u00A0]*</beforebreak>
5280
+ <afterbreak>[\s\u00A0]*\p{Ll}</afterbreak>
5235
5281
  </rule>
5236
5282
  <rule break="no">
5237
- <beforebreak>['"„][\.!?…]['"”]\s</beforebreak>
5283
+ <beforebreak>['"„][\.!?…]['"”][\s\u00A0]</beforebreak>
5238
5284
  <afterbreak></afterbreak>
5239
5285
  </rule>
5240
5286
  <rule break="no">
5241
- <beforebreak>\b\p{L}\.\s</beforebreak>
5242
- <afterbreak>\p{L}\.\s</afterbreak>
5287
+ <beforebreak>\b\p{L}\.[\s\u00A0]</beforebreak>
5288
+ <afterbreak>\p{L}\.[\s\u00A0]</afterbreak>
5243
5289
  </rule>
5244
5290
  <rule break="no">
5245
5291
  <beforebreak>\b\p{L}\.</beforebreak>
5246
5292
  <afterbreak>\p{L}\.</afterbreak>
5247
5293
  </rule>
5248
5294
  <rule break="no"><!-- Je suis (...) Chris. -->
5249
- <beforebreak>(…|\.\.\.)\s?\)\s</beforebreak>
5295
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
5250
5296
  <afterbreak>[^\p{P}]</afterbreak>
5251
5297
  </rule>
5252
5298
  <rule break="no"><!-- Je suis (...?) Chris. -->
5253
- <beforebreak>(…|\.\.\.)\s?\?\)\s</beforebreak>
5299
+ <beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
5254
5300
  <afterbreak>[^\p{P}]</afterbreak>
5255
5301
  </rule>
5256
5302
  <rule break="no"><!-- Jones v. Smith -->
5257
- <beforebreak>\p{Lu}\p{L}+\sv\.\s</beforebreak>
5303
+ <beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
5258
5304
  <afterbreak>\p{Lu}\p{L}+</afterbreak>
5259
5305
  </rule>
5260
5306
  <rule break="yes">
5261
- <beforebreak>[^,][\s]\p{L}{2}\.\s</beforebreak>
5262
- <afterbreak>\p{N}+\)\s</afterbreak>
5307
+ <beforebreak>[^,][\s\u00A0]\p{L}{2}\.[\s\u00A0]</beforebreak>
5308
+ <afterbreak>\p{N}+\)[\s\u00A0]</afterbreak>
5263
5309
  </rule>
5264
5310
  <rule break="no">
5265
- <beforebreak>[\.\s]\p{L}{1,2}\.\s</beforebreak>
5311
+ <beforebreak>[\.\s\u00A0]\p{L}{1,2}\.[\s\u00A0]</beforebreak>
5266
5312
  <afterbreak>[\p{N}\p{Ll}]</afterbreak>
5267
5313
  </rule>
5268
5314
  <rule break="no">
@@ -5270,31 +5316,31 @@
5270
5316
  <afterbreak>[^\p{Lu}]</afterbreak>
5271
5317
  </rule>
5272
5318
  <rule break="no">
5273
- <beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
5319
+ <beforebreak>\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0]</beforebreak>
5274
5320
  <afterbreak></afterbreak>
5275
5321
  </rule>
5276
5322
  <rule break="no">
5277
- <beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
5323
+ <beforebreak>\b\p{Lu}\.\p{Lu}\.[\s\u00A0]</beforebreak>
5278
5324
  <afterbreak></afterbreak>
5279
5325
  </rule>
5280
5326
  <rule break="no">
5281
- <beforebreak>[^\.]\s[A-Z]\.\s</beforebreak>
5327
+ <beforebreak>[^\.][\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
5282
5328
  <afterbreak></afterbreak>
5283
5329
  </rule>
5284
5330
  <rule break="no">
5285
- <beforebreak>\b(:?Blvd|Ave|Mts?)\.\s</beforebreak>
5331
+ <beforebreak>\b(:?Blvd|Ave|Mts?)\.[\s\u00A0]</beforebreak>
5286
5332
  <afterbreak>\p{Ll}+</afterbreak>
5287
5333
  </rule>
5288
5334
  <rule break="no">
5289
- <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.\s</beforebreak>
5335
+ <beforebreak>\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0]</beforebreak>
5290
5336
  <afterbreak>\p{Ll}+</afterbreak>
5291
5337
  </rule>
5292
5338
  <rule break="no">
5293
- <beforebreak>\(\p{Ll}+\.\s</beforebreak>
5339
+ <beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
5294
5340
  <afterbreak></afterbreak>
5295
5341
  </rule>
5296
5342
  <rule break="no"><!-- i.e. -->
5297
- <beforebreak>i\.e\.\s</beforebreak><!-- "i.e." is never at end of sentence -->
5343
+ <beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
5298
5344
  <afterbreak></afterbreak>
5299
5345
  </rule>
5300
5346
  <rule break="no"><!-- U.S.A (no dot at end) -->
@@ -5310,28 +5356,28 @@
5310
5356
  <afterbreak>[SK]\b</afterbreak>
5311
5357
  </rule>
5312
5358
  <rule break="no"><!-- No. 5 -->
5313
- <beforebreak>\b[nN]o\.\s</beforebreak>
5359
+ <beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
5314
5360
  <afterbreak>\p{N}</afterbreak>
5315
5361
  </rule>
5316
5362
  <rule break="no"><!-- Ph.D. -->
5317
- <beforebreak>\bP[Hh]\.\s?</beforebreak>
5363
+ <beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
5318
5364
  <afterbreak>D\.?</afterbreak>
5319
5365
  </rule>
5320
5366
  <rule break="no"><!-- e.g. -->
5321
- <beforebreak>\be\.g\.\s</beforebreak>
5367
+ <beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
5322
5368
  <afterbreak></afterbreak>
5323
5369
  </rule>
5324
5370
  <rule break="no"><!-- vs. -->
5325
- <beforebreak>\bvs\.\s</beforebreak>
5371
+ <beforebreak>\bvs\.[\s\u00A0]</beforebreak>
5326
5372
  <afterbreak></afterbreak>
5327
5373
  </rule>
5328
5374
  <!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
5329
5375
  <rule break="no"><!-- Etc. -->
5330
- <beforebreak>\b[Ee]tc\.\s</beforebreak>
5376
+ <beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
5331
5377
  <afterbreak>[^\p{Lu}]</afterbreak>
5332
5378
  </rule>
5333
5379
  <rule break="no"><!-- BTW (by the way) -->
5334
- <beforebreak>\b([Bb]tw|BTW)\.\s</beforebreak>
5380
+ <beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
5335
5381
  <afterbreak></afterbreak>
5336
5382
  </rule>
5337
5383
  <rule break="no">
@@ -5343,64 +5389,64 @@
5343
5389
  <afterbreak>3|Buzz|Crozz</afterbreak>
5344
5390
  </rule>
5345
5391
  <rule break="no"><!-- Ph.D. (see rule PH_D) -->
5346
- <beforebreak>\bP[Hh]\.?\s?[Dd]\.\s</beforebreak>
5392
+ <beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
5347
5393
  <afterbreak></afterbreak>
5348
5394
  </rule>
5349
5395
  <rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
5350
- <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.\s</beforebreak>
5396
+ <beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
5351
5397
  <afterbreak></afterbreak>
5352
5398
  </rule>
5353
5399
  <rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
5354
- <beforebreak>\bLL\.\s?[BM]\.\s</beforebreak>
5400
+ <beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
5355
5401
  <afterbreak></afterbreak>
5356
5402
  </rule>
5357
5403
  <rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
5358
- <beforebreak>\b[BM]\.\s?</beforebreak>
5404
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5359
5405
  <afterbreak>Eng\.?</afterbreak>
5360
5406
  </rule>
5361
5407
  <rule break="no"><!-- LL.B. (Bachelor of Laws) -->
5362
- <beforebreak>\bLL\.\s?</beforebreak>
5408
+ <beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
5363
5409
  <afterbreak>[BM]\.?</afterbreak>
5364
5410
  </rule>
5365
5411
  <rule break="no"><!-- B.Sc. (Bachelor of Science) -->
5366
- <beforebreak>\b[BM]\.\s?</beforebreak>
5412
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5367
5413
  <afterbreak>Sc\.?</afterbreak>
5368
5414
  </rule>
5369
5415
  <rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
5370
- <beforebreak>\b[BM]\.\s?</beforebreak>
5416
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5371
5417
  <afterbreak>Comp?\.?</afterbreak>
5372
5418
  </rule>
5373
5419
  <rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
5374
- <beforebreak>\b[BM]\.\s?</beforebreak>
5420
+ <beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
5375
5421
  <afterbreak>Arch\.?</afterbreak>
5376
5422
  </rule>
5377
5423
  <rule break="no">
5378
- <beforebreak>\b[BM]\.?\s?(Sc|Eng|Comp|Arch)\.\s</beforebreak>
5424
+ <beforebreak>\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0]</beforebreak>
5379
5425
  <afterbreak></afterbreak>
5380
5426
  </rule>
5381
5427
  <rule break="no">
5382
- <beforebreak>\bI(nc|NC)\.\s</beforebreak>
5428
+ <beforebreak>\bI(nc|NC)\.[\s\u00A0]</beforebreak>
5383
5429
  <afterbreak></afterbreak>
5384
5430
  </rule>
5385
5431
  <rule break="no">
5386
- <beforebreak>\bCorp\.\s</beforebreak>
5432
+ <beforebreak>\bCorp\.[\s\u00A0]</beforebreak>
5387
5433
  <afterbreak></afterbreak>
5388
5434
  </rule>
5389
5435
  <rule break="no">
5390
- <beforebreak>\bBros\.\s</beforebreak>
5436
+ <beforebreak>\bBros\.[\s\u00A0]</beforebreak>
5391
5437
  <afterbreak></afterbreak>
5392
5438
  </rule>
5393
5439
  <rule break="no">
5394
- <beforebreak>\bLtd\.\s</beforebreak>
5440
+ <beforebreak>\bLtd\.[\s\u00A0]</beforebreak>
5395
5441
  <afterbreak>\p{Ll}+</afterbreak>
5396
5442
  </rule>
5397
5443
  <rule break="no">
5398
- <beforebreak>\bCo\.\s</beforebreak>
5444
+ <beforebreak>\bCo\.[\s\u00A0]</beforebreak>
5399
5445
  <afterbreak></afterbreak>
5400
5446
  </rule>
5401
5447
  <!-- Break rules -->
5402
5448
  <rule break="yes">
5403
- <beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}¹²³]?\s+</beforebreak>
5449
+ <beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}¹²³]?[\s\u00A0]+</beforebreak>
5404
5450
  <afterbreak></afterbreak>
5405
5451
  </rule>
5406
5452
  <rule break="yes">
@@ -5408,7 +5454,7 @@
5408
5454
  <afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
5409
5455
  </rule>
5410
5456
  <rule break="yes">
5411
- <beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
5457
+ <beforebreak>[\s\u00A0]\p{L}[\.!?…][\s\u00A0]</beforebreak>
5412
5458
  <afterbreak>\p{Lu}\p{Ll}</afterbreak>
5413
5459
  </rule>
5414
5460
  </languagerule>
@@ -5556,10 +5602,14 @@
5556
5602
  </rule>
5557
5603
  <!-- abbreviation with proper noun: проф. Грицько, о. Лісове -->
5558
5604
  <rule break="no">
5559
- <beforebreak>\b([Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|ім|упоряд|чл\.-кор)\.[\h\v]*</beforebreak>
5605
+ <beforebreak>\b([Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|ім|упоряд|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak>
5560
5606
  <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
5561
5607
  </rule>
5562
- <!-- смерть гр. Болтаровича -->
5608
+ <rule break="no">
5609
+ <beforebreak>\bМан\.[\h\v]*</beforebreak>
5610
+ <afterbreak>[\h\v]*([Сс]іті|[Юю]н)</afterbreak>
5611
+ </rule>
5612
+ <!-- смерть гр. Болтаровича, but not "9 гр." -->
5563
5613
  <rule break="no">
5564
5614
  <beforebreak>[^0-9][\h\v]+[Гг]р\.[\h\v]*</beforebreak>
5565
5615
  <afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
@@ -5567,7 +5617,7 @@
5567
5617
  <!-- арт. - артикул -->
5568
5618
  <!-- TODO: арт. - артист -->
5569
5619
  <rule break="no">
5570
- <beforebreak>\bарт\.[\h\v]*</beforebreak>
5620
+ <beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис)\.[\h\v]*</beforebreak>
5571
5621
  <afterbreak>[\h\v]*[0-9]</afterbreak>
5572
5622
  </rule>
5573
5623
  <!-- ХІІ р., 3-6 арт. -->