srx-languagetool 0.1.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/main.yml +2 -2
- data/.gitignore +1 -0
- data/.rubocop.yml +1 -1
- data/.rubocop_todo.yml +5 -4
- data/.ruby-version +1 -0
- data/CHANGELOG.md +17 -0
- data/Gemfile.lock +29 -25
- data/README.md +14 -2
- data/lib/srx/languagetool/version.rb +1 -1
- data/lib/srx/segment.srx +310 -215
- data/srx-languagetool.gemspec +2 -1
- metadata +6 -4
data/lib/srx/segment.srx
CHANGED
@@ -1102,12 +1102,16 @@
|
|
1102
1102
|
</rule>
|
1103
1103
|
</languagerule>
|
1104
1104
|
<languagerule languagerulename="English">
|
1105
|
+
<rule break="no">
|
1106
|
+
<beforebreak>[\u00A0\s]</beforebreak>
|
1107
|
+
<afterbreak>\n</afterbreak>
|
1108
|
+
</rule>
|
1105
1109
|
<rule break="no"><!-- Hello (Hi! ) my name is Chris -->
|
1106
|
-
<beforebreak>[a-zA-Z][!\?]\s</beforebreak>
|
1107
|
-
<afterbreak>\)\s[a-zA-Z]</afterbreak>
|
1110
|
+
<beforebreak>[a-zA-Z][!\?][\s\u00A0]</beforebreak>
|
1111
|
+
<afterbreak>\)[\s\u00A0][a-zA-Z]</afterbreak>
|
1108
1112
|
</rule>
|
1109
1113
|
<rule break="no">
|
1110
|
-
<beforebreak>Yahoo!\s</beforebreak>
|
1114
|
+
<beforebreak>Yahoo![\s\u00A0]</beforebreak>
|
1111
1115
|
<afterbreak>\p{Ll}</afterbreak>
|
1112
1116
|
</rule>
|
1113
1117
|
<rule break="no"><!-- U.S.A (no dot at end) -->
|
@@ -1118,6 +1122,10 @@
|
|
1118
1122
|
<beforebreak>\bA\.</beforebreak>
|
1119
1123
|
<afterbreak>I\b</afterbreak>
|
1120
1124
|
</rule>
|
1125
|
+
<rule break="no"><!-- S.I (no dot at end) -->
|
1126
|
+
<beforebreak>\bS\.</beforebreak>
|
1127
|
+
<afterbreak>I\b</afterbreak>
|
1128
|
+
</rule>
|
1121
1129
|
<rule break="no"><!-- L.A (no dot at end) -->
|
1122
1130
|
<beforebreak>\bL\.</beforebreak>
|
1123
1131
|
<afterbreak>A\b</afterbreak>
|
@@ -1126,6 +1134,14 @@
|
|
1126
1134
|
<beforebreak>\bU\.</beforebreak>
|
1127
1135
|
<afterbreak>[SK]\b</afterbreak>
|
1128
1136
|
</rule>
|
1137
|
+
<rule break="no"><!-- I.S (no dot at end) -->
|
1138
|
+
<beforebreak>\bI\.</beforebreak>
|
1139
|
+
<afterbreak>S\b</afterbreak>
|
1140
|
+
</rule>
|
1141
|
+
<rule break="no"><!-- M.Z (no dot at end) -->
|
1142
|
+
<beforebreak>\bM\.</beforebreak>
|
1143
|
+
<afterbreak>Z\b</afterbreak>
|
1144
|
+
</rule>
|
1129
1145
|
<rule break="no"><!-- URLs without "www."-->
|
1130
1146
|
<beforebreak>\b(https?|ftp|file|chrome|chromium|android|(chrome|moz)\-extension):///?[A-Za-z0-9\-]+\.</beforebreak>
|
1131
1147
|
<afterbreak>[A-Za-z0-9\-]+(\.|\b)</afterbreak>
|
@@ -1135,96 +1151,96 @@
|
|
1135
1151
|
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl|be|dev|co|fr|dk|se)(\.|\b)</afterbreak>
|
1136
1152
|
</rule>
|
1137
1153
|
<rule break="no"><!-- No. 5 -->
|
1138
|
-
<beforebreak>\b[nN]o\.\s</beforebreak>
|
1154
|
+
<beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
|
1139
1155
|
<afterbreak>\p{N}</afterbreak>
|
1140
1156
|
</rule>
|
1141
1157
|
<rule break="no"><!-- Ph.D. -->
|
1142
|
-
<beforebreak>\bP[Hh]
|
1158
|
+
<beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
|
1143
1159
|
<afterbreak>D\.?</afterbreak>
|
1144
1160
|
</rule>
|
1145
1161
|
<rule break="no"><!-- min. -->
|
1146
|
-
<beforebreak>\b([Ee]d|pp|[Vv]iz|i
|
1162
|
+
<beforebreak>\b([Ee]d|pp|[Vv]iz|i\.?[\s\u00A0]*e|[Vvol]|[Rr]col|maj|Lt|[Ff]ig|[Ff]igs|[Vv]iz|[Vv]ols|[Aa]pprox|[Ii]ncl?|[Aa]cc|Pres|[Cc]orp|[Ee]x|[Cc]onn|[Dd]ept|[Mm]in|max|[Gg]ovt|lb|lbf|ft|c\.?[\s\u00A0]*f|vs|dia|lbs|\d+-(:?oz|kc|in|h[rp]|ml)|M?sec)\.[\s\u00A0]</beforebreak>
|
1147
1163
|
<afterbreak>[^\p{Lu}]|I</afterbreak>
|
1148
1164
|
</rule>
|
1149
1165
|
<rule break="no"><!-- hr. -->
|
1150
|
-
<beforebreak>\b(hr)\.\s</beforebreak>
|
1166
|
+
<beforebreak>\b(hr)\.[\s\u00A0]</beforebreak>
|
1151
1167
|
<afterbreak>[^\p{Lu}]|I</afterbreak>
|
1152
1168
|
</rule>
|
1153
1169
|
<rule break="no"><!-- Fig. 8 -->
|
1154
|
-
<beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.\s</beforebreak>
|
1170
|
+
<beforebreak>\b([Vv]ol|[Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
|
1155
1171
|
<afterbreak>\p{N}|[IXV]+</afterbreak>
|
1156
1172
|
</rule>
|
1157
1173
|
<rule break="no"><!-- Fig. (8) -->
|
1158
|
-
<beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.\s</beforebreak>
|
1174
|
+
<beforebreak>\b([Ff]ig|[Dd]ef|[Ee]q|[Ll]em|[Pp]rop|[Tt]hm)s?\.[\s\u00A0]</beforebreak>
|
1159
1175
|
<afterbreak>\(\p{N}\)</afterbreak>
|
1160
1176
|
</rule>
|
1161
1177
|
<rule break="no"><!-- I'm (...) great! -->
|
1162
|
-
<beforebreak>(…|\.\.\.)\s?\)\s</beforebreak>
|
1178
|
+
<beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
|
1163
1179
|
<afterbreak>[^\p{P}]</afterbreak>
|
1164
1180
|
</rule>
|
1165
1181
|
<rule break="no"><!-- I will work with someone (Chris or ...?). -->
|
1166
|
-
<beforebreak>(…|\.\.\.)\s?\?\)\s</beforebreak>
|
1182
|
+
<beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
|
1167
1183
|
<afterbreak>[^\p{P}]</afterbreak>
|
1168
1184
|
</rule>
|
1169
1185
|
<rule break="no"><!-- e.g. -->
|
1170
|
-
<beforebreak>\be\.g\.\s</beforebreak>
|
1186
|
+
<beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
|
1171
1187
|
<afterbreak></afterbreak>
|
1172
1188
|
</rule>
|
1173
1189
|
<rule break="no"><!-- vs. -->
|
1174
|
-
<beforebreak>\bvs\.\s</beforebreak>
|
1190
|
+
<beforebreak>\bvs\.[\s\u00A0]</beforebreak>
|
1175
1191
|
<afterbreak></afterbreak>
|
1176
1192
|
</rule>
|
1177
1193
|
<rule break="no"><!-- esp. -->
|
1178
|
-
<beforebreak>\be[sx]p\.\s</beforebreak>
|
1194
|
+
<beforebreak>\be[sx]p\.[\s\u00A0]</beforebreak>
|
1179
1195
|
<afterbreak></afterbreak>
|
1180
1196
|
</rule>
|
1181
1197
|
<!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
|
1182
1198
|
<rule break="no"><!-- Etc. -->
|
1183
|
-
<beforebreak>\b[Ee]tc\.\s</beforebreak>
|
1199
|
+
<beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
|
1184
1200
|
<afterbreak>[^\p{Lu}]</afterbreak>
|
1185
1201
|
</rule>
|
1186
1202
|
<rule break="no"><!-- BTW (by the way) -->
|
1187
|
-
<beforebreak>\b([Bb]tw|BTW)\.\s</beforebreak>
|
1203
|
+
<beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
|
1188
1204
|
<afterbreak></afterbreak>
|
1189
1205
|
</rule>
|
1190
1206
|
<rule break="no">
|
1191
|
-
<beforebreak>\bJan\.\s</beforebreak>
|
1207
|
+
<beforebreak>\bJan\.[\s\u00A0]</beforebreak>
|
1192
1208
|
<afterbreak></afterbreak>
|
1193
1209
|
</rule>
|
1194
1210
|
<rule break="no">
|
1195
|
-
<beforebreak>\bFeb\.\s</beforebreak>
|
1211
|
+
<beforebreak>\bFeb\.[\s\u00A0]</beforebreak>
|
1196
1212
|
<afterbreak></afterbreak>
|
1197
1213
|
</rule>
|
1198
1214
|
<rule break="no">
|
1199
|
-
<beforebreak>\bMar\.\s</beforebreak>
|
1215
|
+
<beforebreak>\bMar\.[\s\u00A0]</beforebreak>
|
1200
1216
|
<afterbreak></afterbreak>
|
1201
1217
|
</rule>
|
1202
1218
|
<rule break="no">
|
1203
|
-
<beforebreak>\bApr\.\s</beforebreak>
|
1219
|
+
<beforebreak>\bApr\.[\s\u00A0]</beforebreak>
|
1204
1220
|
<afterbreak></afterbreak>
|
1205
1221
|
</rule>
|
1206
1222
|
<rule break="no">
|
1207
|
-
<beforebreak>\bJu[nl]\.\s</beforebreak>
|
1223
|
+
<beforebreak>\bJu[nl]\.[\s\u00A0]</beforebreak>
|
1208
1224
|
<afterbreak></afterbreak>
|
1209
1225
|
</rule>
|
1210
1226
|
<rule break="no">
|
1211
|
-
<beforebreak>\bAug\.\s</beforebreak>
|
1227
|
+
<beforebreak>\bAug\.[\s\u00A0]</beforebreak>
|
1212
1228
|
<afterbreak></afterbreak>
|
1213
1229
|
</rule>
|
1214
1230
|
<rule break="no">
|
1215
|
-
<beforebreak>\bSept
|
1231
|
+
<beforebreak>\bSept?\.[\s\u00A0]</beforebreak>
|
1216
1232
|
<afterbreak></afterbreak>
|
1217
1233
|
</rule>
|
1218
1234
|
<rule break="no">
|
1219
|
-
<beforebreak>\bOct\.\s</beforebreak>
|
1235
|
+
<beforebreak>\bOct\.[\s\u00A0]</beforebreak>
|
1220
1236
|
<afterbreak></afterbreak>
|
1221
1237
|
</rule>
|
1222
1238
|
<rule break="no">
|
1223
|
-
<beforebreak>\bNov\.\s</beforebreak>
|
1239
|
+
<beforebreak>\bNov\.[\s\u00A0]</beforebreak>
|
1224
1240
|
<afterbreak></afterbreak>
|
1225
1241
|
</rule>
|
1226
1242
|
<rule break="no">
|
1227
|
-
<beforebreak>\bDec\.\s</beforebreak>
|
1243
|
+
<beforebreak>\bDec\.[\s\u00A0]</beforebreak>
|
1228
1244
|
<afterbreak></afterbreak>
|
1229
1245
|
</rule>
|
1230
1246
|
<rule break="no">
|
@@ -1236,43 +1252,43 @@
|
|
1236
1252
|
<afterbreak>3|Buzz|Crozz</afterbreak>
|
1237
1253
|
</rule>
|
1238
1254
|
<rule break="no"><!-- Ph.D. (see rule PH_D) -->
|
1239
|
-
<beforebreak>\bP[Hh]
|
1255
|
+
<beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
|
1240
1256
|
<afterbreak></afterbreak>
|
1241
1257
|
</rule>
|
1242
1258
|
<rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
|
1243
|
-
<beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.\s</beforebreak>
|
1259
|
+
<beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
|
1244
1260
|
<afterbreak></afterbreak>
|
1245
1261
|
</rule>
|
1246
1262
|
<rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
|
1247
|
-
<beforebreak>\bLL
|
1263
|
+
<beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
|
1248
1264
|
<afterbreak></afterbreak>
|
1249
1265
|
</rule>
|
1250
1266
|
<rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
|
1251
|
-
<beforebreak>\b[BM]
|
1267
|
+
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
1252
1268
|
<afterbreak>Eng\.?</afterbreak>
|
1253
1269
|
</rule>
|
1254
1270
|
<rule break="no"><!-- LL.B. (Bachelor of Laws) -->
|
1255
|
-
<beforebreak>\bLL
|
1271
|
+
<beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
|
1256
1272
|
<afterbreak>[BM]\.?</afterbreak>
|
1257
1273
|
</rule>
|
1258
1274
|
<rule break="no"><!-- B.Sc. (Bachelor of Science) -->
|
1259
|
-
<beforebreak>\b[BM]
|
1275
|
+
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
1260
1276
|
<afterbreak>Sc\.?</afterbreak>
|
1261
1277
|
</rule>
|
1262
1278
|
<rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
|
1263
|
-
<beforebreak>\b[BM]
|
1279
|
+
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
1264
1280
|
<afterbreak>Comp?\.?</afterbreak>
|
1265
1281
|
</rule>
|
1266
1282
|
<rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
|
1267
|
-
<beforebreak>\b[BM]
|
1283
|
+
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
1268
1284
|
<afterbreak>Arch\.?</afterbreak>
|
1269
1285
|
</rule>
|
1270
1286
|
<rule break="no">
|
1271
|
-
<beforebreak>\b[BM]
|
1287
|
+
<beforebreak>\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0]</beforebreak>
|
1272
1288
|
<afterbreak></afterbreak>
|
1273
1289
|
</rule>
|
1274
1290
|
<rule break="no">
|
1275
|
-
<beforebreak>\bet\b\s\bal\.\s</beforebreak>
|
1291
|
+
<beforebreak>\bet\b[\s\u00A0]\bal\.[\s\u00A0]</beforebreak>
|
1276
1292
|
<afterbreak></afterbreak>
|
1277
1293
|
</rule>
|
1278
1294
|
<rule break="no">
|
@@ -1280,51 +1296,51 @@
|
|
1280
1296
|
<afterbreak></afterbreak>
|
1281
1297
|
</rule>
|
1282
1298
|
<rule break="no">
|
1283
|
-
<beforebreak>\b(Atty|Sg?t|[SG]en|Ft|Gov|Hon|Prof|Mr?s|Mt|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)
|
1299
|
+
<beforebreak>\b(Atty|Sg?t|[SG]en|Ft|Gov|Hon|Prof|Mr?s|Mt|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.[[\s\u00A0]\u00A0]</beforebreak>
|
1284
1300
|
<afterbreak></afterbreak>
|
1285
1301
|
</rule>
|
1286
1302
|
<rule break="no">
|
1287
|
-
<beforebreak>\b(Atty|Sg?t|[SG]en|Gov|Hon|Prof|Mr?s|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.\s[A-Z]\.\s</beforebreak>
|
1303
|
+
<beforebreak>\b(Atty|Sg?t|[SG]en|Gov|Hon|Prof|Mr?s|[DMJS]r|Col|Maj|L(ieu)?t|Brig|Capt|Cmdr|Cmnd|Revd?|Rep)\.[\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
|
1288
1304
|
<afterbreak></afterbreak>
|
1289
1305
|
</rule>
|
1290
1306
|
<rule break="no">
|
1291
|
-
<beforebreak>\b(Drs|Messrs|Mmes)\.\s</beforebreak>
|
1292
|
-
<afterbreak>(and\s)|\p{Lu}\p{Ll}+</afterbreak>
|
1307
|
+
<beforebreak>\b(Drs|Messrs|Mmes)\.[\s\u00A0]</beforebreak>
|
1308
|
+
<afterbreak>(and[\s\u00A0])|\p{Lu}\p{Ll}+</afterbreak>
|
1293
1309
|
</rule>
|
1294
1310
|
<rule break="no">
|
1295
|
-
<beforebreak>\bcf\.\s</beforebreak>
|
1311
|
+
<beforebreak>\bcf\.[\s\u00A0]</beforebreak>
|
1296
1312
|
<afterbreak></afterbreak>
|
1297
1313
|
</rule>
|
1298
1314
|
<rule break="no">
|
1299
|
-
<beforebreak>\bI(nc|NC)\.\s</beforebreak>
|
1315
|
+
<beforebreak>\bI(nc|NC)\.[\s\u00A0]</beforebreak>
|
1300
1316
|
<afterbreak></afterbreak>
|
1301
1317
|
</rule>
|
1302
1318
|
<rule break="no">
|
1303
|
-
<beforebreak>\bCorp\.\s</beforebreak>
|
1319
|
+
<beforebreak>\bCorp\.[\s\u00A0]</beforebreak>
|
1304
1320
|
<afterbreak></afterbreak>
|
1305
1321
|
</rule>
|
1306
1322
|
<rule break="no">
|
1307
|
-
<beforebreak>\bBros\.\s</beforebreak>
|
1323
|
+
<beforebreak>\bBros\.[\s\u00A0]</beforebreak>
|
1308
1324
|
<afterbreak></afterbreak>
|
1309
1325
|
</rule>
|
1310
1326
|
<rule break="no">
|
1311
|
-
<beforebreak>\bDist\.\s</beforebreak>
|
1327
|
+
<beforebreak>\bDist\.[\s\u00A0]</beforebreak>
|
1312
1328
|
<afterbreak></afterbreak>
|
1313
1329
|
</rule>
|
1314
1330
|
<rule break="no">
|
1315
|
-
<beforebreak>\bCo\.\s</beforebreak>
|
1331
|
+
<beforebreak>\bCo\.[\s\u00A0]</beforebreak>
|
1316
1332
|
<afterbreak></afterbreak>
|
1317
1333
|
</rule>
|
1318
1334
|
<rule break="no">
|
1319
|
-
<beforebreak>\bo'clock\s</beforebreak>
|
1335
|
+
<beforebreak>\bo'clock[\s\u00A0]</beforebreak>
|
1320
1336
|
<afterbreak></afterbreak>
|
1321
1337
|
</rule>
|
1322
1338
|
<rule break="no">
|
1323
|
-
<beforebreak>\bfo'c'sle\s</beforebreak>
|
1339
|
+
<beforebreak>\bfo'c'sle[\s\u00A0]</beforebreak>
|
1324
1340
|
<afterbreak></afterbreak>
|
1325
1341
|
</rule>
|
1326
1342
|
<rule break="no">
|
1327
|
-
<beforebreak>\bLtd\.\s</beforebreak>
|
1343
|
+
<beforebreak>\bLtd\.[\s\u00A0]</beforebreak>
|
1328
1344
|
<afterbreak>\p{Ll}+</afterbreak>
|
1329
1345
|
</rule>
|
1330
1346
|
<rule break="no">
|
@@ -1340,35 +1356,35 @@
|
|
1340
1356
|
<afterbreak>\p{Ll}</afterbreak>
|
1341
1357
|
</rule>
|
1342
1358
|
<rule break="no">
|
1343
|
-
<beforebreak>["”'’]\s*</beforebreak>
|
1344
|
-
<afterbreak>\s*\p{Ll}</afterbreak>
|
1359
|
+
<beforebreak>["”'’][\s\u00A0]*</beforebreak>
|
1360
|
+
<afterbreak>[\s\u00A0]*\p{Ll}</afterbreak>
|
1345
1361
|
</rule>
|
1346
1362
|
<rule break="no">
|
1347
|
-
<beforebreak>['"„][\.!?…]['"”]\s</beforebreak>
|
1363
|
+
<beforebreak>['"„][\.!?…]['"”][\s\u00A0]</beforebreak>
|
1348
1364
|
<afterbreak></afterbreak>
|
1349
1365
|
</rule>
|
1350
1366
|
<rule break="no">
|
1351
|
-
<beforebreak>\b\p{L}\.\s</beforebreak>
|
1352
|
-
<afterbreak>\p{L}\.\s</afterbreak>
|
1367
|
+
<beforebreak>\b\p{L}\.[\s\u00A0]</beforebreak>
|
1368
|
+
<afterbreak>\p{L}\.[\s\u00A0]</afterbreak>
|
1353
1369
|
</rule>
|
1354
1370
|
<rule break="no">
|
1355
1371
|
<beforebreak>\b\p{L}\.</beforebreak>
|
1356
1372
|
<afterbreak>\p{L}\.</afterbreak>
|
1357
1373
|
</rule>
|
1358
1374
|
<rule break="no"><!-- Jones v. Smith -->
|
1359
|
-
<beforebreak>\p{Lu}\p{L}+\sv\.\s</beforebreak>
|
1375
|
+
<beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
|
1360
1376
|
<afterbreak>\p{Lu}\p{L}+</afterbreak>
|
1361
1377
|
</rule>
|
1362
1378
|
<rule break="yes">
|
1363
|
-
<beforebreak>[^,][\s]\p{L}{2}\.\s</beforebreak>
|
1364
|
-
<afterbreak>\p{N}+\)\s</afterbreak>
|
1379
|
+
<beforebreak>[^,][\s\u00A0]\p{L}{2}\.[\s\u00A0]</beforebreak>
|
1380
|
+
<afterbreak>\p{N}+\)[\s\u00A0]</afterbreak>
|
1365
1381
|
</rule>
|
1366
1382
|
<rule break="yes">
|
1367
|
-
<beforebreak>\bOK\.\s</beforebreak>
|
1383
|
+
<beforebreak>\bOK\.[\s\u00A0]</beforebreak>
|
1368
1384
|
<afterbreak>\p{Ll}+</afterbreak>
|
1369
1385
|
</rule>
|
1370
1386
|
<rule break="no">
|
1371
|
-
<beforebreak>[\.\s]\p{L}{1,2}
|
1387
|
+
<beforebreak>[\.\s\u00A0](?!(on|it|of|to|be|by|at|he|we|so|do|if|up|my|me|us|go|am))\p{L}{1,2}\.[\s\u00A0]</beforebreak><!-- not 'no'/'in', these could be abbreviations-->
|
1372
1388
|
<afterbreak>[\p{N}\p{Ll}]</afterbreak>
|
1373
1389
|
</rule>
|
1374
1390
|
<rule break="no">
|
@@ -1376,35 +1392,35 @@
|
|
1376
1392
|
<afterbreak>[^\p{Lu}]</afterbreak>
|
1377
1393
|
</rule>
|
1378
1394
|
<rule break="no">
|
1379
|
-
<beforebreak>\b\p{Lu}\.\s\p{Lu}\.\s</beforebreak>
|
1395
|
+
<beforebreak>\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0]</beforebreak>
|
1380
1396
|
<afterbreak></afterbreak>
|
1381
1397
|
</rule>
|
1382
1398
|
<rule break="no">
|
1383
|
-
<beforebreak>\b\p{Lu}\.\p{Lu}\.\s</beforebreak>
|
1399
|
+
<beforebreak>\b\p{Lu}\.\p{Lu}\.[\s\u00A0]</beforebreak>
|
1384
1400
|
<afterbreak></afterbreak>
|
1385
1401
|
</rule>
|
1386
1402
|
<rule break="no">
|
1387
|
-
<beforebreak>[^\.]\s[A-Z]\.\s</beforebreak>
|
1403
|
+
<beforebreak>[^\.][\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
|
1388
1404
|
<afterbreak></afterbreak>
|
1389
1405
|
</rule>
|
1390
1406
|
<rule break="no">
|
1391
|
-
<beforebreak>\b(:?Blvd|Ave|Mts?)\.\s</beforebreak>
|
1407
|
+
<beforebreak>\b(:?Blvd|Ave|Mts?)\.[\s\u00A0]</beforebreak>
|
1392
1408
|
<afterbreak>\p{Ll}+</afterbreak>
|
1393
1409
|
</rule>
|
1394
1410
|
<rule break="no">
|
1395
|
-
<beforebreak>\b(?:Kan|Ill|M[ai]ss)\.\s</beforebreak>
|
1411
|
+
<beforebreak>\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0]</beforebreak>
|
1396
1412
|
<afterbreak>\p{Ll}+</afterbreak>
|
1397
1413
|
</rule>
|
1398
1414
|
<rule break="no">
|
1399
|
-
<beforebreak>\(\p{Ll}+\.\s</beforebreak>
|
1415
|
+
<beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
|
1400
1416
|
<afterbreak></afterbreak>
|
1401
1417
|
</rule>
|
1402
1418
|
<rule break="no"><!-- i.e. -->
|
1403
|
-
<beforebreak>i\.e
|
1419
|
+
<beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
|
1404
1420
|
<afterbreak></afterbreak>
|
1405
1421
|
</rule>
|
1406
1422
|
<rule break="yes">
|
1407
|
-
<beforebreak>[\.!?…][\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*\s</beforebreak>
|
1423
|
+
<beforebreak>[\.!?…][\u00BB\u2019\u201D\u203A"'\p{Pe}\u0002¹²³]*[\s\u00A0]</beforebreak>
|
1408
1424
|
<afterbreak></afterbreak>
|
1409
1425
|
</rule>
|
1410
1426
|
<rule break="yes">
|
@@ -1412,7 +1428,7 @@
|
|
1412
1428
|
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
1413
1429
|
</rule>
|
1414
1430
|
<rule break="yes">
|
1415
|
-
<beforebreak>\s\p{L}[\.!?…]\s</beforebreak>
|
1431
|
+
<beforebreak>[\s\u00A0]\p{L}[\.!?…][\s\u00A0]</beforebreak>
|
1416
1432
|
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
|
1417
1433
|
</rule>
|
1418
1434
|
</languagerule>
|
@@ -1511,6 +1527,16 @@
|
|
1511
1527
|
</rule>
|
1512
1528
|
</languagerule>
|
1513
1529
|
<languagerule languagerulename="Dutch">
|
1530
|
+
<rule break="no">
|
1531
|
+
<!-- sp.a -->
|
1532
|
+
<beforebreak>\b(sp|SP)</beforebreak>
|
1533
|
+
<afterbreak>\.[aA]\b</afterbreak>
|
1534
|
+
</rule>
|
1535
|
+
<rule break="no">
|
1536
|
+
<!-- .Net -->
|
1537
|
+
<beforebreak>\s[.]</beforebreak>
|
1538
|
+
<afterbreak>[Nn][Ee][Tt](\b|-)</afterbreak>
|
1539
|
+
</rule>
|
1514
1540
|
<rule break="no"><!-- quoted sentence in sentence -->
|
1515
1541
|
<beforebreak>[.?!][’'"]</beforebreak>
|
1516
1542
|
<afterbreak> [a-z]</afterbreak>
|
@@ -1524,11 +1550,11 @@
|
|
1524
1550
|
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
|
1525
1551
|
</rule>
|
1526
1552
|
<rule break="no">
|
1527
|
-
<beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|([Pp]rof\.)?[Dd]r|Em|Fa|Kon)\.\s</beforebreak>
|
1553
|
+
<beforebreak>\b(Drs|Art|Afr|Am|Ar|Br|Cie|Comp|Dhr|([Pp]rof\.)?[Dd]r|Em|Fa|Kon|Bros)\.\s</beforebreak>
|
1528
1554
|
<afterbreak></afterbreak>
|
1529
1555
|
</rule>
|
1530
1556
|
<rule break="no">
|
1531
|
-
<beforebreak>\b(Mej|Mevr|Mgr|Mw|Ndl|Ned|Nl|No|Prof|Secr|Chr)\.\s</beforebreak>
|
1557
|
+
<beforebreak>\b(Mej|Mevr|Mgr|Mw|Ndl|Ned|Nl|No|Prof|Secr|Chr|Jac)\.\s</beforebreak>
|
1532
1558
|
<afterbreak></afterbreak>
|
1533
1559
|
</rule>
|
1534
1560
|
<rule break="no">
|
@@ -1668,12 +1694,28 @@
|
|
1668
1694
|
<beforebreak>\bprof\.\s</beforebreak>
|
1669
1695
|
<afterbreak></afterbreak>
|
1670
1696
|
</rule>
|
1697
|
+
<rule break="no">
|
1698
|
+
<beforebreak>[.!?…][’'"]\s</beforebreak>
|
1699
|
+
<afterbreak>[a-z]</afterbreak>
|
1700
|
+
</rule>
|
1701
|
+
<rule break="no">
|
1702
|
+
<beforebreak>[.][.]\s</beforebreak>
|
1703
|
+
<afterbreak>[a-z]</afterbreak>
|
1704
|
+
</rule>
|
1705
|
+
<rule break="no">
|
1706
|
+
<beforebreak>SP[.]</beforebreak>
|
1707
|
+
<afterbreak>A</afterbreak>
|
1708
|
+
</rule>
|
1709
|
+
<rule break="no">
|
1710
|
+
<beforebreak>Warner Bros\.</beforebreak>
|
1711
|
+
<afterbreak>[a-z]</afterbreak>
|
1712
|
+
</rule>
|
1671
1713
|
<rule break="yes">
|
1672
|
-
<beforebreak>[\.!?…][\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002¹²³]*\s</beforebreak>
|
1714
|
+
<beforebreak>[\.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002¹²³]*\s</beforebreak>
|
1673
1715
|
<afterbreak></afterbreak>
|
1674
1716
|
</rule>
|
1675
1717
|
<rule break="yes">
|
1676
|
-
<beforebreak>[\.!?…]['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]*</beforebreak>
|
1718
|
+
<beforebreak>[\.!?…][’'"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]*</beforebreak>
|
1677
1719
|
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
1678
1720
|
</rule>
|
1679
1721
|
<rule break="yes">
|
@@ -1705,6 +1747,40 @@
|
|
1705
1747
|
<beforebreak>\bmax\.\s</beforebreak>
|
1706
1748
|
<afterbreak>\p{Ll}</afterbreak>
|
1707
1749
|
</rule>
|
1750
|
+
<rule break="yes">
|
1751
|
+
<beforebreak>[?!.]['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002]\s</beforebreak>
|
1752
|
+
<afterbreak>[A-Z][a-z]</afterbreak>
|
1753
|
+
</rule>
|
1754
|
+
<rule break="yes">
|
1755
|
+
<beforebreak>[?!.]\s</beforebreak>
|
1756
|
+
<afterbreak>['"\u00BB\u2019\u201D\u203A\u00AB\p{Pe}\u0002][A-Z][a-z]</afterbreak>
|
1757
|
+
</rule>
|
1758
|
+
<rule break="no">
|
1759
|
+
<!-- "E. coli etc. -->
|
1760
|
+
<beforebreak>"[A-Z][.]\s</beforebreak>
|
1761
|
+
<afterbreak>[a-z]</afterbreak>
|
1762
|
+
</rule>
|
1763
|
+
<rule break="no">
|
1764
|
+
<!-- Cornelisz. -->
|
1765
|
+
<beforebreak>[A-Z][a-z].*sz[.]\s</beforebreak>
|
1766
|
+
<afterbreak>[a-z]</afterbreak>
|
1767
|
+
</rule>
|
1768
|
+
<rule break="no">
|
1769
|
+
<!-- De n. XIV/vagus (nervus) -->
|
1770
|
+
<beforebreak>De n[.]\s</beforebreak>
|
1771
|
+
<afterbreak>[a-z]|[XIV]</afterbreak>
|
1772
|
+
</rule>
|
1773
|
+
<rule break="no">
|
1774
|
+
<!-- MOL.E -->
|
1775
|
+
<beforebreak>[A-Z]{2,5}[.]</beforebreak>
|
1776
|
+
<afterbreak>[A-Z]</afterbreak>
|
1777
|
+
</rule>
|
1778
|
+
<rule break="no">
|
1779
|
+
<!-- ..." betekent -->
|
1780
|
+
<beforebreak>\.\.</beforebreak>
|
1781
|
+
<afterbreak>" [a-z]</afterbreak>
|
1782
|
+
</rule>
|
1783
|
+
<!-- ##### end of Dutch #### -->
|
1708
1784
|
</languagerule>
|
1709
1785
|
<languagerule languagerulename="Slovak">
|
1710
1786
|
<rule break="no">
|
@@ -4263,7 +4339,7 @@
|
|
4263
4339
|
<rule break="no">
|
4264
4340
|
<beforebreak>\b(бульв|г|д|доп|др|е|зам|Зам|и|им|инд|исп|Исп)\.\s</beforebreak>
|
4265
4341
|
<afterbreak></afterbreak>
|
4266
|
-
</rule>
|
4342
|
+
</rule>
|
4267
4343
|
<rule break="no">
|
4268
4344
|
<beforebreak>\b(англ|в|вв|га|гг|гл|гос|грн|дм|долл|е|ед)\.\s</beforebreak>
|
4269
4345
|
<afterbreak>\p{Ll}</afterbreak>
|
@@ -4531,146 +4607,146 @@
|
|
4531
4607
|
</languagerule>
|
4532
4608
|
<languagerule languagerulename="Catalan">
|
4533
4609
|
<rule break="no">
|
4534
|
-
<beforebreak>Yahoo!\s</beforebreak>
|
4610
|
+
<beforebreak>Yahoo![\s\u00A0]</beforebreak>
|
4535
4611
|
<afterbreak>\p{Ll}</afterbreak>
|
4536
4612
|
</rule>
|
4537
4613
|
<rule break="yes">
|
4538
|
-
<beforebreak>\w['’][nNtT]\.\s</beforebreak>
|
4614
|
+
<beforebreak>\w['’][nNtT]\.[\s\u00A0]</beforebreak>
|
4539
4615
|
<afterbreak></afterbreak>
|
4540
4616
|
</rule>
|
4541
4617
|
<rule break="yes">
|
4542
|
-
<beforebreak>\.\[\d+\]\s</beforebreak>
|
4618
|
+
<beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
|
4543
4619
|
<afterbreak></afterbreak>
|
4544
4620
|
</rule>
|
4545
4621
|
<!-- initials: A. C. Jones. Problem: [...] d'Alfons I. Ell era [...] -->
|
4546
4622
|
<rule break="no">
|
4547
|
-
<beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.\s</beforebreak>
|
4623
|
+
<beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
|
4548
4624
|
<afterbreak></afterbreak>
|
4549
4625
|
</rule>
|
4550
4626
|
<!-- Abbreviations that cannot finish sentences-->
|
4551
4627
|
<rule break="no">
|
4552
|
-
<beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))
|
4628
|
+
<beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|Dra\. Ma|Sta\. Ma|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
|
4553
4629
|
<afterbreak></afterbreak>
|
4554
4630
|
</rule>
|
4555
4631
|
<!-- Abbreviations that can finish sentences -->
|
4556
4632
|
<rule break="no">
|
4557
|
-
<beforebreak>\
|
4633
|
+
<beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak>
|
4558
4634
|
<afterbreak>[XIV]+\b</afterbreak>
|
4559
4635
|
</rule>
|
4560
4636
|
<rule break="no">
|
4561
|
-
<beforebreak>\b(min|m)\.\s</beforebreak>
|
4637
|
+
<beforebreak>\b(min|m|ca)\.[\s\u00A0]</beforebreak>
|
4562
4638
|
<afterbreak>[0-9]+\b</afterbreak>
|
4563
4639
|
</rule>
|
4564
4640
|
<rule break="no">
|
4565
|
-
<beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.\s</beforebreak>
|
4641
|
+
<beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol)\.[\s\u00A0]</beforebreak>
|
4566
4642
|
<afterbreak>[XIV\d]+\b</afterbreak>
|
4567
4643
|
</rule>
|
4568
4644
|
<rule break="no">
|
4569
|
-
<beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']
|
4645
|
+
<beforebreak>\b([Ee]ds?|[Cc]oords?|\d+(r|n|t|è|é|ns|es)|masc|fem|sing|pl|adj|adv|g|kg|m|km|cm|ha|u|h|hrs|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4570
4646
|
<afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
|
4571
4647
|
</rule>
|
4572
4648
|
<!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
|
4573
4649
|
<rule break="no">
|
4574
|
-
<beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
|
4650
|
+
<beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4575
4651
|
<afterbreak>\p{Ll}</afterbreak>
|
4576
4652
|
</rule>
|
4577
4653
|
<!-- Any word in acronyms like EE.UU. or BB. DD. -->
|
4578
4654
|
<rule break="no">
|
4579
|
-
<beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
|
4655
|
+
<beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4580
4656
|
<afterbreak>\p{Ll}</afterbreak>
|
4581
4657
|
</rule>
|
4582
4658
|
<rule break="no">
|
4583
|
-
<beforebreak>\
|
4584
|
-
<afterbreak
|
4659
|
+
<beforebreak>\b\p{Lu}{2}\.[\s\u00A0]?</beforebreak>
|
4660
|
+
<afterbreak>\p{Lu}{2}</afterbreak>
|
4585
4661
|
</rule>
|
4586
4662
|
<rule break="no">
|
4587
|
-
<beforebreak>EE
|
4663
|
+
<beforebreak>EE\.[\s\u00A0]?UU\.[\s\u00A0]?</beforebreak>
|
4588
4664
|
<afterbreak>\p{Ll}</afterbreak>
|
4589
4665
|
</rule>
|
4590
4666
|
<!-- max min etc -->
|
4591
4667
|
<rule break="no">
|
4592
|
-
<beforebreak>\b([Ee]tc|m[aáà]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
|
4668
|
+
<beforebreak>\b([Ee]tc|m[aáà]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4593
4669
|
<afterbreak>\p{Ll}</afterbreak>
|
4594
4670
|
</rule>
|
4595
4671
|
<!-- Composed abbrev. -->
|
4596
4672
|
<rule break="no">
|
4597
|
-
<beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
|
4673
|
+
<beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4598
4674
|
<afterbreak></afterbreak>
|
4599
4675
|
</rule>
|
4600
4676
|
<!-- Units -->
|
4601
4677
|
<rule break="no">
|
4602
|
-
<beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*\s</beforebreak>
|
4678
|
+
<beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4603
4679
|
<afterbreak>\p{Ll}</afterbreak>
|
4604
4680
|
</rule>
|
4605
4681
|
<!-- Ellipsis: ... lowercase -->
|
4606
4682
|
<rule break="no">
|
4607
|
-
<beforebreak>[^\s](\Q...\E|…)\s</beforebreak>
|
4683
|
+
<beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak>
|
4608
4684
|
<afterbreak>\p{Ll}</afterbreak>
|
4609
4685
|
</rule>
|
4610
4686
|
<!-- (enum...) -->
|
4611
4687
|
<rule break="no">
|
4612
|
-
<beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”]\s</beforebreak>
|
4688
|
+
<beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
|
4613
4689
|
<afterbreak>\p{Ll}</afterbreak>
|
4614
4690
|
</rule>
|
4615
4691
|
<!-- pero ¡ah! no estaba
|
4616
4692
|
<rule break="no">
|
4617
|
-
<beforebreak>\b¡\p{L}+!\s</beforebreak>
|
4693
|
+
<beforebreak>\b¡\p{L}+![\s\u00A0]</beforebreak>
|
4618
4694
|
<afterbreak>\p{Ll}</afterbreak>
|
4619
4695
|
</rule>
|
4620
4696
|
-->
|
4621
4697
|
<rule break="yes">
|
4622
|
-
<beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*\s</beforebreak>
|
4698
|
+
<beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*[\s\u00A0]</beforebreak>
|
4623
4699
|
<afterbreak></afterbreak>
|
4624
4700
|
</rule>
|
4625
4701
|
<rule break="yes">
|
4626
|
-
<beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+\s</beforebreak>
|
4702
|
+
<beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0]</beforebreak>
|
4627
4703
|
<afterbreak>[¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
|
4628
4704
|
</rule>
|
4629
4705
|
<!-- paragraphs with opening "»" in dialogs-->
|
4630
4706
|
<rule break="yes">
|
4631
|
-
<beforebreak>[\.:!?…»]+\s</beforebreak>
|
4632
|
-
<afterbreak>»[^\s\.:!?…]</afterbreak>
|
4707
|
+
<beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak>
|
4708
|
+
<afterbreak>»[^\u00A0\s\.:!?…]</afterbreak>
|
4633
4709
|
</rule>
|
4634
4710
|
</languagerule>
|
4635
4711
|
<languagerule languagerulename="Spanish">
|
4636
4712
|
<rule break="no">
|
4637
|
-
<beforebreak>Yahoo!\s</beforebreak>
|
4713
|
+
<beforebreak>Yahoo![\s\u00A0]</beforebreak>
|
4638
4714
|
<afterbreak>\p{Ll}</afterbreak>
|
4639
4715
|
</rule>
|
4640
4716
|
<rule break="yes">
|
4641
|
-
<beforebreak>\.\[\d+\]\s</beforebreak>
|
4717
|
+
<beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
|
4642
4718
|
<afterbreak></afterbreak>
|
4643
4719
|
</rule>
|
4644
4720
|
<!-- initials: A. C. Jones. Problem: [...] de Alfons I. Él era [...] -->
|
4645
4721
|
<rule break="no">
|
4646
|
-
<beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.\s</beforebreak>
|
4722
|
+
<beforebreak>\b[A-ZÀÉÈÍÓÒÚ]\.[\s\u00A0]</beforebreak>
|
4647
4723
|
<afterbreak/>
|
4648
4724
|
</rule>
|
4649
4725
|
<!-- Ellipsis: ... lowercase -->
|
4650
4726
|
<rule break="no">
|
4651
|
-
<beforebreak>[^\s](\Q...\E|…)\s</beforebreak>
|
4727
|
+
<beforebreak>[^\s\u00A0](\Q...\E|…)[\s\u00A0]</beforebreak>
|
4652
4728
|
<afterbreak>\p{Ll}</afterbreak>
|
4653
4729
|
</rule>
|
4654
4730
|
<!-- (enum...) -->
|
4655
4731
|
<rule break="no">
|
4656
|
-
<beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”]\s</beforebreak>
|
4732
|
+
<beforebreak>\b(\Q...\E|…)[\p{Pe}»"’”][\s\u00A0]</beforebreak>
|
4657
4733
|
<afterbreak>\p{Ll}</afterbreak>
|
4658
4734
|
</rule>
|
4659
4735
|
<!-- Abbreviations that can finish sentences -->
|
4660
4736
|
<rule break="no">
|
4661
|
-
<beforebreak>\
|
4737
|
+
<beforebreak>\b(s|ca)\.[\s\u00A0]</beforebreak>
|
4662
4738
|
<afterbreak>[XIV]+\b</afterbreak>
|
4663
4739
|
</rule>
|
4664
4740
|
<rule break="no">
|
4665
|
-
<beforebreak>\b(min|m)\.\s</beforebreak>
|
4741
|
+
<beforebreak>\b(min|m|ca)\.[\s\u00A0]</beforebreak>
|
4666
4742
|
<afterbreak>[0-9]+\b</afterbreak>
|
4667
4743
|
</rule>
|
4668
4744
|
<rule break="no">
|
4669
|
-
<beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.\s</beforebreak>
|
4745
|
+
<beforebreak>\b([Cc]ap|[Aa]rts?|pp|[Vv]ol|p|[Pp][aá]gs?|ps)\.[\s\u00A0]</beforebreak>
|
4670
4746
|
<afterbreak>[XIV\d]+\b</afterbreak>
|
4671
4747
|
</rule>
|
4672
4748
|
<rule break="no">
|
4673
|
-
<beforebreak>\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']
|
4749
|
+
<beforebreak>\b(\d+(r|er|n|ero|era|mo|ma|vo|va|no|na|to|ta|do|da|h|hr|gr|grs|o|a)s?|g|kg|m|km|cm|ha|u|h|hrs|H|HR|HRS|s|ss|alt|cant|cast|cert|com|dir|gr|nom|parc|pres|set|Sr|Jr|Admón|Adm|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4674
4750
|
<afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
|
4675
4751
|
</rule>
|
4676
4752
|
<rule break="no">
|
@@ -4685,75 +4761,75 @@
|
|
4685
4761
|
</rule>
|
4686
4762
|
<!-- Abbreviations that cannot finish sentences-->
|
4687
4763
|
<rule break="no">
|
4688
|
-
<beforebreak>\b(dc|(?iu)(n|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))
|
4764
|
+
<beforebreak>\b(dc|(?iu)(n|[Aa]yto|Mr|C|Dr|Dra|E|Emm|Emma|Excm|Excma|Hble|I|Il·lm|Il·lma|Il·ltre|Im|Ima|Mgfc|Mgfca|Mn|R|Rev|Sr|Sra|Sres|Sras|Srs|St|Sta|a|abr|abs|acad|add|adj|adm|admdor|admdora|admtiu|admtiva|adv|ag|agl|agr|agron|agròn|aj|ajud|al|alim|amb|ampl|ant|ap|apmt|apnt|apr|aprox|apt|arm|arq|arqueol|arquit|assign|assoc|atm|aut|aux|av|b|batx|bda|bibl|bl|bnc|butll|bxs|c|calef|cartogr|cat|catedr|catol|cf|cia|cin|cint|circul|cit|climat|col|col·l|compt|cons|constr|cont|contr|conv|corp|corr|cpl|cpt|cró|ct|cte|ctra|cts|d|dept|derog|des|desp|dg|dip|disp|distr|div|dj|dl|doc|drec|ds|dt|dta|dte|dupl|dv|e|econ|ed|ef|entl|esc|esp|espf|esq|ex|exc|exp|exped|ext|f|fac|fca|febr|fig|figs|fra|gen|gov|gral|i|imp|impr|impt|inc|insp|inst|int|inv|j|jul|jur|jurispr|leg|llic|loc|ltda|làm|merc|mil·l|màx|mín|neg|nov|nre|núm|o|oct|op|p|pàg|pàgs|paq|par|pda|pg|pl|pobl|pol|ppda|ppt|pral|prev|prof|progr|prov|pta|ptes|ptge|pvt|pàg|quadr|quint|r|rbla|ref|reg|rev|secr|serv|sgt|sotsp|subsp|supl|supt|t|tel|telegr|tit|trad|trans|transcr|transf|trav|tripl|trv|tt|tèc|univ|urb|v|var|veg|venc|vid|vig|vocab|vs|x|àt|íd))\.[\s\u00A0]</beforebreak>
|
4689
4765
|
<afterbreak/>
|
4690
4766
|
</rule>
|
4691
4767
|
<rule break="no">
|
4692
|
-
<beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)
|
4768
|
+
<beforebreak>\b([Aa]vda|[Pp][ol]|Pl?za|[Aa]dm|[Dd]pto|Sr|Mr|Srta|ej)\.[\s\u00A0]</beforebreak>
|
4693
4769
|
<afterbreak/>
|
4694
4770
|
</rule>
|
4695
4771
|
<rule break="no">
|
4696
|
-
<beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)
|
4772
|
+
<beforebreak>\b(Dña|Dr[a]?|Sra|Sto|S(ri)?ta|Ldo|Ing|Prof|Excmo|Ilmo|Mgfco|admdor|admdora)\.[\s\u00A0]</beforebreak>
|
4697
4773
|
<afterbreak/>
|
4698
4774
|
</rule>
|
4699
4775
|
<rule break="no">
|
4700
|
-
<beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)
|
4776
|
+
<beforebreak>\b([Aa]rt|[Cc]ód|[Ss]ecc|[Tt]ít)\.[\s\u00A0]</beforebreak>
|
4701
4777
|
<afterbreak/>
|
4702
4778
|
</rule>
|
4703
4779
|
<rule break="no">
|
4704
|
-
<beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr
|
4780
|
+
<beforebreak>\b([Ee]d(it)?|[Nn]o|n|[Nn]úm|[Pp]ág|p|c|\d+er)|[V\.]gr\.[\s\u00A0]</beforebreak>
|
4705
4781
|
<afterbreak/>
|
4706
4782
|
</rule>
|
4707
4783
|
<!-- Abbreviations that can finish sentences -->
|
4708
4784
|
<rule break="no">
|
4709
|
-
<beforebreak>\b([Ee]ds?|[Cc]oords?|grs?|Sr|Jr|Admón|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']
|
4785
|
+
<beforebreak>\b([Ee]ds?|[Cc]oords?|grs?|Sr|Jr|Admón|Inc|Co|Hnos|Vda|[VU]d[s]?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4710
4786
|
<afterbreak>[\-¡¿«»"'\u2018\u201C\p{Ps}\u2012\u2013\u2014\u2015\u2053]*\p{Ll}</afterbreak>
|
4711
4787
|
</rule>
|
4712
4788
|
<!-- Any word in acronyms like U.S.A.F or F. B. I. or C. or c.s.p. or p. e. -->
|
4713
4789
|
<rule break="no">
|
4714
|
-
<beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']
|
4790
|
+
<beforebreak>\b(\p{L}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4715
4791
|
<afterbreak>\p{Ll}</afterbreak>
|
4716
4792
|
</rule>
|
4717
4793
|
<!-- Any word in acronyms like EE.UU. or BB. DD. -->
|
4718
4794
|
<rule break="no">
|
4719
|
-
<beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']
|
4795
|
+
<beforebreak>\b([\p{Lu}]{2}\.)+[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4720
4796
|
<afterbreak>\p{Ll}</afterbreak>
|
4721
4797
|
</rule>
|
4722
4798
|
<rule break="no">
|
4723
|
-
<beforebreak>\
|
4724
|
-
<afterbreak
|
4799
|
+
<beforebreak>\b\p{Lu}{2}\.[\s\u00A0]?</beforebreak>
|
4800
|
+
<afterbreak>\p{Lu}{2}</afterbreak>
|
4725
4801
|
</rule>
|
4726
4802
|
<rule break="no">
|
4727
|
-
<beforebreak>EE
|
4803
|
+
<beforebreak>EE\.[\s\u00A0]?UU\.[\s\u00A0]?</beforebreak>
|
4728
4804
|
<afterbreak>\p{Ll}</afterbreak>
|
4729
4805
|
</rule>
|
4730
4806
|
<!-- max min etc -->
|
4731
4807
|
<rule break="no">
|
4732
|
-
<beforebreak>\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']
|
4808
|
+
<beforebreak>\b([Ee]tc|m[aá]x|m[ií]n|aprox|\d+o)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4733
4809
|
<afterbreak>\p{Ll}</afterbreak>
|
4734
4810
|
</rule>
|
4735
4811
|
<!-- Composed abbrev. -->
|
4736
4812
|
<rule break="no">
|
4737
|
-
<beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']
|
4813
|
+
<beforebreak>\bet al\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4738
4814
|
<afterbreak/>
|
4739
4815
|
</rule>
|
4740
4816
|
<!-- Units -->
|
4741
4817
|
<rule break="no">
|
4742
|
-
<beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']
|
4818
|
+
<beforebreak>\b([Pp]ta[s]?|K[gm][s]|[mc]?[gmls]|[Hh](rs)?)\.[\p{Pe}\p{Pf}\p{Pd}"']*[\s\u00A0]</beforebreak>
|
4743
4819
|
<afterbreak>\p{Ll}</afterbreak>
|
4744
4820
|
</rule>
|
4745
4821
|
<rule break="yes">
|
4746
|
-
<beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]
|
4822
|
+
<beforebreak>[\.…][\u00BB\u2019\u201D\u203A"'\u0002]*[\s\u00A0]</beforebreak>
|
4747
4823
|
<afterbreak></afterbreak>
|
4748
4824
|
</rule>
|
4749
4825
|
<rule break="yes">
|
4750
|
-
<beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]
|
4826
|
+
<beforebreak>\b[\p{L}'’·\-]+[\p{Pf}\p{Pe}\u00BB\u2019\u201D\u203A"'\u0002]*[\.:!?…]+[\s\u00A0]</beforebreak>
|
4751
4827
|
<afterbreak>[¡¿«»"'\u2018\u201C"\p{Ps}]*\p{Lu}\p{L}*</afterbreak>
|
4752
4828
|
</rule>
|
4753
4829
|
<!-- paragraphs with opening "»" in dialogs-->
|
4754
4830
|
<rule break="yes">
|
4755
|
-
<beforebreak>[\.:!?…»]
|
4756
|
-
<afterbreak>»[^\s\.:!?…]</afterbreak>
|
4831
|
+
<beforebreak>[\.:!?…»]+[\s\u00A0]</beforebreak>
|
4832
|
+
<afterbreak>»[^\u00A0\s\.:!?…]</afterbreak>
|
4757
4833
|
</rule>
|
4758
4834
|
</languagerule>
|
4759
4835
|
<languagerule languagerulename="German">
|
@@ -4767,17 +4843,17 @@
|
|
4767
4843
|
</rule>
|
4768
4844
|
<!--support simple lists in markdown style-->
|
4769
4845
|
<rule break="yes">
|
4770
|
-
<beforebreak>\r?\n\s*[-*]
|
4846
|
+
<beforebreak>\r?\n[\u00A0\s]*[-*]+[\u00A0\s]</beforebreak>
|
4771
4847
|
<afterbreak></afterbreak>
|
4772
4848
|
</rule>
|
4773
4849
|
<!-- Split at e.g. "1a. Und ..." -->
|
4774
4850
|
<rule break="yes">
|
4775
|
-
<beforebreak>\d+[a-z]
|
4851
|
+
<beforebreak>\d+[a-z]\.[\u00A0\s]</beforebreak>
|
4776
4852
|
<afterbreak>\p{Lu}</afterbreak>
|
4777
4853
|
</rule>
|
4778
4854
|
<!-- Don't split at e.g. "d. h." -->
|
4779
4855
|
<rule break="no">
|
4780
|
-
<beforebreak>[^-\p{L}'’/]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]
|
4856
|
+
<beforebreak>[^-\p{L}'’/]\p{L}[\.!?…]['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
|
4781
4857
|
<afterbreak></afterbreak>
|
4782
4858
|
</rule>
|
4783
4859
|
<rule break="no">
|
@@ -4801,7 +4877,7 @@
|
|
4801
4877
|
<afterbreak>3|Buzz|Crozz</afterbreak>
|
4802
4878
|
</rule>
|
4803
4879
|
<rule break="no">
|
4804
|
-
<beforebreak>[1-3]
|
4880
|
+
<beforebreak>[1-3]\.[\u00A0\s]</beforebreak>
|
4805
4881
|
<afterbreak>Liga|Bundesliga|Fußball(-B|b)undesliga</afterbreak>
|
4806
4882
|
</rule>
|
4807
4883
|
<rule break="no">
|
@@ -4816,126 +4892,126 @@
|
|
4816
4892
|
<!-- Don't split after a white-space followed by a single letter followed
|
4817
4893
|
by a dot followed by another whitespace. e.g. " p. " -->
|
4818
4894
|
<rule break="no">
|
4819
|
-
<beforebreak
|
4895
|
+
<beforebreak>[\u00A0\s]\p{L}\.[\u00A0\s]</beforebreak>
|
4820
4896
|
<afterbreak>\p{L}\.</afterbreak>
|
4821
4897
|
</rule>
|
4822
4898
|
<!-- Don't split at "bla bla... yada yada" -->
|
4823
4899
|
<rule break="no">
|
4824
|
-
<beforebreak>[\[\(]?\.\.\.[\]\)]
|
4900
|
+
<beforebreak>[\[\(]?\.\.\.[\]\)]?[\u00A0\s]</beforebreak>
|
4825
4901
|
<afterbreak>\p{Ll}</afterbreak>
|
4826
4902
|
</rule>
|
4827
4903
|
<!-- Don't split [.?!] when they're quoted -->
|
4828
4904
|
<rule break="no">
|
4829
|
-
<beforebreak>['"„][\.!?…]['"“]\s</beforebreak>
|
4905
|
+
<beforebreak>['"„][\.!?…]['"“][\u00A0\s]</beforebreak>
|
4830
4906
|
<afterbreak></afterbreak>
|
4831
4907
|
</rule>
|
4832
4908
|
<!-- Don't break after quote unless there's a capital letter
|
4833
4909
|
e.g.: "That's right!" he said. -->
|
4834
4910
|
<rule break="no">
|
4835
|
-
<beforebreak>["'“]\s</beforebreak>
|
4911
|
+
<beforebreak>["'“][\u00A0\s]</beforebreak>
|
4836
4912
|
<afterbreak>\p{Ll}</afterbreak>
|
4837
4913
|
</rule>
|
4838
4914
|
<!-- e.g. "Das ist . so." - assume one sentence. -->
|
4839
4915
|
<rule break="no">
|
4840
|
-
<beforebreak
|
4916
|
+
<beforebreak>[\u00A0\s]([\.!?]{1,3}|…)['|"|“|«|\)|\]|\}]?[\u00A0\s]</beforebreak>
|
4841
4917
|
<afterbreak></afterbreak>
|
4842
4918
|
</rule>
|
4843
4919
|
<!-- Numbers, dates e.g. "3.10. datiert" -->
|
4844
4920
|
<rule break="no">
|
4845
|
-
<beforebreak>\b\d
|
4921
|
+
<beforebreak>\b\d+\.[\u00A0\s]</beforebreak>
|
4846
4922
|
<afterbreak>\p{Ll}|\p{Lu}{2,}</afterbreak>
|
4847
4923
|
</rule>
|
4848
4924
|
<!-- z.B. "Das hier ist ein(!) Satz." -->
|
4849
4925
|
<rule break="no">
|
4850
|
-
<beforebreak>[\(\[][!?]{1,3}[\]\)]\s</beforebreak>
|
4926
|
+
<beforebreak>[\(\[][!?]{1,3}[\]\)][\u00A0\s]</beforebreak>
|
4851
4927
|
<afterbreak></afterbreak>
|
4852
4928
|
</rule>
|
4853
4929
|
<!-- z.B. "Das hier ist (genau!) ein Satz." -->
|
4854
4930
|
<rule break="no">
|
4855
|
-
<beforebreak>[!?]{1,3}[\)\]]\s</beforebreak>
|
4931
|
+
<beforebreak>[!?]{1,3}[\)\]][\u00A0\s]</beforebreak>
|
4856
4932
|
<afterbreak></afterbreak>
|
4857
4933
|
</rule>
|
4858
4934
|
<!-- z.B. "bla (...) blubb" -> kein Satzende -->
|
4859
4935
|
<rule break="no">
|
4860
|
-
<beforebreak>[\(\)\[\]]\s</beforebreak>
|
4936
|
+
<beforebreak>[\(\)\[\]][\u00A0\s]</beforebreak>
|
4861
4937
|
<afterbreak></afterbreak>
|
4862
4938
|
</rule>
|
4863
4939
|
<!-- don't split at cases like "Friedrich II. wird auch..." -->
|
4864
4940
|
<rule break="no">
|
4865
|
-
<beforebreak>[\s
|
4941
|
+
<beforebreak>[\u00A0\s ][IVX]+\.[\u00A0\s]</beforebreak>
|
4866
4942
|
<afterbreak>[^\p{Lu}]+</afterbreak>
|
4867
4943
|
</rule>
|
4868
4944
|
<!-- don't split at cases like "im 13. oder 14. Jahrhundert" -->
|
4869
4945
|
<rule break="no">
|
4870
|
-
<beforebreak>\d
|
4871
|
-
<afterbreak>(und|oder|bis)\s</afterbreak>
|
4946
|
+
<beforebreak>\d+\.[\u00A0\s]</beforebreak>
|
4947
|
+
<afterbreak>(und|oder|bis)[\u00A0\s]</afterbreak>
|
4872
4948
|
</rule>
|
4873
4949
|
<!-- einige deutsche Monate, vor denen eine Zahl erscheinen kann,
|
4874
|
-
ohne dass eine Satzgrenze erkannt wird
|
4950
|
+
ohne dass eine Satzgrenze erkannt wird
|
4875
4951
|
(z.B. "am 13. Dezember" -> keine Satzgrenze) -->
|
4876
4952
|
<rule break="no">
|
4877
|
-
<beforebreak>\d
|
4953
|
+
<beforebreak>\d+\.[\u00A0\s]</beforebreak>
|
4878
4954
|
<afterbreak>Januar|Jänner|Februar|März|Merz|April|Mai|Ju[ln]i|August|September|Oktober|November|Dezember</afterbreak>
|
4879
4955
|
</rule>
|
4880
4956
|
<rule break="no">
|
4881
|
-
<beforebreak>\d
|
4957
|
+
<beforebreak>\d+\.[\u00A0\s]</beforebreak>
|
4882
4958
|
<afterbreak>J[aä]n|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez</afterbreak>
|
4883
4959
|
</rule>
|
4884
4960
|
<rule break="no">
|
4885
|
-
<beforebreak>(Jan|Jän|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez)
|
4961
|
+
<beforebreak>(Jan|Jän|Febr?|Mär|Apr|Mai|Ju[nl]|Aug|Sept?|Okt|Nov|Dez)\.[\u00A0\s]</beforebreak>
|
4886
4962
|
<afterbreak>\d\d(\d\d)?</afterbreak>
|
4887
4963
|
</rule>
|
4888
4964
|
<!-- ähnliche Fälle außerhalb der Monatsnamen -->
|
4889
4965
|
<rule break="no">
|
4890
|
-
<beforebreak>\d
|
4966
|
+
<beforebreak>\d+\.[\u00A0\s]</beforebreak>
|
4891
4967
|
<afterbreak>Amtsperiode|Breitengrads?|Breitengrades|Jubiläum|Jhd?|Jhdts?|Konferenz|(Jahres|Partei)(-K|k)onferenz|Längengrade?s?|Tags?|Tages|(Jahres|Spiel|Partei|Geburts)tag|(Jahres|Spiel|Partei|Geburts)tages|(Jahres|Spiel|Partei|Geburts)tags|Jahrhunderts?|Jahrtausend|Platz|Platzes|Lebensjahrs?|Lebensjahres|Lochs?|Loches|Grads|Grades|Obergeschoss|Stock(werk)?s?|Etage|Klasse|Runde|Bezirk|Etappe|Staffel|Sinfonie</afterbreak>
|
4892
4968
|
</rule>
|
4893
4969
|
<!-- English abbreviations - but these work globally for all languages -->
|
4894
4970
|
<rule break="no">
|
4895
|
-
<beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd)
|
4971
|
+
<beforebreak>\b(Mrs?|No|pp|St|no|Sr|Jr|Bros|etc|[Bb]tw|vs|esp|[Ff]ig|Jan|Feb|Mar|Apr|Ju[nl]|Aug|Sept?|O[ck]t|Nov|Dec|PhD|BSc|BEng|BComp|BArch|al|cf|Inc|Ms|MEng|MSc|MComp|Gen|Sen|Prof|Corp|Co|co|Ltd|Buchst)\.[\u00A0\s]</beforebreak>
|
4896
4972
|
<afterbreak></afterbreak>
|
4897
4973
|
</rule>
|
4898
4974
|
<!-- Latin abbreviations - but these work globally for all languages -->
|
4899
4975
|
<rule break="no">
|
4900
|
-
<beforebreak>\b(spp?)
|
4976
|
+
<beforebreak>\b(spp?)\.[\u00A0\s]</beforebreak>
|
4901
4977
|
<afterbreak></afterbreak>
|
4902
4978
|
</rule>
|
4903
4979
|
<!-- German abbreviations -->
|
4904
4980
|
<rule break="no">
|
4905
|
-
<beforebreak>\b(Mag|mtl|versch|d|Übers|usw|Bzw|bzw|Ab[hkst]|abzgl|Abzw|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|autom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw)
|
4981
|
+
<beforebreak>\b(ggü|Mag|mtl|versch|d|Übers|usw|Bzw|bzw|Ab[hkst]|abzgl|[Ee]inschl|[Vv]mtl|bezgl|Abzw|[Vv]sl|ahd|Akk|aktual|allg|alltagsspr|altdt|alttest|amerikan|Anh|Ank|Anm|Art|autom|Auftragsnr|Az|Bat|bayr|Bde?|bearb|Bed|Bem|bes|bez|Bez|Bhf|bspw|btto|bw|Dtl|Dez)\.[\u00A0\s]</beforebreak>
|
4906
4982
|
<afterbreak></afterbreak>
|
4907
4983
|
</rule>
|
4908
4984
|
<rule break="no">
|
4909
|
-
<beforebreak>\b(cts?|Ca|ca|chem|chin|Chr|cresc|dat|Dat|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|eigtl|Eigtl|Engl|engl|Erg|al|et[cw]|Etw|ev(tl)?|Evtl|exkl|Expl|Exz)
|
4985
|
+
<beforebreak>\b(cts?|Ca|ca|chem|chin|Chr|cresc|dat|Dat|desgl|ders|dgl|Dipl|Dir?|Doz?|durchg|durchges|Dr|dt|ebd|Ed|eigtl|Eigtl|eigl|Eigl|akt|Engl|engl|Erg|al|et[cw]|Etw|ev(tl)?|Evtl|Evt|evt|exkl|Expl|Exz)\.[\u00A0\s]</beforebreak>
|
4910
4986
|
<afterbreak></afterbreak>
|
4911
4987
|
</rule>
|
4912
4988
|
<rule break="no">
|
4913
|
-
<beforebreak>\bDipl\.-[A-Z][a-z]{2,4}
|
4989
|
+
<beforebreak>\bDipl\.-[A-Z][a-z]{2,4}\.[\u00A0\s]</beforebreak>
|
4914
4990
|
<afterbreak></afterbreak>
|
4915
4991
|
</rule>
|
4916
4992
|
<rule break="no">
|
4917
|
-
<beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|frz?|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)
|
4993
|
+
<beforebreak>\b(ff|Fa|fachspr|fam|fem|Fem|Fr|franz|frz?|[Aa]ltfranz|frdl|Frl|Fut|Gd|gebr?|Gebr|geh|geleg|gen|Gen|germ|gesch|ges|get|ggf|Ggf|Ggs|ggT|Gr|[Gg]rds|griech)\.[\u00A0\s]</beforebreak>
|
4918
4994
|
<afterbreak></afterbreak>
|
4919
4995
|
</rule>
|
4920
4996
|
<rule break="no">
|
4921
|
-
<beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|inkl|incl|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)
|
4997
|
+
<beforebreak>\b(hebr|hg|hl|Hrsg|Hg|hist|hochd|hochspr|Hptst|Hr|hrsg|Allg|IdNr|ill|inkl|incl|Ind|Inf|Ing|ital|Tr|jap|Jb|Jg|Jhd?|Jhdts?|jmd[mns]?|jur|Kap|kart|kath|kfm|kaufm|Kfm|kgl|Kl|Konj|königl|Krs?|Kto)\.[\u00A0\s]</beforebreak>
|
4922
4998
|
<afterbreak></afterbreak>
|
4923
4999
|
</rule>
|
4924
5000
|
<rule break="no">
|
4925
|
-
<beforebreak>\b(lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)
|
5001
|
+
<beforebreak>\b(lat|lfd|Lit|lt|Lz|Mask|mask|max|Mrd|mdal|me[dt]|phil|mhd|Mio?|mind?|Mo|mod|nachm|nördlBr|neutr|Nhd|Nom|Nrn?|Num|Obj|od|dgl|offz)\.[\u00A0\s]</beforebreak>
|
4926
5002
|
<afterbreak></afterbreak>
|
4927
5003
|
</rule>
|
4928
5004
|
<rule break="no">
|
4929
|
-
<beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|Std?|stacc|Str|stud|Subst|sva|svw|sZ)
|
5005
|
+
<beforebreak>\b(Part|Per[fs]|Pfd|Pl(ur)?|pl|Plusq|Pos|pp|Prä[ps]|Prät|Pro[vf]|rd|reg|resp|Rhld|rit|Sa|südl|Br|se[ln]|Sept|Sing|sign|So|sog|Sp|Std?|stacc|Str|stud|Subst|sva|svw|sZ)\.[\u00A0\s]</beforebreak>
|
4930
5006
|
<afterbreak></afterbreak>
|
4931
5007
|
</rule>
|
4932
5008
|
<rule break="no">
|
4933
|
-
<beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|sw|vgl|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)
|
5009
|
+
<beforebreak>\b(Tel|teilw|Temp|trans|Tsd|übertr|übl|ff|überarb|ugs|univ|unveränd|urspr|USt|UST|USt\-IdNr|sw|vgl|vll|Vll|vlt|Vlt|vllt|Vllt|Vgl|Vol|vollst|vorm|Vp|Vs|vs|wesentl|wg|Whg|Hd|Ztr|zus|Zus|zzt?|zzgl|zB|zb|Zz|Zt|zw|Min|Bzgl|bzgl|bezügl|Frhr|ggfs|insb|autom|Mw[sS]t)\.[\u00A0\s]</beforebreak>
|
4934
5010
|
<afterbreak></afterbreak>
|
4935
5011
|
</rule>
|
4936
5012
|
<!-- Break rules -->
|
4937
5013
|
<rule break="yes">
|
4938
|
-
<beforebreak>[\.!?…][\u0002|'|"|“|«|‹|\)|\]|\}¹²³]
|
5014
|
+
<beforebreak>[\.!?…][\u0002|'|"|“|«|‹|\)|\]|\}¹²³]?[\u00A0\s]+</beforebreak>
|
4939
5015
|
<afterbreak></afterbreak>
|
4940
5016
|
</rule>
|
4941
5017
|
<rule break="yes">
|
@@ -4943,7 +5019,7 @@
|
|
4943
5019
|
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
4944
5020
|
</rule>
|
4945
5021
|
<rule break="yes">
|
4946
|
-
<beforebreak
|
5022
|
+
<beforebreak>[\u00A0\s]\p{L}[\.!?…][\u00A0\s]</beforebreak>
|
4947
5023
|
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
|
4948
5024
|
</rule>
|
4949
5025
|
<!-- z.B. 2 sentences: “Liebst du mich?” “Ja!” -->
|
@@ -5159,11 +5235,19 @@
|
|
5159
5235
|
</languagerule>
|
5160
5236
|
<languagerule languagerulename="French">
|
5161
5237
|
<rule break="no">
|
5162
|
-
<beforebreak>
|
5238
|
+
<beforebreak>[\s\u00A0]</beforebreak>
|
5239
|
+
<afterbreak>[»”’"'›]</afterbreak>
|
5240
|
+
</rule>
|
5241
|
+
<rule break="yes">
|
5242
|
+
<beforebreak>[\.!?][\s\u00A0][»”’"'›][\s\u00A0]</beforebreak>
|
5243
|
+
<afterbreak>[«“‘‹"'\p{Lu}]</afterbreak>
|
5244
|
+
</rule>
|
5245
|
+
<rule break="no">
|
5246
|
+
<beforebreak>Yahoo![\s\u00A0]</beforebreak>
|
5163
5247
|
<afterbreak>\p{Ll}</afterbreak>
|
5164
5248
|
</rule>
|
5165
5249
|
<rule break="yes">
|
5166
|
-
<beforebreak>\.\[\d+\]\s</beforebreak>
|
5250
|
+
<beforebreak>\.\[\d+\][\s\u00A0]</beforebreak>
|
5167
5251
|
<afterbreak></afterbreak>
|
5168
5252
|
</rule>
|
5169
5253
|
<rule break="no"><!-- URLs without "www."-->
|
@@ -5172,19 +5256,24 @@
|
|
5172
5256
|
</rule>
|
5173
5257
|
<rule break="no"><!-- Subdomains without "www." (e.g. foo.MyDomain.com)-->
|
5174
5258
|
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
5175
|
-
<afterbreak>[A-Za-z0-9\-]+\.(com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
|
5259
|
+
<afterbreak>[A-Za-z0-9\-]+\.(fr|com|net|org|info|de|es|edu|co|eu|nl|io|cn|uk|gov|biz|ca|tk|ru|br|jp|pl)(\.|\b)</afterbreak>
|
5260
|
+
</rule>
|
5261
|
+
<rule break="no">
|
5262
|
+
<!-- gaffa.org -->
|
5263
|
+
<beforebreak>\b[A-Za-z0-9\-]+\.</beforebreak>
|
5264
|
+
<afterbreak>[A-Za-z]{2,5}(\.|\b)</afterbreak>
|
5176
5265
|
</rule>
|
5177
5266
|
<!-- French abbreviations -->
|
5178
5267
|
<rule break="no">
|
5179
|
-
<beforebreak>\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op)
|
5268
|
+
<beforebreak>\b((?iu)J\.\-C|art|app|cf|chap|e(nv|tc)|fém|fig|masc|p|sing|suiv|suppl|tél|op|ex)\.[\s\u00A0]</beforebreak>
|
5180
5269
|
<afterbreak>\p{Ll}</afterbreak>
|
5181
5270
|
</rule>
|
5182
5271
|
<rule break="no">
|
5183
|
-
<beforebreak>\b(etc)\.\)\s</beforebreak>
|
5272
|
+
<beforebreak>\b(etc)\.\)[\s\u00A0]</beforebreak>
|
5184
5273
|
<afterbreak></afterbreak>
|
5185
5274
|
</rule>
|
5186
5275
|
<rule break="no">
|
5187
|
-
<beforebreak>\b(apr|ave?|boul|Mr?|Mrs|MM
|
5276
|
+
<beforebreak>\b(apr|ave?|boul|Mr?|Mrs|MM?|Mlle)\.[\s\u00A0]</beforebreak>
|
5188
5277
|
<afterbreak></afterbreak>
|
5189
5278
|
</rule>
|
5190
5279
|
<rule break="no">
|
@@ -5200,39 +5289,39 @@
|
|
5200
5289
|
<afterbreak>\p{Ll}</afterbreak>
|
5201
5290
|
</rule>
|
5202
5291
|
<rule break="no">
|
5203
|
-
<beforebreak>["”'’]\s*</beforebreak>
|
5204
|
-
<afterbreak
|
5292
|
+
<beforebreak>["”'’][\s\u00A0]*</beforebreak>
|
5293
|
+
<afterbreak>[\s\u00A0]*\p{Ll}</afterbreak>
|
5205
5294
|
</rule>
|
5206
5295
|
<rule break="no">
|
5207
|
-
<beforebreak>['"„][\.!?…]['"”]\s</beforebreak>
|
5296
|
+
<beforebreak>['"„][\.!?…]['"”][\s\u00A0]</beforebreak>
|
5208
5297
|
<afterbreak></afterbreak>
|
5209
5298
|
</rule>
|
5210
5299
|
<rule break="no">
|
5211
|
-
<beforebreak>\b\p{L}
|
5212
|
-
<afterbreak>\p{L}
|
5300
|
+
<beforebreak>\b\p{L}\.[\s\u00A0]</beforebreak>
|
5301
|
+
<afterbreak>\p{L}\.[\s\u00A0]</afterbreak>
|
5213
5302
|
</rule>
|
5214
5303
|
<rule break="no">
|
5215
5304
|
<beforebreak>\b\p{L}\.</beforebreak>
|
5216
5305
|
<afterbreak>\p{L}\.</afterbreak>
|
5217
5306
|
</rule>
|
5218
5307
|
<rule break="no"><!-- Je suis (...) Chris. -->
|
5219
|
-
<beforebreak>(…|\.\.\.)\s?\)\s</beforebreak>
|
5308
|
+
<beforebreak>(…|\.\.\.)[\s\u00A0]?\)[\s\u00A0]</beforebreak>
|
5220
5309
|
<afterbreak>[^\p{P}]</afterbreak>
|
5221
5310
|
</rule>
|
5222
5311
|
<rule break="no"><!-- Je suis (...?) Chris. -->
|
5223
|
-
<beforebreak>(…|\.\.\.)\s?\?\)\s</beforebreak>
|
5312
|
+
<beforebreak>(…|\.\.\.)[\s\u00A0]?\?\)[\s\u00A0]</beforebreak>
|
5224
5313
|
<afterbreak>[^\p{P}]</afterbreak>
|
5225
5314
|
</rule>
|
5226
5315
|
<rule break="no"><!-- Jones v. Smith -->
|
5227
|
-
<beforebreak>\p{Lu}\p{L}
|
5316
|
+
<beforebreak>\p{Lu}\p{L}+[\s\u00A0]v\.[\s\u00A0]</beforebreak>
|
5228
5317
|
<afterbreak>\p{Lu}\p{L}+</afterbreak>
|
5229
5318
|
</rule>
|
5230
5319
|
<rule break="yes">
|
5231
|
-
<beforebreak>[^,][\s]\p{L}{2}
|
5232
|
-
<afterbreak>\p{N}+\)\s</afterbreak>
|
5320
|
+
<beforebreak>[^,][\s\u00A0]\p{L}{2}\.[\s\u00A0]</beforebreak>
|
5321
|
+
<afterbreak>\p{N}+\)[\s\u00A0]</afterbreak>
|
5233
5322
|
</rule>
|
5234
5323
|
<rule break="no">
|
5235
|
-
<beforebreak>[\.\s]\p{L}{1,2}
|
5324
|
+
<beforebreak>[\.\s\u00A0]\p{L}{1,2}\.[\s\u00A0]</beforebreak>
|
5236
5325
|
<afterbreak>[\p{N}\p{Ll}]</afterbreak>
|
5237
5326
|
</rule>
|
5238
5327
|
<rule break="no">
|
@@ -5240,31 +5329,31 @@
|
|
5240
5329
|
<afterbreak>[^\p{Lu}]</afterbreak>
|
5241
5330
|
</rule>
|
5242
5331
|
<rule break="no">
|
5243
|
-
<beforebreak>\b\p{Lu}
|
5332
|
+
<beforebreak>\b\p{Lu}\.[\s\u00A0]\p{Lu}\.[\s\u00A0]</beforebreak>
|
5244
5333
|
<afterbreak></afterbreak>
|
5245
5334
|
</rule>
|
5246
5335
|
<rule break="no">
|
5247
|
-
<beforebreak>\b\p{Lu}\.\p{Lu}
|
5336
|
+
<beforebreak>\b\p{Lu}\.\p{Lu}\.[\s\u00A0]</beforebreak>
|
5248
5337
|
<afterbreak></afterbreak>
|
5249
5338
|
</rule>
|
5250
5339
|
<rule break="no">
|
5251
|
-
<beforebreak>[^\.]\s[A-Z]
|
5340
|
+
<beforebreak>[^\.][\s\u00A0][A-Z]\.[\s\u00A0]</beforebreak>
|
5252
5341
|
<afterbreak></afterbreak>
|
5253
5342
|
</rule>
|
5254
5343
|
<rule break="no">
|
5255
|
-
<beforebreak>\b(:?Blvd|Ave|Mts?)
|
5344
|
+
<beforebreak>\b(:?Blvd|Ave|Mts?)\.[\s\u00A0]</beforebreak>
|
5256
5345
|
<afterbreak>\p{Ll}+</afterbreak>
|
5257
5346
|
</rule>
|
5258
5347
|
<rule break="no">
|
5259
|
-
<beforebreak>\b(?:Kan|Ill|M[ai]ss)
|
5348
|
+
<beforebreak>\b(?:Kan|Ill|M[ai]ss)\.[\s\u00A0]</beforebreak>
|
5260
5349
|
<afterbreak>\p{Ll}+</afterbreak>
|
5261
5350
|
</rule>
|
5262
5351
|
<rule break="no">
|
5263
|
-
<beforebreak>\(\p{Ll}
|
5352
|
+
<beforebreak>\(\p{Ll}+\.[\s\u00A0]</beforebreak>
|
5264
5353
|
<afterbreak></afterbreak>
|
5265
5354
|
</rule>
|
5266
5355
|
<rule break="no"><!-- i.e. -->
|
5267
|
-
<beforebreak>i\.e
|
5356
|
+
<beforebreak>i\.e\.[\s\u00A0]</beforebreak><!-- "i.e." is never at end of sentence -->
|
5268
5357
|
<afterbreak></afterbreak>
|
5269
5358
|
</rule>
|
5270
5359
|
<rule break="no"><!-- U.S.A (no dot at end) -->
|
@@ -5280,28 +5369,28 @@
|
|
5280
5369
|
<afterbreak>[SK]\b</afterbreak>
|
5281
5370
|
</rule>
|
5282
5371
|
<rule break="no"><!-- No. 5 -->
|
5283
|
-
<beforebreak>\b[nN]o
|
5372
|
+
<beforebreak>\b[nN]o\.[\s\u00A0]</beforebreak>
|
5284
5373
|
<afterbreak>\p{N}</afterbreak>
|
5285
5374
|
</rule>
|
5286
5375
|
<rule break="no"><!-- Ph.D. -->
|
5287
|
-
<beforebreak>\bP[Hh]
|
5376
|
+
<beforebreak>\bP[Hh]\.[\s\u00A0]?</beforebreak>
|
5288
5377
|
<afterbreak>D\.?</afterbreak>
|
5289
5378
|
</rule>
|
5290
5379
|
<rule break="no"><!-- e.g. -->
|
5291
|
-
<beforebreak>\be\.g
|
5380
|
+
<beforebreak>\be\.g\.[\s\u00A0]</beforebreak>
|
5292
5381
|
<afterbreak></afterbreak>
|
5293
5382
|
</rule>
|
5294
5383
|
<rule break="no"><!-- vs. -->
|
5295
|
-
<beforebreak>\bvs
|
5384
|
+
<beforebreak>\bvs\.[\s\u00A0]</beforebreak>
|
5296
5385
|
<afterbreak></afterbreak>
|
5297
5386
|
</rule>
|
5298
5387
|
<!--"Etc." can end the sentence, so we check for the uppercase letter after it.-->
|
5299
5388
|
<rule break="no"><!-- Etc. -->
|
5300
|
-
<beforebreak>\b[Ee]tc
|
5389
|
+
<beforebreak>\b[Ee]tc\.[\s\u00A0]</beforebreak>
|
5301
5390
|
<afterbreak>[^\p{Lu}]</afterbreak>
|
5302
5391
|
</rule>
|
5303
5392
|
<rule break="no"><!-- BTW (by the way) -->
|
5304
|
-
<beforebreak>\b([Bb]tw|BTW)
|
5393
|
+
<beforebreak>\b([Bb]tw|BTW)\.[\s\u00A0]</beforebreak>
|
5305
5394
|
<afterbreak></afterbreak>
|
5306
5395
|
</rule>
|
5307
5396
|
<rule break="no">
|
@@ -5313,64 +5402,68 @@
|
|
5313
5402
|
<afterbreak>3|Buzz|Crozz</afterbreak>
|
5314
5403
|
</rule>
|
5315
5404
|
<rule break="no"><!-- Ph.D. (see rule PH_D) -->
|
5316
|
-
<beforebreak>\bP[Hh]
|
5405
|
+
<beforebreak>\bP[Hh]\.?[\s\u00A0]?[Dd]\.[\s\u00A0]</beforebreak>
|
5317
5406
|
<afterbreak></afterbreak>
|
5318
5407
|
</rule>
|
5319
5408
|
<rule break="no"><!-- "I have a B. Eng. degree" (see rule BACHELOR_ABBR) -->
|
5320
|
-
<beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)
|
5409
|
+
<beforebreak>\b(P[hH][dD]|BSc|BEng|BComp|BArch|MSc|MEng|MComp)\.[\s\u00A0]</beforebreak>
|
5321
5410
|
<afterbreak></afterbreak>
|
5322
5411
|
</rule>
|
5323
5412
|
<rule break="no"><!-- "I have a LL.B degree." (see rule PH_D) -->
|
5324
|
-
<beforebreak>\bLL
|
5413
|
+
<beforebreak>\bLL\.[\s\u00A0]?[BM]\.[\s\u00A0]</beforebreak>
|
5325
5414
|
<afterbreak></afterbreak>
|
5326
5415
|
</rule>
|
5327
5416
|
<rule break="no"><!-- B.Eng. (Bachelor of Engineering) -->
|
5328
|
-
<beforebreak>\b[BM]
|
5417
|
+
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
5329
5418
|
<afterbreak>Eng\.?</afterbreak>
|
5330
5419
|
</rule>
|
5331
5420
|
<rule break="no"><!-- LL.B. (Bachelor of Laws) -->
|
5332
|
-
<beforebreak>\bLL
|
5421
|
+
<beforebreak>\bLL\.[\s\u00A0]?</beforebreak>
|
5333
5422
|
<afterbreak>[BM]\.?</afterbreak>
|
5334
5423
|
</rule>
|
5335
5424
|
<rule break="no"><!-- B.Sc. (Bachelor of Science) -->
|
5336
|
-
<beforebreak>\b[BM]
|
5425
|
+
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
5337
5426
|
<afterbreak>Sc\.?</afterbreak>
|
5338
5427
|
</rule>
|
5339
5428
|
<rule break="no"><!-- B.Comp. (Bachelor of Computing) -->
|
5340
|
-
<beforebreak>\b[BM]
|
5429
|
+
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
5341
5430
|
<afterbreak>Comp?\.?</afterbreak>
|
5342
5431
|
</rule>
|
5343
5432
|
<rule break="no"><!-- B.Arch. (Bachelor of Architecture) -->
|
5344
|
-
<beforebreak>\b[BM]
|
5433
|
+
<beforebreak>\b[BM]\.[\s\u00A0]?</beforebreak>
|
5345
5434
|
<afterbreak>Arch\.?</afterbreak>
|
5346
5435
|
</rule>
|
5347
5436
|
<rule break="no">
|
5348
|
-
<beforebreak>\b[BM]
|
5437
|
+
<beforebreak>\b[BM]\.?[\s\u00A0]?(Sc|Eng|Comp|Arch)\.[\s\u00A0]</beforebreak>
|
5349
5438
|
<afterbreak></afterbreak>
|
5350
5439
|
</rule>
|
5351
5440
|
<rule break="no">
|
5352
|
-
<beforebreak>\bI(nc|NC)
|
5441
|
+
<beforebreak>\bI(nc|NC)\.[\s\u00A0]</beforebreak>
|
5353
5442
|
<afterbreak></afterbreak>
|
5354
5443
|
</rule>
|
5355
5444
|
<rule break="no">
|
5356
|
-
<beforebreak>\bCorp
|
5445
|
+
<beforebreak>\bCorp\.[\s\u00A0]</beforebreak>
|
5357
5446
|
<afterbreak></afterbreak>
|
5358
5447
|
</rule>
|
5359
5448
|
<rule break="no">
|
5360
|
-
<beforebreak>\bBros
|
5449
|
+
<beforebreak>\bBros\.[\s\u00A0]</beforebreak>
|
5361
5450
|
<afterbreak></afterbreak>
|
5362
5451
|
</rule>
|
5363
5452
|
<rule break="no">
|
5364
|
-
<beforebreak>\bLtd
|
5453
|
+
<beforebreak>\bLtd\.[\s\u00A0]</beforebreak>
|
5365
5454
|
<afterbreak>\p{Ll}+</afterbreak>
|
5366
5455
|
</rule>
|
5367
5456
|
<rule break="no">
|
5368
|
-
<beforebreak>\bCo
|
5457
|
+
<beforebreak>\bCo\.[\s\u00A0]</beforebreak>
|
5369
5458
|
<afterbreak></afterbreak>
|
5370
5459
|
</rule>
|
5460
|
+
<rule break="no">
|
5461
|
+
<beforebreak>\bE\.[\s\u00A0]</beforebreak>
|
5462
|
+
<afterbreak>\b[Cc]oli\b</afterbreak>
|
5463
|
+
</rule>
|
5371
5464
|
<!-- Break rules -->
|
5372
5465
|
<rule break="yes">
|
5373
|
-
<beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}¹²³]
|
5466
|
+
<beforebreak>[\.!?…][\u0002|'|"|«|\)|\]|\}¹²³]?[\s\u00A0]+</beforebreak>
|
5374
5467
|
<afterbreak></afterbreak>
|
5375
5468
|
</rule>
|
5376
5469
|
<rule break="yes">
|
@@ -5378,7 +5471,7 @@
|
|
5378
5471
|
<afterbreak>\p{Lu}[^\p{Lu}]</afterbreak>
|
5379
5472
|
</rule>
|
5380
5473
|
<rule break="yes">
|
5381
|
-
<beforebreak
|
5474
|
+
<beforebreak>[\s\u00A0]\p{L}[\.!?…][\s\u00A0]</beforebreak>
|
5382
5475
|
<afterbreak>\p{Lu}\p{Ll}</afterbreak>
|
5383
5476
|
</rule>
|
5384
5477
|
</languagerule>
|
@@ -5440,7 +5533,7 @@
|
|
5440
5533
|
<!-- І. Коваль -->
|
5441
5534
|
<rule break="no">
|
5442
5535
|
<beforebreak>[\h\v.]([А-ЯІЇЄҐACEIHOPX]\.-)?(?<!°)[А-ЯІЇЄҐABCEIHOPX](?<!(Куан[\h]+Ю|(Петр|Олександр)([аоу]|ові|ом)?[\h]+[IІ]+))\.[\h\v]*</beforebreak>
|
5443
|
-
<afterbreak>
|
5536
|
+
<afterbreak>[А-ЯІЇЄҐ][а-яіїєґА-ЯІЇЄҐ'’ʼ]{3}</afterbreak>
|
5444
5537
|
</rule>
|
5445
5538
|
<!-- Ів. Франко (але Ів Бутільє) -->
|
5446
5539
|
<rule break="no">
|
@@ -5526,10 +5619,14 @@
|
|
5526
5619
|
</rule>
|
5527
5620
|
<!-- abbreviation with proper noun: проф. Грицько, о. Лісове -->
|
5528
5621
|
<rule break="no">
|
5529
|
-
<beforebreak>\b([Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]
|
5622
|
+
<beforebreak>\b([Аа]кад|[Пп]роф|[Дд]оц|[Аа]сист|[Рр]еж|[Аа]рх|[Сс]вв?|о|ім|упоряд|чл\.-кор|[Пп]реп)\.[\h\v]*</beforebreak>
|
5530
5623
|
<afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
|
5531
5624
|
</rule>
|
5532
|
-
|
5625
|
+
<rule break="no">
|
5626
|
+
<beforebreak>\bМан\.[\h\v]*</beforebreak>
|
5627
|
+
<afterbreak>[\h\v]*([Сс]іті|[Юю]н)</afterbreak>
|
5628
|
+
</rule>
|
5629
|
+
<!-- смерть гр. Болтаровича, but not "9 гр." -->
|
5533
5630
|
<rule break="no">
|
5534
5631
|
<beforebreak>[^0-9][\h\v]+[Гг]р\.[\h\v]*</beforebreak>
|
5535
5632
|
<afterbreak>[\h\v]*[А-ЯІЇЄҐA-Z]</afterbreak>
|
@@ -5537,7 +5634,7 @@
|
|
5537
5634
|
<!-- арт. - артикул -->
|
5538
5635
|
<!-- TODO: арт. - артист -->
|
5539
5636
|
<rule break="no">
|
5540
|
-
<beforebreak>\b
|
5637
|
+
<beforebreak>\b([Аа]рт|[Мм]ал|[Рр]ис)\.[\h\v]*</beforebreak>
|
5541
5638
|
<afterbreak>[\h\v]*[0-9]</afterbreak>
|
5542
5639
|
</rule>
|
5543
5640
|
<!-- ХІІ р., 3-6 арт. -->
|
@@ -5802,7 +5899,7 @@
|
|
5802
5899
|
<afterbreak>['"«¡¿\p{Ps}\p{Pi}]?\p{Lu}\p{Ll}*</afterbreak>
|
5803
5900
|
</rule>
|
5804
5901
|
</languagerule>
|
5805
|
-
<languagerule languagerulename="
|
5902
|
+
<languagerule languagerulename="Ideographic">
|
5806
5903
|
<rule break="no">
|
5807
5904
|
<beforebreak>[:]+[\p{Pe}\p{Pf}\p{Po}"-[\u002C\u003A\u003B\u055D\u060C\u061B\u0703\u0704\u0705\u0706\u0707\u0708\u0709\u07F8\u1363\u1364\u1365\u1366\u1802\u1804\u1808\u204F\u205D\u3001\uA60D\uFE10\uFE11\uFE13\uFE14\uFE50\uFE51\uFE54\uFE55\uFF0C\uFF1A\uFF1B\uFF64]]*</beforebreak>
|
5808
5905
|
<afterbreak>\s+\P{Lu}</afterbreak>
|
@@ -5870,7 +5967,7 @@
|
|
5870
5967
|
</rule>
|
5871
5968
|
<!-- Abbreviations that cannot finish sentences-->
|
5872
5969
|
<rule break="no">
|
5873
|
-
<beforebreak>\b(a|Ab|abrev|absol|acad|Açor|A\. ?D|add|adj|adv|advers|Aeron|afér|Agric|Álg|aprox|
|
5970
|
+
<beforebreak>\b(a|Ab|abrev|absol|acad|Açor|A\. ?D|add|adj|adv|advers|Aeron|afér|Agric|Álg|aprox|[Aa]rts?|Artilh|auxil|av|Av)\.\s?</beforebreak>
|
5874
5971
|
<afterbreak></afterbreak>
|
5875
5972
|
</rule>
|
5876
5973
|
<rule break="no">
|
@@ -6279,7 +6376,7 @@
|
|
6279
6376
|
<afterbreak></afterbreak>
|
6280
6377
|
</rule>
|
6281
6378
|
<rule break="no">
|
6282
|
-
<beforebreak>[^\.]\s[
|
6379
|
+
<beforebreak>[^\.]\s[ضصثقفغعهخحجچشسیبلاتنمکگ\ظطزرذدپوًٌٍَُِّْA-Z]\.\s</beforebreak>
|
6283
6380
|
<afterbreak></afterbreak>
|
6284
6381
|
</rule>
|
6285
6382
|
<rule break="no">
|
@@ -6359,7 +6456,7 @@
|
|
6359
6456
|
</rule>
|
6360
6457
|
<!--Не раздвајај у случају као на пр.: "Петар I дошао је ..."-->
|
6361
6458
|
<rule break="no">
|
6362
|
-
<beforebreak>[\s
|
6459
|
+
<beforebreak>[\s ][IVX]+\s</beforebreak>
|
6363
6460
|
<afterbreak>[^\p{Lu}]+</afterbreak>
|
6364
6461
|
</rule>
|
6365
6462
|
<!--Не раздвајај у случају као "од 13. до 14. века"-->
|
@@ -6598,10 +6695,8 @@
|
|
6598
6695
|
<languagemap languagepattern="(UK|uk).*" languagerulename="Ukrainian"></languagemap>
|
6599
6696
|
<languagemap languagepattern="(BE|be).*" languagerulename="Belarusian"></languagemap>
|
6600
6697
|
<languagemap languagepattern="(GL|gl).*" languagerulename="Galician"></languagemap>
|
6601
|
-
<languagemap languagepattern="(JA|ja).*" languagerulename="
|
6602
|
-
|
6603
|
-
minimize diff -->
|
6604
|
-
<languagemap languagepattern="(ZH|zh).*" languagerulename="Japanese"></languagemap>
|
6698
|
+
<languagemap languagepattern="(JA|ja).*" languagerulename="Ideographic"></languagemap>
|
6699
|
+
<languagemap languagepattern="(ZH|zh).*" languagerulename="Ideographic"></languagemap>
|
6605
6700
|
<languagemap languagepattern="(BR|br).*" languagerulename="Breton"></languagemap>
|
6606
6701
|
<languagemap languagepattern="(PT|pt).*" languagerulename="Portuguese"></languagemap>
|
6607
6702
|
<languagemap languagepattern="(IT|it).*" languagerulename="Italian"></languagemap>
|