rita 3.1.2 → 3.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/rita.cjs +46 -44
- package/dist/rita.cjs.map +1 -1
- package/dist/rita.js +46 -44
- package/dist/rita.js.map +1 -1
- package/dist/rita.min.js +72 -72
- package/dist/rita.min.js.map +1 -1
- package/package.json +1 -1
package/dist/rita.cjs
CHANGED
|
@@ -874,6 +874,7 @@ var Tokenizer = class {
|
|
|
874
874
|
}
|
|
875
875
|
tokenize(input, opts = {
|
|
876
876
|
// regex: null,
|
|
877
|
+
// debug: false,
|
|
877
878
|
// splitHyphens: false,
|
|
878
879
|
// splitContractions: false
|
|
879
880
|
}) {
|
|
@@ -883,7 +884,11 @@ var Tokenizer = class {
|
|
|
883
884
|
return input.split(opts.regex);
|
|
884
885
|
let { tags, text } = this.pushTags(input.trim());
|
|
885
886
|
for (let i = 0; i < TOKENIZE_RE.length; i += 2) {
|
|
887
|
+
if (opts.debug)
|
|
888
|
+
var pre = text;
|
|
886
889
|
text = text.replace(TOKENIZE_RE[i], TOKENIZE_RE[i + 1]);
|
|
890
|
+
if (opts.debug && text !== pre)
|
|
891
|
+
console.log("HIT" + i, pre + " -> " + text, TOKENIZE_RE[i], TOKENIZE_RE[i + 1]);
|
|
887
892
|
}
|
|
888
893
|
if (opts.splitHyphens) {
|
|
889
894
|
text = text.replace(/([a-zA-Z]+)-([a-zA-Z]+)/g, "$1 - $2");
|
|
@@ -1105,85 +1110,85 @@ var APOS_RE = /^[\u2019']+$/;
|
|
|
1105
1110
|
var NL_RE = /(\r?\n)+/g;
|
|
1106
1111
|
var WWW_RE = /^(www[0-9]?|WWW[0-9]?)$/;
|
|
1107
1112
|
var NOSP_BF_PUNCT_RE = /^[,\.\;\:\?\!\)""\u201c\u201d\u2019\u2018`'%\u2026\u2103\^\*\u00b0\/\u2044\u2012\u2013\u2014\-@]+$/;
|
|
1108
|
-
var LINEBREAK_RE =
|
|
1113
|
+
var LINEBREAK_RE = /\r?\n/;
|
|
1109
1114
|
var URL_RE = /((http[s]?):(\/\/))?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b)([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)/;
|
|
1110
1115
|
var EMAIL_RE = /^\w+([\.-]?\w+)*@\w+([\.-]?\w+)*(\.\w{2,3})+$/;
|
|
1111
1116
|
var TOKENIZE_RE = [
|
|
1112
1117
|
// save --------
|
|
1113
|
-
|
|
1118
|
+
/\b([Ee])[.]([Gg])[.]/g,
|
|
1114
1119
|
"_$1$2_",
|
|
1115
1120
|
//E.g
|
|
1116
|
-
|
|
1121
|
+
/\b([Ii])[.]([Ee])[.]/g,
|
|
1117
1122
|
"_$1$2_",
|
|
1118
1123
|
//i.e
|
|
1119
|
-
|
|
1124
|
+
/\b([Aa])[.]([Mm])[.]/g,
|
|
1120
1125
|
"_$1$2_",
|
|
1121
1126
|
//a.m.
|
|
1122
|
-
|
|
1127
|
+
/\b([Pp])[.]([Mm])[.]/g,
|
|
1123
1128
|
"_$1$2_",
|
|
1124
1129
|
//p.m.
|
|
1125
|
-
|
|
1130
|
+
/\b(Cap)[\.]/g,
|
|
1126
1131
|
"_Cap_",
|
|
1127
1132
|
//Cap.
|
|
1128
|
-
|
|
1133
|
+
/\b([Cc])[\.]/g,
|
|
1129
1134
|
"_$1_",
|
|
1130
1135
|
//c.
|
|
1131
|
-
|
|
1136
|
+
/\b([Ee][Tt])[\s]([Aa][Ll])[\.]/,
|
|
1132
1137
|
"_$1zzz$2_",
|
|
1133
1138
|
// et al.
|
|
1134
|
-
|
|
1139
|
+
/\b(etc|ETC)[\.]/g,
|
|
1135
1140
|
"_$1_",
|
|
1136
1141
|
//etc.
|
|
1137
|
-
|
|
1142
|
+
/\b([Pp])[\.]([Ss])[\.]/g,
|
|
1138
1143
|
"_$1$2dot_",
|
|
1139
1144
|
// p.s.
|
|
1140
|
-
|
|
1145
|
+
/\b([Pp])[\.]([Ss])/g,
|
|
1141
1146
|
"_$1$2_",
|
|
1142
1147
|
// p.s
|
|
1143
|
-
|
|
1148
|
+
/\b([Pp])([Hh])[\.]([Dd])/g,
|
|
1144
1149
|
"_$1$2$3_",
|
|
1145
1150
|
// Ph.D
|
|
1146
|
-
|
|
1151
|
+
/\b([Rr])[\.]([Ii])[\.]([Pp])/g,
|
|
1147
1152
|
"_$1$2$3_",
|
|
1148
1153
|
// R.I.P
|
|
1149
|
-
|
|
1154
|
+
/\b([Vv])([Ss]?)[\.]/g,
|
|
1150
1155
|
"_$1$2_",
|
|
1151
1156
|
// vs. and v.
|
|
1152
|
-
|
|
1157
|
+
/\b([Mm])([Rr]|[Ss]|[Xx])\./g,
|
|
1153
1158
|
"_$1$2_",
|
|
1154
1159
|
// Mr. Ms. and Mx.
|
|
1155
|
-
|
|
1160
|
+
/\b([Dd])([Rr])[\.]/g,
|
|
1156
1161
|
"_$1$2_",
|
|
1157
1162
|
// Dr.
|
|
1158
|
-
|
|
1163
|
+
/\b([Pp])([Ff])[\.]/g,
|
|
1159
1164
|
"_$1$2_",
|
|
1160
1165
|
// Pf.
|
|
1161
|
-
|
|
1166
|
+
/\b([Ii])([Nn])([Dd]|[Cc])[\.]/g,
|
|
1162
1167
|
"_$1$2$3_",
|
|
1163
1168
|
// Ind. and Inc.
|
|
1164
|
-
|
|
1169
|
+
/\b([Cc])([Oo])[\.][\,][\s]([Ll])([Tt])([Dd])[\.]/g,
|
|
1165
1170
|
"_$1$2dcs$3$4$5_",
|
|
1166
1171
|
// co., ltd.
|
|
1167
|
-
|
|
1172
|
+
/\b([Cc])([Oo])[\.][\s]([Ll])([Tt])([Dd])[\.]/g,
|
|
1168
1173
|
"_$1$2ds$3$4$5_",
|
|
1169
1174
|
// co. ltd.
|
|
1170
|
-
|
|
1175
|
+
/\b([Cc])([Oo])[\.][\,]([Ll])([Tt])([Dd])[\.]/g,
|
|
1171
1176
|
"_$1$2dc$3$4$5_",
|
|
1172
1177
|
// co.,ltd.
|
|
1173
|
-
|
|
1178
|
+
/\b([Cc])([Oo])([Rr]?)([Pp]?)[\.]/g,
|
|
1174
1179
|
"_$1$2$3$4_",
|
|
1175
1180
|
// Corp. and Co.
|
|
1176
|
-
|
|
1181
|
+
/\b([Ll])([Tt])([Dd])[\.]/g,
|
|
1177
1182
|
"_$1$2$3_",
|
|
1178
1183
|
// ltd.
|
|
1179
|
-
|
|
1184
|
+
/\b(prof|Prof|PROF)[\.]/g,
|
|
1180
1185
|
"_$1_",
|
|
1181
1186
|
//Prof.
|
|
1182
1187
|
// /(\w+([\.-_]?\w+)*)@(\w+([\.-_]?\w+)*)\.(\w{2,3})/g, "$1__AT__$3.$5", //email addresses
|
|
1183
1188
|
// /^\w+([\.-]?\w+)+@\w+([\.:]?\w+)+(\.[a-zA-Z0-9]{2,3})+$/g, "$1__AT__$2", //email addresses
|
|
1184
|
-
|
|
1189
|
+
/\b([\w.]+)@(\w+\.\w+)/g,
|
|
1185
1190
|
"$1__AT__$2",
|
|
1186
|
-
|
|
1191
|
+
/\b((http[s]?):(\/\/))([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b)([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)/g,
|
|
1187
1192
|
"$2COLON$3$4$5",
|
|
1188
1193
|
//urls with http(s)
|
|
1189
1194
|
//decimal #
|
|
@@ -1212,12 +1217,10 @@ var TOKENIZE_RE = [
|
|
|
1212
1217
|
/\r/g,
|
|
1213
1218
|
" _CARRIAGERETURN_ ",
|
|
1214
1219
|
// CR
|
|
1215
|
-
|
|
1216
|
-
" _RECORDSEPARATOR_ ",
|
|
1217
|
-
// RS
|
|
1220
|
+
///\036/g, " _RECORDSEPARATOR_ ", // RS
|
|
1218
1221
|
//--------------------------
|
|
1219
1222
|
/\.\.\.\s/g,
|
|
1220
|
-
"
|
|
1223
|
+
"_elipsis_ ",
|
|
1221
1224
|
/([\?!\"\u201C\.,;:@#$%&])/g,
|
|
1222
1225
|
" $1 ",
|
|
1223
1226
|
/\u2026/g,
|
|
@@ -1259,7 +1262,7 @@ var TOKENIZE_RE = [
|
|
|
1259
1262
|
" ^ ",
|
|
1260
1263
|
/\u00b0/g,
|
|
1261
1264
|
" \xB0 ",
|
|
1262
|
-
/
|
|
1265
|
+
/_elipsis_/g,
|
|
1263
1266
|
" ... ",
|
|
1264
1267
|
//pop ------------------
|
|
1265
1268
|
/_([Ee])([Gg])_/g,
|
|
@@ -1356,30 +1359,29 @@ var TOKENIZE_RE = [
|
|
|
1356
1359
|
"\r\n",
|
|
1357
1360
|
// CR LF
|
|
1358
1361
|
/_LINEFEEDCARRIAGERETURN_/g,
|
|
1359
|
-
"\n\r"
|
|
1362
|
+
"\n\r"
|
|
1360
1363
|
// LF CR
|
|
1361
|
-
|
|
1362
|
-
"\\036"
|
|
1363
|
-
// RS
|
|
1364
|
+
///_RECORDSEPARATOR_/g, "\\036", // RS
|
|
1364
1365
|
];
|
|
1365
1366
|
var CONTRACTS_RE = [
|
|
1366
1367
|
// TODO: 'She'd have wanted' -> 'She would have wanted'
|
|
1367
|
-
|
|
1368
|
+
// WORKING HERE: add word boundaries \b to these
|
|
1369
|
+
/\b([Cc])an['\u2019]t/g,
|
|
1368
1370
|
"$1an not",
|
|
1369
|
-
|
|
1371
|
+
/\b([Dd])idn['\u2019]t/g,
|
|
1370
1372
|
"$1id not",
|
|
1371
|
-
|
|
1373
|
+
/\b([CcWw])ouldn['\u2019]t/g,
|
|
1372
1374
|
"$1ould not",
|
|
1373
|
-
|
|
1375
|
+
/\b([Ss])houldn['\u2019]t/g,
|
|
1374
1376
|
"$1hould not",
|
|
1375
|
-
|
|
1377
|
+
/\b([Ii])t['\u2019]s/g,
|
|
1376
1378
|
"$1t is",
|
|
1377
|
-
|
|
1379
|
+
/\b([tT]hat)['\u2019]s/g,
|
|
1378
1380
|
"$1 is",
|
|
1379
|
-
|
|
1381
|
+
/\b(she|he|you|they|i)['\u2019]d/gi,
|
|
1380
1382
|
"$1 had",
|
|
1381
1383
|
// changed from would, 12/8/23
|
|
1382
|
-
|
|
1384
|
+
/\b(she|he|you|they|i)['\u2019]ll/gi,
|
|
1383
1385
|
"$1 will",
|
|
1384
1386
|
/n['\u2019]t /g,
|
|
1385
1387
|
" not ",
|
|
@@ -43237,7 +43239,7 @@ markov_default.parent = RiTa;
|
|
|
43237
43239
|
stemmer_default.tokenizer = RiTa.tokenizer;
|
|
43238
43240
|
RiTa.SILENT = false;
|
|
43239
43241
|
RiTa.SILENCE_LTS = false;
|
|
43240
|
-
RiTa.VERSION = "3.1.
|
|
43242
|
+
RiTa.VERSION = "3.1.3";
|
|
43241
43243
|
RiTa.FIRST = 1;
|
|
43242
43244
|
RiTa.SECOND = 2;
|
|
43243
43245
|
RiTa.THIRD = 3;
|