rita 3.1.1 → 3.1.3

This diff shows the changes between two publicly released versions of this package, exactly as they were published to a supported public registry. It is provided for informational purposes only.
package/README.md CHANGED
@@ -1,3 +1,6 @@
1
+ ![ritajs](pen.jpg)
2
+
3
+
1
4
  <a href="https://github.com/dhowe/rjs3/actions"><img src="https://github.com/dhowe/rjs3/actions/workflows/node.js.yml/badge.svg" alt="ci tests"></a> <a href="https://www.npmjs.com/package/rita"><img src="https://img.shields.io/npm/v/rita.svg" alt="npm version"></a> <a href="https://www.gnu.org/licenses/gpl-3.0.en.html"><img src="https://img.shields.io/badge/license-GPL-orange.svg" alt="license"></a>
2
5
 
3
6
 
@@ -22,6 +25,7 @@ Note: version 3.0 contains breaking changes -- please check the [release notes](
22
25
  * For [node](#with-nodejs-and-npm): `$ npm install rita`
23
26
  ```let { RiTa } = require('rita');```
24
27
  * For [browsers](#a-simple-browser-sketch): ```<script src="https://unpkg.com/rita"></script>```
28
+ * For [p5.js](#with-p5js): ```<script src="https://unpkg.com/rita"></script>```
25
29
  * For [developers](#developing)
26
30
 
27
31
 
package/dist/rita.cjs CHANGED
@@ -874,6 +874,7 @@ var Tokenizer = class {
874
874
  }
875
875
  tokenize(input, opts = {
876
876
  // regex: null,
877
+ // debug: false,
877
878
  // splitHyphens: false,
878
879
  // splitContractions: false
879
880
  }) {
@@ -883,7 +884,11 @@ var Tokenizer = class {
883
884
  return input.split(opts.regex);
884
885
  let { tags, text } = this.pushTags(input.trim());
885
886
  for (let i = 0; i < TOKENIZE_RE.length; i += 2) {
887
+ if (opts.debug)
888
+ var pre = text;
886
889
  text = text.replace(TOKENIZE_RE[i], TOKENIZE_RE[i + 1]);
890
+ if (opts.debug && text !== pre)
891
+ console.log("HIT" + i, pre + " -> " + text, TOKENIZE_RE[i], TOKENIZE_RE[i + 1]);
887
892
  }
888
893
  if (opts.splitHyphens) {
889
894
  text = text.replace(/([a-zA-Z]+)-([a-zA-Z]+)/g, "$1 - $2");
@@ -1105,85 +1110,85 @@ var APOS_RE = /^[\u2019']+$/;
1105
1110
  var NL_RE = /(\r?\n)+/g;
1106
1111
  var WWW_RE = /^(www[0-9]?|WWW[0-9]?)$/;
1107
1112
  var NOSP_BF_PUNCT_RE = /^[,\.\;\:\?\!\)""\u201c\u201d\u2019\u2018`'%\u2026\u2103\^\*\u00b0\/\u2044\u2012\u2013\u2014\-@]+$/;
1108
- var LINEBREAK_RE = /[\n\r\036]/;
1113
+ var LINEBREAK_RE = /\r?\n/;
1109
1114
  var URL_RE = /((http[s]?):(\/\/))?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b)([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)/;
1110
1115
  var EMAIL_RE = /^\w+([\.-]?\w+)*@\w+([\.-]?\w+)*(\.\w{2,3})+$/;
1111
1116
  var TOKENIZE_RE = [
1112
1117
  // save --------
1113
- /([Ee])[.]([Gg])[.]/g,
1118
+ /\b([Ee])[.]([Gg])[.]/g,
1114
1119
  "_$1$2_",
1115
1120
  //E.g
1116
- /([Ii])[.]([Ee])[.]/g,
1121
+ /\b([Ii])[.]([Ee])[.]/g,
1117
1122
  "_$1$2_",
1118
1123
  //i.e
1119
- /([Aa])[.]([Mm])[.]/g,
1124
+ /\b([Aa])[.]([Mm])[.]/g,
1120
1125
  "_$1$2_",
1121
1126
  //a.m.
1122
- /([Pp])[.]([Mm])[.]/g,
1127
+ /\b([Pp])[.]([Mm])[.]/g,
1123
1128
  "_$1$2_",
1124
1129
  //p.m.
1125
- /(Cap)[\.]/g,
1130
+ /\b(Cap)[\.]/g,
1126
1131
  "_Cap_",
1127
1132
  //Cap.
1128
- /([Cc])[\.]/g,
1133
+ /\b([Cc])[\.]/g,
1129
1134
  "_$1_",
1130
1135
  //c.
1131
- /([Ee][Tt])[\s]([Aa][Ll])[\.]/,
1136
+ /\b([Ee][Tt])[\s]([Aa][Ll])[\.]/,
1132
1137
  "_$1zzz$2_",
1133
1138
  // et al.
1134
- /(etc|ETC)[\.]/g,
1139
+ /\b(etc|ETC)[\.]/g,
1135
1140
  "_$1_",
1136
1141
  //etc.
1137
- /([Pp])[\.]([Ss])[\.]/g,
1142
+ /\b([Pp])[\.]([Ss])[\.]/g,
1138
1143
  "_$1$2dot_",
1139
1144
  // p.s.
1140
- /([Pp])[\.]([Ss])/g,
1145
+ /\b([Pp])[\.]([Ss])/g,
1141
1146
  "_$1$2_",
1142
1147
  // p.s
1143
- /([Pp])([Hh])[\.]([Dd])/g,
1148
+ /\b([Pp])([Hh])[\.]([Dd])/g,
1144
1149
  "_$1$2$3_",
1145
1150
  // Ph.D
1146
- /([Rr])[\.]([Ii])[\.]([Pp])/g,
1151
+ /\b([Rr])[\.]([Ii])[\.]([Pp])/g,
1147
1152
  "_$1$2$3_",
1148
1153
  // R.I.P
1149
- /([Vv])([Ss]?)[\.]/g,
1154
+ /\b([Vv])([Ss]?)[\.]/g,
1150
1155
  "_$1$2_",
1151
1156
  // vs. and v.
1152
- /([Mm])([Rr]|[Ss]|[Xx])[\.]/g,
1157
+ /\b([Mm])([Rr]|[Ss]|[Xx])\./g,
1153
1158
  "_$1$2_",
1154
1159
  // Mr. Ms. and Mx.
1155
- /([Dd])([Rr])[\.]/g,
1160
+ /\b([Dd])([Rr])[\.]/g,
1156
1161
  "_$1$2_",
1157
1162
  // Dr.
1158
- /([Pp])([Ff])[\.]/g,
1163
+ /\b([Pp])([Ff])[\.]/g,
1159
1164
  "_$1$2_",
1160
1165
  // Pf.
1161
- /([Ii])([Nn])([Dd]|[Cc])[\.]/g,
1166
+ /\b([Ii])([Nn])([Dd]|[Cc])[\.]/g,
1162
1167
  "_$1$2$3_",
1163
1168
  // Ind. and Inc.
1164
- /([Cc])([Oo])[\.][\,][\s]([Ll])([Tt])([Dd])[\.]/g,
1169
+ /\b([Cc])([Oo])[\.][\,][\s]([Ll])([Tt])([Dd])[\.]/g,
1165
1170
  "_$1$2dcs$3$4$5_",
1166
1171
  // co., ltd.
1167
- /([Cc])([Oo])[\.][\s]([Ll])([Tt])([Dd])[\.]/g,
1172
+ /\b([Cc])([Oo])[\.][\s]([Ll])([Tt])([Dd])[\.]/g,
1168
1173
  "_$1$2ds$3$4$5_",
1169
1174
  // co. ltd.
1170
- /([Cc])([Oo])[\.][\,]([Ll])([Tt])([Dd])[\.]/g,
1175
+ /\b([Cc])([Oo])[\.][\,]([Ll])([Tt])([Dd])[\.]/g,
1171
1176
  "_$1$2dc$3$4$5_",
1172
1177
  // co.,ltd.
1173
- /([Cc])([Oo])([Rr]?)([Pp]?)[\.]/g,
1178
+ /\b([Cc])([Oo])([Rr]?)([Pp]?)[\.]/g,
1174
1179
  "_$1$2$3$4_",
1175
1180
  // Corp. and Co.
1176
- /([Ll])([Tt])([Dd])[\.]/g,
1181
+ /\b([Ll])([Tt])([Dd])[\.]/g,
1177
1182
  "_$1$2$3_",
1178
1183
  // ltd.
1179
- /(prof|Prof|PROF)[\.]/g,
1184
+ /\b(prof|Prof|PROF)[\.]/g,
1180
1185
  "_$1_",
1181
1186
  //Prof.
1182
1187
  // /(\w+([\.-_]?\w+)*)@(\w+([\.-_]?\w+)*)\.(\w{2,3})/g, "$1__AT__$3.$5", //email addresses
1183
1188
  // /^\w+([\.-]?\w+)+@\w+([\.:]?\w+)+(\.[a-zA-Z0-9]{2,3})+$/g, "$1__AT__$2", //email addresses
1184
- /([\w.]+)@(\w+\.\w+)/g,
1189
+ /\b([\w.]+)@(\w+\.\w+)/g,
1185
1190
  "$1__AT__$2",
1186
- /((http[s]?):(\/\/))([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b)([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)/g,
1191
+ /\b((http[s]?):(\/\/))([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b)([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)/g,
1187
1192
  "$2COLON$3$4$5",
1188
1193
  //urls with http(s)
1189
1194
  //decimal #
@@ -1212,12 +1217,10 @@ var TOKENIZE_RE = [
1212
1217
  /\r/g,
1213
1218
  " _CARRIAGERETURN_ ",
1214
1219
  // CR
1215
- /\036/g,
1216
- " _RECORDSEPARATOR_ ",
1217
- // RS
1220
+ ///\036/g, " _RECORDSEPARATOR_ ", // RS
1218
1221
  //--------------------------
1219
1222
  /\.\.\.\s/g,
1220
- "_elipsisDDD_ ",
1223
+ "_elipsis_ ",
1221
1224
  /([\?!\"\u201C\.,;:@#$%&])/g,
1222
1225
  " $1 ",
1223
1226
  /\u2026/g,
@@ -1259,7 +1262,7 @@ var TOKENIZE_RE = [
1259
1262
  " ^ ",
1260
1263
  /\u00b0/g,
1261
1264
  " \xB0 ",
1262
- /_elipsisDDD_/g,
1265
+ /_elipsis_/g,
1263
1266
  " ... ",
1264
1267
  //pop ------------------
1265
1268
  /_([Ee])([Gg])_/g,
@@ -1356,30 +1359,29 @@ var TOKENIZE_RE = [
1356
1359
  "\r\n",
1357
1360
  // CR LF
1358
1361
  /_LINEFEEDCARRIAGERETURN_/g,
1359
- "\n\r",
1362
+ "\n\r"
1360
1363
  // LF CR
1361
- /_RECORDSEPARATOR_/g,
1362
- "\\036"
1363
- // RS
1364
+ ///_RECORDSEPARATOR_/g, "\\036", // RS
1364
1365
  ];
1365
1366
  var CONTRACTS_RE = [
1366
1367
  // TODO: 'She'd have wanted' -> 'She would have wanted'
1367
- /([Cc])an['\u2019]t/g,
1368
+ // WORKING HERE: add word boundaries \b to these
1369
+ /\b([Cc])an['\u2019]t/g,
1368
1370
  "$1an not",
1369
- /([Dd])idn['\u2019]t/g,
1371
+ /\b([Dd])idn['\u2019]t/g,
1370
1372
  "$1id not",
1371
- /([CcWw])ouldn['\u2019]t/g,
1373
+ /\b([CcWw])ouldn['\u2019]t/g,
1372
1374
  "$1ould not",
1373
- /([Ss])houldn['\u2019]t/g,
1375
+ /\b([Ss])houldn['\u2019]t/g,
1374
1376
  "$1hould not",
1375
- /([Ii])t['\u2019]s/g,
1377
+ /\b([Ii])t['\u2019]s/g,
1376
1378
  "$1t is",
1377
- /([tT]hat)['\u2019]s/g,
1379
+ /\b([tT]hat)['\u2019]s/g,
1378
1380
  "$1 is",
1379
- /(she|he|you|they|i)['\u2019]d/gi,
1381
+ /\b(she|he|you|they|i)['\u2019]d/gi,
1380
1382
  "$1 had",
1381
1383
  // changed from would, 12/8/23
1382
- /(she|he|you|they|i)['\u2019]ll/gi,
1384
+ /\b(she|he|you|they|i)['\u2019]ll/gi,
1383
1385
  "$1 will",
1384
1386
  /n['\u2019]t /g,
1385
1387
  " not ",
@@ -1444,11 +1446,10 @@ var Conjugator = class {
1444
1446
  });
1445
1447
  this.RiTa = parent;
1446
1448
  this._reset();
1447
- this.RiTa.search({ pos: "v", limit: -1, minLength: -1 }).then((res) => {
1448
- this.allVerbs = res;
1449
- this.verbsEndingInE = res.filter((v) => v.endsWith("e"));
1450
- this.verbsEndingInDouble = res.filter((v) => /([^])\1$/.test(v));
1451
- });
1449
+ let data = this.RiTa.lexicon.data;
1450
+ this.allVerbs = Object.keys(data).filter((word) => data[word][1].split(" ").includes("vb"));
1451
+ this.verbsEndingInE = this.allVerbs.filter((v) => v.endsWith("e"));
1452
+ this.verbsEndingInDouble = this.allVerbs.filter((v) => /([^])\1$/.test(v));
1452
1453
  }
1453
1454
  // TODO: add handling of past tense modals.
1454
1455
  conjugate(verb, args) {
@@ -27402,7 +27403,7 @@ var Lexicon = class {
27402
27403
  }
27403
27404
  regex = opts.regex;
27404
27405
  if (typeof regex === "string") {
27405
- if (opts && opts.type === "stresses") {
27406
+ if (opts && /^stress(es)?$/.test(opts.type)) {
27406
27407
  if (/^\^?[01]+\$?$/.test(regex)) {
27407
27408
  regex = regex.replace(/([01])(?=([01]))/g, "$1/");
27408
27409
  }
@@ -43238,7 +43239,7 @@ markov_default.parent = RiTa;
43238
43239
  stemmer_default.tokenizer = RiTa.tokenizer;
43239
43240
  RiTa.SILENT = false;
43240
43241
  RiTa.SILENCE_LTS = false;
43241
- RiTa.VERSION = "3.1.1";
43242
+ RiTa.VERSION = "3.1.3";
43242
43243
  RiTa.FIRST = 1;
43243
43244
  RiTa.SECOND = 2;
43244
43245
  RiTa.THIRD = 3;