@nationaldesignstudio/rampart 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -34966,6 +34966,9 @@ async function loadNerClassifier(options = {}) {
34966
34966
  if (tokenizer?.encode) {
34967
34967
  adapter.countTokens = (text) => tokenizer.encode(text, { add_special_tokens: false }).length;
34968
34968
  }
34969
+ if (tokenizer?.tokenize) {
34970
+ adapter.tokenize = (text) => tokenizer.tokenize(text);
34971
+ }
34969
34972
  return adapter;
34970
34973
  }
34971
34974
  async function detectNer(raw, classifier, minScore = DEFAULT_OPTIONS.minScore) {
@@ -35057,8 +35060,9 @@ async function detectNerWindow(raw, classifier, minScore = DEFAULT_OPTIONS.minSc
35057
35060
  const inferText = raw.replaceAll("-", " ");
35058
35061
  const entities = await classifier(inferText, { aggregation_strategy: "simple" });
35059
35062
  const folded = foldForModel(inferText);
35063
+ const indexOffsets = classifier.tokenize === undefined ? undefined : buildTokenIndexOffsets(folded, classifier.tokenize(inferText));
35060
35064
  const candidates = [];
35061
- for (const entity of mergeBioTokens(entities, folded)) {
35065
+ for (const entity of mergeBioTokens(entities, folded, indexOffsets)) {
35062
35066
  const label = GROUP_TO_LABEL[entity.group.toUpperCase()];
35063
35067
  if (label === undefined)
35064
35068
  continue;
@@ -35095,7 +35099,31 @@ function foldForModel(raw) {
35095
35099
  }
35096
35100
  return { text, rawStart, rawEnd };
35097
35101
  }
35098
- function mergeBioTokens(entities, folded) {
35102
/**
 * Builds a lookup table from tokenizer token index to a `[start, end]` pair of
 * raw-text offsets.
 *
 * The table opens and closes with a `[0, 0]` placeholder, so content token `i`
 * is stored at index `i + 1`. NOTE(review): the placeholders presumably stand
 * in for the special tokens the model wraps around the sequence — confirm
 * against the tokenizer in use. Any token that cannot be located in the folded
 * text is also recorded as `[0, 0]`.
 *
 * @param {{text: string, rawStart: number[], rawEnd: number[]}} folded
 *   Folded text plus per-character maps back into the raw input.
 * @param {Iterable<string>} contentTokens
 *   Tokens in order; a leading `##` marks a piece that continues the previous
 *   token.
 * @returns {Array<[number, number]>} One offset pair per token, plus the two
 *   placeholder entries.
 */
function buildTokenIndexOffsets(folded, contentTokens) {
  const { text, rawStart, rawEnd } = folded;
  const offsets = [[0, 0]];
  let cursor = 0;
  for (const token of contentTokens) {
    const isContinuation = token.startsWith("##");
    let piece = isContinuation ? token.slice(2) : token;
    // Continuation pieces are assumed to sit directly at the cursor; whole
    // tokens are searched for from the cursor onwards.
    let at = isContinuation ? cursor : text.indexOf(piece, cursor);
    if (!isContinuation && at < 0) {
      // The fallback lookup in mergeBioTokens lowercases the word before
      // searching the folded text. Retry the same way so tokens from a cased
      // tokenizer still resolve instead of degrading to a placeholder.
      piece = piece.toLowerCase();
      at = text.indexOf(piece, cursor);
    }
    if (piece.length === 0 || at < 0 || at + piece.length > text.length) {
      // Unlocatable token: keep the table aligned and leave the cursor alone.
      offsets.push([0, 0]);
      continue;
    }
    offsets.push([rawStart[at], rawEnd[at + piece.length - 1]]);
    cursor = at + piece.length;
  }
  offsets.push([0, 0]);
  return offsets;
}
35118
/**
 * Looks up the raw-text offset pair recorded for a token index.
 *
 * @param {Array<[number, number]> | null | undefined} indexOffsets
 *   Offset table indexed by token position; may be absent.
 * @param {number | null | undefined} index
 *   Token index to look up; may be absent.
 * @returns {[number, number] | null} The stored pair, or `null` when the
 *   table or index is missing, the index has no entry, or the entry is an
 *   empty span (start equals end, e.g. a `[0, 0]` placeholder).
 */
function offsetFromTokenIndex(indexOffsets, index) {
  // `== null` covers both null and undefined, so a null table no longer
  // throws on the property access below.
  if (indexOffsets == null || index == null) {
    return null;
  }
  const pair = indexOffsets[index];
  if (pair == null || pair[0] === pair[1]) {
    return null;
  }
  return pair;
}
35126
+ function mergeBioTokens(entities, folded, indexOffsets) {
35099
35127
  const out = [];
35100
35128
  let cursor = 0;
35101
35129
  let current = null;
@@ -35115,16 +35143,24 @@ function mergeBioTokens(entities, folded) {
35115
35143
  if (rawLabel === undefined)
35116
35144
  continue;
35117
35145
  const { prefix, base } = stripBio(rawLabel);
35118
- const word = (entity.word ?? "").replace(/^##/, "").toLowerCase();
35119
- if (!word)
35120
- continue;
35121
- const at = folded.text.indexOf(word, cursor);
35122
- if (at < 0)
35123
- continue;
35124
- const start = folded.rawStart[at];
35125
- const end = folded.rawEnd[at + word.length - 1];
35126
- cursor = at + word.length;
35127
- const continues = current !== null && current.group === base && prefix !== "B";
35146
+ const indexed = offsetFromTokenIndex(indexOffsets, entity.index);
35147
+ let start;
35148
+ let end;
35149
+ if (indexed !== null) {
35150
+ [start, end] = indexed;
35151
+ } else {
35152
+ const word = (entity.word ?? "").replace(/^##/, "").toLowerCase();
35153
+ if (!word)
35154
+ continue;
35155
+ const at = folded.text.indexOf(word, cursor);
35156
+ if (at < 0)
35157
+ continue;
35158
+ start = folded.rawStart[at];
35159
+ end = folded.rawEnd[at + word.length - 1];
35160
+ cursor = at + word.length;
35161
+ }
35162
+ const isSubword = (entity.word ?? "").startsWith("##");
35163
+ const continues = current !== null && current.group === base && (prefix !== "B" || isSubword);
35128
35164
  if (continues && current !== null) {
35129
35165
  current.end = end;
35130
35166
  current.score += entity.score;
@@ -35161,7 +35197,7 @@ function repairSpans(raw, spans, anchorScore) {
35161
35197
  }
35162
35198
  kept = merged;
35163
35199
  for (let i = 0;i < kept.length; i++) {
35164
- const repaired = rescueCapitalizedParticles(raw, kept[i]);
35200
+ const repaired = rescueCapitalizedParticles(raw, kept[i], kept, i);
35165
35201
  if (repaired.start !== kept[i].start || repaired.end !== kept[i].end) {
35166
35202
  kept[i] = repaired;
35167
35203
  changed = true;
@@ -35208,17 +35244,28 @@ function mergeAdjacentConnectors(raw, spans) {
35208
35244
  }
35209
35245
  return merged;
35210
35246
  }
35211
- function rescueCapitalizedParticles(raw, span) {
35247
+ function rescueCapitalizedParticles(raw, span, all = [], selfIndex = -1) {
35212
35248
  if (!PERSON_LABELS.has(span.label))
35213
35249
  return span;
35250
+ let leftBound = 0;
35251
+ let rightBound = raw.length;
35252
+ for (let i = 0;i < all.length; i++) {
35253
+ if (i === selfIndex)
35254
+ continue;
35255
+ const other = all[i];
35256
+ if (other.end <= span.start && other.end > leftBound)
35257
+ leftBound = other.end;
35258
+ if (other.start >= span.end && other.start < rightBound)
35259
+ rightBound = other.start;
35260
+ }
35214
35261
  let start = span.start;
35215
35262
  let end = span.end;
35216
35263
  const left = LEFT_PARTICLE_RE.exec(raw.slice(0, start));
35217
- if (left !== null && CONNECTOR_RE.test(left[2]) && (!left[2].includes(".") || left[1].length === 1)) {
35264
+ if (left !== null && CONNECTOR_RE.test(left[2]) && (!left[2].includes(".") || left[1].length === 1) && start - left[0].length >= leftBound) {
35218
35265
  start -= left[0].length;
35219
35266
  }
35220
35267
  const right = RIGHT_PARTICLE_RE.exec(raw.slice(end));
35221
- if (right !== null && !right[1].includes(".")) {
35268
+ if (right !== null && !right[1].includes(".") && end + right[0].length <= rightBound) {
35222
35269
  end += right[0].length;
35223
35270
  }
35224
35271
  return start === span.start && end === span.end ? span : { ...span, start, end, text: raw.slice(start, end) };
@@ -35635,5 +35682,5 @@ export {
35635
35682
  ChatGuard
35636
35683
  };
35637
35684
 
35638
- //# debugId=6D8E27590A5F82A064756E2164756E21
35685
+ //# debugId=03EC2D3780505F3564756E2164756E21
35639
35686
  //# sourceMappingURL=index.js.map