@nationaldesignstudio/rampart 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +64 -17
- package/dist/index.js.map +3 -3
- package/dist/src/ner/classifier.d.ts +4 -0
- package/dist/src/ner/classifier.d.ts.map +1 -1
- package/dist/src/ner/worker.js +64 -17
- package/dist/src/ner/worker.js.map +3 -3
- package/eval/public-cases.ts +31 -0
- package/package.json +2 -2
- package/src/ner/classifier.ts +107 -24
|
@@ -21,6 +21,8 @@ interface RawEntity {
|
|
|
21
21
|
readonly start: number;
|
|
22
22
|
readonly end: number;
|
|
23
23
|
readonly word: string;
|
|
24
|
+
/** Index into `[CLS] + content + [SEP]`; used for tokenizer-backed offset recovery. */
|
|
25
|
+
readonly index?: number;
|
|
24
26
|
}
|
|
25
27
|
/** Counts the model tokens in a string, excluding the [CLS]/[SEP] specials. */
|
|
26
28
|
export type TokenCounter = (text: string) => number;
|
|
@@ -36,6 +38,8 @@ export interface TokenClassifier {
|
|
|
36
38
|
aggregation_strategy?: "simple" | "first" | "max";
|
|
37
39
|
}): Promise<RawEntity[]>;
|
|
38
40
|
countTokens?: TokenCounter;
|
|
41
|
+
/** WordPiece tokens for `text` (no specials); drives index→char offset recovery. */
|
|
42
|
+
tokenize?: (text: string) => readonly string[];
|
|
39
43
|
}
|
|
40
44
|
/** The shipped Rampart token-classifier on Hugging Face (q4 ONNX only). */
|
|
41
45
|
export declare const RAMPART_MODEL_ID = "nationaldesignstudio/rampart";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../../../src/ner/classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAGH,OAAO,KAAK,EAAY,IAAI,EAAE,MAAM,UAAU,CAAC;AAE/C,0EAA0E;AAC1E,UAAU,SAAS;IACjB,QAAQ,CAAC,YAAY,CAAC,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../../../src/ner/classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAGH,OAAO,KAAK,EAAY,IAAI,EAAE,MAAM,UAAU,CAAC;AAE/C,0EAA0E;AAC1E,UAAU,SAAS;IACjB,QAAQ,CAAC,YAAY,CAAC,EAAE,MAAM,CAAC;IAC/B,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,CAAC;IACzB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAC;IACvB,QAAQ,CAAC,GAAG,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,IAAI,EAAE,MAAM,CAAC;IACtB,uFAAuF;IACvF,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,+EAA+E;AAC/E,MAAM,MAAM,YAAY,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,CAAC;AAEpD;;;;;;GAMG;AACH,MAAM,WAAW,eAAe;IAC9B,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE;QAAE,oBAAoB,CAAC,EAAE,QAAQ,GAAG,OAAO,GAAG,KAAK,CAAA;KAAE,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC;IACtG,WAAW,CAAC,EAAE,YAAY,CAAC;IAC3B,oFAAoF;IACpF,QAAQ,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,SAAS,MAAM,EAAE,CAAC;CAChD;AA6BD,2EAA2E;AAC3E,eAAO,MAAM,gBAAgB,iCAAiC,CAAC;AAE/D,MAAM,WAAW,UAAU;IACzB;;;;OAIG;IACH,QAAQ,CAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IACxB,wEAAwE;IACxE,QAAQ,CAAC,MAAM,CAAC,EAAE,MAAM,GAAG,QAAQ,GAAG,KAAK,CAAC;IAC5C,yEAAyE;IACzE,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;CAC5B;AAiBD,wFAAwF;AACxF,eAAO,MAAM,gBAAgB,QAAyC,CAAC;AAEvE;;;;;;;;;;GAUG;AACH,eAAO,MAAM,iBAAiB,KAAK,CAAC;AAWpC;;;;GAIG;AACH,wBAAsB,iBAAiB,CAAC,OAAO,GAAE,UAAe,GAAG,OAAO,CAAC,eAAe,CAAC,CAgC1F;AAED;;;;;;;;;;;;;;;GAeG;AACH,wBAAsB,SAAS,CAC7B,GAAG,EAAE,MAAM,EACX,UAAU,EAAE,eAAe,EAC3B,QAAQ,GAAE,MAAiC,GAC1C,OAAO,CAAC,IAAI,EAAE,CAAC,CAoBjB"}
|
package/dist/src/ner/worker.js
CHANGED
|
@@ -34966,6 +34966,9 @@ async function loadNerClassifier(options = {}) {
|
|
|
34966
34966
|
if (tokenizer?.encode) {
|
|
34967
34967
|
adapter.countTokens = (text) => tokenizer.encode(text, { add_special_tokens: false }).length;
|
|
34968
34968
|
}
|
|
34969
|
+
if (tokenizer?.tokenize) {
|
|
34970
|
+
adapter.tokenize = (text) => tokenizer.tokenize(text);
|
|
34971
|
+
}
|
|
34969
34972
|
return adapter;
|
|
34970
34973
|
}
|
|
34971
34974
|
async function detectNer(raw, classifier, minScore = DEFAULT_OPTIONS.minScore) {
|
|
@@ -35057,8 +35060,9 @@ async function detectNerWindow(raw, classifier, minScore = DEFAULT_OPTIONS.minSc
|
|
|
35057
35060
|
const inferText = raw.replaceAll("-", " ");
|
|
35058
35061
|
const entities = await classifier(inferText, { aggregation_strategy: "simple" });
|
|
35059
35062
|
const folded = foldForModel(inferText);
|
|
35063
|
+
const indexOffsets = classifier.tokenize === undefined ? undefined : buildTokenIndexOffsets(folded, classifier.tokenize(inferText));
|
|
35060
35064
|
const candidates = [];
|
|
35061
|
-
for (const entity of mergeBioTokens(entities, folded)) {
|
|
35065
|
+
for (const entity of mergeBioTokens(entities, folded, indexOffsets)) {
|
|
35062
35066
|
const label = GROUP_TO_LABEL[entity.group.toUpperCase()];
|
|
35063
35067
|
if (label === undefined)
|
|
35064
35068
|
continue;
|
|
@@ -35095,7 +35099,31 @@ function foldForModel(raw) {
|
|
|
35095
35099
|
}
|
|
35096
35100
|
return { text, rawStart, rawEnd };
|
|
35097
35101
|
}
|
|
35098
|
-
function mergeBioTokens(entities, folded) {
|
|
35102
|
+
function buildTokenIndexOffsets(folded, contentTokens) {
|
|
35103
|
+
const offsets = [[0, 0]];
|
|
35104
|
+
let cursor = 0;
|
|
35105
|
+
for (const token of contentTokens) {
|
|
35106
|
+
const piece = token.startsWith("##") ? token.slice(2) : token;
|
|
35107
|
+
const at = token.startsWith("##") ? cursor : folded.text.indexOf(piece, cursor);
|
|
35108
|
+
if (piece.length === 0 || at < 0 || at + piece.length > folded.text.length) {
|
|
35109
|
+
offsets.push([0, 0]);
|
|
35110
|
+
continue;
|
|
35111
|
+
}
|
|
35112
|
+
offsets.push([folded.rawStart[at], folded.rawEnd[at + piece.length - 1]]);
|
|
35113
|
+
cursor = at + piece.length;
|
|
35114
|
+
}
|
|
35115
|
+
offsets.push([0, 0]);
|
|
35116
|
+
return offsets;
|
|
35117
|
+
}
|
|
35118
|
+
function offsetFromTokenIndex(indexOffsets, index) {
|
|
35119
|
+
if (indexOffsets === undefined || index === undefined)
|
|
35120
|
+
return null;
|
|
35121
|
+
const pair = indexOffsets[index];
|
|
35122
|
+
if (pair === undefined || pair[0] === pair[1])
|
|
35123
|
+
return null;
|
|
35124
|
+
return pair;
|
|
35125
|
+
}
|
|
35126
|
+
function mergeBioTokens(entities, folded, indexOffsets) {
|
|
35099
35127
|
const out = [];
|
|
35100
35128
|
let cursor = 0;
|
|
35101
35129
|
let current = null;
|
|
@@ -35115,16 +35143,24 @@ function mergeBioTokens(entities, folded) {
|
|
|
35115
35143
|
if (rawLabel === undefined)
|
|
35116
35144
|
continue;
|
|
35117
35145
|
const { prefix, base } = stripBio(rawLabel);
|
|
35118
|
-
const
|
|
35119
|
-
|
|
35120
|
-
|
|
35121
|
-
|
|
35122
|
-
|
|
35123
|
-
|
|
35124
|
-
|
|
35125
|
-
|
|
35126
|
-
|
|
35127
|
-
|
|
35146
|
+
const indexed = offsetFromTokenIndex(indexOffsets, entity.index);
|
|
35147
|
+
let start;
|
|
35148
|
+
let end;
|
|
35149
|
+
if (indexed !== null) {
|
|
35150
|
+
[start, end] = indexed;
|
|
35151
|
+
} else {
|
|
35152
|
+
const word = (entity.word ?? "").replace(/^##/, "").toLowerCase();
|
|
35153
|
+
if (!word)
|
|
35154
|
+
continue;
|
|
35155
|
+
const at = folded.text.indexOf(word, cursor);
|
|
35156
|
+
if (at < 0)
|
|
35157
|
+
continue;
|
|
35158
|
+
start = folded.rawStart[at];
|
|
35159
|
+
end = folded.rawEnd[at + word.length - 1];
|
|
35160
|
+
cursor = at + word.length;
|
|
35161
|
+
}
|
|
35162
|
+
const isSubword = (entity.word ?? "").startsWith("##");
|
|
35163
|
+
const continues = current !== null && current.group === base && (prefix !== "B" || isSubword);
|
|
35128
35164
|
if (continues && current !== null) {
|
|
35129
35165
|
current.end = end;
|
|
35130
35166
|
current.score += entity.score;
|
|
@@ -35161,7 +35197,7 @@ function repairSpans(raw, spans, anchorScore) {
|
|
|
35161
35197
|
}
|
|
35162
35198
|
kept = merged;
|
|
35163
35199
|
for (let i = 0;i < kept.length; i++) {
|
|
35164
|
-
const repaired = rescueCapitalizedParticles(raw, kept[i]);
|
|
35200
|
+
const repaired = rescueCapitalizedParticles(raw, kept[i], kept, i);
|
|
35165
35201
|
if (repaired.start !== kept[i].start || repaired.end !== kept[i].end) {
|
|
35166
35202
|
kept[i] = repaired;
|
|
35167
35203
|
changed = true;
|
|
@@ -35208,17 +35244,28 @@ function mergeAdjacentConnectors(raw, spans) {
|
|
|
35208
35244
|
}
|
|
35209
35245
|
return merged;
|
|
35210
35246
|
}
|
|
35211
|
-
function rescueCapitalizedParticles(raw, span) {
|
|
35247
|
+
function rescueCapitalizedParticles(raw, span, all = [], selfIndex = -1) {
|
|
35212
35248
|
if (!PERSON_LABELS.has(span.label))
|
|
35213
35249
|
return span;
|
|
35250
|
+
let leftBound = 0;
|
|
35251
|
+
let rightBound = raw.length;
|
|
35252
|
+
for (let i = 0;i < all.length; i++) {
|
|
35253
|
+
if (i === selfIndex)
|
|
35254
|
+
continue;
|
|
35255
|
+
const other = all[i];
|
|
35256
|
+
if (other.end <= span.start && other.end > leftBound)
|
|
35257
|
+
leftBound = other.end;
|
|
35258
|
+
if (other.start >= span.end && other.start < rightBound)
|
|
35259
|
+
rightBound = other.start;
|
|
35260
|
+
}
|
|
35214
35261
|
let start = span.start;
|
|
35215
35262
|
let end = span.end;
|
|
35216
35263
|
const left = LEFT_PARTICLE_RE.exec(raw.slice(0, start));
|
|
35217
|
-
if (left !== null && CONNECTOR_RE.test(left[2]) && (!left[2].includes(".") || left[1].length === 1)) {
|
|
35264
|
+
if (left !== null && CONNECTOR_RE.test(left[2]) && (!left[2].includes(".") || left[1].length === 1) && start - left[0].length >= leftBound) {
|
|
35218
35265
|
start -= left[0].length;
|
|
35219
35266
|
}
|
|
35220
35267
|
const right = RIGHT_PARTICLE_RE.exec(raw.slice(end));
|
|
35221
|
-
if (right !== null && !right[1].includes(".")) {
|
|
35268
|
+
if (right !== null && !right[1].includes(".") && end + right[0].length <= rightBound) {
|
|
35222
35269
|
end += right[0].length;
|
|
35223
35270
|
}
|
|
35224
35271
|
return start === span.start && end === span.end ? span : { ...span, start, end, text: raw.slice(start, end) };
|
|
@@ -35298,5 +35345,5 @@ export {
|
|
|
35298
35345
|
createWorkerClassifier
|
|
35299
35346
|
};
|
|
35300
35347
|
|
|
35301
|
-
//# debugId=
|
|
35348
|
+
//# debugId=474CA63CBA7E3BFF64756E2164756E21
|
|
35302
35349
|
//# sourceMappingURL=worker.js.map
|