@elanlanguages/bridge-anonymization 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ner/ner-model.js +1 -1
- package/dist/ner/ner-model.js.map +1 -1
- package/dist/ner/tokenizer.d.ts +26 -53
- package/dist/ner/tokenizer.d.ts.map +1 -1
- package/dist/ner/tokenizer.js +174 -196
- package/dist/ner/tokenizer.js.map +1 -1
- package/package.json +1 -1
package/dist/ner/ner-model.js
CHANGED
|
@@ -205,7 +205,7 @@ export function createNERModel(config) {
|
|
|
205
205
|
vocabPath: config.vocabPath,
|
|
206
206
|
labelMap: config.labelMap ?? DEFAULT_LABEL_MAP,
|
|
207
207
|
maxLength: config.maxLength ?? 512,
|
|
208
|
-
doLowerCase: config.doLowerCase ??
|
|
208
|
+
doLowerCase: config.doLowerCase ?? false, // XLM-RoBERTa is cased
|
|
209
209
|
modelVersion: config.modelVersion ?? '1.0.0',
|
|
210
210
|
};
|
|
211
211
|
return new NERModel(fullConfig);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ner-model.js","sourceRoot":"","sources":["../../src/ner/ner-model.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,WAAW,EAAmB,MAAM,mBAAmB,CAAC;AAEjE,OAAO,EACL,kBAAkB,EAClB,iBAAiB,GAGlB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACL,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,kBAAkB,GACnB,MAAM,kBAAkB,CAAC;AAgC1B;;GAEG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B,GAAG;IACH,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,QAAQ;IACR,QAAQ;CACT,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,QAAQ;IACX,GAAG,GAAsB,IAAI,CAAC;IAC9B,OAAO,GAAmB,IAAI,CAAC;IAC/B,SAAS,GAA8B,IAAI,CAAC;IAC5C,MAAM,CAAiB;IACvB,QAAQ,GAAG,KAAK,CAAC;IAEzB,YAAY,MAAsB;QAChC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI;QACR,IAAI,IAAI,CAAC,QAAQ;YAAE,OAAO;QAE1B,gEAAgE;QAChE,IAAI,CAAC,GAAG,GAAG,MAAM,WAAW,EAAE,CAAC;QAE/B,kBAAkB;QAClB,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAE7E,4BAA4B;QAC5B,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC7D,IAAI,CAAC,SAAS,GAAG,IAAI,kBAAkB,CAAC,KAAK,EAAE;YAC7C,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;YAChC,WAAW,EAAE,IAAI,CAAC,MAAM,CAAC,WAAW;SACrC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CACX,IAAY,EACZ,MAA4B;QAE5B,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAEpC,IAAI,CAAC,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,SAAS,KAAK,IAAI,EAAE,CAAC;YACvE,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QAC1D,CAAC;QAED,iBAAiB;QACjB,MAAM,YAAY,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAEnD,gBAAgB;QAChB,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC;QAEtE,8BAA8B;QAC9B,MAAM,WAAW,GAAG,aAAa,CAC/B,YAAY,CAAC,MAAM,EACnB,MAAM,EACN,WAAW,EACX,IAAI,CACL,CAAC;QAEF,wDAAwD;QACxD,MAAM,aAAa,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACpD,IAAI,KAAK,GAAG,oBAAoB,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC;QAE7D,qBAAqB;QACrB,KAAK,GAAG,qBAAqB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAC3C,KAAK,GAAG,kBAAkB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAExC,oCAAoC;QACpC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,KAAK,GAAG,KAAK,CAAC,MAAM,CAClB,CAAC,IAAI,EAAE,EAAE,CACP,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,MAAM,CAAC,eAAe,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAC9E,CAAC;QACJ,CAAC;QAED,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAElC,OAAO;YACL,KAAK;YACL,gBAAgB,EAAE,OAAO,GAAG,SAAS;YACrC,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,YAAY;SACvC,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,YAAY,CACxB,YAAgC;QAEhC,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,GAAG,KAAK,IAAI,EAAE,CAAC;YAC/C,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAIpB,CAAC;QAEF,MAAM,SAAS,GAAG,YAAY,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE/C,iBAAiB;QACjB,MAAM,cAAc,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CACxC,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACrD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,mBAAmB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC7C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,aAAa,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EAC1D,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,kBAAkB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC5C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACzD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,gBAAgB;QAChB,MAAM,KAAK,GAA4B;YACrC,SAAS,EAAE,cAAc;YACzB,cAAc,EAAE,mBAAmB;SACpC,CAAC;QAEF,uCAAuC;QACvC,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACtC,IAAI,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,CAAC;YAC1C,KAAK,CAAC,gBAAgB,CAAC,GAAG,kBAAkB,CAAC;QAC/C,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAEzC,oBAAoB;QACpB,MAAM,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QAC1C,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAC1C,CAAC;QAED,MAAM,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;QACnC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,+CAA+C;QAC/C,OAAO,IAAI,CAAC,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IAC/C,CAAC;IAED;;OAEG;IACK,aAAa,CACnB,MAA8B,EAC9B,SAAiB;QAEjB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAoB,CAAC;QACzC,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE9C,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,4BAA4B;YAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;YACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACnC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,SAAS,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;YACjD,CAAC;YAED,gBAAgB;YAChB,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;YAEnC,aAAa;YACb,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,IAAI,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,OAAO,EAAE,CAAC;oBAC9B,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;oBACxB,MAAM,GAAG,CAAC,CAAC;gBACb,CAAC;YACH,CAAC;YAED,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,CAAC;YACjD,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC5B,CAAC;QAED,OAAO,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;IACjC,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,MAA4B;QACnD,IAAI,MAAM,KAAK,SAAS;YAAE,OAAO,GAAG,CAAC;QAErC,yCAAyC;QACzC,IAAI,YAAY,GAAG,GAAG,CAAC;QACvB,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,eAAe,EAAE,CAAC;YAC1C,MAAM,SAAS,GAAG,MAAM,CAAC,oBAAoB,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC;YAC/D,IAAI,SAAS,GAAG,YAAY,EAAE,CAAC;gBAC7B,YAAY,GAAG,SAAS,CAAC;YAC3B,CAAC;QACH,CAAC;QAED,OAAO,YAAY,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,IAAI,OAAO;QACT,OAAO,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;YAC1B,+EAA+E;YAC/E,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;QACD,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACtB,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;IACxB,CAAC;CACF;AAED;;GAEG;AACH,SAAS,OAAO,CAAC,MAAgB;IAC/B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC;IACrC,MAAM,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC;IAC5D,MAAM,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACpD,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,MAA0E;IACvG,MAAM,UAAU,GAAmB;QACjC,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,iBAAiB;QAC9C,SAAS,EAAE,MAAM,CAAC,SAAS,IAAI,GAAG;QAClC,WAAW,EAAE,MAAM,CAAC,WAAW,IAAI,
|
|
1
|
+
{"version":3,"file":"ner-model.js","sourceRoot":"","sources":["../../src/ner/ner-model.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,OAAO,EAAE,WAAW,EAAmB,MAAM,mBAAmB,CAAC;AAEjE,OAAO,EACL,kBAAkB,EAClB,iBAAiB,GAGlB,MAAM,gBAAgB,CAAC;AACxB,OAAO,EACL,aAAa,EACb,oBAAoB,EACpB,qBAAqB,EACrB,kBAAkB,GACnB,MAAM,kBAAkB,CAAC;AAgC1B;;GAEG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAG;IAC/B,GAAG;IACH,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,OAAO;IACP,QAAQ;IACR,QAAQ;CACT,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,QAAQ;IACX,GAAG,GAAsB,IAAI,CAAC;IAC9B,OAAO,GAAmB,IAAI,CAAC;IAC/B,SAAS,GAA8B,IAAI,CAAC;IAC5C,MAAM,CAAiB;IACvB,QAAQ,GAAG,KAAK,CAAC;IAEzB,YAAY,MAAsB;QAChC,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI;QACR,IAAI,IAAI,CAAC,QAAQ;YAAE,OAAO;QAE1B,gEAAgE;QAChE,IAAI,CAAC,GAAG,GAAG,MAAM,WAAW,EAAE,CAAC;QAE/B,kBAAkB;QAClB,IAAI,CAAC,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAE7E,4BAA4B;QAC5B,MAAM,KAAK,GAAG,MAAM,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC7D,IAAI,CAAC,SAAS,GAAG,IAAI,kBAAkB,CAAC,KAAK,EAAE;YAC7C,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,SAAS;YAChC,WAAW,EAAE,IAAI,CAAC,MAAM,CAAC,WAAW;SACrC,CAAC,CAAC;QAEH,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO,CACX,IAAY,EACZ,MAA4B;QAE5B,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAEpC,IAAI,CAAC,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,SAAS,KAAK,IAAI,EAAE,CAAC;YACvE,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QAC1D,CAAC;QAED,iBAAiB;QACjB,MAAM,YAAY,GAAG,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAEnD,gBAAgB;QAChB,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC;QAEtE,8BAA8B;QAC9B,MAAM,WAAW,GAAG,aAAa,CAC/B,YAAY,CAAC,MAAM,EACnB,MAAM,EACN,WAAW,EACX,IAAI,CACL,CAAC;QAEF,wDAAwD;QACxD,MAAM,aAAa,GAAG,IAAI,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;QACpD,IAAI,KAAK,GAAG,oBAAoB,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC;QAE7D,qBAAqB;QACrB,KAAK,GAAG,qBAAqB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAC3C,KAAK,GAAG,kBAAkB,CAAC,KAAK,EAAE,IAAI,CAAC,CAAC;QAExC,oCAAoC;QACpC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,KAAK,GAAG,KAAK,CAAC,MAAM,CAClB,CAAC,IAAI,EAAE,EAAE,CACP,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,MAAM,CAAC,eAAe,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,CAC9E,CAAC;QACJ,CAAC;QAED,MAAM,OAAO,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;QAElC,OAAO;YACL,KAAK;YACL,gBAAgB,EAAE,OAAO,GAAG,SAAS;YACrC,YAAY,EAAE,IAAI,CAAC,MAAM,CAAC,YAAY;SACvC,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,YAAY,CACxB,YAAgC;QAEhC,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,IAAI,IAAI,CAAC,GAAG,KAAK,IAAI,EAAE,CAAC;YAC/C,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,MAAM,OAAO,GAAG,IAAI,CAAC,OAIpB,CAAC;QAEF,MAAM,SAAS,GAAG,YAAY,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE/C,iBAAiB;QACjB,MAAM,cAAc,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CACxC,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACrD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,mBAAmB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC7C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,aAAa,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EAC1D,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,MAAM,kBAAkB,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,MAAM,CAC5C,OAAO,EACP,aAAa,CAAC,IAAI,CAAC,YAAY,CAAC,YAAY,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,EACzD,CAAC,CAAC,EAAE,SAAS,CAAC,CACf,CAAC;QAEF,gBAAgB;QAChB,MAAM,KAAK,GAA4B;YACrC,SAAS,EAAE,cAAc;YACzB,cAAc,EAAE,mBAAmB;SACpC,CAAC;QAEF,uCAAuC;QACvC,MAAM,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACtC,IAAI,UAAU,CAAC,QAAQ,CAAC,gBAAgB,CAAC,EAAE,CAAC;YAC1C,KAAK,CAAC,gBAAgB,CAAC,GAAG,kBAAkB,CAAC;QAC/C,CAAC;QAED,MAAM,OAAO,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAEzC,oBAAoB;QACpB,MAAM,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC;QAC1C,IAAI,UAAU,KAAK,SAAS,EAAE,CAAC;YAC7B,MAAM,IAAI,KAAK,CAAC,sBAAsB,CAAC,CAAC;QAC1C,CAAC;QAED,MAAM,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;QACnC,IAAI,MAAM,KAAK,SAAS,EAAE,CAAC;YACzB,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;QAC7C,CAAC;QAED,+CAA+C;QAC/C,OAAO,IAAI,CAAC,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IAC/C,CAAC;IAED;;OAEG;IACK,aAAa,CACnB,MAA8B,EAC9B,SAAiB;QAEjB,MAAM,IAAI,GAAG,MAAM,CAAC,IAAoB,CAAC;QACzC,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;QAE9C,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;QAEjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACnC,4BAA4B;YAC5B,MAAM,WAAW,GAAa,EAAE,CAAC;YACjC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACnC,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,SAAS,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;YACjD,CAAC;YAED,gBAAgB;YAChB,MAAM,KAAK,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;YAEnC,aAAa;YACb,IAAI,MAAM,GAAG,CAAC,CAAC;YACf,IAAI,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;YAC5B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,OAAO,EAAE,CAAC;oBAC9B,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;oBACxB,MAAM,GAAG,CAAC,CAAC;gBACb,CAAC;YACH,CAAC;YAED,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,CAAC;YACjD,WAAW,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QAC5B,CAAC;QAED,OAAO,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;IACjC,CAAC;IAED;;OAEG;IACK,gBAAgB,CAAC,MAA4B;QACnD,IAAI,MAAM,KAAK,SAAS;YAAE,OAAO,GAAG,CAAC;QAErC,yCAAyC;QACzC,IAAI,YAAY,GAAG,GAAG,CAAC;QACvB,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,eAAe,EAAE,CAAC;YAC1C,MAAM,SAAS,GAAG,MAAM,CAAC,oBAAoB,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC;YAC/D,IAAI,SAAS,GAAG,YAAY,EAAE,CAAC;gBAC7B,YAAY,GAAG,SAAS,CAAC;YAC3B,CAAC;QACH,CAAC;QAED,OAAO,YAAY,CAAC;IACtB,CAAC;IAED;;OAEG;IACH,IAAI,OAAO;QACT,OAAO,IAAI,CAAC,MAAM,CAAC,YAAY,CAAC;IAClC,CAAC;IAED;;OAEG;IACH,IAAI,MAAM;QACR,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,OAAO;QACX,IAAI,IAAI,CAAC,OAAO,KAAK,IAAI,EAAE,CAAC;YAC1B,+EAA+E;YAC/E,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC;QACtB,CAAC;QACD,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC;QACtB,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;IACxB,CAAC;CACF;AAED;;GAEG;AACH,SAAS,OAAO,CAAC,MAAgB;IAC/B,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,MAAM,CAAC,CAAC;IACrC,MAAM,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAC;IAC5D,MAAM,MAAM,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;IACpD,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc,CAAC,MAA0E;IACvG,MAAM,UAAU,GAAmB;QACjC,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,SAAS,EAAE,MAAM,CAAC,SAAS;QAC3B,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,iBAAiB;QAC9C,SAAS,EAAE,MAAM,CAAC,SAAS,IAAI,GAAG;QAClC,WAAW,EAAE,MAAM,CAAC,WAAW,IAAI,KAAK,EAAE,uBAAuB;QACjE,YAAY,EAAE,MAAM,CAAC,YAAY,IAAI,OAAO;KAC7C,CAAC;IAEF,OAAO,IAAI,QAAQ,CAAC,UAAU,CAAC,CAAC;AAClC,CAAC;AAED;;;GAGG;AACH,MAAM,OAAO,YAAY;IACd,OAAO,GAAG,YAAY,CAAC;IACvB,MAAM,GAAG,IAAI,CAAC;IAEvB,KAAK,CAAC,IAAI;QACR,QAAQ;IACV,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,KAAa,EAAE,OAA6B;QACxD,OAAO;YACL,KAAK,EAAE,EAAE;YACT,gBAAgB,EAAE,CAAC;YACnB,YAAY,EAAE,IAAI,CAAC,OAAO;SAC3B,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,OAAO;QACX,QAAQ;IACV,CAAC;CACF;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB;IAChC,OAAO,IAAI,YAAY,EAAE,CAAC;AAC5B,CAAC"}
|
package/dist/ner/tokenizer.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
3
|
-
*
|
|
4
|
-
*
|
|
2
|
+
* HuggingFace Tokenizer
|
|
3
|
+
* Loads and uses tokenizers from HuggingFace's tokenizer.json format
|
|
4
|
+
* Supports Unigram (SentencePiece) and BPE tokenizers
|
|
5
5
|
*/
|
|
6
6
|
/**
|
|
7
7
|
* Token with offset information
|
|
@@ -15,9 +15,9 @@ export interface Token {
|
|
|
15
15
|
start: number;
|
|
16
16
|
/** End character offset in original text */
|
|
17
17
|
end: number;
|
|
18
|
-
/** Whether this is a continuation token
|
|
18
|
+
/** Whether this is a continuation token */
|
|
19
19
|
isContinuation: boolean;
|
|
20
|
-
/** Whether this is a special token
|
|
20
|
+
/** Whether this is a special token */
|
|
21
21
|
isSpecial: boolean;
|
|
22
22
|
}
|
|
23
23
|
/**
|
|
@@ -39,75 +39,44 @@ export interface TokenizationResult {
|
|
|
39
39
|
* Tokenizer configuration
|
|
40
40
|
*/
|
|
41
41
|
export interface TokenizerConfig {
|
|
42
|
-
/** Path to vocabulary file */
|
|
43
|
-
vocabPath?: string;
|
|
44
|
-
/** Vocabulary as a Map */
|
|
45
|
-
vocab?: Map<string, number>;
|
|
46
42
|
/** Maximum sequence length */
|
|
47
43
|
maxLength: number;
|
|
48
|
-
/** Unknown token */
|
|
49
|
-
unkToken: string;
|
|
50
|
-
/** Classification token */
|
|
51
|
-
clsToken: string;
|
|
52
|
-
/** Separator token */
|
|
53
|
-
sepToken: string;
|
|
54
|
-
/** Padding token */
|
|
55
|
-
padToken: string;
|
|
56
|
-
/** Mask token */
|
|
57
|
-
maskToken: string;
|
|
58
44
|
/** Whether to lowercase input */
|
|
59
45
|
doLowerCase: boolean;
|
|
60
|
-
/** Strip accents */
|
|
61
|
-
stripAccents: boolean;
|
|
62
46
|
}
|
|
63
47
|
/**
|
|
64
|
-
* Default tokenizer configuration
|
|
48
|
+
* Default tokenizer configuration
|
|
65
49
|
*/
|
|
66
50
|
export declare const DEFAULT_TOKENIZER_CONFIG: TokenizerConfig;
|
|
67
51
|
/**
|
|
68
|
-
* WordPiece Tokenizer
|
|
52
|
+
* WordPiece Tokenizer - supports both HuggingFace JSON and vocab.txt formats
|
|
69
53
|
*/
|
|
70
54
|
export declare class WordPieceTokenizer {
|
|
71
55
|
private vocab;
|
|
72
56
|
private inverseVocab;
|
|
73
57
|
private config;
|
|
74
|
-
private
|
|
58
|
+
private sortedVocab;
|
|
75
59
|
private clsId;
|
|
76
60
|
private sepId;
|
|
77
61
|
private padId;
|
|
62
|
+
private unkId;
|
|
63
|
+
private clsToken;
|
|
64
|
+
private sepToken;
|
|
65
|
+
private padToken;
|
|
66
|
+
private unkToken;
|
|
78
67
|
constructor(vocab: Map<string, number>, config?: Partial<TokenizerConfig>);
|
|
79
68
|
/**
|
|
80
|
-
*
|
|
81
|
-
*/
|
|
82
|
-
tokenize(text: string): TokenizationResult;
|
|
83
|
-
/**
|
|
84
|
-
* Preprocesses text (lowercase, accent stripping)
|
|
85
|
-
*/
|
|
86
|
-
private preprocess;
|
|
87
|
-
/**
|
|
88
|
-
* Strips accents from text
|
|
89
|
-
*/
|
|
90
|
-
private stripAccents;
|
|
91
|
-
/**
|
|
92
|
-
* Splits text into words while tracking character offsets
|
|
93
|
-
*/
|
|
94
|
-
private splitIntoWords;
|
|
95
|
-
/**
|
|
96
|
-
* Tokenizes a single word using WordPiece algorithm
|
|
97
|
-
*/
|
|
98
|
-
private tokenizeWord;
|
|
99
|
-
/**
|
|
100
|
-
* Splits a word into pieces, handling punctuation
|
|
69
|
+
* Detect special tokens from vocabulary
|
|
101
70
|
*/
|
|
102
|
-
private
|
|
71
|
+
private detectSpecialTokens;
|
|
103
72
|
/**
|
|
104
|
-
*
|
|
73
|
+
* Tokenizes text into tokens with offset tracking
|
|
105
74
|
*/
|
|
106
|
-
|
|
75
|
+
tokenize(text: string): TokenizationResult;
|
|
107
76
|
/**
|
|
108
|
-
*
|
|
77
|
+
* Find the best matching token using greedy longest-match
|
|
109
78
|
*/
|
|
110
|
-
private
|
|
79
|
+
private findBestToken;
|
|
111
80
|
/**
|
|
112
81
|
* Decodes token IDs back to text
|
|
113
82
|
*/
|
|
@@ -126,11 +95,15 @@ export declare class WordPieceTokenizer {
|
|
|
126
95
|
getToken(id: number): string | undefined;
|
|
127
96
|
}
|
|
128
97
|
/**
|
|
129
|
-
* Loads vocabulary from a
|
|
98
|
+
* Loads vocabulary from a file (supports tokenizer.json and vocab.txt)
|
|
99
|
+
*/
|
|
100
|
+
export declare function loadVocabFromFile(filePath: string): Promise<Map<string, number>>;
|
|
101
|
+
/**
|
|
102
|
+
* Parses HuggingFace tokenizer.json format
|
|
130
103
|
*/
|
|
131
|
-
export declare function
|
|
104
|
+
export declare function parseHFTokenizerJson(content: string): Map<string, number>;
|
|
132
105
|
/**
|
|
133
|
-
* Parses vocabulary from string content
|
|
106
|
+
* Parses vocabulary from string content (vocab.txt format)
|
|
134
107
|
*/
|
|
135
108
|
export declare function parseVocab(content: string): Map<string, number>;
|
|
136
109
|
/**
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH;;GAEG;AACH,MAAM,WAAW,KAAK;IACpB,6BAA6B;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,mBAAmB;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,8CAA8C;IAC9C,KAAK,EAAE,MAAM,CAAC;IACd,4CAA4C;IAC5C,GAAG,EAAE,MAAM,CAAC;IACZ,
|
|
1
|
+
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH;;GAEG;AACH,MAAM,WAAW,KAAK;IACpB,6BAA6B;IAC7B,EAAE,EAAE,MAAM,CAAC;IACX,mBAAmB;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,8CAA8C;IAC9C,KAAK,EAAE,MAAM,CAAC;IACd,4CAA4C;IAC5C,GAAG,EAAE,MAAM,CAAC;IACZ,2CAA2C;IAC3C,cAAc,EAAE,OAAO,CAAC;IACxB,sCAAsC;IACtC,SAAS,EAAE,OAAO,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,sBAAsB;IACtB,MAAM,EAAE,KAAK,EAAE,CAAC;IAChB,0BAA0B;IAC1B,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,qBAAqB;IACrB,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,6CAA6C;IAC7C,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,8DAA8D;IAC9D,eAAe,EAAE,KAAK,CAAC,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,IAAI,CAAC,CAAC;CACjD;AAsBD;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,8BAA8B;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,iCAAiC;IACjC,WAAW,EAAE,OAAO,CAAC;CACtB;AAED;;GAEG;AACH,eAAO,MAAM,wBAAwB,EAAE,eAGtC,CAAC;AAEF;;GAEG;AACH,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,KAAK,CAAsB;IACnC,OAAO,CAAC,YAAY,CAAsB;IAC1C,OAAO,CAAC,MAAM,CAAkB;IAChC,OAAO,CAAC,WAAW,CAA0B;IAG7C,OAAO,CAAC,KAAK,CAAa;IAC1B,OAAO,CAAC,KAAK,CAAa;IAC1B,OAAO,CAAC,KAAK,CAAa;IAC1B,OAAO,CAAC,KAAK,CAAa;IAG1B,OAAO,CAAC,QAAQ,CAAiB;IACjC,OAAO,CAAC,QAAQ,CAAkB;IAClC,OAAO,CAAC,QAAQ,CAAmB;IACnC,OAAO,CAAC,QAAQ,CAAmB;gBAEvB,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,GAAE,OAAO,CAAC,eAAe,CAAM;IAiB7E;;OAEG;IACH,OAAO,CAAC,mBAAmB;IAyB3B;;OAEG;IACH,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,kBAAkB;IAsF1C;;OAEG;IACH,OAAO,CAAC,aAAa;IA0CrB;;OAEG;IACH,MAAM,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,MAAM;IAmBlC;;OAEG;IACH,IAAI,SAAS,IAAI,MAAM,CAEtB;IAED;;OAEG;IACH,UAAU,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;IAI7C;;OAEG;IACH,QAAQ,CAAC,EAAE,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;CAGzC;AAED;;GAEG;AACH,wBAAsB,iBAAiB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAUtF;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAAC,OAAO,EAAE,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAmCzE;AAED;;GAEG;AACH,wBAAgB,UAAU,CAAC,OAAO,EAAE,MAAM,GAAG,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAY/D;AAED;;GAEG;AACH,wBAAgB,eAAe,IAAI,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAuBrD"}
|
package/dist/ner/tokenizer.js
CHANGED
|
@@ -1,33 +1,33 @@
|
|
|
1
1
|
/**
|
|
2
|
-
*
|
|
3
|
-
*
|
|
4
|
-
*
|
|
2
|
+
* HuggingFace Tokenizer
|
|
3
|
+
* Loads and uses tokenizers from HuggingFace's tokenizer.json format
|
|
4
|
+
* Supports Unigram (SentencePiece) and BPE tokenizers
|
|
5
5
|
*/
|
|
6
6
|
/**
|
|
7
|
-
* Default tokenizer configuration
|
|
7
|
+
* Default tokenizer configuration
|
|
8
8
|
*/
|
|
9
9
|
export const DEFAULT_TOKENIZER_CONFIG = {
|
|
10
10
|
maxLength: 512,
|
|
11
|
-
|
|
12
|
-
clsToken: '[CLS]',
|
|
13
|
-
sepToken: '[SEP]',
|
|
14
|
-
padToken: '[PAD]',
|
|
15
|
-
maskToken: '[MASK]',
|
|
16
|
-
doLowerCase: true,
|
|
17
|
-
stripAccents: true,
|
|
11
|
+
doLowerCase: false, // XLM-RoBERTa doesn't lowercase
|
|
18
12
|
};
|
|
19
13
|
/**
|
|
20
|
-
* WordPiece Tokenizer
|
|
14
|
+
* WordPiece Tokenizer - supports both HuggingFace JSON and vocab.txt formats
|
|
21
15
|
*/
|
|
22
16
|
export class WordPieceTokenizer {
|
|
23
17
|
vocab;
|
|
24
18
|
inverseVocab;
|
|
25
19
|
config;
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
clsId;
|
|
29
|
-
sepId;
|
|
30
|
-
padId;
|
|
20
|
+
sortedVocab;
|
|
21
|
+
// Special token IDs (XLM-RoBERTa style)
|
|
22
|
+
clsId = 0; // <s>
|
|
23
|
+
sepId = 2; // </s>
|
|
24
|
+
padId = 1; // <pad>
|
|
25
|
+
unkId = 3; // <unk>
|
|
26
|
+
// Special token strings
|
|
27
|
+
clsToken = '<s>';
|
|
28
|
+
sepToken = '</s>';
|
|
29
|
+
padToken = '<pad>';
|
|
30
|
+
unkToken = '<unk>';
|
|
31
31
|
constructor(vocab, config = {}) {
|
|
32
32
|
this.vocab = vocab;
|
|
33
33
|
this.config = { ...DEFAULT_TOKENIZER_CONFIG, ...config };
|
|
@@ -36,11 +36,37 @@ export class WordPieceTokenizer {
|
|
|
36
36
|
for (const [token, id] of vocab) {
|
|
37
37
|
this.inverseVocab.set(id, token);
|
|
38
38
|
}
|
|
39
|
-
//
|
|
40
|
-
this.
|
|
41
|
-
|
|
42
|
-
this.
|
|
43
|
-
|
|
39
|
+
// Sort vocab by token length (longest first) for greedy matching
|
|
40
|
+
this.sortedVocab = Array.from(vocab.entries()).sort((a, b) => b[0].length - a[0].length);
|
|
41
|
+
// Try to detect special tokens from vocab
|
|
42
|
+
this.detectSpecialTokens();
|
|
43
|
+
}
|
|
44
|
+
/**
|
|
45
|
+
* Detect special tokens from vocabulary
|
|
46
|
+
*/
|
|
47
|
+
detectSpecialTokens() {
|
|
48
|
+
// XLM-RoBERTa style
|
|
49
|
+
if (this.vocab.has('<s>')) {
|
|
50
|
+
this.clsToken = '<s>';
|
|
51
|
+
this.clsId = this.vocab.get('<s>') ?? 0;
|
|
52
|
+
this.sepToken = '</s>';
|
|
53
|
+
this.sepId = this.vocab.get('</s>') ?? 2;
|
|
54
|
+
this.padToken = '<pad>';
|
|
55
|
+
this.padId = this.vocab.get('<pad>') ?? 1;
|
|
56
|
+
this.unkToken = '<unk>';
|
|
57
|
+
this.unkId = this.vocab.get('<unk>') ?? 3;
|
|
58
|
+
}
|
|
59
|
+
// BERT style
|
|
60
|
+
else if (this.vocab.has('[CLS]')) {
|
|
61
|
+
this.clsToken = '[CLS]';
|
|
62
|
+
this.clsId = this.vocab.get('[CLS]') ?? 101;
|
|
63
|
+
this.sepToken = '[SEP]';
|
|
64
|
+
this.sepId = this.vocab.get('[SEP]') ?? 102;
|
|
65
|
+
this.padToken = '[PAD]';
|
|
66
|
+
this.padId = this.vocab.get('[PAD]') ?? 0;
|
|
67
|
+
this.unkToken = '[UNK]';
|
|
68
|
+
this.unkId = this.vocab.get('[UNK]') ?? 100;
|
|
69
|
+
}
|
|
44
70
|
}
|
|
45
71
|
/**
|
|
46
72
|
* Tokenizes text into tokens with offset tracking
|
|
@@ -48,10 +74,10 @@ export class WordPieceTokenizer {
|
|
|
48
74
|
tokenize(text) {
|
|
49
75
|
const tokens = [];
|
|
50
76
|
const tokenToCharSpan = [];
|
|
51
|
-
// Add
|
|
77
|
+
// Add CLS token
|
|
52
78
|
tokens.push({
|
|
53
79
|
id: this.clsId,
|
|
54
|
-
token: this.
|
|
80
|
+
token: this.clsToken,
|
|
55
81
|
start: 0,
|
|
56
82
|
end: 0,
|
|
57
83
|
isContinuation: false,
|
|
@@ -59,21 +85,33 @@ export class WordPieceTokenizer {
|
|
|
59
85
|
});
|
|
60
86
|
tokenToCharSpan.push(null);
|
|
61
87
|
// Preprocess text
|
|
62
|
-
const processedText = this.
|
|
63
|
-
//
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
tokenToCharSpan.push([t.start, t.end]);
|
|
88
|
+
const processedText = this.config.doLowerCase ? text.toLowerCase() : text;
|
|
89
|
+
// Tokenize using greedy longest-match
|
|
90
|
+
let pos = 0;
|
|
91
|
+
while (pos < processedText.length) {
|
|
92
|
+
// Skip whitespace
|
|
93
|
+
if (/\s/.test(processedText[pos])) {
|
|
94
|
+
pos++;
|
|
95
|
+
continue;
|
|
71
96
|
}
|
|
97
|
+
// Find the longest matching token starting at this position
|
|
98
|
+
const { token, id, length } = this.findBestToken(processedText, pos);
|
|
99
|
+
const isFirstOfWord = pos === 0 || /\s/.test(processedText[pos - 1]);
|
|
100
|
+
tokens.push({
|
|
101
|
+
id,
|
|
102
|
+
token,
|
|
103
|
+
start: pos,
|
|
104
|
+
end: pos + length,
|
|
105
|
+
isContinuation: !isFirstOfWord && !token.startsWith('▁'),
|
|
106
|
+
isSpecial: false,
|
|
107
|
+
});
|
|
108
|
+
tokenToCharSpan.push([pos, pos + length]);
|
|
109
|
+
pos += length;
|
|
72
110
|
}
|
|
73
|
-
// Add
|
|
111
|
+
// Add SEP token
|
|
74
112
|
tokens.push({
|
|
75
113
|
id: this.sepId,
|
|
76
|
-
token: this.
|
|
114
|
+
token: this.sepToken,
|
|
77
115
|
start: text.length,
|
|
78
116
|
end: text.length,
|
|
79
117
|
isContinuation: false,
|
|
@@ -85,10 +123,9 @@ export class WordPieceTokenizer {
|
|
|
85
123
|
if (tokens.length > maxTokens) {
|
|
86
124
|
tokens.length = maxTokens - 1;
|
|
87
125
|
tokenToCharSpan.length = maxTokens - 1;
|
|
88
|
-
// Add [SEP] at end
|
|
89
126
|
tokens.push({
|
|
90
127
|
id: this.sepId,
|
|
91
|
-
token: this.
|
|
128
|
+
token: this.sepToken,
|
|
92
129
|
start: text.length,
|
|
93
130
|
end: text.length,
|
|
94
131
|
isContinuation: false,
|
|
@@ -109,161 +146,66 @@ export class WordPieceTokenizer {
|
|
|
109
146
|
};
|
|
110
147
|
}
|
|
111
148
|
/**
|
|
112
|
-
*
|
|
113
|
-
*/
|
|
114
|
-
preprocess(text) {
|
|
115
|
-
let processed = text;
|
|
116
|
-
if (this.config.doLowerCase) {
|
|
117
|
-
processed = processed.toLowerCase();
|
|
118
|
-
}
|
|
119
|
-
if (this.config.stripAccents) {
|
|
120
|
-
processed = this.stripAccents(processed);
|
|
121
|
-
}
|
|
122
|
-
return processed;
|
|
123
|
-
}
|
|
124
|
-
/**
|
|
125
|
-
* Strips accents from text
|
|
126
|
-
*/
|
|
127
|
-
stripAccents(text) {
|
|
128
|
-
return text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
|
|
129
|
-
}
|
|
130
|
-
/**
|
|
131
|
-
* Splits text into words while tracking character offsets
|
|
132
|
-
*/
|
|
133
|
-
splitIntoWords(processedText, originalText) {
|
|
134
|
-
const words = [];
|
|
135
|
-
// Split on whitespace and punctuation while keeping track of positions
|
|
136
|
-
const wordPattern = /\S+/g;
|
|
137
|
-
let match;
|
|
138
|
-
while ((match = wordPattern.exec(processedText)) !== null) {
|
|
139
|
-
// Find corresponding position in original text
|
|
140
|
-
// Since we may have lowercased, we need to map positions
|
|
141
|
-
const start = match.index;
|
|
142
|
-
const end = start + match[0].length;
|
|
143
|
-
words.push({
|
|
144
|
-
word: match[0],
|
|
145
|
-
start,
|
|
146
|
-
end,
|
|
147
|
-
});
|
|
148
|
-
}
|
|
149
|
-
return words;
|
|
150
|
-
}
|
|
151
|
-
/**
|
|
152
|
-
* Tokenizes a single word using WordPiece algorithm
|
|
153
|
-
*/
|
|
154
|
-
tokenizeWord(word, startOffset, endOffset) {
|
|
155
|
-
const tokens = [];
|
|
156
|
-
// Handle punctuation separately
|
|
157
|
-
const subwords = this.splitWordIntoPieces(word);
|
|
158
|
-
let currentOffset = startOffset;
|
|
159
|
-
for (let i = 0; i < subwords.length; i++) {
|
|
160
|
-
let subword = subwords[i];
|
|
161
|
-
const isContinuation = i > 0;
|
|
162
|
-
// For continuation tokens, add ## prefix for vocab lookup
|
|
163
|
-
const vocabKey = isContinuation ? '##' + subword : subword;
|
|
164
|
-
// Look up in vocabulary
|
|
165
|
-
let tokenId = this.vocab.get(vocabKey);
|
|
166
|
-
// If not found, try to find longest matching prefix
|
|
167
|
-
if (tokenId === undefined) {
|
|
168
|
-
const { id, token } = this.findLongestMatch(subword, isContinuation);
|
|
169
|
-
tokenId = id;
|
|
170
|
-
subword = token;
|
|
171
|
-
}
|
|
172
|
-
const tokenLength = subword.length;
|
|
173
|
-
const tokenEnd = Math.min(currentOffset + tokenLength, endOffset);
|
|
174
|
-
tokens.push({
|
|
175
|
-
id: tokenId,
|
|
176
|
-
token: isContinuation ? '##' + subword : subword,
|
|
177
|
-
start: currentOffset,
|
|
178
|
-
end: tokenEnd,
|
|
179
|
-
isContinuation,
|
|
180
|
-
isSpecial: false,
|
|
181
|
-
});
|
|
182
|
-
currentOffset = tokenEnd;
|
|
183
|
-
}
|
|
184
|
-
return tokens;
|
|
185
|
-
}
|
|
186
|
-
/**
|
|
187
|
-
* Splits a word into pieces, handling punctuation
|
|
149
|
+
* Find the best matching token using greedy longest-match
|
|
188
150
|
*/
|
|
189
|
-
|
|
190
|
-
const
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
151
|
+
findBestToken(text, startPos) {
|
|
152
|
+
const remaining = text.slice(startPos);
|
|
153
|
+
// Check if this starts a new word (preceded by space or start)
|
|
154
|
+
const isWordStart = startPos === 0 || /\s/.test(text[startPos - 1]);
|
|
155
|
+
// For SentencePiece models, word-initial tokens start with ▁
|
|
156
|
+
if (isWordStart) {
|
|
157
|
+
// Try with ▁ prefix first
|
|
158
|
+
const withPrefix = '▁' + remaining;
|
|
159
|
+
for (const [vocabToken, id] of this.sortedVocab) {
|
|
160
|
+
if (withPrefix.startsWith(vocabToken)) {
|
|
161
|
+
// Return the match length without the ▁ since that's not in original text
|
|
162
|
+
return {
|
|
163
|
+
token: vocabToken,
|
|
164
|
+
id,
|
|
165
|
+
length: vocabToken.length - 1 // Subtract 1 for the ▁
|
|
166
|
+
};
|
|
197
167
|
}
|
|
198
|
-
pieces.push(char);
|
|
199
168
|
}
|
|
200
|
-
else {
|
|
201
|
-
current += char;
|
|
202
|
-
}
|
|
203
|
-
}
|
|
204
|
-
if (current.length > 0) {
|
|
205
|
-
pieces.push(current);
|
|
206
169
|
}
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
/[\u2000-\u206F]/.test(char) || // General punctuation
|
|
220
|
-
/[\u3000-\u303F]/.test(char) // CJK punctuation
|
|
221
|
-
);
|
|
222
|
-
}
|
|
223
|
-
/**
|
|
224
|
-
* Finds the longest matching token in vocabulary
|
|
225
|
-
*/
|
|
226
|
-
findLongestMatch(word, isContinuation) {
|
|
227
|
-
const prefix = isContinuation ? '##' : '';
|
|
228
|
-
// Try progressively shorter substrings
|
|
229
|
-
for (let end = word.length; end > 0; end--) {
|
|
230
|
-
const subword = word.slice(0, end);
|
|
231
|
-
const vocabKey = prefix + subword;
|
|
232
|
-
const id = this.vocab.get(vocabKey);
|
|
233
|
-
if (id !== undefined) {
|
|
234
|
-
return { id, token: subword };
|
|
170
|
+
// Try exact match without prefix
|
|
171
|
+
for (const [vocabToken, id] of this.sortedVocab) {
|
|
172
|
+
// Skip special tokens and tokens starting with ▁ for non-word-start positions
|
|
173
|
+
if (vocabToken.startsWith('<') || vocabToken.startsWith('['))
|
|
174
|
+
continue;
|
|
175
|
+
if (!isWordStart && vocabToken.startsWith('▁'))
|
|
176
|
+
continue;
|
|
177
|
+
if (remaining.startsWith(vocabToken.replace(/^▁/, ''))) {
|
|
178
|
+
const matchLength = vocabToken.replace(/^▁/, '').length;
|
|
179
|
+
if (matchLength > 0) {
|
|
180
|
+
return { token: vocabToken, id, length: matchLength };
|
|
181
|
+
}
|
|
235
182
|
}
|
|
236
183
|
}
|
|
237
|
-
//
|
|
238
|
-
|
|
184
|
+
// Single character fallback
|
|
185
|
+
const char = remaining[0];
|
|
186
|
+
const charId = this.vocab.get(char) ?? this.vocab.get('▁' + char) ?? this.unkId;
|
|
187
|
+
return { token: char, id: charId, length: 1 };
|
|
239
188
|
}
|
|
240
189
|
/**
|
|
241
190
|
* Decodes token IDs back to text
|
|
242
191
|
*/
|
|
243
192
|
decode(tokenIds) {
|
|
244
|
-
const
|
|
193
|
+
const parts = [];
|
|
245
194
|
for (const id of tokenIds) {
|
|
246
195
|
const token = this.inverseVocab.get(id);
|
|
247
196
|
if (token === undefined)
|
|
248
197
|
continue;
|
|
249
|
-
|
|
250
|
-
if (token === this.config.clsToken ||
|
|
251
|
-
token === this.config.sepToken ||
|
|
252
|
-
token === this.config.padToken) {
|
|
198
|
+
if (token === this.clsToken || token === this.sepToken || token === this.padToken)
|
|
253
199
|
continue;
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
tokens.push(token.slice(2));
|
|
200
|
+
// SentencePiece uses ▁ to mark word boundaries
|
|
201
|
+
if (token.startsWith('▁')) {
|
|
202
|
+
parts.push(' ' + token.slice(1));
|
|
258
203
|
}
|
|
259
204
|
else {
|
|
260
|
-
|
|
261
|
-
tokens.push(' ');
|
|
262
|
-
}
|
|
263
|
-
tokens.push(token);
|
|
205
|
+
parts.push(token);
|
|
264
206
|
}
|
|
265
207
|
}
|
|
266
|
-
return
|
|
208
|
+
return parts.join('').trim();
|
|
267
209
|
}
|
|
268
210
|
/**
|
|
269
211
|
* Gets vocabulary size
|
|
@@ -285,15 +227,58 @@ export class WordPieceTokenizer {
|
|
|
285
227
|
}
|
|
286
228
|
}
|
|
287
229
|
/**
|
|
288
|
-
* Loads vocabulary from a
|
|
230
|
+
* Loads vocabulary from a file (supports tokenizer.json and vocab.txt)
|
|
289
231
|
*/
|
|
290
|
-
export async function loadVocabFromFile(
|
|
232
|
+
export async function loadVocabFromFile(filePath) {
|
|
291
233
|
const fs = await import('fs/promises');
|
|
292
|
-
const content = await fs.readFile(
|
|
293
|
-
|
|
234
|
+
const content = await fs.readFile(filePath, 'utf-8');
|
|
235
|
+
// Detect format
|
|
236
|
+
if (filePath.endsWith('.json') || content.trim().startsWith('{')) {
|
|
237
|
+
return parseHFTokenizerJson(content);
|
|
238
|
+
}
|
|
239
|
+
else {
|
|
240
|
+
return parseVocab(content);
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
/**
|
|
244
|
+
* Parses HuggingFace tokenizer.json format
|
|
245
|
+
*/
|
|
246
|
+
export function parseHFTokenizerJson(content) {
|
|
247
|
+
const vocab = new Map();
|
|
248
|
+
try {
|
|
249
|
+
const config = JSON.parse(content);
|
|
250
|
+
// Add special tokens first
|
|
251
|
+
if (config.added_tokens) {
|
|
252
|
+
for (const token of config.added_tokens) {
|
|
253
|
+
vocab.set(token.content, token.id);
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
// Add model vocabulary
|
|
257
|
+
if (config.model?.vocab) {
|
|
258
|
+
if (Array.isArray(config.model.vocab)) {
|
|
259
|
+
// Unigram format: array of [token, score] pairs
|
|
260
|
+
for (let i = 0; i < config.model.vocab.length; i++) {
|
|
261
|
+
const entry = config.model.vocab[i];
|
|
262
|
+
if (entry && typeof entry[0] === 'string') {
|
|
263
|
+
vocab.set(entry[0], i);
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
else {
|
|
268
|
+
// BPE/WordPiece format: object mapping token -> id
|
|
269
|
+
for (const [token, id] of Object.entries(config.model.vocab)) {
|
|
270
|
+
vocab.set(token, id);
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
catch (e) {
|
|
276
|
+
throw new Error(`Failed to parse tokenizer.json: ${e}`);
|
|
277
|
+
}
|
|
278
|
+
return vocab;
|
|
294
279
|
}
|
|
295
280
|
/**
|
|
296
|
-
* Parses vocabulary from string content
|
|
281
|
+
* Parses vocabulary from string content (vocab.txt format)
|
|
297
282
|
*/
|
|
298
283
|
export function parseVocab(content) {
|
|
299
284
|
const vocab = new Map();
|
|
@@ -311,26 +296,19 @@ export function parseVocab(content) {
|
|
|
311
296
|
*/
|
|
312
297
|
export function createTestVocab() {
|
|
313
298
|
const tokens = [
|
|
314
|
-
'
|
|
315
|
-
'
|
|
316
|
-
'
|
|
317
|
-
'
|
|
318
|
-
'
|
|
319
|
-
'
|
|
320
|
-
'
|
|
321
|
-
'
|
|
322
|
-
'
|
|
323
|
-
'
|
|
324
|
-
'
|
|
325
|
-
'
|
|
326
|
-
'germany',
|
|
327
|
-
'##s',
|
|
328
|
-
'##ed',
|
|
329
|
-
'##ing',
|
|
330
|
-
',',
|
|
331
|
-
'.',
|
|
299
|
+
'<s>',
|
|
300
|
+
'<pad>',
|
|
301
|
+
'</s>',
|
|
302
|
+
'<unk>',
|
|
303
|
+
'▁Hello',
|
|
304
|
+
'▁John',
|
|
305
|
+
'▁Smith',
|
|
306
|
+
'▁from',
|
|
307
|
+
'▁Acme',
|
|
308
|
+
'▁Corp',
|
|
309
|
+
'▁in',
|
|
310
|
+
'▁Berlin',
|
|
332
311
|
'!',
|
|
333
|
-
'?',
|
|
334
312
|
];
|
|
335
313
|
const vocab = new Map();
|
|
336
314
|
tokens.forEach((token, index) => {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;
|
|
1
|
+
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/ner/tokenizer.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAkEH;;GAEG;AACH,MAAM,CAAC,MAAM,wBAAwB,GAAoB;IACvD,SAAS,EAAE,GAAG;IACd,WAAW,EAAE,KAAK,EAAE,gCAAgC;CACrD,CAAC;AAEF;;GAEG;AACH,MAAM,OAAO,kBAAkB;IACrB,KAAK,CAAsB;IAC3B,YAAY,CAAsB;IAClC,MAAM,CAAkB;IACxB,WAAW,CAA0B;IAE7C,wCAAwC;IAChC,KAAK,GAAW,CAAC,CAAC,CAAE,MAAM;IAC1B,KAAK,GAAW,CAAC,CAAC,CAAE,OAAO;IAC3B,KAAK,GAAW,CAAC,CAAC,CAAE,QAAQ;IAC5B,KAAK,GAAW,CAAC,CAAC,CAAE,QAAQ;IAEpC,wBAAwB;IAChB,QAAQ,GAAW,KAAK,CAAC;IACzB,QAAQ,GAAW,MAAM,CAAC;IAC1B,QAAQ,GAAW,OAAO,CAAC;IAC3B,QAAQ,GAAW,OAAO,CAAC;IAEnC,YAAY,KAA0B,EAAE,SAAmC,EAAE;QAC3E,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC;QACnB,IAAI,CAAC,MAAM,GAAG,EAAE,GAAG,wBAAwB,EAAE,GAAG,MAAM,EAAE,CAAC;QAEzD,sBAAsB;QACtB,IAAI,CAAC,YAAY,GAAG,IAAI,GAAG,EAAE,CAAC;QAC9B,KAAK,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,EAAE,CAAC;YAChC,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,EAAE,KAAK,CAAC,CAAC;QACnC,CAAC;QAED,iEAAiE;QACjE,IAAI,CAAC,WAAW,GAAG,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QAEzF,0CAA0C;QAC1C,IAAI,CAAC,mBAAmB,EAAE,CAAC;IAC7B,CAAC;IAED;;OAEG;IACK,mBAAmB;QACzB,oBAAoB;QACpB,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;YAC1B,IAAI,CAAC,QAAQ,GAAG,KAAK,CAAC;YACtB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;YACxC,IAAI,CAAC,QAAQ,GAAG,MAAM,CAAC;YACvB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;YACzC,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;QAC5C,CAAC;QACD,aAAa;aACR,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE,CAAC;YACjC,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;YAC5C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;YAC5C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAC1C,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC;YACxB,IAAI,CAAC,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC;QAC9C,CAAC;IACH,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,IAAY;QACnB,MAAM,MAAM,GAAY,EAAE,CAAC;QAC3B,MAAM,eAAe,GAAmC,EAAE,CAAC;QAE3D,gBAAgB;QAChB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,QAAQ;YACpB,KAAK,EAAE,CAAC;YACR,GAAG,EAAE,CAAC;YACN,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,kBAAkB;QAClB,MAAM,aAAa,GAAG,IAAI,CAAC,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;QAE1E,sCAAsC;QACtC,IAAI,GAAG,GAAG,CAAC,CAAC;QACZ,OAAO,GAAG,GAAG,aAAa,CAAC,MAAM,EAAE,CAAC;YAClC,kBAAkB;YAClB,IAAI,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,CAAE,CAAC,EAAE,CAAC;gBACnC,GAAG,EAAE,CAAC;gBACN,SAAS;YACX,CAAC;YAED,4DAA4D;YAC5D,MAAM,EAAE,KAAK,EAAE,EAAE,EAAE,MAAM,EAAE,GAAG,IAAI,CAAC,aAAa,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC;YAErE,MAAM,aAAa,GAAG,GAAG,KAAK,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,GAAG,CAAC,CAAE,CAAC,CAAC;YAEtE,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE;gBACF,KAAK;gBACL,KAAK,EAAE,GAAG;gBACV,GAAG,EAAE,GAAG,GAAG,MAAM;gBACjB,cAAc,EAAE,CAAC,aAAa,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC;gBACxD,SAAS,EAAE,KAAK;aACjB,CAAC,CAAC;YACH,eAAe,CAAC,IAAI,CAAC,CAAC,GAAG,EAAE,GAAG,GAAG,MAAM,CAAC,CAAC,CAAC;YAE1C,GAAG,IAAI,MAAM,CAAC;QAChB,CAAC;QAED,gBAAgB;QAChB,MAAM,CAAC,IAAI,CAAC;YACV,EAAE,EAAE,IAAI,CAAC,KAAK;YACd,KAAK,EAAE,IAAI,CAAC,QAAQ;YACpB,KAAK,EAAE,IAAI,CAAC,MAAM;YAClB,GAAG,EAAE,IAAI,CAAC,MAAM;YAChB,cAAc,EAAE,KAAK;YACrB,SAAS,EAAE,IAAI;SAChB,CAAC,CAAC;QACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAE3B,wBAAwB;QACxB,MAAM,SAAS,GAAG,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC;QACxC,IAAI,MAAM,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;YAC9B,MAAM,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YAC9B,eAAe,CAAC,MAAM,GAAG,SAAS,GAAG,CAAC,CAAC;YACvC,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE,EAAE,IAAI,CAAC,KAAK;gBACd,KAAK,EAAE,IAAI,CAAC,QAAQ;gBACpB,KAAK,EAAE,IAAI,CAAC,MAAM;gBAClB,GAAG,EAAE,IAAI,CAAC,MAAM;gBAChB,cAAc,EAAE,KAAK;gBACrB,SAAS,EAAE,IAAI;aAChB,CAAC,CAAC;YACH,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7B,CAAC;QAED,eAAe;QACf,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QACzC,MAAM,aAAa,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAC1C,MAAM,YAAY,GAAG,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;QAEzC,OAAO;YACL,MAAM;YACN,QAAQ;YACR,aAAa;YACb,YAAY;YACZ,eAAe;SAChB,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,IAAY,EAAE,QAAgB;QAClD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC;QAEvC,+DAA+D;QAC/D,MAAM,WAAW,GAAG,QAAQ,KAAK,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAE,CAAC,CAAC;QAErE,6DAA6D;QAC7D,IAAI,WAAW,EAAE,CAAC;YAChB,0BAA0B;YAC1B,MAAM,UAAU,GAAG,GAAG,GAAG,SAAS,CAAC;YACnC,KAAK,MAAM,CAAC,UAAU,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;gBAChD,IAAI,UAAU,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;oBACtC,0EAA0E;oBAC1E,OAAO;wBACL,KAAK,EAAE,UAAU;wBACjB,EAAE;wBACF,MAAM,EAAE,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,uBAAuB;qBACtD,CAAC;gBACJ,CAAC;YACH,CAAC;QACH,CAAC;QAED,iCAAiC;QACjC,KAAK,MAAM,CAAC,UAAU,EAAE,EAAE,CAAC,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;YAChD,8EAA8E;YAC9E,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YACvE,IAAI,CAAC,WAAW,IAAI,UAAU,CAAC,UAAU,CAAC,GAAG,CAAC;gBAAE,SAAS;YAEzD,IAAI,SAAS,CAAC,UAAU,CAAC,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC;gBACvD,MAAM,WAAW,GAAG,UAAU,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC;gBACxD,IAAI,WAAW,GAAG,CAAC,EAAE,CAAC;oBACpB,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,EAAE,EAAE,MAAM,EAAE,WAAW,EAAE,CAAC;gBACxD,CAAC;YACH,CAAC;QACH,CAAC;QAED,4BAA4B;QAC5B,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAE,CAAC;QAC3B,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,GAAG,IAAI,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC;QAChF,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC;IAChD,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,QAAkB;QACvB,MAAM,KAAK,GAAa,EAAE,CAAC;QAE3B,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;YAC1B,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;YACxC,IAAI,KAAK,KAAK,SAAS;gBAAE,SAAS;YAClC,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ,IAAI,KAAK,KAAK,IAAI,CAAC,QAAQ;gBAAE,SAAS;YAE5F,+CAA+C;YAC/C,IAAI,KAAK,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC1B,KAAK,CAAC,IAAI,CAAC,GAAG,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;YACnC,CAAC;iBAAM,CAAC;gBACN,KAAK,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACpB,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,IAAI,SAAS;QACX,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,UAAU,CAAC,KAAa;QACtB,OAAO,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,QAAQ,CAAC,EAAU;QACjB,OAAO,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;IACnC,CAAC;CACF;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,iBAAiB,CAAC,QAAgB;IACtD,MAAM,EAAE,GAAG,MAAM,MAAM,CAAC,aAAa,CAAC,CAAC;IACvC,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAErD,gBAAgB;IAChB,IAAI,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACjE,OAAO,oBAAoB,CAAC,OAAO,CAAC,CAAC;IACvC,CAAC;SAAM,CAAC;QACN,OAAO,UAAU,CAAC,OAAO,CAAC,CAAC;IAC7B,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,oBAAoB,CAAC,OAAe;IAClD,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IAExC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAsB,CAAC;QAExD,2BAA2B;QAC3B,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;YACxB,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;gBACxC,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,OAAO,EAAE,KAAK,CAAC,EAAE,CAAC,CAAC;YACrC,CAAC;QACH,CAAC;QAED,uBAAuB;QACvB,IAAI,MAAM,CAAC,KAAK,EAAE,KAAK,EAAE,CAAC;YACxB,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBACtC,gDAAgD;gBAChD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;oBACnD,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;oBACpC,IAAI,KAAK,IAAI,OAAO,KAAK,CAAC,CAAC,CAAC,KAAK,QAAQ,EAAE,CAAC;wBAC1C,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;oBACzB,CAAC;gBACH,CAAC;YACH,CAAC;iBAAM,CAAC;gBACN,mDAAmD;gBACnD,KAAK,MAAM,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;oBAC7D,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,EAAY,CAAC,CAAC;gBACjC,CAAC;YACH,CAAC;QACH,CAAC;IACH,CAAC;IAAC,OAAO,CAAC,EAAE,CAAC;QACX,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,EAAE,CAAC,CAAC;IAC1D,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,UAAU,CAAC,OAAe;IACxC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAElC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC;QAC/B,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5C,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QACtB,CAAC;IACH,CAAC;IAED,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,eAAe;IAC7B,MAAM,MAAM,GAAG;QACb,KAAK;QACL,OAAO;QACP,MAAM;QACN,OAAO;QACP,QAAQ;QACR,OAAO;QACP,QAAQ;QACR,OAAO;QACP,OAAO;QACP,OAAO;QACP,KAAK;QACL,SAAS;QACT,GAAG;KACJ,CAAC;IAEF,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAC;IACxC,MAAM,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,EAAE;QAC9B,KAAK,CAAC,GAAG,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;IAC1B,CAAC,CAAC,CAAC;IAEH,OAAO,KAAK,CAAC;AACf,CAAC"}
|