@storyteller-platform/align 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/align/__tests__/align.test.cjs +6 -5
- package/dist/align/__tests__/align.test.js +6 -5
- package/dist/align/align.cjs +133 -81
- package/dist/align/align.d.cts +1 -0
- package/dist/align/align.d.ts +1 -0
- package/dist/align/align.js +133 -81
- package/dist/align/getSentenceRanges.cjs +78 -149
- package/dist/align/getSentenceRanges.d.cts +1 -1
- package/dist/align/getSentenceRanges.d.ts +1 -1
- package/dist/align/getSentenceRanges.js +78 -149
- package/dist/align/slugify.cjs +2 -0
- package/dist/align/slugify.js +2 -0
- package/dist/errorAlign/__tests__/errorAlign.test.cjs +100 -0
- package/dist/errorAlign/__tests__/errorAlign.test.d.cts +2 -0
- package/dist/errorAlign/__tests__/errorAlign.test.d.ts +2 -0
- package/dist/errorAlign/__tests__/errorAlign.test.js +77 -0
- package/dist/errorAlign/__tests__/native.test.cjs +118 -0
- package/dist/errorAlign/__tests__/native.test.d.cts +2 -0
- package/dist/errorAlign/__tests__/native.test.d.ts +2 -0
- package/dist/errorAlign/__tests__/native.test.js +107 -0
- package/dist/errorAlign/backtraceGraph.cjs +298 -0
- package/dist/errorAlign/backtraceGraph.d.cts +103 -0
- package/dist/errorAlign/backtraceGraph.d.ts +103 -0
- package/dist/errorAlign/backtraceGraph.js +270 -0
- package/dist/errorAlign/beamSearch.cjs +302 -0
- package/dist/errorAlign/beamSearch.d.cts +53 -0
- package/dist/errorAlign/beamSearch.d.ts +53 -0
- package/dist/errorAlign/beamSearch.js +268 -0
- package/dist/errorAlign/core.cjs +33 -0
- package/dist/errorAlign/core.d.cts +5 -0
- package/dist/errorAlign/core.d.ts +5 -0
- package/dist/errorAlign/core.js +11 -0
- package/dist/errorAlign/editDistance.cjs +115 -0
- package/dist/errorAlign/editDistance.d.cts +46 -0
- package/dist/errorAlign/editDistance.d.ts +46 -0
- package/dist/errorAlign/editDistance.js +90 -0
- package/dist/errorAlign/errorAlign.cjs +159 -0
- package/dist/errorAlign/errorAlign.d.cts +15 -0
- package/dist/errorAlign/errorAlign.d.ts +15 -0
- package/dist/errorAlign/errorAlign.js +145 -0
- package/dist/errorAlign/graphMetadata.cjs +97 -0
- package/dist/errorAlign/graphMetadata.d.cts +44 -0
- package/dist/errorAlign/graphMetadata.d.ts +44 -0
- package/dist/errorAlign/graphMetadata.js +64 -0
- package/dist/errorAlign/hash.cjs +173 -0
- package/dist/errorAlign/hash.d.cts +28 -0
- package/dist/errorAlign/hash.d.ts +28 -0
- package/dist/errorAlign/hash.js +150 -0
- package/dist/errorAlign/native.cjs +60 -0
- package/dist/errorAlign/native.d.cts +18 -0
- package/dist/errorAlign/native.d.ts +18 -0
- package/dist/errorAlign/native.js +24 -0
- package/dist/errorAlign/node-gyp-build.d.cjs +1 -0
- package/dist/errorAlign/node-gyp-build.d.d.cts +3 -0
- package/dist/errorAlign/node-gyp-build.d.d.ts +3 -0
- package/dist/errorAlign/node-gyp-build.d.js +0 -0
- package/dist/errorAlign/pathToAlignment.cjs +122 -0
- package/dist/errorAlign/pathToAlignment.d.cts +11 -0
- package/dist/errorAlign/pathToAlignment.d.ts +11 -0
- package/dist/errorAlign/pathToAlignment.js +89 -0
- package/dist/errorAlign/utils.cjs +301 -0
- package/dist/errorAlign/utils.d.cts +107 -0
- package/dist/errorAlign/utils.d.ts +107 -0
- package/dist/errorAlign/utils.js +248 -0
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/markup/__tests__/markup.test.cjs +108 -81
- package/dist/markup/__tests__/markup.test.js +109 -82
- package/dist/markup/__tests__/parseDom.test.cjs +112 -0
- package/dist/markup/__tests__/parseDom.test.d.cts +2 -0
- package/dist/markup/__tests__/parseDom.test.d.ts +2 -0
- package/dist/markup/__tests__/parseDom.test.js +89 -0
- package/dist/markup/__tests__/serializeDom.test.cjs +120 -0
- package/dist/markup/__tests__/serializeDom.test.d.cts +2 -0
- package/dist/markup/__tests__/serializeDom.test.d.ts +2 -0
- package/dist/markup/__tests__/serializeDom.test.js +97 -0
- package/dist/markup/__tests__/transform.test.cjs +122 -0
- package/dist/markup/__tests__/transform.test.d.cts +2 -0
- package/dist/markup/__tests__/transform.test.d.ts +2 -0
- package/dist/markup/__tests__/transform.test.js +99 -0
- package/dist/markup/map.cjs +261 -0
- package/dist/markup/map.d.cts +50 -0
- package/dist/markup/map.d.ts +50 -0
- package/dist/markup/map.js +236 -0
- package/dist/markup/markup.cjs +23 -201
- package/dist/markup/markup.d.cts +5 -9
- package/dist/markup/markup.d.ts +5 -9
- package/dist/markup/markup.js +24 -203
- package/dist/markup/model.cjs +172 -0
- package/dist/markup/model.d.cts +57 -0
- package/dist/markup/model.d.ts +57 -0
- package/dist/markup/model.js +145 -0
- package/dist/markup/parseDom.cjs +59 -0
- package/dist/markup/parseDom.d.cts +7 -0
- package/dist/markup/parseDom.d.ts +7 -0
- package/dist/markup/parseDom.js +35 -0
- package/dist/markup/segmentation.cjs +11 -57
- package/dist/markup/segmentation.d.cts +6 -2
- package/dist/markup/segmentation.d.ts +6 -2
- package/dist/markup/segmentation.js +11 -58
- package/dist/markup/serializeDom.cjs +87 -0
- package/dist/markup/serializeDom.d.cts +7 -0
- package/dist/markup/serializeDom.d.ts +7 -0
- package/dist/markup/serializeDom.js +63 -0
- package/dist/markup/transform.cjs +92 -0
- package/dist/markup/transform.d.cts +11 -0
- package/dist/markup/transform.d.ts +11 -0
- package/dist/markup/transform.js +71 -0
- package/dist/types/node-gyp-build.d.cjs +1 -0
- package/dist/types/node-gyp-build.d.d.cts +3 -0
- package/dist/types/node-gyp-build.d.d.ts +3 -0
- package/dist/types/node-gyp-build.d.js +0 -0
- package/package.json +11 -4
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
var errorAlign_exports = {};
|
|
20
|
+
__export(errorAlign_exports, {
|
|
21
|
+
errorAlign: () => errorAlign
|
|
22
|
+
});
|
|
23
|
+
module.exports = __toCommonJS(errorAlign_exports);
|
|
24
|
+
var import_itertools = require("itertools");
|
|
25
|
+
var import_backtraceGraph = require("./backtraceGraph.cjs");
|
|
26
|
+
var import_graphMetadata = require("./graphMetadata.cjs");
|
|
27
|
+
var import_native = require("./native.cjs");
|
|
28
|
+
var import_pathToAlignment = require("./pathToAlignment.cjs");
|
|
29
|
+
var import_utils = require("./utils.cjs");
|
|
30
|
+
function errorAlign(ref, hyp, tokenizer = import_utils.basicTokenizer, normalizer = import_utils.basicNormalizer, beamSize = 100, wordLevelPass = true) {
|
|
31
|
+
const graphMetadata = prepareGraphMetadata(ref, hyp, tokenizer, normalizer);
|
|
32
|
+
if (graphMetadata.refNorm === graphMetadata.hypNorm) {
|
|
33
|
+
return alignIdenticalInputs(graphMetadata);
|
|
34
|
+
}
|
|
35
|
+
if (!wordLevelPass) {
|
|
36
|
+
return alignBeamSearch(graphMetadata, beamSize);
|
|
37
|
+
}
|
|
38
|
+
return alignWithWordLevelPass(graphMetadata, beamSize);
|
|
39
|
+
}
|
|
40
|
+
function prepareGraphMetadata(ref, hyp, tokenizer = import_utils.basicTokenizer, normalizer = import_utils.basicNormalizer) {
|
|
41
|
+
const unpackedTokenizer = (0, import_utils.unpackRegexMatch)(tokenizer);
|
|
42
|
+
const refTokenMatches = unpackedTokenizer(ref);
|
|
43
|
+
const hypTokenMatches = unpackedTokenizer(hyp);
|
|
44
|
+
const ensuredNormalizer = (0, import_utils.ensureLengthPreservation)(normalizer);
|
|
45
|
+
const refNorm = refTokenMatches.map(([r]) => ensuredNormalizer(r));
|
|
46
|
+
const hypNorm = hypTokenMatches.map(([h]) => ensuredNormalizer(h));
|
|
47
|
+
return {
|
|
48
|
+
refRaw: ref,
|
|
49
|
+
hypRaw: hyp,
|
|
50
|
+
refTokenMatches,
|
|
51
|
+
hypTokenMatches,
|
|
52
|
+
refNorm,
|
|
53
|
+
hypNorm
|
|
54
|
+
};
|
|
55
|
+
}
|
|
56
|
+
function alignIdenticalInputs(graphMetadata) {
|
|
57
|
+
const alignments = [];
|
|
58
|
+
for (const i of (0, import_itertools.range)(graphMetadata.refTokenMatches.length)) {
|
|
59
|
+
const alignment = getMatchAlignmentFromTokenIndices(graphMetadata, {
|
|
60
|
+
refIndex: i,
|
|
61
|
+
hypIndex: i
|
|
62
|
+
});
|
|
63
|
+
alignments.push(alignment);
|
|
64
|
+
}
|
|
65
|
+
return alignments;
|
|
66
|
+
}
|
|
67
|
+
function alignBeamSearch(graphMetadata, beamSize, refStart, refEnd, hypStart, hypEnd) {
|
|
68
|
+
const src = new import_graphMetadata.SubgraphMetadata(
|
|
69
|
+
graphMetadata.refRaw,
|
|
70
|
+
graphMetadata.hypRaw,
|
|
71
|
+
graphMetadata.refTokenMatches.slice(refStart, refEnd),
|
|
72
|
+
graphMetadata.hypTokenMatches.slice(hypStart, hypEnd),
|
|
73
|
+
graphMetadata.refNorm.slice(refStart, refEnd),
|
|
74
|
+
graphMetadata.hypNorm.slice(hypStart, hypEnd)
|
|
75
|
+
);
|
|
76
|
+
const path = (0, import_native.errorAlignBeamSearch)(src, beamSize);
|
|
77
|
+
return (0, import_pathToAlignment.getAlignments)(path);
|
|
78
|
+
}
|
|
79
|
+
function alignWithWordLevelPass(graphMetadata, beamSize) {
|
|
80
|
+
const { backtraceMatrix } = (0, import_native.computeLevenshteinDistanceMatrix)(
|
|
81
|
+
graphMetadata.refNorm,
|
|
82
|
+
graphMetadata.hypNorm,
|
|
83
|
+
true
|
|
84
|
+
);
|
|
85
|
+
const backtraceGraph = new import_backtraceGraph.BacktraceGraph(backtraceMatrix);
|
|
86
|
+
const matchIndices = backtraceGraph.getUnambiguousNodeMatches();
|
|
87
|
+
matchIndices.push([
|
|
88
|
+
graphMetadata.hypNorm.length,
|
|
89
|
+
graphMetadata.refNorm.length
|
|
90
|
+
]);
|
|
91
|
+
let hypStart = 0;
|
|
92
|
+
let refStart = 0;
|
|
93
|
+
const alignments = [];
|
|
94
|
+
const endIndex = matchIndices.length - 1;
|
|
95
|
+
for (const [i, [hypEnd, refEnd]] of (0, import_itertools.enumerate)(matchIndices)) {
|
|
96
|
+
const refIsEmpty = refStart === refEnd;
|
|
97
|
+
const hypIsEmpty = hypStart === hypEnd;
|
|
98
|
+
if (!refIsEmpty && !hypIsEmpty) {
|
|
99
|
+
alignments.push(
|
|
100
|
+
...alignBeamSearch(
|
|
101
|
+
graphMetadata,
|
|
102
|
+
beamSize,
|
|
103
|
+
refStart,
|
|
104
|
+
refEnd,
|
|
105
|
+
hypStart,
|
|
106
|
+
hypEnd
|
|
107
|
+
)
|
|
108
|
+
);
|
|
109
|
+
} else if (refIsEmpty && !hypIsEmpty) {
|
|
110
|
+
for (const tokenIndex of (0, import_itertools.range)(hypStart, hypEnd)) {
|
|
111
|
+
alignments.push(
|
|
112
|
+
getInsertAlignmentFromTokenIndex(graphMetadata, tokenIndex)
|
|
113
|
+
);
|
|
114
|
+
}
|
|
115
|
+
} else if (hypIsEmpty && !refIsEmpty) {
|
|
116
|
+
for (const tokenIndex of (0, import_itertools.range)(refStart, refEnd)) {
|
|
117
|
+
alignments.push(
|
|
118
|
+
getDeleteAlignmentFromTokenIndex(graphMetadata, tokenIndex)
|
|
119
|
+
);
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
if (i < endIndex) {
|
|
123
|
+
alignments.push(
|
|
124
|
+
getMatchAlignmentFromTokenIndices(graphMetadata, {
|
|
125
|
+
refIndex: refEnd,
|
|
126
|
+
hypIndex: hypEnd
|
|
127
|
+
})
|
|
128
|
+
);
|
|
129
|
+
}
|
|
130
|
+
refStart = refEnd + 1;
|
|
131
|
+
hypStart = hypEnd + 1;
|
|
132
|
+
}
|
|
133
|
+
return alignments;
|
|
134
|
+
}
|
|
135
|
+
function getMatchAlignmentFromTokenIndices(graphMetadata, { refIndex, hypIndex }) {
|
|
136
|
+
const refSlice = graphMetadata.refTokenMatches[refIndex][1];
|
|
137
|
+
const hypSlice = graphMetadata.hypTokenMatches[hypIndex][1];
|
|
138
|
+
return new import_utils.Alignment(
|
|
139
|
+
"MATCH",
|
|
140
|
+
refSlice,
|
|
141
|
+
hypSlice,
|
|
142
|
+
graphMetadata.refRaw.slice(...refSlice),
|
|
143
|
+
graphMetadata.hypRaw.slice(...hypSlice)
|
|
144
|
+
);
|
|
145
|
+
}
|
|
146
|
+
function getInsertAlignmentFromTokenIndex(graphMetadata, hypIndex) {
|
|
147
|
+
const slice = graphMetadata.hypTokenMatches[hypIndex][1];
|
|
148
|
+
const token = graphMetadata.hypRaw.slice(...slice);
|
|
149
|
+
return new import_utils.Alignment("INSERT", null, slice, null, token);
|
|
150
|
+
}
|
|
151
|
+
function getDeleteAlignmentFromTokenIndex(graphMetadata, refIndex) {
|
|
152
|
+
const slice = graphMetadata.refTokenMatches[refIndex][1];
|
|
153
|
+
const token = graphMetadata.refRaw.slice(...slice);
|
|
154
|
+
return new import_utils.Alignment("DELETE", slice, null, token);
|
|
155
|
+
}
|
|
156
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
157
|
+
0 && (module.exports = {
|
|
158
|
+
errorAlign
|
|
159
|
+
});
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { basicTokenizer, basicNormalizer, Alignment } from './utils.cjs';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Run error alignment between reference and hypothesis texts.
|
|
5
|
+
*
|
|
6
|
+
* @param ref The reference sequence/transcript.
|
|
7
|
+
* @param hyp The hypothesis sequence/transcript.
|
|
8
|
+
* @param tokenizer A function to tokenize the sequence. Must be regex-based and return Match objects.
|
|
9
|
+
* @param normalizer A function to normalize the tokens. Defaults to basicNormalizer.
|
|
10
|
+
* @param beamSize The beam size for the beam search alignment.
|
|
11
|
+
* @param wordLevelPass Whether to perform a word-level aligment pass to identify unambiguous matches.
|
|
12
|
+
*/
|
|
13
|
+
declare function errorAlign(ref: string, hyp: string, tokenizer?: typeof basicTokenizer, normalizer?: typeof basicNormalizer, beamSize?: number, wordLevelPass?: boolean): Alignment[];
|
|
14
|
+
|
|
15
|
+
export { errorAlign };
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { basicTokenizer, basicNormalizer, Alignment } from './utils.js';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Run error alignment between reference and hypothesis texts.
|
|
5
|
+
*
|
|
6
|
+
* @param ref The reference sequence/transcript.
|
|
7
|
+
* @param hyp The hypothesis sequence/transcript.
|
|
8
|
+
* @param tokenizer A function to tokenize the sequence. Must be regex-based and return Match objects.
|
|
9
|
+
* @param normalizer A function to normalize the tokens. Defaults to basicNormalizer.
|
|
10
|
+
* @param beamSize The beam size for the beam search alignment.
|
|
11
|
+
 * @param wordLevelPass Whether to perform a word-level alignment pass to identify unambiguous matches.
|
|
12
|
+
*/
|
|
13
|
+
declare function errorAlign(ref: string, hyp: string, tokenizer?: typeof basicTokenizer, normalizer?: typeof basicNormalizer, beamSize?: number, wordLevelPass?: boolean): Alignment[];
|
|
14
|
+
|
|
15
|
+
export { errorAlign };
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import "../chunk-BIEQXUOY.js";
|
|
2
|
+
import { enumerate, range } from "itertools";
|
|
3
|
+
import { BacktraceGraph } from "./backtraceGraph.js";
|
|
4
|
+
import { SubgraphMetadata } from "./graphMetadata.js";
|
|
5
|
+
import {
|
|
6
|
+
computeLevenshteinDistanceMatrix,
|
|
7
|
+
errorAlignBeamSearch
|
|
8
|
+
} from "./native.js";
|
|
9
|
+
import { getAlignments } from "./pathToAlignment.js";
|
|
10
|
+
import {
|
|
11
|
+
Alignment,
|
|
12
|
+
basicNormalizer,
|
|
13
|
+
basicTokenizer,
|
|
14
|
+
ensureLengthPreservation,
|
|
15
|
+
unpackRegexMatch
|
|
16
|
+
} from "./utils.js";
|
|
17
|
+
function errorAlign(ref, hyp, tokenizer = basicTokenizer, normalizer = basicNormalizer, beamSize = 100, wordLevelPass = true) {
|
|
18
|
+
const graphMetadata = prepareGraphMetadata(ref, hyp, tokenizer, normalizer);
|
|
19
|
+
if (graphMetadata.refNorm === graphMetadata.hypNorm) {
|
|
20
|
+
return alignIdenticalInputs(graphMetadata);
|
|
21
|
+
}
|
|
22
|
+
if (!wordLevelPass) {
|
|
23
|
+
return alignBeamSearch(graphMetadata, beamSize);
|
|
24
|
+
}
|
|
25
|
+
return alignWithWordLevelPass(graphMetadata, beamSize);
|
|
26
|
+
}
|
|
27
|
+
function prepareGraphMetadata(ref, hyp, tokenizer = basicTokenizer, normalizer = basicNormalizer) {
|
|
28
|
+
const unpackedTokenizer = unpackRegexMatch(tokenizer);
|
|
29
|
+
const refTokenMatches = unpackedTokenizer(ref);
|
|
30
|
+
const hypTokenMatches = unpackedTokenizer(hyp);
|
|
31
|
+
const ensuredNormalizer = ensureLengthPreservation(normalizer);
|
|
32
|
+
const refNorm = refTokenMatches.map(([r]) => ensuredNormalizer(r));
|
|
33
|
+
const hypNorm = hypTokenMatches.map(([h]) => ensuredNormalizer(h));
|
|
34
|
+
return {
|
|
35
|
+
refRaw: ref,
|
|
36
|
+
hypRaw: hyp,
|
|
37
|
+
refTokenMatches,
|
|
38
|
+
hypTokenMatches,
|
|
39
|
+
refNorm,
|
|
40
|
+
hypNorm
|
|
41
|
+
};
|
|
42
|
+
}
|
|
43
|
+
function alignIdenticalInputs(graphMetadata) {
|
|
44
|
+
const alignments = [];
|
|
45
|
+
for (const i of range(graphMetadata.refTokenMatches.length)) {
|
|
46
|
+
const alignment = getMatchAlignmentFromTokenIndices(graphMetadata, {
|
|
47
|
+
refIndex: i,
|
|
48
|
+
hypIndex: i
|
|
49
|
+
});
|
|
50
|
+
alignments.push(alignment);
|
|
51
|
+
}
|
|
52
|
+
return alignments;
|
|
53
|
+
}
|
|
54
|
+
function alignBeamSearch(graphMetadata, beamSize, refStart, refEnd, hypStart, hypEnd) {
|
|
55
|
+
const src = new SubgraphMetadata(
|
|
56
|
+
graphMetadata.refRaw,
|
|
57
|
+
graphMetadata.hypRaw,
|
|
58
|
+
graphMetadata.refTokenMatches.slice(refStart, refEnd),
|
|
59
|
+
graphMetadata.hypTokenMatches.slice(hypStart, hypEnd),
|
|
60
|
+
graphMetadata.refNorm.slice(refStart, refEnd),
|
|
61
|
+
graphMetadata.hypNorm.slice(hypStart, hypEnd)
|
|
62
|
+
);
|
|
63
|
+
const path = errorAlignBeamSearch(src, beamSize);
|
|
64
|
+
return getAlignments(path);
|
|
65
|
+
}
|
|
66
|
+
function alignWithWordLevelPass(graphMetadata, beamSize) {
|
|
67
|
+
const { backtraceMatrix } = computeLevenshteinDistanceMatrix(
|
|
68
|
+
graphMetadata.refNorm,
|
|
69
|
+
graphMetadata.hypNorm,
|
|
70
|
+
true
|
|
71
|
+
);
|
|
72
|
+
const backtraceGraph = new BacktraceGraph(backtraceMatrix);
|
|
73
|
+
const matchIndices = backtraceGraph.getUnambiguousNodeMatches();
|
|
74
|
+
matchIndices.push([
|
|
75
|
+
graphMetadata.hypNorm.length,
|
|
76
|
+
graphMetadata.refNorm.length
|
|
77
|
+
]);
|
|
78
|
+
let hypStart = 0;
|
|
79
|
+
let refStart = 0;
|
|
80
|
+
const alignments = [];
|
|
81
|
+
const endIndex = matchIndices.length - 1;
|
|
82
|
+
for (const [i, [hypEnd, refEnd]] of enumerate(matchIndices)) {
|
|
83
|
+
const refIsEmpty = refStart === refEnd;
|
|
84
|
+
const hypIsEmpty = hypStart === hypEnd;
|
|
85
|
+
if (!refIsEmpty && !hypIsEmpty) {
|
|
86
|
+
alignments.push(
|
|
87
|
+
...alignBeamSearch(
|
|
88
|
+
graphMetadata,
|
|
89
|
+
beamSize,
|
|
90
|
+
refStart,
|
|
91
|
+
refEnd,
|
|
92
|
+
hypStart,
|
|
93
|
+
hypEnd
|
|
94
|
+
)
|
|
95
|
+
);
|
|
96
|
+
} else if (refIsEmpty && !hypIsEmpty) {
|
|
97
|
+
for (const tokenIndex of range(hypStart, hypEnd)) {
|
|
98
|
+
alignments.push(
|
|
99
|
+
getInsertAlignmentFromTokenIndex(graphMetadata, tokenIndex)
|
|
100
|
+
);
|
|
101
|
+
}
|
|
102
|
+
} else if (hypIsEmpty && !refIsEmpty) {
|
|
103
|
+
for (const tokenIndex of range(refStart, refEnd)) {
|
|
104
|
+
alignments.push(
|
|
105
|
+
getDeleteAlignmentFromTokenIndex(graphMetadata, tokenIndex)
|
|
106
|
+
);
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
if (i < endIndex) {
|
|
110
|
+
alignments.push(
|
|
111
|
+
getMatchAlignmentFromTokenIndices(graphMetadata, {
|
|
112
|
+
refIndex: refEnd,
|
|
113
|
+
hypIndex: hypEnd
|
|
114
|
+
})
|
|
115
|
+
);
|
|
116
|
+
}
|
|
117
|
+
refStart = refEnd + 1;
|
|
118
|
+
hypStart = hypEnd + 1;
|
|
119
|
+
}
|
|
120
|
+
return alignments;
|
|
121
|
+
}
|
|
122
|
+
function getMatchAlignmentFromTokenIndices(graphMetadata, { refIndex, hypIndex }) {
|
|
123
|
+
const refSlice = graphMetadata.refTokenMatches[refIndex][1];
|
|
124
|
+
const hypSlice = graphMetadata.hypTokenMatches[hypIndex][1];
|
|
125
|
+
return new Alignment(
|
|
126
|
+
"MATCH",
|
|
127
|
+
refSlice,
|
|
128
|
+
hypSlice,
|
|
129
|
+
graphMetadata.refRaw.slice(...refSlice),
|
|
130
|
+
graphMetadata.hypRaw.slice(...hypSlice)
|
|
131
|
+
);
|
|
132
|
+
}
|
|
133
|
+
function getInsertAlignmentFromTokenIndex(graphMetadata, hypIndex) {
|
|
134
|
+
const slice = graphMetadata.hypTokenMatches[hypIndex][1];
|
|
135
|
+
const token = graphMetadata.hypRaw.slice(...slice);
|
|
136
|
+
return new Alignment("INSERT", null, slice, null, token);
|
|
137
|
+
}
|
|
138
|
+
function getDeleteAlignmentFromTokenIndex(graphMetadata, refIndex) {
|
|
139
|
+
const slice = graphMetadata.refTokenMatches[refIndex][1];
|
|
140
|
+
const token = graphMetadata.refRaw.slice(...slice);
|
|
141
|
+
return new Alignment("DELETE", slice, null, token);
|
|
142
|
+
}
|
|
143
|
+
export {
|
|
144
|
+
errorAlign
|
|
145
|
+
};
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __defProp = Object.defineProperty;
|
|
4
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
7
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __export = (target, all) => {
|
|
9
|
+
for (var name in all)
|
|
10
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
11
|
+
};
|
|
12
|
+
var __copyProps = (to, from, except, desc) => {
|
|
13
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
14
|
+
for (let key of __getOwnPropNames(from))
|
|
15
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
16
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
17
|
+
}
|
|
18
|
+
return to;
|
|
19
|
+
};
|
|
20
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
21
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
22
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
23
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
24
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
25
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
26
|
+
mod
|
|
27
|
+
));
|
|
28
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
|
+
var graphMetadata_exports = {};
|
|
30
|
+
__export(graphMetadata_exports, {
|
|
31
|
+
SubgraphMetadata: () => SubgraphMetadata
|
|
32
|
+
});
|
|
33
|
+
module.exports = __toCommonJS(graphMetadata_exports);
|
|
34
|
+
var import_itertools = require("itertools");
|
|
35
|
+
var import_memoize = __toESM(require("memoize"), 1);
|
|
36
|
+
var import_backtraceGraph = require("./backtraceGraph.cjs");
|
|
37
|
+
var import_editDistance = require("./editDistance.cjs");
|
|
38
|
+
var import_utils = require("./utils.cjs");
|
|
39
|
+
class SubgraphMetadata {
|
|
40
|
+
constructor(refRaw, hypRaw, refTokenMatches, hypTokenMatches, refNorm, hypNorm) {
|
|
41
|
+
this.refRaw = refRaw;
|
|
42
|
+
this.hypRaw = hypRaw;
|
|
43
|
+
this.refTokenMatches = refTokenMatches;
|
|
44
|
+
this.hypTokenMatches = hypTokenMatches;
|
|
45
|
+
this.refNorm = refNorm;
|
|
46
|
+
this.hypNorm = hypNorm;
|
|
47
|
+
this.ref = embedTokens(refNorm);
|
|
48
|
+
this.hyp = embedTokens(hypNorm);
|
|
49
|
+
this.refMaxIndex = this.ref.length - 1;
|
|
50
|
+
this.hypMaxIndex = this.hyp.length - 1;
|
|
51
|
+
this.refCharTypes = getCharTypes(this.ref);
|
|
52
|
+
this.hypCharTypes = getCharTypes(this.hyp);
|
|
53
|
+
this.refIndexMap = createIndexMap(refTokenMatches);
|
|
54
|
+
this.hypIndexMap = createIndexMap(hypTokenMatches);
|
|
55
|
+
const { backtraceMatrix } = (0, import_editDistance.computeErrorAlignDistanceMatrix)(
|
|
56
|
+
this.ref,
|
|
57
|
+
this.hyp,
|
|
58
|
+
true
|
|
59
|
+
);
|
|
60
|
+
this.backtraceGraph = new import_backtraceGraph.BacktraceGraph(backtraceMatrix);
|
|
61
|
+
this.backtraceNodeSet = this.backtraceGraph.getNodeSet();
|
|
62
|
+
this.unambiguousMatches = this.backtraceGraph.getUnambiguousTokenSpanMatches(this.ref);
|
|
63
|
+
}
|
|
64
|
+
ref;
|
|
65
|
+
hyp;
|
|
66
|
+
refMaxIndex;
|
|
67
|
+
hypMaxIndex;
|
|
68
|
+
refCharTypes;
|
|
69
|
+
hypCharTypes;
|
|
70
|
+
refIndexMap;
|
|
71
|
+
hypIndexMap;
|
|
72
|
+
backtraceGraph;
|
|
73
|
+
backtraceNodeSet;
|
|
74
|
+
unambiguousMatches;
|
|
75
|
+
}
|
|
76
|
+
function embedTokens(textTokens) {
|
|
77
|
+
return textTokens.map((t) => `${import_utils.START_DELIMITER}${t}${import_utils.END_DELIMITER}`).join("");
|
|
78
|
+
}
|
|
79
|
+
const categorizeCharCached = (0, import_memoize.default)(function categorizeCharCached2(c) {
|
|
80
|
+
return (0, import_utils.categorizeChar)(c);
|
|
81
|
+
});
|
|
82
|
+
function getCharTypes(text) {
|
|
83
|
+
return text.split("").map((c) => categorizeCharCached(c));
|
|
84
|
+
}
|
|
85
|
+
function createIndexMap(textTokens) {
|
|
86
|
+
const indexMap = [];
|
|
87
|
+
for (const [_, span] of textTokens) {
|
|
88
|
+
indexMap.push(-1);
|
|
89
|
+
indexMap.push(...(0, import_itertools.range)(...span));
|
|
90
|
+
indexMap.push(-1);
|
|
91
|
+
}
|
|
92
|
+
return indexMap;
|
|
93
|
+
}
|
|
94
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
95
|
+
0 && (module.exports = {
|
|
96
|
+
SubgraphMetadata
|
|
97
|
+
});
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { BacktraceGraph, Index } from './backtraceGraph.cjs';
|
|
2
|
+
import './utils.cjs';
|
|
3
|
+
|
|
4
|
+
type TokenWithSpan = [string, [number, number]];
|
|
5
|
+
interface GraphMetadata {
|
|
6
|
+
refRaw: string;
|
|
7
|
+
hypRaw: string;
|
|
8
|
+
refTokenMatches: TokenWithSpan[];
|
|
9
|
+
hypTokenMatches: TokenWithSpan[];
|
|
10
|
+
refNorm: string[];
|
|
11
|
+
hypNorm: string[];
|
|
12
|
+
}
|
|
13
|
+
/**
|
|
14
|
+
* Data class to hold information needed for beam search alignment.
|
|
15
|
+
*
|
|
16
|
+
 * This data class encapsulates all necessary information about a subgraph
|
|
17
|
+
* derived from the reference and hypothesis texts, including their tokenized
|
|
18
|
+
* and normalized forms, as well as derived attributes used during
|
|
19
|
+
* the alignment process.
|
|
20
|
+
*
|
|
21
|
+
* It works as a reference for the `Path` class during beam search alignment.
|
|
22
|
+
*/
|
|
23
|
+
declare class SubgraphMetadata {
|
|
24
|
+
refRaw: string;
|
|
25
|
+
hypRaw: string;
|
|
26
|
+
refTokenMatches: [string, [number, number]][];
|
|
27
|
+
hypTokenMatches: [string, [number, number]][];
|
|
28
|
+
refNorm: string[];
|
|
29
|
+
hypNorm: string[];
|
|
30
|
+
ref: string;
|
|
31
|
+
hyp: string;
|
|
32
|
+
refMaxIndex: number;
|
|
33
|
+
hypMaxIndex: number;
|
|
34
|
+
refCharTypes: number[];
|
|
35
|
+
hypCharTypes: number[];
|
|
36
|
+
refIndexMap: number[];
|
|
37
|
+
hypIndexMap: number[];
|
|
38
|
+
backtraceGraph: BacktraceGraph;
|
|
39
|
+
backtraceNodeSet: Set<Index>;
|
|
40
|
+
unambiguousMatches: Set<Index>;
|
|
41
|
+
constructor(refRaw: string, hypRaw: string, refTokenMatches: [string, [number, number]][], hypTokenMatches: [string, [number, number]][], refNorm: string[], hypNorm: string[]);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
export { type GraphMetadata, SubgraphMetadata, type TokenWithSpan };
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { BacktraceGraph, Index } from './backtraceGraph.js';
|
|
2
|
+
import './utils.js';
|
|
3
|
+
|
|
4
|
+
type TokenWithSpan = [string, [number, number]];
|
|
5
|
+
interface GraphMetadata {
|
|
6
|
+
refRaw: string;
|
|
7
|
+
hypRaw: string;
|
|
8
|
+
refTokenMatches: TokenWithSpan[];
|
|
9
|
+
hypTokenMatches: TokenWithSpan[];
|
|
10
|
+
refNorm: string[];
|
|
11
|
+
hypNorm: string[];
|
|
12
|
+
}
|
|
13
|
+
/**
|
|
14
|
+
* Data class to hold information needed for beam search alignment.
|
|
15
|
+
*
|
|
16
|
+
 * This data class encapsulates all necessary information about a subgraph
|
|
17
|
+
* derived from the reference and hypothesis texts, including their tokenized
|
|
18
|
+
* and normalized forms, as well as derived attributes used during
|
|
19
|
+
* the alignment process.
|
|
20
|
+
*
|
|
21
|
+
* It works as a reference for the `Path` class during beam search alignment.
|
|
22
|
+
*/
|
|
23
|
+
declare class SubgraphMetadata {
|
|
24
|
+
refRaw: string;
|
|
25
|
+
hypRaw: string;
|
|
26
|
+
refTokenMatches: [string, [number, number]][];
|
|
27
|
+
hypTokenMatches: [string, [number, number]][];
|
|
28
|
+
refNorm: string[];
|
|
29
|
+
hypNorm: string[];
|
|
30
|
+
ref: string;
|
|
31
|
+
hyp: string;
|
|
32
|
+
refMaxIndex: number;
|
|
33
|
+
hypMaxIndex: number;
|
|
34
|
+
refCharTypes: number[];
|
|
35
|
+
hypCharTypes: number[];
|
|
36
|
+
refIndexMap: number[];
|
|
37
|
+
hypIndexMap: number[];
|
|
38
|
+
backtraceGraph: BacktraceGraph;
|
|
39
|
+
backtraceNodeSet: Set<Index>;
|
|
40
|
+
unambiguousMatches: Set<Index>;
|
|
41
|
+
constructor(refRaw: string, hypRaw: string, refTokenMatches: [string, [number, number]][], hypTokenMatches: [string, [number, number]][], refNorm: string[], hypNorm: string[]);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
export { type GraphMetadata, SubgraphMetadata, type TokenWithSpan };
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import "../chunk-BIEQXUOY.js";
|
|
2
|
+
import { range } from "itertools";
|
|
3
|
+
import memoize from "memoize";
|
|
4
|
+
import { BacktraceGraph } from "./backtraceGraph.js";
|
|
5
|
+
import { computeErrorAlignDistanceMatrix } from "./editDistance.js";
|
|
6
|
+
import { END_DELIMITER, START_DELIMITER, categorizeChar } from "./utils.js";
|
|
7
|
+
class SubgraphMetadata {
|
|
8
|
+
constructor(refRaw, hypRaw, refTokenMatches, hypTokenMatches, refNorm, hypNorm) {
|
|
9
|
+
this.refRaw = refRaw;
|
|
10
|
+
this.hypRaw = hypRaw;
|
|
11
|
+
this.refTokenMatches = refTokenMatches;
|
|
12
|
+
this.hypTokenMatches = hypTokenMatches;
|
|
13
|
+
this.refNorm = refNorm;
|
|
14
|
+
this.hypNorm = hypNorm;
|
|
15
|
+
this.ref = embedTokens(refNorm);
|
|
16
|
+
this.hyp = embedTokens(hypNorm);
|
|
17
|
+
this.refMaxIndex = this.ref.length - 1;
|
|
18
|
+
this.hypMaxIndex = this.hyp.length - 1;
|
|
19
|
+
this.refCharTypes = getCharTypes(this.ref);
|
|
20
|
+
this.hypCharTypes = getCharTypes(this.hyp);
|
|
21
|
+
this.refIndexMap = createIndexMap(refTokenMatches);
|
|
22
|
+
this.hypIndexMap = createIndexMap(hypTokenMatches);
|
|
23
|
+
const { backtraceMatrix } = computeErrorAlignDistanceMatrix(
|
|
24
|
+
this.ref,
|
|
25
|
+
this.hyp,
|
|
26
|
+
true
|
|
27
|
+
);
|
|
28
|
+
this.backtraceGraph = new BacktraceGraph(backtraceMatrix);
|
|
29
|
+
this.backtraceNodeSet = this.backtraceGraph.getNodeSet();
|
|
30
|
+
this.unambiguousMatches = this.backtraceGraph.getUnambiguousTokenSpanMatches(this.ref);
|
|
31
|
+
}
|
|
32
|
+
ref;
|
|
33
|
+
hyp;
|
|
34
|
+
refMaxIndex;
|
|
35
|
+
hypMaxIndex;
|
|
36
|
+
refCharTypes;
|
|
37
|
+
hypCharTypes;
|
|
38
|
+
refIndexMap;
|
|
39
|
+
hypIndexMap;
|
|
40
|
+
backtraceGraph;
|
|
41
|
+
backtraceNodeSet;
|
|
42
|
+
unambiguousMatches;
|
|
43
|
+
}
|
|
44
|
+
function embedTokens(textTokens) {
|
|
45
|
+
return textTokens.map((t) => `${START_DELIMITER}${t}${END_DELIMITER}`).join("");
|
|
46
|
+
}
|
|
47
|
+
const categorizeCharCached = memoize(function categorizeCharCached2(c) {
|
|
48
|
+
return categorizeChar(c);
|
|
49
|
+
});
|
|
50
|
+
function getCharTypes(text) {
|
|
51
|
+
return text.split("").map((c) => categorizeCharCached(c));
|
|
52
|
+
}
|
|
53
|
+
function createIndexMap(textTokens) {
|
|
54
|
+
const indexMap = [];
|
|
55
|
+
for (const [_, span] of textTokens) {
|
|
56
|
+
indexMap.push(-1);
|
|
57
|
+
indexMap.push(...range(...span));
|
|
58
|
+
indexMap.push(-1);
|
|
59
|
+
}
|
|
60
|
+
return indexMap;
|
|
61
|
+
}
|
|
62
|
+
export {
|
|
63
|
+
SubgraphMetadata
|
|
64
|
+
};
|