@storyteller-platform/align 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/align/__tests__/align.test.cjs +6 -5
- package/dist/align/__tests__/align.test.js +6 -5
- package/dist/align/align.cjs +133 -81
- package/dist/align/align.d.cts +1 -0
- package/dist/align/align.d.ts +1 -0
- package/dist/align/align.js +133 -81
- package/dist/align/getSentenceRanges.cjs +78 -149
- package/dist/align/getSentenceRanges.d.cts +1 -1
- package/dist/align/getSentenceRanges.d.ts +1 -1
- package/dist/align/getSentenceRanges.js +78 -149
- package/dist/align/slugify.cjs +2 -0
- package/dist/align/slugify.js +2 -0
- package/dist/errorAlign/__tests__/errorAlign.test.cjs +100 -0
- package/dist/errorAlign/__tests__/errorAlign.test.d.cts +2 -0
- package/dist/errorAlign/__tests__/errorAlign.test.d.ts +2 -0
- package/dist/errorAlign/__tests__/errorAlign.test.js +77 -0
- package/dist/errorAlign/__tests__/native.test.cjs +118 -0
- package/dist/errorAlign/__tests__/native.test.d.cts +2 -0
- package/dist/errorAlign/__tests__/native.test.d.ts +2 -0
- package/dist/errorAlign/__tests__/native.test.js +107 -0
- package/dist/errorAlign/backtraceGraph.cjs +298 -0
- package/dist/errorAlign/backtraceGraph.d.cts +103 -0
- package/dist/errorAlign/backtraceGraph.d.ts +103 -0
- package/dist/errorAlign/backtraceGraph.js +270 -0
- package/dist/errorAlign/beamSearch.cjs +302 -0
- package/dist/errorAlign/beamSearch.d.cts +53 -0
- package/dist/errorAlign/beamSearch.d.ts +53 -0
- package/dist/errorAlign/beamSearch.js +268 -0
- package/dist/errorAlign/core.cjs +33 -0
- package/dist/errorAlign/core.d.cts +5 -0
- package/dist/errorAlign/core.d.ts +5 -0
- package/dist/errorAlign/core.js +11 -0
- package/dist/errorAlign/editDistance.cjs +115 -0
- package/dist/errorAlign/editDistance.d.cts +46 -0
- package/dist/errorAlign/editDistance.d.ts +46 -0
- package/dist/errorAlign/editDistance.js +90 -0
- package/dist/errorAlign/errorAlign.cjs +159 -0
- package/dist/errorAlign/errorAlign.d.cts +15 -0
- package/dist/errorAlign/errorAlign.d.ts +15 -0
- package/dist/errorAlign/errorAlign.js +145 -0
- package/dist/errorAlign/graphMetadata.cjs +97 -0
- package/dist/errorAlign/graphMetadata.d.cts +44 -0
- package/dist/errorAlign/graphMetadata.d.ts +44 -0
- package/dist/errorAlign/graphMetadata.js +64 -0
- package/dist/errorAlign/hash.cjs +173 -0
- package/dist/errorAlign/hash.d.cts +28 -0
- package/dist/errorAlign/hash.d.ts +28 -0
- package/dist/errorAlign/hash.js +150 -0
- package/dist/errorAlign/native.cjs +60 -0
- package/dist/errorAlign/native.d.cts +18 -0
- package/dist/errorAlign/native.d.ts +18 -0
- package/dist/errorAlign/native.js +24 -0
- package/dist/errorAlign/node-gyp-build.d.cjs +1 -0
- package/dist/errorAlign/node-gyp-build.d.d.cts +3 -0
- package/dist/errorAlign/node-gyp-build.d.d.ts +3 -0
- package/dist/errorAlign/node-gyp-build.d.js +0 -0
- package/dist/errorAlign/pathToAlignment.cjs +122 -0
- package/dist/errorAlign/pathToAlignment.d.cts +11 -0
- package/dist/errorAlign/pathToAlignment.d.ts +11 -0
- package/dist/errorAlign/pathToAlignment.js +89 -0
- package/dist/errorAlign/utils.cjs +301 -0
- package/dist/errorAlign/utils.d.cts +107 -0
- package/dist/errorAlign/utils.d.ts +107 -0
- package/dist/errorAlign/utils.js +248 -0
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/markup/__tests__/markup.test.cjs +108 -81
- package/dist/markup/__tests__/markup.test.js +109 -82
- package/dist/markup/__tests__/parseDom.test.cjs +112 -0
- package/dist/markup/__tests__/parseDom.test.d.cts +2 -0
- package/dist/markup/__tests__/parseDom.test.d.ts +2 -0
- package/dist/markup/__tests__/parseDom.test.js +89 -0
- package/dist/markup/__tests__/serializeDom.test.cjs +120 -0
- package/dist/markup/__tests__/serializeDom.test.d.cts +2 -0
- package/dist/markup/__tests__/serializeDom.test.d.ts +2 -0
- package/dist/markup/__tests__/serializeDom.test.js +97 -0
- package/dist/markup/__tests__/transform.test.cjs +122 -0
- package/dist/markup/__tests__/transform.test.d.cts +2 -0
- package/dist/markup/__tests__/transform.test.d.ts +2 -0
- package/dist/markup/__tests__/transform.test.js +99 -0
- package/dist/markup/map.cjs +261 -0
- package/dist/markup/map.d.cts +50 -0
- package/dist/markup/map.d.ts +50 -0
- package/dist/markup/map.js +236 -0
- package/dist/markup/markup.cjs +23 -201
- package/dist/markup/markup.d.cts +5 -9
- package/dist/markup/markup.d.ts +5 -9
- package/dist/markup/markup.js +24 -203
- package/dist/markup/model.cjs +172 -0
- package/dist/markup/model.d.cts +57 -0
- package/dist/markup/model.d.ts +57 -0
- package/dist/markup/model.js +145 -0
- package/dist/markup/parseDom.cjs +59 -0
- package/dist/markup/parseDom.d.cts +7 -0
- package/dist/markup/parseDom.d.ts +7 -0
- package/dist/markup/parseDom.js +35 -0
- package/dist/markup/segmentation.cjs +11 -57
- package/dist/markup/segmentation.d.cts +6 -2
- package/dist/markup/segmentation.d.ts +6 -2
- package/dist/markup/segmentation.js +11 -58
- package/dist/markup/serializeDom.cjs +87 -0
- package/dist/markup/serializeDom.d.cts +7 -0
- package/dist/markup/serializeDom.d.ts +7 -0
- package/dist/markup/serializeDom.js +63 -0
- package/dist/markup/transform.cjs +92 -0
- package/dist/markup/transform.d.cts +11 -0
- package/dist/markup/transform.d.ts +11 -0
- package/dist/markup/transform.js +71 -0
- package/dist/types/node-gyp-build.d.cjs +1 -0
- package/dist/types/node-gyp-build.d.d.cts +3 -0
- package/dist/types/node-gyp-build.d.d.ts +3 -0
- package/dist/types/node-gyp-build.d.js +0 -0
- package/package.json +11 -4
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
import "../chunk-BIEQXUOY.js";
|
|
2
|
+
import assert from "node:assert";
|
|
3
|
+
import { range } from "itertools";
|
|
4
|
+
import {
|
|
5
|
+
Counter,
|
|
6
|
+
END_DELIMITER,
|
|
7
|
+
OP_TYPE_COMBO_MAP,
|
|
8
|
+
START_DELIMITER,
|
|
9
|
+
reversed
|
|
10
|
+
} from "./utils.js";
|
|
11
|
+
class Node {
|
|
12
|
+
constructor(hypIndex, refIndex) {
|
|
13
|
+
this.hypIndex = hypIndex;
|
|
14
|
+
this.refIndex = refIndex;
|
|
15
|
+
}
|
|
16
|
+
children = /* @__PURE__ */ new Map();
|
|
17
|
+
parents = /* @__PURE__ */ new Map();
|
|
18
|
+
get index() {
|
|
19
|
+
return [this.hypIndex, this.refIndex];
|
|
20
|
+
}
|
|
21
|
+
/**
|
|
22
|
+
* Get the offset index of th enode so indices match the hypothesis and reference strings.
|
|
23
|
+
*
|
|
24
|
+
* Root will be at (-1, -1).
|
|
25
|
+
*/
|
|
26
|
+
get offsetIndex() {
|
|
27
|
+
return [this.hypIndex - 1, this.refIndex - 1];
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Check if the node is a terminal node (i.e., it has no children).
|
|
31
|
+
*/
|
|
32
|
+
get isTerminal() {
|
|
33
|
+
return this.children.size === 0;
|
|
34
|
+
}
|
|
35
|
+
/**
|
|
36
|
+
* Check if the node is a root node (i.e., it has no parents).
|
|
37
|
+
*/
|
|
38
|
+
get isRoot() {
|
|
39
|
+
return this.parents.size === 0;
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
class NodeMap {
|
|
43
|
+
map;
|
|
44
|
+
constructor(entries) {
|
|
45
|
+
const keyedEntries = entries == null ? void 0 : entries.map(
|
|
46
|
+
([index, node]) => [`${index[0]}-${index[1]}`, { index, node }]
|
|
47
|
+
);
|
|
48
|
+
this.map = new Map(keyedEntries);
|
|
49
|
+
}
|
|
50
|
+
get([hypIndex, refIndex]) {
|
|
51
|
+
var _a;
|
|
52
|
+
const key = `${hypIndex}-${refIndex}`;
|
|
53
|
+
return (_a = this.map.get(key)) == null ? void 0 : _a.node;
|
|
54
|
+
}
|
|
55
|
+
set([hypIndex, refIndex], node) {
|
|
56
|
+
const key = `${hypIndex}-${refIndex}`;
|
|
57
|
+
this.map.set(key, { index: [hypIndex, refIndex], node });
|
|
58
|
+
}
|
|
59
|
+
has([hypIndex, refIndex]) {
|
|
60
|
+
const key = `${hypIndex}-${refIndex}`;
|
|
61
|
+
return this.map.has(key);
|
|
62
|
+
}
|
|
63
|
+
*entries() {
|
|
64
|
+
for (const { index, node } of this.map.values()) {
|
|
65
|
+
yield [index, node];
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
*values() {
|
|
69
|
+
for (const { node } of this.map.values()) {
|
|
70
|
+
yield node;
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
class BacktraceGraph {
|
|
75
|
+
constructor(backtrackMatrix) {
|
|
76
|
+
this.backtrackMatrix = backtrackMatrix;
|
|
77
|
+
this.hypDim = backtrackMatrix.length;
|
|
78
|
+
this.refDim = backtrackMatrix[0].length;
|
|
79
|
+
this.hypMaxIndex = this.hypDim - 1;
|
|
80
|
+
this.refMaxIndex = this.refDim - 1;
|
|
81
|
+
}
|
|
82
|
+
hypDim;
|
|
83
|
+
refDim;
|
|
84
|
+
hypMaxIndex;
|
|
85
|
+
refMaxIndex;
|
|
86
|
+
_nodes = null;
|
|
87
|
+
/**
|
|
88
|
+
* Get the nodes in the graph.
|
|
89
|
+
*/
|
|
90
|
+
get nodes() {
|
|
91
|
+
if (this._nodes) return this._nodes;
|
|
92
|
+
const terminalNode = new Node(this.hypMaxIndex, this.refMaxIndex);
|
|
93
|
+
this._nodes = new NodeMap([[terminalNode.index, terminalNode]]);
|
|
94
|
+
for (const index of this.iterTopologicalOrder({ reverse: true })) {
|
|
95
|
+
if (this._nodes.has(index) && (index[0] !== 0 || index[1] !== 0)) {
|
|
96
|
+
this.addParentsFromBacktrace(index);
|
|
97
|
+
}
|
|
98
|
+
}
|
|
99
|
+
this._nodes = new NodeMap(
|
|
100
|
+
Array.from(this._nodes.entries()).toSorted(([[a1, a2]], [[b1, b2]]) => {
|
|
101
|
+
if (a1 === b1) return a2 - b2;
|
|
102
|
+
return a1 - b1;
|
|
103
|
+
})
|
|
104
|
+
);
|
|
105
|
+
return this._nodes;
|
|
106
|
+
}
|
|
107
|
+
/**
|
|
108
|
+
* Get the node at the given index.
|
|
109
|
+
*
|
|
110
|
+
* @param hypIndex Hyp/row index.
|
|
111
|
+
* @param refIndex Ref/column index.
|
|
112
|
+
*/
|
|
113
|
+
getNode(hypIndex, refIndex) {
|
|
114
|
+
return this.nodes.get([hypIndex, refIndex]);
|
|
115
|
+
}
|
|
116
|
+
/**
|
|
117
|
+
* Get the set of all node indices in the graph.
|
|
118
|
+
*/
|
|
119
|
+
getNodeSet() {
|
|
120
|
+
const transitions = /* @__PURE__ */ new Set();
|
|
121
|
+
for (const node of this.nodes.values()) {
|
|
122
|
+
transitions.add(node.offsetIndex);
|
|
123
|
+
}
|
|
124
|
+
return transitions;
|
|
125
|
+
}
|
|
126
|
+
/**
|
|
127
|
+
* Get a path through the graph.
|
|
128
|
+
*
|
|
129
|
+
* @param sample If true, sample a path randomly based on transition probabilities.
|
|
130
|
+
* Otherwise, return the first path deterministically.
|
|
131
|
+
* @returns A list of nodes representing the path.
|
|
132
|
+
*/
|
|
133
|
+
getPath({ sample } = { sample: false }) {
|
|
134
|
+
let node = this.getNode(0, 0);
|
|
135
|
+
assert(node == null ? void 0 : node.isRoot, "The node at (-1, -1) was expected to be a root node.");
|
|
136
|
+
const path = [];
|
|
137
|
+
while (!node.isTerminal) {
|
|
138
|
+
const opType = sample ? choose(Array.from(node.children.keys())) : (
|
|
139
|
+
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
140
|
+
node.children.keys().next().value
|
|
141
|
+
);
|
|
142
|
+
node = node.children.get(opType);
|
|
143
|
+
path.push([opType, node]);
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
/**
|
|
147
|
+
* Get nodes that can only be accounted for by a match.
|
|
148
|
+
*
|
|
149
|
+
* @returns A list of index tuples representing the unambiguous node matches.
|
|
150
|
+
*/
|
|
151
|
+
getUnambiguousNodeMatches() {
|
|
152
|
+
const matchIndices = /* @__PURE__ */ new Set();
|
|
153
|
+
const matchPerToken = {
|
|
154
|
+
ref: new Counter(),
|
|
155
|
+
hyp: new Counter()
|
|
156
|
+
};
|
|
157
|
+
const refOpTypes = /* @__PURE__ */ new Set(["MATCH", "SUBSTITUTE", "DELETE"]);
|
|
158
|
+
const hypOpTypes = /* @__PURE__ */ new Set(["MATCH", "SUBSTITUTE", "INSERT"]);
|
|
159
|
+
for (const [[hypIndex, refIndex], node] of this.nodes.entries()) {
|
|
160
|
+
if (node.parents.size === 1 && node.parents.has("MATCH")) {
|
|
161
|
+
matchIndices.add([hypIndex, refIndex]);
|
|
162
|
+
}
|
|
163
|
+
if (refOpTypes.intersection(node.parents).size) {
|
|
164
|
+
matchPerToken.ref.set(refIndex, matchPerToken.ref.get(refIndex) + 1);
|
|
165
|
+
}
|
|
166
|
+
if (hypOpTypes.intersection(node.parents).size) {
|
|
167
|
+
matchPerToken.hyp.set(hypIndex, matchPerToken.hyp.get(hypIndex) + 1);
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
const unambiguousMatches = [];
|
|
171
|
+
for (const [hypIndex, refIndex] of matchIndices) {
|
|
172
|
+
if (matchPerToken.ref.get(refIndex) === 1 && matchPerToken.hyp.get(hypIndex) === 1) {
|
|
173
|
+
unambiguousMatches.push([hypIndex - 1, refIndex - 1]);
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
return unambiguousMatches.toSorted(([_a, a], [_b, b]) => a - b);
|
|
177
|
+
}
|
|
178
|
+
/**
|
|
179
|
+
* Get word spans (i.e., <...>) that are unambiguously matched.
|
|
180
|
+
*
|
|
181
|
+
* That is, there is only one subpath that can account for the span using MATCH operations.
|
|
182
|
+
* Other subpaths that include INSERT, DELETE, SUBSTITUTE operations are not considered.
|
|
183
|
+
*
|
|
184
|
+
* @returns A list of index tuples representing the end node of unambiguous span matches.
|
|
185
|
+
*/
|
|
186
|
+
getUnambiguousTokenSpanMatches(ref) {
|
|
187
|
+
var _a;
|
|
188
|
+
ref = "_" + ref;
|
|
189
|
+
const monoMatchEndNodes = /* @__PURE__ */ new Set();
|
|
190
|
+
const refIndexes = new Counter();
|
|
191
|
+
const hypIndexes = new Counter();
|
|
192
|
+
for (const [[hypIndex, refIndex], node] of this.nodes.entries()) {
|
|
193
|
+
if (!node.parents.has("MATCH") || ref[refIndex] !== START_DELIMITER)
|
|
194
|
+
continue;
|
|
195
|
+
let _refIndex = refIndex + 1;
|
|
196
|
+
let _hypIndex = hypIndex + 1;
|
|
197
|
+
while (true) {
|
|
198
|
+
const _index = [_hypIndex, _refIndex];
|
|
199
|
+
if (!this.nodes.has(_index)) {
|
|
200
|
+
break;
|
|
201
|
+
}
|
|
202
|
+
if (!((_a = this.nodes.get(_index)) == null ? void 0 : _a.parents)) {
|
|
203
|
+
break;
|
|
204
|
+
}
|
|
205
|
+
if (ref[_refIndex] === END_DELIMITER) {
|
|
206
|
+
const endIndex = _index;
|
|
207
|
+
monoMatchEndNodes.add(endIndex);
|
|
208
|
+
refIndexes.set(_refIndex, refIndexes.get(_refIndex) + 1);
|
|
209
|
+
hypIndexes.set(_hypIndex, hypIndexes.get(_hypIndex) + 1);
|
|
210
|
+
break;
|
|
211
|
+
}
|
|
212
|
+
_refIndex += 1;
|
|
213
|
+
_hypIndex += 1;
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
return new Set(
|
|
217
|
+
Array.from(monoMatchEndNodes).filter(([h, r]) => hypIndexes.get(h) === 1 && refIndexes.get(r) === 1).map(([h, r]) => [h - 1, r - 1])
|
|
218
|
+
);
|
|
219
|
+
}
|
|
220
|
+
/**
|
|
221
|
+
* Create a parent node based on the index of the current node and the operation type.
|
|
222
|
+
*/
|
|
223
|
+
parentNodeFromOpType(index, opType) {
|
|
224
|
+
const hypIndex = opType !== "DELETE" ? index[0] - 1 : index[0];
|
|
225
|
+
const refIndex = opType !== "INSERT" ? index[1] - 1 : index[1];
|
|
226
|
+
const parentIndex = [hypIndex, refIndex];
|
|
227
|
+
const nodes = this._nodes;
|
|
228
|
+
assert(!!nodes, "Called parentIndexFromOpType before instantiating _nodes");
|
|
229
|
+
if (!nodes.has(parentIndex)) {
|
|
230
|
+
nodes.set(parentIndex, new Node(...parentIndex));
|
|
231
|
+
}
|
|
232
|
+
return nodes.get(parentIndex);
|
|
233
|
+
}
|
|
234
|
+
/**
|
|
235
|
+
* Iterate through the nodes in topological order.
|
|
236
|
+
*/
|
|
237
|
+
*iterTopologicalOrder({ reverse } = { reverse: false }) {
|
|
238
|
+
for (const i of reverse ? reversed(range(this.hypDim)) : range(this.hypDim)) {
|
|
239
|
+
for (const j of reverse ? reversed(range(this.refDim)) : range(this.refDim)) {
|
|
240
|
+
yield [i, j];
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
}
|
|
244
|
+
/**
|
|
245
|
+
* Add parents to the node at the given index based on the backtrace matrix.
|
|
246
|
+
*/
|
|
247
|
+
addParentsFromBacktrace(index) {
|
|
248
|
+
var _a;
|
|
249
|
+
const node = (_a = this._nodes) == null ? void 0 : _a.get(index);
|
|
250
|
+
assert(
|
|
251
|
+
!!node,
|
|
252
|
+
`Node at index ${index.toString()} does not exist in the graph.`
|
|
253
|
+
);
|
|
254
|
+
const opTypeComboCode = this.backtrackMatrix[node.hypIndex][node.refIndex];
|
|
255
|
+
const opTypeCombo = OP_TYPE_COMBO_MAP[opTypeComboCode];
|
|
256
|
+
for (const opType of opTypeCombo) {
|
|
257
|
+
const parentNode = this.parentNodeFromOpType(node.index, opType);
|
|
258
|
+
node.parents.set(opType, parentNode);
|
|
259
|
+
parentNode.children.set(opType, node);
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
/**
 * Pick one element of `choices` uniformly at random.
 *
 * `Math.random()` is in [0, 1), so flooring `random * length` always gives a
 * valid index in [0, length - 1] with equal weight for each element. (Rounding
 * instead could produce `length`, i.e. an out-of-range index, and gave the
 * first and last elements half the weight of the others.)
 *
 * @param choices Array to pick from.
 * @returns The chosen element, or `undefined` when `choices` is empty.
 */
function choose(choices) {
  const index = Math.floor(Math.random() * choices.length);
  return choices[index];
}
|
|
267
|
+
export {
|
|
268
|
+
BacktraceGraph,
|
|
269
|
+
Node
|
|
270
|
+
};
|
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __create = Object.create;
|
|
3
|
+
var __defProp = Object.defineProperty;
|
|
4
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
5
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
6
|
+
var __getProtoOf = Object.getPrototypeOf;
|
|
7
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
8
|
+
var __export = (target, all) => {
|
|
9
|
+
for (var name in all)
|
|
10
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
11
|
+
};
|
|
12
|
+
var __copyProps = (to, from, except, desc) => {
|
|
13
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
14
|
+
for (let key of __getOwnPropNames(from))
|
|
15
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
16
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
17
|
+
}
|
|
18
|
+
return to;
|
|
19
|
+
};
|
|
20
|
+
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
|
|
21
|
+
// If the importer is in node compatibility mode or this is not an ESM
|
|
22
|
+
// file that has been converted to a CommonJS file using a Babel-
|
|
23
|
+
// compatible transform (i.e. "__esModule" has not been set), then set
|
|
24
|
+
// "default" to the CommonJS "module.exports" for node compatibility.
|
|
25
|
+
isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
|
|
26
|
+
mod
|
|
27
|
+
));
|
|
28
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
|
+
var beamSearch_exports = {};
|
|
30
|
+
__export(beamSearch_exports, {
|
|
31
|
+
Path: () => Path,
|
|
32
|
+
errorAlignBeamSearch: () => errorAlignBeamSearch
|
|
33
|
+
});
|
|
34
|
+
module.exports = __toCommonJS(beamSearch_exports);
|
|
35
|
+
var import_node_assert = __toESM(require("node:assert"), 1);
|
|
36
|
+
var import_hash = require("./hash.cjs");
|
|
37
|
+
var import_utils = require("./utils.cjs");
|
|
38
|
+
const INT64_MASK = (1n << 64n) - 1n;
|
|
39
|
+
const SORT_ID_BASE = 146527n;
|
|
40
|
+
class Path {
|
|
41
|
+
constructor(src) {
|
|
42
|
+
this.src = src;
|
|
43
|
+
}
|
|
44
|
+
refIndex = -1;
|
|
45
|
+
hypIndex = -1;
|
|
46
|
+
lastHypIndex = -1;
|
|
47
|
+
lastRefIndex = -1;
|
|
48
|
+
closedCost = 0;
|
|
49
|
+
openCost = 0;
|
|
50
|
+
atUnambiguousMatchNode = false;
|
|
51
|
+
endIndices = [];
|
|
52
|
+
sortId = 0n;
|
|
53
|
+
/**
|
|
54
|
+
* Get the ID of the path used for pruning.
|
|
55
|
+
*/
|
|
56
|
+
get pruneId() {
|
|
57
|
+
return (0, import_hash.hash)([
|
|
58
|
+
this.hypIndex,
|
|
59
|
+
this.refIndex,
|
|
60
|
+
this.lastHypIndex,
|
|
61
|
+
this.lastRefIndex
|
|
62
|
+
]);
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Get the cost of the path.
|
|
66
|
+
*/
|
|
67
|
+
get cost() {
|
|
68
|
+
const isSub = isSubstitution(
|
|
69
|
+
this.hypIndex,
|
|
70
|
+
this.refIndex,
|
|
71
|
+
this.lastHypIndex,
|
|
72
|
+
this.lastRefIndex
|
|
73
|
+
);
|
|
74
|
+
return this.closedCost + this.openCost + (isSub ? this.openCost : 0);
|
|
75
|
+
}
|
|
76
|
+
/**
|
|
77
|
+
* Get the normalized cost of the path.
|
|
78
|
+
*/
|
|
79
|
+
get normCost() {
|
|
80
|
+
const cost = this.cost;
|
|
81
|
+
if (cost === 0) return 0;
|
|
82
|
+
return cost / (this.refIndex + this.hypIndex + 3);
|
|
83
|
+
}
|
|
84
|
+
/**
|
|
85
|
+
* Get the current node index of the path.
|
|
86
|
+
*/
|
|
87
|
+
get index() {
|
|
88
|
+
return [this.hypIndex, this.refIndex];
|
|
89
|
+
}
|
|
90
|
+
/**
|
|
91
|
+
* Check if the path has reached the terminal node.
|
|
92
|
+
*/
|
|
93
|
+
get atEnd() {
|
|
94
|
+
return this.hypIndex === this.src.hypMaxIndex && this.refIndex === this.src.refMaxIndex;
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Update the sort ID for path ordering. Ensures identical behavior as C++ implementation.
|
|
98
|
+
*/
|
|
99
|
+
updateSortId(t) {
|
|
100
|
+
this.sortId = this.sortId * SORT_ID_BASE + t & INT64_MASK;
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
/**
 * Lazily yield the child paths of `parent`, trying the delete step, then the
 * insert step, then the substitution-or-match step. Steps that produce no
 * child (null) are skipped.
 */
function* expand(parent) {
  const steps = [addDelete, addInsert, addSubstitutionOrMatch];
  for (const step of steps) {
    const child = step(parent);
    if (child) {
      yield child;
    }
  }
}
|
|
111
|
+
/**
 * Try to extend `parent` by one step on both the reference and the hypothesis.
 *
 * Returns null when either index is already at its maximum, when the two
 * characters differ and at least one of them has char type 0 (delimiter), or
 * when closing the segment fails.
 *
 * For a non-matching step the open cost grows by 2 (same char type) or 3
 * (different char type), plus 1 when the parent index is not in
 * `backtraceNodeSet`.
 */
function addSubstitutionOrMatch(parent) {
  const src = parent.src;
  const refExhausted = parent.refIndex >= src.refMaxIndex;
  if (refExhausted || parent.hypIndex >= src.hypMaxIndex) {
    return null;
  }

  let child = transitionToChildNode(parent, { refStep: 1, hypStep: 1 });
  const refChar = src.ref[child.refIndex];
  const charsDiffer = refChar !== src.hyp[child.hypIndex];

  if (charsDiffer) {
    // A delimiter may only be aligned with an identical delimiter.
    const refCharIsDelimiter = src.refCharTypes[child.refIndex] === 0;
    const hypCharIsDelimiter = src.hypCharTypes[child.hypIndex] === 0;
    if (refCharIsDelimiter || hypCharIsDelimiter) {
      return null;
    }
  }

  // Must run before costs are added: it may reset the child's open cost.
  if (refChar === import_utils.START_DELIMITER) {
    endInsertionSegment(child, parent.hypIndex, parent.refIndex);
  }

  if (charsDiffer) {
    const onBacktrace = src.backtraceNodeSet.has(parent.index);
    const sameCharType = src.refCharTypes[child.refIndex] === src.hypCharTypes[child.hypIndex];
    child.openCost += sameCharType ? 2 : 3;
    child.openCost += onBacktrace ? 0 : 1;
  }

  if (child.src.ref[child.refIndex] === import_utils.END_DELIMITER) {
    child = endSegment(child);
  }
  return child;
}
|
|
139
|
+
/**
 * Try to extend `parent` by one step on the reference only (the hypothesis
 * index is unchanged).
 *
 * Returns null when the reference index is already at its maximum or when
 * closing the segment fails. The open cost grows by 1 for a delimiter
 * (char type 0); otherwise by 2, plus 1 when the parent index is not in
 * `backtraceNodeSet`.
 */
function addInsert(parent) {
  const src = parent.src;
  if (parent.refIndex >= src.refMaxIndex) {
    return null;
  }

  let child = transitionToChildNode(parent, { refStep: 1, hypStep: 0 });

  // Must run before costs are added: it may reset the child's open cost.
  if (src.ref[child.refIndex] === import_utils.START_DELIMITER) {
    endInsertionSegment(child, parent.hypIndex, parent.refIndex);
  }

  const onBacktrace = src.backtraceNodeSet.has(parent.index);
  const steppedOntoDelimiter = src.refCharTypes[child.refIndex] === 0;
  child.openCost += steppedOntoDelimiter ? 1 : 2;
  child.openCost += onBacktrace || steppedOntoDelimiter ? 0 : 1;

  if (child.src.ref[child.refIndex] === import_utils.END_DELIMITER) {
    child = endSegment(child);
  }
  return child;
}
|
|
159
|
+
/**
 * Try to extend `parent` by one step on the hypothesis only (the reference
 * index is unchanged).
 *
 * Returns null when the hypothesis index is already at its maximum. The open
 * cost grows by 1 for a delimiter (char type 0); otherwise by 2, plus 1 when
 * the parent index is not in `backtraceNodeSet`.
 */
function addDelete(parent) {
  const src = parent.src;
  if (parent.hypIndex >= src.hypMaxIndex) {
    return null;
  }

  const child = transitionToChildNode(parent, { refStep: 0, hypStep: 1 });

  const onBacktrace = src.backtraceNodeSet.has(parent.index);
  const steppedOntoDelimiter = src.hypCharTypes[child.hypIndex] === 0;
  child.openCost += steppedOntoDelimiter ? 1 : 2;
  child.openCost += onBacktrace || steppedOntoDelimiter ? 0 : 1;

  // Reaching the hypothesis end delimiter may close a hyp-only segment.
  const hypChar = child.src.hyp[child.hypIndex];
  if (hypChar === import_utils.END_DELIMITER) {
    endInsertionSegment(child, child.hypIndex, child.refIndex);
  }
  return child;
}
|
|
173
|
+
/**
 * Close the open segment of `path` at the given position.
 *
 * The open cost is moved into the closed cost (twice when the segment counts
 * as a substitution relative to the previous segment end), the segment end
 * markers are moved to (`hypIndex`, `refIndex`), and the open cost is zeroed.
 */
function resetSegmentVariables(path, hypIndex, refIndex) {
  const pending = path.openCost;
  const countsAsSubstitution = isSubstitution(
    hypIndex,
    refIndex,
    path.lastHypIndex,
    path.lastRefIndex
  );
  const surcharge = countsAsSubstitution ? pending : 0;
  path.closedCost = path.closedCost + pending + surcharge;
  path.lastHypIndex = hypIndex;
  path.lastRefIndex = refIndex;
  path.openCost = 0;
}
|
|
186
|
+
/**
 * Close a segment that covers hypothesis characters only.
 *
 * Nothing happens unless the hypothesis range since the last segment end
 * translates to a slice and the reference has not advanced since then
 * (`refIndex === path.lastRefIndex`). Otherwise the path's current position
 * and open cost are appended to `endIndices` and the segment is reset at
 * (`hypIndex`, `refIndex`).
 */
function endInsertionSegment(path, hypIndex, refIndex) {
  const hypRange = [path.lastHypIndex + 1, hypIndex + 1];
  const hypSlice = (0, import_utils.translateSlice)(hypRange, path.src.hypIndexMap);
  const refUnchanged = refIndex === path.lastRefIndex;
  if (!hypSlice || !refUnchanged) {
    return;
  }
  const segmentEnd = [path.hypIndex, path.refIndex, path.openCost];
  // Build a new array rather than pushing: `endIndices` is shared with the parent path.
  path.endIndices = path.endIndices.concat([segmentEnd]);
  resetSegmentVariables(path, hypIndex, refIndex);
}
|
|
199
|
+
/**
 * Close the open segment of `path` at its current position.
 *
 * Asserts that the reference range since the last segment end translates to
 * a slice. When the hypothesis advanced in this segment, its range must also
 * translate to a slice, otherwise null is returned; in that case the path is
 * also flagged as sitting on an unambiguous match node if the segment cost is
 * zero and its index is in `src.unambiguousMatches`.
 *
 * @returns The same (mutated) path, or null when the segment is rejected.
 */
function endSegment(path) {
  const src = path.src;
  const hypSlice = (0, import_utils.translateSlice)(
    [path.lastHypIndex + 1, path.hypIndex + 1],
    src.hypIndexMap
  );
  const refSlice = (0, import_utils.translateSlice)(
    [path.lastRefIndex + 1, path.refIndex + 1],
    src.refIndexMap
  );
  (0, import_node_assert.default)(!!refSlice);

  const hypAdvanced = path.hypIndex !== path.lastHypIndex;
  if (hypAdvanced) {
    if (!hypSlice) {
      return null;
    }
    const zeroCostSegment = path.openCost === 0;
    path.atUnambiguousMatchNode = zeroCostSegment && src.unambiguousMatches.has(path.index);
  }

  // Build a new array rather than pushing: `endIndices` is shared with the parent path.
  const segmentEnd = [path.hypIndex, path.refIndex, path.openCost];
  path.endIndices = path.endIndices.concat([segmentEnd]);
  resetSegmentVariables(path, path.hypIndex, path.refIndex);
  return path;
}
|
|
227
|
+
/**
 * Create a child path one step away from `parent`.
 *
 * The child shares `src` and the `endIndices` array with the parent, copies
 * its costs and segment markers, starts with `atUnambiguousMatchNode` false,
 * and folds the step code `2 * refStep + hypStep` into its sort id.
 *
 * @param parent Path to extend.
 * @param refStep Amount to advance the reference index by.
 * @param hypStep Amount to advance the hypothesis index by.
 */
function transitionToChildNode(parent, { refStep, hypStep }) {
  const child = new Path(parent.src);
  Object.assign(child, {
    refIndex: parent.refIndex + refStep,
    hypIndex: parent.hypIndex + hypStep,
    lastHypIndex: parent.lastHypIndex,
    lastRefIndex: parent.lastRefIndex,
    closedCost: parent.closedCost,
    openCost: parent.openCost,
    atUnambiguousMatchNode: false,
    endIndices: parent.endIndices,
    sortId: parent.sortId
  });
  const stepCode = refStep + refStep + hypStep;
  child.updateSortId(BigInt(stepCode));
  return child;
}
|
|
241
|
+
/**
 * A segment counts as a substitution when both the hypothesis and the
 * reference index have moved since the last segment end.
 */
function isSubstitution(hypIndex, refIndex, lastHypIndex, lastRefIndex) {
  const refMoved = refIndex !== lastRefIndex;
  const hypMoved = hypIndex !== lastHypIndex;
  return refMoved && hypMoved;
}
|
|
244
|
+
/**
 * Beam search over alignment paths for the given source metadata.
 *
 * Each round expands every path in the beam, keeps at most one candidate per
 * prune id (the cheapest), ranks candidates by normalized cost with the sort
 * id as tie-breaker, and keeps the best `beamSize`. When the best candidate
 * sits on an unambiguous match node, the beam collapses to that candidate and
 * the pruning history is cleared. Paths that reach the end are collected and
 * the cheapest one (ties broken by sort id) is returned.
 *
 * @param src The source metadata for alignment.
 * @param beamSize Maximum number of paths kept per round. Defaults to 100.
 * @returns The best finished path.
 * @throws AssertionError if no path reaches the end.
 */
function errorAlignBeamSearch(src, beamSize = 100) {
  // Orders two paths by their bigint sort ids.
  const bySortId = (a, b) => {
    const delta = a.sortId - b.sortId;
    if (delta < 0n) return -1;
    if (delta > 0n) return 1;
    return 0;
  };

  let beam = [new Path(src)];
  // Cost last recorded for each prune id (plain object keyed by the id).
  let bestCostByPruneId = {};
  const finished = [];

  while (beam.length > 0) {
    // Cheapest candidate found this round for each prune id.
    const candidates = {};
    for (const path of beam) {
      if (path.atEnd) {
        finished.push(path);
        continue;
      }
      for (const candidate of expand(path)) {
        const candidateCost = candidate.cost;
        const pruneId = candidate.pruneId;
        const seenBefore = pruneId in bestCostByPruneId;
        if (seenBefore && candidateCost > bestCostByPruneId[pruneId]) {
          continue;
        }
        bestCostByPruneId[pruneId] = candidateCost;
        const incumbent = pruneId in candidates ? candidates[pruneId] : null;
        if (incumbent === null || candidateCost < incumbent.cost) {
          candidates[pruneId] = candidate;
        }
      }
    }

    const ranked = Object.values(candidates).toSorted(
      (a, b) => a.normCost === b.normCost ? bySortId(a, b) : a.normCost - b.normCost
    );
    beam = ranked.slice(0, beamSize);

    const leader = beam[0];
    if (leader == null ? void 0 : leader.atUnambiguousMatchNode) {
      beam = beam.slice(0, 1);
      bestCostByPruneId = {};
    }
  }

  const [result] = finished.toSorted(
    (a, b) => a.cost === b.cost ? bySortId(a, b) : a.cost - b.cost
  );
  (0, import_node_assert.default)(!!result);
  return result;
}
|
|
298
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
299
|
+
0 && (module.exports = {
|
|
300
|
+
Path,
|
|
301
|
+
errorAlignBeamSearch
|
|
302
|
+
});
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { Index } from './backtraceGraph.cjs';
|
|
2
|
+
import { SubgraphMetadata } from './graphMetadata.cjs';
|
|
3
|
+
import './utils.cjs';
|
|
4
|
+
|
|
5
|
+
/**
 * A (possibly partial) path through the alignment graph, as tracked by the
 * beam search.
 */
declare class Path {
    /** Metadata object this path was created for. */
    src: SubgraphMetadata;
    refIndex: number;
    hypIndex: number;
    lastHypIndex: number;
    lastRefIndex: number;
    closedCost: number;
    openCost: number;
    atUnambiguousMatchNode: boolean;
    /** Recorded triples of three numbers, one per closed segment. */
    endIndices: Array<[number, number, number]>;
    sortId: bigint;
    constructor(src: SubgraphMetadata);
    /**
     * Identifier of the path used when pruning.
     */
    get pruneId(): number;
    /**
     * Cost of the path.
     */
    get cost(): number;
    /**
     * Normalized cost of the path.
     */
    get normCost(): number;
    /**
     * Node index the path is currently at.
     */
    get index(): Index;
    /**
     * Whether the path has reached the terminal node.
     */
    get atEnd(): boolean;
    /**
     * Update the sort ID used to order paths. Ensures identical behavior as C++ implementation.
     */
    updateSortId(t: bigint): void;
}
|
|
45
|
+
/**
 * Run a beam search that aligns the reference and hypothesis texts of the
 * given source.
 *
 * @param src Source metadata to align.
 * @param beamSize Beam width used by the search. Defaults to 100.
 */
declare function errorAlignBeamSearch(src: SubgraphMetadata, beamSize?: number): Path;
|
|
52
|
+
|
|
53
|
+
export { Path, errorAlignBeamSearch };
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { Index } from './backtraceGraph.js';
|
|
2
|
+
import { SubgraphMetadata } from './graphMetadata.js';
|
|
3
|
+
import './utils.js';
|
|
4
|
+
|
|
5
|
+
/**
 * A (possibly partial) path through the alignment graph, as tracked by the
 * beam search.
 */
declare class Path {
    /** Metadata object this path was created for. */
    src: SubgraphMetadata;
    refIndex: number;
    hypIndex: number;
    lastHypIndex: number;
    lastRefIndex: number;
    closedCost: number;
    openCost: number;
    atUnambiguousMatchNode: boolean;
    /** Recorded triples of three numbers, one per closed segment. */
    endIndices: Array<[number, number, number]>;
    sortId: bigint;
    constructor(src: SubgraphMetadata);
    /**
     * Identifier of the path used when pruning.
     */
    get pruneId(): number;
    /**
     * Cost of the path.
     */
    get cost(): number;
    /**
     * Normalized cost of the path.
     */
    get normCost(): number;
    /**
     * Node index the path is currently at.
     */
    get index(): Index;
    /**
     * Whether the path has reached the terminal node.
     */
    get atEnd(): boolean;
    /**
     * Update the sort ID used to order paths. Ensures identical behavior as C++ implementation.
     */
    updateSortId(t: bigint): void;
}
|
|
45
|
+
/**
 * Run a beam search that aligns the reference and hypothesis texts of the
 * given source.
 *
 * @param src Source metadata to align.
 * @param beamSize Beam width used by the search. Defaults to 100.
 */
declare function errorAlignBeamSearch(src: SubgraphMetadata, beamSize?: number): Path;
|
|
52
|
+
|
|
53
|
+
export { Path, errorAlignBeamSearch };
|