@storyteller-platform/align 0.1.8 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/align/__tests__/align.test.cjs +6 -5
- package/dist/align/__tests__/align.test.js +6 -5
- package/dist/align/align.cjs +133 -81
- package/dist/align/align.d.cts +1 -0
- package/dist/align/align.d.ts +1 -0
- package/dist/align/align.js +133 -81
- package/dist/align/getSentenceRanges.cjs +78 -149
- package/dist/align/getSentenceRanges.d.cts +1 -1
- package/dist/align/getSentenceRanges.d.ts +1 -1
- package/dist/align/getSentenceRanges.js +78 -149
- package/dist/align/slugify.cjs +2 -0
- package/dist/align/slugify.js +2 -0
- package/dist/errorAlign/__tests__/errorAlign.test.cjs +100 -0
- package/dist/errorAlign/__tests__/errorAlign.test.d.cts +2 -0
- package/dist/errorAlign/__tests__/errorAlign.test.d.ts +2 -0
- package/dist/errorAlign/__tests__/errorAlign.test.js +77 -0
- package/dist/errorAlign/__tests__/native.test.cjs +118 -0
- package/dist/errorAlign/__tests__/native.test.d.cts +2 -0
- package/dist/errorAlign/__tests__/native.test.d.ts +2 -0
- package/dist/errorAlign/__tests__/native.test.js +107 -0
- package/dist/errorAlign/backtraceGraph.cjs +298 -0
- package/dist/errorAlign/backtraceGraph.d.cts +103 -0
- package/dist/errorAlign/backtraceGraph.d.ts +103 -0
- package/dist/errorAlign/backtraceGraph.js +270 -0
- package/dist/errorAlign/beamSearch.cjs +302 -0
- package/dist/errorAlign/beamSearch.d.cts +53 -0
- package/dist/errorAlign/beamSearch.d.ts +53 -0
- package/dist/errorAlign/beamSearch.js +268 -0
- package/dist/errorAlign/core.cjs +33 -0
- package/dist/errorAlign/core.d.cts +5 -0
- package/dist/errorAlign/core.d.ts +5 -0
- package/dist/errorAlign/core.js +11 -0
- package/dist/errorAlign/editDistance.cjs +115 -0
- package/dist/errorAlign/editDistance.d.cts +46 -0
- package/dist/errorAlign/editDistance.d.ts +46 -0
- package/dist/errorAlign/editDistance.js +90 -0
- package/dist/errorAlign/errorAlign.cjs +159 -0
- package/dist/errorAlign/errorAlign.d.cts +15 -0
- package/dist/errorAlign/errorAlign.d.ts +15 -0
- package/dist/errorAlign/errorAlign.js +145 -0
- package/dist/errorAlign/graphMetadata.cjs +97 -0
- package/dist/errorAlign/graphMetadata.d.cts +44 -0
- package/dist/errorAlign/graphMetadata.d.ts +44 -0
- package/dist/errorAlign/graphMetadata.js +64 -0
- package/dist/errorAlign/hash.cjs +173 -0
- package/dist/errorAlign/hash.d.cts +28 -0
- package/dist/errorAlign/hash.d.ts +28 -0
- package/dist/errorAlign/hash.js +150 -0
- package/dist/errorAlign/native.cjs +60 -0
- package/dist/errorAlign/native.d.cts +18 -0
- package/dist/errorAlign/native.d.ts +18 -0
- package/dist/errorAlign/native.js +24 -0
- package/dist/errorAlign/node-gyp-build.d.cjs +1 -0
- package/dist/errorAlign/node-gyp-build.d.d.cts +3 -0
- package/dist/errorAlign/node-gyp-build.d.d.ts +3 -0
- package/dist/errorAlign/node-gyp-build.d.js +0 -0
- package/dist/errorAlign/pathToAlignment.cjs +122 -0
- package/dist/errorAlign/pathToAlignment.d.cts +11 -0
- package/dist/errorAlign/pathToAlignment.d.ts +11 -0
- package/dist/errorAlign/pathToAlignment.js +89 -0
- package/dist/errorAlign/utils.cjs +301 -0
- package/dist/errorAlign/utils.d.cts +107 -0
- package/dist/errorAlign/utils.d.ts +107 -0
- package/dist/errorAlign/utils.js +248 -0
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/markup/__tests__/markup.test.cjs +108 -81
- package/dist/markup/__tests__/markup.test.js +109 -82
- package/dist/markup/__tests__/parseDom.test.cjs +112 -0
- package/dist/markup/__tests__/parseDom.test.d.cts +2 -0
- package/dist/markup/__tests__/parseDom.test.d.ts +2 -0
- package/dist/markup/__tests__/parseDom.test.js +89 -0
- package/dist/markup/__tests__/serializeDom.test.cjs +120 -0
- package/dist/markup/__tests__/serializeDom.test.d.cts +2 -0
- package/dist/markup/__tests__/serializeDom.test.d.ts +2 -0
- package/dist/markup/__tests__/serializeDom.test.js +97 -0
- package/dist/markup/__tests__/transform.test.cjs +122 -0
- package/dist/markup/__tests__/transform.test.d.cts +2 -0
- package/dist/markup/__tests__/transform.test.d.ts +2 -0
- package/dist/markup/__tests__/transform.test.js +99 -0
- package/dist/markup/map.cjs +261 -0
- package/dist/markup/map.d.cts +50 -0
- package/dist/markup/map.d.ts +50 -0
- package/dist/markup/map.js +236 -0
- package/dist/markup/markup.cjs +23 -201
- package/dist/markup/markup.d.cts +5 -9
- package/dist/markup/markup.d.ts +5 -9
- package/dist/markup/markup.js +24 -203
- package/dist/markup/model.cjs +172 -0
- package/dist/markup/model.d.cts +57 -0
- package/dist/markup/model.d.ts +57 -0
- package/dist/markup/model.js +145 -0
- package/dist/markup/parseDom.cjs +59 -0
- package/dist/markup/parseDom.d.cts +7 -0
- package/dist/markup/parseDom.d.ts +7 -0
- package/dist/markup/parseDom.js +35 -0
- package/dist/markup/segmentation.cjs +11 -57
- package/dist/markup/segmentation.d.cts +6 -2
- package/dist/markup/segmentation.d.ts +6 -2
- package/dist/markup/segmentation.js +11 -58
- package/dist/markup/serializeDom.cjs +87 -0
- package/dist/markup/serializeDom.d.cts +7 -0
- package/dist/markup/serializeDom.d.ts +7 -0
- package/dist/markup/serializeDom.js +63 -0
- package/dist/markup/transform.cjs +92 -0
- package/dist/markup/transform.d.cts +11 -0
- package/dist/markup/transform.d.ts +11 -0
- package/dist/markup/transform.js +71 -0
- package/dist/types/node-gyp-build.d.cjs +1 -0
- package/dist/types/node-gyp-build.d.d.cts +3 -0
- package/dist/types/node-gyp-build.d.d.ts +3 -0
- package/dist/types/node-gyp-build.d.js +0 -0
- package/package.json +11 -4
package/dist/markup/markup.cjs
CHANGED
|
@@ -63,7 +63,6 @@ var __callDispose = (stack, error, hasError) => {
|
|
|
63
63
|
};
|
|
64
64
|
var markup_exports = {};
|
|
65
65
|
__export(markup_exports, {
|
|
66
|
-
appendTextNode: () => appendTextNode,
|
|
67
66
|
markup: () => markup,
|
|
68
67
|
markupChapter: () => markupChapter
|
|
69
68
|
});
|
|
@@ -72,8 +71,11 @@ var import_promises = require("node:fs/promises");
|
|
|
72
71
|
var import_posix = require("node:path/posix");
|
|
73
72
|
var import_epub = require("@storyteller-platform/epub");
|
|
74
73
|
var import_ghost_story = require("@storyteller-platform/ghost-story");
|
|
74
|
+
var import_model = require("./model.cjs");
|
|
75
|
+
var import_parseDom = require("./parseDom.cjs");
|
|
75
76
|
var import_segmentation = require("./segmentation.cjs");
|
|
76
|
-
var
|
|
77
|
+
var import_serializeDom = require("./serializeDom.cjs");
|
|
78
|
+
var import_transform = require("./transform.cjs");
|
|
77
79
|
async function markup(input, output, options) {
|
|
78
80
|
var _a, _b, _c, _d;
|
|
79
81
|
var _stack = [];
|
|
@@ -96,14 +98,15 @@ async function markup(input, output, options) {
|
|
|
96
98
|
continue;
|
|
97
99
|
}
|
|
98
100
|
const chapterXml = await epub.readXhtmlItemContents(chapterId);
|
|
99
|
-
const segmentation = await (0, import_segmentation.getXhtmlSegmentation)(
|
|
101
|
+
const { result: segmentation, mapping } = await (0, import_segmentation.getXhtmlSegmentation)(
|
|
100
102
|
import_epub.Epub.getXhtmlBody(chapterXml),
|
|
101
103
|
{ primaryLocale }
|
|
102
104
|
);
|
|
103
105
|
const { markedUp, timing: chapterTiming } = markupChapter(
|
|
104
106
|
chapterId,
|
|
105
107
|
chapterXml,
|
|
106
|
-
segmentation
|
|
108
|
+
segmentation,
|
|
109
|
+
mapping
|
|
107
110
|
);
|
|
108
111
|
timing.add(chapterTiming.summary());
|
|
109
112
|
await epub.writeXhtmlItemContents(chapterId, markedUp);
|
|
@@ -116,7 +119,7 @@ async function markup(input, output, options) {
|
|
|
116
119
|
__callDispose(_stack, _error, _hasError);
|
|
117
120
|
}
|
|
118
121
|
}
|
|
119
|
-
function markupChapter(chapterId, chapterXml, segmentation) {
|
|
122
|
+
function markupChapter(chapterId, chapterXml, segmentation, mapping) {
|
|
120
123
|
const timing = (0, import_ghost_story.createTiming)();
|
|
121
124
|
const html = import_epub.Epub.findXmlChildByName("html", chapterXml);
|
|
122
125
|
if (!html) throw new Error("Invalid XHTML document: no html element");
|
|
@@ -125,205 +128,25 @@ function markupChapter(chapterId, chapterXml, segmentation) {
|
|
|
125
128
|
clearBodyElement(chapterXml);
|
|
126
129
|
const taggedHtml = import_epub.Epub.findXmlChildByName("html", chapterXml);
|
|
127
130
|
const taggedBody = import_epub.Epub.findXmlChildByName("body", taggedHtml["html"]);
|
|
128
|
-
taggedBody["body"] = [];
|
|
129
131
|
timing.time("mark up", () => {
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
[],
|
|
141
|
-
import_epub.Epub.getXmlChildren(taggedBody)
|
|
142
|
-
);
|
|
143
|
-
});
|
|
144
|
-
return { markedUp: chapterXml, timing };
|
|
145
|
-
}
|
|
146
|
-
function markupBySegmentation(chapterId, state, segmentation, currentNode, taggedSentences, marks, taggedXml) {
|
|
147
|
-
if (import_epub.Epub.isXmlTextNode(currentNode)) {
|
|
148
|
-
const remainingSentence = segmentation.sentences[state.currentSentenceIndex].text.slice(state.currentSentenceProgress);
|
|
149
|
-
const text = currentNode["#text"];
|
|
150
|
-
const remainingNodeText = text.slice(state.currentNodeProgress);
|
|
151
|
-
const index = remainingNodeText.indexOf(remainingSentence[0]);
|
|
152
|
-
if (index === -1) {
|
|
153
|
-
appendTextNode(
|
|
154
|
-
chapterId,
|
|
155
|
-
taggedXml,
|
|
156
|
-
remainingNodeText,
|
|
157
|
-
marks,
|
|
158
|
-
taggedSentences
|
|
159
|
-
);
|
|
160
|
-
return {
|
|
161
|
-
...state,
|
|
162
|
-
currentNodeProgress: -1
|
|
163
|
-
};
|
|
164
|
-
}
|
|
165
|
-
if (remainingNodeText.slice(index).length < remainingSentence.length) {
|
|
166
|
-
appendTextNode(
|
|
167
|
-
chapterId,
|
|
168
|
-
taggedXml,
|
|
169
|
-
remainingNodeText.slice(0, index),
|
|
170
|
-
marks,
|
|
171
|
-
taggedSentences
|
|
172
|
-
);
|
|
173
|
-
appendTextNode(
|
|
174
|
-
chapterId,
|
|
175
|
-
taggedXml,
|
|
176
|
-
remainingNodeText.slice(index),
|
|
177
|
-
marks,
|
|
178
|
-
taggedSentences,
|
|
179
|
-
state.currentSentenceIndex
|
|
180
|
-
);
|
|
181
|
-
return {
|
|
182
|
-
...state,
|
|
183
|
-
currentSentenceProgress: state.currentSentenceProgress + remainingNodeText.length - index,
|
|
184
|
-
currentNodeProgress: -1
|
|
185
|
-
};
|
|
186
|
-
}
|
|
187
|
-
appendTextNode(
|
|
188
|
-
chapterId,
|
|
189
|
-
taggedXml,
|
|
190
|
-
remainingNodeText.slice(0, index),
|
|
191
|
-
marks,
|
|
192
|
-
taggedSentences
|
|
193
|
-
);
|
|
194
|
-
appendTextNode(
|
|
195
|
-
chapterId,
|
|
196
|
-
taggedXml,
|
|
197
|
-
remainingSentence,
|
|
198
|
-
marks,
|
|
199
|
-
taggedSentences,
|
|
200
|
-
state.currentSentenceIndex
|
|
201
|
-
);
|
|
202
|
-
if (state.currentSentenceIndex + 1 === segmentation.sentences.length) {
|
|
203
|
-
appendTextNode(
|
|
204
|
-
chapterId,
|
|
205
|
-
taggedXml,
|
|
206
|
-
remainingNodeText.slice(index + remainingSentence.length),
|
|
207
|
-
marks,
|
|
208
|
-
taggedSentences
|
|
209
|
-
);
|
|
210
|
-
}
|
|
211
|
-
const mapping = mapWhitespace(remainingNodeText);
|
|
212
|
-
const mapped = mapThrough(
|
|
213
|
-
remainingSentence.length,
|
|
214
|
-
mapping.filter(([start]) => start >= index)
|
|
215
|
-
);
|
|
216
|
-
return {
|
|
217
|
-
currentSentenceIndex: state.currentSentenceIndex + 1,
|
|
218
|
-
currentSentenceProgress: 0,
|
|
219
|
-
currentNodeProgress: state.currentNodeProgress + mapped + index
|
|
220
|
-
};
|
|
221
|
-
}
|
|
222
|
-
let nextState = {
|
|
223
|
-
...state
|
|
224
|
-
};
|
|
225
|
-
const children = import_epub.Epub.getXmlChildren(currentNode);
|
|
226
|
-
for (const child of children) {
|
|
227
|
-
if (nextState.currentSentenceIndex > segmentation.sentences.length + 1) {
|
|
228
|
-
taggedXml.push(child);
|
|
229
|
-
continue;
|
|
230
|
-
}
|
|
231
|
-
nextState.currentNodeProgress = 0;
|
|
232
|
-
let nextTaggedXml = taggedXml;
|
|
233
|
-
const nextMarks = [...marks];
|
|
234
|
-
if (!import_epub.Epub.isXmlTextNode(child)) {
|
|
235
|
-
const childTagName = import_epub.Epub.getXmlElementName(child);
|
|
236
|
-
const isTextContent = import_semantics.BLOCKS.includes(childTagName.toLowerCase());
|
|
237
|
-
if (import_epub.Epub.getXmlChildren(child).length === 0) {
|
|
238
|
-
appendLeafNode(
|
|
239
|
-
chapterId,
|
|
240
|
-
taggedXml,
|
|
241
|
-
child,
|
|
242
|
-
nextMarks,
|
|
243
|
-
taggedSentences,
|
|
244
|
-
isTextContent || nextState.currentSentenceProgress === 0 ? void 0 : nextState.currentSentenceIndex
|
|
132
|
+
let root = (0, import_parseDom.parseDom)(import_epub.Epub.getXmlChildren(body));
|
|
133
|
+
let pos = 0;
|
|
134
|
+
let i = 0;
|
|
135
|
+
for (const sentence of segmentation) {
|
|
136
|
+
if (sentence.text.match(/\S/)) {
|
|
137
|
+
root = (0, import_transform.addMark)(
|
|
138
|
+
root,
|
|
139
|
+
mapping.invert().map(pos),
|
|
140
|
+
mapping.invert().map(pos + sentence.text.replace(/\n$/, "").length, -1),
|
|
141
|
+
new import_model.Mark("span", { id: `${chapterId}-s${i}` })
|
|
245
142
|
);
|
|
246
|
-
|
|
247
|
-
}
|
|
248
|
-
if (isTextContent) {
|
|
249
|
-
const block = {
|
|
250
|
-
[childTagName]: [],
|
|
251
|
-
...child[":@"] && { ":@": child[":@"] }
|
|
252
|
-
};
|
|
253
|
-
nextTaggedXml.push(block);
|
|
254
|
-
nextTaggedXml = import_epub.Epub.getXmlChildren(block);
|
|
255
|
-
} else {
|
|
256
|
-
nextMarks.push({
|
|
257
|
-
elementName: childTagName,
|
|
258
|
-
attributes: child[":@"]
|
|
259
|
-
});
|
|
143
|
+
i++;
|
|
260
144
|
}
|
|
145
|
+
pos += sentence.text.replace(/\n$/, "").length;
|
|
261
146
|
}
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
nextState,
|
|
266
|
-
segmentation,
|
|
267
|
-
child,
|
|
268
|
-
taggedSentences,
|
|
269
|
-
nextMarks,
|
|
270
|
-
nextTaggedXml
|
|
271
|
-
);
|
|
272
|
-
}
|
|
273
|
-
}
|
|
274
|
-
nextState.currentNodeProgress = -1;
|
|
275
|
-
return nextState;
|
|
276
|
-
}
|
|
277
|
-
function mapWhitespace(text) {
|
|
278
|
-
const re = /(\s\s+)/g;
|
|
279
|
-
const mapping = [];
|
|
280
|
-
let match = null;
|
|
281
|
-
while ((match = re.exec(text)) !== null) {
|
|
282
|
-
mapping.push([match.index, match[0].length, 1]);
|
|
283
|
-
}
|
|
284
|
-
return mapping;
|
|
285
|
-
}
|
|
286
|
-
function mapThrough(position, mapping) {
|
|
287
|
-
let result = position;
|
|
288
|
-
let index = 0;
|
|
289
|
-
while (index < mapping.length && mapping[index][0] < result) {
|
|
290
|
-
const map = mapping[index];
|
|
291
|
-
result += map[1] - map[2];
|
|
292
|
-
index++;
|
|
293
|
-
}
|
|
294
|
-
return result;
|
|
295
|
-
}
|
|
296
|
-
function appendTextNode(chapterId, xml, text, marks, taggedSentences, sentenceId) {
|
|
297
|
-
if (text.length === 0) return;
|
|
298
|
-
const textNode = { "#text": text };
|
|
299
|
-
appendLeafNode(chapterId, xml, textNode, marks, taggedSentences, sentenceId);
|
|
300
|
-
}
|
|
301
|
-
function appendLeafNode(chapterId, xml, node, marks, taggedSentences, sentenceId) {
|
|
302
|
-
var _a, _b;
|
|
303
|
-
const tagId = `${chapterId}-s${sentenceId}`;
|
|
304
|
-
const markedNode = [...marks].reverse().reduce(
|
|
305
|
-
(acc, mark) => ({
|
|
306
|
-
[mark.elementName]: [acc],
|
|
307
|
-
":@": mark.attributes
|
|
308
|
-
}),
|
|
309
|
-
node
|
|
310
|
-
);
|
|
311
|
-
const lastNode = xml[xml.length - 1];
|
|
312
|
-
if (lastNode && !import_epub.Epub.isXmlTextNode(lastNode) && ((_a = lastNode[":@"]) == null ? void 0 : _a["@_id"]) && lastNode[":@"]["@_id"] === tagId) {
|
|
313
|
-
const tagName = import_epub.Epub.getXmlElementName(lastNode);
|
|
314
|
-
(_b = lastNode[tagName]) == null ? void 0 : _b.push(markedNode);
|
|
315
|
-
return;
|
|
316
|
-
}
|
|
317
|
-
if (sentenceId === void 0 || taggedSentences.has(sentenceId)) {
|
|
318
|
-
xml.push(markedNode);
|
|
319
|
-
return;
|
|
320
|
-
}
|
|
321
|
-
const taggedNode = {
|
|
322
|
-
span: [markedNode],
|
|
323
|
-
":@": { "@_id": tagId }
|
|
324
|
-
};
|
|
325
|
-
taggedSentences.add(sentenceId);
|
|
326
|
-
xml.push(taggedNode);
|
|
147
|
+
taggedBody["body"] = (0, import_serializeDom.serializeDom)(root);
|
|
148
|
+
});
|
|
149
|
+
return { markedUp: chapterXml, timing };
|
|
327
150
|
}
|
|
328
151
|
function clearBodyElement(xml) {
|
|
329
152
|
const html = import_epub.Epub.findXmlChildByName("html", xml);
|
|
@@ -338,7 +161,6 @@ function clearBodyElement(xml) {
|
|
|
338
161
|
}
|
|
339
162
|
// Annotate the CommonJS export names for ESM import in node:
|
|
340
163
|
0 && (module.exports = {
|
|
341
|
-
appendTextNode,
|
|
342
164
|
markup,
|
|
343
165
|
markupChapter
|
|
344
166
|
});
|
package/dist/markup/markup.d.cts
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import * as _storyteller_platform_ghost_story from '@storyteller-platform/ghost-story';
|
|
2
2
|
import { TimingAggregator } from '@storyteller-platform/ghost-story';
|
|
3
|
-
import {
|
|
3
|
+
import { Sentence } from '@echogarden/text-segmentation';
|
|
4
4
|
import { Logger } from 'pino';
|
|
5
|
-
import { ParsedXml, ElementName } from '@storyteller-platform/epub';
|
|
5
|
+
import { ParsedXml } from '@storyteller-platform/epub';
|
|
6
|
+
import { Mapping } from './map.cjs';
|
|
6
7
|
|
|
7
8
|
interface MarkupOptions {
|
|
8
9
|
granularity?: "word" | "sentence";
|
|
@@ -11,14 +12,9 @@ interface MarkupOptions {
|
|
|
11
12
|
logger?: Logger;
|
|
12
13
|
}
|
|
13
14
|
declare function markup(input: string, output: string, options: MarkupOptions): Promise<TimingAggregator>;
|
|
14
|
-
declare function markupChapter(chapterId: string, chapterXml: ParsedXml, segmentation:
|
|
15
|
+
declare function markupChapter(chapterId: string, chapterXml: ParsedXml, segmentation: Sentence[], mapping: Mapping): {
|
|
15
16
|
markedUp: ParsedXml;
|
|
16
17
|
timing: _storyteller_platform_ghost_story.Timing;
|
|
17
18
|
};
|
|
18
|
-
type Mark = {
|
|
19
|
-
elementName: ElementName;
|
|
20
|
-
attributes: Record<string, string> | undefined;
|
|
21
|
-
};
|
|
22
|
-
declare function appendTextNode(chapterId: string, xml: ParsedXml, text: string, marks: Mark[], taggedSentences: Set<number>, sentenceId?: number): void;
|
|
23
19
|
|
|
24
|
-
export { type MarkupOptions, appendTextNode, markup, markupChapter };
|
|
20
|
+
export { type MarkupOptions, markup, markupChapter };
|
package/dist/markup/markup.d.ts
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import * as _storyteller_platform_ghost_story from '@storyteller-platform/ghost-story';
|
|
2
2
|
import { TimingAggregator } from '@storyteller-platform/ghost-story';
|
|
3
|
-
import {
|
|
3
|
+
import { Sentence } from '@echogarden/text-segmentation';
|
|
4
4
|
import { Logger } from 'pino';
|
|
5
|
-
import { ParsedXml, ElementName } from '@storyteller-platform/epub';
|
|
5
|
+
import { ParsedXml } from '@storyteller-platform/epub';
|
|
6
|
+
import { Mapping } from './map.js';
|
|
6
7
|
|
|
7
8
|
interface MarkupOptions {
|
|
8
9
|
granularity?: "word" | "sentence";
|
|
@@ -11,14 +12,9 @@ interface MarkupOptions {
|
|
|
11
12
|
logger?: Logger;
|
|
12
13
|
}
|
|
13
14
|
declare function markup(input: string, output: string, options: MarkupOptions): Promise<TimingAggregator>;
|
|
14
|
-
declare function markupChapter(chapterId: string, chapterXml: ParsedXml, segmentation:
|
|
15
|
+
declare function markupChapter(chapterId: string, chapterXml: ParsedXml, segmentation: Sentence[], mapping: Mapping): {
|
|
15
16
|
markedUp: ParsedXml;
|
|
16
17
|
timing: _storyteller_platform_ghost_story.Timing;
|
|
17
18
|
};
|
|
18
|
-
type Mark = {
|
|
19
|
-
elementName: ElementName;
|
|
20
|
-
attributes: Record<string, string> | undefined;
|
|
21
|
-
};
|
|
22
|
-
declare function appendTextNode(chapterId: string, xml: ParsedXml, text: string, marks: Mark[], taggedSentences: Set<number>, sentenceId?: number): void;
|
|
23
19
|
|
|
24
|
-
export { type MarkupOptions, appendTextNode, markup, markupChapter };
|
|
20
|
+
export { type MarkupOptions, markup, markupChapter };
|
package/dist/markup/markup.js
CHANGED
|
@@ -4,15 +4,16 @@ import {
|
|
|
4
4
|
} from "../chunk-BIEQXUOY.js";
|
|
5
5
|
import { copyFile } from "node:fs/promises";
|
|
6
6
|
import { basename } from "node:path/posix";
|
|
7
|
-
import {
|
|
8
|
-
Epub
|
|
9
|
-
} from "@storyteller-platform/epub";
|
|
7
|
+
import { Epub } from "@storyteller-platform/epub";
|
|
10
8
|
import {
|
|
11
9
|
createAggregator,
|
|
12
10
|
createTiming
|
|
13
11
|
} from "@storyteller-platform/ghost-story";
|
|
12
|
+
import { Mark } from "./model.js";
|
|
13
|
+
import { parseDom } from "./parseDom.js";
|
|
14
14
|
import { getXhtmlSegmentation } from "./segmentation.js";
|
|
15
|
-
import {
|
|
15
|
+
import { serializeDom } from "./serializeDom.js";
|
|
16
|
+
import { addMark } from "./transform.js";
|
|
16
17
|
async function markup(input, output, options) {
|
|
17
18
|
var _a, _b, _c, _d;
|
|
18
19
|
var _stack = [];
|
|
@@ -35,14 +36,15 @@ async function markup(input, output, options) {
|
|
|
35
36
|
continue;
|
|
36
37
|
}
|
|
37
38
|
const chapterXml = await epub.readXhtmlItemContents(chapterId);
|
|
38
|
-
const segmentation = await getXhtmlSegmentation(
|
|
39
|
+
const { result: segmentation, mapping } = await getXhtmlSegmentation(
|
|
39
40
|
Epub.getXhtmlBody(chapterXml),
|
|
40
41
|
{ primaryLocale }
|
|
41
42
|
);
|
|
42
43
|
const { markedUp, timing: chapterTiming } = markupChapter(
|
|
43
44
|
chapterId,
|
|
44
45
|
chapterXml,
|
|
45
|
-
segmentation
|
|
46
|
+
segmentation,
|
|
47
|
+
mapping
|
|
46
48
|
);
|
|
47
49
|
timing.add(chapterTiming.summary());
|
|
48
50
|
await epub.writeXhtmlItemContents(chapterId, markedUp);
|
|
@@ -55,7 +57,7 @@ async function markup(input, output, options) {
|
|
|
55
57
|
__callDispose(_stack, _error, _hasError);
|
|
56
58
|
}
|
|
57
59
|
}
|
|
58
|
-
function markupChapter(chapterId, chapterXml, segmentation) {
|
|
60
|
+
function markupChapter(chapterId, chapterXml, segmentation, mapping) {
|
|
59
61
|
const timing = createTiming();
|
|
60
62
|
const html = Epub.findXmlChildByName("html", chapterXml);
|
|
61
63
|
if (!html) throw new Error("Invalid XHTML document: no html element");
|
|
@@ -64,205 +66,25 @@ function markupChapter(chapterId, chapterXml, segmentation) {
|
|
|
64
66
|
clearBodyElement(chapterXml);
|
|
65
67
|
const taggedHtml = Epub.findXmlChildByName("html", chapterXml);
|
|
66
68
|
const taggedBody = Epub.findXmlChildByName("body", taggedHtml["html"]);
|
|
67
|
-
taggedBody["body"] = [];
|
|
68
69
|
timing.time("mark up", () => {
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
[],
|
|
80
|
-
Epub.getXmlChildren(taggedBody)
|
|
81
|
-
);
|
|
82
|
-
});
|
|
83
|
-
return { markedUp: chapterXml, timing };
|
|
84
|
-
}
|
|
85
|
-
function markupBySegmentation(chapterId, state, segmentation, currentNode, taggedSentences, marks, taggedXml) {
|
|
86
|
-
if (Epub.isXmlTextNode(currentNode)) {
|
|
87
|
-
const remainingSentence = segmentation.sentences[state.currentSentenceIndex].text.slice(state.currentSentenceProgress);
|
|
88
|
-
const text = currentNode["#text"];
|
|
89
|
-
const remainingNodeText = text.slice(state.currentNodeProgress);
|
|
90
|
-
const index = remainingNodeText.indexOf(remainingSentence[0]);
|
|
91
|
-
if (index === -1) {
|
|
92
|
-
appendTextNode(
|
|
93
|
-
chapterId,
|
|
94
|
-
taggedXml,
|
|
95
|
-
remainingNodeText,
|
|
96
|
-
marks,
|
|
97
|
-
taggedSentences
|
|
98
|
-
);
|
|
99
|
-
return {
|
|
100
|
-
...state,
|
|
101
|
-
currentNodeProgress: -1
|
|
102
|
-
};
|
|
103
|
-
}
|
|
104
|
-
if (remainingNodeText.slice(index).length < remainingSentence.length) {
|
|
105
|
-
appendTextNode(
|
|
106
|
-
chapterId,
|
|
107
|
-
taggedXml,
|
|
108
|
-
remainingNodeText.slice(0, index),
|
|
109
|
-
marks,
|
|
110
|
-
taggedSentences
|
|
111
|
-
);
|
|
112
|
-
appendTextNode(
|
|
113
|
-
chapterId,
|
|
114
|
-
taggedXml,
|
|
115
|
-
remainingNodeText.slice(index),
|
|
116
|
-
marks,
|
|
117
|
-
taggedSentences,
|
|
118
|
-
state.currentSentenceIndex
|
|
119
|
-
);
|
|
120
|
-
return {
|
|
121
|
-
...state,
|
|
122
|
-
currentSentenceProgress: state.currentSentenceProgress + remainingNodeText.length - index,
|
|
123
|
-
currentNodeProgress: -1
|
|
124
|
-
};
|
|
125
|
-
}
|
|
126
|
-
appendTextNode(
|
|
127
|
-
chapterId,
|
|
128
|
-
taggedXml,
|
|
129
|
-
remainingNodeText.slice(0, index),
|
|
130
|
-
marks,
|
|
131
|
-
taggedSentences
|
|
132
|
-
);
|
|
133
|
-
appendTextNode(
|
|
134
|
-
chapterId,
|
|
135
|
-
taggedXml,
|
|
136
|
-
remainingSentence,
|
|
137
|
-
marks,
|
|
138
|
-
taggedSentences,
|
|
139
|
-
state.currentSentenceIndex
|
|
140
|
-
);
|
|
141
|
-
if (state.currentSentenceIndex + 1 === segmentation.sentences.length) {
|
|
142
|
-
appendTextNode(
|
|
143
|
-
chapterId,
|
|
144
|
-
taggedXml,
|
|
145
|
-
remainingNodeText.slice(index + remainingSentence.length),
|
|
146
|
-
marks,
|
|
147
|
-
taggedSentences
|
|
148
|
-
);
|
|
149
|
-
}
|
|
150
|
-
const mapping = mapWhitespace(remainingNodeText);
|
|
151
|
-
const mapped = mapThrough(
|
|
152
|
-
remainingSentence.length,
|
|
153
|
-
mapping.filter(([start]) => start >= index)
|
|
154
|
-
);
|
|
155
|
-
return {
|
|
156
|
-
currentSentenceIndex: state.currentSentenceIndex + 1,
|
|
157
|
-
currentSentenceProgress: 0,
|
|
158
|
-
currentNodeProgress: state.currentNodeProgress + mapped + index
|
|
159
|
-
};
|
|
160
|
-
}
|
|
161
|
-
let nextState = {
|
|
162
|
-
...state
|
|
163
|
-
};
|
|
164
|
-
const children = Epub.getXmlChildren(currentNode);
|
|
165
|
-
for (const child of children) {
|
|
166
|
-
if (nextState.currentSentenceIndex > segmentation.sentences.length + 1) {
|
|
167
|
-
taggedXml.push(child);
|
|
168
|
-
continue;
|
|
169
|
-
}
|
|
170
|
-
nextState.currentNodeProgress = 0;
|
|
171
|
-
let nextTaggedXml = taggedXml;
|
|
172
|
-
const nextMarks = [...marks];
|
|
173
|
-
if (!Epub.isXmlTextNode(child)) {
|
|
174
|
-
const childTagName = Epub.getXmlElementName(child);
|
|
175
|
-
const isTextContent = BLOCKS.includes(childTagName.toLowerCase());
|
|
176
|
-
if (Epub.getXmlChildren(child).length === 0) {
|
|
177
|
-
appendLeafNode(
|
|
178
|
-
chapterId,
|
|
179
|
-
taggedXml,
|
|
180
|
-
child,
|
|
181
|
-
nextMarks,
|
|
182
|
-
taggedSentences,
|
|
183
|
-
isTextContent || nextState.currentSentenceProgress === 0 ? void 0 : nextState.currentSentenceIndex
|
|
70
|
+
let root = parseDom(Epub.getXmlChildren(body));
|
|
71
|
+
let pos = 0;
|
|
72
|
+
let i = 0;
|
|
73
|
+
for (const sentence of segmentation) {
|
|
74
|
+
if (sentence.text.match(/\S/)) {
|
|
75
|
+
root = addMark(
|
|
76
|
+
root,
|
|
77
|
+
mapping.invert().map(pos),
|
|
78
|
+
mapping.invert().map(pos + sentence.text.replace(/\n$/, "").length, -1),
|
|
79
|
+
new Mark("span", { id: `${chapterId}-s${i}` })
|
|
184
80
|
);
|
|
185
|
-
|
|
186
|
-
}
|
|
187
|
-
if (isTextContent) {
|
|
188
|
-
const block = {
|
|
189
|
-
[childTagName]: [],
|
|
190
|
-
...child[":@"] && { ":@": child[":@"] }
|
|
191
|
-
};
|
|
192
|
-
nextTaggedXml.push(block);
|
|
193
|
-
nextTaggedXml = Epub.getXmlChildren(block);
|
|
194
|
-
} else {
|
|
195
|
-
nextMarks.push({
|
|
196
|
-
elementName: childTagName,
|
|
197
|
-
attributes: child[":@"]
|
|
198
|
-
});
|
|
81
|
+
i++;
|
|
199
82
|
}
|
|
83
|
+
pos += sentence.text.replace(/\n$/, "").length;
|
|
200
84
|
}
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
nextState,
|
|
205
|
-
segmentation,
|
|
206
|
-
child,
|
|
207
|
-
taggedSentences,
|
|
208
|
-
nextMarks,
|
|
209
|
-
nextTaggedXml
|
|
210
|
-
);
|
|
211
|
-
}
|
|
212
|
-
}
|
|
213
|
-
nextState.currentNodeProgress = -1;
|
|
214
|
-
return nextState;
|
|
215
|
-
}
|
|
216
|
-
function mapWhitespace(text) {
|
|
217
|
-
const re = /(\s\s+)/g;
|
|
218
|
-
const mapping = [];
|
|
219
|
-
let match = null;
|
|
220
|
-
while ((match = re.exec(text)) !== null) {
|
|
221
|
-
mapping.push([match.index, match[0].length, 1]);
|
|
222
|
-
}
|
|
223
|
-
return mapping;
|
|
224
|
-
}
|
|
225
|
-
function mapThrough(position, mapping) {
|
|
226
|
-
let result = position;
|
|
227
|
-
let index = 0;
|
|
228
|
-
while (index < mapping.length && mapping[index][0] < result) {
|
|
229
|
-
const map = mapping[index];
|
|
230
|
-
result += map[1] - map[2];
|
|
231
|
-
index++;
|
|
232
|
-
}
|
|
233
|
-
return result;
|
|
234
|
-
}
|
|
235
|
-
function appendTextNode(chapterId, xml, text, marks, taggedSentences, sentenceId) {
|
|
236
|
-
if (text.length === 0) return;
|
|
237
|
-
const textNode = { "#text": text };
|
|
238
|
-
appendLeafNode(chapterId, xml, textNode, marks, taggedSentences, sentenceId);
|
|
239
|
-
}
|
|
240
|
-
function appendLeafNode(chapterId, xml, node, marks, taggedSentences, sentenceId) {
|
|
241
|
-
var _a, _b;
|
|
242
|
-
const tagId = `${chapterId}-s${sentenceId}`;
|
|
243
|
-
const markedNode = [...marks].reverse().reduce(
|
|
244
|
-
(acc, mark) => ({
|
|
245
|
-
[mark.elementName]: [acc],
|
|
246
|
-
":@": mark.attributes
|
|
247
|
-
}),
|
|
248
|
-
node
|
|
249
|
-
);
|
|
250
|
-
const lastNode = xml[xml.length - 1];
|
|
251
|
-
if (lastNode && !Epub.isXmlTextNode(lastNode) && ((_a = lastNode[":@"]) == null ? void 0 : _a["@_id"]) && lastNode[":@"]["@_id"] === tagId) {
|
|
252
|
-
const tagName = Epub.getXmlElementName(lastNode);
|
|
253
|
-
(_b = lastNode[tagName]) == null ? void 0 : _b.push(markedNode);
|
|
254
|
-
return;
|
|
255
|
-
}
|
|
256
|
-
if (sentenceId === void 0 || taggedSentences.has(sentenceId)) {
|
|
257
|
-
xml.push(markedNode);
|
|
258
|
-
return;
|
|
259
|
-
}
|
|
260
|
-
const taggedNode = {
|
|
261
|
-
span: [markedNode],
|
|
262
|
-
":@": { "@_id": tagId }
|
|
263
|
-
};
|
|
264
|
-
taggedSentences.add(sentenceId);
|
|
265
|
-
xml.push(taggedNode);
|
|
85
|
+
taggedBody["body"] = serializeDom(root);
|
|
86
|
+
});
|
|
87
|
+
return { markedUp: chapterXml, timing };
|
|
266
88
|
}
|
|
267
89
|
function clearBodyElement(xml) {
|
|
268
90
|
const html = Epub.findXmlChildByName("html", xml);
|
|
@@ -276,7 +98,6 @@ function clearBodyElement(xml) {
|
|
|
276
98
|
});
|
|
277
99
|
}
|
|
278
100
|
export {
|
|
279
|
-
appendTextNode,
|
|
280
101
|
markup,
|
|
281
102
|
markupChapter
|
|
282
103
|
};
|