@storyteller-platform/align 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.txt +21 -0
- package/README.md +3 -0
- package/dist/align/align.cjs +525 -0
- package/dist/align/align.d.cts +58 -0
- package/dist/align/align.d.ts +58 -0
- package/dist/align/align.js +458 -0
- package/dist/align/fuzzy.cjs +164 -0
- package/dist/align/fuzzy.d.cts +6 -0
- package/dist/align/fuzzy.d.ts +6 -0
- package/dist/align/fuzzy.js +141 -0
- package/dist/align/getSentenceRanges.cjs +304 -0
- package/dist/align/getSentenceRanges.d.cts +31 -0
- package/dist/align/getSentenceRanges.d.ts +31 -0
- package/dist/align/getSentenceRanges.js +277 -0
- package/dist/align/parse.cjs +63 -0
- package/dist/align/parse.d.cts +30 -0
- package/dist/align/parse.d.ts +30 -0
- package/dist/align/parse.js +51 -0
- package/dist/chunk-BIEQXUOY.js +50 -0
- package/dist/cli/bin.cjs +368 -0
- package/dist/cli/bin.d.cts +1 -0
- package/dist/cli/bin.d.ts +1 -0
- package/dist/cli/bin.js +319 -0
- package/dist/common/ffmpeg.cjs +232 -0
- package/dist/common/ffmpeg.d.cts +33 -0
- package/dist/common/ffmpeg.d.ts +33 -0
- package/dist/common/ffmpeg.js +196 -0
- package/dist/common/logging.cjs +45 -0
- package/dist/common/logging.d.cts +5 -0
- package/dist/common/logging.d.ts +5 -0
- package/dist/common/logging.js +12 -0
- package/dist/common/parse.cjs +73 -0
- package/dist/common/parse.d.cts +28 -0
- package/dist/common/parse.d.ts +28 -0
- package/dist/common/parse.js +56 -0
- package/dist/common/shell.cjs +30 -0
- package/dist/common/shell.d.cts +3 -0
- package/dist/common/shell.d.ts +3 -0
- package/dist/common/shell.js +7 -0
- package/dist/index.cjs +37 -0
- package/dist/index.d.cts +12 -0
- package/dist/index.d.ts +12 -0
- package/dist/index.js +11 -0
- package/dist/markup/__tests__/markup.test.cjs +464 -0
- package/dist/markup/__tests__/markup.test.d.cts +2 -0
- package/dist/markup/__tests__/markup.test.d.ts +2 -0
- package/dist/markup/__tests__/markup.test.js +441 -0
- package/dist/markup/markup.cjs +316 -0
- package/dist/markup/markup.d.cts +24 -0
- package/dist/markup/markup.d.ts +24 -0
- package/dist/markup/markup.js +254 -0
- package/dist/markup/parse.cjs +55 -0
- package/dist/markup/parse.d.cts +17 -0
- package/dist/markup/parse.d.ts +17 -0
- package/dist/markup/parse.js +43 -0
- package/dist/markup/segmentation.cjs +87 -0
- package/dist/markup/segmentation.d.cts +8 -0
- package/dist/markup/segmentation.d.ts +8 -0
- package/dist/markup/segmentation.js +67 -0
- package/dist/markup/semantics.cjs +79 -0
- package/dist/markup/semantics.d.cts +6 -0
- package/dist/markup/semantics.d.ts +6 -0
- package/dist/markup/semantics.js +53 -0
- package/dist/process/AudioEncoding.cjs +16 -0
- package/dist/process/AudioEncoding.d.cts +8 -0
- package/dist/process/AudioEncoding.d.ts +8 -0
- package/dist/process/AudioEncoding.js +0 -0
- package/dist/process/__tests__/processAudiobook.test.cjs +232 -0
- package/dist/process/__tests__/processAudiobook.test.d.cts +2 -0
- package/dist/process/__tests__/processAudiobook.test.d.ts +2 -0
- package/dist/process/__tests__/processAudiobook.test.js +209 -0
- package/dist/process/mime.cjs +43 -0
- package/dist/process/mime.d.cts +3 -0
- package/dist/process/mime.d.ts +3 -0
- package/dist/process/mime.js +24 -0
- package/dist/process/parse.cjs +84 -0
- package/dist/process/parse.d.cts +28 -0
- package/dist/process/parse.d.ts +28 -0
- package/dist/process/parse.js +73 -0
- package/dist/process/processAudiobook.cjs +220 -0
- package/dist/process/processAudiobook.d.cts +24 -0
- package/dist/process/processAudiobook.d.ts +24 -0
- package/dist/process/processAudiobook.js +166 -0
- package/dist/process/ranges.cjs +203 -0
- package/dist/process/ranges.d.cts +15 -0
- package/dist/process/ranges.d.ts +15 -0
- package/dist/process/ranges.js +137 -0
- package/dist/transcribe/parse.cjs +149 -0
- package/dist/transcribe/parse.d.cts +114 -0
- package/dist/transcribe/parse.d.ts +114 -0
- package/dist/transcribe/parse.js +143 -0
- package/dist/transcribe/transcribe.cjs +400 -0
- package/dist/transcribe/transcribe.d.cts +41 -0
- package/dist/transcribe/transcribe.d.ts +41 -0
- package/dist/transcribe/transcribe.js +330 -0
- package/package.json +96 -0
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __knownSymbol = (name, symbol) => (symbol = Symbol[name]) ? symbol : Symbol.for("Symbol." + name);
|
|
7
|
+
var __typeError = (msg) => {
|
|
8
|
+
throw TypeError(msg);
|
|
9
|
+
};
|
|
10
|
+
var __export = (target, all) => {
|
|
11
|
+
for (var name in all)
|
|
12
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
13
|
+
};
|
|
14
|
+
var __copyProps = (to, from, except, desc) => {
|
|
15
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
16
|
+
for (let key of __getOwnPropNames(from))
|
|
17
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
18
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
19
|
+
}
|
|
20
|
+
return to;
|
|
21
|
+
};
|
|
22
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
23
|
+
var __using = (stack, value, async) => {
|
|
24
|
+
if (value != null) {
|
|
25
|
+
if (typeof value !== "object" && typeof value !== "function") __typeError("Object expected");
|
|
26
|
+
var dispose, inner;
|
|
27
|
+
if (async) dispose = value[__knownSymbol("asyncDispose")];
|
|
28
|
+
if (dispose === void 0) {
|
|
29
|
+
dispose = value[__knownSymbol("dispose")];
|
|
30
|
+
if (async) inner = dispose;
|
|
31
|
+
}
|
|
32
|
+
if (typeof dispose !== "function") __typeError("Object not disposable");
|
|
33
|
+
if (inner) dispose = function() {
|
|
34
|
+
try {
|
|
35
|
+
inner.call(this);
|
|
36
|
+
} catch (e) {
|
|
37
|
+
return Promise.reject(e);
|
|
38
|
+
}
|
|
39
|
+
};
|
|
40
|
+
stack.push([async, dispose, value]);
|
|
41
|
+
} else if (async) {
|
|
42
|
+
stack.push([async]);
|
|
43
|
+
}
|
|
44
|
+
return value;
|
|
45
|
+
};
|
|
46
|
+
var __callDispose = (stack, error, hasError) => {
|
|
47
|
+
var E = typeof SuppressedError === "function" ? SuppressedError : function(e, s, m, _) {
|
|
48
|
+
return _ = Error(m), _.name = "SuppressedError", _.error = e, _.suppressed = s, _;
|
|
49
|
+
};
|
|
50
|
+
var fail = (e) => error = hasError ? new E(e, error, "An error was suppressed during disposal") : (hasError = true, e);
|
|
51
|
+
var next = (it) => {
|
|
52
|
+
while (it = stack.pop()) {
|
|
53
|
+
try {
|
|
54
|
+
var result = it[1] && it[1].call(it[2]);
|
|
55
|
+
if (it[0]) return Promise.resolve(result).then(next, (e) => (fail(e), next()));
|
|
56
|
+
} catch (e) {
|
|
57
|
+
fail(e);
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
if (hasError) throw error;
|
|
61
|
+
};
|
|
62
|
+
return next();
|
|
63
|
+
};
|
|
64
|
+
var markup_exports = {};
|
|
65
|
+
__export(markup_exports, {
|
|
66
|
+
appendTextNode: () => appendTextNode,
|
|
67
|
+
markup: () => markup,
|
|
68
|
+
markupChapter: () => markupChapter
|
|
69
|
+
});
|
|
70
|
+
module.exports = __toCommonJS(markup_exports);
|
|
71
|
+
var import_promises = require("node:fs/promises");
|
|
72
|
+
var import_posix = require("node:path/posix");
|
|
73
|
+
var import_epub = require("@storyteller-platform/epub");
|
|
74
|
+
var import_ghost_story = require("@storyteller-platform/ghost-story");
|
|
75
|
+
var import_segmentation = require("./segmentation.cjs");
|
|
76
|
+
var import_semantics = require("./semantics.cjs");
|
|
77
|
+
/**
 * Copies the EPUB at `input` to `output` and marks up every spine item of the
 * copy in place, wrapping segmented sentences in identifiable elements.
 *
 * Returns a TimingAggregator summarizing per-chapter "mark up" timings, with
 * the chosen granularity recorded as metadata.
 *
 * The `_stack`/`__using`/`__callDispose` pattern below is the compiled form
 * of a `using` declaration: the Epub handle is disposed in `finally`, and any
 * error from the try block is rethrown by __callDispose after disposal.
 */
async function markup(input, output, options) {
  var _a, _b;
  var _stack = [];
  try {
    const timing = (0, import_ghost_story.createAggregator)();
    timing.setMetadata("granularity", options.granularity ?? "sentence");
    // Work on a copy so the source EPUB is never mutated.
    await (0, import_promises.copyFile)(input, output);
    const epub = __using(_stack, await import_epub.Epub.from(output));
    const primaryLocale = options.primaryLocale ?? await epub.getLanguage();
    const spine = await epub.getSpineItems();
    for (let index = 0; index < spine.length; index++) {
      // NOTE(review): progress is index / spine.length, so 1.0 is never
      // reported on completion — confirm callers don't expect a final 100%.
      (_a = options.onProgress) == null ? void 0 : _a.call(options, index / spine.length);
      const spineItem = spine[index];
      (_b = options.logger) == null ? void 0 : _b.info(
        `Marking up epub item #${index}: ${(0, import_posix.basename)(spineItem.href)}`
      );
      const chapterId = spineItem.id;
      const chapterXml = await epub.readXhtmlItemContents(chapterId);
      // Segment the chapter body into sentences in the resolved locale.
      const segmentation = await (0, import_segmentation.getXhtmlSegmentation)(
        import_epub.Epub.getXhtmlBody(chapterXml),
        { primaryLocale }
      );
      const { markedUp, timing: chapterTiming } = markupChapter(
        chapterId,
        chapterXml,
        segmentation
      );
      timing.add(chapterTiming.summary());
      await epub.writeXhtmlItemContents(chapterId, markedUp);
    }
    await epub.saveAndClose();
    return timing;
  } catch (_) {
    var _error = _, _hasError = true;
  } finally {
    __callDispose(_stack, _error, _hasError);
  }
}
|
|
115
|
+
/**
 * Marks up a single chapter's XML in place: the chapter's <body> is emptied
 * and its original contents are re-emitted with each segmented sentence
 * wrapped in an identifiable element (see appendLeafNode).
 *
 * Returns the mutated chapter XML plus a Timing covering the "mark up" step.
 * Throws if the document lacks an <html> or <body> element.
 */
function markupChapter(chapterId, chapterXml, segmentation) {
  const timing = (0, import_ghost_story.createTiming)();
  const html = import_epub.Epub.findXmlChildByName("html", chapterXml);
  if (!html) throw new Error("Invalid XHTML document: no html element");
  // Keep a reference to the ORIGINAL body before clearBodyElement replaces
  // it below: `body` is the read side of the walk, `taggedBody` the write side.
  const body = import_epub.Epub.findXmlChildByName("body", html["html"]);
  if (!body) throw new Error("Invalid XHTML document: No body element");
  clearBodyElement(chapterXml);
  const taggedHtml = import_epub.Epub.findXmlChildByName("html", chapterXml);
  const taggedBody = import_epub.Epub.findXmlChildByName("body", taggedHtml["html"]);
  taggedBody["body"] = [];
  timing.time("mark up", () => {
    markupBySegmentation(
      chapterId,
      {
        currentSentenceIndex: 0,
        currentNodeProgress: 0,
        currentSentenceProgress: 0
      },
      segmentation,
      body,
      /* @__PURE__ */ new Set(),
      [],
      import_epub.Epub.getXmlChildren(taggedBody)
    );
  });
  return { markedUp: chapterXml, timing };
}
|
|
142
|
+
/**
 * Recursively walks `currentNode` (from the original body) and re-emits its
 * content into `taggedXml`, aligning the document text against
 * `segmentation.sentences` so each sentence can be tagged.
 *
 * `state` fields:
 *  - currentSentenceIndex:    index of the sentence currently being matched
 *  - currentSentenceProgress: characters of that sentence already emitted
 *  - currentNodeProgress:     characters of the current text node consumed;
 *                             -1 means "this node is exhausted, move on"
 *
 * `marks` carries inline (non-block) ancestor elements to be re-wrapped
 * around emitted text; `taggedSentences` tracks sentence ids that already
 * received a tag. Returns the updated state.
 */
function markupBySegmentation(chapterId, state, segmentation, currentNode, taggedSentences, marks, taggedXml) {
  if (import_epub.Epub.isXmlTextNode(currentNode)) {
    // Untagged remainder of the sentence we are trying to match.
    const remainingSentence = segmentation.sentences[state.currentSentenceIndex].text.slice(state.currentSentenceProgress);
    const text = currentNode["#text"];
    const remainingNodeText = text.slice(state.currentNodeProgress);
    // Align node text with the sentence by locating the sentence's next
    // character. NOTE(review): this matches a single character, so it relies
    // on the segmentation text having been derived from this same document —
    // confirm against getXhtmlSegmentation.
    const index = remainingNodeText.indexOf(remainingSentence[0]);
    if (index === -1) {
      // No part of the current sentence appears here: emit the node text
      // untagged and mark the node consumed.
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText,
        marks,
        taggedSentences
      );
      return {
        ...state,
        currentNodeProgress: -1
      };
    }
    if (remainingNodeText.slice(index).length < remainingSentence.length) {
      // The node ends before the sentence does: emit the pre-sentence prefix
      // untagged, the tail tagged, and carry the partial progress forward so
      // the sentence continues in the next text node.
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText.slice(0, index),
        marks,
        taggedSentences
      );
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText.slice(index),
        marks,
        taggedSentences,
        state.currentSentenceIndex
      );
      return {
        ...state,
        currentSentenceProgress: state.currentSentenceProgress + remainingNodeText.length - index,
        currentNodeProgress: -1
      };
    }
    // The whole remaining sentence fits in this node: prefix untagged,
    // sentence tagged.
    appendTextNode(
      chapterId,
      taggedXml,
      remainingNodeText.slice(0, index),
      marks,
      taggedSentences
    );
    appendTextNode(
      chapterId,
      taggedXml,
      remainingSentence,
      marks,
      taggedSentences,
      state.currentSentenceIndex
    );
    if (state.currentSentenceIndex + 1 === segmentation.sentences.length) {
      // Last sentence: flush whatever trails it in this node, untagged.
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText.slice(index + remainingSentence.length),
        marks,
        taggedSentences
      );
    }
    return {
      currentSentenceIndex: state.currentSentenceIndex + 1,
      currentSentenceProgress: 0,
      // Advance by the prefix plus the sentence just consumed.
      currentNodeProgress: state.currentNodeProgress + remainingSentence.length + index
    };
  }
  // Element node: walk children, threading the state through.
  let nextState = {
    ...state
  };
  const children = import_epub.Epub.getXmlChildren(currentNode);
  for (const child of children) {
    // NOTE(review): currentSentenceIndex never exceeds sentences.length (the
    // while-loop below stops at `< length`), so this `> length + 1` guard
    // looks unreachable; it appears intended to copy children verbatim once
    // all sentences are tagged — confirm against the upstream TS source.
    if (nextState.currentSentenceIndex > segmentation.sentences.length + 1) {
      taggedXml.push(child);
      continue;
    }
    nextState.currentNodeProgress = 0;
    let nextTaggedXml = taggedXml;
    const nextMarks = [...marks];
    if (!import_epub.Epub.isXmlTextNode(child)) {
      const childTagName = import_epub.Epub.getXmlElementName(child);
      const isTextContent = import_semantics.BLOCKS.includes(childTagName.toLowerCase());
      if (import_epub.Epub.getXmlChildren(child).length === 0) {
        // Childless element (e.g. an empty tag): emit as-is; tag it with the
        // current sentence only when it falls mid-sentence inside inline flow.
        appendLeafNode(
          chapterId,
          taggedXml,
          child,
          nextMarks,
          taggedSentences,
          isTextContent || nextState.currentSentenceProgress === 0 ? void 0 : nextState.currentSentenceIndex
        );
        continue;
      }
      if (isTextContent) {
        // Block element: recreate it (preserving its ":@" attributes) in the
        // output and descend into its fresh child list.
        const block = {
          [childTagName]: [],
          ...child[":@"] && { ":@": child[":@"] }
        };
        nextTaggedXml.push(block);
        nextTaggedXml = import_epub.Epub.getXmlChildren(block);
      } else {
        // Inline element: do not recreate it here; record it as a mark to be
        // re-wrapped around each emitted text fragment.
        nextMarks.push({
          elementName: childTagName,
          attributes: child[":@"]
        });
      }
    }
    // Re-enter the child until either it is exhausted (progress === -1) or
    // every sentence has been matched.
    while (nextState.currentSentenceIndex < segmentation.sentences.length && nextState.currentNodeProgress !== -1) {
      nextState = markupBySegmentation(
        chapterId,
        nextState,
        segmentation,
        child,
        taggedSentences,
        nextMarks,
        nextTaggedXml
      );
    }
  }
  nextState.currentNodeProgress = -1;
  return nextState;
}
|
|
268
|
+
function appendTextNode(chapterId, xml, text, marks, taggedSentences, sentenceId) {
|
|
269
|
+
if (text.length === 0) return;
|
|
270
|
+
const textNode = { "#text": text };
|
|
271
|
+
appendLeafNode(chapterId, xml, textNode, marks, taggedSentences, sentenceId);
|
|
272
|
+
}
|
|
273
|
+
/**
 * Appends `node` to `xml`, re-wrapped (innermost-first) in the inline
 * elements recorded in `marks`. When `sentenceId` is provided, the node is
 * placed inside a <span id="{chapterId}-s{sentenceId}"> — either merged into
 * the immediately preceding sibling if it is that sentence's span, or a new
 * span if the sentence has not been tagged yet.
 */
function appendLeafNode(chapterId, xml, node, marks, taggedSentences, sentenceId) {
  var _a, _b;
  // When sentenceId is undefined this yields "...-sundefined"; harmless,
  // since such an id is never written to a span and so never matched below.
  const tagId = `${chapterId}-s${sentenceId}`;
  // Nest `node` inside each mark, outermost mark first (hence the reverse).
  const markedNode = [...marks].reverse().reduce(
    (acc, mark) => ({
      [mark.elementName]: [acc],
      ":@": mark.attributes
    }),
    node
  );
  const lastNode = xml[xml.length - 1];
  // Contiguous fragments of the same sentence share one span: if the previous
  // sibling already carries this sentence's id, append into it.
  if (lastNode && !import_epub.Epub.isXmlTextNode(lastNode) && ((_a = lastNode[":@"]) == null ? void 0 : _a["@_id"]) && lastNode[":@"]["@_id"] === tagId) {
    const tagName = import_epub.Epub.getXmlElementName(lastNode);
    (_b = lastNode[tagName]) == null ? void 0 : _b.push(markedNode);
    return;
  }
  // Untagged content, or a non-contiguous fragment of an already-tagged
  // sentence (presumably kept untagged to avoid duplicate id attributes),
  // is appended directly.
  if (sentenceId === void 0 || taggedSentences.has(sentenceId)) {
    xml.push(markedNode);
    return;
  }
  // First fragment of this sentence: open its span and record the id.
  const taggedNode = {
    span: [markedNode],
    ":@": { "@_id": tagId }
  };
  taggedSentences.add(sentenceId);
  xml.push(taggedNode);
}
|
|
300
|
+
// Replaces the document's <body> element with a copy whose child list is
// empty, preserving the body's other properties (e.g. attributes). Mutates
// `xml` in place; throws if no <html> or <body> element is present.
function clearBodyElement(xml) {
  const htmlElement = import_epub.Epub.findXmlChildByName("html", xml);
  if (!htmlElement) {
    throw new Error("Invalid XHTML: Found no html element");
  }
  const htmlChildren = htmlElement["html"];
  const bodyIndex = htmlChildren.findIndex((child) => "body" in child);
  const bodyElement = htmlChildren[bodyIndex];
  if (!bodyElement) {
    throw new Error("Invalid XHTML: Found no body element");
  }
  const emptiedBody = { ...bodyElement, body: [] };
  htmlChildren.splice(bodyIndex, 1, emptiedBody);
}
|
|
311
|
+
// Annotate the CommonJS export names for ESM import in node:
// (Intentionally dead code: the `0 &&` guard means this never runs, but
// Node's CJS named-export static analysis reads it to expose these names.)
0 && (module.exports = {
  appendTextNode,
  markup,
  markupChapter
});
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import * as _storyteller_platform_ghost_story from '@storyteller-platform/ghost-story';
import { TimingAggregator } from '@storyteller-platform/ghost-story';
import { SegmentationResult } from '@echogarden/text-segmentation';
import { Logger } from 'pino';
import { ParsedXml, ElementName } from '@storyteller-platform/epub';

/** Options accepted by {@link markup}. */
interface MarkupOptions {
    /** Tagging granularity; defaults to "sentence" when omitted. */
    granularity?: "word" | "sentence";
    /** Segmentation locale; falls back to the EPUB's declared language. */
    primaryLocale?: Intl.Locale;
    /** Called with a 0..1 fraction as spine items are processed. */
    onProgress?: (progress: number) => void;
    /** Optional pino logger for per-chapter progress messages. */
    logger?: Logger;
}
/**
 * Copies the EPUB at `input` to `output` and marks up every spine item of the
 * copy; resolves with aggregated per-chapter timing information.
 */
declare function markup(input: string, output: string, options: MarkupOptions): Promise<TimingAggregator>;
/**
 * Marks up a single chapter's XML in place using the provided sentence
 * segmentation; returns the mutated XML and the timing for the work.
 */
declare function markupChapter(chapterId: string, chapterXml: ParsedXml, segmentation: SegmentationResult): {
    markedUp: ParsedXml;
    timing: _storyteller_platform_ghost_story.Timing;
};
/** An inline ancestor element to re-wrap around emitted text fragments. */
type Mark = {
    elementName: ElementName;
    attributes: Record<string, string> | undefined;
};
/**
 * Appends `text` to `xml`, nested in `marks`; when `sentenceId` is given the
 * text is tagged into (or merged with) that sentence's span element.
 */
declare function appendTextNode(chapterId: string, xml: ParsedXml, text: string, marks: Mark[], taggedSentences: Set<number>, sentenceId?: number): void;

export { type MarkupOptions, appendTextNode, markup, markupChapter };
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import * as _storyteller_platform_ghost_story from '@storyteller-platform/ghost-story';
import { TimingAggregator } from '@storyteller-platform/ghost-story';
import { SegmentationResult } from '@echogarden/text-segmentation';
import { Logger } from 'pino';
import { ParsedXml, ElementName } from '@storyteller-platform/epub';

/** Options accepted by {@link markup}. */
interface MarkupOptions {
    /** Tagging granularity; defaults to "sentence" when omitted. */
    granularity?: "word" | "sentence";
    /** Segmentation locale; falls back to the EPUB's declared language. */
    primaryLocale?: Intl.Locale;
    /** Called with a 0..1 fraction as spine items are processed. */
    onProgress?: (progress: number) => void;
    /** Optional pino logger for per-chapter progress messages. */
    logger?: Logger;
}
/**
 * Copies the EPUB at `input` to `output` and marks up every spine item of the
 * copy; resolves with aggregated per-chapter timing information.
 */
declare function markup(input: string, output: string, options: MarkupOptions): Promise<TimingAggregator>;
/**
 * Marks up a single chapter's XML in place using the provided sentence
 * segmentation; returns the mutated XML and the timing for the work.
 */
declare function markupChapter(chapterId: string, chapterXml: ParsedXml, segmentation: SegmentationResult): {
    markedUp: ParsedXml;
    timing: _storyteller_platform_ghost_story.Timing;
};
/** An inline ancestor element to re-wrap around emitted text fragments. */
type Mark = {
    elementName: ElementName;
    attributes: Record<string, string> | undefined;
};
/**
 * Appends `text` to `xml`, nested in `marks`; when `sentenceId` is given the
 * text is tagged into (or merged with) that sentence's span element.
 */
declare function appendTextNode(chapterId: string, xml: ParsedXml, text: string, marks: Mark[], taggedSentences: Set<number>, sentenceId?: number): void;

export { type MarkupOptions, appendTextNode, markup, markupChapter };
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
import {
|
|
2
|
+
__callDispose,
|
|
3
|
+
__using
|
|
4
|
+
} from "../chunk-BIEQXUOY.js";
|
|
5
|
+
import { copyFile } from "node:fs/promises";
|
|
6
|
+
import { basename } from "node:path/posix";
|
|
7
|
+
import {
|
|
8
|
+
Epub
|
|
9
|
+
} from "@storyteller-platform/epub";
|
|
10
|
+
import {
|
|
11
|
+
createAggregator,
|
|
12
|
+
createTiming
|
|
13
|
+
} from "@storyteller-platform/ghost-story";
|
|
14
|
+
import { getXhtmlSegmentation } from "./segmentation.js";
|
|
15
|
+
import { BLOCKS } from "./semantics.js";
|
|
16
|
+
/**
 * Copies the EPUB at `input` to `output` and marks up every spine item of the
 * copy in place, wrapping segmented sentences in identifiable elements.
 *
 * Returns a TimingAggregator summarizing per-chapter "mark up" timings, with
 * the chosen granularity recorded as metadata.
 *
 * The `_stack`/`__using`/`__callDispose` pattern below is the compiled form
 * of a `using` declaration: the Epub handle is disposed in `finally`, and any
 * error from the try block is rethrown by __callDispose after disposal.
 */
async function markup(input, output, options) {
  var _a, _b;
  var _stack = [];
  try {
    const timing = createAggregator();
    timing.setMetadata("granularity", options.granularity ?? "sentence");
    // Work on a copy so the source EPUB is never mutated.
    await copyFile(input, output);
    const epub = __using(_stack, await Epub.from(output));
    const primaryLocale = options.primaryLocale ?? await epub.getLanguage();
    const spine = await epub.getSpineItems();
    for (let index = 0; index < spine.length; index++) {
      // NOTE(review): progress is index / spine.length, so 1.0 is never
      // reported on completion — confirm callers don't expect a final 100%.
      (_a = options.onProgress) == null ? void 0 : _a.call(options, index / spine.length);
      const spineItem = spine[index];
      (_b = options.logger) == null ? void 0 : _b.info(
        `Marking up epub item #${index}: ${basename(spineItem.href)}`
      );
      const chapterId = spineItem.id;
      const chapterXml = await epub.readXhtmlItemContents(chapterId);
      // Segment the chapter body into sentences in the resolved locale.
      const segmentation = await getXhtmlSegmentation(
        Epub.getXhtmlBody(chapterXml),
        { primaryLocale }
      );
      const { markedUp, timing: chapterTiming } = markupChapter(
        chapterId,
        chapterXml,
        segmentation
      );
      timing.add(chapterTiming.summary());
      await epub.writeXhtmlItemContents(chapterId, markedUp);
    }
    await epub.saveAndClose();
    return timing;
  } catch (_) {
    var _error = _, _hasError = true;
  } finally {
    __callDispose(_stack, _error, _hasError);
  }
}
|
|
54
|
+
// Marks up a single chapter's XML in place. The chapter's <body> is emptied
// and its original contents are re-emitted with each segmented sentence
// wrapped in an identifiable element; the work is timed under "mark up".
// Returns the mutated chapter XML together with its timing record.
function markupChapter(chapterId, chapterXml, segmentation) {
  const timing = createTiming();
  const originalHtml = Epub.findXmlChildByName("html", chapterXml);
  if (!originalHtml) {
    throw new Error("Invalid XHTML document: no html element");
  }
  // Hold onto the ORIGINAL body before it is replaced: it is the read side
  // of the walk, while the freshly cleared body is the write side.
  const originalBody = Epub.findXmlChildByName("body", originalHtml["html"]);
  if (!originalBody) {
    throw new Error("Invalid XHTML document: No body element");
  }
  clearBodyElement(chapterXml);
  const taggedHtml = Epub.findXmlChildByName("html", chapterXml);
  const taggedBody = Epub.findXmlChildByName("body", taggedHtml["html"]);
  taggedBody["body"] = [];
  const initialState = {
    currentSentenceIndex: 0,
    currentNodeProgress: 0,
    currentSentenceProgress: 0
  };
  timing.time("mark up", () => {
    markupBySegmentation(
      chapterId,
      initialState,
      segmentation,
      originalBody,
      new Set(),
      [],
      Epub.getXmlChildren(taggedBody)
    );
  });
  return { markedUp: chapterXml, timing };
}
|
|
81
|
+
/**
 * Recursively walks `currentNode` (from the original body) and re-emits its
 * content into `taggedXml`, aligning the document text against
 * `segmentation.sentences` so each sentence can be tagged.
 *
 * `state` fields:
 *  - currentSentenceIndex:    index of the sentence currently being matched
 *  - currentSentenceProgress: characters of that sentence already emitted
 *  - currentNodeProgress:     characters of the current text node consumed;
 *                             -1 means "this node is exhausted, move on"
 *
 * `marks` carries inline (non-block) ancestor elements to be re-wrapped
 * around emitted text; `taggedSentences` tracks sentence ids that already
 * received a tag. Returns the updated state.
 */
function markupBySegmentation(chapterId, state, segmentation, currentNode, taggedSentences, marks, taggedXml) {
  if (Epub.isXmlTextNode(currentNode)) {
    // Untagged remainder of the sentence we are trying to match.
    const remainingSentence = segmentation.sentences[state.currentSentenceIndex].text.slice(state.currentSentenceProgress);
    const text = currentNode["#text"];
    const remainingNodeText = text.slice(state.currentNodeProgress);
    // Align node text with the sentence by locating the sentence's next
    // character. NOTE(review): this matches a single character, so it relies
    // on the segmentation text having been derived from this same document —
    // confirm against getXhtmlSegmentation.
    const index = remainingNodeText.indexOf(remainingSentence[0]);
    if (index === -1) {
      // No part of the current sentence appears here: emit the node text
      // untagged and mark the node consumed.
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText,
        marks,
        taggedSentences
      );
      return {
        ...state,
        currentNodeProgress: -1
      };
    }
    if (remainingNodeText.slice(index).length < remainingSentence.length) {
      // The node ends before the sentence does: emit the pre-sentence prefix
      // untagged, the tail tagged, and carry the partial progress forward so
      // the sentence continues in the next text node.
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText.slice(0, index),
        marks,
        taggedSentences
      );
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText.slice(index),
        marks,
        taggedSentences,
        state.currentSentenceIndex
      );
      return {
        ...state,
        currentSentenceProgress: state.currentSentenceProgress + remainingNodeText.length - index,
        currentNodeProgress: -1
      };
    }
    // The whole remaining sentence fits in this node: prefix untagged,
    // sentence tagged.
    appendTextNode(
      chapterId,
      taggedXml,
      remainingNodeText.slice(0, index),
      marks,
      taggedSentences
    );
    appendTextNode(
      chapterId,
      taggedXml,
      remainingSentence,
      marks,
      taggedSentences,
      state.currentSentenceIndex
    );
    if (state.currentSentenceIndex + 1 === segmentation.sentences.length) {
      // Last sentence: flush whatever trails it in this node, untagged.
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText.slice(index + remainingSentence.length),
        marks,
        taggedSentences
      );
    }
    return {
      currentSentenceIndex: state.currentSentenceIndex + 1,
      currentSentenceProgress: 0,
      // Advance by the prefix plus the sentence just consumed.
      currentNodeProgress: state.currentNodeProgress + remainingSentence.length + index
    };
  }
  // Element node: walk children, threading the state through.
  let nextState = {
    ...state
  };
  const children = Epub.getXmlChildren(currentNode);
  for (const child of children) {
    // NOTE(review): currentSentenceIndex never exceeds sentences.length (the
    // while-loop below stops at `< length`), so this `> length + 1` guard
    // looks unreachable; it appears intended to copy children verbatim once
    // all sentences are tagged — confirm against the upstream TS source.
    if (nextState.currentSentenceIndex > segmentation.sentences.length + 1) {
      taggedXml.push(child);
      continue;
    }
    nextState.currentNodeProgress = 0;
    let nextTaggedXml = taggedXml;
    const nextMarks = [...marks];
    if (!Epub.isXmlTextNode(child)) {
      const childTagName = Epub.getXmlElementName(child);
      const isTextContent = BLOCKS.includes(childTagName.toLowerCase());
      if (Epub.getXmlChildren(child).length === 0) {
        // Childless element (e.g. an empty tag): emit as-is; tag it with the
        // current sentence only when it falls mid-sentence inside inline flow.
        appendLeafNode(
          chapterId,
          taggedXml,
          child,
          nextMarks,
          taggedSentences,
          isTextContent || nextState.currentSentenceProgress === 0 ? void 0 : nextState.currentSentenceIndex
        );
        continue;
      }
      if (isTextContent) {
        // Block element: recreate it (preserving its ":@" attributes) in the
        // output and descend into its fresh child list.
        const block = {
          [childTagName]: [],
          ...child[":@"] && { ":@": child[":@"] }
        };
        nextTaggedXml.push(block);
        nextTaggedXml = Epub.getXmlChildren(block);
      } else {
        // Inline element: do not recreate it here; record it as a mark to be
        // re-wrapped around each emitted text fragment.
        nextMarks.push({
          elementName: childTagName,
          attributes: child[":@"]
        });
      }
    }
    // Re-enter the child until either it is exhausted (progress === -1) or
    // every sentence has been matched.
    while (nextState.currentSentenceIndex < segmentation.sentences.length && nextState.currentNodeProgress !== -1) {
      nextState = markupBySegmentation(
        chapterId,
        nextState,
        segmentation,
        child,
        taggedSentences,
        nextMarks,
        nextTaggedXml
      );
    }
  }
  nextState.currentNodeProgress = -1;
  return nextState;
}
|
|
207
|
+
function appendTextNode(chapterId, xml, text, marks, taggedSentences, sentenceId) {
|
|
208
|
+
if (text.length === 0) return;
|
|
209
|
+
const textNode = { "#text": text };
|
|
210
|
+
appendLeafNode(chapterId, xml, textNode, marks, taggedSentences, sentenceId);
|
|
211
|
+
}
|
|
212
|
+
/**
 * Appends `node` to `xml`, re-wrapped (innermost-first) in the inline
 * elements recorded in `marks`. When `sentenceId` is provided, the node is
 * placed inside a <span id="{chapterId}-s{sentenceId}"> — either merged into
 * the immediately preceding sibling if it is that sentence's span, or a new
 * span if the sentence has not been tagged yet.
 */
function appendLeafNode(chapterId, xml, node, marks, taggedSentences, sentenceId) {
  var _a, _b;
  // When sentenceId is undefined this yields "...-sundefined"; harmless,
  // since such an id is never written to a span and so never matched below.
  const tagId = `${chapterId}-s${sentenceId}`;
  // Nest `node` inside each mark, outermost mark first (hence the reverse).
  const markedNode = [...marks].reverse().reduce(
    (acc, mark) => ({
      [mark.elementName]: [acc],
      ":@": mark.attributes
    }),
    node
  );
  const lastNode = xml[xml.length - 1];
  // Contiguous fragments of the same sentence share one span: if the previous
  // sibling already carries this sentence's id, append into it.
  if (lastNode && !Epub.isXmlTextNode(lastNode) && ((_a = lastNode[":@"]) == null ? void 0 : _a["@_id"]) && lastNode[":@"]["@_id"] === tagId) {
    const tagName = Epub.getXmlElementName(lastNode);
    (_b = lastNode[tagName]) == null ? void 0 : _b.push(markedNode);
    return;
  }
  // Untagged content, or a non-contiguous fragment of an already-tagged
  // sentence (presumably kept untagged to avoid duplicate id attributes),
  // is appended directly.
  if (sentenceId === void 0 || taggedSentences.has(sentenceId)) {
    xml.push(markedNode);
    return;
  }
  // First fragment of this sentence: open its span and record the id.
  const taggedNode = {
    span: [markedNode],
    ":@": { "@_id": tagId }
  };
  taggedSentences.add(sentenceId);
  xml.push(taggedNode);
}
|
|
239
|
+
// Replaces the document's <body> element with a copy whose child list is
// empty, preserving the body's other properties (e.g. attributes). Mutates
// `xml` in place; throws if no <html> or <body> element is present.
function clearBodyElement(xml) {
  const htmlElement = Epub.findXmlChildByName("html", xml);
  if (!htmlElement) {
    throw new Error("Invalid XHTML: Found no html element");
  }
  const htmlChildren = htmlElement["html"];
  const bodyIndex = htmlChildren.findIndex((child) => "body" in child);
  const bodyElement = htmlChildren[bodyIndex];
  if (!bodyElement) {
    throw new Error("Invalid XHTML: Found no body element");
  }
  const emptiedBody = { ...bodyElement, body: [] };
  htmlChildren.splice(bodyIndex, 1, emptiedBody);
}
|
|
250
|
+
export {
|
|
251
|
+
appendTextNode,
|
|
252
|
+
markup,
|
|
253
|
+
markupChapter
|
|
254
|
+
};
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
var parse_exports = {};
|
|
20
|
+
__export(parse_exports, {
|
|
21
|
+
markupCommand: () => markupCommand
|
|
22
|
+
});
|
|
23
|
+
module.exports = __toCommonJS(parse_exports);
|
|
24
|
+
var import_core = require("@optique/core");
|
|
25
|
+
var import_valueparser = require("@optique/run/valueparser");
|
|
26
|
+
var import_parse = require("../common/parse.cjs");
|
|
27
|
+
/**
 * Optique parser for the `markup` CLI subcommand.
 *
 * Parses two positional arguments — INPUT_PATH (an existing .epub file) and
 * OUTPUT_PATH (a .epub destination) — merged with the shared granularity,
 * language, and logging option groups from ../common/parse. The constant
 * `action: "markup"` lets the CLI dispatch on the matched subcommand.
 */
const markupCommand = (0, import_core.command)(
  "markup",
  (0, import_core.merge)(
    (0, import_core.object)({
      action: (0, import_core.constant)("markup"),
      input: (0, import_core.argument)(
        (0, import_valueparser.path)({
          // Input must already exist; output is allowed not to.
          mustExist: true,
          type: "file",
          extensions: [".epub"],
          metavar: "INPUT_PATH"
        })
      ),
      output: (0, import_core.argument)(
        (0, import_valueparser.path)({ type: "file", extensions: [".epub"], metavar: "OUTPUT_PATH" })
      )
    }),
    import_parse.granularityParser,
    import_parse.languageParser,
    import_parse.loggingParser
  ),
  {
    description: import_core.message`Mark up an EPUB file at the provided granularity level`
  }
);
// Annotate the CommonJS export names for ESM import in node:
// (Intentionally dead code: the `0 &&` guard means this never runs, but
// Node's CJS named-export static analysis reads it to expose this name.)
0 && (module.exports = {
  markupCommand
});
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import * as _optique_core from '@optique/core';

/**
 * Optique parser for the `markup` CLI subcommand: positional INPUT_PATH and
 * OUTPUT_PATH (.epub files) plus the shared granularity, language, and
 * logging option groups. The `action: "markup"` constant identifies the
 * matched subcommand to the CLI dispatcher.
 */
declare const markupCommand: _optique_core.Parser<"sync", {
    readonly action: "markup";
    readonly input: string;
    readonly output: string;
} & {
    readonly granularity: "word" | "sentence";
} & {
    readonly language: Intl.Locale | undefined;
} & {
    readonly noProgress: boolean;
    readonly logLevel: "silent" | "debug" | "info" | "warn" | "error";
    readonly time: boolean;
}, ["matched", string] | ["parsing", Record<string | symbol, unknown>] | undefined>;

export { markupCommand };
|