@storyteller-platform/align 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. package/LICENSE.txt +21 -0
  2. package/README.md +3 -0
  3. package/dist/align/align.cjs +525 -0
  4. package/dist/align/align.d.cts +58 -0
  5. package/dist/align/align.d.ts +58 -0
  6. package/dist/align/align.js +458 -0
  7. package/dist/align/fuzzy.cjs +164 -0
  8. package/dist/align/fuzzy.d.cts +6 -0
  9. package/dist/align/fuzzy.d.ts +6 -0
  10. package/dist/align/fuzzy.js +141 -0
  11. package/dist/align/getSentenceRanges.cjs +304 -0
  12. package/dist/align/getSentenceRanges.d.cts +31 -0
  13. package/dist/align/getSentenceRanges.d.ts +31 -0
  14. package/dist/align/getSentenceRanges.js +277 -0
  15. package/dist/align/parse.cjs +63 -0
  16. package/dist/align/parse.d.cts +30 -0
  17. package/dist/align/parse.d.ts +30 -0
  18. package/dist/align/parse.js +51 -0
  19. package/dist/chunk-BIEQXUOY.js +50 -0
  20. package/dist/cli/bin.cjs +368 -0
  21. package/dist/cli/bin.d.cts +1 -0
  22. package/dist/cli/bin.d.ts +1 -0
  23. package/dist/cli/bin.js +319 -0
  24. package/dist/common/ffmpeg.cjs +232 -0
  25. package/dist/common/ffmpeg.d.cts +33 -0
  26. package/dist/common/ffmpeg.d.ts +33 -0
  27. package/dist/common/ffmpeg.js +196 -0
  28. package/dist/common/logging.cjs +45 -0
  29. package/dist/common/logging.d.cts +5 -0
  30. package/dist/common/logging.d.ts +5 -0
  31. package/dist/common/logging.js +12 -0
  32. package/dist/common/parse.cjs +73 -0
  33. package/dist/common/parse.d.cts +28 -0
  34. package/dist/common/parse.d.ts +28 -0
  35. package/dist/common/parse.js +56 -0
  36. package/dist/common/shell.cjs +30 -0
  37. package/dist/common/shell.d.cts +3 -0
  38. package/dist/common/shell.d.ts +3 -0
  39. package/dist/common/shell.js +7 -0
  40. package/dist/index.cjs +37 -0
  41. package/dist/index.d.cts +12 -0
  42. package/dist/index.d.ts +12 -0
  43. package/dist/index.js +11 -0
  44. package/dist/markup/__tests__/markup.test.cjs +464 -0
  45. package/dist/markup/__tests__/markup.test.d.cts +2 -0
  46. package/dist/markup/__tests__/markup.test.d.ts +2 -0
  47. package/dist/markup/__tests__/markup.test.js +441 -0
  48. package/dist/markup/markup.cjs +316 -0
  49. package/dist/markup/markup.d.cts +24 -0
  50. package/dist/markup/markup.d.ts +24 -0
  51. package/dist/markup/markup.js +254 -0
  52. package/dist/markup/parse.cjs +55 -0
  53. package/dist/markup/parse.d.cts +17 -0
  54. package/dist/markup/parse.d.ts +17 -0
  55. package/dist/markup/parse.js +43 -0
  56. package/dist/markup/segmentation.cjs +87 -0
  57. package/dist/markup/segmentation.d.cts +8 -0
  58. package/dist/markup/segmentation.d.ts +8 -0
  59. package/dist/markup/segmentation.js +67 -0
  60. package/dist/markup/semantics.cjs +79 -0
  61. package/dist/markup/semantics.d.cts +6 -0
  62. package/dist/markup/semantics.d.ts +6 -0
  63. package/dist/markup/semantics.js +53 -0
  64. package/dist/process/AudioEncoding.cjs +16 -0
  65. package/dist/process/AudioEncoding.d.cts +8 -0
  66. package/dist/process/AudioEncoding.d.ts +8 -0
  67. package/dist/process/AudioEncoding.js +0 -0
  68. package/dist/process/__tests__/processAudiobook.test.cjs +232 -0
  69. package/dist/process/__tests__/processAudiobook.test.d.cts +2 -0
  70. package/dist/process/__tests__/processAudiobook.test.d.ts +2 -0
  71. package/dist/process/__tests__/processAudiobook.test.js +209 -0
  72. package/dist/process/mime.cjs +43 -0
  73. package/dist/process/mime.d.cts +3 -0
  74. package/dist/process/mime.d.ts +3 -0
  75. package/dist/process/mime.js +24 -0
  76. package/dist/process/parse.cjs +84 -0
  77. package/dist/process/parse.d.cts +28 -0
  78. package/dist/process/parse.d.ts +28 -0
  79. package/dist/process/parse.js +73 -0
  80. package/dist/process/processAudiobook.cjs +220 -0
  81. package/dist/process/processAudiobook.d.cts +24 -0
  82. package/dist/process/processAudiobook.d.ts +24 -0
  83. package/dist/process/processAudiobook.js +166 -0
  84. package/dist/process/ranges.cjs +203 -0
  85. package/dist/process/ranges.d.cts +15 -0
  86. package/dist/process/ranges.d.ts +15 -0
  87. package/dist/process/ranges.js +137 -0
  88. package/dist/transcribe/parse.cjs +149 -0
  89. package/dist/transcribe/parse.d.cts +114 -0
  90. package/dist/transcribe/parse.d.ts +114 -0
  91. package/dist/transcribe/parse.js +143 -0
  92. package/dist/transcribe/transcribe.cjs +400 -0
  93. package/dist/transcribe/transcribe.d.cts +41 -0
  94. package/dist/transcribe/transcribe.d.ts +41 -0
  95. package/dist/transcribe/transcribe.js +330 -0
  96. package/package.json +96 -0
@@ -0,0 +1,316 @@
"use strict";
// esbuild-generated CommonJS interop and `using`-declaration runtime helpers.
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Look up a well-known symbol (e.g. Symbol.dispose), falling back to the
// global symbol registry on runtimes that predate it.
var __knownSymbol = (name, symbol) => (symbol = Symbol[name]) ? symbol : Symbol.for("Symbol." + name);
var __typeError = (msg) => {
  throw TypeError(msg);
};
// Define enumerable getter properties on `target` for each entry of `all`.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
// Copy own properties of `from` onto `to` as live getters, skipping `except`
// and keys `to` already owns; used to assemble the CommonJS export object.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
// Runtime for a `using`/`await using` declaration: validate the resource,
// resolve its [Symbol.dispose]/[Symbol.asyncDispose] method, and record it on
// `stack` so __callDispose can run the disposers (LIFO) at scope exit.
var __using = (stack, value, async) => {
  if (value != null) {
    if (typeof value !== "object" && typeof value !== "function") __typeError("Object expected");
    var dispose, inner;
    if (async) dispose = value[__knownSymbol("asyncDispose")];
    if (dispose === void 0) {
      dispose = value[__knownSymbol("dispose")];
      if (async) inner = dispose;
    }
    if (typeof dispose !== "function") __typeError("Object not disposable");
    // A sync disposer used via `await using` must surface throws as rejections.
    if (inner) dispose = function() {
      try {
        inner.call(this);
      } catch (e) {
        return Promise.reject(e);
      }
    };
    stack.push([async, dispose, value]);
  } else if (async) {
    stack.push([async]);
  }
  return value;
};
// Run every recorded disposer in reverse order. Errors thrown while disposing
// are chained through SuppressedError (or a polyfilled equivalent) and the
// final error is rethrown, matching the explicit-resource-management spec.
var __callDispose = (stack, error, hasError) => {
  var E = typeof SuppressedError === "function" ? SuppressedError : function(e, s, m, _) {
    return _ = Error(m), _.name = "SuppressedError", _.error = e, _.suppressed = s, _;
  };
  var fail = (e) => error = hasError ? new E(e, error, "An error was suppressed during disposal") : (hasError = true, e);
  var next = (it) => {
    while (it = stack.pop()) {
      try {
        var result = it[1] && it[1].call(it[2]);
        if (it[0]) return Promise.resolve(result).then(next, (e) => (fail(e), next()));
      } catch (e) {
        fail(e);
      }
    }
    if (hasError) throw error;
  };
  return next();
};
// Export table: names are exposed as live getters via __export and the table
// becomes module.exports through __toCommonJS.
var markup_exports = {};
__export(markup_exports, {
  appendTextNode: () => appendTextNode,
  markup: () => markup,
  markupChapter: () => markupChapter
});
module.exports = __toCommonJS(markup_exports);
var import_promises = require("node:fs/promises");
var import_posix = require("node:path/posix");
var import_epub = require("@storyteller-platform/epub");
var import_ghost_story = require("@storyteller-platform/ghost-story");
var import_segmentation = require("./segmentation.cjs");
var import_semantics = require("./semantics.cjs");
// Mark up every spine item of the EPUB at `input`, writing the result to
// `output`; returns a timing aggregator holding per-chapter timing summaries.
// The try/catch/finally shape is the compiled form of a `using` declaration:
// the epub is always disposed, and __callDispose rethrows any captured error.
async function markup(input, output, options) {
  var _a, _b;
  var _stack = [];
  try {
    const timing = (0, import_ghost_story.createAggregator)();
    timing.setMetadata("granularity", options.granularity ?? "sentence");
    // Work on a copy so the input file is never modified in place.
    await (0, import_promises.copyFile)(input, output);
    const epub = __using(_stack, await import_epub.Epub.from(output));
    const primaryLocale = options.primaryLocale ?? await epub.getLanguage();
    const spine = await epub.getSpineItems();
    for (let index = 0; index < spine.length; index++) {
      (_a = options.onProgress) == null ? void 0 : _a.call(options, index / spine.length);
      const spineItem = spine[index];
      (_b = options.logger) == null ? void 0 : _b.info(
        `Marking up epub item #${index}: ${(0, import_posix.basename)(spineItem.href)}`
      );
      const chapterId = spineItem.id;
      const chapterXml = await epub.readXhtmlItemContents(chapterId);
      // Sentence-segment the chapter's body text, then tag the chapter XML
      // with the resulting sentence boundaries.
      const segmentation = await (0, import_segmentation.getXhtmlSegmentation)(
        import_epub.Epub.getXhtmlBody(chapterXml),
        { primaryLocale }
      );
      const { markedUp, timing: chapterTiming } = markupChapter(
        chapterId,
        chapterXml,
        segmentation
      );
      timing.add(chapterTiming.summary());
      await epub.writeXhtmlItemContents(chapterId, markedUp);
    }
    await epub.saveAndClose();
    return timing;
  } catch (_) {
    var _error = _, _hasError = true;
  } finally {
    __callDispose(_stack, _error, _hasError);
  }
}
// Rebuild one chapter's <body> with sentence-level markup: the original body
// is kept aside, the document's body is emptied, and markupBySegmentation
// copies content back in, wrapping each sentence in an identified span.
function markupChapter(chapterId, chapterXml, segmentation) {
  const timing = (0, import_ghost_story.createTiming)();
  const html = import_epub.Epub.findXmlChildByName("html", chapterXml);
  if (!html) throw new Error("Invalid XHTML document: no html element");
  const body = import_epub.Epub.findXmlChildByName("body", html["html"]);
  if (!body) throw new Error("Invalid XHTML document: No body element");
  clearBodyElement(chapterXml);
  // Re-resolve the (now emptied) body so output is appended into the live
  // document rather than the detached original.
  const taggedHtml = import_epub.Epub.findXmlChildByName("html", chapterXml);
  const taggedBody = import_epub.Epub.findXmlChildByName("body", taggedHtml["html"]);
  taggedBody["body"] = [];
  timing.time("mark up", () => {
    markupBySegmentation(
      chapterId,
      {
        currentSentenceIndex: 0,
        currentNodeProgress: 0,
        currentSentenceProgress: 0
      },
      segmentation,
      body,
      /* @__PURE__ */ new Set(),
      [],
      import_epub.Epub.getXmlChildren(taggedBody)
    );
  });
  return { markedUp: chapterXml, timing };
}
// Recursively copy `currentNode` (from the original chapter body) into
// `taggedXml`, wrapping each segmented sentence's text in an identified span.
//
// `state` fields:
//   currentSentenceIndex    - which sentence of `segmentation` is being matched
//   currentSentenceProgress - characters of that sentence already consumed
//   currentNodeProgress     - offset into the current text node; -1 signals to
//                             the caller's while-loop that the node is consumed
// `marks` accumulates inline (non-block) ancestor elements replayed around
// emitted leaves; `taggedSentences` tracks sentences whose span already exists.
// Returns the updated state.
function markupBySegmentation(chapterId, state, segmentation, currentNode, taggedSentences, marks, taggedXml) {
  if (import_epub.Epub.isXmlTextNode(currentNode)) {
    const remainingSentence = segmentation.sentences[state.currentSentenceIndex].text.slice(state.currentSentenceProgress);
    const text = currentNode["#text"];
    const remainingNodeText = text.slice(state.currentNodeProgress);
    // NOTE(review): alignment keys off the sentence's first character only;
    // presumably segmentation text is verbatim document text - confirm.
    const index = remainingNodeText.indexOf(remainingSentence[0]);
    if (index === -1) {
      // The sentence does not begin in this node: flush the text untagged.
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText,
        marks,
        taggedSentences
      );
      return {
        ...state,
        currentNodeProgress: -1
      };
    }
    if (remainingNodeText.slice(index).length < remainingSentence.length) {
      // The sentence continues past this node: emit the untagged prefix, then
      // the tagged tail, and record how much of the sentence was consumed.
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText.slice(0, index),
        marks,
        taggedSentences
      );
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText.slice(index),
        marks,
        taggedSentences,
        state.currentSentenceIndex
      );
      return {
        ...state,
        currentSentenceProgress: state.currentSentenceProgress + remainingNodeText.length - index,
        currentNodeProgress: -1
      };
    }
    // The sentence completes inside this node: untagged prefix, tagged sentence.
    appendTextNode(
      chapterId,
      taggedXml,
      remainingNodeText.slice(0, index),
      marks,
      taggedSentences
    );
    appendTextNode(
      chapterId,
      taggedXml,
      remainingSentence,
      marks,
      taggedSentences,
      state.currentSentenceIndex
    );
    if (state.currentSentenceIndex + 1 === segmentation.sentences.length) {
      // Last sentence: nothing further will consume this node, so flush the
      // trailing text now.
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText.slice(index + remainingSentence.length),
        marks,
        taggedSentences
      );
    }
    return {
      currentSentenceIndex: state.currentSentenceIndex + 1,
      currentSentenceProgress: 0,
      currentNodeProgress: state.currentNodeProgress + remainingSentence.length + index
    };
  }
  let nextState = {
    ...state
  };
  const children = import_epub.Epub.getXmlChildren(currentNode);
  for (const child of children) {
    // NOTE(review): this guard looks unreachable - currentSentenceIndex never
    // exceeds sentences.length; possibly `>=` was intended. Confirm upstream.
    if (nextState.currentSentenceIndex > segmentation.sentences.length + 1) {
      taggedXml.push(child);
      continue;
    }
    nextState.currentNodeProgress = 0;
    let nextTaggedXml = taggedXml;
    const nextMarks = [...marks];
    if (!import_epub.Epub.isXmlTextNode(child)) {
      const childTagName = import_epub.Epub.getXmlElementName(child);
      const isTextContent = import_semantics.BLOCKS.includes(childTagName.toLowerCase());
      if (import_epub.Epub.getXmlChildren(child).length === 0) {
        // Childless element (e.g. a void element): copy it through, tagging it
        // only when it falls mid-sentence inside inline content.
        appendLeafNode(
          chapterId,
          taggedXml,
          child,
          nextMarks,
          taggedSentences,
          isTextContent || nextState.currentSentenceProgress === 0 ? void 0 : nextState.currentSentenceIndex
        );
        continue;
      }
      if (isTextContent) {
        // Block element: recreate it in the output and recurse into it.
        const block = {
          [childTagName]: [],
          ...child[":@"] && { ":@": child[":@"] }
        };
        nextTaggedXml.push(block);
        nextTaggedXml = import_epub.Epub.getXmlChildren(block);
      } else {
        // Inline element: not recreated here; instead replayed around each
        // emitted leaf so sentence spans can cut across it.
        nextMarks.push({
          elementName: childTagName,
          attributes: child[":@"]
        });
      }
    }
    // Re-visit the same child until its text is consumed (-1) or sentences run out.
    while (nextState.currentSentenceIndex < segmentation.sentences.length && nextState.currentNodeProgress !== -1) {
      nextState = markupBySegmentation(
        chapterId,
        nextState,
        segmentation,
        child,
        taggedSentences,
        nextMarks,
        nextTaggedXml
      );
    }
  }
  nextState.currentNodeProgress = -1;
  return nextState;
}
// Append `text` to `xml` as a leaf text node, skipping empty strings.
// All mark-replay and sentence-span handling is delegated to appendLeafNode.
function appendTextNode(chapterId, xml, text, marks, taggedSentences, sentenceId) {
  if (!text.length) return;
  appendLeafNode(
    chapterId,
    xml,
    { "#text": text },
    marks,
    taggedSentences,
    sentenceId
  );
}
// Append a leaf node (text or childless element) to `xml`, re-wrapped in the
// inline `marks` and - when `sentenceId` is given - inside a span whose id is
// `${chapterId}-s${sentenceId}`. Consecutive pieces of the same sentence are
// appended into the existing span rather than opening a new one.
function appendLeafNode(chapterId, xml, node, marks, taggedSentences, sentenceId) {
  var _a, _b;
  // Harmless when sentenceId is undefined: tagId is then never used to tag.
  const tagId = `${chapterId}-s${sentenceId}`;
  // Rebuild the inline ancestor chain around the node (outermost mark first).
  const markedNode = [...marks].reverse().reduce(
    (acc, mark) => ({
      [mark.elementName]: [acc],
      ":@": mark.attributes
    }),
    node
  );
  const lastNode = xml[xml.length - 1];
  if (lastNode && !import_epub.Epub.isXmlTextNode(lastNode) && ((_a = lastNode[":@"]) == null ? void 0 : _a["@_id"]) && lastNode[":@"]["@_id"] === tagId) {
    // The previous sibling is this sentence's span: append into it.
    const tagName = import_epub.Epub.getXmlElementName(lastNode);
    (_b = lastNode[tagName]) == null ? void 0 : _b.push(markedNode);
    return;
  }
  if (sentenceId === void 0 || taggedSentences.has(sentenceId)) {
    // Untagged content, or this sentence's span was already emitted elsewhere.
    xml.push(markedNode);
    return;
  }
  // First piece of a new sentence: open its identified span.
  const taggedNode = {
    span: [markedNode],
    ":@": { "@_id": tagId }
  };
  taggedSentences.add(sentenceId);
  xml.push(taggedNode);
}
// Replace the document's <body> element in place with an empty copy that
// keeps every other property (e.g. attributes) of the original.
function clearBodyElement(xml) {
  const html = import_epub.Epub.findXmlChildByName("html", xml);
  if (!html) throw new Error("Invalid XHTML: Found no html element");
  const htmlChildren = html["html"];
  const bodyIndex = htmlChildren.findIndex((element) => "body" in element);
  const existingBody = htmlChildren[bodyIndex];
  if (!existingBody) throw new Error("Invalid XHTML: Found no body element");
  const emptiedBody = { ...existingBody, body: [] };
  htmlChildren[bodyIndex] = emptiedBody;
}
// Annotate the CommonJS export names for ESM import in node:
// (dead code - `0 &&` never executes; it only lets Node's CJS named-exports
// static analysis discover the export names)
0 && (module.exports = {
  appendTextNode,
  markup,
  markupChapter
});
@@ -0,0 +1,24 @@
import * as _storyteller_platform_ghost_story from '@storyteller-platform/ghost-story';
import { TimingAggregator } from '@storyteller-platform/ghost-story';
import { SegmentationResult } from '@echogarden/text-segmentation';
import { Logger } from 'pino';
import { ParsedXml, ElementName } from '@storyteller-platform/epub';

/** Options accepted by {@link markup}. */
interface MarkupOptions {
    /** Markup granularity; the implementation defaults to "sentence". */
    granularity?: "word" | "sentence";
    /** Segmentation locale; falls back to the EPUB's declared language. */
    primaryLocale?: Intl.Locale;
    /** Invoked with a 0-1 fraction as spine items are processed. */
    onProgress?: (progress: number) => void;
    logger?: Logger;
}
/**
 * Mark up the EPUB at `input`, writing the result to `output`.
 * @returns aggregated timing information for the run.
 */
declare function markup(input: string, output: string, options: MarkupOptions): Promise<TimingAggregator>;
/** Mark up a single chapter's XML using a precomputed sentence segmentation. */
declare function markupChapter(chapterId: string, chapterXml: ParsedXml, segmentation: SegmentationResult): {
    markedUp: ParsedXml;
    timing: _storyteller_platform_ghost_story.Timing;
};
/** An inline ancestor element replayed around emitted text leaves. */
type Mark = {
    elementName: ElementName;
    attributes: Record<string, string> | undefined;
};
/** Append `text` to `xml`, wrapped in `marks` and, when `sentenceId` is given, a sentence span. */
declare function appendTextNode(chapterId: string, xml: ParsedXml, text: string, marks: Mark[], taggedSentences: Set<number>, sentenceId?: number): void;

export { type MarkupOptions, appendTextNode, markup, markupChapter };
@@ -0,0 +1,24 @@
import * as _storyteller_platform_ghost_story from '@storyteller-platform/ghost-story';
import { TimingAggregator } from '@storyteller-platform/ghost-story';
import { SegmentationResult } from '@echogarden/text-segmentation';
import { Logger } from 'pino';
import { ParsedXml, ElementName } from '@storyteller-platform/epub';

/** Options accepted by {@link markup}. */
interface MarkupOptions {
    /** Markup granularity; the implementation defaults to "sentence". */
    granularity?: "word" | "sentence";
    /** Segmentation locale; falls back to the EPUB's declared language. */
    primaryLocale?: Intl.Locale;
    /** Invoked with a 0-1 fraction as spine items are processed. */
    onProgress?: (progress: number) => void;
    logger?: Logger;
}
/**
 * Mark up the EPUB at `input`, writing the result to `output`.
 * @returns aggregated timing information for the run.
 */
declare function markup(input: string, output: string, options: MarkupOptions): Promise<TimingAggregator>;
/** Mark up a single chapter's XML using a precomputed sentence segmentation. */
declare function markupChapter(chapterId: string, chapterXml: ParsedXml, segmentation: SegmentationResult): {
    markedUp: ParsedXml;
    timing: _storyteller_platform_ghost_story.Timing;
};
/** An inline ancestor element replayed around emitted text leaves. */
type Mark = {
    elementName: ElementName;
    attributes: Record<string, string> | undefined;
};
/** Append `text` to `xml`, wrapped in `marks` and, when `sentenceId` is given, a sentence span. */
declare function appendTextNode(chapterId: string, xml: ParsedXml, text: string, marks: Mark[], taggedSentences: Set<number>, sentenceId?: number): void;

export { type MarkupOptions, appendTextNode, markup, markupChapter };
@@ -0,0 +1,254 @@
1
+ import {
2
+ __callDispose,
3
+ __using
4
+ } from "../chunk-BIEQXUOY.js";
5
+ import { copyFile } from "node:fs/promises";
6
+ import { basename } from "node:path/posix";
7
+ import {
8
+ Epub
9
+ } from "@storyteller-platform/epub";
10
+ import {
11
+ createAggregator,
12
+ createTiming
13
+ } from "@storyteller-platform/ghost-story";
14
+ import { getXhtmlSegmentation } from "./segmentation.js";
15
+ import { BLOCKS } from "./semantics.js";
// Mark up every spine item of the EPUB at `input`, writing the result to
// `output`; returns a timing aggregator holding per-chapter timing summaries.
// The try/catch/finally shape is the compiled form of a `using` declaration:
// the epub is always disposed, and __callDispose rethrows any captured error.
async function markup(input, output, options) {
  var _a, _b;
  var _stack = [];
  try {
    const timing = createAggregator();
    timing.setMetadata("granularity", options.granularity ?? "sentence");
    // Work on a copy so the input file is never modified in place.
    await copyFile(input, output);
    const epub = __using(_stack, await Epub.from(output));
    const primaryLocale = options.primaryLocale ?? await epub.getLanguage();
    const spine = await epub.getSpineItems();
    for (let index = 0; index < spine.length; index++) {
      (_a = options.onProgress) == null ? void 0 : _a.call(options, index / spine.length);
      const spineItem = spine[index];
      (_b = options.logger) == null ? void 0 : _b.info(
        `Marking up epub item #${index}: ${basename(spineItem.href)}`
      );
      const chapterId = spineItem.id;
      const chapterXml = await epub.readXhtmlItemContents(chapterId);
      // Sentence-segment the chapter's body text, then tag the chapter XML
      // with the resulting sentence boundaries.
      const segmentation = await getXhtmlSegmentation(
        Epub.getXhtmlBody(chapterXml),
        { primaryLocale }
      );
      const { markedUp, timing: chapterTiming } = markupChapter(
        chapterId,
        chapterXml,
        segmentation
      );
      timing.add(chapterTiming.summary());
      await epub.writeXhtmlItemContents(chapterId, markedUp);
    }
    await epub.saveAndClose();
    return timing;
  } catch (_) {
    var _error = _, _hasError = true;
  } finally {
    __callDispose(_stack, _error, _hasError);
  }
}
// Rebuild one chapter's <body> with sentence-level markup: the original body
// is kept aside, the document's body is emptied, and markupBySegmentation
// copies content back in, wrapping each sentence in an identified span.
function markupChapter(chapterId, chapterXml, segmentation) {
  const timing = createTiming();
  const html = Epub.findXmlChildByName("html", chapterXml);
  if (!html) throw new Error("Invalid XHTML document: no html element");
  const body = Epub.findXmlChildByName("body", html["html"]);
  if (!body) throw new Error("Invalid XHTML document: No body element");
  clearBodyElement(chapterXml);
  // Re-resolve the (now emptied) body so output is appended into the live
  // document rather than the detached original.
  const taggedHtml = Epub.findXmlChildByName("html", chapterXml);
  const taggedBody = Epub.findXmlChildByName("body", taggedHtml["html"]);
  taggedBody["body"] = [];
  timing.time("mark up", () => {
    markupBySegmentation(
      chapterId,
      {
        currentSentenceIndex: 0,
        currentNodeProgress: 0,
        currentSentenceProgress: 0
      },
      segmentation,
      body,
      /* @__PURE__ */ new Set(),
      [],
      Epub.getXmlChildren(taggedBody)
    );
  });
  return { markedUp: chapterXml, timing };
}
// Recursively copy `currentNode` (from the original chapter body) into
// `taggedXml`, wrapping each segmented sentence's text in an identified span.
//
// `state` fields:
//   currentSentenceIndex    - which sentence of `segmentation` is being matched
//   currentSentenceProgress - characters of that sentence already consumed
//   currentNodeProgress     - offset into the current text node; -1 signals to
//                             the caller's while-loop that the node is consumed
// `marks` accumulates inline (non-block) ancestor elements replayed around
// emitted leaves; `taggedSentences` tracks sentences whose span already exists.
// Returns the updated state.
function markupBySegmentation(chapterId, state, segmentation, currentNode, taggedSentences, marks, taggedXml) {
  if (Epub.isXmlTextNode(currentNode)) {
    const remainingSentence = segmentation.sentences[state.currentSentenceIndex].text.slice(state.currentSentenceProgress);
    const text = currentNode["#text"];
    const remainingNodeText = text.slice(state.currentNodeProgress);
    // NOTE(review): alignment keys off the sentence's first character only;
    // presumably segmentation text is verbatim document text - confirm.
    const index = remainingNodeText.indexOf(remainingSentence[0]);
    if (index === -1) {
      // The sentence does not begin in this node: flush the text untagged.
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText,
        marks,
        taggedSentences
      );
      return {
        ...state,
        currentNodeProgress: -1
      };
    }
    if (remainingNodeText.slice(index).length < remainingSentence.length) {
      // The sentence continues past this node: emit the untagged prefix, then
      // the tagged tail, and record how much of the sentence was consumed.
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText.slice(0, index),
        marks,
        taggedSentences
      );
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText.slice(index),
        marks,
        taggedSentences,
        state.currentSentenceIndex
      );
      return {
        ...state,
        currentSentenceProgress: state.currentSentenceProgress + remainingNodeText.length - index,
        currentNodeProgress: -1
      };
    }
    // The sentence completes inside this node: untagged prefix, tagged sentence.
    appendTextNode(
      chapterId,
      taggedXml,
      remainingNodeText.slice(0, index),
      marks,
      taggedSentences
    );
    appendTextNode(
      chapterId,
      taggedXml,
      remainingSentence,
      marks,
      taggedSentences,
      state.currentSentenceIndex
    );
    if (state.currentSentenceIndex + 1 === segmentation.sentences.length) {
      // Last sentence: nothing further will consume this node, so flush the
      // trailing text now.
      appendTextNode(
        chapterId,
        taggedXml,
        remainingNodeText.slice(index + remainingSentence.length),
        marks,
        taggedSentences
      );
    }
    return {
      currentSentenceIndex: state.currentSentenceIndex + 1,
      currentSentenceProgress: 0,
      currentNodeProgress: state.currentNodeProgress + remainingSentence.length + index
    };
  }
  let nextState = {
    ...state
  };
  const children = Epub.getXmlChildren(currentNode);
  for (const child of children) {
    // NOTE(review): this guard looks unreachable - currentSentenceIndex never
    // exceeds sentences.length; possibly `>=` was intended. Confirm upstream.
    if (nextState.currentSentenceIndex > segmentation.sentences.length + 1) {
      taggedXml.push(child);
      continue;
    }
    nextState.currentNodeProgress = 0;
    let nextTaggedXml = taggedXml;
    const nextMarks = [...marks];
    if (!Epub.isXmlTextNode(child)) {
      const childTagName = Epub.getXmlElementName(child);
      const isTextContent = BLOCKS.includes(childTagName.toLowerCase());
      if (Epub.getXmlChildren(child).length === 0) {
        // Childless element (e.g. a void element): copy it through, tagging it
        // only when it falls mid-sentence inside inline content.
        appendLeafNode(
          chapterId,
          taggedXml,
          child,
          nextMarks,
          taggedSentences,
          isTextContent || nextState.currentSentenceProgress === 0 ? void 0 : nextState.currentSentenceIndex
        );
        continue;
      }
      if (isTextContent) {
        // Block element: recreate it in the output and recurse into it.
        const block = {
          [childTagName]: [],
          ...child[":@"] && { ":@": child[":@"] }
        };
        nextTaggedXml.push(block);
        nextTaggedXml = Epub.getXmlChildren(block);
      } else {
        // Inline element: not recreated here; instead replayed around each
        // emitted leaf so sentence spans can cut across it.
        nextMarks.push({
          elementName: childTagName,
          attributes: child[":@"]
        });
      }
    }
    // Re-visit the same child until its text is consumed (-1) or sentences run out.
    while (nextState.currentSentenceIndex < segmentation.sentences.length && nextState.currentNodeProgress !== -1) {
      nextState = markupBySegmentation(
        chapterId,
        nextState,
        segmentation,
        child,
        taggedSentences,
        nextMarks,
        nextTaggedXml
      );
    }
  }
  nextState.currentNodeProgress = -1;
  return nextState;
}
// Append `text` to `xml` as a leaf text node, skipping empty strings.
// All mark-replay and sentence-span handling is delegated to appendLeafNode.
function appendTextNode(chapterId, xml, text, marks, taggedSentences, sentenceId) {
  if (!text.length) return;
  appendLeafNode(
    chapterId,
    xml,
    { "#text": text },
    marks,
    taggedSentences,
    sentenceId
  );
}
// Append a leaf node (text or childless element) to `xml`, re-wrapped in the
// inline `marks` and - when `sentenceId` is given - inside a span whose id is
// `${chapterId}-s${sentenceId}`. Consecutive pieces of the same sentence are
// appended into the existing span rather than opening a new one.
function appendLeafNode(chapterId, xml, node, marks, taggedSentences, sentenceId) {
  var _a, _b;
  // Harmless when sentenceId is undefined: tagId is then never used to tag.
  const tagId = `${chapterId}-s${sentenceId}`;
  // Rebuild the inline ancestor chain around the node (outermost mark first).
  const markedNode = [...marks].reverse().reduce(
    (acc, mark) => ({
      [mark.elementName]: [acc],
      ":@": mark.attributes
    }),
    node
  );
  const lastNode = xml[xml.length - 1];
  if (lastNode && !Epub.isXmlTextNode(lastNode) && ((_a = lastNode[":@"]) == null ? void 0 : _a["@_id"]) && lastNode[":@"]["@_id"] === tagId) {
    // The previous sibling is this sentence's span: append into it.
    const tagName = Epub.getXmlElementName(lastNode);
    (_b = lastNode[tagName]) == null ? void 0 : _b.push(markedNode);
    return;
  }
  if (sentenceId === void 0 || taggedSentences.has(sentenceId)) {
    // Untagged content, or this sentence's span was already emitted elsewhere.
    xml.push(markedNode);
    return;
  }
  // First piece of a new sentence: open its identified span.
  const taggedNode = {
    span: [markedNode],
    ":@": { "@_id": tagId }
  };
  taggedSentences.add(sentenceId);
  xml.push(taggedNode);
}
// Replace the document's <body> element in place with an empty copy that
// keeps every other property (e.g. attributes) of the original.
function clearBodyElement(xml) {
  const html = Epub.findXmlChildByName("html", xml);
  if (!html) throw new Error("Invalid XHTML: Found no html element");
  const htmlChildren = html["html"];
  const bodyIndex = htmlChildren.findIndex((element) => "body" in element);
  const existingBody = htmlChildren[bodyIndex];
  if (!existingBody) throw new Error("Invalid XHTML: Found no body element");
  const emptiedBody = { ...existingBody, body: [] };
  htmlChildren[bodyIndex] = emptiedBody;
}
// Public module surface.
export {
  appendTextNode,
  markup,
  markupChapter
};
@@ -0,0 +1,55 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
// Export table + CJS interop wiring.
var parse_exports = {};
__export(parse_exports, {
  markupCommand: () => markupCommand
});
module.exports = __toCommonJS(parse_exports);
var import_core = require("@optique/core");
var import_valueparser = require("@optique/run/valueparser");
var import_parse = require("../common/parse.cjs");
// CLI parser for `markup INPUT_PATH OUTPUT_PATH`: the input must be an
// existing .epub file; the shared granularity/language/logging option
// parsers are merged into the command's result object.
const markupCommand = (0, import_core.command)(
  "markup",
  (0, import_core.merge)(
    (0, import_core.object)({
      action: (0, import_core.constant)("markup"),
      input: (0, import_core.argument)(
        (0, import_valueparser.path)({
          mustExist: true,
          type: "file",
          extensions: [".epub"],
          metavar: "INPUT_PATH"
        })
      ),
      output: (0, import_core.argument)(
        (0, import_valueparser.path)({ type: "file", extensions: [".epub"], metavar: "OUTPUT_PATH" })
      )
    }),
    import_parse.granularityParser,
    import_parse.languageParser,
    import_parse.loggingParser
  ),
  {
    description: import_core.message`Mark up an EPUB file at the provided granularity level`
  }
);
// Annotate the CommonJS export names for ESM import in node:
// (dead code - `0 &&` never executes; it only aids static analysis)
0 && (module.exports = {
  markupCommand
});
@@ -0,0 +1,17 @@
import * as _optique_core from '@optique/core';

/**
 * Parser for the `markup` CLI subcommand: positional INPUT_PATH/OUTPUT_PATH
 * arguments merged with the shared granularity, language, and logging flags.
 */
declare const markupCommand: _optique_core.Parser<"sync", {
    readonly action: "markup";
    readonly input: string;
    readonly output: string;
} & {
    readonly granularity: "word" | "sentence";
} & {
    readonly language: Intl.Locale | undefined;
} & {
    readonly noProgress: boolean;
    readonly logLevel: "silent" | "debug" | "info" | "warn" | "error";
    readonly time: boolean;
}, ["matched", string] | ["parsing", Record<string | symbol, unknown>] | undefined>;

export { markupCommand };