@storyteller-platform/align 0.1.9 → 0.1.10

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (111)
  1. package/dist/align/__tests__/align.test.cjs +6 -5
  2. package/dist/align/__tests__/align.test.js +6 -5
  3. package/dist/align/align.cjs +133 -81
  4. package/dist/align/align.d.cts +1 -0
  5. package/dist/align/align.d.ts +1 -0
  6. package/dist/align/align.js +133 -81
  7. package/dist/align/getSentenceRanges.cjs +78 -149
  8. package/dist/align/getSentenceRanges.d.cts +1 -1
  9. package/dist/align/getSentenceRanges.d.ts +1 -1
  10. package/dist/align/getSentenceRanges.js +78 -149
  11. package/dist/errorAlign/__tests__/errorAlign.test.cjs +100 -0
  12. package/dist/errorAlign/__tests__/errorAlign.test.d.cts +2 -0
  13. package/dist/errorAlign/__tests__/errorAlign.test.d.ts +2 -0
  14. package/dist/errorAlign/__tests__/errorAlign.test.js +77 -0
  15. package/dist/errorAlign/__tests__/native.test.cjs +118 -0
  16. package/dist/errorAlign/__tests__/native.test.d.cts +2 -0
  17. package/dist/errorAlign/__tests__/native.test.d.ts +2 -0
  18. package/dist/errorAlign/__tests__/native.test.js +107 -0
  19. package/dist/errorAlign/backtraceGraph.cjs +298 -0
  20. package/dist/errorAlign/backtraceGraph.d.cts +103 -0
  21. package/dist/errorAlign/backtraceGraph.d.ts +103 -0
  22. package/dist/errorAlign/backtraceGraph.js +270 -0
  23. package/dist/errorAlign/beamSearch.cjs +302 -0
  24. package/dist/errorAlign/beamSearch.d.cts +53 -0
  25. package/dist/errorAlign/beamSearch.d.ts +53 -0
  26. package/dist/errorAlign/beamSearch.js +268 -0
  27. package/dist/errorAlign/core.cjs +33 -0
  28. package/dist/errorAlign/core.d.cts +5 -0
  29. package/dist/errorAlign/core.d.ts +5 -0
  30. package/dist/errorAlign/core.js +11 -0
  31. package/dist/errorAlign/editDistance.cjs +115 -0
  32. package/dist/errorAlign/editDistance.d.cts +46 -0
  33. package/dist/errorAlign/editDistance.d.ts +46 -0
  34. package/dist/errorAlign/editDistance.js +90 -0
  35. package/dist/errorAlign/errorAlign.cjs +159 -0
  36. package/dist/errorAlign/errorAlign.d.cts +15 -0
  37. package/dist/errorAlign/errorAlign.d.ts +15 -0
  38. package/dist/errorAlign/errorAlign.js +145 -0
  39. package/dist/errorAlign/graphMetadata.cjs +97 -0
  40. package/dist/errorAlign/graphMetadata.d.cts +44 -0
  41. package/dist/errorAlign/graphMetadata.d.ts +44 -0
  42. package/dist/errorAlign/graphMetadata.js +64 -0
  43. package/dist/errorAlign/hash.cjs +173 -0
  44. package/dist/errorAlign/hash.d.cts +28 -0
  45. package/dist/errorAlign/hash.d.ts +28 -0
  46. package/dist/errorAlign/hash.js +150 -0
  47. package/dist/errorAlign/native.cjs +60 -0
  48. package/dist/errorAlign/native.d.cts +18 -0
  49. package/dist/errorAlign/native.d.ts +18 -0
  50. package/dist/errorAlign/native.js +24 -0
  51. package/dist/errorAlign/node-gyp-build.d.cjs +1 -0
  52. package/dist/errorAlign/node-gyp-build.d.d.cts +3 -0
  53. package/dist/errorAlign/node-gyp-build.d.d.ts +3 -0
  54. package/dist/errorAlign/node-gyp-build.d.js +0 -0
  55. package/dist/errorAlign/pathToAlignment.cjs +122 -0
  56. package/dist/errorAlign/pathToAlignment.d.cts +11 -0
  57. package/dist/errorAlign/pathToAlignment.d.ts +11 -0
  58. package/dist/errorAlign/pathToAlignment.js +89 -0
  59. package/dist/errorAlign/utils.cjs +301 -0
  60. package/dist/errorAlign/utils.d.cts +107 -0
  61. package/dist/errorAlign/utils.d.ts +107 -0
  62. package/dist/errorAlign/utils.js +248 -0
  63. package/dist/index.d.cts +1 -0
  64. package/dist/index.d.ts +1 -0
  65. package/dist/markup/__tests__/markup.test.cjs +108 -81
  66. package/dist/markup/__tests__/markup.test.js +109 -82
  67. package/dist/markup/__tests__/parseDom.test.cjs +112 -0
  68. package/dist/markup/__tests__/parseDom.test.d.cts +2 -0
  69. package/dist/markup/__tests__/parseDom.test.d.ts +2 -0
  70. package/dist/markup/__tests__/parseDom.test.js +89 -0
  71. package/dist/markup/__tests__/serializeDom.test.cjs +120 -0
  72. package/dist/markup/__tests__/serializeDom.test.d.cts +2 -0
  73. package/dist/markup/__tests__/serializeDom.test.d.ts +2 -0
  74. package/dist/markup/__tests__/serializeDom.test.js +97 -0
  75. package/dist/markup/__tests__/transform.test.cjs +122 -0
  76. package/dist/markup/__tests__/transform.test.d.cts +2 -0
  77. package/dist/markup/__tests__/transform.test.d.ts +2 -0
  78. package/dist/markup/__tests__/transform.test.js +99 -0
  79. package/dist/markup/map.cjs +261 -0
  80. package/dist/markup/map.d.cts +50 -0
  81. package/dist/markup/map.d.ts +50 -0
  82. package/dist/markup/map.js +236 -0
  83. package/dist/markup/markup.cjs +23 -201
  84. package/dist/markup/markup.d.cts +5 -9
  85. package/dist/markup/markup.d.ts +5 -9
  86. package/dist/markup/markup.js +24 -203
  87. package/dist/markup/model.cjs +172 -0
  88. package/dist/markup/model.d.cts +57 -0
  89. package/dist/markup/model.d.ts +57 -0
  90. package/dist/markup/model.js +145 -0
  91. package/dist/markup/parseDom.cjs +59 -0
  92. package/dist/markup/parseDom.d.cts +7 -0
  93. package/dist/markup/parseDom.d.ts +7 -0
  94. package/dist/markup/parseDom.js +35 -0
  95. package/dist/markup/segmentation.cjs +11 -57
  96. package/dist/markup/segmentation.d.cts +6 -2
  97. package/dist/markup/segmentation.d.ts +6 -2
  98. package/dist/markup/segmentation.js +11 -58
  99. package/dist/markup/serializeDom.cjs +87 -0
  100. package/dist/markup/serializeDom.d.cts +7 -0
  101. package/dist/markup/serializeDom.d.ts +7 -0
  102. package/dist/markup/serializeDom.js +63 -0
  103. package/dist/markup/transform.cjs +92 -0
  104. package/dist/markup/transform.d.cts +11 -0
  105. package/dist/markup/transform.d.ts +11 -0
  106. package/dist/markup/transform.js +71 -0
  107. package/dist/types/node-gyp-build.d.cjs +1 -0
  108. package/dist/types/node-gyp-build.d.d.cts +3 -0
  109. package/dist/types/node-gyp-build.d.d.ts +3 -0
  110. package/dist/types/node-gyp-build.d.js +0 -0
  111. package/package.json +11 -4
@@ -63,7 +63,6 @@ var __callDispose = (stack, error, hasError) => {
63
63
  };
64
64
  var markup_exports = {};
65
65
  __export(markup_exports, {
66
- appendTextNode: () => appendTextNode,
67
66
  markup: () => markup,
68
67
  markupChapter: () => markupChapter
69
68
  });
@@ -72,8 +71,11 @@ var import_promises = require("node:fs/promises");
72
71
  var import_posix = require("node:path/posix");
73
72
  var import_epub = require("@storyteller-platform/epub");
74
73
  var import_ghost_story = require("@storyteller-platform/ghost-story");
74
+ var import_model = require("./model.cjs");
75
+ var import_parseDom = require("./parseDom.cjs");
75
76
  var import_segmentation = require("./segmentation.cjs");
76
- var import_semantics = require("./semantics.cjs");
77
+ var import_serializeDom = require("./serializeDom.cjs");
78
+ var import_transform = require("./transform.cjs");
77
79
  async function markup(input, output, options) {
78
80
  var _a, _b, _c, _d;
79
81
  var _stack = [];
@@ -96,14 +98,15 @@ async function markup(input, output, options) {
96
98
  continue;
97
99
  }
98
100
  const chapterXml = await epub.readXhtmlItemContents(chapterId);
99
- const segmentation = await (0, import_segmentation.getXhtmlSegmentation)(
101
+ const { result: segmentation, mapping } = await (0, import_segmentation.getXhtmlSegmentation)(
100
102
  import_epub.Epub.getXhtmlBody(chapterXml),
101
103
  { primaryLocale }
102
104
  );
103
105
  const { markedUp, timing: chapterTiming } = markupChapter(
104
106
  chapterId,
105
107
  chapterXml,
106
- segmentation
108
+ segmentation,
109
+ mapping
107
110
  );
108
111
  timing.add(chapterTiming.summary());
109
112
  await epub.writeXhtmlItemContents(chapterId, markedUp);
@@ -116,7 +119,7 @@ async function markup(input, output, options) {
116
119
  __callDispose(_stack, _error, _hasError);
117
120
  }
118
121
  }
119
- function markupChapter(chapterId, chapterXml, segmentation) {
122
+ function markupChapter(chapterId, chapterXml, segmentation, mapping) {
120
123
  const timing = (0, import_ghost_story.createTiming)();
121
124
  const html = import_epub.Epub.findXmlChildByName("html", chapterXml);
122
125
  if (!html) throw new Error("Invalid XHTML document: no html element");
@@ -125,205 +128,25 @@ function markupChapter(chapterId, chapterXml, segmentation) {
125
128
  clearBodyElement(chapterXml);
126
129
  const taggedHtml = import_epub.Epub.findXmlChildByName("html", chapterXml);
127
130
  const taggedBody = import_epub.Epub.findXmlChildByName("body", taggedHtml["html"]);
128
- taggedBody["body"] = [];
129
131
  timing.time("mark up", () => {
130
- markupBySegmentation(
131
- chapterId,
132
- {
133
- currentSentenceIndex: 0,
134
- currentNodeProgress: 0,
135
- currentSentenceProgress: 0
136
- },
137
- segmentation,
138
- body,
139
- /* @__PURE__ */ new Set(),
140
- [],
141
- import_epub.Epub.getXmlChildren(taggedBody)
142
- );
143
- });
144
- return { markedUp: chapterXml, timing };
145
- }
146
- function markupBySegmentation(chapterId, state, segmentation, currentNode, taggedSentences, marks, taggedXml) {
147
- if (import_epub.Epub.isXmlTextNode(currentNode)) {
148
- const remainingSentence = segmentation.sentences[state.currentSentenceIndex].text.slice(state.currentSentenceProgress);
149
- const text = currentNode["#text"];
150
- const remainingNodeText = text.slice(state.currentNodeProgress);
151
- const index = remainingNodeText.indexOf(remainingSentence[0]);
152
- if (index === -1) {
153
- appendTextNode(
154
- chapterId,
155
- taggedXml,
156
- remainingNodeText,
157
- marks,
158
- taggedSentences
159
- );
160
- return {
161
- ...state,
162
- currentNodeProgress: -1
163
- };
164
- }
165
- if (remainingNodeText.slice(index).length < remainingSentence.length) {
166
- appendTextNode(
167
- chapterId,
168
- taggedXml,
169
- remainingNodeText.slice(0, index),
170
- marks,
171
- taggedSentences
172
- );
173
- appendTextNode(
174
- chapterId,
175
- taggedXml,
176
- remainingNodeText.slice(index),
177
- marks,
178
- taggedSentences,
179
- state.currentSentenceIndex
180
- );
181
- return {
182
- ...state,
183
- currentSentenceProgress: state.currentSentenceProgress + remainingNodeText.length - index,
184
- currentNodeProgress: -1
185
- };
186
- }
187
- appendTextNode(
188
- chapterId,
189
- taggedXml,
190
- remainingNodeText.slice(0, index),
191
- marks,
192
- taggedSentences
193
- );
194
- appendTextNode(
195
- chapterId,
196
- taggedXml,
197
- remainingSentence,
198
- marks,
199
- taggedSentences,
200
- state.currentSentenceIndex
201
- );
202
- if (state.currentSentenceIndex + 1 === segmentation.sentences.length) {
203
- appendTextNode(
204
- chapterId,
205
- taggedXml,
206
- remainingNodeText.slice(index + remainingSentence.length),
207
- marks,
208
- taggedSentences
209
- );
210
- }
211
- const mapping = mapWhitespace(remainingNodeText);
212
- const mapped = mapThrough(
213
- remainingSentence.length,
214
- mapping.filter(([start]) => start >= index)
215
- );
216
- return {
217
- currentSentenceIndex: state.currentSentenceIndex + 1,
218
- currentSentenceProgress: 0,
219
- currentNodeProgress: state.currentNodeProgress + mapped + index
220
- };
221
- }
222
- let nextState = {
223
- ...state
224
- };
225
- const children = import_epub.Epub.getXmlChildren(currentNode);
226
- for (const child of children) {
227
- if (nextState.currentSentenceIndex > segmentation.sentences.length + 1) {
228
- taggedXml.push(child);
229
- continue;
230
- }
231
- nextState.currentNodeProgress = 0;
232
- let nextTaggedXml = taggedXml;
233
- const nextMarks = [...marks];
234
- if (!import_epub.Epub.isXmlTextNode(child)) {
235
- const childTagName = import_epub.Epub.getXmlElementName(child);
236
- const isTextContent = import_semantics.BLOCKS.includes(childTagName.toLowerCase());
237
- if (import_epub.Epub.getXmlChildren(child).length === 0) {
238
- appendLeafNode(
239
- chapterId,
240
- taggedXml,
241
- child,
242
- nextMarks,
243
- taggedSentences,
244
- isTextContent || nextState.currentSentenceProgress === 0 ? void 0 : nextState.currentSentenceIndex
132
+ let root = (0, import_parseDom.parseDom)(import_epub.Epub.getXmlChildren(body));
133
+ let pos = 0;
134
+ let i = 0;
135
+ for (const sentence of segmentation) {
136
+ if (sentence.text.match(/\S/)) {
137
+ root = (0, import_transform.addMark)(
138
+ root,
139
+ mapping.invert().map(pos),
140
+ mapping.invert().map(pos + sentence.text.replace(/\n$/, "").length, -1),
141
+ new import_model.Mark("span", { id: `${chapterId}-s${i}` })
245
142
  );
246
- continue;
247
- }
248
- if (isTextContent) {
249
- const block = {
250
- [childTagName]: [],
251
- ...child[":@"] && { ":@": child[":@"] }
252
- };
253
- nextTaggedXml.push(block);
254
- nextTaggedXml = import_epub.Epub.getXmlChildren(block);
255
- } else {
256
- nextMarks.push({
257
- elementName: childTagName,
258
- attributes: child[":@"]
259
- });
143
+ i++;
260
144
  }
145
+ pos += sentence.text.replace(/\n$/, "").length;
261
146
  }
262
- while (nextState.currentSentenceIndex < segmentation.sentences.length && nextState.currentNodeProgress !== -1) {
263
- nextState = markupBySegmentation(
264
- chapterId,
265
- nextState,
266
- segmentation,
267
- child,
268
- taggedSentences,
269
- nextMarks,
270
- nextTaggedXml
271
- );
272
- }
273
- }
274
- nextState.currentNodeProgress = -1;
275
- return nextState;
276
- }
277
- function mapWhitespace(text) {
278
- const re = /(\s\s+)/g;
279
- const mapping = [];
280
- let match = null;
281
- while ((match = re.exec(text)) !== null) {
282
- mapping.push([match.index, match[0].length, 1]);
283
- }
284
- return mapping;
285
- }
286
- function mapThrough(position, mapping) {
287
- let result = position;
288
- let index = 0;
289
- while (index < mapping.length && mapping[index][0] < result) {
290
- const map = mapping[index];
291
- result += map[1] - map[2];
292
- index++;
293
- }
294
- return result;
295
- }
296
- function appendTextNode(chapterId, xml, text, marks, taggedSentences, sentenceId) {
297
- if (text.length === 0) return;
298
- const textNode = { "#text": text };
299
- appendLeafNode(chapterId, xml, textNode, marks, taggedSentences, sentenceId);
300
- }
301
- function appendLeafNode(chapterId, xml, node, marks, taggedSentences, sentenceId) {
302
- var _a, _b;
303
- const tagId = `${chapterId}-s${sentenceId}`;
304
- const markedNode = [...marks].reverse().reduce(
305
- (acc, mark) => ({
306
- [mark.elementName]: [acc],
307
- ":@": mark.attributes
308
- }),
309
- node
310
- );
311
- const lastNode = xml[xml.length - 1];
312
- if (lastNode && !import_epub.Epub.isXmlTextNode(lastNode) && ((_a = lastNode[":@"]) == null ? void 0 : _a["@_id"]) && lastNode[":@"]["@_id"] === tagId) {
313
- const tagName = import_epub.Epub.getXmlElementName(lastNode);
314
- (_b = lastNode[tagName]) == null ? void 0 : _b.push(markedNode);
315
- return;
316
- }
317
- if (sentenceId === void 0 || taggedSentences.has(sentenceId)) {
318
- xml.push(markedNode);
319
- return;
320
- }
321
- const taggedNode = {
322
- span: [markedNode],
323
- ":@": { "@_id": tagId }
324
- };
325
- taggedSentences.add(sentenceId);
326
- xml.push(taggedNode);
147
+ taggedBody["body"] = (0, import_serializeDom.serializeDom)(root);
148
+ });
149
+ return { markedUp: chapterXml, timing };
327
150
  }
328
151
  function clearBodyElement(xml) {
329
152
  const html = import_epub.Epub.findXmlChildByName("html", xml);
@@ -338,7 +161,6 @@ function clearBodyElement(xml) {
338
161
  }
339
162
  // Annotate the CommonJS export names for ESM import in node:
340
163
  0 && (module.exports = {
341
- appendTextNode,
342
164
  markup,
343
165
  markupChapter
344
166
  });
@@ -1,8 +1,9 @@
1
1
  import * as _storyteller_platform_ghost_story from '@storyteller-platform/ghost-story';
2
2
  import { TimingAggregator } from '@storyteller-platform/ghost-story';
3
- import { SegmentationResult } from '@echogarden/text-segmentation';
3
+ import { Sentence } from '@echogarden/text-segmentation';
4
4
  import { Logger } from 'pino';
5
- import { ParsedXml, ElementName } from '@storyteller-platform/epub';
5
+ import { ParsedXml } from '@storyteller-platform/epub';
6
+ import { Mapping } from './map.cjs';
6
7
 
7
8
  interface MarkupOptions {
8
9
  granularity?: "word" | "sentence";
@@ -11,14 +12,9 @@ interface MarkupOptions {
11
12
  logger?: Logger;
12
13
  }
13
14
  declare function markup(input: string, output: string, options: MarkupOptions): Promise<TimingAggregator>;
14
- declare function markupChapter(chapterId: string, chapterXml: ParsedXml, segmentation: SegmentationResult): {
15
+ declare function markupChapter(chapterId: string, chapterXml: ParsedXml, segmentation: Sentence[], mapping: Mapping): {
15
16
  markedUp: ParsedXml;
16
17
  timing: _storyteller_platform_ghost_story.Timing;
17
18
  };
18
- type Mark = {
19
- elementName: ElementName;
20
- attributes: Record<string, string> | undefined;
21
- };
22
- declare function appendTextNode(chapterId: string, xml: ParsedXml, text: string, marks: Mark[], taggedSentences: Set<number>, sentenceId?: number): void;
23
19
 
24
- export { type MarkupOptions, appendTextNode, markup, markupChapter };
20
+ export { type MarkupOptions, markup, markupChapter };
@@ -1,8 +1,9 @@
1
1
  import * as _storyteller_platform_ghost_story from '@storyteller-platform/ghost-story';
2
2
  import { TimingAggregator } from '@storyteller-platform/ghost-story';
3
- import { SegmentationResult } from '@echogarden/text-segmentation';
3
+ import { Sentence } from '@echogarden/text-segmentation';
4
4
  import { Logger } from 'pino';
5
- import { ParsedXml, ElementName } from '@storyteller-platform/epub';
5
+ import { ParsedXml } from '@storyteller-platform/epub';
6
+ import { Mapping } from './map.js';
6
7
 
7
8
  interface MarkupOptions {
8
9
  granularity?: "word" | "sentence";
@@ -11,14 +12,9 @@ interface MarkupOptions {
11
12
  logger?: Logger;
12
13
  }
13
14
  declare function markup(input: string, output: string, options: MarkupOptions): Promise<TimingAggregator>;
14
- declare function markupChapter(chapterId: string, chapterXml: ParsedXml, segmentation: SegmentationResult): {
15
+ declare function markupChapter(chapterId: string, chapterXml: ParsedXml, segmentation: Sentence[], mapping: Mapping): {
15
16
  markedUp: ParsedXml;
16
17
  timing: _storyteller_platform_ghost_story.Timing;
17
18
  };
18
- type Mark = {
19
- elementName: ElementName;
20
- attributes: Record<string, string> | undefined;
21
- };
22
- declare function appendTextNode(chapterId: string, xml: ParsedXml, text: string, marks: Mark[], taggedSentences: Set<number>, sentenceId?: number): void;
23
19
 
24
- export { type MarkupOptions, appendTextNode, markup, markupChapter };
20
+ export { type MarkupOptions, markup, markupChapter };
@@ -4,15 +4,16 @@ import {
4
4
  } from "../chunk-BIEQXUOY.js";
5
5
  import { copyFile } from "node:fs/promises";
6
6
  import { basename } from "node:path/posix";
7
- import {
8
- Epub
9
- } from "@storyteller-platform/epub";
7
+ import { Epub } from "@storyteller-platform/epub";
10
8
  import {
11
9
  createAggregator,
12
10
  createTiming
13
11
  } from "@storyteller-platform/ghost-story";
12
+ import { Mark } from "./model.js";
13
+ import { parseDom } from "./parseDom.js";
14
14
  import { getXhtmlSegmentation } from "./segmentation.js";
15
- import { BLOCKS } from "./semantics.js";
15
+ import { serializeDom } from "./serializeDom.js";
16
+ import { addMark } from "./transform.js";
16
17
  async function markup(input, output, options) {
17
18
  var _a, _b, _c, _d;
18
19
  var _stack = [];
@@ -35,14 +36,15 @@ async function markup(input, output, options) {
35
36
  continue;
36
37
  }
37
38
  const chapterXml = await epub.readXhtmlItemContents(chapterId);
38
- const segmentation = await getXhtmlSegmentation(
39
+ const { result: segmentation, mapping } = await getXhtmlSegmentation(
39
40
  Epub.getXhtmlBody(chapterXml),
40
41
  { primaryLocale }
41
42
  );
42
43
  const { markedUp, timing: chapterTiming } = markupChapter(
43
44
  chapterId,
44
45
  chapterXml,
45
- segmentation
46
+ segmentation,
47
+ mapping
46
48
  );
47
49
  timing.add(chapterTiming.summary());
48
50
  await epub.writeXhtmlItemContents(chapterId, markedUp);
@@ -55,7 +57,7 @@ async function markup(input, output, options) {
55
57
  __callDispose(_stack, _error, _hasError);
56
58
  }
57
59
  }
58
- function markupChapter(chapterId, chapterXml, segmentation) {
60
+ function markupChapter(chapterId, chapterXml, segmentation, mapping) {
59
61
  const timing = createTiming();
60
62
  const html = Epub.findXmlChildByName("html", chapterXml);
61
63
  if (!html) throw new Error("Invalid XHTML document: no html element");
@@ -64,205 +66,25 @@ function markupChapter(chapterId, chapterXml, segmentation) {
64
66
  clearBodyElement(chapterXml);
65
67
  const taggedHtml = Epub.findXmlChildByName("html", chapterXml);
66
68
  const taggedBody = Epub.findXmlChildByName("body", taggedHtml["html"]);
67
- taggedBody["body"] = [];
68
69
  timing.time("mark up", () => {
69
- markupBySegmentation(
70
- chapterId,
71
- {
72
- currentSentenceIndex: 0,
73
- currentNodeProgress: 0,
74
- currentSentenceProgress: 0
75
- },
76
- segmentation,
77
- body,
78
- /* @__PURE__ */ new Set(),
79
- [],
80
- Epub.getXmlChildren(taggedBody)
81
- );
82
- });
83
- return { markedUp: chapterXml, timing };
84
- }
85
- function markupBySegmentation(chapterId, state, segmentation, currentNode, taggedSentences, marks, taggedXml) {
86
- if (Epub.isXmlTextNode(currentNode)) {
87
- const remainingSentence = segmentation.sentences[state.currentSentenceIndex].text.slice(state.currentSentenceProgress);
88
- const text = currentNode["#text"];
89
- const remainingNodeText = text.slice(state.currentNodeProgress);
90
- const index = remainingNodeText.indexOf(remainingSentence[0]);
91
- if (index === -1) {
92
- appendTextNode(
93
- chapterId,
94
- taggedXml,
95
- remainingNodeText,
96
- marks,
97
- taggedSentences
98
- );
99
- return {
100
- ...state,
101
- currentNodeProgress: -1
102
- };
103
- }
104
- if (remainingNodeText.slice(index).length < remainingSentence.length) {
105
- appendTextNode(
106
- chapterId,
107
- taggedXml,
108
- remainingNodeText.slice(0, index),
109
- marks,
110
- taggedSentences
111
- );
112
- appendTextNode(
113
- chapterId,
114
- taggedXml,
115
- remainingNodeText.slice(index),
116
- marks,
117
- taggedSentences,
118
- state.currentSentenceIndex
119
- );
120
- return {
121
- ...state,
122
- currentSentenceProgress: state.currentSentenceProgress + remainingNodeText.length - index,
123
- currentNodeProgress: -1
124
- };
125
- }
126
- appendTextNode(
127
- chapterId,
128
- taggedXml,
129
- remainingNodeText.slice(0, index),
130
- marks,
131
- taggedSentences
132
- );
133
- appendTextNode(
134
- chapterId,
135
- taggedXml,
136
- remainingSentence,
137
- marks,
138
- taggedSentences,
139
- state.currentSentenceIndex
140
- );
141
- if (state.currentSentenceIndex + 1 === segmentation.sentences.length) {
142
- appendTextNode(
143
- chapterId,
144
- taggedXml,
145
- remainingNodeText.slice(index + remainingSentence.length),
146
- marks,
147
- taggedSentences
148
- );
149
- }
150
- const mapping = mapWhitespace(remainingNodeText);
151
- const mapped = mapThrough(
152
- remainingSentence.length,
153
- mapping.filter(([start]) => start >= index)
154
- );
155
- return {
156
- currentSentenceIndex: state.currentSentenceIndex + 1,
157
- currentSentenceProgress: 0,
158
- currentNodeProgress: state.currentNodeProgress + mapped + index
159
- };
160
- }
161
- let nextState = {
162
- ...state
163
- };
164
- const children = Epub.getXmlChildren(currentNode);
165
- for (const child of children) {
166
- if (nextState.currentSentenceIndex > segmentation.sentences.length + 1) {
167
- taggedXml.push(child);
168
- continue;
169
- }
170
- nextState.currentNodeProgress = 0;
171
- let nextTaggedXml = taggedXml;
172
- const nextMarks = [...marks];
173
- if (!Epub.isXmlTextNode(child)) {
174
- const childTagName = Epub.getXmlElementName(child);
175
- const isTextContent = BLOCKS.includes(childTagName.toLowerCase());
176
- if (Epub.getXmlChildren(child).length === 0) {
177
- appendLeafNode(
178
- chapterId,
179
- taggedXml,
180
- child,
181
- nextMarks,
182
- taggedSentences,
183
- isTextContent || nextState.currentSentenceProgress === 0 ? void 0 : nextState.currentSentenceIndex
70
+ let root = parseDom(Epub.getXmlChildren(body));
71
+ let pos = 0;
72
+ let i = 0;
73
+ for (const sentence of segmentation) {
74
+ if (sentence.text.match(/\S/)) {
75
+ root = addMark(
76
+ root,
77
+ mapping.invert().map(pos),
78
+ mapping.invert().map(pos + sentence.text.replace(/\n$/, "").length, -1),
79
+ new Mark("span", { id: `${chapterId}-s${i}` })
184
80
  );
185
- continue;
186
- }
187
- if (isTextContent) {
188
- const block = {
189
- [childTagName]: [],
190
- ...child[":@"] && { ":@": child[":@"] }
191
- };
192
- nextTaggedXml.push(block);
193
- nextTaggedXml = Epub.getXmlChildren(block);
194
- } else {
195
- nextMarks.push({
196
- elementName: childTagName,
197
- attributes: child[":@"]
198
- });
81
+ i++;
199
82
  }
83
+ pos += sentence.text.replace(/\n$/, "").length;
200
84
  }
201
- while (nextState.currentSentenceIndex < segmentation.sentences.length && nextState.currentNodeProgress !== -1) {
202
- nextState = markupBySegmentation(
203
- chapterId,
204
- nextState,
205
- segmentation,
206
- child,
207
- taggedSentences,
208
- nextMarks,
209
- nextTaggedXml
210
- );
211
- }
212
- }
213
- nextState.currentNodeProgress = -1;
214
- return nextState;
215
- }
216
- function mapWhitespace(text) {
217
- const re = /(\s\s+)/g;
218
- const mapping = [];
219
- let match = null;
220
- while ((match = re.exec(text)) !== null) {
221
- mapping.push([match.index, match[0].length, 1]);
222
- }
223
- return mapping;
224
- }
225
- function mapThrough(position, mapping) {
226
- let result = position;
227
- let index = 0;
228
- while (index < mapping.length && mapping[index][0] < result) {
229
- const map = mapping[index];
230
- result += map[1] - map[2];
231
- index++;
232
- }
233
- return result;
234
- }
235
- function appendTextNode(chapterId, xml, text, marks, taggedSentences, sentenceId) {
236
- if (text.length === 0) return;
237
- const textNode = { "#text": text };
238
- appendLeafNode(chapterId, xml, textNode, marks, taggedSentences, sentenceId);
239
- }
240
- function appendLeafNode(chapterId, xml, node, marks, taggedSentences, sentenceId) {
241
- var _a, _b;
242
- const tagId = `${chapterId}-s${sentenceId}`;
243
- const markedNode = [...marks].reverse().reduce(
244
- (acc, mark) => ({
245
- [mark.elementName]: [acc],
246
- ":@": mark.attributes
247
- }),
248
- node
249
- );
250
- const lastNode = xml[xml.length - 1];
251
- if (lastNode && !Epub.isXmlTextNode(lastNode) && ((_a = lastNode[":@"]) == null ? void 0 : _a["@_id"]) && lastNode[":@"]["@_id"] === tagId) {
252
- const tagName = Epub.getXmlElementName(lastNode);
253
- (_b = lastNode[tagName]) == null ? void 0 : _b.push(markedNode);
254
- return;
255
- }
256
- if (sentenceId === void 0 || taggedSentences.has(sentenceId)) {
257
- xml.push(markedNode);
258
- return;
259
- }
260
- const taggedNode = {
261
- span: [markedNode],
262
- ":@": { "@_id": tagId }
263
- };
264
- taggedSentences.add(sentenceId);
265
- xml.push(taggedNode);
85
+ taggedBody["body"] = serializeDom(root);
86
+ });
87
+ return { markedUp: chapterXml, timing };
266
88
  }
267
89
  function clearBodyElement(xml) {
268
90
  const html = Epub.findXmlChildByName("html", xml);
@@ -276,7 +98,6 @@ function clearBodyElement(xml) {
276
98
  });
277
99
  }
278
100
  export {
279
- appendTextNode,
280
101
  markup,
281
102
  markupChapter
282
103
  };