@storyteller-platform/align 0.1.4 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -89,10 +89,12 @@ var import_ffmpeg = require("../common/ffmpeg.cjs");
89
89
  var import_segmentation = require("../markup/segmentation.cjs");
90
90
  var import_fuzzy = require("./fuzzy.cjs");
91
91
  var import_getSentenceRanges = require("./getSentenceRanges.cjs");
92
+ var import_slugify = require("./slugify.cjs");
92
93
  const OFFSET_SEARCH_WINDOW_SIZE = 5e3;
93
94
  async function align(input, output, transcriptionsDir, audiobookDir, options) {
94
95
  var _stack = [];
95
96
  try {
97
+ await (0, import_promises.mkdir)((0, import_posix.dirname)(output), { recursive: true });
96
98
  await (0, import_promises.copyFile)(input, output);
97
99
  const audiobookFiles = await (0, import_promises.readdir)(audiobookDir).then(
98
100
  (filenames) => filenames.filter((f) => (0, import_audiobook.isAudioFile)(f)).map((f) => (0, import_node_path.join)(audiobookDir, f))
@@ -118,6 +120,7 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
118
120
  options.logger
119
121
  );
120
122
  const timing = await aligner.alignBook(options.onProgress);
123
+ await epub.saveAndClose();
121
124
  if (options.reportsPath) {
122
125
  await (0, import_promises.mkdir)((0, import_node_path.dirname)(options.reportsPath), { recursive: true });
123
126
  await (0, import_promises.writeFile)(
@@ -152,7 +155,7 @@ class Aligner {
152
155
  report = {
153
156
  chapters: []
154
157
  };
155
- findBestOffset(epubSentences, transcriptionText, lastMatchOffset) {
158
+ findBestOffset(epubSentences, transcriptionText, lastMatchOffset, mapping) {
156
159
  let i = 0;
157
160
  while (i < transcriptionText.length) {
158
161
  let startSentence = 0;
@@ -164,11 +167,13 @@ class Aligner {
164
167
  let startSeen = null;
165
168
  let endSeen = null;
166
169
  for (const aligned of this.alignedChapters) {
167
- if (startSeen !== null && endSeen === aligned.startOffset) {
168
- endSeen = aligned.endOffset;
170
+ const alignedStart = mapping.map(aligned.startOffset, -1);
171
+ const alignedEnd = mapping.map(aligned.endOffset, -1);
172
+ if (startSeen !== null && endSeen === alignedStart) {
173
+ endSeen = alignedEnd;
169
174
  } else {
170
- startSeen = aligned.startOffset;
171
- endSeen = aligned.endOffset;
175
+ startSeen = alignedStart;
176
+ endSeen = alignedEnd;
172
177
  }
173
178
  if (startIndex >= startSeen && startIndex < endSeen) {
174
179
  startIndex = endSeen;
@@ -183,7 +188,7 @@ class Aligner {
183
188
  endIndex
184
189
  );
185
190
  while (startSentence < epubSentences.length) {
186
- const queryString = epubSentences.slice(startSentence, startSentence + 6).join(" ");
191
+ const queryString = epubSentences.slice(startSentence, startSentence + 6).join("-");
187
192
  const firstMatch = (0, import_fuzzy.findNearestMatch)(
188
193
  queryString.toLowerCase(),
189
194
  transcriptionTextSlice.toLowerCase(),
@@ -309,7 +314,7 @@ class Aligner {
309
314
  }, [])
310
315
  });
311
316
  }
312
- async alignChapter(startSentence, chapterId, transcriptionOffset, lastSentenceRange) {
317
+ async alignChapter(startSentence, chapterId, transcriptionOffset, locale, lastSentenceRange) {
313
318
  const timing = (0, import_ghost_story.createTiming)();
314
319
  timing.start("read contents");
315
320
  const manifest = await this.epub.getManifest();
@@ -329,6 +334,7 @@ class Aligner {
329
334
  this.transcription,
330
335
  chapterSentences,
331
336
  transcriptionOffset,
337
+ locale,
332
338
  lastSentenceRange
333
339
  );
334
340
  timing.end("align sentences");
@@ -369,53 +375,67 @@ class Aligner {
369
375
  };
370
376
  }
371
377
  async alignBook(onProgress) {
372
- var _a, _b, _c, _d, _e, _f;
373
- this.timing.setMetadata(
374
- "language",
375
- ((_a = this.languageOverride ?? await this.epub.getLanguage()) == null ? void 0 : _a.language) ?? "unknown"
376
- );
378
+ var _a, _b, _c, _d, _e, _f, _g;
379
+ const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
380
+ this.timing.setMetadata("language", locale.toString());
377
381
  this.timing.setMetadata("granularity", this.granularity);
378
382
  const spine = await this.epub.getSpineItems();
379
- const transcriptionText = this.transcription.transcript;
383
+ const manifest = await this.epub.getManifest();
384
+ const { result: transcriptionText, mapping } = await (0, import_slugify.slugify)(
385
+ this.transcription.transcript,
386
+ locale
387
+ );
380
388
  let lastTranscriptionOffset = 0;
381
389
  let lastSentenceRange = null;
382
390
  for (let index = 0; index < spine.length; index++) {
383
391
  onProgress == null ? void 0 : onProgress(index / spine.length);
384
392
  const spineItem = spine[index];
385
- (_b = this.logger) == null ? void 0 : _b.info(
393
+ (_a = this.logger) == null ? void 0 : _a.info(
386
394
  `Aligning epub item #${index} : ${(0, import_posix.basename)(spineItem.href)}`
387
395
  );
388
396
  const chapterId = spineItem.id;
397
+ if ((_c = (_b = manifest[chapterId]) == null ? void 0 : _b.properties) == null ? void 0 : _c.includes("nav")) {
398
+ continue;
399
+ }
389
400
  const chapterSentences = await this.getChapterSentences(chapterId);
401
+ const slugifiedChapterSentences = [];
402
+ for (const chapterSentence of chapterSentences) {
403
+ slugifiedChapterSentences.push(
404
+ (await (0, import_slugify.slugify)(chapterSentence, locale)).result
405
+ );
406
+ }
390
407
  if (chapterSentences.length === 0) {
391
- (_c = this.logger) == null ? void 0 : _c.info(`Chapter #${index} has no text; skipping`);
408
+ (_d = this.logger) == null ? void 0 : _d.info(`Chapter #${index} has no text; skipping`);
392
409
  continue;
393
410
  }
394
411
  if (chapterSentences.length < 2 && // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
395
412
  chapterSentences[0].split(" ").length < 4) {
396
- (_d = this.logger) == null ? void 0 : _d.info(
413
+ (_e = this.logger) == null ? void 0 : _e.info(
397
414
  `Chapter #${index} is fewer than four words; skipping`
398
415
  );
399
416
  continue;
400
417
  }
401
- const { startSentence, transcriptionOffset } = this.findBestOffset(
402
- chapterSentences,
418
+ const { startSentence, transcriptionOffset: slugifiedOffset } = this.findBestOffset(
419
+ slugifiedChapterSentences,
403
420
  transcriptionText,
404
- lastTranscriptionOffset
421
+ mapping.map(lastTranscriptionOffset, -1),
422
+ mapping
405
423
  );
424
+ const transcriptionOffset = slugifiedOffset && mapping.invert().map(slugifiedOffset, -1);
406
425
  if (transcriptionOffset === null) {
407
- (_e = this.logger) == null ? void 0 : _e.info(
426
+ (_f = this.logger) == null ? void 0 : _f.info(
408
427
  `Couldn't find matching transcription for chapter #${index}`
409
428
  );
410
429
  continue;
411
430
  }
412
- (_f = this.logger) == null ? void 0 : _f.info(
431
+ (_g = this.logger) == null ? void 0 : _g.info(
413
432
  `Chapter #${index} best matches transcription at offset ${transcriptionOffset}, starting at sentence ${startSentence}`
414
433
  );
415
434
  const result = await this.alignChapter(
416
435
  startSentence,
417
436
  chapterId,
418
437
  transcriptionOffset,
438
+ locale,
419
439
  lastSentenceRange
420
440
  );
421
441
  lastSentenceRange = result.lastSentenceRange;
@@ -23,10 +23,12 @@ import {
23
23
  getSentenceRanges,
24
24
  interpolateSentenceRanges
25
25
  } from "./getSentenceRanges.js";
26
+ import { slugify } from "./slugify.js";
26
27
  const OFFSET_SEARCH_WINDOW_SIZE = 5e3;
27
28
  async function align(input, output, transcriptionsDir, audiobookDir, options) {
28
29
  var _stack = [];
29
30
  try {
31
+ await mkdir(dirname(output), { recursive: true });
30
32
  await copyFile(input, output);
31
33
  const audiobookFiles = await readdir(audiobookDir).then(
32
34
  (filenames) => filenames.filter((f) => isAudioFile(f)).map((f) => autoJoin(audiobookDir, f))
@@ -52,6 +54,7 @@ async function align(input, output, transcriptionsDir, audiobookDir, options) {
52
54
  options.logger
53
55
  );
54
56
  const timing = await aligner.alignBook(options.onProgress);
57
+ await epub.saveAndClose();
55
58
  if (options.reportsPath) {
56
59
  await mkdir(autoDirname(options.reportsPath), { recursive: true });
57
60
  await writeFile(
@@ -86,7 +89,7 @@ class Aligner {
86
89
  report = {
87
90
  chapters: []
88
91
  };
89
- findBestOffset(epubSentences, transcriptionText, lastMatchOffset) {
92
+ findBestOffset(epubSentences, transcriptionText, lastMatchOffset, mapping) {
90
93
  let i = 0;
91
94
  while (i < transcriptionText.length) {
92
95
  let startSentence = 0;
@@ -98,11 +101,13 @@ class Aligner {
98
101
  let startSeen = null;
99
102
  let endSeen = null;
100
103
  for (const aligned of this.alignedChapters) {
101
- if (startSeen !== null && endSeen === aligned.startOffset) {
102
- endSeen = aligned.endOffset;
104
+ const alignedStart = mapping.map(aligned.startOffset, -1);
105
+ const alignedEnd = mapping.map(aligned.endOffset, -1);
106
+ if (startSeen !== null && endSeen === alignedStart) {
107
+ endSeen = alignedEnd;
103
108
  } else {
104
- startSeen = aligned.startOffset;
105
- endSeen = aligned.endOffset;
109
+ startSeen = alignedStart;
110
+ endSeen = alignedEnd;
106
111
  }
107
112
  if (startIndex >= startSeen && startIndex < endSeen) {
108
113
  startIndex = endSeen;
@@ -117,7 +122,7 @@ class Aligner {
117
122
  endIndex
118
123
  );
119
124
  while (startSentence < epubSentences.length) {
120
- const queryString = epubSentences.slice(startSentence, startSentence + 6).join(" ");
125
+ const queryString = epubSentences.slice(startSentence, startSentence + 6).join("-");
121
126
  const firstMatch = findNearestMatch(
122
127
  queryString.toLowerCase(),
123
128
  transcriptionTextSlice.toLowerCase(),
@@ -243,7 +248,7 @@ class Aligner {
243
248
  }, [])
244
249
  });
245
250
  }
246
- async alignChapter(startSentence, chapterId, transcriptionOffset, lastSentenceRange) {
251
+ async alignChapter(startSentence, chapterId, transcriptionOffset, locale, lastSentenceRange) {
247
252
  const timing = createTiming();
248
253
  timing.start("read contents");
249
254
  const manifest = await this.epub.getManifest();
@@ -263,6 +268,7 @@ class Aligner {
263
268
  this.transcription,
264
269
  chapterSentences,
265
270
  transcriptionOffset,
271
+ locale,
266
272
  lastSentenceRange
267
273
  );
268
274
  timing.end("align sentences");
@@ -303,53 +309,67 @@ class Aligner {
303
309
  };
304
310
  }
305
311
  async alignBook(onProgress) {
306
- var _a, _b, _c, _d, _e, _f;
307
- this.timing.setMetadata(
308
- "language",
309
- ((_a = this.languageOverride ?? await this.epub.getLanguage()) == null ? void 0 : _a.language) ?? "unknown"
310
- );
312
+ var _a, _b, _c, _d, _e, _f, _g;
313
+ const locale = this.languageOverride ?? await this.epub.getLanguage() ?? new Intl.Locale("en-US");
314
+ this.timing.setMetadata("language", locale.toString());
311
315
  this.timing.setMetadata("granularity", this.granularity);
312
316
  const spine = await this.epub.getSpineItems();
313
- const transcriptionText = this.transcription.transcript;
317
+ const manifest = await this.epub.getManifest();
318
+ const { result: transcriptionText, mapping } = await slugify(
319
+ this.transcription.transcript,
320
+ locale
321
+ );
314
322
  let lastTranscriptionOffset = 0;
315
323
  let lastSentenceRange = null;
316
324
  for (let index = 0; index < spine.length; index++) {
317
325
  onProgress == null ? void 0 : onProgress(index / spine.length);
318
326
  const spineItem = spine[index];
319
- (_b = this.logger) == null ? void 0 : _b.info(
327
+ (_a = this.logger) == null ? void 0 : _a.info(
320
328
  `Aligning epub item #${index} : ${basename(spineItem.href)}`
321
329
  );
322
330
  const chapterId = spineItem.id;
331
+ if ((_c = (_b = manifest[chapterId]) == null ? void 0 : _b.properties) == null ? void 0 : _c.includes("nav")) {
332
+ continue;
333
+ }
323
334
  const chapterSentences = await this.getChapterSentences(chapterId);
335
+ const slugifiedChapterSentences = [];
336
+ for (const chapterSentence of chapterSentences) {
337
+ slugifiedChapterSentences.push(
338
+ (await slugify(chapterSentence, locale)).result
339
+ );
340
+ }
324
341
  if (chapterSentences.length === 0) {
325
- (_c = this.logger) == null ? void 0 : _c.info(`Chapter #${index} has no text; skipping`);
342
+ (_d = this.logger) == null ? void 0 : _d.info(`Chapter #${index} has no text; skipping`);
326
343
  continue;
327
344
  }
328
345
  if (chapterSentences.length < 2 && // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
329
346
  chapterSentences[0].split(" ").length < 4) {
330
- (_d = this.logger) == null ? void 0 : _d.info(
347
+ (_e = this.logger) == null ? void 0 : _e.info(
331
348
  `Chapter #${index} is fewer than four words; skipping`
332
349
  );
333
350
  continue;
334
351
  }
335
- const { startSentence, transcriptionOffset } = this.findBestOffset(
336
- chapterSentences,
352
+ const { startSentence, transcriptionOffset: slugifiedOffset } = this.findBestOffset(
353
+ slugifiedChapterSentences,
337
354
  transcriptionText,
338
- lastTranscriptionOffset
355
+ mapping.map(lastTranscriptionOffset, -1),
356
+ mapping
339
357
  );
358
+ const transcriptionOffset = slugifiedOffset && mapping.invert().map(slugifiedOffset, -1);
340
359
  if (transcriptionOffset === null) {
341
- (_e = this.logger) == null ? void 0 : _e.info(
360
+ (_f = this.logger) == null ? void 0 : _f.info(
342
361
  `Couldn't find matching transcription for chapter #${index}`
343
362
  );
344
363
  continue;
345
364
  }
346
- (_f = this.logger) == null ? void 0 : _f.info(
365
+ (_g = this.logger) == null ? void 0 : _g.info(
347
366
  `Chapter #${index} best matches transcription at offset ${transcriptionOffset}, starting at sentence ${startSentence}`
348
367
  );
349
368
  const result = await this.alignChapter(
350
369
  startSentence,
351
370
  chapterId,
352
371
  transcriptionOffset,
372
+ locale,
353
373
  lastSentenceRange
354
374
  );
355
375
  lastSentenceRange = result.lastSentenceRange;
@@ -108,7 +108,7 @@ function expand(subsequence, sequence, maxDist) {
108
108
  function* levenshteinNgram(subsequence, sequence, maxDist) {
109
109
  const subsequenceLength = subsequence.length;
110
110
  const sequenceLength = sequence.length;
111
- const ngramLength = Math.round(subsequenceLength / (maxDist + 1));
111
+ const ngramLength = Math.floor(subsequenceLength / (maxDist + 1));
112
112
  if (ngramLength === 0) {
113
113
  throw new Error("The subsequence length must be greater than maxDist");
114
114
  }
@@ -86,7 +86,7 @@ function expand(subsequence, sequence, maxDist) {
86
86
  function* levenshteinNgram(subsequence, sequence, maxDist) {
87
87
  const subsequenceLength = subsequence.length;
88
88
  const sequenceLength = sequence.length;
89
- const ngramLength = Math.round(subsequenceLength / (maxDist + 1));
89
+ const ngramLength = Math.floor(subsequenceLength / (maxDist + 1));
90
90
  if (ngramLength === 0) {
91
91
  throw new Error("The subsequence length must be greater than maxDist");
92
92
  }
@@ -28,6 +28,7 @@ module.exports = __toCommonJS(getSentenceRanges_exports);
28
28
  var import_text_segmentation = require("@echogarden/text-segmentation");
29
29
  var import_ffmpeg = require("../common/ffmpeg.cjs");
30
30
  var import_fuzzy = require("./fuzzy.cjs");
31
+ var import_slugify = require("./slugify.cjs");
31
32
  async function getSentencesWithOffsets(text) {
32
33
  const sentences = await (0, import_text_segmentation.segmentText)(text).then(
33
34
  (r) => r.sentences.map((s) => s.text)
@@ -75,7 +76,7 @@ function getWindowIndexFromOffset(window, offset) {
75
76
  function collapseWhitespace(input) {
76
77
  return input.replaceAll(/\s+/g, " ");
77
78
  }
78
- async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, lastSentenceRange) {
79
+ async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, locale, lastSentenceRange) {
79
80
  const sentenceRanges = [];
80
81
  const fullTranscriptionText = transcription.transcript;
81
82
  const transcriptionText = fullTranscriptionText.slice(chapterOffset);
@@ -83,13 +84,15 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
83
84
  transcriptionText
84
85
  ).then((s) => s.map((sentence) => sentence.toLowerCase()));
85
86
  let startSentenceEntry = startSentence;
86
- const sentenceEntries = sentences.map((sentence, index) => [index, sentence]).filter(([index, sentence]) => {
87
- if (sentence.replaceAll(/[.-_()[\],/?!@#$%^^&*`~;:='"<>+ˌˈ]/g, "").length <= 3) {
88
- if (index < startSentence) startSentenceEntry--;
89
- return false;
87
+ const sentenceEntries = [];
88
+ for (let i = 0; i < sentences.length; i++) {
89
+ const sentence = (await (0, import_slugify.slugify)(sentences[i], locale)).result;
90
+ if (sentence.length <= 3) {
91
+ if (i < startSentence) startSentenceEntry--;
92
+ continue;
90
93
  }
91
- return true;
92
- });
94
+ sentenceEntries.push([i, sentence]);
95
+ }
93
96
  let transcriptionWindowIndex = 0;
94
97
  let transcriptionWindowOffset = 0;
95
98
  let lastGoodTranscriptionWindow = 0;
@@ -102,7 +105,11 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
102
105
  transcriptionWindowIndex,
103
106
  transcriptionWindowIndex + 10
104
107
  );
105
- const transcriptionWindow = transcriptionWindowList.join("").slice(transcriptionWindowOffset);
108
+ const { result: transcriptionWindow, mapping } = await (0, import_slugify.slugify)(
109
+ transcriptionWindowList.join("-").slice(transcriptionWindowOffset),
110
+ locale
111
+ );
112
+ const inverted = mapping.invert();
106
113
  const query = collapseWhitespace(sentence.trim()).toLowerCase();
107
114
  const firstMatch = (0, import_fuzzy.findNearestMatch)(
108
115
  query,
@@ -125,8 +132,13 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
125
132
  continue;
126
133
  }
127
134
  const transcriptionOffset = transcriptionSentences.slice(0, transcriptionWindowIndex).join("").length;
135
+ const matchStart = inverted.map(firstMatch.index, 1);
136
+ const matchEnd = inverted.map(
137
+ firstMatch.index + firstMatch.match.length,
138
+ -1
139
+ );
128
140
  const startResult = findStartTimestamp(
129
- firstMatch.index + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
141
+ matchStart + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
130
142
  transcription
131
143
  );
132
144
  if (!startResult) {
@@ -136,7 +148,7 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
136
148
  let start = startResult.start;
137
149
  const audiofile = startResult.audiofile;
138
150
  const end = findEndTimestamp(
139
- firstMatch.index + firstMatch.match.length + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
151
+ matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
140
152
  transcription
141
153
  ) ?? startResult.end;
142
154
  if (sentenceRanges.length > 0) {
@@ -177,10 +189,10 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
177
189
  audiofile
178
190
  });
179
191
  notFound = 0;
180
- lastMatchEnd = firstMatch.index + firstMatch.match.length + transcriptionOffset + transcriptionWindowOffset + chapterOffset;
192
+ lastMatchEnd = matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset;
181
193
  const windowIndexResult = getWindowIndexFromOffset(
182
194
  transcriptionWindowList,
183
- firstMatch.index + firstMatch.match.length + transcriptionWindowOffset
195
+ matchEnd + transcriptionWindowOffset
184
196
  );
185
197
  transcriptionWindowIndex += windowIndexResult.index;
186
198
  transcriptionWindowOffset = windowIndexResult.offset;
@@ -14,7 +14,7 @@ type SentenceRange = {
14
14
  audiofile: string;
15
15
  };
16
16
  declare function findEndTimestamp(matchEndIndex: number, transcription: StorytellerTranscription): number | null;
17
- declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, lastSentenceRange: SentenceRange | null): Promise<{
17
+ declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, locale: Intl.Locale, lastSentenceRange: SentenceRange | null): Promise<{
18
18
  sentenceRanges: SentenceRange[];
19
19
  transcriptionOffset: number;
20
20
  }>;
@@ -14,7 +14,7 @@ type SentenceRange = {
14
14
  audiofile: string;
15
15
  };
16
16
  declare function findEndTimestamp(matchEndIndex: number, transcription: StorytellerTranscription): number | null;
17
- declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, lastSentenceRange: SentenceRange | null): Promise<{
17
+ declare function getSentenceRanges(startSentence: number, transcription: StorytellerTranscription, sentences: string[], chapterOffset: number, locale: Intl.Locale, lastSentenceRange: SentenceRange | null): Promise<{
18
18
  sentenceRanges: SentenceRange[];
19
19
  transcriptionOffset: number;
20
20
  }>;
@@ -2,6 +2,7 @@ import "../chunk-BIEQXUOY.js";
2
2
  import { segmentText } from "@echogarden/text-segmentation";
3
3
  import { getTrackDuration } from "../common/ffmpeg.js";
4
4
  import { findNearestMatch } from "./fuzzy.js";
5
+ import { slugify } from "./slugify.js";
5
6
  async function getSentencesWithOffsets(text) {
6
7
  const sentences = await segmentText(text).then(
7
8
  (r) => r.sentences.map((s) => s.text)
@@ -49,7 +50,7 @@ function getWindowIndexFromOffset(window, offset) {
49
50
  function collapseWhitespace(input) {
50
51
  return input.replaceAll(/\s+/g, " ");
51
52
  }
52
- async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, lastSentenceRange) {
53
+ async function getSentenceRanges(startSentence, transcription, sentences, chapterOffset, locale, lastSentenceRange) {
53
54
  const sentenceRanges = [];
54
55
  const fullTranscriptionText = transcription.transcript;
55
56
  const transcriptionText = fullTranscriptionText.slice(chapterOffset);
@@ -57,13 +58,15 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
57
58
  transcriptionText
58
59
  ).then((s) => s.map((sentence) => sentence.toLowerCase()));
59
60
  let startSentenceEntry = startSentence;
60
- const sentenceEntries = sentences.map((sentence, index) => [index, sentence]).filter(([index, sentence]) => {
61
- if (sentence.replaceAll(/[.-_()[\],/?!@#$%^^&*`~;:='"<>+ˌˈ]/g, "").length <= 3) {
62
- if (index < startSentence) startSentenceEntry--;
63
- return false;
61
+ const sentenceEntries = [];
62
+ for (let i = 0; i < sentences.length; i++) {
63
+ const sentence = (await slugify(sentences[i], locale)).result;
64
+ if (sentence.length <= 3) {
65
+ if (i < startSentence) startSentenceEntry--;
66
+ continue;
64
67
  }
65
- return true;
66
- });
68
+ sentenceEntries.push([i, sentence]);
69
+ }
67
70
  let transcriptionWindowIndex = 0;
68
71
  let transcriptionWindowOffset = 0;
69
72
  let lastGoodTranscriptionWindow = 0;
@@ -76,7 +79,11 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
76
79
  transcriptionWindowIndex,
77
80
  transcriptionWindowIndex + 10
78
81
  );
79
- const transcriptionWindow = transcriptionWindowList.join("").slice(transcriptionWindowOffset);
82
+ const { result: transcriptionWindow, mapping } = await slugify(
83
+ transcriptionWindowList.join("-").slice(transcriptionWindowOffset),
84
+ locale
85
+ );
86
+ const inverted = mapping.invert();
80
87
  const query = collapseWhitespace(sentence.trim()).toLowerCase();
81
88
  const firstMatch = findNearestMatch(
82
89
  query,
@@ -99,8 +106,13 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
99
106
  continue;
100
107
  }
101
108
  const transcriptionOffset = transcriptionSentences.slice(0, transcriptionWindowIndex).join("").length;
109
+ const matchStart = inverted.map(firstMatch.index, 1);
110
+ const matchEnd = inverted.map(
111
+ firstMatch.index + firstMatch.match.length,
112
+ -1
113
+ );
102
114
  const startResult = findStartTimestamp(
103
- firstMatch.index + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
115
+ matchStart + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
104
116
  transcription
105
117
  );
106
118
  if (!startResult) {
@@ -110,7 +122,7 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
110
122
  let start = startResult.start;
111
123
  const audiofile = startResult.audiofile;
112
124
  const end = findEndTimestamp(
113
- firstMatch.index + firstMatch.match.length + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
125
+ matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset,
114
126
  transcription
115
127
  ) ?? startResult.end;
116
128
  if (sentenceRanges.length > 0) {
@@ -151,10 +163,10 @@ async function getSentenceRanges(startSentence, transcription, sentences, chapte
151
163
  audiofile
152
164
  });
153
165
  notFound = 0;
154
- lastMatchEnd = firstMatch.index + firstMatch.match.length + transcriptionOffset + transcriptionWindowOffset + chapterOffset;
166
+ lastMatchEnd = matchEnd + transcriptionOffset + transcriptionWindowOffset + chapterOffset;
155
167
  const windowIndexResult = getWindowIndexFromOffset(
156
168
  transcriptionWindowList,
157
- firstMatch.index + firstMatch.match.length + transcriptionWindowOffset
169
+ matchEnd + transcriptionWindowOffset
158
170
  );
159
171
  transcriptionWindowIndex += windowIndexResult.index;
160
172
  transcriptionWindowOffset = windowIndexResult.offset;
@@ -0,0 +1,125 @@
1
+ "use strict";
2
+ var __defProp = Object.defineProperty;
3
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
4
+ var __getOwnPropNames = Object.getOwnPropertyNames;
5
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
6
+ var __export = (target, all) => {
7
+ for (var name in all)
8
+ __defProp(target, name, { get: all[name], enumerable: true });
9
+ };
10
+ var __copyProps = (to, from, except, desc) => {
11
+ if (from && typeof from === "object" || typeof from === "function") {
12
+ for (let key of __getOwnPropNames(from))
13
+ if (!__hasOwnProp.call(to, key) && key !== except)
14
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
15
+ }
16
+ return to;
17
+ };
18
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
19
+ var slugify_exports = {};
20
+ __export(slugify_exports, {
21
+ slugify: () => slugify
22
+ });
23
+ module.exports = __toCommonJS(slugify_exports);
24
+ var import_locale_currency = require("locale-currency");
25
+ var import_to_words = require("to-words");
26
+ var import_transliteration = require("@storyteller-platform/transliteration");
27
+ const replacerMap = /* @__PURE__ */ new WeakMap();
28
+ function createReplacers(locale) {
29
+ const maximizedLocale = locale.maximize();
30
+ const demoNumber = 123456.789;
31
+ const currencyFormat = new Intl.NumberFormat(locale, {
32
+ style: "currency",
33
+ // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
34
+ currency: (0, import_locale_currency.getCurrency)(locale.maximize().region)
35
+ });
36
+ const currencyParts = currencyFormat.formatToParts(demoNumber);
37
+ const currencySymbols = currencyParts.reduce(
38
+ (acc, part, index) => {
39
+ if (part.type === "group") {
40
+ return {
41
+ ...acc,
42
+ group: part.value
43
+ };
44
+ }
45
+ if (part.type === "decimal") {
46
+ return {
47
+ ...acc,
48
+ decimal: part.value
49
+ };
50
+ }
51
+ if (part.type === "currency") {
52
+ return {
53
+ ...acc,
54
+ currency: part.value,
55
+ currencyLeading: index === 0
56
+ };
57
+ }
58
+ return acc;
59
+ },
60
+ { group: "", decimal: "", currency: "", currencyLeading: true }
61
+ );
62
+ const numeralRegexPart = `(\\p{Number}[\\p{Number}${currencySymbols.group}]*(?:[${currencySymbols.decimal}]\\p{Number}*)?)`;
63
+ const currencyRegex = currencySymbols.currencyLeading ? new RegExp(`[${currencySymbols.currency}]\\s?${numeralRegexPart}`, "gu") : new RegExp(`${numeralRegexPart}\\s?[${currencySymbols.currency}]`, "gu");
64
+ function currencyReplacer(match) {
65
+ const numeralMatch = match[1];
66
+ if (!numeralMatch) return match[0];
67
+ const normalizedNumeral = numeralMatch.replaceAll(new RegExp(`\\${currencySymbols.group}`, "g"), "").replace(new RegExp(`\\${currencySymbols.decimal}`), ".");
68
+ const number = parseFloat(normalizedNumeral);
69
+ return (0, import_to_words.toWords)(number, {
70
+ localeCode: `${maximizedLocale.language}-${maximizedLocale.region}`,
71
+ currency: true,
72
+ doNotAddOnly: true
73
+ });
74
+ }
75
+ const numberFormat = new Intl.NumberFormat(locale);
76
+ const numberParts = numberFormat.formatToParts(demoNumber);
77
+ const numberSymbols = numberParts.reduce(
78
+ (acc, part) => {
79
+ if (part.type === "group") {
80
+ return {
81
+ ...acc,
82
+ group: part.value
83
+ };
84
+ }
85
+ if (part.type === "decimal") {
86
+ return {
87
+ ...acc,
88
+ decimal: part.value
89
+ };
90
+ }
91
+ return acc;
92
+ },
93
+ { group: "", decimal: "" }
94
+ );
95
+ const numberRegex = new RegExp(
96
+ `(\\p{Number}[\\p{Number}${numberSymbols.group}]*(?:[${numberSymbols.decimal}]\\p{Number}*)?)`,
97
+ "gu"
98
+ );
99
+ function numberReplacer(match) {
100
+ const numeralMatch = match[1];
101
+ if (!numeralMatch) return match[0];
102
+ const normalizedNumeral = numeralMatch.replaceAll(new RegExp(`\\${numberSymbols.group}`, "g"), "").replace(new RegExp(`\\${numberSymbols.decimal}`), ".");
103
+ const number = parseFloat(normalizedNumeral);
104
+ return (0, import_to_words.toWords)(number, {
105
+ localeCode: `${maximizedLocale.language}-${maximizedLocale.region}`
106
+ });
107
+ }
108
+ return [
109
+ [currencyRegex, currencyReplacer],
110
+ [numberRegex, numberReplacer]
111
+ ];
112
+ }
113
+ async function slugify(text, locale) {
114
+ const replacers = replacerMap.get(locale) ?? createReplacers(locale);
115
+ replacerMap.set(locale, replacers);
116
+ const { result, mapping } = await (0, import_transliteration.slugify)(text, {
117
+ allowedChars: "a-zA-Z0-9",
118
+ replace: replacers
119
+ });
120
+ return { result, mapping };
121
+ }
122
+ // Annotate the CommonJS export names for ESM import in node:
123
+ 0 && (module.exports = {
124
+ slugify
125
+ });
@@ -0,0 +1,8 @@
1
+ import * as _storyteller_platform_transliteration from '@storyteller-platform/transliteration';
2
+
3
+ declare function slugify(text: string, locale: Intl.Locale): Promise<{
4
+ result: string;
5
+ mapping: _storyteller_platform_transliteration.Mapping;
6
+ }>;
7
+
8
+ export { slugify };