@storyteller-platform/align 0.1.24 → 0.1.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/align/align.cjs +21 -9
- package/dist/align/align.js +22 -11
- package/dist/align/getSentenceRanges.cjs +0 -58
- package/dist/align/getSentenceRanges.d.cts +1 -2
- package/dist/align/getSentenceRanges.d.ts +1 -2
- package/dist/align/getSentenceRanges.js +0 -57
- package/dist/align/interpolateSentenceRanges.cjs +124 -0
- package/dist/align/interpolateSentenceRanges.d.cts +23 -0
- package/dist/align/interpolateSentenceRanges.d.ts +23 -0
- package/dist/align/interpolateSentenceRanges.js +101 -0
- package/dist/align/search.cjs +18 -7
- package/dist/align/search.js +18 -7
- package/dist/align/slugify.cjs +31 -23
- package/dist/align/slugify.js +31 -23
- package/dist/index.d.cts +1 -2
- package/dist/index.d.ts +1 -2
- package/dist/markup/markup.cjs +21 -14
- package/dist/markup/markup.d.cts +2 -4
- package/dist/markup/markup.d.ts +2 -4
- package/dist/markup/markup.js +28 -16
- package/dist/markup/model.cjs +138 -5
- package/dist/markup/model.d.cts +2 -57
- package/dist/markup/model.d.ts +2 -57
- package/dist/markup/model.js +136 -5
- package/dist/markup/parseDom.cjs +80 -25
- package/dist/markup/parseDom.d.cts +4 -4
- package/dist/markup/parseDom.d.ts +4 -4
- package/dist/markup/parseDom.js +87 -24
- package/dist/markup/resolvedPos.cjs +85 -0
- package/dist/markup/resolvedPos.d.cts +2 -0
- package/dist/markup/resolvedPos.d.ts +2 -0
- package/dist/markup/resolvedPos.js +62 -0
- package/dist/markup/segmentation.cjs +4 -8
- package/dist/markup/segmentation.d.cts +3 -8
- package/dist/markup/segmentation.d.ts +3 -8
- package/dist/markup/segmentation.js +3 -7
- package/dist/markup/serializeDom.d.cts +1 -1
- package/dist/markup/serializeDom.d.ts +1 -1
- package/dist/markup/transform.cjs +59 -2
- package/dist/markup/transform.d.cts +8 -2
- package/dist/markup/transform.d.ts +8 -2
- package/dist/markup/transform.js +58 -1
- package/dist/model-Bv3yPEdd.d.cts +96 -0
- package/dist/model-Bv3yPEdd.d.ts +96 -0
- package/dist/snapshot/snapshot.cjs +8 -6
- package/dist/snapshot/snapshot.js +9 -7
- package/package.json +4 -4
package/dist/align/search.js
CHANGED
|
@@ -12,16 +12,16 @@ function buildNgramIndex(text) {
|
|
|
12
12
|
}
|
|
13
13
|
return index;
|
|
14
14
|
}
|
|
15
|
+
const NGRAM_SIZE = 5;
|
|
15
16
|
function* ngrams(text) {
|
|
16
17
|
const words = text.split("-");
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
yield [ngram, pos];
|
|
21
|
-
pos += words[i].length + 1;
|
|
18
|
+
for (const i of range(words.length - NGRAM_SIZE - 1)) {
|
|
19
|
+
const ngram = words.slice(i, i + NGRAM_SIZE).join("-");
|
|
20
|
+
yield [ngram, i];
|
|
22
21
|
}
|
|
23
22
|
}
|
|
24
23
|
function collectBoundaryVotes(query, document) {
|
|
24
|
+
const queryWords = query.split("-");
|
|
25
25
|
const documentIndex = buildNgramIndex(document);
|
|
26
26
|
let skippedNgrams = 0;
|
|
27
27
|
let totalNgrams = 0;
|
|
@@ -36,7 +36,7 @@ function collectBoundaryVotes(query, document) {
|
|
|
36
36
|
}
|
|
37
37
|
for (const documentStart of documentStarts) {
|
|
38
38
|
startVotes.push(documentStart - start);
|
|
39
|
-
endVotes.push(documentStart + (
|
|
39
|
+
endVotes.push(documentStart + (queryWords.length - start));
|
|
40
40
|
}
|
|
41
41
|
}
|
|
42
42
|
if (skippedNgrams > totalNgrams / 2) {
|
|
@@ -72,6 +72,14 @@ function chooseBestFromBins(bins, dir) {
|
|
|
72
72
|
}
|
|
73
73
|
return dir > 0 ? max(best) ?? null : min(best) ?? null;
|
|
74
74
|
}
|
|
75
|
+
function getOffsetFromWordIndex(wordIndex, document) {
|
|
76
|
+
const words = document.split("-");
|
|
77
|
+
let offset = 0;
|
|
78
|
+
for (const i of range(Math.min(words.length, Math.max(0, wordIndex)))) {
|
|
79
|
+
offset += words[i].length + 1;
|
|
80
|
+
}
|
|
81
|
+
return offset;
|
|
82
|
+
}
|
|
75
83
|
function findBoundaries(query, document) {
|
|
76
84
|
const boundaryVotes = collectBoundaryVotes(query, document);
|
|
77
85
|
if (!boundaryVotes) return null;
|
|
@@ -86,7 +94,10 @@ function findBoundaries(query, document) {
|
|
|
86
94
|
if (bestEnd === null) {
|
|
87
95
|
return null;
|
|
88
96
|
}
|
|
89
|
-
return {
|
|
97
|
+
return {
|
|
98
|
+
start: getOffsetFromWordIndex(bestStart, document),
|
|
99
|
+
end: getOffsetFromWordIndex(bestEnd, document)
|
|
100
|
+
};
|
|
90
101
|
}
|
|
91
102
|
export {
|
|
92
103
|
buildNgramIndex,
|
package/dist/align/slugify.cjs
CHANGED
|
@@ -25,16 +25,16 @@ var import_locale_currency = require("locale-currency");
|
|
|
25
25
|
var import_to_words = require("to-words");
|
|
26
26
|
var import_transliteration = require("@storyteller-platform/transliteration");
|
|
27
27
|
const replacerMap = /* @__PURE__ */ new WeakMap();
|
|
28
|
-
function
|
|
29
|
-
const
|
|
28
|
+
function getCurrencySymbols(locale) {
|
|
29
|
+
const region = locale.maximize().region;
|
|
30
30
|
const demoNumber = 123456.789;
|
|
31
31
|
const currencyFormat = new Intl.NumberFormat(locale, {
|
|
32
32
|
style: "currency",
|
|
33
33
|
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
34
|
-
currency: (0, import_locale_currency.getCurrency)(locale.maximize().region)
|
|
34
|
+
currency: region ? (0, import_locale_currency.getCurrency)(locale.maximize().region) : "USD"
|
|
35
35
|
});
|
|
36
36
|
const currencyParts = currencyFormat.formatToParts(demoNumber);
|
|
37
|
-
|
|
37
|
+
return currencyParts.reduce(
|
|
38
38
|
(acc, part, index) => {
|
|
39
39
|
if (part.type === "group") {
|
|
40
40
|
return {
|
|
@@ -59,27 +59,12 @@ function createReplacers(locale) {
|
|
|
59
59
|
},
|
|
60
60
|
{ group: "", decimal: "", currency: "", currencyLeading: true }
|
|
61
61
|
);
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
const numeralMatch = match[1];
|
|
66
|
-
if (!numeralMatch) return match[0];
|
|
67
|
-
const normalizedNumeral = numeralMatch.replaceAll(new RegExp(`\\${currencySymbols.group}`, "g"), "").replace(new RegExp(`\\${currencySymbols.decimal}`), ".");
|
|
68
|
-
const number = parseFloat(normalizedNumeral);
|
|
69
|
-
if (Number.isNaN(number)) return match[0];
|
|
70
|
-
try {
|
|
71
|
-
return (0, import_to_words.toWords)(number, {
|
|
72
|
-
localeCode: `${maximizedLocale.language}-${maximizedLocale.region}`,
|
|
73
|
-
currency: true,
|
|
74
|
-
doNotAddOnly: true
|
|
75
|
-
});
|
|
76
|
-
} catch {
|
|
77
|
-
return match[0];
|
|
78
|
-
}
|
|
79
|
-
}
|
|
62
|
+
}
|
|
63
|
+
function getNumberSymbols(locale) {
|
|
64
|
+
const demoNumber = 123456.789;
|
|
80
65
|
const numberFormat = new Intl.NumberFormat(locale);
|
|
81
66
|
const numberParts = numberFormat.formatToParts(demoNumber);
|
|
82
|
-
|
|
67
|
+
return numberParts.reduce(
|
|
83
68
|
(acc, part) => {
|
|
84
69
|
if (part.type === "group") {
|
|
85
70
|
return {
|
|
@@ -97,6 +82,29 @@ function createReplacers(locale) {
|
|
|
97
82
|
},
|
|
98
83
|
{ group: "", decimal: "" }
|
|
99
84
|
);
|
|
85
|
+
}
|
|
86
|
+
function createReplacers(locale) {
|
|
87
|
+
const maximizedLocale = locale.maximize();
|
|
88
|
+
const currencySymbols = getCurrencySymbols(maximizedLocale);
|
|
89
|
+
const numeralRegexPart = `(\\p{Number}[\\p{Number}${currencySymbols.group}]*(?:[${currencySymbols.decimal}]\\p{Number}*)?)`;
|
|
90
|
+
const currencyRegex = currencySymbols.currencyLeading ? new RegExp(`[${currencySymbols.currency}]\\s?${numeralRegexPart}`, "gu") : new RegExp(`${numeralRegexPart}\\s?[${currencySymbols.currency}]`, "gu");
|
|
91
|
+
function currencyReplacer(match) {
|
|
92
|
+
const numeralMatch = match[1];
|
|
93
|
+
if (!numeralMatch) return match[0];
|
|
94
|
+
const normalizedNumeral = numeralMatch.replaceAll(new RegExp(`\\${currencySymbols.group}`, "g"), "").replace(new RegExp(`\\${currencySymbols.decimal}`), ".");
|
|
95
|
+
const number = parseFloat(normalizedNumeral);
|
|
96
|
+
if (Number.isNaN(number)) return match[0];
|
|
97
|
+
try {
|
|
98
|
+
return (0, import_to_words.toWords)(number, {
|
|
99
|
+
localeCode: `${maximizedLocale.language}-${maximizedLocale.region}`,
|
|
100
|
+
currency: true,
|
|
101
|
+
doNotAddOnly: true
|
|
102
|
+
});
|
|
103
|
+
} catch {
|
|
104
|
+
return match[0];
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
const numberSymbols = getNumberSymbols(maximizedLocale);
|
|
100
108
|
const numberRegex = new RegExp(
|
|
101
109
|
`(\\p{Number}[\\p{Number}${numberSymbols.group}]*(?:[${numberSymbols.decimal}]\\p{Number}*)?)`,
|
|
102
110
|
"gu"
|
package/dist/align/slugify.js
CHANGED
|
@@ -3,16 +3,16 @@ import { getCurrency } from "locale-currency";
|
|
|
3
3
|
import { toWords } from "to-words";
|
|
4
4
|
import { slugify as transliterateSlugify } from "@storyteller-platform/transliteration";
|
|
5
5
|
const replacerMap = /* @__PURE__ */ new WeakMap();
|
|
6
|
-
function
|
|
7
|
-
const
|
|
6
|
+
function getCurrencySymbols(locale) {
|
|
7
|
+
const region = locale.maximize().region;
|
|
8
8
|
const demoNumber = 123456.789;
|
|
9
9
|
const currencyFormat = new Intl.NumberFormat(locale, {
|
|
10
10
|
style: "currency",
|
|
11
11
|
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion
|
|
12
|
-
currency: getCurrency(locale.maximize().region)
|
|
12
|
+
currency: region ? getCurrency(locale.maximize().region) : "USD"
|
|
13
13
|
});
|
|
14
14
|
const currencyParts = currencyFormat.formatToParts(demoNumber);
|
|
15
|
-
|
|
15
|
+
return currencyParts.reduce(
|
|
16
16
|
(acc, part, index) => {
|
|
17
17
|
if (part.type === "group") {
|
|
18
18
|
return {
|
|
@@ -37,27 +37,12 @@ function createReplacers(locale) {
|
|
|
37
37
|
},
|
|
38
38
|
{ group: "", decimal: "", currency: "", currencyLeading: true }
|
|
39
39
|
);
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
const numeralMatch = match[1];
|
|
44
|
-
if (!numeralMatch) return match[0];
|
|
45
|
-
const normalizedNumeral = numeralMatch.replaceAll(new RegExp(`\\${currencySymbols.group}`, "g"), "").replace(new RegExp(`\\${currencySymbols.decimal}`), ".");
|
|
46
|
-
const number = parseFloat(normalizedNumeral);
|
|
47
|
-
if (Number.isNaN(number)) return match[0];
|
|
48
|
-
try {
|
|
49
|
-
return toWords(number, {
|
|
50
|
-
localeCode: `${maximizedLocale.language}-${maximizedLocale.region}`,
|
|
51
|
-
currency: true,
|
|
52
|
-
doNotAddOnly: true
|
|
53
|
-
});
|
|
54
|
-
} catch {
|
|
55
|
-
return match[0];
|
|
56
|
-
}
|
|
57
|
-
}
|
|
40
|
+
}
|
|
41
|
+
function getNumberSymbols(locale) {
|
|
42
|
+
const demoNumber = 123456.789;
|
|
58
43
|
const numberFormat = new Intl.NumberFormat(locale);
|
|
59
44
|
const numberParts = numberFormat.formatToParts(demoNumber);
|
|
60
|
-
|
|
45
|
+
return numberParts.reduce(
|
|
61
46
|
(acc, part) => {
|
|
62
47
|
if (part.type === "group") {
|
|
63
48
|
return {
|
|
@@ -75,6 +60,29 @@ function createReplacers(locale) {
|
|
|
75
60
|
},
|
|
76
61
|
{ group: "", decimal: "" }
|
|
77
62
|
);
|
|
63
|
+
}
|
|
64
|
+
function createReplacers(locale) {
|
|
65
|
+
const maximizedLocale = locale.maximize();
|
|
66
|
+
const currencySymbols = getCurrencySymbols(maximizedLocale);
|
|
67
|
+
const numeralRegexPart = `(\\p{Number}[\\p{Number}${currencySymbols.group}]*(?:[${currencySymbols.decimal}]\\p{Number}*)?)`;
|
|
68
|
+
const currencyRegex = currencySymbols.currencyLeading ? new RegExp(`[${currencySymbols.currency}]\\s?${numeralRegexPart}`, "gu") : new RegExp(`${numeralRegexPart}\\s?[${currencySymbols.currency}]`, "gu");
|
|
69
|
+
function currencyReplacer(match) {
|
|
70
|
+
const numeralMatch = match[1];
|
|
71
|
+
if (!numeralMatch) return match[0];
|
|
72
|
+
const normalizedNumeral = numeralMatch.replaceAll(new RegExp(`\\${currencySymbols.group}`, "g"), "").replace(new RegExp(`\\${currencySymbols.decimal}`), ".");
|
|
73
|
+
const number = parseFloat(normalizedNumeral);
|
|
74
|
+
if (Number.isNaN(number)) return match[0];
|
|
75
|
+
try {
|
|
76
|
+
return toWords(number, {
|
|
77
|
+
localeCode: `${maximizedLocale.language}-${maximizedLocale.region}`,
|
|
78
|
+
currency: true,
|
|
79
|
+
doNotAddOnly: true
|
|
80
|
+
});
|
|
81
|
+
} catch {
|
|
82
|
+
return match[0];
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
const numberSymbols = getNumberSymbols(maximizedLocale);
|
|
78
86
|
const numberRegex = new RegExp(
|
|
79
87
|
`(\\p{Number}[\\p{Number}${numberSymbols.group}]*(?:[${numberSymbols.decimal}]\\p{Number}*)?)`,
|
|
80
88
|
"gu"
|
package/dist/index.d.cts
CHANGED
|
@@ -7,9 +7,8 @@ import '@esfx/async-semaphore';
|
|
|
7
7
|
import 'pino';
|
|
8
8
|
import './process/AudioEncoding.cjs';
|
|
9
9
|
import '@storyteller-platform/ghost-story/constants';
|
|
10
|
-
import '@echogarden/text-segmentation';
|
|
11
10
|
import '@storyteller-platform/epub';
|
|
12
|
-
import './markup/map.cjs';
|
|
13
11
|
import '@storyteller-platform/ghost-story/recognition';
|
|
14
12
|
import './align/getSentenceRanges.cjs';
|
|
13
|
+
import '@echogarden/text-segmentation';
|
|
15
14
|
import '@storyteller-platform/transliteration';
|
package/dist/index.d.ts
CHANGED
|
@@ -7,9 +7,8 @@ import '@esfx/async-semaphore';
|
|
|
7
7
|
import 'pino';
|
|
8
8
|
import './process/AudioEncoding.js';
|
|
9
9
|
import '@storyteller-platform/ghost-story/constants';
|
|
10
|
-
import '@echogarden/text-segmentation';
|
|
11
10
|
import '@storyteller-platform/epub';
|
|
12
|
-
import './markup/map.js';
|
|
13
11
|
import '@storyteller-platform/ghost-story/recognition';
|
|
14
12
|
import './align/getSentenceRanges.js';
|
|
13
|
+
import '@echogarden/text-segmentation';
|
|
15
14
|
import '@storyteller-platform/transliteration';
|
package/dist/markup/markup.cjs
CHANGED
|
@@ -97,16 +97,11 @@ async function markup(input, output, options) {
|
|
|
97
97
|
continue;
|
|
98
98
|
}
|
|
99
99
|
const chapterXml = await epub.readXhtmlItemContents(chapterId);
|
|
100
|
-
const {
|
|
101
|
-
import_epub.Epub.getXhtmlBody(chapterXml),
|
|
102
|
-
{ primaryLocale }
|
|
103
|
-
);
|
|
104
|
-
const { markedUp, timing: chapterTiming } = markupChapter(
|
|
100
|
+
const { markedUp, timing: chapterTiming } = await markupChapter(
|
|
105
101
|
chapterId,
|
|
106
102
|
chapterXml,
|
|
107
103
|
options.granularity ?? "sentence",
|
|
108
|
-
|
|
109
|
-
mapping
|
|
104
|
+
primaryLocale
|
|
110
105
|
);
|
|
111
106
|
timing.add(chapterTiming.summary());
|
|
112
107
|
await epub.writeXhtmlItemContents(chapterId, markedUp);
|
|
@@ -119,7 +114,7 @@ async function markup(input, output, options) {
|
|
|
119
114
|
__callDispose(_stack, _error, _hasError);
|
|
120
115
|
}
|
|
121
116
|
}
|
|
122
|
-
function markupChapter(chapterId, chapterXml, granularity,
|
|
117
|
+
async function markupChapter(chapterId, chapterXml, granularity, locale) {
|
|
123
118
|
const timing = (0, import_ghost_story.createTiming)();
|
|
124
119
|
const html = import_epub.Epub.findXmlChildByName("html", chapterXml);
|
|
125
120
|
if (!html) throw new Error("Invalid XHTML document: no html element");
|
|
@@ -128,8 +123,14 @@ function markupChapter(chapterId, chapterXml, granularity, segmentation, mapping
|
|
|
128
123
|
clearBodyElement(chapterXml);
|
|
129
124
|
const taggedHtml = import_epub.Epub.findXmlChildByName("html", chapterXml);
|
|
130
125
|
const taggedBody = import_epub.Epub.findXmlChildByName("body", taggedHtml["html"]);
|
|
126
|
+
const original = (0, import_parseDom.parseDom)(import_epub.Epub.getXmlChildren(body));
|
|
127
|
+
const inlined = (0, import_transform.inlineFootnotes)(original);
|
|
128
|
+
const lifted = (0, import_transform.liftText)(inlined.root);
|
|
129
|
+
const segmentation = await (0, import_segmentation.segmentChapter)(lifted.result, {
|
|
130
|
+
primaryLocale: locale
|
|
131
|
+
});
|
|
131
132
|
timing.time("mark up", () => {
|
|
132
|
-
let root =
|
|
133
|
+
let root = inlined.root;
|
|
133
134
|
let pos = 0;
|
|
134
135
|
let i = 0;
|
|
135
136
|
for (const sentence of segmentation) {
|
|
@@ -140,8 +141,8 @@ function markupChapter(chapterId, chapterXml, granularity, segmentation, mapping
|
|
|
140
141
|
if (word.text.match(/\S/)) {
|
|
141
142
|
root = (0, import_transform.addMark)(
|
|
142
143
|
root,
|
|
143
|
-
mapping.invert().map(wordPos),
|
|
144
|
-
mapping.invert().map(wordPos + word.text.replace(/\n$/, "").length, -1),
|
|
144
|
+
lifted.mapping.invert().map(wordPos),
|
|
145
|
+
lifted.mapping.invert().map(wordPos + word.text.replace(/\n$/, "").length, -1),
|
|
145
146
|
new import_model.Mark("span", { id: `${chapterId}-s${i}-w${j}` })
|
|
146
147
|
);
|
|
147
148
|
j++;
|
|
@@ -152,15 +153,21 @@ function markupChapter(chapterId, chapterXml, granularity, segmentation, mapping
|
|
|
152
153
|
if (sentence.text.match(/\S/)) {
|
|
153
154
|
root = (0, import_transform.addMark)(
|
|
154
155
|
root,
|
|
155
|
-
mapping.invert().map(pos),
|
|
156
|
-
mapping.invert().map(pos + sentence.text.replace(/\n$/, "").length, -1),
|
|
156
|
+
lifted.mapping.invert().map(pos),
|
|
157
|
+
lifted.mapping.invert().map(pos + sentence.text.replace(/\n$/, "").length, -1),
|
|
157
158
|
new import_model.Mark("span", { id: `${chapterId}-s${i}` })
|
|
158
159
|
);
|
|
159
160
|
i++;
|
|
160
161
|
}
|
|
161
162
|
pos += sentence.text.replace(/\n$/, "").length;
|
|
162
163
|
}
|
|
163
|
-
|
|
164
|
+
const replaced = (0, import_transform.replaceFootnotes)(
|
|
165
|
+
original,
|
|
166
|
+
root,
|
|
167
|
+
inlined.footnotePairs,
|
|
168
|
+
inlined.mapping
|
|
169
|
+
);
|
|
170
|
+
taggedBody["body"] = (0, import_serializeDom.serializeDom)(replaced);
|
|
164
171
|
});
|
|
165
172
|
return { markedUp: chapterXml, timing };
|
|
166
173
|
}
|
package/dist/markup/markup.d.cts
CHANGED
|
@@ -1,9 +1,7 @@
|
|
|
1
1
|
import * as _storyteller_platform_ghost_story from '@storyteller-platform/ghost-story';
|
|
2
2
|
import { TimingAggregator } from '@storyteller-platform/ghost-story';
|
|
3
|
-
import { Sentence } from '@echogarden/text-segmentation';
|
|
4
3
|
import { Logger } from 'pino';
|
|
5
4
|
import { ParsedXml } from '@storyteller-platform/epub';
|
|
6
|
-
import { Mapping } from './map.cjs';
|
|
7
5
|
|
|
8
6
|
interface MarkupOptions {
|
|
9
7
|
granularity?: "word" | "sentence";
|
|
@@ -12,9 +10,9 @@ interface MarkupOptions {
|
|
|
12
10
|
logger?: Logger;
|
|
13
11
|
}
|
|
14
12
|
declare function markup(input: string, output: string, options: MarkupOptions): Promise<TimingAggregator>;
|
|
15
|
-
declare function markupChapter(chapterId: string, chapterXml: ParsedXml, granularity: "word" | "sentence",
|
|
13
|
+
declare function markupChapter(chapterId: string, chapterXml: ParsedXml, granularity: "word" | "sentence", locale: Intl.Locale | null): Promise<{
|
|
16
14
|
markedUp: ParsedXml;
|
|
17
15
|
timing: _storyteller_platform_ghost_story.Timing;
|
|
18
|
-
}
|
|
16
|
+
}>;
|
|
19
17
|
|
|
20
18
|
export { type MarkupOptions, markup, markupChapter };
|
package/dist/markup/markup.d.ts
CHANGED
|
@@ -1,9 +1,7 @@
|
|
|
1
1
|
import * as _storyteller_platform_ghost_story from '@storyteller-platform/ghost-story';
|
|
2
2
|
import { TimingAggregator } from '@storyteller-platform/ghost-story';
|
|
3
|
-
import { Sentence } from '@echogarden/text-segmentation';
|
|
4
3
|
import { Logger } from 'pino';
|
|
5
4
|
import { ParsedXml } from '@storyteller-platform/epub';
|
|
6
|
-
import { Mapping } from './map.js';
|
|
7
5
|
|
|
8
6
|
interface MarkupOptions {
|
|
9
7
|
granularity?: "word" | "sentence";
|
|
@@ -12,9 +10,9 @@ interface MarkupOptions {
|
|
|
12
10
|
logger?: Logger;
|
|
13
11
|
}
|
|
14
12
|
declare function markup(input: string, output: string, options: MarkupOptions): Promise<TimingAggregator>;
|
|
15
|
-
declare function markupChapter(chapterId: string, chapterXml: ParsedXml, granularity: "word" | "sentence",
|
|
13
|
+
declare function markupChapter(chapterId: string, chapterXml: ParsedXml, granularity: "word" | "sentence", locale: Intl.Locale | null): Promise<{
|
|
16
14
|
markedUp: ParsedXml;
|
|
17
15
|
timing: _storyteller_platform_ghost_story.Timing;
|
|
18
|
-
}
|
|
16
|
+
}>;
|
|
19
17
|
|
|
20
18
|
export { type MarkupOptions, markup, markupChapter };
|
package/dist/markup/markup.js
CHANGED
|
@@ -11,9 +11,14 @@ import {
|
|
|
11
11
|
} from "@storyteller-platform/ghost-story";
|
|
12
12
|
import { Mark } from "./model.js";
|
|
13
13
|
import { parseDom } from "./parseDom.js";
|
|
14
|
-
import {
|
|
14
|
+
import { segmentChapter } from "./segmentation.js";
|
|
15
15
|
import { serializeDom } from "./serializeDom.js";
|
|
16
|
-
import {
|
|
16
|
+
import {
|
|
17
|
+
addMark,
|
|
18
|
+
inlineFootnotes,
|
|
19
|
+
liftText,
|
|
20
|
+
replaceFootnotes
|
|
21
|
+
} from "./transform.js";
|
|
17
22
|
async function markup(input, output, options) {
|
|
18
23
|
var _stack = [];
|
|
19
24
|
try {
|
|
@@ -35,16 +40,11 @@ async function markup(input, output, options) {
|
|
|
35
40
|
continue;
|
|
36
41
|
}
|
|
37
42
|
const chapterXml = await epub.readXhtmlItemContents(chapterId);
|
|
38
|
-
const {
|
|
39
|
-
Epub.getXhtmlBody(chapterXml),
|
|
40
|
-
{ primaryLocale }
|
|
41
|
-
);
|
|
42
|
-
const { markedUp, timing: chapterTiming } = markupChapter(
|
|
43
|
+
const { markedUp, timing: chapterTiming } = await markupChapter(
|
|
43
44
|
chapterId,
|
|
44
45
|
chapterXml,
|
|
45
46
|
options.granularity ?? "sentence",
|
|
46
|
-
|
|
47
|
-
mapping
|
|
47
|
+
primaryLocale
|
|
48
48
|
);
|
|
49
49
|
timing.add(chapterTiming.summary());
|
|
50
50
|
await epub.writeXhtmlItemContents(chapterId, markedUp);
|
|
@@ -57,7 +57,7 @@ async function markup(input, output, options) {
|
|
|
57
57
|
__callDispose(_stack, _error, _hasError);
|
|
58
58
|
}
|
|
59
59
|
}
|
|
60
|
-
function markupChapter(chapterId, chapterXml, granularity,
|
|
60
|
+
async function markupChapter(chapterId, chapterXml, granularity, locale) {
|
|
61
61
|
const timing = createTiming();
|
|
62
62
|
const html = Epub.findXmlChildByName("html", chapterXml);
|
|
63
63
|
if (!html) throw new Error("Invalid XHTML document: no html element");
|
|
@@ -66,8 +66,14 @@ function markupChapter(chapterId, chapterXml, granularity, segmentation, mapping
|
|
|
66
66
|
clearBodyElement(chapterXml);
|
|
67
67
|
const taggedHtml = Epub.findXmlChildByName("html", chapterXml);
|
|
68
68
|
const taggedBody = Epub.findXmlChildByName("body", taggedHtml["html"]);
|
|
69
|
+
const original = parseDom(Epub.getXmlChildren(body));
|
|
70
|
+
const inlined = inlineFootnotes(original);
|
|
71
|
+
const lifted = liftText(inlined.root);
|
|
72
|
+
const segmentation = await segmentChapter(lifted.result, {
|
|
73
|
+
primaryLocale: locale
|
|
74
|
+
});
|
|
69
75
|
timing.time("mark up", () => {
|
|
70
|
-
let root =
|
|
76
|
+
let root = inlined.root;
|
|
71
77
|
let pos = 0;
|
|
72
78
|
let i = 0;
|
|
73
79
|
for (const sentence of segmentation) {
|
|
@@ -78,8 +84,8 @@ function markupChapter(chapterId, chapterXml, granularity, segmentation, mapping
|
|
|
78
84
|
if (word.text.match(/\S/)) {
|
|
79
85
|
root = addMark(
|
|
80
86
|
root,
|
|
81
|
-
mapping.invert().map(wordPos),
|
|
82
|
-
mapping.invert().map(wordPos + word.text.replace(/\n$/, "").length, -1),
|
|
87
|
+
lifted.mapping.invert().map(wordPos),
|
|
88
|
+
lifted.mapping.invert().map(wordPos + word.text.replace(/\n$/, "").length, -1),
|
|
83
89
|
new Mark("span", { id: `${chapterId}-s${i}-w${j}` })
|
|
84
90
|
);
|
|
85
91
|
j++;
|
|
@@ -90,15 +96,21 @@ function markupChapter(chapterId, chapterXml, granularity, segmentation, mapping
|
|
|
90
96
|
if (sentence.text.match(/\S/)) {
|
|
91
97
|
root = addMark(
|
|
92
98
|
root,
|
|
93
|
-
mapping.invert().map(pos),
|
|
94
|
-
mapping.invert().map(pos + sentence.text.replace(/\n$/, "").length, -1),
|
|
99
|
+
lifted.mapping.invert().map(pos),
|
|
100
|
+
lifted.mapping.invert().map(pos + sentence.text.replace(/\n$/, "").length, -1),
|
|
95
101
|
new Mark("span", { id: `${chapterId}-s${i}` })
|
|
96
102
|
);
|
|
97
103
|
i++;
|
|
98
104
|
}
|
|
99
105
|
pos += sentence.text.replace(/\n$/, "").length;
|
|
100
106
|
}
|
|
101
|
-
|
|
107
|
+
const replaced = replaceFootnotes(
|
|
108
|
+
original,
|
|
109
|
+
root,
|
|
110
|
+
inlined.footnotePairs,
|
|
111
|
+
inlined.mapping
|
|
112
|
+
);
|
|
113
|
+
taggedBody["body"] = serializeDom(replaced);
|
|
102
114
|
});
|
|
103
115
|
return { markedUp: chapterXml, timing };
|
|
104
116
|
}
|
package/dist/markup/model.cjs
CHANGED
|
@@ -18,14 +18,17 @@ var __copyProps = (to, from, except, desc) => {
|
|
|
18
18
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
19
|
var model_exports = {};
|
|
20
20
|
__export(model_exports, {
|
|
21
|
+
FootnoteNode: () => FootnoteNode,
|
|
21
22
|
Mark: () => Mark,
|
|
22
23
|
Node: () => Node,
|
|
24
|
+
NoterefNode: () => NoterefNode,
|
|
23
25
|
Root: () => Root,
|
|
24
26
|
TextNode: () => TextNode,
|
|
25
27
|
descendants: () => descendants
|
|
26
28
|
});
|
|
27
29
|
module.exports = __toCommonJS(model_exports);
|
|
28
30
|
var import_itertools = require("itertools");
|
|
31
|
+
var import_resolvedPos = require("./resolvedPos.cjs");
|
|
29
32
|
var import_semantics = require("./semantics.cjs");
|
|
30
33
|
class Root {
|
|
31
34
|
constructor(children) {
|
|
@@ -39,6 +42,12 @@ class Root {
|
|
|
39
42
|
get textContent() {
|
|
40
43
|
return this.children.reduce((acc, child) => acc + child.textContent, "");
|
|
41
44
|
}
|
|
45
|
+
get nodeSize() {
|
|
46
|
+
return this.border + (this.children.reduce((acc, child) => acc + child.nodeSize, 0) || 1) + this.border;
|
|
47
|
+
}
|
|
48
|
+
get contentSize() {
|
|
49
|
+
return this.nodeSize - this.border * 2;
|
|
50
|
+
}
|
|
42
51
|
split(at) {
|
|
43
52
|
const children = [];
|
|
44
53
|
let pos = this.border;
|
|
@@ -57,6 +66,58 @@ class Root {
|
|
|
57
66
|
copy(opts = {}) {
|
|
58
67
|
return new Root(opts.children ?? this.children);
|
|
59
68
|
}
|
|
69
|
+
findIndex(pos) {
|
|
70
|
+
if (pos === 0) return { index: 0, offset: pos };
|
|
71
|
+
if (pos === this.contentSize) {
|
|
72
|
+
return { index: this.children.length, offset: pos };
|
|
73
|
+
}
|
|
74
|
+
if (pos > this.contentSize || pos < 0) {
|
|
75
|
+
throw new RangeError(`Position ${pos} outside of fragment`);
|
|
76
|
+
}
|
|
77
|
+
for (let i = 0, curPos = 0; ; i++) {
|
|
78
|
+
const cur = this.children[i];
|
|
79
|
+
const end = curPos + cur.nodeSize;
|
|
80
|
+
if (end >= pos) {
|
|
81
|
+
if (end === pos) return { index: i + 1, offset: end };
|
|
82
|
+
return { index: i, offset: curPos };
|
|
83
|
+
}
|
|
84
|
+
curPos = end;
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
replace(at, withNode) {
|
|
88
|
+
const children = [];
|
|
89
|
+
let pos = this.border;
|
|
90
|
+
for (const child of this.children) {
|
|
91
|
+
if (at === pos) {
|
|
92
|
+
children.push(withNode);
|
|
93
|
+
} else if (at > pos && at < pos + child.nodeSize) {
|
|
94
|
+
if (child instanceof TextNode) {
|
|
95
|
+
throw new Error("Tried to replace at a position within a text node");
|
|
96
|
+
}
|
|
97
|
+
children.push(child.replace(at - pos, withNode));
|
|
98
|
+
} else {
|
|
99
|
+
children.push(child);
|
|
100
|
+
}
|
|
101
|
+
pos += children.at(-1).nodeSize;
|
|
102
|
+
}
|
|
103
|
+
return this.copy({ children });
|
|
104
|
+
}
|
|
105
|
+
cut(pos) {
|
|
106
|
+
let currentPos = this.border;
|
|
107
|
+
for (const child of this.children) {
|
|
108
|
+
if (pos === currentPos) {
|
|
109
|
+
return child;
|
|
110
|
+
}
|
|
111
|
+
if (pos > currentPos && pos < currentPos + child.nodeSize) {
|
|
112
|
+
return child.cut(pos - currentPos);
|
|
113
|
+
}
|
|
114
|
+
currentPos += child.nodeSize;
|
|
115
|
+
}
|
|
116
|
+
return null;
|
|
117
|
+
}
|
|
118
|
+
resolve(pos) {
|
|
119
|
+
return import_resolvedPos.ResolvedPos.resolve(this, pos);
|
|
120
|
+
}
|
|
60
121
|
}
|
|
61
122
|
class Node {
|
|
62
123
|
constructor(tagName, attrs = {}, children = [], marks = []) {
|
|
@@ -80,6 +141,9 @@ class Node {
|
|
|
80
141
|
get nodeSize() {
|
|
81
142
|
return this.border + (this.children.reduce((acc, child) => acc + child.nodeSize, 0) || 1) + this.border;
|
|
82
143
|
}
|
|
144
|
+
get contentSize() {
|
|
145
|
+
return this.nodeSize - this.border * 2;
|
|
146
|
+
}
|
|
83
147
|
get textContent() {
|
|
84
148
|
return this.children.reduce((acc, child) => acc + child.textContent, "");
|
|
85
149
|
}
|
|
@@ -102,14 +166,74 @@ class Node {
|
|
|
102
166
|
}
|
|
103
167
|
return this.copy({ children });
|
|
104
168
|
}
|
|
169
|
+
static instance() {
|
|
170
|
+
return this;
|
|
171
|
+
}
|
|
172
|
+
static create(klass, ...args) {
|
|
173
|
+
return new klass(...args);
|
|
174
|
+
}
|
|
105
175
|
copy(opts = {}) {
|
|
106
|
-
return
|
|
176
|
+
return Node.create(
|
|
177
|
+
this.constructor,
|
|
107
178
|
this.tagName,
|
|
108
179
|
opts.attrs ?? this.attrs,
|
|
109
180
|
opts.children ?? this.children,
|
|
110
181
|
opts.marks ?? this.marks
|
|
111
182
|
);
|
|
112
183
|
}
|
|
184
|
+
replace(at, withNode) {
|
|
185
|
+
const children = [];
|
|
186
|
+
let pos = this.border;
|
|
187
|
+
for (const child of this.children) {
|
|
188
|
+
if (at === pos) {
|
|
189
|
+
children.push(withNode);
|
|
190
|
+
} else if (at > pos && at < pos + child.nodeSize) {
|
|
191
|
+
if (child instanceof TextNode) {
|
|
192
|
+
throw new Error("Tried to replace at a position within a text node");
|
|
193
|
+
}
|
|
194
|
+
children.push(child.replace(at - pos, withNode));
|
|
195
|
+
} else {
|
|
196
|
+
children.push(child);
|
|
197
|
+
}
|
|
198
|
+
pos += children.at(-1).nodeSize;
|
|
199
|
+
}
|
|
200
|
+
return this.copy({ children });
|
|
201
|
+
}
|
|
202
|
+
cut(pos) {
|
|
203
|
+
let currentPos = this.border;
|
|
204
|
+
for (const child of this.children) {
|
|
205
|
+
if (pos === currentPos) {
|
|
206
|
+
return child;
|
|
207
|
+
}
|
|
208
|
+
if (pos > currentPos && pos < currentPos + child.nodeSize) {
|
|
209
|
+
return child.cut(pos - currentPos);
|
|
210
|
+
}
|
|
211
|
+
currentPos += child.nodeSize;
|
|
212
|
+
}
|
|
213
|
+
return null;
|
|
214
|
+
}
|
|
215
|
+
findIndex(pos) {
|
|
216
|
+
if (pos === 0) return { index: 0, offset: pos };
|
|
217
|
+
if (pos === this.contentSize) {
|
|
218
|
+
return { index: this.children.length, offset: pos };
|
|
219
|
+
}
|
|
220
|
+
if (pos > this.contentSize || pos < 0) {
|
|
221
|
+
throw new RangeError(`Position ${pos} outside of fragment`);
|
|
222
|
+
}
|
|
223
|
+
for (let i = 0, curPos = 0; ; i++) {
|
|
224
|
+
const cur = this.children[i];
|
|
225
|
+
const end = curPos + cur.nodeSize;
|
|
226
|
+
if (end >= pos) {
|
|
227
|
+
if (end === pos) return { index: i + 1, offset: end };
|
|
228
|
+
return { index: i, offset: curPos };
|
|
229
|
+
}
|
|
230
|
+
curPos = end;
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
}
|
|
234
|
+
class NoterefNode extends Node {
|
|
235
|
+
}
|
|
236
|
+
class FootnoteNode extends Node {
|
|
113
237
|
}
|
|
114
238
|
class Mark {
|
|
115
239
|
constructor(tagName, attrs = {}) {
|
|
@@ -138,6 +262,9 @@ class TextNode {
|
|
|
138
262
|
get nodeSize() {
|
|
139
263
|
return this.text.length;
|
|
140
264
|
}
|
|
265
|
+
get contentSize() {
|
|
266
|
+
return this.nodeSize;
|
|
267
|
+
}
|
|
141
268
|
get textContent() {
|
|
142
269
|
return this.text;
|
|
143
270
|
}
|
|
@@ -152,20 +279,26 @@ class TextNode {
|
|
|
152
279
|
copy(opts = {}) {
|
|
153
280
|
return new TextNode(this.text, opts.marks ?? this.marks);
|
|
154
281
|
}
|
|
282
|
+
cut(pos) {
|
|
283
|
+
return new TextNode(this.text.slice(pos));
|
|
284
|
+
}
|
|
155
285
|
}
|
|
156
|
-
function descendants(
|
|
157
|
-
|
|
158
|
-
|
|
286
|
+
function descendants(node, cb, pos = 0) {
|
|
287
|
+
pos += node.border;
|
|
288
|
+
for (const [i, child] of (0, import_itertools.enumerate)(node.children)) {
|
|
289
|
+
const descend = cb(child, pos, node, i);
|
|
159
290
|
if (descend && !child.isLeaf) {
|
|
160
|
-
descendants(child, cb, pos
|
|
291
|
+
descendants(child, cb, pos);
|
|
161
292
|
}
|
|
162
293
|
pos += child.nodeSize;
|
|
163
294
|
}
|
|
164
295
|
}
|
|
165
296
|
// Annotate the CommonJS export names for ESM import in node:
|
|
166
297
|
0 && (module.exports = {
|
|
298
|
+
FootnoteNode,
|
|
167
299
|
Mark,
|
|
168
300
|
Node,
|
|
301
|
+
NoterefNode,
|
|
169
302
|
Root,
|
|
170
303
|
TextNode,
|
|
171
304
|
descendants
|