@storyteller-platform/align 0.1.13 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/align/align.cjs +42 -117
- package/dist/align/align.d.cts +14 -1
- package/dist/align/align.d.ts +14 -1
- package/dist/align/align.js +42 -117
- package/dist/align/getSentenceRanges.cjs +165 -36
- package/dist/align/getSentenceRanges.d.cts +8 -2
- package/dist/align/getSentenceRanges.d.ts +8 -2
- package/dist/align/getSentenceRanges.js +165 -36
- package/dist/align/search.cjs +122 -0
- package/dist/align/search.d.cts +12 -0
- package/dist/align/search.d.ts +12 -0
- package/dist/align/search.js +96 -0
- package/dist/errorAlign/utils.d.cts +1 -1
- package/dist/errorAlign/utils.d.ts +1 -1
- package/package.json +3 -3
- package/dist/align/fuzzy.cjs +0 -164
- package/dist/align/fuzzy.d.cts +0 -6
- package/dist/align/fuzzy.d.ts +0 -6
- package/dist/align/fuzzy.js +0 -141
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __defProp = Object.defineProperty;
|
|
3
|
+
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
+
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
+
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
+
var __export = (target, all) => {
|
|
7
|
+
for (var name in all)
|
|
8
|
+
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
+
};
|
|
10
|
+
var __copyProps = (to, from, except, desc) => {
|
|
11
|
+
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
+
for (let key of __getOwnPropNames(from))
|
|
13
|
+
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
+
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
+
}
|
|
16
|
+
return to;
|
|
17
|
+
};
|
|
18
|
+
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
+
var search_exports = {};
|
|
20
|
+
__export(search_exports, {
|
|
21
|
+
buildNgramIndex: () => buildNgramIndex,
|
|
22
|
+
collectBoundaryVotes: () => collectBoundaryVotes,
|
|
23
|
+
findBoundaries: () => findBoundaries,
|
|
24
|
+
ngrams: () => ngrams
|
|
25
|
+
});
|
|
26
|
+
module.exports = __toCommonJS(search_exports);
|
|
27
|
+
var import_itertools = require("itertools");
|
|
28
|
+
function buildNgramIndex(text) {
|
|
29
|
+
const index = /* @__PURE__ */ new Map();
|
|
30
|
+
for (const [ngram, pos] of ngrams(text)) {
|
|
31
|
+
const positions = index.get(ngram);
|
|
32
|
+
if (positions) {
|
|
33
|
+
positions.push(pos);
|
|
34
|
+
} else {
|
|
35
|
+
index.set(ngram, [pos]);
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
return index;
|
|
39
|
+
}
|
|
40
|
+
function* ngrams(text) {
|
|
41
|
+
const words = text.split("-");
|
|
42
|
+
let pos = 0;
|
|
43
|
+
for (const i of (0, import_itertools.range)(words.length - 4)) {
|
|
44
|
+
const ngram = words.slice(i, i + 5).join("-");
|
|
45
|
+
yield [ngram, pos];
|
|
46
|
+
pos += words[i].length + 1;
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
function collectBoundaryVotes(query, document) {
|
|
50
|
+
const documentIndex = buildNgramIndex(document);
|
|
51
|
+
let skippedNgrams = 0;
|
|
52
|
+
let totalNgrams = 0;
|
|
53
|
+
const startVotes = [];
|
|
54
|
+
const endVotes = [];
|
|
55
|
+
for (const [ngram, start] of ngrams(query)) {
|
|
56
|
+
totalNgrams++;
|
|
57
|
+
const documentStarts = documentIndex.get(ngram);
|
|
58
|
+
if (!documentStarts) {
|
|
59
|
+
skippedNgrams++;
|
|
60
|
+
continue;
|
|
61
|
+
}
|
|
62
|
+
for (const documentStart of documentStarts) {
|
|
63
|
+
startVotes.push(documentStart - start);
|
|
64
|
+
endVotes.push(documentStart + (query.length - start));
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
if (skippedNgrams > totalNgrams / 2) {
|
|
68
|
+
return null;
|
|
69
|
+
}
|
|
70
|
+
return { startVotes, endVotes };
|
|
71
|
+
}
|
|
72
|
+
const BIN_SIZE = 1e3;
|
|
73
|
+
function binBoundaryVotes(votes) {
|
|
74
|
+
const start = (0, import_itertools.min)(votes);
|
|
75
|
+
const bins = /* @__PURE__ */ new Map();
|
|
76
|
+
if (start === void 0) return bins;
|
|
77
|
+
for (const vote of votes) {
|
|
78
|
+
const binIndex = Math.floor((vote - start) / BIN_SIZE);
|
|
79
|
+
const bin = bins.get(binIndex);
|
|
80
|
+
if (bin) {
|
|
81
|
+
bin.push(vote);
|
|
82
|
+
} else {
|
|
83
|
+
bins.set(binIndex, [vote]);
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
return bins;
|
|
87
|
+
}
|
|
88
|
+
function chooseBestFromBins(bins, dir) {
|
|
89
|
+
const totalLength = Array.from(bins.values()).reduce(
|
|
90
|
+
(acc, bin) => acc + bin.length,
|
|
91
|
+
0
|
|
92
|
+
);
|
|
93
|
+
const best = (0, import_itertools.max)(bins.values(), (bin) => bin.length);
|
|
94
|
+
if (!best) return null;
|
|
95
|
+
if (best.length / totalLength < 0.2) {
|
|
96
|
+
return null;
|
|
97
|
+
}
|
|
98
|
+
return dir > 0 ? (0, import_itertools.max)(best) ?? null : (0, import_itertools.min)(best) ?? null;
|
|
99
|
+
}
|
|
100
|
+
function findBoundaries(query, document) {
|
|
101
|
+
const boundaryVotes = collectBoundaryVotes(query, document);
|
|
102
|
+
if (!boundaryVotes) return null;
|
|
103
|
+
const { startVotes, endVotes } = boundaryVotes;
|
|
104
|
+
const startBins = binBoundaryVotes(startVotes);
|
|
105
|
+
const bestStart = chooseBestFromBins(startBins, -1);
|
|
106
|
+
if (bestStart === null) {
|
|
107
|
+
return null;
|
|
108
|
+
}
|
|
109
|
+
const endBins = binBoundaryVotes(endVotes);
|
|
110
|
+
const bestEnd = chooseBestFromBins(endBins, 1);
|
|
111
|
+
if (bestEnd === null) {
|
|
112
|
+
return null;
|
|
113
|
+
}
|
|
114
|
+
return { start: bestStart, end: bestEnd };
|
|
115
|
+
}
|
|
116
|
+
// Annotate the CommonJS export names for ESM import in node:
|
|
117
|
+
0 && (module.exports = {
|
|
118
|
+
buildNgramIndex,
|
|
119
|
+
collectBoundaryVotes,
|
|
120
|
+
findBoundaries,
|
|
121
|
+
ngrams
|
|
122
|
+
});
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
declare function buildNgramIndex(text: string): Map<string, number[]>;
|
|
2
|
+
declare function ngrams(text: string): Generator<readonly [string, number], void, unknown>;
|
|
3
|
+
declare function collectBoundaryVotes(query: string, document: string): {
|
|
4
|
+
startVotes: number[];
|
|
5
|
+
endVotes: number[];
|
|
6
|
+
} | null;
|
|
7
|
+
declare function findBoundaries(query: string, document: string): {
|
|
8
|
+
start: number;
|
|
9
|
+
end: number;
|
|
10
|
+
} | null;
|
|
11
|
+
|
|
12
|
+
export { buildNgramIndex, collectBoundaryVotes, findBoundaries, ngrams };
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
declare function buildNgramIndex(text: string): Map<string, number[]>;
|
|
2
|
+
declare function ngrams(text: string): Generator<readonly [string, number], void, unknown>;
|
|
3
|
+
declare function collectBoundaryVotes(query: string, document: string): {
|
|
4
|
+
startVotes: number[];
|
|
5
|
+
endVotes: number[];
|
|
6
|
+
} | null;
|
|
7
|
+
declare function findBoundaries(query: string, document: string): {
|
|
8
|
+
start: number;
|
|
9
|
+
end: number;
|
|
10
|
+
} | null;
|
|
11
|
+
|
|
12
|
+
export { buildNgramIndex, collectBoundaryVotes, findBoundaries, ngrams };
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import "../chunk-BIEQXUOY.js";
|
|
2
|
+
import { max, min, range } from "itertools";
|
|
3
|
+
function buildNgramIndex(text) {
|
|
4
|
+
const index = /* @__PURE__ */ new Map();
|
|
5
|
+
for (const [ngram, pos] of ngrams(text)) {
|
|
6
|
+
const positions = index.get(ngram);
|
|
7
|
+
if (positions) {
|
|
8
|
+
positions.push(pos);
|
|
9
|
+
} else {
|
|
10
|
+
index.set(ngram, [pos]);
|
|
11
|
+
}
|
|
12
|
+
}
|
|
13
|
+
return index;
|
|
14
|
+
}
|
|
15
|
+
function* ngrams(text) {
|
|
16
|
+
const words = text.split("-");
|
|
17
|
+
let pos = 0;
|
|
18
|
+
for (const i of range(words.length - 4)) {
|
|
19
|
+
const ngram = words.slice(i, i + 5).join("-");
|
|
20
|
+
yield [ngram, pos];
|
|
21
|
+
pos += words[i].length + 1;
|
|
22
|
+
}
|
|
23
|
+
}
|
|
24
|
+
function collectBoundaryVotes(query, document) {
|
|
25
|
+
const documentIndex = buildNgramIndex(document);
|
|
26
|
+
let skippedNgrams = 0;
|
|
27
|
+
let totalNgrams = 0;
|
|
28
|
+
const startVotes = [];
|
|
29
|
+
const endVotes = [];
|
|
30
|
+
for (const [ngram, start] of ngrams(query)) {
|
|
31
|
+
totalNgrams++;
|
|
32
|
+
const documentStarts = documentIndex.get(ngram);
|
|
33
|
+
if (!documentStarts) {
|
|
34
|
+
skippedNgrams++;
|
|
35
|
+
continue;
|
|
36
|
+
}
|
|
37
|
+
for (const documentStart of documentStarts) {
|
|
38
|
+
startVotes.push(documentStart - start);
|
|
39
|
+
endVotes.push(documentStart + (query.length - start));
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
if (skippedNgrams > totalNgrams / 2) {
|
|
43
|
+
return null;
|
|
44
|
+
}
|
|
45
|
+
return { startVotes, endVotes };
|
|
46
|
+
}
|
|
47
|
+
const BIN_SIZE = 1e3;
|
|
48
|
+
function binBoundaryVotes(votes) {
|
|
49
|
+
const start = min(votes);
|
|
50
|
+
const bins = /* @__PURE__ */ new Map();
|
|
51
|
+
if (start === void 0) return bins;
|
|
52
|
+
for (const vote of votes) {
|
|
53
|
+
const binIndex = Math.floor((vote - start) / BIN_SIZE);
|
|
54
|
+
const bin = bins.get(binIndex);
|
|
55
|
+
if (bin) {
|
|
56
|
+
bin.push(vote);
|
|
57
|
+
} else {
|
|
58
|
+
bins.set(binIndex, [vote]);
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
return bins;
|
|
62
|
+
}
|
|
63
|
+
function chooseBestFromBins(bins, dir) {
|
|
64
|
+
const totalLength = Array.from(bins.values()).reduce(
|
|
65
|
+
(acc, bin) => acc + bin.length,
|
|
66
|
+
0
|
|
67
|
+
);
|
|
68
|
+
const best = max(bins.values(), (bin) => bin.length);
|
|
69
|
+
if (!best) return null;
|
|
70
|
+
if (best.length / totalLength < 0.2) {
|
|
71
|
+
return null;
|
|
72
|
+
}
|
|
73
|
+
return dir > 0 ? max(best) ?? null : min(best) ?? null;
|
|
74
|
+
}
|
|
75
|
+
function findBoundaries(query, document) {
|
|
76
|
+
const boundaryVotes = collectBoundaryVotes(query, document);
|
|
77
|
+
if (!boundaryVotes) return null;
|
|
78
|
+
const { startVotes, endVotes } = boundaryVotes;
|
|
79
|
+
const startBins = binBoundaryVotes(startVotes);
|
|
80
|
+
const bestStart = chooseBestFromBins(startBins, -1);
|
|
81
|
+
if (bestStart === null) {
|
|
82
|
+
return null;
|
|
83
|
+
}
|
|
84
|
+
const endBins = binBoundaryVotes(endVotes);
|
|
85
|
+
const bestEnd = chooseBestFromBins(endBins, 1);
|
|
86
|
+
if (bestEnd === null) {
|
|
87
|
+
return null;
|
|
88
|
+
}
|
|
89
|
+
return { start: bestStart, end: bestEnd };
|
|
90
|
+
}
|
|
91
|
+
export {
|
|
92
|
+
buildNgramIndex,
|
|
93
|
+
collectBoundaryVotes,
|
|
94
|
+
findBoundaries,
|
|
95
|
+
ngrams
|
|
96
|
+
};
|
|
@@ -21,7 +21,7 @@ declare class Alignment {
|
|
|
21
21
|
* @returns All possible combinations of operation types.
|
|
22
22
|
*/
|
|
23
23
|
declare function opTypePowerset(): IterableIterator<NonNullable<"DELETE" | "MATCH" | "INSERT" | "SUBSTITUTE">[]>;
|
|
24
|
-
declare function reversed<T>(iterable:
|
|
24
|
+
declare function reversed<T>(iterable: Iterable<T>): T[];
|
|
25
25
|
declare const START_DELIMITER = "<";
|
|
26
26
|
declare const END_DELIMITER = ">";
|
|
27
27
|
declare const DELIMITERS: Set<string>;
|
|
@@ -21,7 +21,7 @@ declare class Alignment {
|
|
|
21
21
|
* @returns All possible combinations of operation types.
|
|
22
22
|
*/
|
|
23
23
|
declare function opTypePowerset(): IterableIterator<NonNullable<"DELETE" | "MATCH" | "INSERT" | "SUBSTITUTE">[]>;
|
|
24
|
-
declare function reversed<T>(iterable:
|
|
24
|
+
declare function reversed<T>(iterable: Iterable<T>): T[];
|
|
25
25
|
declare const START_DELIMITER = "<";
|
|
26
26
|
declare const END_DELIMITER = ">";
|
|
27
27
|
declare const DELIMITERS: Set<string>;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@storyteller-platform/align",
|
|
3
|
-
"version": "0.1.13",
|
|
3
|
+
"version": "0.1.14",
|
|
4
4
|
"description": "A library and CLI for automatically aligning audiobooks and EPUBs to produce Media Overlays",
|
|
5
5
|
"author": "Shane Friedman",
|
|
6
6
|
"license": "MIT",
|
|
@@ -57,9 +57,9 @@
|
|
|
57
57
|
"@esfx/async-semaphore": "^1.0.0",
|
|
58
58
|
"@optique/core": "^0.10.7",
|
|
59
59
|
"@optique/run": "^0.10.7",
|
|
60
|
-
"@storyteller-platform/audiobook": "^0.3.
|
|
60
|
+
"@storyteller-platform/audiobook": "^0.3.10",
|
|
61
61
|
"@storyteller-platform/epub": "^0.4.8",
|
|
62
|
-
"@storyteller-platform/ghost-story": "^0.1.
|
|
62
|
+
"@storyteller-platform/ghost-story": "^0.1.6",
|
|
63
63
|
"@storyteller-platform/transliteration": "^3.1.0",
|
|
64
64
|
"chalk": "^5.4.1",
|
|
65
65
|
"cli-progress": "^3.12.0",
|
package/dist/align/fuzzy.cjs
DELETED
|
@@ -1,164 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
var __defProp = Object.defineProperty;
|
|
3
|
-
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
4
|
-
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
5
|
-
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
6
|
-
var __export = (target, all) => {
|
|
7
|
-
for (var name in all)
|
|
8
|
-
__defProp(target, name, { get: all[name], enumerable: true });
|
|
9
|
-
};
|
|
10
|
-
var __copyProps = (to, from, except, desc) => {
|
|
11
|
-
if (from && typeof from === "object" || typeof from === "function") {
|
|
12
|
-
for (let key of __getOwnPropNames(from))
|
|
13
|
-
if (!__hasOwnProp.call(to, key) && key !== except)
|
|
14
|
-
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
|
|
15
|
-
}
|
|
16
|
-
return to;
|
|
17
|
-
};
|
|
18
|
-
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
19
|
-
var fuzzy_exports = {};
|
|
20
|
-
__export(fuzzy_exports, {
|
|
21
|
-
findNearestMatch: () => findNearestMatch
|
|
22
|
-
});
|
|
23
|
-
module.exports = __toCommonJS(fuzzy_exports);
|
|
24
|
-
function findNearestMatch(needle, haystack, maxDist) {
|
|
25
|
-
let nearest = null;
|
|
26
|
-
for (const match of levenshteinNgram(needle, haystack, maxDist)) {
|
|
27
|
-
if (!nearest || match.dist < nearest.dist) {
|
|
28
|
-
nearest = match;
|
|
29
|
-
}
|
|
30
|
-
}
|
|
31
|
-
return nearest && {
|
|
32
|
-
match: haystack.slice(nearest.start, nearest.end),
|
|
33
|
-
index: nearest.start
|
|
34
|
-
};
|
|
35
|
-
}
|
|
36
|
-
function reverse(str, from = str.length, to = 0) {
|
|
37
|
-
let reversed = "";
|
|
38
|
-
for (let i = from - 1; i >= to; i--) {
|
|
39
|
-
reversed = reversed + str[i];
|
|
40
|
-
}
|
|
41
|
-
return reversed;
|
|
42
|
-
}
|
|
43
|
-
function* searchExact(subsequence, sequence, startIndex = 0, endIndex = sequence.length) {
|
|
44
|
-
let index = sequence.indexOf(subsequence, startIndex);
|
|
45
|
-
while (index !== -1 && index + subsequence.length < endIndex) {
|
|
46
|
-
yield index;
|
|
47
|
-
index = sequence.indexOf(subsequence, index + 1);
|
|
48
|
-
}
|
|
49
|
-
}
|
|
50
|
-
function expand(subsequence, sequence, maxDist) {
|
|
51
|
-
const subsequenceLength = subsequence.length;
|
|
52
|
-
if (subsequenceLength === 0) {
|
|
53
|
-
return { index: 0, score: 0 };
|
|
54
|
-
}
|
|
55
|
-
const scores = Array.from({ length: subsequenceLength + 1 }).map((_, i) => i);
|
|
56
|
-
let minScore = subsequenceLength;
|
|
57
|
-
let minScoreIndex = -1;
|
|
58
|
-
let maxGoodScore = maxDist;
|
|
59
|
-
let newNeedleIndexRangeStart = 0;
|
|
60
|
-
let newNeedleIndexRangeEnd = subsequenceLength - 1;
|
|
61
|
-
for (let sequenceIndex = 0; sequenceIndex < sequence.length; sequenceIndex++) {
|
|
62
|
-
const char = sequence[sequenceIndex];
|
|
63
|
-
const needleIndexRangeStart = newNeedleIndexRangeStart;
|
|
64
|
-
const needleIndexRangeEnd = Math.min(
|
|
65
|
-
subsequenceLength,
|
|
66
|
-
newNeedleIndexRangeEnd + 1
|
|
67
|
-
);
|
|
68
|
-
let a = sequenceIndex;
|
|
69
|
-
let c = a + 1;
|
|
70
|
-
if (c <= maxGoodScore) {
|
|
71
|
-
newNeedleIndexRangeStart = 0;
|
|
72
|
-
newNeedleIndexRangeEnd = 0;
|
|
73
|
-
} else {
|
|
74
|
-
newNeedleIndexRangeStart = null;
|
|
75
|
-
newNeedleIndexRangeEnd = -1;
|
|
76
|
-
}
|
|
77
|
-
for (let subsequenceIndex = needleIndexRangeStart; subsequenceIndex < needleIndexRangeEnd; subsequenceIndex++) {
|
|
78
|
-
const b = scores[subsequenceIndex];
|
|
79
|
-
c = scores[subsequenceIndex] = Math.min(
|
|
80
|
-
a + (char === subsequence[subsequenceIndex] ? 0 : 1),
|
|
81
|
-
b + 1,
|
|
82
|
-
c + 1
|
|
83
|
-
);
|
|
84
|
-
a = b;
|
|
85
|
-
if (c <= maxGoodScore) {
|
|
86
|
-
if (newNeedleIndexRangeStart === null) {
|
|
87
|
-
newNeedleIndexRangeStart = subsequenceIndex;
|
|
88
|
-
}
|
|
89
|
-
newNeedleIndexRangeEnd = Math.max(
|
|
90
|
-
newNeedleIndexRangeEnd,
|
|
91
|
-
subsequenceIndex + 1 + (maxGoodScore - c)
|
|
92
|
-
);
|
|
93
|
-
}
|
|
94
|
-
}
|
|
95
|
-
if (newNeedleIndexRangeStart === null) {
|
|
96
|
-
break;
|
|
97
|
-
}
|
|
98
|
-
if (needleIndexRangeEnd === subsequenceLength && c <= minScore) {
|
|
99
|
-
minScore = c;
|
|
100
|
-
minScoreIndex = sequenceIndex;
|
|
101
|
-
if (minScore < maxGoodScore) {
|
|
102
|
-
maxGoodScore = minScore;
|
|
103
|
-
}
|
|
104
|
-
}
|
|
105
|
-
}
|
|
106
|
-
return minScore <= maxDist ? { score: minScore, index: minScoreIndex + 1 } : null;
|
|
107
|
-
}
|
|
108
|
-
function* levenshteinNgram(subsequence, sequence, maxDist) {
|
|
109
|
-
const subsequenceLength = subsequence.length;
|
|
110
|
-
const sequenceLength = sequence.length;
|
|
111
|
-
const ngramLength = Math.floor(subsequenceLength / (maxDist + 1));
|
|
112
|
-
if (ngramLength === 0) {
|
|
113
|
-
throw new Error("The subsequence length must be greater than maxDist");
|
|
114
|
-
}
|
|
115
|
-
for (let ngramStart = 0; ngramStart < subsequenceLength - ngramLength + 1; ngramStart += ngramLength) {
|
|
116
|
-
const ngramEnd = ngramStart + ngramLength;
|
|
117
|
-
const subsequenceBeforeReversed = reverse(subsequence, ngramStart);
|
|
118
|
-
const subsequenceAfter = subsequence.slice(ngramEnd);
|
|
119
|
-
const startIndex = Math.max(0, ngramStart - maxDist);
|
|
120
|
-
const endIndex = Math.min(
|
|
121
|
-
sequenceLength,
|
|
122
|
-
sequenceLength - subsequenceLength + ngramEnd + maxDist
|
|
123
|
-
);
|
|
124
|
-
for (const index of searchExact(
|
|
125
|
-
subsequence.slice(ngramStart, ngramEnd),
|
|
126
|
-
sequence,
|
|
127
|
-
startIndex,
|
|
128
|
-
endIndex
|
|
129
|
-
)) {
|
|
130
|
-
const rightMatch = expand(
|
|
131
|
-
subsequenceAfter,
|
|
132
|
-
sequence.slice(
|
|
133
|
-
index + ngramLength,
|
|
134
|
-
index - ngramStart + subsequenceLength + maxDist
|
|
135
|
-
),
|
|
136
|
-
maxDist
|
|
137
|
-
);
|
|
138
|
-
if (rightMatch === null) continue;
|
|
139
|
-
const { score: distRight, index: rightExpandSize } = rightMatch;
|
|
140
|
-
const leftMatch = expand(
|
|
141
|
-
subsequenceBeforeReversed,
|
|
142
|
-
reverse(
|
|
143
|
-
sequence,
|
|
144
|
-
index,
|
|
145
|
-
Math.max(0, index - ngramStart - (maxDist - distRight))
|
|
146
|
-
),
|
|
147
|
-
maxDist - distRight
|
|
148
|
-
);
|
|
149
|
-
if (leftMatch === null) continue;
|
|
150
|
-
const { score: distLeft, index: leftExpandSize } = leftMatch;
|
|
151
|
-
const start = index - leftExpandSize;
|
|
152
|
-
yield {
|
|
153
|
-
start,
|
|
154
|
-
end: index + ngramLength + rightExpandSize,
|
|
155
|
-
// dist: distLeft + distRight + (start / sequenceLength) * maxDist,
|
|
156
|
-
dist: distLeft + distRight
|
|
157
|
-
};
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
}
|
|
161
|
-
// Annotate the CommonJS export names for ESM import in node:
|
|
162
|
-
0 && (module.exports = {
|
|
163
|
-
findNearestMatch
|
|
164
|
-
});
|
package/dist/align/fuzzy.d.cts
DELETED
package/dist/align/fuzzy.d.ts
DELETED
package/dist/align/fuzzy.js
DELETED
|
@@ -1,141 +0,0 @@
|
|
|
1
|
-
import "../chunk-BIEQXUOY.js";
|
|
2
|
-
function findNearestMatch(needle, haystack, maxDist) {
|
|
3
|
-
let nearest = null;
|
|
4
|
-
for (const match of levenshteinNgram(needle, haystack, maxDist)) {
|
|
5
|
-
if (!nearest || match.dist < nearest.dist) {
|
|
6
|
-
nearest = match;
|
|
7
|
-
}
|
|
8
|
-
}
|
|
9
|
-
return nearest && {
|
|
10
|
-
match: haystack.slice(nearest.start, nearest.end),
|
|
11
|
-
index: nearest.start
|
|
12
|
-
};
|
|
13
|
-
}
|
|
14
|
-
function reverse(str, from = str.length, to = 0) {
|
|
15
|
-
let reversed = "";
|
|
16
|
-
for (let i = from - 1; i >= to; i--) {
|
|
17
|
-
reversed = reversed + str[i];
|
|
18
|
-
}
|
|
19
|
-
return reversed;
|
|
20
|
-
}
|
|
21
|
-
function* searchExact(subsequence, sequence, startIndex = 0, endIndex = sequence.length) {
|
|
22
|
-
let index = sequence.indexOf(subsequence, startIndex);
|
|
23
|
-
while (index !== -1 && index + subsequence.length < endIndex) {
|
|
24
|
-
yield index;
|
|
25
|
-
index = sequence.indexOf(subsequence, index + 1);
|
|
26
|
-
}
|
|
27
|
-
}
|
|
28
|
-
function expand(subsequence, sequence, maxDist) {
|
|
29
|
-
const subsequenceLength = subsequence.length;
|
|
30
|
-
if (subsequenceLength === 0) {
|
|
31
|
-
return { index: 0, score: 0 };
|
|
32
|
-
}
|
|
33
|
-
const scores = Array.from({ length: subsequenceLength + 1 }).map((_, i) => i);
|
|
34
|
-
let minScore = subsequenceLength;
|
|
35
|
-
let minScoreIndex = -1;
|
|
36
|
-
let maxGoodScore = maxDist;
|
|
37
|
-
let newNeedleIndexRangeStart = 0;
|
|
38
|
-
let newNeedleIndexRangeEnd = subsequenceLength - 1;
|
|
39
|
-
for (let sequenceIndex = 0; sequenceIndex < sequence.length; sequenceIndex++) {
|
|
40
|
-
const char = sequence[sequenceIndex];
|
|
41
|
-
const needleIndexRangeStart = newNeedleIndexRangeStart;
|
|
42
|
-
const needleIndexRangeEnd = Math.min(
|
|
43
|
-
subsequenceLength,
|
|
44
|
-
newNeedleIndexRangeEnd + 1
|
|
45
|
-
);
|
|
46
|
-
let a = sequenceIndex;
|
|
47
|
-
let c = a + 1;
|
|
48
|
-
if (c <= maxGoodScore) {
|
|
49
|
-
newNeedleIndexRangeStart = 0;
|
|
50
|
-
newNeedleIndexRangeEnd = 0;
|
|
51
|
-
} else {
|
|
52
|
-
newNeedleIndexRangeStart = null;
|
|
53
|
-
newNeedleIndexRangeEnd = -1;
|
|
54
|
-
}
|
|
55
|
-
for (let subsequenceIndex = needleIndexRangeStart; subsequenceIndex < needleIndexRangeEnd; subsequenceIndex++) {
|
|
56
|
-
const b = scores[subsequenceIndex];
|
|
57
|
-
c = scores[subsequenceIndex] = Math.min(
|
|
58
|
-
a + (char === subsequence[subsequenceIndex] ? 0 : 1),
|
|
59
|
-
b + 1,
|
|
60
|
-
c + 1
|
|
61
|
-
);
|
|
62
|
-
a = b;
|
|
63
|
-
if (c <= maxGoodScore) {
|
|
64
|
-
if (newNeedleIndexRangeStart === null) {
|
|
65
|
-
newNeedleIndexRangeStart = subsequenceIndex;
|
|
66
|
-
}
|
|
67
|
-
newNeedleIndexRangeEnd = Math.max(
|
|
68
|
-
newNeedleIndexRangeEnd,
|
|
69
|
-
subsequenceIndex + 1 + (maxGoodScore - c)
|
|
70
|
-
);
|
|
71
|
-
}
|
|
72
|
-
}
|
|
73
|
-
if (newNeedleIndexRangeStart === null) {
|
|
74
|
-
break;
|
|
75
|
-
}
|
|
76
|
-
if (needleIndexRangeEnd === subsequenceLength && c <= minScore) {
|
|
77
|
-
minScore = c;
|
|
78
|
-
minScoreIndex = sequenceIndex;
|
|
79
|
-
if (minScore < maxGoodScore) {
|
|
80
|
-
maxGoodScore = minScore;
|
|
81
|
-
}
|
|
82
|
-
}
|
|
83
|
-
}
|
|
84
|
-
return minScore <= maxDist ? { score: minScore, index: minScoreIndex + 1 } : null;
|
|
85
|
-
}
|
|
86
|
-
function* levenshteinNgram(subsequence, sequence, maxDist) {
|
|
87
|
-
const subsequenceLength = subsequence.length;
|
|
88
|
-
const sequenceLength = sequence.length;
|
|
89
|
-
const ngramLength = Math.floor(subsequenceLength / (maxDist + 1));
|
|
90
|
-
if (ngramLength === 0) {
|
|
91
|
-
throw new Error("The subsequence length must be greater than maxDist");
|
|
92
|
-
}
|
|
93
|
-
for (let ngramStart = 0; ngramStart < subsequenceLength - ngramLength + 1; ngramStart += ngramLength) {
|
|
94
|
-
const ngramEnd = ngramStart + ngramLength;
|
|
95
|
-
const subsequenceBeforeReversed = reverse(subsequence, ngramStart);
|
|
96
|
-
const subsequenceAfter = subsequence.slice(ngramEnd);
|
|
97
|
-
const startIndex = Math.max(0, ngramStart - maxDist);
|
|
98
|
-
const endIndex = Math.min(
|
|
99
|
-
sequenceLength,
|
|
100
|
-
sequenceLength - subsequenceLength + ngramEnd + maxDist
|
|
101
|
-
);
|
|
102
|
-
for (const index of searchExact(
|
|
103
|
-
subsequence.slice(ngramStart, ngramEnd),
|
|
104
|
-
sequence,
|
|
105
|
-
startIndex,
|
|
106
|
-
endIndex
|
|
107
|
-
)) {
|
|
108
|
-
const rightMatch = expand(
|
|
109
|
-
subsequenceAfter,
|
|
110
|
-
sequence.slice(
|
|
111
|
-
index + ngramLength,
|
|
112
|
-
index - ngramStart + subsequenceLength + maxDist
|
|
113
|
-
),
|
|
114
|
-
maxDist
|
|
115
|
-
);
|
|
116
|
-
if (rightMatch === null) continue;
|
|
117
|
-
const { score: distRight, index: rightExpandSize } = rightMatch;
|
|
118
|
-
const leftMatch = expand(
|
|
119
|
-
subsequenceBeforeReversed,
|
|
120
|
-
reverse(
|
|
121
|
-
sequence,
|
|
122
|
-
index,
|
|
123
|
-
Math.max(0, index - ngramStart - (maxDist - distRight))
|
|
124
|
-
),
|
|
125
|
-
maxDist - distRight
|
|
126
|
-
);
|
|
127
|
-
if (leftMatch === null) continue;
|
|
128
|
-
const { score: distLeft, index: leftExpandSize } = leftMatch;
|
|
129
|
-
const start = index - leftExpandSize;
|
|
130
|
-
yield {
|
|
131
|
-
start,
|
|
132
|
-
end: index + ngramLength + rightExpandSize,
|
|
133
|
-
// dist: distLeft + distRight + (start / sequenceLength) * maxDist,
|
|
134
|
-
dist: distLeft + distRight
|
|
135
|
-
};
|
|
136
|
-
}
|
|
137
|
-
}
|
|
138
|
-
}
|
|
139
|
-
export {
|
|
140
|
-
findNearestMatch
|
|
141
|
-
};
|