baburchi 1.7.2 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -0
- package/dist/index.d.ts +56 -87
- package/dist/index.js +331 -263
- package/dist/index.js.map +1 -1
- package/package.json +9 -7
package/dist/index.js
CHANGED
|
@@ -86,10 +86,11 @@ const CHAR_TA_MARBUTAH = 1577;
|
|
|
86
86
|
const CHAR_MADDA_ABOVE = 1619;
|
|
87
87
|
const CHAR_HAMZA_ABOVE_MARK = 1620;
|
|
88
88
|
const CHAR_HAMZA_BELOW_MARK = 1621;
|
|
89
|
+
const CHAR_DAGGER_ALIF = 1648;
|
|
89
90
|
let sharedBuffer = new Uint16Array(2048);
|
|
90
91
|
const decoder = new TextDecoder("utf-16le");
|
|
91
92
|
const isDiacritic = (code) => {
|
|
92
|
-
return code >= 1611 && code <= 1631 || code >= 1552 && code <= 1562 || code ===
|
|
93
|
+
return code >= 1611 && code <= 1631 || code >= 1552 && code <= 1562 || code === CHAR_DAGGER_ALIF || code >= 1750 && code <= 1773;
|
|
93
94
|
};
|
|
94
95
|
const isZeroWidth = (code) => {
|
|
95
96
|
return code >= 8203 && code <= 8207 || code >= 8234 && code <= 8238 || code >= 8288 && code <= 8292 || code === 65279;
|
|
@@ -133,264 +134,292 @@ const resolveTatweelMode = (presetValue, override) => {
|
|
|
133
134
|
return override;
|
|
134
135
|
};
|
|
135
136
|
/**
|
|
136
|
-
*
|
|
137
|
-
*
|
|
138
|
-
*
|
|
137
|
+
* Emits a single space into the output buffer, respecting the collapse-whitespace flag.
|
|
138
|
+
*
|
|
139
|
+
* @param ctx - Mutable loop state; `bufIdx` and `lastWasSpace` may be updated.
|
|
140
|
+
* @param collapseWS - When true, suppress consecutive spaces and leading spaces.
|
|
141
|
+
*/
|
|
142
|
+
const emitSpace = (ctx, collapseWS) => {
|
|
143
|
+
if (collapseWS) {
|
|
144
|
+
if (!ctx.lastWasSpace && ctx.bufIdx > 0) {
|
|
145
|
+
ctx.buffer[ctx.bufIdx++] = CHAR_SPACE;
|
|
146
|
+
ctx.lastWasSpace = true;
|
|
147
|
+
}
|
|
148
|
+
} else {
|
|
149
|
+
ctx.buffer[ctx.bufIdx++] = CHAR_SPACE;
|
|
150
|
+
ctx.lastWasSpace = false;
|
|
151
|
+
}
|
|
152
|
+
};
|
|
153
|
+
/**
|
|
154
|
+
* Applies letter-level normalization to a single code point:
|
|
155
|
+
* alif variants → bare alif, alif maqsurah → ya, ta marbutah → ha.
|
|
156
|
+
*
|
|
157
|
+
* @param code - Input code point.
|
|
158
|
+
* @param normAlif - Whether to collapse alif variants.
|
|
159
|
+
* @param maqToYa - Whether to replace ى with ي.
|
|
160
|
+
* @param taToHa - Whether to replace ة with ه.
|
|
161
|
+
* @returns The (possibly mapped) output code point.
|
|
162
|
+
*/
|
|
163
|
+
const normalizeCode = (code, normAlif, maqToYa, taToHa) => {
|
|
164
|
+
let out = code;
|
|
165
|
+
if (normAlif) {
|
|
166
|
+
if (code === CHAR_ALIF_MADDA || code === CHAR_ALIF_HAMZA_ABOVE || code === CHAR_ALIF_HAMZA_BELOW || code === CHAR_ALIF_WASLA) out = CHAR_ALIF;
|
|
167
|
+
}
|
|
168
|
+
if (maqToYa && code === CHAR_ALIF_MAQSURAH) out = CHAR_YA;
|
|
169
|
+
if (taToHa && code === CHAR_TA_MARBUTAH) out = CHAR_HA;
|
|
170
|
+
return out;
|
|
171
|
+
};
|
|
172
|
+
/**
|
|
173
|
+
* Handles ASCII and control whitespace (code ≤ 32).
|
|
174
|
+
* Returns `false` for non-whitespace characters.
|
|
175
|
+
*/
|
|
176
|
+
const processWhitespace = (code, ctx, opts) => {
|
|
177
|
+
if (code > 32) return false;
|
|
178
|
+
if (opts.lettersOnly) return true;
|
|
179
|
+
if (opts.collapseWS) {
|
|
180
|
+
if (!ctx.lastWasSpace && ctx.bufIdx > 0) {
|
|
181
|
+
ctx.buffer[ctx.bufIdx++] = CHAR_SPACE;
|
|
182
|
+
ctx.lastWasSpace = true;
|
|
183
|
+
}
|
|
184
|
+
} else {
|
|
185
|
+
ctx.buffer[ctx.bufIdx++] = code;
|
|
186
|
+
ctx.lastWasSpace = false;
|
|
187
|
+
}
|
|
188
|
+
return true;
|
|
189
|
+
};
|
|
190
|
+
/**
|
|
191
|
+
* Performs inline NFC canonical composition for Arabic combining marks.
|
|
192
|
+
* Only handles the five compositions relevant to Arabic OCR:
|
|
193
|
+
* ا + ◌ٓ → آ, ا + ◌ٔ → أ, ا + ◌ٕ → إ, و + ◌ٔ → ؤ, ي + ◌ٔ → ئ
|
|
194
|
+
*
|
|
195
|
+
* Called only when `nfc` is enabled. Returns `false` when the mark cannot be
|
|
196
|
+
* composed (it will then be emitted as a standalone character by the fallthrough).
|
|
197
|
+
*/
|
|
198
|
+
const processNfc = (code, ctx) => {
|
|
199
|
+
if (code !== CHAR_MADDA_ABOVE && code !== CHAR_HAMZA_ABOVE_MARK && code !== CHAR_HAMZA_BELOW_MARK) return false;
|
|
200
|
+
const prevIdx = ctx.bufIdx - 1;
|
|
201
|
+
if (prevIdx < 0) return false;
|
|
202
|
+
const prev = ctx.buffer[prevIdx];
|
|
203
|
+
let composed = 0;
|
|
204
|
+
if (prev === CHAR_ALIF) if (code === CHAR_MADDA_ABOVE) composed = CHAR_ALIF_MADDA;
|
|
205
|
+
else if (code === CHAR_HAMZA_ABOVE_MARK) composed = CHAR_ALIF_HAMZA_ABOVE;
|
|
206
|
+
else composed = CHAR_ALIF_HAMZA_BELOW;
|
|
207
|
+
else if (code === CHAR_HAMZA_ABOVE_MARK) {
|
|
208
|
+
if (prev === CHAR_WAW) composed = CHAR_WAW_HAMZA_ABOVE;
|
|
209
|
+
else if (prev === CHAR_YA) composed = CHAR_YEH_HAMZA_ABOVE;
|
|
210
|
+
}
|
|
211
|
+
if (composed === 0) return false;
|
|
212
|
+
ctx.buffer[prevIdx] = composed;
|
|
213
|
+
return true;
|
|
214
|
+
};
|
|
215
|
+
/**
|
|
216
|
+
* Strips zero-width and invisible formatting characters (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, and U+FEFF).
|
|
217
|
+
* When `zwAsSpace` is set, emits a space in place of the removed character.
|
|
218
|
+
* Called only when `stripZW` is enabled.
|
|
219
|
+
*/
|
|
220
|
+
const processZeroWidth = (code, ctx, opts) => {
|
|
221
|
+
if (!isZeroWidth(code)) return false;
|
|
222
|
+
if (opts.zwAsSpace) emitSpace(ctx, opts.collapseWS);
|
|
223
|
+
return true;
|
|
224
|
+
};
|
|
225
|
+
/**
|
|
226
|
+
* Removes the Hijri date marker "هـ" (or bare "ه" when tatweel has already been
|
|
227
|
+
* stripped) when it immediately follows a date-like token (digits/slashes/hyphens).
|
|
228
|
+
*
|
|
229
|
+
* May advance `ctx.i` by one to also consume an attached tatweel.
|
|
230
|
+
* Called only when `removeHijri` is enabled.
|
|
231
|
+
*/
|
|
232
|
+
const processHijriMarker = (code, ctx, opts) => {
|
|
233
|
+
if (code !== CHAR_HA) return false;
|
|
234
|
+
const { text, len } = ctx;
|
|
235
|
+
const origI = ctx.i;
|
|
236
|
+
let nextIdx = origI + 1;
|
|
237
|
+
const hasTatweel = nextIdx < len && text.charCodeAt(nextIdx) === CHAR_TATWEEL;
|
|
238
|
+
if (hasTatweel) nextIdx++;
|
|
239
|
+
let isBoundary = nextIdx >= len;
|
|
240
|
+
if (!isBoundary) {
|
|
241
|
+
const nextCode = text.charCodeAt(nextIdx);
|
|
242
|
+
isBoundary = nextCode <= 32 || isSymbol(nextCode) || nextCode === 47 || nextCode === 45;
|
|
243
|
+
}
|
|
244
|
+
if (!isBoundary) return false;
|
|
245
|
+
let backIdx = origI - 1;
|
|
246
|
+
while (backIdx >= 0 && (text.charCodeAt(backIdx) <= 32 || isZeroWidth(text.charCodeAt(backIdx)))) backIdx--;
|
|
247
|
+
if (backIdx < 0 || !isDigit(text.charCodeAt(backIdx))) return false;
|
|
248
|
+
if (hasTatweel) ctx.i = origI + 1;
|
|
249
|
+
return true;
|
|
250
|
+
};
|
|
251
|
+
/**
|
|
252
|
+
* Strips tatweel (ـ U+0640) according to the resolved mode.
|
|
253
|
+
*
|
|
254
|
+
* - `'all'`: always remove.
|
|
255
|
+
* - `'safe'`: remove unless the nearest preceding non-space character already emitted is a digit or ه
|
|
256
|
+
* (preserves date suffixes like "هـ" and list markers like "4ـ").
|
|
257
|
+
*
|
|
258
|
+
* Called only when `tatweelMode !== false`.
|
|
259
|
+
*/
|
|
260
|
+
const processTatweel = (code, ctx, tatweelMode) => {
|
|
261
|
+
if (code !== CHAR_TATWEEL) return false;
|
|
262
|
+
if (tatweelMode === "all") return true;
|
|
263
|
+
let backIdx = ctx.bufIdx - 1;
|
|
264
|
+
while (backIdx >= 0 && ctx.buffer[backIdx] === CHAR_SPACE) backIdx--;
|
|
265
|
+
if (backIdx < 0) return true;
|
|
266
|
+
const prev = ctx.buffer[backIdx];
|
|
267
|
+
return !(isDigit(prev) || prev === CHAR_HA);
|
|
268
|
+
};
|
|
269
|
+
/**
|
|
270
|
+
* Replaces Latin letters, Western digits, and recognised symbols with a space.
|
|
271
|
+
* Also collapses runs of double-slashes ("//") common in URLs.
|
|
272
|
+
*
|
|
273
|
+
* Called only when `stripNoise` is enabled and letter-filtering is not already
|
|
274
|
+
* handling cleanup (`lettersSpacesOnly` / `lettersOnly` take care of it themselves).
|
|
275
|
+
*
|
|
276
|
+
* May advance `ctx.i` to consume a run of slashes.
|
|
277
|
+
*/
|
|
278
|
+
const processNoise = (code, ctx, opts) => {
|
|
279
|
+
if (opts.lettersSpacesOnly || opts.lettersOnly) return false;
|
|
280
|
+
if (isLatinOrDigit(code) || isSymbol(code)) {
|
|
281
|
+
emitSpace(ctx, opts.collapseWS);
|
|
282
|
+
return true;
|
|
283
|
+
}
|
|
284
|
+
if (code === 47 && ctx.i + 1 < ctx.len && ctx.text.charCodeAt(ctx.i + 1) === 47) {
|
|
285
|
+
while (ctx.i + 1 < ctx.len && ctx.text.charCodeAt(ctx.i + 1) === 47) ctx.i++;
|
|
286
|
+
emitSpace(ctx, opts.collapseWS);
|
|
287
|
+
return true;
|
|
288
|
+
}
|
|
289
|
+
return false;
|
|
290
|
+
};
|
|
291
|
+
/**
|
|
292
|
+
* Matches footnote pattern 1: `(¬٣)` or `(¬٣ )` — a negation sign followed by
|
|
293
|
+
* Arabic-Indic digits and an optional space before the closing parenthesis.
|
|
294
|
+
*
|
|
295
|
+
* @param text - Full source string.
|
|
296
|
+
* @param len - Length of `text`.
|
|
297
|
+
* @param startPos - Index of the first character **after** ¬.
|
|
298
|
+
* @returns Index of the closing `)` on match, or -1 on no match.
|
|
299
|
+
*/
|
|
300
|
+
const matchFootnotePattern1 = (text, len, startPos) => {
|
|
301
|
+
let pos = startPos;
|
|
302
|
+
let hasDigits = false;
|
|
303
|
+
while (pos < len && text.charCodeAt(pos) >= 1632 && text.charCodeAt(pos) <= 1641) {
|
|
304
|
+
hasDigits = true;
|
|
305
|
+
pos++;
|
|
306
|
+
}
|
|
307
|
+
if (!hasDigits || pos >= len) return -1;
|
|
308
|
+
const closing = text.charCodeAt(pos);
|
|
309
|
+
if (closing === 41) return pos;
|
|
310
|
+
if (closing === CHAR_SPACE && pos + 1 < len && text.charCodeAt(pos + 1) === 41) return pos + 1;
|
|
311
|
+
return -1;
|
|
312
|
+
};
|
|
313
|
+
/**
|
|
314
|
+
* Matches footnote pattern 2: `(٣)` or `(٣ X)` — a single Arabic-Indic digit,
|
|
315
|
+
* optionally followed by a space and one character from the Arabic block (U+0600–U+06FF), then a closing parenthesis.
|
|
316
|
+
*
|
|
317
|
+
* @param text - Full source string.
|
|
318
|
+
* @param len - Length of `text`.
|
|
319
|
+
* @param digitPos - Index of the Arabic-Indic digit character.
|
|
320
|
+
* @returns Index of the closing `)` on match, or -1 on no match.
|
|
321
|
+
*/
|
|
322
|
+
const matchFootnotePattern2 = (text, len, digitPos) => {
|
|
323
|
+
const afterDigit = digitPos + 1;
|
|
324
|
+
if (afterDigit >= len) return -1;
|
|
325
|
+
const c2 = text.charCodeAt(afterDigit);
|
|
326
|
+
if (c2 === 41) return afterDigit;
|
|
327
|
+
if (c2 !== CHAR_SPACE) return -1;
|
|
328
|
+
const afterSpace = afterDigit + 1;
|
|
329
|
+
if (afterSpace >= len) return -1;
|
|
330
|
+
const c3 = text.charCodeAt(afterSpace);
|
|
331
|
+
if (c3 < 1536 || c3 > 1791) return -1;
|
|
332
|
+
const closingIdx = afterSpace + 1;
|
|
333
|
+
if (closingIdx >= len || text.charCodeAt(closingIdx) !== 41) return -1;
|
|
334
|
+
return closingIdx;
|
|
335
|
+
};
|
|
336
|
+
/**
|
|
337
|
+
* Removes inline footnote references of the form `(٣)`, `(٣ م)`, or `(¬٣)`.
|
|
338
|
+
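* Replaces the entire token (including parens) with a single space; when whitespace collapsing is on, the space is omitted if the output is empty or already ends in a space.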
|
|
339
|
+
*
|
|
340
|
+
* Called only when `removeFootnotes` is enabled and letter-filtering is inactive.
|
|
341
|
+
* May advance `ctx.i` past the consumed token.
|
|
342
|
+
*/
|
|
343
|
+
const processFootnote = (code, ctx, opts) => {
|
|
344
|
+
if (opts.lettersSpacesOnly || opts.lettersOnly || code !== 40) return false;
|
|
345
|
+
const { text, len } = ctx;
|
|
346
|
+
let nextIdx = ctx.i + 1;
|
|
347
|
+
if (nextIdx < len && text.charCodeAt(nextIdx) === CHAR_SPACE) nextIdx++;
|
|
348
|
+
if (nextIdx >= len) return false;
|
|
349
|
+
const c1 = text.charCodeAt(nextIdx);
|
|
350
|
+
let endIdx = -1;
|
|
351
|
+
if (c1 === 172) endIdx = matchFootnotePattern1(text, len, nextIdx + 1);
|
|
352
|
+
else if (c1 >= 1632 && c1 <= 1641) endIdx = matchFootnotePattern2(text, len, nextIdx);
|
|
353
|
+
if (endIdx < 0) return false;
|
|
354
|
+
ctx.i = endIdx;
|
|
355
|
+
emitSpace(ctx, opts.collapseWS);
|
|
356
|
+
return true;
|
|
357
|
+
};
|
|
358
|
+
/**
|
|
359
|
+
* Handles letter filtering for the `lettersSpacesOnly` / `lettersOnly` modes.
|
|
360
|
+
* Non-Arabic characters are either dropped (lettersOnly) or replaced with a space.
|
|
361
|
+
* Arabic letters are emitted after normalization.
|
|
362
|
+
*
|
|
363
|
+
* Returns `false` when neither mode is active, allowing the default emit to run.
|
|
364
|
+
*/
|
|
365
|
+
const processLetterFilter = (code, ctx, opts) => {
|
|
366
|
+
if (!opts.lettersSpacesOnly && !opts.lettersOnly) return false;
|
|
367
|
+
if (!isArabicLetter(code)) {
|
|
368
|
+
if (!opts.lettersOnly) emitSpace(ctx, opts.collapseWS);
|
|
369
|
+
return true;
|
|
370
|
+
}
|
|
371
|
+
ctx.buffer[ctx.bufIdx++] = normalizeCode(code, opts.normAlif, opts.maqToYa, opts.taToHa);
|
|
372
|
+
ctx.lastWasSpace = false;
|
|
373
|
+
return true;
|
|
374
|
+
};
|
|
375
|
+
/**
|
|
376
|
+
* Internal sanitization logic. Iterates once over the source string, dispatching
|
|
377
|
+
* each character through a series of focused step handlers. All options must be
|
|
378
|
+
* pre-resolved; no allocations occur beyond the context object and any buffer growth.
|
|
139
379
|
*/
|
|
140
380
|
const applySanitization = (input, options) => {
|
|
141
381
|
if (!input) return "";
|
|
142
|
-
const { nfc, stripZW,
|
|
143
|
-
/**
|
|
144
|
-
* NFC Normalization (Fast Path)
|
|
145
|
-
*
|
|
146
|
-
* `String.prototype.normalize('NFC')` is extremely expensive under high throughput.
|
|
147
|
-
* For Arabic OCR text, the main canonical compositions we care about are:
|
|
148
|
-
* - ا + ◌ٓ (U+0653) → آ
|
|
149
|
-
* - ا + ◌ٔ (U+0654) → أ
|
|
150
|
-
* - ا + ◌ٕ (U+0655) → إ
|
|
151
|
-
* - و + ◌ٔ (U+0654) → ؤ
|
|
152
|
-
* - ي + ◌ٔ (U+0654) → ئ
|
|
153
|
-
*
|
|
154
|
-
* We implement these compositions inline during the main loop, avoiding full NFC
|
|
155
|
-
* normalization in the common case while preserving behavior needed by our sanitizer.
|
|
156
|
-
*/
|
|
382
|
+
const { nfc, stripZW, removeHijri, removeDia, tatweelMode, stripNoise, removeFootnotes, normAlif, maqToYa, taToHa, doTrim } = options;
|
|
157
383
|
const text = input;
|
|
158
384
|
const len = text.length;
|
|
159
385
|
if (len > sharedBuffer.length) sharedBuffer = new Uint16Array(len + 1024);
|
|
160
|
-
const
|
|
161
|
-
|
|
162
|
-
|
|
386
|
+
const ctx = {
|
|
387
|
+
buffer: sharedBuffer,
|
|
388
|
+
bufIdx: 0,
|
|
389
|
+
i: 0,
|
|
390
|
+
lastWasSpace: false,
|
|
391
|
+
len,
|
|
392
|
+
text
|
|
393
|
+
};
|
|
163
394
|
let start = 0;
|
|
164
395
|
if (doTrim) while (start < len && text.charCodeAt(start) <= 32) start++;
|
|
165
396
|
for (let i = start; i < len; i++) {
|
|
397
|
+
ctx.i = i;
|
|
166
398
|
const code = text.charCodeAt(i);
|
|
167
|
-
if (code
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
lastWasSpace = true;
|
|
173
|
-
}
|
|
174
|
-
} else {
|
|
175
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
176
|
-
lastWasSpace = false;
|
|
177
|
-
}
|
|
178
|
-
continue;
|
|
179
|
-
}
|
|
180
|
-
if (nfc) {
|
|
181
|
-
if (code === CHAR_MADDA_ABOVE || code === CHAR_HAMZA_ABOVE_MARK || code === CHAR_HAMZA_BELOW_MARK) {
|
|
182
|
-
const prevIdx = bufIdx - 1;
|
|
183
|
-
if (prevIdx >= 0) {
|
|
184
|
-
const prev = buffer[prevIdx];
|
|
185
|
-
let composed = 0;
|
|
186
|
-
if (prev === CHAR_ALIF) if (code === CHAR_MADDA_ABOVE) composed = CHAR_ALIF_MADDA;
|
|
187
|
-
else if (code === CHAR_HAMZA_ABOVE_MARK) composed = CHAR_ALIF_HAMZA_ABOVE;
|
|
188
|
-
else composed = CHAR_ALIF_HAMZA_BELOW;
|
|
189
|
-
else if (code === CHAR_HAMZA_ABOVE_MARK) {
|
|
190
|
-
if (prev === CHAR_WAW) composed = CHAR_WAW_HAMZA_ABOVE;
|
|
191
|
-
else if (prev === CHAR_YA) composed = CHAR_YEH_HAMZA_ABOVE;
|
|
192
|
-
}
|
|
193
|
-
if (composed !== 0) {
|
|
194
|
-
buffer[prevIdx] = composed;
|
|
195
|
-
continue;
|
|
196
|
-
}
|
|
197
|
-
}
|
|
198
|
-
}
|
|
199
|
-
}
|
|
200
|
-
if (stripZW && isZeroWidth(code)) {
|
|
201
|
-
if (zwAsSpace) if (collapseWS) {
|
|
202
|
-
if (!lastWasSpace && bufIdx > 0) {
|
|
203
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
204
|
-
lastWasSpace = true;
|
|
205
|
-
}
|
|
206
|
-
} else {
|
|
207
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
208
|
-
lastWasSpace = false;
|
|
209
|
-
}
|
|
399
|
+
if (processWhitespace(code, ctx, options)) continue;
|
|
400
|
+
if (nfc && processNfc(code, ctx)) continue;
|
|
401
|
+
if (stripZW && processZeroWidth(code, ctx, options)) continue;
|
|
402
|
+
if (removeHijri && processHijriMarker(code, ctx, options)) {
|
|
403
|
+
i = ctx.i;
|
|
210
404
|
continue;
|
|
211
405
|
}
|
|
212
|
-
if (removeHijri && code === CHAR_HA) {
|
|
213
|
-
let nextIdx = i + 1;
|
|
214
|
-
if (nextIdx < len && text.charCodeAt(nextIdx) === CHAR_TATWEEL) nextIdx++;
|
|
215
|
-
let isBoundary = false;
|
|
216
|
-
if (nextIdx >= len) isBoundary = true;
|
|
217
|
-
else {
|
|
218
|
-
const nextCode = text.charCodeAt(nextIdx);
|
|
219
|
-
if (nextCode <= 32 || isSymbol(nextCode) || nextCode === 47 || nextCode === 45) isBoundary = true;
|
|
220
|
-
}
|
|
221
|
-
if (isBoundary) {
|
|
222
|
-
let backIdx = i - 1;
|
|
223
|
-
while (backIdx >= 0) {
|
|
224
|
-
const c = text.charCodeAt(backIdx);
|
|
225
|
-
if (c <= 32 || isZeroWidth(c)) backIdx--;
|
|
226
|
-
else break;
|
|
227
|
-
}
|
|
228
|
-
if (backIdx >= 0 && isDigit(text.charCodeAt(backIdx))) {
|
|
229
|
-
if (nextIdx > i + 1) i++;
|
|
230
|
-
continue;
|
|
231
|
-
}
|
|
232
|
-
}
|
|
233
|
-
}
|
|
234
406
|
if (removeDia && isDiacritic(code)) continue;
|
|
235
|
-
if (code
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
let backIdx = bufIdx - 1;
|
|
239
|
-
while (backIdx >= 0 && buffer[backIdx] === CHAR_SPACE) backIdx--;
|
|
240
|
-
if (backIdx >= 0) {
|
|
241
|
-
const prev = buffer[backIdx];
|
|
242
|
-
if (isDigit(prev) || prev === CHAR_HA) {} else continue;
|
|
243
|
-
} else continue;
|
|
244
|
-
}
|
|
245
|
-
}
|
|
246
|
-
if (stripNoise && !lettersSpacesOnly && !lettersOnly) {
|
|
247
|
-
if (isLatinOrDigit(code) || isSymbol(code)) {
|
|
248
|
-
if (collapseWS) {
|
|
249
|
-
if (!lastWasSpace && bufIdx > 0) {
|
|
250
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
251
|
-
lastWasSpace = true;
|
|
252
|
-
}
|
|
253
|
-
} else {
|
|
254
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
255
|
-
lastWasSpace = false;
|
|
256
|
-
}
|
|
257
|
-
continue;
|
|
258
|
-
}
|
|
259
|
-
if (code === 47 && i + 1 < len && text.charCodeAt(i + 1) === 47) {
|
|
260
|
-
while (i + 1 < len && text.charCodeAt(i + 1) === 47) i++;
|
|
261
|
-
if (collapseWS) {
|
|
262
|
-
if (!lastWasSpace && bufIdx > 0) {
|
|
263
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
264
|
-
lastWasSpace = true;
|
|
265
|
-
}
|
|
266
|
-
} else {
|
|
267
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
268
|
-
lastWasSpace = false;
|
|
269
|
-
}
|
|
270
|
-
continue;
|
|
271
|
-
}
|
|
272
|
-
}
|
|
273
|
-
if (removeFootnotes && !lettersSpacesOnly && !lettersOnly && code === 40) {
|
|
274
|
-
let nextIdx = i + 1;
|
|
275
|
-
if (nextIdx < len && text.charCodeAt(nextIdx) === CHAR_SPACE) nextIdx++;
|
|
276
|
-
if (nextIdx < len) {
|
|
277
|
-
const c1 = text.charCodeAt(nextIdx);
|
|
278
|
-
if (c1 === 172) {
|
|
279
|
-
nextIdx++;
|
|
280
|
-
let hasDigits = false;
|
|
281
|
-
while (nextIdx < len) {
|
|
282
|
-
const c = text.charCodeAt(nextIdx);
|
|
283
|
-
if (c >= 1632 && c <= 1641) {
|
|
284
|
-
hasDigits = true;
|
|
285
|
-
nextIdx++;
|
|
286
|
-
} else break;
|
|
287
|
-
}
|
|
288
|
-
if (hasDigits && nextIdx < len) {
|
|
289
|
-
if (text.charCodeAt(nextIdx) === 41) {
|
|
290
|
-
i = nextIdx;
|
|
291
|
-
if (collapseWS) {
|
|
292
|
-
if (!lastWasSpace && bufIdx > 0) {
|
|
293
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
294
|
-
lastWasSpace = true;
|
|
295
|
-
}
|
|
296
|
-
} else {
|
|
297
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
298
|
-
lastWasSpace = false;
|
|
299
|
-
}
|
|
300
|
-
continue;
|
|
301
|
-
}
|
|
302
|
-
if (text.charCodeAt(nextIdx) === CHAR_SPACE) {
|
|
303
|
-
nextIdx++;
|
|
304
|
-
if (nextIdx < len && text.charCodeAt(nextIdx) === 41) {
|
|
305
|
-
i = nextIdx;
|
|
306
|
-
if (collapseWS) {
|
|
307
|
-
if (!lastWasSpace && bufIdx > 0) {
|
|
308
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
309
|
-
lastWasSpace = true;
|
|
310
|
-
}
|
|
311
|
-
} else {
|
|
312
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
313
|
-
lastWasSpace = false;
|
|
314
|
-
}
|
|
315
|
-
continue;
|
|
316
|
-
}
|
|
317
|
-
}
|
|
318
|
-
}
|
|
319
|
-
} else if (c1 >= 1632 && c1 <= 1641) {
|
|
320
|
-
let tempIdx = nextIdx + 1;
|
|
321
|
-
let matched = false;
|
|
322
|
-
if (tempIdx < len) {
|
|
323
|
-
const c2 = text.charCodeAt(tempIdx);
|
|
324
|
-
if (c2 === 41) {
|
|
325
|
-
matched = true;
|
|
326
|
-
tempIdx++;
|
|
327
|
-
} else if (c2 === CHAR_SPACE) {
|
|
328
|
-
tempIdx++;
|
|
329
|
-
if (tempIdx < len) {
|
|
330
|
-
const c3 = text.charCodeAt(tempIdx);
|
|
331
|
-
if (c3 >= 1536 && c3 <= 1791) {
|
|
332
|
-
tempIdx++;
|
|
333
|
-
if (tempIdx < len && text.charCodeAt(tempIdx) === 41) {
|
|
334
|
-
matched = true;
|
|
335
|
-
tempIdx++;
|
|
336
|
-
}
|
|
337
|
-
}
|
|
338
|
-
}
|
|
339
|
-
}
|
|
340
|
-
}
|
|
341
|
-
if (matched) {
|
|
342
|
-
i = tempIdx - 1;
|
|
343
|
-
if (collapseWS) {
|
|
344
|
-
if (!lastWasSpace && bufIdx > 0) {
|
|
345
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
346
|
-
lastWasSpace = true;
|
|
347
|
-
}
|
|
348
|
-
} else {
|
|
349
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
350
|
-
lastWasSpace = false;
|
|
351
|
-
}
|
|
352
|
-
continue;
|
|
353
|
-
}
|
|
354
|
-
}
|
|
355
|
-
}
|
|
356
|
-
}
|
|
357
|
-
if (lettersSpacesOnly || lettersOnly) {
|
|
358
|
-
if (!isArabicLetter(code)) {
|
|
359
|
-
if (lettersOnly) continue;
|
|
360
|
-
if (collapseWS) {
|
|
361
|
-
if (!lastWasSpace && bufIdx > 0) {
|
|
362
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
363
|
-
lastWasSpace = true;
|
|
364
|
-
}
|
|
365
|
-
} else {
|
|
366
|
-
buffer[bufIdx++] = CHAR_SPACE;
|
|
367
|
-
lastWasSpace = false;
|
|
368
|
-
}
|
|
369
|
-
continue;
|
|
370
|
-
}
|
|
371
|
-
let outCode$1 = code;
|
|
372
|
-
if (normAlif) {
|
|
373
|
-
if (code === CHAR_ALIF_MADDA || code === CHAR_ALIF_HAMZA_ABOVE || code === CHAR_ALIF_HAMZA_BELOW || code === CHAR_ALIF_WASLA) outCode$1 = CHAR_ALIF;
|
|
374
|
-
}
|
|
375
|
-
if (maqToYa && code === CHAR_ALIF_MAQSURAH) outCode$1 = CHAR_YA;
|
|
376
|
-
if (taToHa && code === CHAR_TA_MARBUTAH) outCode$1 = CHAR_HA;
|
|
377
|
-
buffer[bufIdx++] = outCode$1;
|
|
378
|
-
lastWasSpace = false;
|
|
407
|
+
if (tatweelMode !== false && processTatweel(code, ctx, tatweelMode)) continue;
|
|
408
|
+
if (stripNoise && processNoise(code, ctx, options)) {
|
|
409
|
+
i = ctx.i;
|
|
379
410
|
continue;
|
|
380
411
|
}
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
412
|
+
if (removeFootnotes && processFootnote(code, ctx, options)) {
|
|
413
|
+
i = ctx.i;
|
|
414
|
+
continue;
|
|
384
415
|
}
|
|
385
|
-
if (
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
lastWasSpace = false;
|
|
416
|
+
if (processLetterFilter(code, ctx, options)) continue;
|
|
417
|
+
ctx.buffer[ctx.bufIdx++] = normalizeCode(code, normAlif, maqToYa, taToHa);
|
|
418
|
+
ctx.lastWasSpace = false;
|
|
389
419
|
}
|
|
390
|
-
if (doTrim && lastWasSpace && bufIdx > 0) bufIdx--;
|
|
391
|
-
if (bufIdx === 0) return "";
|
|
392
|
-
|
|
393
|
-
return decoder.decode(resultView);
|
|
420
|
+
if (doTrim && ctx.lastWasSpace && ctx.bufIdx > 0) ctx.bufIdx--;
|
|
421
|
+
if (ctx.bufIdx === 0) return "";
|
|
422
|
+
return decoder.decode(ctx.buffer.subarray(0, ctx.bufIdx));
|
|
394
423
|
};
|
|
395
424
|
/**
|
|
396
425
|
* Resolves options from a preset or custom options object.
|
|
@@ -449,7 +478,58 @@ function sanitizeArabic(input, optionsOrPreset = "search") {
|
|
|
449
478
|
if (!input) return "";
|
|
450
479
|
return applySanitization(input, resolveOptions(optionsOrPreset));
|
|
451
480
|
}
|
|
452
|
-
|
|
481
|
+
const sanitizeQuranBase = createArabicSanitizer({
|
|
482
|
+
base: "none",
|
|
483
|
+
collapseWhitespace: true,
|
|
484
|
+
lettersAndSpacesOnly: true,
|
|
485
|
+
nfc: true,
|
|
486
|
+
replaceAlifMaqsurah: false,
|
|
487
|
+
replaceTaMarbutahWithHa: false,
|
|
488
|
+
stripDiacritics: true,
|
|
489
|
+
stripFootnotes: true,
|
|
490
|
+
stripTatweel: "all",
|
|
491
|
+
stripZeroWidth: true,
|
|
492
|
+
trim: true
|
|
493
|
+
});
|
|
494
|
+
const normalizeQuranOrthography = (input) => {
|
|
495
|
+
if (!input) return "";
|
|
496
|
+
let output = "";
|
|
497
|
+
let lastBaseCode = 0;
|
|
498
|
+
for (let index = 0; index < input.length; index += 1) {
|
|
499
|
+
const code = input.charCodeAt(index);
|
|
500
|
+
if (code === CHAR_ALIF_WASLA) {
|
|
501
|
+
output += String.fromCharCode(CHAR_ALIF);
|
|
502
|
+
lastBaseCode = CHAR_ALIF;
|
|
503
|
+
continue;
|
|
504
|
+
}
|
|
505
|
+
if (code === CHAR_DAGGER_ALIF) {
|
|
506
|
+
if (lastBaseCode !== 0 && lastBaseCode !== 1584 && lastBaseCode !== CHAR_HA && lastBaseCode !== CHAR_ALIF && lastBaseCode !== CHAR_WAW && lastBaseCode !== CHAR_YA && lastBaseCode !== CHAR_ALIF_MAQSURAH) {
|
|
507
|
+
output += String.fromCharCode(CHAR_ALIF);
|
|
508
|
+
lastBaseCode = CHAR_ALIF;
|
|
509
|
+
}
|
|
510
|
+
continue;
|
|
511
|
+
}
|
|
512
|
+
output += input[index];
|
|
513
|
+
if (!isDiacritic(code) && !isZeroWidth(code)) lastBaseCode = code;
|
|
514
|
+
}
|
|
515
|
+
return output;
|
|
516
|
+
};
|
|
517
|
+
/**
|
|
518
|
+
* Produces a conservative Qur'an-specific search surface.
|
|
519
|
+
*
|
|
520
|
+
* This helper is intentionally narrower than the generic `search` preset:
|
|
521
|
+
* it preserves standard hamza forms and alif maqsurah while normalizing
|
|
522
|
+
* Qur'anic orthography that would otherwise damage lexical identity in FTS.
|
|
523
|
+
*
|
|
524
|
+
* Current behavior:
|
|
525
|
+
* - maps alif wasla (`ٱ`) to bare alif (`ا`)
|
|
526
|
+
* - expands dagger alif (`ٰ`) only in contexts where the imla'i form needs an alif
|
|
527
|
+
* - strips tashkeel, tatweel, footnotes, zero-width chars, and non-letter noise
|
|
528
|
+
* - keeps only Arabic letters and spaces
|
|
529
|
+
*/
|
|
530
|
+
const sanitizeQuranForSearch = (input) => {
|
|
531
|
+
return sanitizeQuranBase(normalizeQuranOrthography(input)).replace(/آ/gu, "ا").replace(/ىء/gu, "يء");
|
|
532
|
+
};
|
|
453
533
|
//#endregion
|
|
454
534
|
//#region src/utils/levenshthein.ts
|
|
455
535
|
/**
|
|
@@ -556,7 +636,6 @@ const boundedLevenshtein = (a, b, maxDist) => {
|
|
|
556
636
|
}
|
|
557
637
|
return prev[b.length] <= maxDist ? prev[b.length] : big;
|
|
558
638
|
};
|
|
559
|
-
|
|
560
639
|
//#endregion
|
|
561
640
|
//#region src/utils/similarity.ts
|
|
562
641
|
const ALIGNMENT_SCORES = {
|
|
@@ -731,7 +810,6 @@ const alignTokenSequences = (tokensA, tokensB, typoSymbols, similarityThreshold)
|
|
|
731
810
|
}
|
|
732
811
|
return backtrackAlignment(matrix, tokensA, tokensB);
|
|
733
812
|
};
|
|
734
|
-
|
|
735
813
|
//#endregion
|
|
736
814
|
//#region src/alignment.ts
|
|
737
815
|
/**
|
|
@@ -805,7 +883,6 @@ const processAlignmentTarget = (targetLine, segmentLines, segmentIndex) => {
|
|
|
805
883
|
segmentsConsumed: 2
|
|
806
884
|
};
|
|
807
885
|
};
|
|
808
|
-
|
|
809
886
|
//#endregion
|
|
810
887
|
//#region src/balance.ts
|
|
811
888
|
/**
|
|
@@ -832,8 +909,8 @@ const checkQuoteBalance = (str) => {
|
|
|
832
909
|
quoteCount++;
|
|
833
910
|
lastQuoteIndex = i;
|
|
834
911
|
}
|
|
835
|
-
const isBalanced
|
|
836
|
-
if (!isBalanced
|
|
912
|
+
const isBalanced = quoteCount % 2 === 0;
|
|
913
|
+
if (!isBalanced && lastQuoteIndex !== -1) errors.push({
|
|
837
914
|
char: "\"",
|
|
838
915
|
index: lastQuoteIndex,
|
|
839
916
|
reason: "unmatched",
|
|
@@ -841,15 +918,15 @@ const checkQuoteBalance = (str) => {
|
|
|
841
918
|
});
|
|
842
919
|
return {
|
|
843
920
|
errors,
|
|
844
|
-
isBalanced
|
|
921
|
+
isBalanced
|
|
845
922
|
};
|
|
846
923
|
};
|
|
847
924
|
/** Mapping of opening brackets to their corresponding closing brackets */
|
|
848
925
|
const BRACKETS = {
|
|
849
|
-
"«": "»",
|
|
850
926
|
"(": ")",
|
|
851
927
|
"[": "]",
|
|
852
|
-
"{": "}"
|
|
928
|
+
"{": "}",
|
|
929
|
+
"«": "»"
|
|
853
930
|
};
|
|
854
931
|
/** Set of all opening bracket characters */
|
|
855
932
|
const OPEN_BRACKETS = new Set([
|
|
@@ -1049,7 +1126,6 @@ const areBracketsBalanced = (str) => {
|
|
|
1049
1126
|
const isBalanced = (str) => {
|
|
1050
1127
|
return checkBalance(str).isBalanced;
|
|
1051
1128
|
};
|
|
1052
|
-
|
|
1053
1129
|
//#endregion
|
|
1054
1130
|
//#region src/utils/textUtils.ts
|
|
1055
1131
|
const INTAHA_ACTUAL = "اهـ";
|
|
@@ -1220,7 +1296,6 @@ const standardizeHijriSymbol = (text) => {
|
|
|
1220
1296
|
const standardizeIntahaSymbol = (text) => {
|
|
1221
1297
|
return text.replace(/(^|\s|[^\u0600-\u06FF])اه(?=\s|$|[^\u0600-\u06FF])/gu, `$1${INTAHA_ACTUAL}`);
|
|
1222
1298
|
};
|
|
1223
|
-
|
|
1224
1299
|
//#endregion
|
|
1225
1300
|
//#region src/footnotes.ts
|
|
1226
1301
|
const INVALID_FOOTNOTE = "()";
|
|
@@ -1266,9 +1341,9 @@ const numberToArabic = (num) => {
|
|
|
1266
1341
|
*/
|
|
1267
1342
|
const ocrToArabic = (char) => {
|
|
1268
1343
|
return {
|
|
1344
|
+
".": "٠",
|
|
1269
1345
|
"1": "١",
|
|
1270
1346
|
"9": "٩",
|
|
1271
|
-
".": "٠",
|
|
1272
1347
|
O: "٥",
|
|
1273
1348
|
o: "٥",
|
|
1274
1349
|
V: "٧",
|
|
@@ -1418,7 +1493,6 @@ const correctReferences = (lines) => {
|
|
|
1418
1493
|
};
|
|
1419
1494
|
});
|
|
1420
1495
|
};
|
|
1421
|
-
|
|
1422
1496
|
//#endregion
|
|
1423
1497
|
//#region src/utils/ahocorasick.ts
|
|
1424
1498
|
/**
|
|
@@ -1526,7 +1600,6 @@ const buildAhoCorasick = (patterns) => {
|
|
|
1526
1600
|
ac.build();
|
|
1527
1601
|
return ac;
|
|
1528
1602
|
};
|
|
1529
|
-
|
|
1530
1603
|
//#endregion
|
|
1531
1604
|
//#region src/utils/constants.ts
|
|
1532
1605
|
const DEFAULT_POLICY = {
|
|
@@ -1539,7 +1612,6 @@ const DEFAULT_POLICY = {
|
|
|
1539
1612
|
q: 4,
|
|
1540
1613
|
seamLen: 512
|
|
1541
1614
|
};
|
|
1542
|
-
|
|
1543
1615
|
//#endregion
|
|
1544
1616
|
//#region src/utils/fuzzyUtils.ts
|
|
1545
1617
|
const SEAM_GAP_CEILING = 200;
|
|
@@ -1776,7 +1848,6 @@ const findBestMatch = (windows, excerpt, acceptance) => {
|
|
|
1776
1848
|
dist: best
|
|
1777
1849
|
};
|
|
1778
1850
|
};
|
|
1779
|
-
|
|
1780
1851
|
//#endregion
|
|
1781
1852
|
//#region src/utils/qgram.ts
|
|
1782
1853
|
/**
|
|
@@ -1870,7 +1941,6 @@ var QGramIndex = class {
|
|
|
1870
1941
|
return this.map.get(gram);
|
|
1871
1942
|
}
|
|
1872
1943
|
};
|
|
1873
|
-
|
|
1874
1944
|
//#endregion
|
|
1875
1945
|
//#region src/fuzzy.ts
|
|
1876
1946
|
/**
|
|
@@ -2334,7 +2404,6 @@ function findMatchesAll(pages, excerpts, policy = {}) {
|
|
|
2334
2404
|
if (cfg.enableFuzzy) recordFuzzyMatches(excerptsN, pagesN, hitsByExcerpt, cfg);
|
|
2335
2405
|
return hitsByExcerpt.map((hits) => sortMatches(hits));
|
|
2336
2406
|
}
|
|
2337
|
-
|
|
2338
2407
|
//#endregion
|
|
2339
2408
|
//#region src/noise.ts
|
|
2340
2409
|
/**
|
|
@@ -2562,7 +2631,6 @@ function isValidArabicContent(charStats, textLength) {
|
|
|
2562
2631
|
if (charStats.arabicCount >= 1 && textLength <= 5 && charStats.punctuationCount <= 1) return true;
|
|
2563
2632
|
return false;
|
|
2564
2633
|
}
|
|
2565
|
-
|
|
2566
2634
|
//#endregion
|
|
2567
2635
|
//#region src/typos.ts
|
|
2568
2636
|
/**
|
|
@@ -2644,7 +2712,7 @@ const fixTypo = (original, correction, { highSimilarityThreshold = .8, similarit
|
|
|
2644
2712
|
typoSymbols
|
|
2645
2713
|
});
|
|
2646
2714
|
};
|
|
2647
|
-
|
|
2648
2715
|
//#endregion
|
|
2649
|
-
export { BRACKETS, CLOSE_BRACKETS, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
|
|
2716
|
+
export { BRACKETS, CLOSE_BRACKETS, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, sanitizeQuranForSearch, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
|
|
2717
|
+
|
|
2650
2718
|
//# sourceMappingURL=index.js.map
|