flappa-doormal 2.6.4 → 2.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +47 -3
- package/README.md +69 -0
- package/dist/index.d.mts +164 -81
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +2193 -1358
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -156,1328 +156,1440 @@ const makeDiacriticInsensitive = (text) => {
|
|
|
156
156
|
};
|
|
157
157
|
|
|
158
158
|
//#endregion
|
|
159
|
-
//#region src/segmentation/
|
|
160
|
-
const WINDOW_PREFIX_LENGTHS = [
|
|
161
|
-
80,
|
|
162
|
-
60,
|
|
163
|
-
40,
|
|
164
|
-
30,
|
|
165
|
-
20,
|
|
166
|
-
15
|
|
167
|
-
];
|
|
168
|
-
const JOINER_PREFIX_LENGTHS = [
|
|
169
|
-
80,
|
|
170
|
-
60,
|
|
171
|
-
40,
|
|
172
|
-
30,
|
|
173
|
-
20,
|
|
174
|
-
15,
|
|
175
|
-
12,
|
|
176
|
-
10,
|
|
177
|
-
8,
|
|
178
|
-
6
|
|
179
|
-
];
|
|
159
|
+
//#region src/segmentation/tokens.ts
|
|
180
160
|
/**
|
|
181
|
-
*
|
|
182
|
-
* Strings are converted to { pattern: str } with no constraints.
|
|
161
|
+
* Token-based template system for Arabic text pattern matching.
|
|
183
162
|
*
|
|
184
|
-
*
|
|
185
|
-
*
|
|
163
|
+
* This module provides a human-readable way to define regex patterns using
|
|
164
|
+
* `{{token}}` placeholders that expand to their regex equivalents. It supports
|
|
165
|
+
* named capture groups for extracting matched values into metadata.
|
|
166
|
+
*
|
|
167
|
+
* @module tokens
|
|
186
168
|
*
|
|
187
169
|
* @example
|
|
188
|
-
*
|
|
189
|
-
*
|
|
170
|
+
* // Simple token expansion
|
|
171
|
+
* expandTokens('{{raqms}} {{dash}}')
|
|
172
|
+
* // → '[\\u0660-\\u0669]+ [-–—ـ]'
|
|
190
173
|
*
|
|
191
|
-
*
|
|
192
|
-
* //
|
|
174
|
+
* @example
|
|
175
|
+
* // Named capture groups
|
|
176
|
+
* expandTokensWithCaptures('{{raqms:num}} {{dash}}')
|
|
177
|
+
* // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
|
|
193
178
|
*/
|
|
194
|
-
const normalizeBreakpoint = (bp) => typeof bp === "string" ? { pattern: bp } : bp;
|
|
195
179
|
/**
|
|
196
|
-
*
|
|
180
|
+
* Token definitions mapping human-readable token names to regex patterns.
|
|
197
181
|
*
|
|
198
|
-
*
|
|
199
|
-
*
|
|
200
|
-
*
|
|
182
|
+
* Tokens are used in template strings with double-brace syntax:
|
|
183
|
+
* - `{{token}}` - Expands to the pattern (non-capturing in context)
|
|
184
|
+
* - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
|
|
185
|
+
* - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
|
|
201
186
|
*
|
|
202
|
-
* @
|
|
203
|
-
*
|
|
204
|
-
*
|
|
187
|
+
* @remarks
|
|
188
|
+
* These patterns are designed for Arabic text matching. For diacritic-insensitive
|
|
189
|
+
* matching of Arabic patterns, use the `fuzzy: true` option in split rules,
|
|
190
|
+
* which applies `makeDiacriticInsensitive()` to the expanded patterns.
|
|
205
191
|
*
|
|
206
|
-
*
|
|
207
|
-
* //
|
|
192
|
+
* @example
|
|
193
|
+
* // Using tokens in a split rule
|
|
194
|
+
* { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
|
|
208
195
|
*
|
|
209
|
-
*
|
|
210
|
-
* //
|
|
196
|
+
* @example
|
|
197
|
+
* // Using tokens with named captures
|
|
198
|
+
* { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
|
|
211
199
|
*/
|
|
212
|
-
const isPageExcluded = (pageId, excludeList) => {
|
|
213
|
-
if (!excludeList || excludeList.length === 0) return false;
|
|
214
|
-
for (const item of excludeList) if (typeof item === "number") {
|
|
215
|
-
if (pageId === item) return true;
|
|
216
|
-
} else {
|
|
217
|
-
const [from, to] = item;
|
|
218
|
-
if (pageId >= from && pageId <= to) return true;
|
|
219
|
-
}
|
|
220
|
-
return false;
|
|
221
|
-
};
|
|
222
200
|
/**
|
|
223
|
-
*
|
|
201
|
+
* Escapes regex metacharacters (parentheses and brackets) in template patterns,
|
|
202
|
+
* but preserves content inside `{{...}}` token delimiters.
|
|
224
203
|
*
|
|
225
|
-
*
|
|
226
|
-
*
|
|
227
|
-
*
|
|
204
|
+
* This allows users to write intuitive patterns like `({{harf}}):` instead of
|
|
205
|
+
* the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
|
|
206
|
+
* so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
|
|
207
|
+
*
|
|
208
|
+
* @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
|
|
209
|
+
* @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
|
|
228
210
|
*
|
|
229
211
|
* @example
|
|
230
|
-
*
|
|
231
|
-
* // →
|
|
212
|
+
* escapeTemplateBrackets('({{harf}}): ')
|
|
213
|
+
* // → '\\({{harf}}\\): '
|
|
232
214
|
*
|
|
233
|
-
*
|
|
234
|
-
*
|
|
215
|
+
* @example
|
|
216
|
+
* escapeTemplateBrackets('[{{raqm}}] ')
|
|
217
|
+
* // → '\\[{{raqm}}\\] '
|
|
218
|
+
*
|
|
219
|
+
* @example
|
|
220
|
+
* escapeTemplateBrackets('{{harf}}')
|
|
221
|
+
* // → '{{harf}}' (unchanged - no brackets outside tokens)
|
|
235
222
|
*/
|
|
236
|
-
const
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
223
|
+
const escapeTemplateBrackets = (pattern) => {
|
|
224
|
+
return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => {
|
|
225
|
+
if (token) return token;
|
|
226
|
+
return `\\${bracket}`;
|
|
227
|
+
});
|
|
228
|
+
};
|
|
229
|
+
const RUMUZ_ATOM = `(?:${[
|
|
230
|
+
"تمييز(?![\\u064B-\\u0652\\u0670أ-ي])",
|
|
231
|
+
"خت",
|
|
232
|
+
"خغ",
|
|
233
|
+
"بخ",
|
|
234
|
+
"عخ",
|
|
235
|
+
"مق",
|
|
236
|
+
"مت",
|
|
237
|
+
"عس",
|
|
238
|
+
"سي",
|
|
239
|
+
"سن",
|
|
240
|
+
"كن",
|
|
241
|
+
"مد",
|
|
242
|
+
"قد",
|
|
243
|
+
"خد",
|
|
244
|
+
"فد",
|
|
245
|
+
"دل",
|
|
246
|
+
"كد",
|
|
247
|
+
"غد",
|
|
248
|
+
"صد",
|
|
249
|
+
"دت",
|
|
250
|
+
"دس",
|
|
251
|
+
"تم",
|
|
252
|
+
"فق",
|
|
253
|
+
"دق",
|
|
254
|
+
"[خرزيمنصسدفلتقع](?![\\u064B-\\u0652\\u0670أ-ي])",
|
|
255
|
+
"(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669])"
|
|
256
|
+
].join("|")})`;
|
|
257
|
+
const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
|
|
258
|
+
const BASE_TOKENS = {
|
|
259
|
+
bab: "باب",
|
|
260
|
+
basmalah: ["بسم الله", "﷽"].join("|"),
|
|
261
|
+
bullet: "[•*°]",
|
|
262
|
+
dash: "[-–—ـ]",
|
|
263
|
+
fasl: ["مسألة", "فصل"].join("|"),
|
|
264
|
+
harf: "[أ-ي]",
|
|
265
|
+
harfs: "[أ-ي](?:\\s+[أ-ي])*",
|
|
266
|
+
kitab: "كتاب",
|
|
267
|
+
naql: [
|
|
268
|
+
"حدثني",
|
|
269
|
+
"وأخبرنا",
|
|
270
|
+
"حدثنا",
|
|
271
|
+
"سمعت",
|
|
272
|
+
"أنبأنا",
|
|
273
|
+
"وحدثنا",
|
|
274
|
+
"أخبرنا",
|
|
275
|
+
"وحدثني",
|
|
276
|
+
"وحدثنيه"
|
|
277
|
+
].join("|"),
|
|
278
|
+
raqm: "[\\u0660-\\u0669]",
|
|
279
|
+
raqms: "[\\u0660-\\u0669]+",
|
|
280
|
+
rumuz: RUMUZ_BLOCK,
|
|
281
|
+
tarqim: "[.!?؟؛]"
|
|
240
282
|
};
|
|
241
283
|
/**
|
|
242
|
-
*
|
|
284
|
+
* Composite token definitions using template syntax.
|
|
243
285
|
*
|
|
244
|
-
*
|
|
245
|
-
*
|
|
286
|
+
* These tokens reference base tokens using `{{token}}` syntax and are
|
|
287
|
+
* automatically expanded to their final regex patterns at module load time.
|
|
246
288
|
*
|
|
247
|
-
*
|
|
248
|
-
*
|
|
249
|
-
* book-scale inputs (thousands of pages), this is small and keeps downstream logic
|
|
250
|
-
* simple and fast. If you expect extremely large ranges (e.g., millions of pages),
|
|
251
|
-
* consider avoiding broad excludes or introducing a range-based membership structure.
|
|
289
|
+
* This provides better abstraction - if base tokens change, composites
|
|
290
|
+
* automatically update on the next build.
|
|
252
291
|
*
|
|
253
|
-
* @
|
|
254
|
-
* buildExcludeSet([1, 5, [10, 12]])
|
|
255
|
-
* // → Set { 1, 5, 10, 11, 12 }
|
|
292
|
+
* @internal
|
|
256
293
|
*/
|
|
257
|
-
const
|
|
258
|
-
const excludeSet = /* @__PURE__ */ new Set();
|
|
259
|
-
for (const item of excludeList || []) if (typeof item === "number") excludeSet.add(item);
|
|
260
|
-
else for (let i = item[0]; i <= item[1]; i++) excludeSet.add(i);
|
|
261
|
-
return excludeSet;
|
|
262
|
-
};
|
|
294
|
+
const COMPOSITE_TOKENS = { numbered: "{{raqms}} {{dash}} " };
|
|
263
295
|
/**
|
|
264
|
-
*
|
|
265
|
-
*
|
|
296
|
+
* Expands any *composite* tokens (like `{{numbered}}`) into their underlying template form
|
|
297
|
+
* (like `{{raqms}} {{dash}} `).
|
|
266
298
|
*
|
|
267
|
-
*
|
|
268
|
-
*
|
|
269
|
-
* @param toPageId - Optional ending page ID (omitted if same as from)
|
|
270
|
-
* @param meta - Optional metadata to attach
|
|
271
|
-
* @returns Segment object or null if empty
|
|
299
|
+
* This is useful when you want to take a signature produced by `analyzeCommonLineStarts()`
|
|
300
|
+
* and turn it into an editable template where you can add named captures, e.g.:
|
|
272
301
|
*
|
|
273
|
-
*
|
|
274
|
-
*
|
|
275
|
-
* // → { content: 'Hello world', from: 1, to: 3, meta: { chapter: 1 } }
|
|
302
|
+
* - `{{numbered}}` → `{{raqms}} {{dash}} `
|
|
303
|
+
* - then: `{{raqms:num}} {{dash}} ` to capture the number
|
|
276
304
|
*
|
|
277
|
-
*
|
|
278
|
-
*
|
|
305
|
+
* Notes:
|
|
306
|
+
* - This only expands the plain `{{token}}` form (not `{{token:name}}`).
|
|
307
|
+
* - Expansion is repeated a few times to support nested composites.
|
|
279
308
|
*/
|
|
280
|
-
const
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
return
|
|
309
|
+
const expandCompositeTokensInTemplate = (template) => {
|
|
310
|
+
let out = template;
|
|
311
|
+
for (let i = 0; i < 10; i++) {
|
|
312
|
+
const next = out.replace(/\{\{(\w+)\}\}/g, (m, tokenName) => {
|
|
313
|
+
return COMPOSITE_TOKENS[tokenName] ?? m;
|
|
314
|
+
});
|
|
315
|
+
if (next === out) break;
|
|
316
|
+
out = next;
|
|
317
|
+
}
|
|
318
|
+
return out;
|
|
290
319
|
};
|
|
291
320
|
/**
|
|
292
|
-
* Expands
|
|
321
|
+
* Expands base tokens in a template string.
|
|
322
|
+
* Used internally to pre-expand composite tokens.
|
|
293
323
|
*
|
|
294
|
-
* @param
|
|
295
|
-
* @
|
|
296
|
-
* @
|
|
324
|
+
* @param template - Template string with `{{token}}` placeholders
|
|
325
|
+
* @returns Expanded pattern with base tokens replaced
|
|
326
|
+
* @internal
|
|
327
|
+
*/
|
|
328
|
+
const expandBaseTokens = (template) => {
|
|
329
|
+
return template.replace(/\{\{(\w+)\}\}/g, (_, tokenName) => {
|
|
330
|
+
return BASE_TOKENS[tokenName] ?? `{{${tokenName}}}`;
|
|
331
|
+
});
|
|
332
|
+
};
|
|
333
|
+
/**
|
|
334
|
+
* Token definitions mapping human-readable token names to regex patterns.
|
|
335
|
+
*
|
|
336
|
+
* Tokens are used in template strings with double-brace syntax:
|
|
337
|
+
* - `{{token}}` - Expands to the pattern (non-capturing in context)
|
|
338
|
+
* - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
|
|
339
|
+
* - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
|
|
297
340
|
*
|
|
298
341
|
* @remarks
|
|
299
|
-
*
|
|
300
|
-
*
|
|
301
|
-
*
|
|
302
|
-
*/
|
|
303
|
-
const expandBreakpoints = (breakpoints, processPattern$1) => breakpoints.map((bp) => {
|
|
304
|
-
const rule = normalizeBreakpoint(bp);
|
|
305
|
-
const excludeSet = buildExcludeSet(rule.exclude);
|
|
306
|
-
const skipWhenRegex = rule.skipWhen !== void 0 ? (() => {
|
|
307
|
-
const expandedSkip = processPattern$1(rule.skipWhen);
|
|
308
|
-
try {
|
|
309
|
-
return new RegExp(expandedSkip, "mu");
|
|
310
|
-
} catch (error) {
|
|
311
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
312
|
-
throw new Error(`Invalid breakpoint skipWhen regex: ${rule.skipWhen}\n Cause: ${message}`);
|
|
313
|
-
}
|
|
314
|
-
})() : null;
|
|
315
|
-
if (rule.pattern === "") return {
|
|
316
|
-
excludeSet,
|
|
317
|
-
regex: null,
|
|
318
|
-
rule,
|
|
319
|
-
skipWhenRegex
|
|
320
|
-
};
|
|
321
|
-
const expanded = processPattern$1(rule.pattern);
|
|
322
|
-
try {
|
|
323
|
-
return {
|
|
324
|
-
excludeSet,
|
|
325
|
-
regex: new RegExp(expanded, "gmu"),
|
|
326
|
-
rule,
|
|
327
|
-
skipWhenRegex
|
|
328
|
-
};
|
|
329
|
-
} catch (error) {
|
|
330
|
-
const message = error instanceof Error ? error.message : String(error);
|
|
331
|
-
throw new Error(`Invalid breakpoint regex: ${rule.pattern}\n Cause: ${message}`);
|
|
332
|
-
}
|
|
333
|
-
});
|
|
334
|
-
/**
|
|
335
|
-
* Applies a configured joiner at detected page boundaries within a multi-page content chunk.
|
|
342
|
+
* These patterns are designed for Arabic text matching. For diacritic-insensitive
|
|
343
|
+
* matching of Arabic patterns, use the `fuzzy: true` option in split rules,
|
|
344
|
+
* which applies `makeDiacriticInsensitive()` to the expanded patterns.
|
|
336
345
|
*
|
|
337
|
-
*
|
|
338
|
-
*
|
|
339
|
-
*
|
|
340
|
-
* that page start.
|
|
346
|
+
* @example
|
|
347
|
+
* // Using tokens in a split rule
|
|
348
|
+
* { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
|
|
341
349
|
*
|
|
342
|
-
*
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
const pageData = normalizedPages.get(pageIds[pi]);
|
|
350
|
-
if (!pageData) continue;
|
|
351
|
-
const found = findPrefixPositionInContent(updated, pageData.content.trimStart(), searchFrom);
|
|
352
|
-
if (found > 0 && updated[found - 1] === "\n") updated = `${updated.slice(0, found - 1)} ${updated.slice(found)}`;
|
|
353
|
-
if (found > 0) searchFrom = found;
|
|
354
|
-
}
|
|
355
|
-
return updated;
|
|
356
|
-
};
|
|
357
|
-
/**
|
|
358
|
-
* Finds the position of a page prefix in content, trying multiple prefix lengths.
|
|
350
|
+
* @example
|
|
351
|
+
* // Using tokens with named captures
|
|
352
|
+
* { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
|
|
353
|
+
*
|
|
354
|
+
* @example
|
|
355
|
+
* // Using the numbered convenience token
|
|
356
|
+
* { lineStartsAfter: ['{{numbered}}'], split: 'at' }
|
|
359
357
|
*/
|
|
360
|
-
const
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
if (!prefix) continue;
|
|
364
|
-
const pos = content.indexOf(prefix, searchFrom);
|
|
365
|
-
if (pos > 0) return pos;
|
|
366
|
-
}
|
|
367
|
-
return -1;
|
|
358
|
+
const TOKEN_PATTERNS = {
|
|
359
|
+
...BASE_TOKENS,
|
|
360
|
+
...Object.fromEntries(Object.entries(COMPOSITE_TOKENS).map(([k, v]) => [k, expandBaseTokens(v)]))
|
|
368
361
|
};
|
|
369
362
|
/**
|
|
370
|
-
*
|
|
363
|
+
* Regex pattern for matching tokens with optional named capture syntax.
|
|
371
364
|
*
|
|
372
|
-
*
|
|
373
|
-
*
|
|
374
|
-
*
|
|
375
|
-
*
|
|
365
|
+
* Matches:
|
|
366
|
+
* - `{{token}}` - Simple token (group 1 = token name, group 2 = empty)
|
|
367
|
+
* - `{{token:name}}` - Token with capture (group 1 = token, group 2 = name)
|
|
368
|
+
* - `{{:name}}` - Capture-only (group 1 = empty, group 2 = name)
|
|
369
|
+
*
|
|
370
|
+
* @internal
|
|
376
371
|
*/
|
|
377
|
-
const
|
|
378
|
-
const currentPageData = normalizedPages.get(pageIds[currentFromIdx]);
|
|
379
|
-
if (!currentPageData) return 0;
|
|
380
|
-
const remStart = remainingContent.trimStart().slice(0, Math.min(60, remainingContent.length));
|
|
381
|
-
const needle = remStart.slice(0, Math.min(30, remStart.length));
|
|
382
|
-
if (!needle) return 0;
|
|
383
|
-
const idx = currentPageData.content.indexOf(needle);
|
|
384
|
-
return idx > 0 ? idx : 0;
|
|
385
|
-
};
|
|
372
|
+
const TOKEN_WITH_CAPTURE_REGEX = /\{\{(\w*):?(\w*)\}\}/g;
|
|
386
373
|
/**
|
|
387
|
-
*
|
|
388
|
-
* anchored near an expected boundary position to reduce collisions.
|
|
374
|
+
* Regex pattern for simple token matching (no capture syntax).
|
|
389
375
|
*
|
|
390
|
-
*
|
|
391
|
-
*
|
|
376
|
+
* Matches only `{{token}}` format where token is one or more word characters.
|
|
377
|
+
* Used by `containsTokens()` for quick detection.
|
|
378
|
+
*
|
|
379
|
+
* @internal
|
|
392
380
|
*/
|
|
393
|
-
const
|
|
394
|
-
const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
|
|
395
|
-
if (!targetPageData) return -1;
|
|
396
|
-
const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
|
|
397
|
-
const searchStart = Math.max(0, approx - 1e4);
|
|
398
|
-
const searchEnd = Math.min(remainingContent.length, approx + 2e3);
|
|
399
|
-
const targetTrimmed = targetPageData.content.trimStart();
|
|
400
|
-
for (const len of WINDOW_PREFIX_LENGTHS) {
|
|
401
|
-
const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
|
|
402
|
-
if (!prefix) continue;
|
|
403
|
-
let pos = remainingContent.indexOf(prefix, searchStart);
|
|
404
|
-
while (pos !== -1 && pos <= searchEnd) {
|
|
405
|
-
if (pos > 0 && /\s/.test(remainingContent[pos - 1] ?? "")) return pos;
|
|
406
|
-
pos = remainingContent.indexOf(prefix, pos + 1);
|
|
407
|
-
}
|
|
408
|
-
const last = remainingContent.lastIndexOf(prefix, approx);
|
|
409
|
-
if (last > 0) return last;
|
|
410
|
-
}
|
|
411
|
-
return -1;
|
|
412
|
-
};
|
|
381
|
+
const SIMPLE_TOKEN_REGEX = /\{\{(\w+)\}\}/g;
|
|
413
382
|
/**
|
|
414
|
-
*
|
|
415
|
-
*
|
|
416
|
-
* This function computes page boundaries once per segment and enables
|
|
417
|
-
* O(log n) page lookups via binary search with `findPageIndexForPosition`.
|
|
383
|
+
* Checks if a query string contains template tokens.
|
|
418
384
|
*
|
|
419
|
-
*
|
|
420
|
-
*
|
|
421
|
-
*
|
|
385
|
+
* Performs a quick test for `{{token}}` patterns without actually
|
|
386
|
+
* expanding them. Useful for determining whether to apply token
|
|
387
|
+
* expansion to a string.
|
|
422
388
|
*
|
|
423
|
-
* @param
|
|
424
|
-
* @
|
|
425
|
-
* @param toIdx - Ending page index
|
|
426
|
-
* @param pageIds - Array of all page IDs
|
|
427
|
-
* @param normalizedPages - Map of page ID to normalized content
|
|
428
|
-
* @param cumulativeOffsets - Cumulative character offsets (for estimates)
|
|
429
|
-
* @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
|
|
430
|
-
* with a sentinel boundary at segmentContent.length as the last element
|
|
389
|
+
* @param query - String to check for tokens
|
|
390
|
+
* @returns `true` if the string contains at least one `{{token}}` pattern
|
|
431
391
|
*
|
|
432
392
|
* @example
|
|
433
|
-
*
|
|
434
|
-
*
|
|
435
|
-
* // →
|
|
393
|
+
* containsTokens('{{raqms}} {{dash}}') // → true
|
|
394
|
+
* containsTokens('plain text') // → false
|
|
395
|
+
* containsTokens('[٠-٩]+ - ') // → false (raw regex, no tokens)
|
|
436
396
|
*/
|
|
437
|
-
const
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
397
|
+
const containsTokens = (query) => {
|
|
398
|
+
SIMPLE_TOKEN_REGEX.lastIndex = 0;
|
|
399
|
+
return SIMPLE_TOKEN_REGEX.test(query);
|
|
400
|
+
};
|
|
401
|
+
const splitTemplateIntoSegments = (query) => {
|
|
402
|
+
const segments = [];
|
|
403
|
+
let lastIndex = 0;
|
|
404
|
+
TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
|
|
405
|
+
let match;
|
|
406
|
+
while ((match = TOKEN_WITH_CAPTURE_REGEX.exec(query)) !== null) {
|
|
407
|
+
if (match.index > lastIndex) segments.push({
|
|
408
|
+
type: "text",
|
|
409
|
+
value: query.slice(lastIndex, match.index)
|
|
410
|
+
});
|
|
411
|
+
segments.push({
|
|
412
|
+
type: "token",
|
|
413
|
+
value: match[0]
|
|
414
|
+
});
|
|
415
|
+
lastIndex = match.index + match[0].length;
|
|
449
416
|
}
|
|
450
|
-
|
|
451
|
-
|
|
417
|
+
if (lastIndex < query.length) segments.push({
|
|
418
|
+
type: "text",
|
|
419
|
+
value: query.slice(lastIndex)
|
|
420
|
+
});
|
|
421
|
+
return segments;
|
|
422
|
+
};
|
|
423
|
+
const maybeApplyFuzzyToText = (text, fuzzyTransform) => {
|
|
424
|
+
if (fuzzyTransform && /[\u0600-\u06FF]/u.test(text)) return fuzzyTransform(text);
|
|
425
|
+
return text;
|
|
426
|
+
};
|
|
427
|
+
const maybeApplyFuzzyToTokenPattern = (tokenPattern, fuzzyTransform) => {
|
|
428
|
+
if (!fuzzyTransform) return tokenPattern;
|
|
429
|
+
return tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/u.test(part) ? fuzzyTransform(part) : part).join("|");
|
|
430
|
+
};
|
|
431
|
+
const parseTokenLiteral = (literal) => {
|
|
432
|
+
TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
|
|
433
|
+
const tokenMatch = TOKEN_WITH_CAPTURE_REGEX.exec(literal);
|
|
434
|
+
if (!tokenMatch) return null;
|
|
435
|
+
const [, tokenName, captureName] = tokenMatch;
|
|
436
|
+
return {
|
|
437
|
+
captureName,
|
|
438
|
+
tokenName
|
|
439
|
+
};
|
|
440
|
+
};
|
|
441
|
+
const createCaptureRegistry = (capturePrefix) => {
|
|
442
|
+
const captureNames = [];
|
|
443
|
+
const captureNameCounts = /* @__PURE__ */ new Map();
|
|
444
|
+
const register = (baseName) => {
|
|
445
|
+
const count = captureNameCounts.get(baseName) ?? 0;
|
|
446
|
+
captureNameCounts.set(baseName, count + 1);
|
|
447
|
+
const uniqueName = count === 0 ? baseName : `${baseName}_${count + 1}`;
|
|
448
|
+
const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
|
|
449
|
+
captureNames.push(prefixedName);
|
|
450
|
+
return prefixedName;
|
|
451
|
+
};
|
|
452
|
+
return {
|
|
453
|
+
captureNames,
|
|
454
|
+
register
|
|
455
|
+
};
|
|
456
|
+
};
|
|
457
|
+
const expandTokenLiteral = (literal, opts) => {
|
|
458
|
+
const parsed = parseTokenLiteral(literal);
|
|
459
|
+
if (!parsed) return literal;
|
|
460
|
+
const { tokenName, captureName } = parsed;
|
|
461
|
+
if (!tokenName && captureName) return `(?<${opts.registerCapture(captureName)}>.+)`;
|
|
462
|
+
let tokenPattern = TOKEN_PATTERNS[tokenName];
|
|
463
|
+
if (!tokenPattern) return literal;
|
|
464
|
+
tokenPattern = maybeApplyFuzzyToTokenPattern(tokenPattern, opts.fuzzyTransform);
|
|
465
|
+
if (captureName) return `(?<${opts.registerCapture(captureName)}>${tokenPattern})`;
|
|
466
|
+
return tokenPattern;
|
|
452
467
|
};
|
|
453
468
|
/**
|
|
454
|
-
*
|
|
455
|
-
* Uses "largest i where boundaryPositions[i] <= position" semantics.
|
|
469
|
+
* Expands template tokens with support for named captures.
|
|
456
470
|
*
|
|
457
|
-
*
|
|
458
|
-
*
|
|
459
|
-
*
|
|
460
|
-
*
|
|
471
|
+
* This is the primary token expansion function that handles all token syntax:
|
|
472
|
+
* - `{{token}}` → Expands to the token's pattern (no capture group)
|
|
473
|
+
* - `{{token:name}}` → Expands to `(?<name>pattern)` (named capture)
|
|
474
|
+
* - `{{:name}}` → Expands to `(?<name>.+)` (capture anything)
|
|
461
475
|
*
|
|
462
|
-
*
|
|
463
|
-
*
|
|
464
|
-
*
|
|
465
|
-
*
|
|
466
|
-
*
|
|
476
|
+
* Unknown tokens are left as-is in the output, allowing for partial templates.
|
|
477
|
+
*
|
|
478
|
+
* @param query - The template string containing tokens
|
|
479
|
+
* @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
|
|
480
|
+
* Applied to both token patterns and plain Arabic text between tokens.
|
|
481
|
+
* Typically `makeDiacriticInsensitive` from the fuzzy module.
|
|
482
|
+
* @returns Object with expanded pattern, capture names, and capture flag
|
|
483
|
+
*
|
|
484
|
+
* @example
|
|
485
|
+
* // Simple token expansion
|
|
486
|
+
* expandTokensWithCaptures('{{raqms}} {{dash}}')
|
|
487
|
+
* // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
|
|
488
|
+
*
|
|
489
|
+
* @example
|
|
490
|
+
* // Named capture
|
|
491
|
+
* expandTokensWithCaptures('{{raqms:num}} {{dash}}')
|
|
492
|
+
* // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
|
|
493
|
+
*
|
|
494
|
+
* @example
|
|
495
|
+
* // Capture-only token
|
|
496
|
+
* expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
|
|
497
|
+
* // → { pattern: '(?<num>[٠-٩]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
|
|
498
|
+
*
|
|
499
|
+
* @example
|
|
500
|
+
* // With fuzzy transform
|
|
501
|
+
* expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
|
|
502
|
+
* // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
|
|
467
503
|
*/
|
|
468
|
-
const
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
504
|
+
const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
|
|
505
|
+
const segments = splitTemplateIntoSegments(query);
|
|
506
|
+
const registry = createCaptureRegistry(capturePrefix);
|
|
507
|
+
const processedParts = segments.map((segment) => {
|
|
508
|
+
if (segment.type === "text") return maybeApplyFuzzyToText(segment.value, fuzzyTransform);
|
|
509
|
+
return expandTokenLiteral(segment.value, {
|
|
510
|
+
capturePrefix,
|
|
511
|
+
fuzzyTransform,
|
|
512
|
+
registerCapture: registry.register
|
|
513
|
+
});
|
|
514
|
+
});
|
|
515
|
+
return {
|
|
516
|
+
captureNames: registry.captureNames,
|
|
517
|
+
hasCaptures: registry.captureNames.length > 0,
|
|
518
|
+
pattern: processedParts.join("")
|
|
519
|
+
};
|
|
478
520
|
};
|
|
479
521
|
/**
|
|
480
|
-
*
|
|
522
|
+
* Expands template tokens in a query string to their regex equivalents.
|
|
481
523
|
*
|
|
482
|
-
*
|
|
483
|
-
*
|
|
484
|
-
*
|
|
524
|
+
* This is the simple version without capture support. It returns only the
|
|
525
|
+
* expanded pattern string, not capture metadata.
|
|
526
|
+
*
|
|
527
|
+
* Unknown tokens are left as-is, allowing for partial templates.
|
|
528
|
+
*
|
|
529
|
+
* @param query - Template string containing `{{token}}` placeholders
|
|
530
|
+
* @returns Expanded regex pattern string
|
|
531
|
+
*
|
|
532
|
+
* @example
|
|
533
|
+
* expandTokens('، {{raqms}}') // → '، [\\u0660-\\u0669]+'
|
|
534
|
+
* expandTokens('{{raqm}}*') // → '[\\u0660-\\u0669]*'
|
|
535
|
+
* expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
|
|
536
|
+
* expandTokens('{{unknown}}') // → '{{unknown}}' (left as-is)
|
|
537
|
+
*
|
|
538
|
+
* @see expandTokensWithCaptures for full capture group support
|
|
485
539
|
*/
|
|
486
|
-
const
|
|
487
|
-
if (windowEndIdx >= toIdx) return remainingContent.length;
|
|
488
|
-
const desiredNextIdx = windowEndIdx + 1;
|
|
489
|
-
const minNextIdx = currentFromIdx + 1;
|
|
490
|
-
const maxNextIdx = Math.min(desiredNextIdx, toIdx);
|
|
491
|
-
const startOffsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
|
|
492
|
-
for (let nextIdx = maxNextIdx; nextIdx >= minNextIdx; nextIdx--) {
|
|
493
|
-
const expectedBoundary = cumulativeOffsets[nextIdx] !== void 0 && cumulativeOffsets[currentFromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[nextIdx] - cumulativeOffsets[currentFromIdx] - startOffsetInCurrentPage) : remainingContent.length;
|
|
494
|
-
const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages);
|
|
495
|
-
if (pos > 0) return pos;
|
|
496
|
-
}
|
|
497
|
-
return remainingContent.length;
|
|
498
|
-
};
|
|
540
|
+
const expandTokens = (query) => expandTokensWithCaptures(query).pattern;
|
|
499
541
|
/**
|
|
500
|
-
*
|
|
542
|
+
* Converts a template string to a compiled RegExp.
|
|
501
543
|
*
|
|
502
|
-
*
|
|
503
|
-
*
|
|
544
|
+
* Expands all tokens and attempts to compile the result as a RegExp
|
|
545
|
+
* with Unicode flag. Returns `null` if the resulting pattern is invalid.
|
|
546
|
+
*
|
|
547
|
+
* @remarks
|
|
548
|
+
* This function dynamically compiles regular expressions from template strings.
|
|
549
|
+
* If templates may come from untrusted sources, be aware of potential ReDoS
|
|
550
|
+
* (Regular Expression Denial of Service) risks due to catastrophic backtracking.
|
|
551
|
+
* Consider validating pattern complexity or applying execution timeouts when
|
|
552
|
+
* running user-submitted patterns.
|
|
553
|
+
*
|
|
554
|
+
* @param template - Template string containing `{{token}}` placeholders
|
|
555
|
+
* @returns Compiled RegExp with 'u' flag, or `null` if invalid
|
|
556
|
+
*
|
|
557
|
+
* @example
|
|
558
|
+
* templateToRegex('، {{raqms}}') // → /، [٠-٩]+/u
|
|
559
|
+
* templateToRegex('{{raqms}}+') // → /[٠-٩]++/u (might be invalid in some engines)
|
|
560
|
+
* templateToRegex('(((') // → null (invalid regex)
|
|
504
561
|
*/
|
|
505
|
-
const
|
|
506
|
-
const
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
562
|
+
const templateToRegex = (template) => {
|
|
563
|
+
const expanded = expandTokens(template);
|
|
564
|
+
try {
|
|
565
|
+
return new RegExp(expanded, "u");
|
|
566
|
+
} catch {
|
|
567
|
+
return null;
|
|
511
568
|
}
|
|
512
|
-
return -1;
|
|
513
569
|
};
|
|
514
570
|
/**
|
|
515
|
-
*
|
|
571
|
+
* Lists all available token names defined in `TOKEN_PATTERNS`.
|
|
516
572
|
*
|
|
517
|
-
*
|
|
518
|
-
*
|
|
519
|
-
*
|
|
520
|
-
* @
|
|
521
|
-
*
|
|
573
|
+
* Useful for documentation, validation, or building user interfaces
|
|
574
|
+
* that show available tokens.
|
|
575
|
+
*
|
|
576
|
+
* @returns Array of token names (e.g., `['bab', 'basmala', 'bullet', ...]`)
|
|
577
|
+
*
|
|
578
|
+
* @example
|
|
579
|
+
* getAvailableTokens()
|
|
580
|
+
* // → ['bab', 'basmala', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
|
|
522
581
|
*/
|
|
523
|
-
const
|
|
524
|
-
if (excludeSet.size === 0) return false;
|
|
525
|
-
for (let pageIdx = fromIdx; pageIdx <= toIdx; pageIdx++) if (excludeSet.has(pageIds[pageIdx])) return true;
|
|
526
|
-
return false;
|
|
527
|
-
};
|
|
582
|
+
const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
|
|
528
583
|
/**
|
|
529
|
-
*
|
|
530
|
-
* Returns -1 if not found.
|
|
584
|
+
* Gets the regex pattern for a specific token name.
|
|
531
585
|
*
|
|
532
|
-
*
|
|
533
|
-
*
|
|
534
|
-
*
|
|
586
|
+
* Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
|
|
587
|
+
* without any expansion or capture group wrapping.
|
|
588
|
+
*
|
|
589
|
+
* @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
|
|
590
|
+
* @returns The regex pattern string, or `undefined` if token doesn't exist
|
|
591
|
+
*
|
|
592
|
+
* @example
|
|
593
|
+
* getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
|
|
594
|
+
* getTokenPattern('dash') // → '[-–—ـ]'
|
|
595
|
+
* getTokenPattern('unknown') // → undefined
|
|
535
596
|
*/
|
|
536
|
-
const
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
597
|
+
const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
|
|
598
|
+
/**
|
|
599
|
+
* Regex to detect fuzzy-default tokens in a pattern string.
|
|
600
|
+
* Matches {{token}} or {{token:name}} syntax.
|
|
601
|
+
*/
|
|
602
|
+
const FUZZY_TOKEN_REGEX = new RegExp(`\\{\\{(?:${[
|
|
603
|
+
"bab",
|
|
604
|
+
"basmalah",
|
|
605
|
+
"fasl",
|
|
606
|
+
"kitab",
|
|
607
|
+
"naql"
|
|
608
|
+
].join("|")})(?::\\w+)?\\}\\}`, "g");
|
|
609
|
+
/**
|
|
610
|
+
* Checks if a pattern (or array of patterns) contains tokens that should
|
|
611
|
+
* default to fuzzy matching.
|
|
612
|
+
*
|
|
613
|
+
* Fuzzy-default tokens are: bab, basmalah, fasl, kitab, naql
|
|
614
|
+
*
|
|
615
|
+
* @param patterns - Single pattern string or array of pattern strings
|
|
616
|
+
* @returns `true` if any pattern contains a fuzzy-default token
|
|
617
|
+
*
|
|
618
|
+
* @example
|
|
619
|
+
* shouldDefaultToFuzzy('{{bab}} الإيمان') // true
|
|
620
|
+
* shouldDefaultToFuzzy('{{raqms}} {{dash}}') // false
|
|
621
|
+
* shouldDefaultToFuzzy(['{{kitab}}', '{{raqms}}']) // true
|
|
622
|
+
*/
|
|
623
|
+
const shouldDefaultToFuzzy = (patterns) => {
|
|
624
|
+
return (Array.isArray(patterns) ? patterns : [patterns]).some((p) => {
|
|
625
|
+
FUZZY_TOKEN_REGEX.lastIndex = 0;
|
|
626
|
+
return FUZZY_TOKEN_REGEX.test(p);
|
|
627
|
+
});
|
|
541
628
|
};
|
|
629
|
+
|
|
630
|
+
//#endregion
|
|
631
|
+
//#region src/segmentation/pattern-validator.ts
|
|
542
632
|
/**
|
|
543
|
-
*
|
|
633
|
+
* Pattern validation utilities for detecting common mistakes in rule patterns.
|
|
544
634
|
*
|
|
545
|
-
*
|
|
546
|
-
*
|
|
547
|
-
* @param prefer - 'longer' for last match, 'shorter' for first match
|
|
548
|
-
* @returns Break position after the selected match, or -1 if no matches
|
|
635
|
+
* These utilities help catch typos and issues early, before rules are used
|
|
636
|
+
* for segmentation.
|
|
549
637
|
*/
|
|
550
|
-
const
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
index: m.index,
|
|
556
|
-
length: m[0].length
|
|
557
|
-
};
|
|
558
|
-
if (!first) first = match;
|
|
559
|
-
last = match;
|
|
560
|
-
}
|
|
561
|
-
if (!first) return -1;
|
|
562
|
-
const selected = prefer === "longer" ? last : first;
|
|
563
|
-
return selected.index + selected.length;
|
|
638
|
+
const KNOWN_TOKENS = new Set(getAvailableTokens());
|
|
639
|
+
const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
|
|
640
|
+
const buildBareTokenRegex = () => {
|
|
641
|
+
const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
|
|
642
|
+
return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
|
|
564
643
|
};
|
|
565
644
|
/**
|
|
566
|
-
*
|
|
567
|
-
* Returns break position or -1 if no valid position found.
|
|
645
|
+
* Validates a single pattern for common issues.
|
|
568
646
|
*/
|
|
569
|
-
const
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
647
|
+
const validatePattern = (pattern, seenPatterns) => {
|
|
648
|
+
if (seenPatterns.has(pattern)) return {
|
|
649
|
+
message: `Duplicate pattern: "${pattern}"`,
|
|
650
|
+
type: "duplicate"
|
|
651
|
+
};
|
|
652
|
+
seenPatterns.add(pattern);
|
|
653
|
+
const tokensInBraces = [...pattern.matchAll(TOKEN_INSIDE_BRACES)];
|
|
654
|
+
for (const match of tokensInBraces) {
|
|
655
|
+
const tokenName = match[1];
|
|
656
|
+
if (!KNOWN_TOKENS.has(tokenName)) return {
|
|
657
|
+
message: `Unknown token: {{${tokenName}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
|
|
658
|
+
suggestion: `Check spelling or use a known token`,
|
|
659
|
+
type: "unknown_token"
|
|
660
|
+
};
|
|
661
|
+
}
|
|
662
|
+
const bareTokenRegex = buildBareTokenRegex();
|
|
663
|
+
const bareMatches = [...pattern.matchAll(bareTokenRegex)];
|
|
664
|
+
for (const match of bareMatches) {
|
|
665
|
+
const tokenName = match[1];
|
|
666
|
+
const fullMatch = match[0];
|
|
667
|
+
const matchIndex = match.index;
|
|
668
|
+
const before = pattern.slice(Math.max(0, matchIndex - 2), matchIndex);
|
|
669
|
+
const after = pattern.slice(matchIndex + fullMatch.length, matchIndex + fullMatch.length + 2);
|
|
670
|
+
if (before !== "{{" && after !== "}}") return {
|
|
671
|
+
message: `Token "${tokenName}" appears to be missing {{}}. Did you mean "{{${fullMatch}}}"?`,
|
|
672
|
+
suggestion: `{{${fullMatch}}}`,
|
|
673
|
+
type: "missing_braces"
|
|
674
|
+
};
|
|
577
675
|
}
|
|
578
|
-
return Math.min(windowEndPosition, remainingContent.length);
|
|
579
676
|
};
|
|
580
677
|
/**
|
|
581
|
-
*
|
|
582
|
-
* Returns the break position or -1 if no suitable break was found.
|
|
583
|
-
*
|
|
584
|
-
* @param remainingContent - Content remaining to be segmented
|
|
585
|
-
* @param currentFromIdx - Current starting page index
|
|
586
|
-
* @param toIdx - Ending page index
|
|
587
|
-
* @param windowEndIdx - Maximum window end index
|
|
588
|
-
* @param ctx - Breakpoint context with page data and patterns
|
|
589
|
-
* @returns Break position in the content, or -1 if no break found
|
|
678
|
+
* Validates an array of patterns, returning parallel array of issues.
|
|
590
679
|
*/
|
|
591
|
-
const
|
|
592
|
-
const
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
if (skipWhenRegex?.test(remainingContent)) continue;
|
|
597
|
-
if (regex === null) return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
|
|
598
|
-
const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
|
|
599
|
-
if (breakPos > 0) return breakPos;
|
|
600
|
-
}
|
|
601
|
-
return -1;
|
|
680
|
+
const validatePatternArray = (patterns) => {
|
|
681
|
+
const seenPatterns = /* @__PURE__ */ new Set();
|
|
682
|
+
const issues = patterns.map((p) => validatePattern(p, seenPatterns));
|
|
683
|
+
if (issues.every((i) => i === void 0)) return;
|
|
684
|
+
return issues;
|
|
602
685
|
};
|
|
603
|
-
|
|
604
|
-
//#endregion
|
|
605
|
-
//#region src/segmentation/breakpoint-processor.ts
|
|
606
686
|
/**
|
|
607
|
-
*
|
|
687
|
+
* Validates split rules for common pattern issues.
|
|
608
688
|
*
|
|
609
|
-
*
|
|
610
|
-
*
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
689
|
+
* Checks for:
|
|
690
|
+
* - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
|
|
691
|
+
* - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
|
|
692
|
+
* - Duplicate patterns within the same rule
|
|
693
|
+
*
|
|
694
|
+
* @param rules - Array of split rules to validate
|
|
695
|
+
* @returns Array parallel to input with validation results (undefined if no issues)
|
|
696
|
+
*
|
|
697
|
+
* @example
|
|
698
|
+
* const issues = validateRules([
|
|
699
|
+
* { lineStartsAfter: ['raqms:num'] }, // Missing braces
|
|
700
|
+
* { lineStartsWith: ['{{unknown}}'] }, // Unknown token
|
|
701
|
+
* ]);
|
|
702
|
+
* // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
|
|
703
|
+
* // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
|
|
704
|
+
*/
|
|
705
|
+
const validateRules = (rules) => {
|
|
706
|
+
return rules.map((rule) => {
|
|
707
|
+
const result = {};
|
|
708
|
+
let hasIssues = false;
|
|
709
|
+
if ("lineStartsWith" in rule && rule.lineStartsWith) {
|
|
710
|
+
const issues = validatePatternArray(rule.lineStartsWith);
|
|
711
|
+
if (issues) {
|
|
712
|
+
result.lineStartsWith = issues;
|
|
713
|
+
hasIssues = true;
|
|
714
|
+
}
|
|
715
|
+
}
|
|
716
|
+
if ("lineStartsAfter" in rule && rule.lineStartsAfter) {
|
|
717
|
+
const issues = validatePatternArray(rule.lineStartsAfter);
|
|
718
|
+
if (issues) {
|
|
719
|
+
result.lineStartsAfter = issues;
|
|
720
|
+
hasIssues = true;
|
|
721
|
+
}
|
|
722
|
+
}
|
|
723
|
+
if ("lineEndsWith" in rule && rule.lineEndsWith) {
|
|
724
|
+
const issues = validatePatternArray(rule.lineEndsWith);
|
|
725
|
+
if (issues) {
|
|
726
|
+
result.lineEndsWith = issues;
|
|
727
|
+
hasIssues = true;
|
|
728
|
+
}
|
|
729
|
+
}
|
|
730
|
+
if ("template" in rule && rule.template) {
|
|
731
|
+
const seenPatterns = /* @__PURE__ */ new Set();
|
|
732
|
+
const issue = validatePattern(rule.template, seenPatterns);
|
|
733
|
+
if (issue) {
|
|
734
|
+
result.template = issue;
|
|
735
|
+
hasIssues = true;
|
|
736
|
+
}
|
|
737
|
+
}
|
|
738
|
+
return hasIssues ? result : void 0;
|
|
739
|
+
});
|
|
624
740
|
};
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
741
|
+
|
|
742
|
+
//#endregion
|
|
743
|
+
//#region src/segmentation/replace.ts
|
|
744
|
+
const DEFAULT_REPLACE_FLAGS = "gu";
|
|
745
|
+
const normalizeReplaceFlags = (flags) => {
|
|
746
|
+
if (!flags) return DEFAULT_REPLACE_FLAGS;
|
|
747
|
+
const allowed = new Set([
|
|
748
|
+
"g",
|
|
749
|
+
"i",
|
|
750
|
+
"m",
|
|
751
|
+
"s",
|
|
752
|
+
"u",
|
|
753
|
+
"y"
|
|
754
|
+
]);
|
|
755
|
+
const set = /* @__PURE__ */ new Set();
|
|
756
|
+
for (const ch of flags) {
|
|
757
|
+
if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
|
|
758
|
+
set.add(ch);
|
|
633
759
|
}
|
|
634
|
-
|
|
760
|
+
set.add("g");
|
|
761
|
+
set.add("u");
|
|
762
|
+
return [
|
|
763
|
+
"g",
|
|
764
|
+
"i",
|
|
765
|
+
"m",
|
|
766
|
+
"s",
|
|
767
|
+
"y",
|
|
768
|
+
"u"
|
|
769
|
+
].filter((c) => set.has(c)).join("");
|
|
635
770
|
};
|
|
636
|
-
const
|
|
637
|
-
const
|
|
638
|
-
const
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
771
|
+
const compileReplaceRules = (rules) => {
|
|
772
|
+
const compiled = [];
|
|
773
|
+
for (const r of rules) {
|
|
774
|
+
if (r.pageIds && r.pageIds.length === 0) continue;
|
|
775
|
+
const flags = normalizeReplaceFlags(r.flags);
|
|
776
|
+
const re = new RegExp(r.regex, flags);
|
|
777
|
+
compiled.push({
|
|
778
|
+
pageIdSet: r.pageIds ? new Set(r.pageIds) : void 0,
|
|
779
|
+
re,
|
|
780
|
+
replacement: r.replacement
|
|
781
|
+
});
|
|
782
|
+
}
|
|
783
|
+
return compiled;
|
|
643
784
|
};
|
|
644
|
-
const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => pageIds[toIdx] - pageIds[currentFromIdx];
|
|
645
|
-
const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, includeMeta ? meta : void 0);
|
|
646
785
|
/**
|
|
647
|
-
*
|
|
648
|
-
* precomputed boundary positions and binary search.
|
|
786
|
+
* Applies ordered regex replacements to page content (per page).
|
|
649
787
|
*
|
|
650
|
-
*
|
|
651
|
-
*
|
|
652
|
-
*
|
|
653
|
-
*
|
|
654
|
-
*
|
|
655
|
-
*
|
|
788
|
+
* - Replacement rules are applied in array order.
|
|
789
|
+
* - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
|
|
790
|
+
* - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
|
|
791
|
+
*
|
|
792
|
+
* This function is intentionally **pure**:
|
|
793
|
+
* it returns a new pages array only when changes are needed, otherwise it returns the original pages.
|
|
656
794
|
*/
|
|
657
|
-
const
|
|
658
|
-
|
|
659
|
-
const
|
|
660
|
-
return
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
let nextFromIdx = actualEndIdx;
|
|
667
|
-
if (remainingContent && actualEndIdx + 1 <= toIdx) {
|
|
668
|
-
const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
|
|
669
|
-
if (nextPageData) {
|
|
670
|
-
const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
|
|
671
|
-
const remainingPrefix = remainingContent.trimStart().slice(0, Math.min(30, remainingContent.length));
|
|
672
|
-
if (nextPrefix && (remainingContent.startsWith(nextPrefix) || nextPageData.content.startsWith(remainingPrefix))) nextFromIdx = actualEndIdx + 1;
|
|
795
|
+
const applyReplacements = (pages, rules) => {
|
|
796
|
+
if (!rules || rules.length === 0 || pages.length === 0) return pages;
|
|
797
|
+
const compiled = compileReplaceRules(rules);
|
|
798
|
+
if (compiled.length === 0) return pages;
|
|
799
|
+
return pages.map((p) => {
|
|
800
|
+
let content = p.content;
|
|
801
|
+
for (const rule of compiled) {
|
|
802
|
+
if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
|
|
803
|
+
content = content.replace(rule.re, rule.replacement);
|
|
673
804
|
}
|
|
674
|
-
|
|
675
|
-
|
|
805
|
+
if (content === p.content) return p;
|
|
806
|
+
return {
|
|
807
|
+
...p,
|
|
808
|
+
content
|
|
809
|
+
};
|
|
810
|
+
});
|
|
676
811
|
};
|
|
677
|
-
|
|
812
|
+
|
|
813
|
+
//#endregion
|
|
814
|
+
//#region src/segmentation/breakpoint-utils.ts
|
|
815
|
+
const WINDOW_PREFIX_LENGTHS = [
|
|
816
|
+
80,
|
|
817
|
+
60,
|
|
818
|
+
40,
|
|
819
|
+
30,
|
|
820
|
+
20,
|
|
821
|
+
15
|
|
822
|
+
];
|
|
823
|
+
const JOINER_PREFIX_LENGTHS = [
|
|
824
|
+
80,
|
|
825
|
+
60,
|
|
826
|
+
40,
|
|
827
|
+
30,
|
|
828
|
+
20,
|
|
829
|
+
15,
|
|
830
|
+
12,
|
|
831
|
+
10,
|
|
832
|
+
8,
|
|
833
|
+
6
|
|
834
|
+
];
|
|
678
835
|
/**
|
|
679
|
-
*
|
|
836
|
+
* Normalizes a breakpoint to the object form.
|
|
837
|
+
* Strings are converted to { pattern: str } with no constraints.
|
|
680
838
|
*
|
|
681
|
-
* @
|
|
839
|
+
* @param bp - Breakpoint as string or object
|
|
840
|
+
* @returns Normalized BreakpointRule object
|
|
841
|
+
*
|
|
842
|
+
* @example
|
|
843
|
+
* normalizeBreakpoint('\\n\\n')
|
|
844
|
+
* // → { pattern: '\\n\\n' }
|
|
845
|
+
*
|
|
846
|
+
* normalizeBreakpoint({ pattern: '\\n', min: 10 })
|
|
847
|
+
* // → { pattern: '\\n', min: 10 }
|
|
682
848
|
*/
|
|
683
|
-
const
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
849
|
+
const normalizeBreakpoint = (bp) => typeof bp === "string" ? { pattern: bp } : bp;
|
|
850
|
+
/**
|
|
851
|
+
* Checks if a page ID is in an excluded list (single pages or ranges).
|
|
852
|
+
*
|
|
853
|
+
* @param pageId - Page ID to check
|
|
854
|
+
* @param excludeList - List of page IDs or [from, to] ranges to exclude
|
|
855
|
+
* @returns True if page is excluded
|
|
856
|
+
*
|
|
857
|
+
* @example
|
|
858
|
+
* isPageExcluded(5, [1, 5, 10])
|
|
859
|
+
* // → true
|
|
860
|
+
*
|
|
861
|
+
* isPageExcluded(5, [[3, 7]])
|
|
862
|
+
* // → true
|
|
863
|
+
*
|
|
864
|
+
* isPageExcluded(5, [[10, 20]])
|
|
865
|
+
* // → false
|
|
866
|
+
*/
|
|
867
|
+
const isPageExcluded = (pageId, excludeList) => {
|
|
868
|
+
if (!excludeList || excludeList.length === 0) return false;
|
|
869
|
+
for (const item of excludeList) if (typeof item === "number") {
|
|
870
|
+
if (pageId === item) return true;
|
|
871
|
+
} else {
|
|
872
|
+
const [from, to] = item;
|
|
873
|
+
if (pageId >= from && pageId <= to) return true;
|
|
687
874
|
}
|
|
688
|
-
|
|
689
|
-
expandedBreakpoints,
|
|
690
|
-
normalizedPages,
|
|
691
|
-
pageIds,
|
|
692
|
-
prefer
|
|
693
|
-
});
|
|
694
|
-
return patternBreak > 0 ? patternBreak : windowEndPosition;
|
|
875
|
+
return false;
|
|
695
876
|
};
|
|
696
877
|
/**
|
|
697
|
-
*
|
|
878
|
+
* Checks if a page ID is within a breakpoint's min/max range and not excluded.
|
|
879
|
+
*
|
|
880
|
+
* @param pageId - Page ID to check
|
|
881
|
+
* @param rule - Breakpoint rule with optional min/max/exclude constraints
|
|
882
|
+
* @returns True if page is within valid range
|
|
883
|
+
*
|
|
884
|
+
* @example
|
|
885
|
+
* isInBreakpointRange(50, { pattern: '\\n', min: 10, max: 100 })
|
|
886
|
+
* // → true
|
|
887
|
+
*
|
|
888
|
+
* isInBreakpointRange(5, { pattern: '\\n', min: 10 })
|
|
889
|
+
* // → false (below min)
|
|
698
890
|
*/
|
|
699
|
-
const
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
return
|
|
891
|
+
const isInBreakpointRange = (pageId, rule) => {
|
|
892
|
+
if (rule.min !== void 0 && pageId < rule.min) return false;
|
|
893
|
+
if (rule.max !== void 0 && pageId > rule.max) return false;
|
|
894
|
+
return !isPageExcluded(pageId, rule.exclude);
|
|
703
895
|
};
|
|
704
896
|
/**
|
|
705
|
-
*
|
|
706
|
-
* breaking it into smaller pieces that fit within maxPages constraints.
|
|
897
|
+
* Builds an exclude set from a PageRange array for O(1) lookups.
|
|
707
898
|
*
|
|
708
|
-
*
|
|
899
|
+
* @param excludeList - List of page IDs or [from, to] ranges
|
|
900
|
+
* @returns Set of all excluded page IDs
|
|
901
|
+
*
|
|
902
|
+
* @remarks
|
|
903
|
+
* This expands ranges into explicit page IDs for fast membership checks. For typical
|
|
904
|
+
* book-scale inputs (thousands of pages), this is small and keeps downstream logic
|
|
905
|
+
* simple and fast. If you expect extremely large ranges (e.g., millions of pages),
|
|
906
|
+
* consider avoiding broad excludes or introducing a range-based membership structure.
|
|
907
|
+
*
|
|
908
|
+
* @example
|
|
909
|
+
* buildExcludeSet([1, 5, [10, 12]])
|
|
910
|
+
* // → Set { 1, 5, 10, 11, 12 }
|
|
709
911
|
*/
|
|
710
|
-
const
|
|
711
|
-
const
|
|
712
|
-
const
|
|
713
|
-
let
|
|
714
|
-
|
|
715
|
-
let isFirstPiece = true;
|
|
716
|
-
const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
|
|
717
|
-
logger?.debug?.("[breakpoints] boundaryPositions built", {
|
|
718
|
-
boundaryPositions,
|
|
719
|
-
fromIdx,
|
|
720
|
-
fullContentLength: fullContent.length,
|
|
721
|
-
toIdx
|
|
722
|
-
});
|
|
723
|
-
const maxIterations = 1e4;
|
|
724
|
-
for (let i = 0; i < maxIterations && cursorPos < fullContent.length && currentFromIdx <= toIdx; i++) {
|
|
725
|
-
const remainingContent = fullContent.slice(cursorPos);
|
|
726
|
-
if (!remainingContent.trim()) break;
|
|
727
|
-
const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
|
|
728
|
-
const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
|
|
729
|
-
if (remainingSpan <= maxPages && !remainingHasExclusions) {
|
|
730
|
-
const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
|
|
731
|
-
if (finalSeg) result.push(finalSeg);
|
|
732
|
-
break;
|
|
733
|
-
}
|
|
734
|
-
const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
|
|
735
|
-
const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
|
|
736
|
-
logger?.debug?.(`[breakpoints] iteration=${i}`, {
|
|
737
|
-
currentFromIdx,
|
|
738
|
-
cursorPos,
|
|
739
|
-
windowEndIdx
|
|
740
|
-
});
|
|
741
|
-
const breakOffset = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
|
|
742
|
-
const breakPos = cursorPos + breakOffset;
|
|
743
|
-
const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
|
|
744
|
-
const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
|
|
745
|
-
logger?.trace?.("[breakpoints] piece", {
|
|
746
|
-
actualEndIdx,
|
|
747
|
-
actualStartIdx,
|
|
748
|
-
pieceLength: pieceContent.length
|
|
749
|
-
});
|
|
750
|
-
if (pieceContent) {
|
|
751
|
-
const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
|
|
752
|
-
if (pieceSeg) result.push(pieceSeg);
|
|
753
|
-
}
|
|
754
|
-
cursorPos = skipWhitespace(fullContent, breakPos);
|
|
755
|
-
currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
|
|
756
|
-
isFirstPiece = false;
|
|
757
|
-
}
|
|
758
|
-
logger?.debug?.("[breakpoints] done", { resultCount: result.length });
|
|
759
|
-
return result;
|
|
912
|
+
const buildExcludeSet = (excludeList) => {
|
|
913
|
+
const excludeSet = /* @__PURE__ */ new Set();
|
|
914
|
+
for (const item of excludeList || []) if (typeof item === "number") excludeSet.add(item);
|
|
915
|
+
else for (let i = item[0]; i <= item[1]; i++) excludeSet.add(i);
|
|
916
|
+
return excludeSet;
|
|
760
917
|
};
|
|
761
918
|
/**
|
|
762
|
-
*
|
|
919
|
+
* Creates a segment with optional to and meta fields.
|
|
920
|
+
* Returns null if content is empty after trimming.
|
|
763
921
|
*
|
|
764
|
-
*
|
|
922
|
+
* @param content - Segment content
|
|
923
|
+
* @param fromPageId - Starting page ID
|
|
924
|
+
* @param toPageId - Optional ending page ID (omitted if same as from)
|
|
925
|
+
* @param meta - Optional metadata to attach
|
|
926
|
+
* @returns Segment object or null if empty
|
|
927
|
+
*
|
|
928
|
+
* @example
|
|
929
|
+
* createSegment('Hello world', 1, 3, { chapter: 1 })
|
|
930
|
+
* // → { content: 'Hello world', from: 1, to: 3, meta: { chapter: 1 } }
|
|
931
|
+
*
|
|
932
|
+
* createSegment(' ', 1, undefined, undefined)
|
|
933
|
+
* // → null (empty content)
|
|
765
934
|
*/
|
|
766
|
-
const
|
|
767
|
-
const
|
|
768
|
-
|
|
769
|
-
const
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
});
|
|
777
|
-
logger?.debug?.("[breakpoints] inputSegments", {
|
|
778
|
-
segmentCount: segments.length,
|
|
779
|
-
segments: segments.map((s) => ({
|
|
780
|
-
contentLength: s.content.length,
|
|
781
|
-
from: s.from,
|
|
782
|
-
to: s.to
|
|
783
|
-
}))
|
|
784
|
-
});
|
|
785
|
-
for (const segment of segments) {
|
|
786
|
-
const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
|
|
787
|
-
const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
|
|
788
|
-
const segmentSpan = (segment.to ?? segment.from) - segment.from;
|
|
789
|
-
const hasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, fromIdx, toIdx);
|
|
790
|
-
if (segmentSpan <= maxPages && !hasExclusions) {
|
|
791
|
-
result.push(segment);
|
|
792
|
-
continue;
|
|
793
|
-
}
|
|
794
|
-
const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
|
|
795
|
-
result.push(...broken.map((s) => {
|
|
796
|
-
const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
|
|
797
|
-
const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
|
|
798
|
-
if (segFromIdx >= 0 && segToIdx > segFromIdx) return {
|
|
799
|
-
...s,
|
|
800
|
-
content: applyPageJoinerBetweenPages(s.content, segFromIdx, segToIdx, pageIds, normalizedPages, pageJoiner)
|
|
801
|
-
};
|
|
802
|
-
return s;
|
|
803
|
-
}));
|
|
804
|
-
}
|
|
805
|
-
logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
|
|
806
|
-
return result;
|
|
935
|
+
const createSegment = (content, fromPageId, toPageId, meta) => {
|
|
936
|
+
const trimmed = content.trim();
|
|
937
|
+
if (!trimmed) return null;
|
|
938
|
+
const seg = {
|
|
939
|
+
content: trimmed,
|
|
940
|
+
from: fromPageId
|
|
941
|
+
};
|
|
942
|
+
if (toPageId !== void 0 && toPageId !== fromPageId) seg.to = toPageId;
|
|
943
|
+
if (meta) seg.meta = meta;
|
|
944
|
+
return seg;
|
|
807
945
|
};
|
|
808
|
-
|
|
809
|
-
//#endregion
|
|
810
|
-
//#region src/segmentation/match-utils.ts
|
|
811
946
|
/**
|
|
812
|
-
*
|
|
947
|
+
* Expands breakpoint patterns and pre-computes exclude sets.
|
|
813
948
|
*
|
|
814
|
-
*
|
|
815
|
-
*
|
|
816
|
-
*
|
|
949
|
+
* @param breakpoints - Array of breakpoint patterns or rules
|
|
950
|
+
* @param processPattern - Function to expand tokens in patterns
|
|
951
|
+
* @returns Array of expanded breakpoints with compiled regexes
|
|
817
952
|
*
|
|
818
|
-
* @
|
|
953
|
+
* @remarks
|
|
954
|
+
* This function compiles regex patterns dynamically. This can be a ReDoS vector
|
|
955
|
+
* if patterns come from untrusted sources. In typical usage, breakpoint rules
|
|
956
|
+
* are application configuration, not user input.
|
|
819
957
|
*/
|
|
958
|
+
const expandBreakpoints = (breakpoints, processPattern$1) => breakpoints.map((bp) => {
|
|
959
|
+
const rule = normalizeBreakpoint(bp);
|
|
960
|
+
const excludeSet = buildExcludeSet(rule.exclude);
|
|
961
|
+
const skipWhenRegex = rule.skipWhen !== void 0 ? (() => {
|
|
962
|
+
const expandedSkip = processPattern$1(rule.skipWhen);
|
|
963
|
+
try {
|
|
964
|
+
return new RegExp(expandedSkip, "mu");
|
|
965
|
+
} catch (error) {
|
|
966
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
967
|
+
throw new Error(`Invalid breakpoint skipWhen regex: ${rule.skipWhen}\n Cause: ${message}`);
|
|
968
|
+
}
|
|
969
|
+
})() : null;
|
|
970
|
+
if (rule.pattern === "") return {
|
|
971
|
+
excludeSet,
|
|
972
|
+
regex: null,
|
|
973
|
+
rule,
|
|
974
|
+
skipWhenRegex
|
|
975
|
+
};
|
|
976
|
+
const expanded = processPattern$1(rule.pattern);
|
|
977
|
+
try {
|
|
978
|
+
return {
|
|
979
|
+
excludeSet,
|
|
980
|
+
regex: new RegExp(expanded, "gmu"),
|
|
981
|
+
rule,
|
|
982
|
+
skipWhenRegex
|
|
983
|
+
};
|
|
984
|
+
} catch (error) {
|
|
985
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
986
|
+
throw new Error(`Invalid breakpoint regex: ${rule.pattern}\n Cause: ${message}`);
|
|
987
|
+
}
|
|
988
|
+
});
|
|
820
989
|
/**
|
|
821
|
-
*
|
|
822
|
-
*
|
|
823
|
-
* Only includes groups that are in the `captureNames` list and have
|
|
824
|
-
* defined values. This filters out positional captures and ensures
|
|
825
|
-
* only explicitly requested named captures are returned.
|
|
826
|
-
*
|
|
827
|
-
* @param groups - The `match.groups` object from `RegExp.exec()`
|
|
828
|
-
* @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
|
|
829
|
-
* @returns Object with capture name → value pairs, or `undefined` if none found
|
|
830
|
-
*
|
|
831
|
-
* @example
|
|
832
|
-
* const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
|
|
833
|
-
* extractNamedCaptures(match.groups, ['num'])
|
|
834
|
-
* // → { num: '٦٦٩٦' }
|
|
990
|
+
* Applies a configured joiner at detected page boundaries within a multi-page content chunk.
|
|
835
991
|
*
|
|
836
|
-
*
|
|
837
|
-
*
|
|
838
|
-
*
|
|
839
|
-
*
|
|
992
|
+
* This is used for breakpoint-generated segments which don't have access to the original
|
|
993
|
+
* `pageMap.pageBreaks` offsets. We detect page starts sequentially by searching for each page's
|
|
994
|
+
* prefix after the previous boundary, then replace ONLY the single newline immediately before
|
|
995
|
+
* that page start.
|
|
840
996
|
*
|
|
841
|
-
*
|
|
842
|
-
* // Undefined groups
|
|
843
|
-
* extractNamedCaptures(undefined, ['num'])
|
|
844
|
-
* // → undefined
|
|
997
|
+
* This avoids converting real in-page newlines, while still normalizing page joins consistently.
|
|
845
998
|
*/
|
|
846
|
-
const
|
|
847
|
-
if (
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
999
|
+
const applyPageJoinerBetweenPages = (content, fromIdx, toIdx, pageIds, normalizedPages, joiner) => {
|
|
1000
|
+
if (joiner === "newline" || fromIdx >= toIdx || !content.includes("\n")) return content;
|
|
1001
|
+
let updated = content;
|
|
1002
|
+
let searchFrom = 0;
|
|
1003
|
+
for (let pi = fromIdx + 1; pi <= toIdx; pi++) {
|
|
1004
|
+
const pageData = normalizedPages.get(pageIds[pi]);
|
|
1005
|
+
if (!pageData) continue;
|
|
1006
|
+
const found = findPrefixPositionInContent(updated, pageData.content.trimStart(), searchFrom);
|
|
1007
|
+
if (found > 0 && updated[found - 1] === "\n") updated = `${updated.slice(0, found - 1)} ${updated.slice(found)}`;
|
|
1008
|
+
if (found > 0) searchFrom = found;
|
|
1009
|
+
}
|
|
1010
|
+
return updated;
|
|
851
1011
|
};
|
|
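A standalone sketch of the join normalization described above: only the single newline immediately before the detected page prefix is replaced, while in-page newlines are left alone (the strings are invented for illustration):
const content = 'line one\nline two\nNEXT PAGE starts here';
const prefix = 'NEXT PAGE';
const at = content.indexOf(prefix);
const joined = at > 0 && content[at - 1] === '\n'
    ? `${content.slice(0, at - 1)} ${content.slice(at)}`
    : content;
console.log(joined); // 'line one\nline two NEXT PAGE starts here'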
852
1012
|
/**
|
|
853
|
-
*
|
|
854
|
-
*
|
|
855
|
-
* Used for `lineStartsAfter` patterns where the content capture (`.*`)
|
|
856
|
-
* is always at the end of the pattern. Named captures may shift the
|
|
857
|
-
* positional indices, so we iterate backward to find the actual content.
|
|
858
|
-
*
|
|
859
|
-
* @param match - RegExp exec result array
|
|
860
|
-
* @returns The last defined capture group value, or `undefined` if none
|
|
861
|
-
*
|
|
862
|
-
* @example
|
|
863
|
-
* // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
|
|
864
|
-
* // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
|
|
865
|
-
* getLastPositionalCapture(match)
|
|
866
|
-
* // → 'content'
|
|
867
|
-
*
|
|
868
|
-
* @example
|
|
869
|
-
* // No captures
|
|
870
|
-
* getLastPositionalCapture(['full match'])
|
|
871
|
-
* // → undefined
|
|
1013
|
+
* Finds the position of a page prefix in content, trying multiple prefix lengths.
|
|
872
1014
|
*/
|
|
873
|
-
const
|
|
874
|
-
|
|
875
|
-
|
|
1015
|
+
const findPrefixPositionInContent = (content, trimmedPageContent, searchFrom) => {
|
|
1016
|
+
for (const len of JOINER_PREFIX_LENGTHS) {
|
|
1017
|
+
const prefix = trimmedPageContent.slice(0, Math.min(len, trimmedPageContent.length)).trim();
|
|
1018
|
+
if (!prefix) continue;
|
|
1019
|
+
const pos = content.indexOf(prefix, searchFrom);
|
|
1020
|
+
if (pos > 0) return pos;
|
|
1021
|
+
}
|
|
1022
|
+
return -1;
|
|
876
1023
|
};
|
|
877
1024
|
/**
|
|
878
|
-
*
|
|
879
|
-
*
|
|
880
|
-
* Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
|
|
881
|
-
* matches that occur on pages outside the allowed range or explicitly excluded.
|
|
882
|
-
*
|
|
883
|
-
* @param matches - Array of match results to filter
|
|
884
|
-
* @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
|
|
885
|
-
* @param getId - Function that returns the page ID for a given offset
|
|
886
|
-
* @returns Filtered array containing only matches within constraints
|
|
1025
|
+
* Estimates the offset within the current page at which `remainingContent` begins.
|
|
887
1026
|
*
|
|
888
|
-
*
|
|
889
|
-
*
|
|
890
|
-
*
|
|
891
|
-
*
|
|
892
|
-
* { start: 200, end: 210 }, // Page 10
|
|
893
|
-
* ];
|
|
894
|
-
* filterByConstraints(matches, { min: 3, max: 8 }, getId)
|
|
895
|
-
* // → [{ start: 100, end: 110 }] (only page 5 match)
|
|
1027
|
+
* During breakpoint processing, `remainingContent` can begin mid-page after a previous split.
|
|
1028
|
+
* When that happens, raw cumulative page offsets (computed from full page starts) can overestimate
|
|
1029
|
+
* expected boundary positions. This helper computes an approximate starting offset by matching
|
|
1030
|
+
* a short prefix of `remainingContent` inside the current page content.
|
|
896
1031
|
*/
|
|
897
|
-
const
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
1032
|
+
const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, pageIds, normalizedPages) => {
|
|
1033
|
+
const currentPageData = normalizedPages.get(pageIds[currentFromIdx]);
|
|
1034
|
+
if (!currentPageData) return 0;
|
|
1035
|
+
const remStart = remainingContent.trimStart().slice(0, Math.min(60, remainingContent.length));
|
|
1036
|
+
const needle = remStart.slice(0, Math.min(30, remStart.length));
|
|
1037
|
+
if (!needle) return 0;
|
|
1038
|
+
const idx = currentPageData.content.indexOf(needle);
|
|
1039
|
+
return idx > 0 ? idx : 0;
|
|
905
1040
|
};
|
|
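A standalone sketch of the mid-page correction this helper performs: a short prefix of the remaining content is located inside the current page's content to estimate where that content starts (example strings are invented):
const pageContent = 'intro text. actual remaining part of the page';
const remaining = 'actual remaining part of the page';
const needle = remaining.trimStart().slice(0, 30);
const idx = pageContent.indexOf(needle);
console.log(idx > 0 ? idx : 0); // 12 → the remaining content starts 12 characters into the page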
906
1041
|
/**
|
|
907
|
-
*
|
|
1042
|
+
* Attempts to find the start position of a target page within remainingContent,
|
|
1043
|
+
* anchored near an expected boundary position to reduce collisions.
|
|
908
1044
|
*
|
|
909
|
-
*
|
|
910
|
-
*
|
|
1045
|
+
* This is used to define breakpoint windows in terms of actual content being split, rather than
|
|
1046
|
+
* raw per-page offsets which can desync when structural rules strip markers.
|
|
1047
|
+
*/
|
|
1048
|
+
const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
|
|
1049
|
+
const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
|
|
1050
|
+
if (!targetPageData) return -1;
|
|
1051
|
+
const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
|
|
1052
|
+
const searchStart = Math.max(0, approx - 1e4);
|
|
1053
|
+
const searchEnd = Math.min(remainingContent.length, approx + 2e3);
|
|
1054
|
+
const targetTrimmed = targetPageData.content.trimStart();
|
|
1055
|
+
for (const len of WINDOW_PREFIX_LENGTHS) {
|
|
1056
|
+
const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
|
|
1057
|
+
if (!prefix) continue;
|
|
1058
|
+
let pos = remainingContent.indexOf(prefix, searchStart);
|
|
1059
|
+
while (pos !== -1 && pos <= searchEnd) {
|
|
1060
|
+
if (pos > 0 && /\s/.test(remainingContent[pos - 1] ?? "")) return pos;
|
|
1061
|
+
pos = remainingContent.indexOf(prefix, pos + 1);
|
|
1062
|
+
}
|
|
1063
|
+
const last = remainingContent.lastIndexOf(prefix, approx);
|
|
1064
|
+
if (last > 0) return last;
|
|
1065
|
+
}
|
|
1066
|
+
return -1;
|
|
1067
|
+
};
|
|
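A standalone sketch of the anchored search: the next page's prefix is looked up only near the estimated boundary, and a hit is accepted only when it is preceded by whitespace (values are illustrative):
const remaining = 'tail of page one باب ... body of page two';
const prefix = 'باب';
const expected = 18; // rough boundary estimate from cumulative offsets
const pos = remaining.indexOf(prefix, Math.max(0, expected - 10000));
const accepted = pos !== -1 && /\s/.test(remaining[pos - 1] ?? '');
console.log(accepted ? pos : -1); // 17 → accepted: near the estimate and preceded by whitespace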
1068
|
+
/**
|
|
1069
|
+
* Builds a boundary position map for pages within the given range.
|
|
911
1070
|
*
|
|
912
|
-
* This
|
|
913
|
-
*
|
|
1071
|
+
* This function computes page boundaries once per segment and enables
|
|
1072
|
+
* O(log n) page lookups via binary search with `findPageIndexForPosition`.
|
|
914
1073
|
*
|
|
915
|
-
*
|
|
916
|
-
*
|
|
917
|
-
*
|
|
1074
|
+
* Boundaries are derived from segmentContent (post-structural-rules).
|
|
1075
|
+
* When the segment starts mid-page, an offset correction is applied to
|
|
1076
|
+
* keep boundary estimates aligned with the segment's actual content space.
|
|
1077
|
+
*
|
|
1078
|
+
* @param segmentContent - Full segment content (already processed by structural rules)
|
|
1079
|
+
* @param fromIdx - Starting page index
|
|
1080
|
+
* @param toIdx - Ending page index
|
|
1081
|
+
* @param pageIds - Array of all page IDs
|
|
1082
|
+
* @param normalizedPages - Map of page ID to normalized content
|
|
1083
|
+
* @param cumulativeOffsets - Cumulative character offsets (for estimates)
|
|
1084
|
+
* @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
|
|
1085
|
+
* with a sentinel boundary at segmentContent.length as the last element
|
|
918
1086
|
*
|
|
919
1087
|
* @example
|
|
920
|
-
*
|
|
921
|
-
*
|
|
922
|
-
*
|
|
923
|
-
|
|
1088
|
+
* // For a 3-page segment:
|
|
1089
|
+
* buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
|
|
1090
|
+
* // → [0, 23, 45, 67] where 67 is content.length (sentinel)
|
|
1091
|
+
*/
|
|
1092
|
+
const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
|
|
1093
|
+
const boundaryPositions = [0];
|
|
1094
|
+
const startOffsetInFromPage = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
|
|
1095
|
+
for (let i = fromIdx + 1; i <= toIdx; i++) {
|
|
1096
|
+
const expectedBoundary = cumulativeOffsets[i] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[i] - cumulativeOffsets[fromIdx] - startOffsetInFromPage) : segmentContent.length;
|
|
1097
|
+
const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages);
|
|
1098
|
+
const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
|
|
1099
|
+
if (pos > 0 && pos > prevBoundary && Math.abs(pos - expectedBoundary) < 2e3) boundaryPositions.push(pos);
|
|
1100
|
+
else {
|
|
1101
|
+
const estimate = Math.max(prevBoundary + 1, expectedBoundary);
|
|
1102
|
+
boundaryPositions.push(Math.min(estimate, segmentContent.length));
|
|
1103
|
+
}
|
|
1104
|
+
}
|
|
1105
|
+
boundaryPositions.push(segmentContent.length);
|
|
1106
|
+
return boundaryPositions;
|
|
1107
|
+
};
|
|
1108
|
+
/**
|
|
1109
|
+
* Binary search to find which page a position falls within.
|
|
1110
|
+
* Uses "largest i where boundaryPositions[i] <= position" semantics.
|
|
924
1111
|
*
|
|
925
|
-
*
|
|
926
|
-
*
|
|
927
|
-
*
|
|
1112
|
+
* @param position - Character position in segmentContent
|
|
1113
|
+
* @param boundaryPositions - Precomputed boundary positions (from buildBoundaryPositions)
|
|
1114
|
+
* @param fromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[fromIdx])
|
|
1115
|
+
* @returns Page index in pageIds array
|
|
928
1116
|
*
|
|
929
1117
|
* @example
|
|
930
|
-
* //
|
|
931
|
-
*
|
|
1118
|
+
* // With boundaries [0, 20, 40, 60] and fromIdx=0:
|
|
1119
|
+
* findPageIndexForPosition(15, boundaries, 0) // → 0 (first page)
|
|
1120
|
+
* findPageIndexForPosition(25, boundaries, 0) // → 1 (second page)
|
|
1121
|
+
* findPageIndexForPosition(40, boundaries, 0) // → 2 (exactly on boundary = that page)
|
|
932
1122
|
*/
|
|
933
|
-
const
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
//#endregion
|
|
942
|
-
//#region src/segmentation/replace.ts
|
|
943
|
-
const DEFAULT_REPLACE_FLAGS = "gu";
|
|
944
|
-
const normalizeReplaceFlags = (flags) => {
|
|
945
|
-
if (!flags) return DEFAULT_REPLACE_FLAGS;
|
|
946
|
-
const allowed = new Set([
|
|
947
|
-
"g",
|
|
948
|
-
"i",
|
|
949
|
-
"m",
|
|
950
|
-
"s",
|
|
951
|
-
"u",
|
|
952
|
-
"y"
|
|
953
|
-
]);
|
|
954
|
-
const set = /* @__PURE__ */ new Set();
|
|
955
|
-
for (const ch of flags) {
|
|
956
|
-
if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
|
|
957
|
-
set.add(ch);
|
|
1123
|
+
const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
|
|
1124
|
+
if (boundaryPositions.length <= 1) return fromIdx;
|
|
1125
|
+
let left = 0;
|
|
1126
|
+
let right = boundaryPositions.length - 2;
|
|
1127
|
+
while (left < right) {
|
|
1128
|
+
const mid = Math.ceil((left + right) / 2);
|
|
1129
|
+
if (boundaryPositions[mid] <= position) left = mid;
|
|
1130
|
+
else right = mid - 1;
|
|
958
1131
|
}
|
|
959
|
-
|
|
960
|
-
set.add("u");
|
|
961
|
-
return [
|
|
962
|
-
"g",
|
|
963
|
-
"i",
|
|
964
|
-
"m",
|
|
965
|
-
"s",
|
|
966
|
-
"y",
|
|
967
|
-
"u"
|
|
968
|
-
].filter((c) => set.has(c)).join("");
|
|
1132
|
+
return fromIdx + left;
|
|
969
1133
|
};
|
|
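A standalone sketch of the binary-search semantics ("largest i where boundary[i] <= position"), using the boundary array from the example above:
const boundaries = [0, 20, 40, 60]; // last entry is the sentinel (content length)
const pageFor = (position) => {
    let left = 0;
    let right = boundaries.length - 2;
    while (left < right) {
        const mid = Math.ceil((left + right) / 2);
        if (boundaries[mid] <= position) left = mid;
        else right = mid - 1;
    }
    return left;
};
console.log(pageFor(15)); // 0 → first page
console.log(pageFor(25)); // 1 → second page
console.log(pageFor(40)); // 2 → a position exactly on a boundary belongs to that page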
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
1134
|
+
/**
|
|
1135
|
+
* Finds the end position of a breakpoint window inside `remainingContent`.
|
|
1136
|
+
*
|
|
1137
|
+
* The window end is defined as the start of the page AFTER `windowEndIdx` (i.e. `windowEndIdx + 1`),
|
|
1138
|
+
* found within the actual `remainingContent` string being split. This avoids relying on raw page offsets
|
|
1139
|
+
* that can diverge when structural rules strip markers (e.g. `lineStartsAfter`).
|
|
1140
|
+
*/
|
|
1141
|
+
const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
|
|
1142
|
+
if (windowEndIdx >= toIdx) return remainingContent.length;
|
|
1143
|
+
const desiredNextIdx = windowEndIdx + 1;
|
|
1144
|
+
const minNextIdx = currentFromIdx + 1;
|
|
1145
|
+
const maxNextIdx = Math.min(desiredNextIdx, toIdx);
|
|
1146
|
+
const startOffsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
|
|
1147
|
+
for (let nextIdx = maxNextIdx; nextIdx >= minNextIdx; nextIdx--) {
|
|
1148
|
+
const expectedBoundary = cumulativeOffsets[nextIdx] !== void 0 && cumulativeOffsets[currentFromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[nextIdx] - cumulativeOffsets[currentFromIdx] - startOffsetInCurrentPage) : remainingContent.length;
|
|
1149
|
+
const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages);
|
|
1150
|
+
if (pos > 0) return pos;
|
|
981
1151
|
}
|
|
982
|
-
return
|
|
1152
|
+
return remainingContent.length;
|
|
983
1153
|
};
|
|
984
1154
|
/**
|
|
985
|
-
*
|
|
986
|
-
*
|
|
987
|
-
* - Replacement rules are applied in array order.
|
|
988
|
-
* - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
|
|
989
|
-
* - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
|
|
1155
|
+
* Finds an exclusion-based break position using raw cumulative offsets.
|
|
990
1156
|
*
|
|
991
|
-
* This
|
|
992
|
-
*
|
|
1157
|
+
* This is used to ensure pages excluded by breakpoints are never merged into the same output segment.
|
|
1158
|
+
* Returns a break position relative to the start of `remainingContent` (i.e. the currentFromIdx start).
|
|
993
1159
|
*/
|
|
994
|
-
const
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
content = content.replace(rule.re, rule.replacement);
|
|
1003
|
-
}
|
|
1004
|
-
if (content === p.content) return p;
|
|
1005
|
-
return {
|
|
1006
|
-
...p,
|
|
1007
|
-
content
|
|
1008
|
-
};
|
|
1009
|
-
});
|
|
1160
|
+
const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets) => {
|
|
1161
|
+
const startingPageId = pageIds[currentFromIdx];
|
|
1162
|
+
if (expandedBreakpoints.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx];
|
|
1163
|
+
for (let pageIdx = currentFromIdx + 1; pageIdx <= windowEndIdx; pageIdx++) {
|
|
1164
|
+
const pageId = pageIds[pageIdx];
|
|
1165
|
+
if (expandedBreakpoints.some((bp) => bp.excludeSet.has(pageId))) return cumulativeOffsets[pageIdx] - cumulativeOffsets[currentFromIdx];
|
|
1166
|
+
}
|
|
1167
|
+
return -1;
|
|
1010
1168
|
};
|
|
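A worked example of the offset arithmetic, assuming cumulative offsets [0, 6, 11, 18] and an excluded page at array index 2: the break lands at that page's start, measured from the current segment start:
const cumulativeOffsets = [0, 6, 11, 18];
const currentFromIdx = 0;
const excludedPageIdx = 2;
console.log(cumulativeOffsets[excludedPageIdx] - cumulativeOffsets[currentFromIdx]); // 11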
1011
|
-
|
|
1012
|
-
//#endregion
|
|
1013
|
-
//#region src/segmentation/tokens.ts
|
|
1014
1169
|
/**
|
|
1015
|
-
*
|
|
1016
|
-
*
|
|
1017
|
-
* This module provides a human-readable way to define regex patterns using
|
|
1018
|
-
* `{{token}}` placeholders that expand to their regex equivalents. It supports
|
|
1019
|
-
* named capture groups for extracting matched values into metadata.
|
|
1020
|
-
*
|
|
1021
|
-
* @module tokens
|
|
1170
|
+
* Checks if any page in a range is excluded by the given exclude set.
|
|
1022
1171
|
*
|
|
1023
|
-
* @
|
|
1024
|
-
*
|
|
1025
|
-
*
|
|
1026
|
-
*
|
|
1172
|
+
* @param excludeSet - Set of excluded page IDs
|
|
1173
|
+
* @param pageIds - Array of page IDs
|
|
1174
|
+
* @param fromIdx - Start index (inclusive)
|
|
1175
|
+
* @param toIdx - End index (inclusive)
|
|
1176
|
+
* @returns True if any page in range is excluded
|
|
1177
|
+
*/
|
|
1178
|
+
const hasExcludedPageInRange = (excludeSet, pageIds, fromIdx, toIdx) => {
|
|
1179
|
+
if (excludeSet.size === 0) return false;
|
|
1180
|
+
for (let pageIdx = fromIdx; pageIdx <= toIdx; pageIdx++) if (excludeSet.has(pageIds[pageIdx])) return true;
|
|
1181
|
+
return false;
|
|
1182
|
+
};
|
|
1183
|
+
/**
|
|
1184
|
+
* Finds the position of the next page content within remaining content.
|
|
1185
|
+
* Returns -1 if not found.
|
|
1027
1186
|
*
|
|
1028
|
-
* @
|
|
1029
|
-
*
|
|
1030
|
-
*
|
|
1031
|
-
* // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
|
|
1187
|
+
* @param remainingContent - Content to search in
|
|
1188
|
+
* @param nextPageData - Normalized data for the next page
|
|
1189
|
+
* @returns Position of next page content, or -1 if not found
|
|
1032
1190
|
*/
|
|
1191
|
+
const findNextPagePosition = (remainingContent, nextPageData) => {
|
|
1192
|
+
const searchPrefix = nextPageData.content.trim().slice(0, Math.min(30, nextPageData.length));
|
|
1193
|
+
if (searchPrefix.length === 0) return -1;
|
|
1194
|
+
const pos = remainingContent.indexOf(searchPrefix);
|
|
1195
|
+
return pos > 0 ? pos : -1;
|
|
1196
|
+
};
|
|
1033
1197
|
/**
|
|
1034
|
-
*
|
|
1198
|
+
* Finds matches within a window and returns the selected position based on preference.
|
|
1035
1199
|
*
|
|
1036
|
-
*
|
|
1037
|
-
*
|
|
1038
|
-
*
|
|
1039
|
-
*
|
|
1040
|
-
*
|
|
1041
|
-
* @remarks
|
|
1042
|
-
* These patterns are designed for Arabic text matching. For diacritic-insensitive
|
|
1043
|
-
* matching of Arabic patterns, use the `fuzzy: true` option in split rules,
|
|
1044
|
-
* which applies `makeDiacriticInsensitive()` to the expanded patterns.
|
|
1045
|
-
*
|
|
1046
|
-
* @example
|
|
1047
|
-
* // Using tokens in a split rule
|
|
1048
|
-
* { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
|
|
1049
|
-
*
|
|
1050
|
-
* @example
|
|
1051
|
-
* // Using tokens with named captures
|
|
1052
|
-
* { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
|
|
1200
|
+
* @param windowContent - Content to search
|
|
1201
|
+
* @param regex - Regex to match
|
|
1202
|
+
* @param prefer - 'longer' for last match, 'shorter' for first match
|
|
1203
|
+
* @returns Break position after the selected match, or -1 if no matches
|
|
1053
1204
|
*/
|
|
1205
|
+
const findPatternBreakPosition = (windowContent, regex, prefer) => {
|
|
1206
|
+
let first;
|
|
1207
|
+
let last;
|
|
1208
|
+
for (const m of windowContent.matchAll(regex)) {
|
|
1209
|
+
const match = {
|
|
1210
|
+
index: m.index,
|
|
1211
|
+
length: m[0].length
|
|
1212
|
+
};
|
|
1213
|
+
if (!first) first = match;
|
|
1214
|
+
last = match;
|
|
1215
|
+
}
|
|
1216
|
+
if (!first) return -1;
|
|
1217
|
+
const selected = prefer === "longer" ? last : first;
|
|
1218
|
+
return selected.index + selected.length;
|
|
1219
|
+
};
|
|
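A standalone sketch of the prefer semantics: the break lands right after a match, using the first match for 'shorter' pieces and the last match in the window for 'longer' pieces:
const windowContent = 'one. two. three.';
const ends = [...windowContent.matchAll(/\./gmu)].map((m) => m.index + m[0].length);
console.log(ends[0]);               // 4  → prefer 'shorter': break after the first '.'
console.log(ends[ends.length - 1]); // 16 → prefer 'longer': break after the last '.'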
1054
1220
|
/**
|
|
1055
|
-
*
|
|
1056
|
-
*
|
|
1057
|
-
*
|
|
1058
|
-
* This allows users to write intuitive patterns like `({{harf}}):` instead of
|
|
1059
|
-
* the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
|
|
1060
|
-
* so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
|
|
1061
|
-
*
|
|
1062
|
-
* @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
|
|
1063
|
-
* @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
|
|
1064
|
-
*
|
|
1065
|
-
* @example
|
|
1066
|
-
* escapeTemplateBrackets('({{harf}}): ')
|
|
1067
|
-
* // → '\\({{harf}}\\): '
|
|
1068
|
-
*
|
|
1069
|
-
* @example
|
|
1070
|
-
* escapeTemplateBrackets('[{{raqm}}] ')
|
|
1071
|
-
* // → '\\[{{raqm}}\\] '
|
|
1072
|
-
*
|
|
1073
|
-
* @example
|
|
1074
|
-
* escapeTemplateBrackets('{{harf}}')
|
|
1075
|
-
* // → '{{harf}}' (unchanged - no brackets outside tokens)
|
|
1221
|
+
* Handles page boundary breakpoint (empty pattern).
|
|
1222
|
+
* Returns break position or -1 if no valid position found.
|
|
1076
1223
|
*/
|
|
1077
|
-
const
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
"بخ",
|
|
1088
|
-
"عخ",
|
|
1089
|
-
"مق",
|
|
1090
|
-
"مت",
|
|
1091
|
-
"عس",
|
|
1092
|
-
"سي",
|
|
1093
|
-
"سن",
|
|
1094
|
-
"كن",
|
|
1095
|
-
"مد",
|
|
1096
|
-
"قد",
|
|
1097
|
-
"خد",
|
|
1098
|
-
"فد",
|
|
1099
|
-
"دل",
|
|
1100
|
-
"كد",
|
|
1101
|
-
"غد",
|
|
1102
|
-
"صد",
|
|
1103
|
-
"دت",
|
|
1104
|
-
"دس",
|
|
1105
|
-
"تم",
|
|
1106
|
-
"فق",
|
|
1107
|
-
"دق",
|
|
1108
|
-
"[خرزيمنصسدفلتقع](?![\\u064B-\\u0652\\u0670أ-ي])",
|
|
1109
|
-
"(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669])"
|
|
1110
|
-
].join("|")})`;
|
|
1111
|
-
const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
|
|
1112
|
-
const BASE_TOKENS = {
|
|
1113
|
-
bab: "باب",
|
|
1114
|
-
basmalah: ["بسم الله", "﷽"].join("|"),
|
|
1115
|
-
bullet: "[•*°]",
|
|
1116
|
-
dash: "[-–—ـ]",
|
|
1117
|
-
fasl: ["مسألة", "فصل"].join("|"),
|
|
1118
|
-
harf: "[أ-ي]",
|
|
1119
|
-
harfs: "[أ-ي](?:\\s+[أ-ي])*",
|
|
1120
|
-
kitab: "كتاب",
|
|
1121
|
-
naql: [
|
|
1122
|
-
"حدثني",
|
|
1123
|
-
"وأخبرنا",
|
|
1124
|
-
"حدثنا",
|
|
1125
|
-
"سمعت",
|
|
1126
|
-
"أنبأنا",
|
|
1127
|
-
"وحدثنا",
|
|
1128
|
-
"أخبرنا",
|
|
1129
|
-
"وحدثني",
|
|
1130
|
-
"وحدثنيه"
|
|
1131
|
-
].join("|"),
|
|
1132
|
-
raqm: "[\\u0660-\\u0669]",
|
|
1133
|
-
raqms: "[\\u0660-\\u0669]+",
|
|
1134
|
-
rumuz: RUMUZ_BLOCK,
|
|
1135
|
-
tarqim: "[.!?؟؛]"
|
|
1224
|
+
const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages) => {
|
|
1225
|
+
const nextPageIdx = windowEndIdx + 1;
|
|
1226
|
+
if (nextPageIdx <= toIdx) {
|
|
1227
|
+
const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
|
|
1228
|
+
if (nextPageData) {
|
|
1229
|
+
const pos = findNextPagePosition(remainingContent, nextPageData);
|
|
1230
|
+
if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
|
|
1231
|
+
}
|
|
1232
|
+
}
|
|
1233
|
+
return Math.min(windowEndPosition, remainingContent.length);
|
|
1136
1234
|
};
|
|
1137
1235
|
/**
|
|
1138
|
-
*
|
|
1139
|
-
*
|
|
1140
|
-
* These tokens reference base tokens using `{{token}}` syntax and are
|
|
1141
|
-
* automatically expanded to their final regex patterns at module load time.
|
|
1142
|
-
*
|
|
1143
|
-
* This provides better abstraction - if base tokens change, composites
|
|
1144
|
-
* automatically update on the next build.
|
|
1236
|
+
* Tries to find a break position within the current window using breakpoint patterns.
|
|
1237
|
+
* Returns the break position or -1 if no suitable break was found.
|
|
1145
1238
|
*
|
|
1146
|
-
* @
|
|
1239
|
+
* @param remainingContent - Content remaining to be segmented
|
|
1240
|
+
* @param currentFromIdx - Current starting page index
|
|
1241
|
+
* @param toIdx - Ending page index
|
|
1242
|
+
* @param windowEndIdx - Maximum window end index
|
|
1243
|
+
* @param ctx - Breakpoint context with page data and patterns
|
|
1244
|
+
* @returns Break position in the content, or -1 if no break found
|
|
1147
1245
|
*/
|
|
1148
|
-
const
|
|
1246
|
+
const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
|
|
1247
|
+
const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
|
|
1248
|
+
for (const { rule, regex, excludeSet, skipWhenRegex } of expandedBreakpoints) {
|
|
1249
|
+
if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
|
|
1250
|
+
if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
|
|
1251
|
+
if (skipWhenRegex?.test(remainingContent)) continue;
|
|
1252
|
+
if (regex === null) return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
|
|
1253
|
+
const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
|
|
1254
|
+
if (breakPos > 0) return breakPos;
|
|
1255
|
+
}
|
|
1256
|
+
return -1;
|
|
1257
|
+
};
|
|
1258
|
+
|
|
1259
|
+
//#endregion
|
|
1260
|
+
//#region src/segmentation/breakpoint-processor.ts
|
|
1149
1261
|
/**
|
|
1150
|
-
*
|
|
1151
|
-
* (like `{{raqms}} {{dash}} `).
|
|
1152
|
-
*
|
|
1153
|
-
* This is useful when you want to take a signature produced by `analyzeCommonLineStarts()`
|
|
1154
|
-
* and turn it into an editable template where you can add named captures, e.g.:
|
|
1155
|
-
*
|
|
1156
|
-
* - `{{numbered}}` → `{{raqms}} {{dash}} `
|
|
1157
|
-
* - then: `{{raqms:num}} {{dash}} ` to capture the number
|
|
1262
|
+
* Breakpoint post-processing engine extracted from segmenter.ts.
|
|
1158
1263
|
*
|
|
1159
|
-
*
|
|
1160
|
-
*
|
|
1161
|
-
* - Expansion is repeated a few times to support nested composites.
|
|
1264
|
+
* This module is intentionally split into small helpers to reduce cognitive complexity
|
|
1265
|
+
* and allow unit testing of tricky edge cases (window sizing, next-page advancement, etc.).
|
|
1162
1266
|
*/
|
|
1163
|
-
const
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1267
|
+
const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
|
|
1268
|
+
const buildNormalizedPagesMap = (pages, normalizedContent) => {
|
|
1269
|
+
const normalizedPages = /* @__PURE__ */ new Map();
|
|
1270
|
+
for (let i = 0; i < pages.length; i++) {
|
|
1271
|
+
const content = normalizedContent[i];
|
|
1272
|
+
normalizedPages.set(pages[i].id, {
|
|
1273
|
+
content,
|
|
1274
|
+
index: i,
|
|
1275
|
+
length: content.length
|
|
1168
1276
|
});
|
|
1169
|
-
if (next === out) break;
|
|
1170
|
-
out = next;
|
|
1171
1277
|
}
|
|
1172
|
-
return
|
|
1278
|
+
return normalizedPages;
|
|
1173
1279
|
};
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
return
|
|
1184
|
-
return BASE_TOKENS[tokenName] ?? `{{${tokenName}}}`;
|
|
1185
|
-
});
|
|
1280
|
+
const buildCumulativeOffsets = (pageIds, normalizedPages) => {
|
|
1281
|
+
const cumulativeOffsets = [0];
|
|
1282
|
+
let totalOffset = 0;
|
|
1283
|
+
for (let i = 0; i < pageIds.length; i++) {
|
|
1284
|
+
const pageData = normalizedPages.get(pageIds[i]);
|
|
1285
|
+
totalOffset += pageData ? pageData.length : 0;
|
|
1286
|
+
if (i < pageIds.length - 1) totalOffset += 1;
|
|
1287
|
+
cumulativeOffsets.push(totalOffset);
|
|
1288
|
+
}
|
|
1289
|
+
return cumulativeOffsets;
|
|
1186
1290
|
};
|
|
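A standalone sketch of the offset arithmetic: each page contributes its normalized length plus one character for the joiner between pages (the lengths are made up):
const lengths = [5, 4, 7]; // normalized page content lengths
const offsets = [0];
let total = 0;
lengths.forEach((len, i) => {
    total += len;
    if (i < lengths.length - 1) total += 1; // +1 for the page joiner
    offsets.push(total);
});
console.log(offsets); // [0, 6, 11, 18]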
1291
|
+
const hasAnyExclusionsInRange = (expandedBreakpoints, pageIds, fromIdx, toIdx) => expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
|
|
1292
|
+
const computeWindowEndIdx = (currentFromIdx, toIdx, pageIds, maxPages) => {
|
|
1293
|
+
const maxWindowPageId = pageIds[currentFromIdx] + maxPages;
|
|
1294
|
+
let windowEndIdx = currentFromIdx;
|
|
1295
|
+
for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
|
|
1296
|
+
else break;
|
|
1297
|
+
return windowEndIdx;
|
|
1298
|
+
};
|
|
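A standalone sketch showing that the window is sized by page id distance rather than array index, so gaps in page ids shrink the window:
const pageIds = [1, 2, 5, 6];
const maxPages = 2;
const fromIdx = 0;
let windowEndIdx = fromIdx;
for (let i = fromIdx; i < pageIds.length; i++) {
    if (pageIds[i] <= pageIds[fromIdx] + maxPages) windowEndIdx = i;
    else break;
}
console.log(windowEndIdx); // 1 → page id 5 falls outside the 1..3 window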
1299
|
+
const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => pageIds[toIdx] - pageIds[currentFromIdx];
|
|
1300
|
+
const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, includeMeta ? meta : void 0);
|
|
1187
1301
|
/**
|
|
1188
|
-
*
|
|
1189
|
-
*
|
|
1190
|
-
* Tokens are used in template strings with double-brace syntax:
|
|
1191
|
-
* - `{{token}}` - Expands to the pattern (non-capturing in context)
|
|
1192
|
-
* - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
|
|
1193
|
-
* - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
|
|
1194
|
-
*
|
|
1195
|
-
* @remarks
|
|
1196
|
-
* These patterns are designed for Arabic text matching. For diacritic-insensitive
|
|
1197
|
-
* matching of Arabic patterns, use the `fuzzy: true` option in split rules,
|
|
1198
|
-
* which applies `makeDiacriticInsensitive()` to the expanded patterns.
|
|
1199
|
-
*
|
|
1200
|
-
* @example
|
|
1201
|
-
* // Using tokens in a split rule
|
|
1202
|
-
* { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
|
|
1203
|
-
*
|
|
1204
|
-
* @example
|
|
1205
|
-
* // Using tokens with named captures
|
|
1206
|
-
* { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
|
|
1302
|
+
* Computes the actual start and end page indices for a piece using
|
|
1303
|
+
* precomputed boundary positions and binary search.
|
|
1207
1304
|
*
|
|
1208
|
-
* @
|
|
1209
|
-
*
|
|
1210
|
-
*
|
|
1305
|
+
* @param pieceStartPos - Start position of the piece in the full segment content
|
|
1306
|
+
* @param pieceEndPos - End position (exclusive) of the piece
|
|
1307
|
+
* @param boundaryPositions - Precomputed boundary positions from buildBoundaryPositions
|
|
1308
|
+
* @param baseFromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[baseFromIdx])
|
|
1309
|
+
* @param toIdx - Maximum page index
|
|
1310
|
+
* @returns Object with actualStartIdx and actualEndIdx
|
|
1211
1311
|
*/
|
|
1212
|
-
const
|
|
1213
|
-
|
|
1214
|
-
|
|
1312
|
+
const computePiecePages = (pieceStartPos, pieceEndPos, boundaryPositions, baseFromIdx, toIdx) => {
|
|
1313
|
+
const actualStartIdx = findPageIndexForPosition(pieceStartPos, boundaryPositions, baseFromIdx);
|
|
1314
|
+
const endPos = Math.max(pieceStartPos, pieceEndPos - 1);
|
|
1315
|
+
return {
|
|
1316
|
+
actualEndIdx: Math.min(findPageIndexForPosition(endPos, boundaryPositions, baseFromIdx), toIdx),
|
|
1317
|
+
actualStartIdx
|
|
1318
|
+
};
|
|
1215
1319
|
};
|
|
1320
|
+
const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages) => {
|
|
1321
|
+
let nextFromIdx = actualEndIdx;
|
|
1322
|
+
if (remainingContent && actualEndIdx + 1 <= toIdx) {
|
|
1323
|
+
const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
|
|
1324
|
+
if (nextPageData) {
|
|
1325
|
+
const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
|
|
1326
|
+
const remainingPrefix = remainingContent.trimStart().slice(0, Math.min(30, remainingContent.length));
|
|
1327
|
+
if (nextPrefix && (remainingContent.startsWith(nextPrefix) || nextPageData.content.startsWith(remainingPrefix))) nextFromIdx = actualEndIdx + 1;
|
|
1328
|
+
}
|
|
1329
|
+
}
|
|
1330
|
+
return nextFromIdx;
|
|
1331
|
+
};
|
|
1332
|
+
const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, includeMeta) => createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, includeMeta ? meta : void 0);
|
|
1216
1333
|
/**
|
|
1217
|
-
*
|
|
1218
|
-
*
|
|
1219
|
-
* Matches:
|
|
1220
|
-
* - `{{token}}` - Simple token (group 1 = token name, group 2 = empty)
|
|
1221
|
-
* - `{{token:name}}` - Token with capture (group 1 = token, group 2 = name)
|
|
1222
|
-
* - `{{:name}}` - Capture-only (group 1 = empty, group 2 = name)
|
|
1334
|
+
* Finds the break offset within a window, trying exclusions first, then patterns.
|
|
1223
1335
|
*
|
|
1224
|
-
* @
|
|
1336
|
+
* @returns Break offset relative to remainingContent, or windowEndPosition as fallback
|
|
1225
1337
|
*/
|
|
1226
|
-
const
|
|
1338
|
+
const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
|
|
1339
|
+
if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
|
|
1340
|
+
const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
|
|
1341
|
+
if (exclusionBreak > 0) return exclusionBreak;
|
|
1342
|
+
}
|
|
1343
|
+
const patternBreak = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
|
|
1344
|
+
expandedBreakpoints,
|
|
1345
|
+
normalizedPages,
|
|
1346
|
+
pageIds,
|
|
1347
|
+
prefer
|
|
1348
|
+
});
|
|
1349
|
+
return patternBreak > 0 ? patternBreak : windowEndPosition;
|
|
1350
|
+
};
|
|
1227
1351
|
/**
|
|
1228
|
-
*
|
|
1229
|
-
*
|
|
1230
|
-
* Matches only `{{token}}` format where token is one or more word characters.
|
|
1231
|
-
* Used by `containsTokens()` for quick detection.
|
|
1232
|
-
*
|
|
1233
|
-
* @internal
|
|
1352
|
+
* Advances cursor position past any leading whitespace.
|
|
1234
1353
|
*/
|
|
1235
|
-
const
|
|
1354
|
+
const skipWhitespace$1 = (content, startPos) => {
|
|
1355
|
+
let pos = startPos;
|
|
1356
|
+
while (pos < content.length && /\s/.test(content[pos])) pos++;
|
|
1357
|
+
return pos;
|
|
1358
|
+
};
|
|
1236
1359
|
/**
|
|
1237
|
-
*
|
|
1238
|
-
*
|
|
1239
|
-
* Performs a quick test for `{{token}}` patterns without actually
|
|
1240
|
-
* expanding them. Useful for determining whether to apply token
|
|
1241
|
-
* expansion to a string.
|
|
1242
|
-
*
|
|
1243
|
-
* @param query - String to check for tokens
|
|
1244
|
-
* @returns `true` if the string contains at least one `{{token}}` pattern
|
|
1360
|
+
* Processes an oversized segment by iterating through the content and
|
|
1361
|
+
* breaking it into smaller pieces that fit within maxPages constraints.
|
|
1245
1362
|
*
|
|
1246
|
-
*
|
|
1247
|
-
* containsTokens('{{raqms}} {{dash}}') // → true
|
|
1248
|
-
* containsTokens('plain text') // → false
|
|
1249
|
-
* containsTokens('[٠-٩]+ - ') // → false (raw regex, no tokens)
|
|
1363
|
+
* Uses precomputed boundary positions for O(log n) page attribution lookups.
|
|
1250
1364
|
*/
|
|
1251
|
-
const
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1365
|
+
const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
|
|
1366
|
+
const result = [];
|
|
1367
|
+
const fullContent = segment.content;
|
|
1368
|
+
let cursorPos = 0;
|
|
1369
|
+
let currentFromIdx = fromIdx;
|
|
1370
|
+
let isFirstPiece = true;
|
|
1371
|
+
const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
|
|
1372
|
+
logger?.debug?.("[breakpoints] boundaryPositions built", {
|
|
1373
|
+
boundaryPositions,
|
|
1374
|
+
fromIdx,
|
|
1375
|
+
fullContentLength: fullContent.length,
|
|
1376
|
+
toIdx
|
|
1377
|
+
});
|
|
1378
|
+
const maxIterations = 1e4;
|
|
1379
|
+
for (let i = 0; i < maxIterations && cursorPos < fullContent.length && currentFromIdx <= toIdx; i++) {
|
|
1380
|
+
const remainingContent = fullContent.slice(cursorPos);
|
|
1381
|
+
if (!remainingContent.trim()) break;
|
|
1382
|
+
const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
|
|
1383
|
+
const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
|
|
1384
|
+
if (remainingSpan <= maxPages && !remainingHasExclusions) {
|
|
1385
|
+
const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
|
|
1386
|
+
if (finalSeg) result.push(finalSeg);
|
|
1387
|
+
break;
|
|
1388
|
+
}
|
|
1389
|
+
const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
|
|
1390
|
+
const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
|
|
1391
|
+
logger?.debug?.(`[breakpoints] iteration=${i}`, {
|
|
1392
|
+
currentFromIdx,
|
|
1393
|
+
cursorPos,
|
|
1394
|
+
windowEndIdx
|
|
1264
1395
|
});
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1396
|
+
const breakOffset = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
|
|
1397
|
+
const breakPos = cursorPos + breakOffset;
|
|
1398
|
+
const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
|
|
1399
|
+
const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
|
|
1400
|
+
logger?.trace?.("[breakpoints] piece", {
|
|
1401
|
+
actualEndIdx,
|
|
1402
|
+
actualStartIdx,
|
|
1403
|
+
pieceLength: pieceContent.length
|
|
1268
1404
|
});
|
|
1269
|
-
|
|
1405
|
+
if (pieceContent) {
|
|
1406
|
+
const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
|
|
1407
|
+
if (pieceSeg) result.push(pieceSeg);
|
|
1408
|
+
}
|
|
1409
|
+
cursorPos = skipWhitespace$1(fullContent, breakPos);
|
|
1410
|
+
currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
|
|
1411
|
+
isFirstPiece = false;
|
|
1270
1412
|
}
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
value: query.slice(lastIndex)
|
|
1274
|
-
});
|
|
1275
|
-
return segments;
|
|
1276
|
-
};
|
|
1277
|
-
const maybeApplyFuzzyToText = (text, fuzzyTransform) => {
|
|
1278
|
-
if (fuzzyTransform && /[\u0600-\u06FF]/u.test(text)) return fuzzyTransform(text);
|
|
1279
|
-
return text;
|
|
1280
|
-
};
|
|
1281
|
-
const maybeApplyFuzzyToTokenPattern = (tokenPattern, fuzzyTransform) => {
|
|
1282
|
-
if (!fuzzyTransform) return tokenPattern;
|
|
1283
|
-
return tokenPattern.split("|").map((part) => /[\u0600-\u06FF]/u.test(part) ? fuzzyTransform(part) : part).join("|");
|
|
1284
|
-
};
|
|
1285
|
-
const parseTokenLiteral = (literal) => {
|
|
1286
|
-
TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
|
|
1287
|
-
const tokenMatch = TOKEN_WITH_CAPTURE_REGEX.exec(literal);
|
|
1288
|
-
if (!tokenMatch) return null;
|
|
1289
|
-
const [, tokenName, captureName] = tokenMatch;
|
|
1290
|
-
return {
|
|
1291
|
-
captureName,
|
|
1292
|
-
tokenName
|
|
1293
|
-
};
|
|
1294
|
-
};
|
|
1295
|
-
const createCaptureRegistry = (capturePrefix) => {
|
|
1296
|
-
const captureNames = [];
|
|
1297
|
-
const captureNameCounts = /* @__PURE__ */ new Map();
|
|
1298
|
-
const register = (baseName) => {
|
|
1299
|
-
const count = captureNameCounts.get(baseName) ?? 0;
|
|
1300
|
-
captureNameCounts.set(baseName, count + 1);
|
|
1301
|
-
const uniqueName = count === 0 ? baseName : `${baseName}_${count + 1}`;
|
|
1302
|
-
const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
|
|
1303
|
-
captureNames.push(prefixedName);
|
|
1304
|
-
return prefixedName;
|
|
1305
|
-
};
|
|
1306
|
-
return {
|
|
1307
|
-
captureNames,
|
|
1308
|
-
register
|
|
1309
|
-
};
|
|
1413
|
+
logger?.debug?.("[breakpoints] done", { resultCount: result.length });
|
|
1414
|
+
return result;
|
|
1310
1415
|
};
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1416
|
+
/**
|
|
1417
|
+
* Applies breakpoints to oversized segments.
|
|
1418
|
+
*
|
|
1419
|
+
* Note: This is an internal engine used by `segmentPages()`.
|
|
1420
|
+
*/
|
|
1421
|
+
const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
|
|
1422
|
+
const pageIds = pages.map((p) => p.id);
|
|
1423
|
+
const pageIdToIndex = buildPageIdToIndexMap(pageIds);
|
|
1424
|
+
const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
|
|
1425
|
+
const cumulativeOffsets = buildCumulativeOffsets(pageIds, normalizedPages);
|
|
1426
|
+
const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
|
|
1427
|
+
const result = [];
|
|
1428
|
+
logger?.info?.("Starting breakpoint processing", {
|
|
1429
|
+
maxPages,
|
|
1430
|
+
segmentCount: segments.length
|
|
1431
|
+
});
|
|
1432
|
+
logger?.debug?.("[breakpoints] inputSegments", {
|
|
1433
|
+
segmentCount: segments.length,
|
|
1434
|
+
segments: segments.map((s) => ({
|
|
1435
|
+
contentLength: s.content.length,
|
|
1436
|
+
from: s.from,
|
|
1437
|
+
to: s.to
|
|
1438
|
+
}))
|
|
1439
|
+
});
|
|
1440
|
+
for (const segment of segments) {
|
|
1441
|
+
const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
|
|
1442
|
+
const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
|
|
1443
|
+
const segmentSpan = (segment.to ?? segment.from) - segment.from;
|
|
1444
|
+
const hasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, fromIdx, toIdx);
|
|
1445
|
+
if (segmentSpan <= maxPages && !hasExclusions) {
|
|
1446
|
+
result.push(segment);
|
|
1447
|
+
continue;
|
|
1448
|
+
}
|
|
1449
|
+
const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
|
|
1450
|
+
result.push(...broken.map((s) => {
|
|
1451
|
+
const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
|
|
1452
|
+
const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
|
|
1453
|
+
if (segFromIdx >= 0 && segToIdx > segFromIdx) return {
|
|
1454
|
+
...s,
|
|
1455
|
+
content: applyPageJoinerBetweenPages(s.content, segFromIdx, segToIdx, pageIds, normalizedPages, pageJoiner)
|
|
1456
|
+
};
|
|
1457
|
+
return s;
|
|
1458
|
+
}));
|
|
1459
|
+
}
|
|
1460
|
+
logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
|
|
1461
|
+
return result;
|
|
1321
1462
|
};
|
|
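A hedged end-to-end sketch (this engine is internal to the bundle and not exported; segmentPages() is the public entry point). Assuming two one-word pages, a maxPages of 0 and the page-boundary breakpoint '', the oversized segment would be expected to split back into per-page pieces:
const pages = [{ id: 1, content: 'alpha' }, { id: 2, content: 'beta' }];
const normalized = pages.map((p) => p.content);
const oversized = [{ content: 'alpha beta', from: 1, to: 2 }];
const pieces = applyBreakpoints(oversized, pages, normalized, 0, [''], 'shorter', (p) => p);
// Expected shape: [{ content: 'alpha', from: 1 }, { content: 'beta', from: 2 }]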
1463
|
+
|
|
1464
|
+
//#endregion
|
|
1465
|
+
//#region src/segmentation/match-utils.ts
|
|
1322
1466
|
/**
|
|
1323
|
-
*
|
|
1467
|
+
* Utility functions for regex matching and result processing.
|
|
1324
1468
|
*
|
|
1325
|
-
*
|
|
1326
|
-
*
|
|
1327
|
-
*
|
|
1328
|
-
* - `{{:name}}` → Expands to `(?<name>.+)` (capture anything)
|
|
1469
|
+
* These functions were extracted from `segmenter.ts` to reduce complexity
|
|
1470
|
+
* and enable independent testing. They handle match filtering, capture
|
|
1471
|
+
* extraction, and occurrence-based selection.
|
|
1329
1472
|
*
|
|
1330
|
-
*
|
|
1473
|
+
* @module match-utils
|
|
1474
|
+
*/
|
|
1475
|
+
/**
|
|
1476
|
+
* Extracts named capture groups from a regex match.
|
|
1331
1477
|
*
|
|
1332
|
-
*
|
|
1333
|
-
*
|
|
1334
|
-
*
|
|
1335
|
-
* Typically `makeDiacriticInsensitive` from the fuzzy module.
|
|
1336
|
-
* @returns Object with expanded pattern, capture names, and capture flag
|
|
1478
|
+
* Only includes groups that are in the `captureNames` list and have
|
|
1479
|
+
* defined values. This filters out positional captures and ensures
|
|
1480
|
+
* only explicitly requested named captures are returned.
|
|
1337
1481
|
*
|
|
1338
|
-
* @
|
|
1339
|
-
*
|
|
1340
|
-
*
|
|
1341
|
-
* // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
|
|
1482
|
+
* @param groups - The `match.groups` object from `RegExp.exec()`
|
|
1483
|
+
* @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
|
|
1484
|
+
* @returns Object with capture name → value pairs, or `undefined` if none found
|
|
1342
1485
|
*
|
|
1343
1486
|
* @example
|
|
1344
|
-
*
|
|
1345
|
-
*
|
|
1346
|
-
* // → {
|
|
1487
|
+
* const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
|
|
1488
|
+
* extractNamedCaptures(match.groups, ['num'])
|
|
1489
|
+
* // → { num: '٦٦٩٦' }
|
|
1347
1490
|
*
|
|
1348
1491
|
* @example
|
|
1349
|
-
* //
|
|
1350
|
-
*
|
|
1351
|
-
* // →
|
|
1492
|
+
* // No matching captures
|
|
1493
|
+
* extractNamedCaptures({}, ['num'])
|
|
1494
|
+
* // → undefined
|
|
1352
1495
|
*
|
|
1353
1496
|
* @example
|
|
1354
|
-
* //
|
|
1355
|
-
*
|
|
1356
|
-
* // →
|
|
1497
|
+
* // Undefined groups
|
|
1498
|
+
* extractNamedCaptures(undefined, ['num'])
|
|
1499
|
+
* // → undefined
|
|
1357
1500
|
*/
|
|
1358
|
-
const
|
|
1359
|
-
|
|
1360
|
-
const
|
|
1361
|
-
const
|
|
1362
|
-
|
|
1363
|
-
return expandTokenLiteral(segment.value, {
|
|
1364
|
-
capturePrefix,
|
|
1365
|
-
fuzzyTransform,
|
|
1366
|
-
registerCapture: registry.register
|
|
1367
|
-
});
|
|
1368
|
-
});
|
|
1369
|
-
return {
|
|
1370
|
-
captureNames: registry.captureNames,
|
|
1371
|
-
hasCaptures: registry.captureNames.length > 0,
|
|
1372
|
-
pattern: processedParts.join("")
|
|
1373
|
-
};
|
|
1501
|
+
const extractNamedCaptures = (groups, captureNames) => {
|
|
1502
|
+
if (!groups || captureNames.length === 0) return;
|
|
1503
|
+
const namedCaptures = {};
|
|
1504
|
+
for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
|
|
1505
|
+
return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
|
|
1374
1506
|
};
|
|
1375
1507
|
/**
|
|
1376
|
-
*
|
|
1377
|
-
*
|
|
1378
|
-
* This is the simple version without capture support. It returns only the
|
|
1379
|
-
* expanded pattern string, not capture metadata.
|
|
1508
|
+
* Gets the last defined positional capture group from a match array.
|
|
1380
1509
|
*
|
|
1381
|
-
*
|
|
1510
|
+
* Used for `lineStartsAfter` patterns where the content capture (`.*`)
|
|
1511
|
+
* is always at the end of the pattern. Named captures may shift the
|
|
1512
|
+
* positional indices, so we iterate backward to find the actual content.
|
|
1382
1513
|
*
|
|
1383
|
-
* @param
|
|
1384
|
-
* @returns
|
|
1514
|
+
* @param match - RegExp exec result array
|
|
1515
|
+
* @returns The last defined capture group value, or `undefined` if none
|
|
1385
1516
|
*
|
|
1386
1517
|
* @example
|
|
1387
|
-
*
|
|
1388
|
-
*
|
|
1389
|
-
*
|
|
1390
|
-
*
|
|
1391
|
-
*
|
|
1392
|
-
* @see expandTokensWithCaptures for full capture group support
|
|
1393
|
-
*/
|
|
1394
|
-
const expandTokens = (query) => expandTokensWithCaptures(query).pattern;
|
|
1395
|
-
/**
|
|
1396
|
-
* Converts a template string to a compiled RegExp.
|
|
1397
|
-
*
|
|
1398
|
-
* Expands all tokens and attempts to compile the result as a RegExp
|
|
1399
|
-
* with Unicode flag. Returns `null` if the resulting pattern is invalid.
|
|
1400
|
-
*
|
|
1401
|
-
* @remarks
|
|
1402
|
-
* This function dynamically compiles regular expressions from template strings.
|
|
1403
|
-
* If templates may come from untrusted sources, be aware of potential ReDoS
|
|
1404
|
-
* (Regular Expression Denial of Service) risks due to catastrophic backtracking.
|
|
1405
|
-
* Consider validating pattern complexity or applying execution timeouts when
|
|
1406
|
-
* running user-submitted patterns.
|
|
1407
|
-
*
|
|
1408
|
-
* @param template - Template string containing `{{token}}` placeholders
|
|
1409
|
-
* @returns Compiled RegExp with 'u' flag, or `null` if invalid
|
|
1518
|
+
* // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
|
|
1519
|
+
* // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
|
|
1520
|
+
* getLastPositionalCapture(match)
|
|
1521
|
+
* // → 'content'
|
|
1410
1522
|
*
|
|
1411
1523
|
* @example
|
|
1412
|
-
*
|
|
1413
|
-
*
|
|
1414
|
-
*
|
|
1524
|
+
* // No captures
|
|
1525
|
+
* getLastPositionalCapture(['full match'])
|
|
1526
|
+
* // → undefined
|
|
1415
1527
|
*/
|
|
1416
|
-
const
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
return new RegExp(expanded, "u");
|
|
1420
|
-
} catch {
|
|
1421
|
-
return null;
|
|
1422
|
-
}
|
|
1528
|
+
const getLastPositionalCapture = (match) => {
|
|
1529
|
+
if (match.length <= 1) return;
|
|
1530
|
+
for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
|
|
1423
1531
|
};
|
|
1424
1532
|
/**
|
|
1425
|
-
*
|
|
1533
|
+
* Filters matches to only include those within page ID constraints.
|
|
1426
1534
|
*
|
|
1427
|
-
*
|
|
1428
|
-
* that
|
|
1535
|
+
* Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
|
|
1536
|
+
* matches that occur on pages outside the allowed range or explicitly excluded.
|
|
1429
1537
|
*
|
|
1430
|
-
* @
|
|
1538
|
+
* @param matches - Array of match results to filter
|
|
1539
|
+
* @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
|
|
1540
|
+
* @param getId - Function that returns the page ID for a given offset
|
|
1541
|
+
* @returns Filtered array containing only matches within constraints
|
|
1431
1542
|
*
|
|
1432
1543
|
* @example
|
|
1433
|
-
*
|
|
1434
|
-
*
|
|
1544
|
+
* const matches = [
|
|
1545
|
+
* { start: 0, end: 10 }, // Page 1
|
|
1546
|
+
* { start: 100, end: 110 }, // Page 5
|
|
1547
|
+
* { start: 200, end: 210 }, // Page 10
|
|
1548
|
+
* ];
|
|
1549
|
+
* filterByConstraints(matches, { min: 3, max: 8 }, getId)
|
|
1550
|
+
* // → [{ start: 100, end: 110 }] (only page 5 match)
|
|
1435
1551
|
*/
|
|
1436
|
-
const
|
|
1552
|
+
const filterByConstraints = (matches, rule, getId) => {
|
|
1553
|
+
return matches.filter((m) => {
|
|
1554
|
+
const id = getId(m.start);
|
|
1555
|
+
if (rule.min !== void 0 && id < rule.min) return false;
|
|
1556
|
+
if (rule.max !== void 0 && id > rule.max) return false;
|
|
1557
|
+
if (isPageExcluded(id, rule.exclude)) return false;
|
|
1558
|
+
return true;
|
|
1559
|
+
});
|
|
1560
|
+
};
|
|
1437
1561
|
/**
|
|
1438
|
-
*
|
|
1562
|
+
* Checks if any rule in the list allows the given page ID.
|
|
1439
1563
|
*
|
|
1440
|
-
*
|
|
1441
|
-
* without
|
|
1564
|
+
* A rule allows an ID if it falls within the rule's `min`/`max` constraints.
|
|
1565
|
+
* Rules without constraints allow all page IDs.
|
|
1442
1566
|
*
|
|
1443
|
-
*
|
|
1444
|
-
*
|
|
1567
|
+
* This is used to determine whether to create a segment for content
|
|
1568
|
+
* that appears before any split points (the "first segment").
|
|
1445
1569
|
*
|
|
1446
|
-
* @
|
|
1447
|
-
*
|
|
1448
|
-
*
|
|
1449
|
-
* getTokenPattern('unknown') // → undefined
|
|
1450
|
-
*/
|
|
1451
|
-
const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
|
|
1452
|
-
/**
|
|
1453
|
-
* Regex to detect fuzzy-default tokens in a pattern string.
|
|
1454
|
-
* Matches {{token}} or {{token:name}} syntax.
|
|
1455
|
-
*/
|
|
1456
|
-
const FUZZY_TOKEN_REGEX = new RegExp(`\\{\\{(?:${[
|
|
1457
|
-
"bab",
|
|
1458
|
-
"basmalah",
|
|
1459
|
-
"fasl",
|
|
1460
|
-
"kitab",
|
|
1461
|
-
"naql"
|
|
1462
|
-
].join("|")})(?::\\w+)?\\}\\}`, "g");
|
|
1463
|
-
/**
|
|
1464
|
-
* Checks if a pattern (or array of patterns) contains tokens that should
|
|
1465
|
-
* default to fuzzy matching.
|
|
1570
|
+
* @param rules - Array of rules with optional `min` and `max` constraints
|
|
1571
|
+
* @param pageId - Page ID to check
|
|
1572
|
+
* @returns `true` if at least one rule allows the page ID
|
|
1466
1573
|
*
|
|
1467
|
-
*
|
|
1574
|
+
* @example
|
|
1575
|
+
* const rules = [
|
|
1576
|
+
* { min: 5, max: 10 }, // Allows pages 5-10
|
|
1577
|
+
* { min: 20 }, // Allows pages 20+
|
|
1578
|
+
* ];
|
|
1468
1579
|
*
|
|
1469
|
-
*
|
|
1470
|
-
*
|
|
1580
|
+
* anyRuleAllowsId(rules, 7) // → true (first rule allows)
|
|
1581
|
+
* anyRuleAllowsId(rules, 3) // → false (no rule allows)
|
|
1582
|
+
* anyRuleAllowsId(rules, 25) // → true (second rule allows)
|
|
1471
1583
|
*
|
|
1472
1584
|
* @example
|
|
1473
|
-
*
|
|
1474
|
-
*
|
|
1475
|
-
* shouldDefaultToFuzzy(['{{kitab}}', '{{raqms}}']) // true
|
|
1585
|
+
* // Rules without constraints allow everything
|
|
1586
|
+
* anyRuleAllowsId([{}], 999) // → true
|
|
1476
1587
|
*/
|
|
1477
|
-
const
|
|
1478
|
-
return
|
|
1479
|
-
|
|
1480
|
-
|
|
1588
|
+
const anyRuleAllowsId = (rules, pageId) => {
|
|
1589
|
+
return rules.some((r) => {
|
|
1590
|
+
const minOk = r.min === void 0 || pageId >= r.min;
|
|
1591
|
+
const maxOk = r.max === void 0 || pageId <= r.max;
|
|
1592
|
+
return minOk && maxOk;
|
|
1481
1593
|
});
|
|
1482
1594
|
};
|
|
1483
1595
|
|
|
@@ -1847,6 +1959,117 @@ const collectFastFuzzySplitPoints = (matchContent, pageMap, fastFuzzyRules, pass
|
|
|
1847
1959
|
return splitPointsByRule;
|
|
1848
1960
|
};
|
|
1849
1961
|
|
|
1962
|
+
//#endregion
|
|
1963
|
+
//#region src/segmentation/split-point-helpers.ts
|
|
1964
|
+
/**
|
|
1965
|
+
* Helper module for collectSplitPointsFromRules to reduce complexity.
|
|
1966
|
+
* Handles combined regex matching and split point creation.
|
|
1967
|
+
*/
|
|
1968
|
+
const MAX_REGEX_ITERATIONS = 1e5;
|
|
1969
|
+
const extractNamedCapturesForRule = (groups, captureNames, prefix) => {
|
|
1970
|
+
const result = {};
|
|
1971
|
+
if (!groups) return result;
|
|
1972
|
+
for (const name of captureNames) if (groups[name] !== void 0) result[name.slice(prefix.length)] = groups[name];
|
|
1973
|
+
return result;
|
|
1974
|
+
};
|
|
1975
|
+
const buildContentOffsets = (match, ruleInfo) => {
|
|
1976
|
+
if (!ruleInfo.usesLineStartsAfter) return {};
|
|
1977
|
+
const captured = match.groups?.[`${ruleInfo.prefix}__content`];
|
|
1978
|
+
if (captured === void 0) return {};
|
|
1979
|
+
return { contentStartOffset: (match.groups?.[ruleInfo.prefix] || match[0]).length - captured.length };
|
|
1980
|
+
};
|
|
1981
|
+
const passesRuleConstraints = (rule, pageId) => (rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude);
|
|
1982
|
+
const createSplitPointFromMatch = (match, rule, ruleInfo) => {
|
|
1983
|
+
const namedCaptures = extractNamedCapturesForRule(match.groups, ruleInfo.captureNames, ruleInfo.prefix);
|
|
1984
|
+
const { contentStartOffset } = buildContentOffsets(match, ruleInfo);
|
|
1985
|
+
return {
|
|
1986
|
+
capturedContent: void 0,
|
|
1987
|
+
contentStartOffset,
|
|
1988
|
+
index: (rule.split ?? "at") === "at" ? match.index : match.index + match[0].length,
|
|
1989
|
+
meta: rule.meta,
|
|
1990
|
+
namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
|
|
1991
|
+
};
|
|
1992
|
+
};
|
|
1993
|
+
const processCombinedMatches = (matchContent, combinableRules, ruleRegexes, pageMap, passesPageStartGuard, splitPointsByRule, logger) => {
|
|
1994
|
+
const combinedSource = ruleRegexes.map((r) => r.source).join("|");
|
|
1995
|
+
const combinedRegex = new RegExp(combinedSource, "gm");
|
|
1996
|
+
logger?.debug?.("[segmenter] combined regex built", {
|
|
1997
|
+
combinableRuleCount: combinableRules.length,
|
|
1998
|
+
combinedSourceLength: combinedSource.length
|
|
1999
|
+
});
|
|
2000
|
+
let m = combinedRegex.exec(matchContent);
|
|
2001
|
+
let iterations = 0;
|
|
2002
|
+
while (m !== null) {
|
|
2003
|
+
iterations++;
|
|
2004
|
+
if (iterations > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop: exceeded ${MAX_REGEX_ITERATIONS} iterations at position ${m.index}.`);
|
|
2005
|
+
if (iterations % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count", {
|
|
2006
|
+
iterations,
|
|
2007
|
+
position: m.index
|
|
2008
|
+
});
|
|
2009
|
+
const matchedIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
|
|
2010
|
+
if (matchedIndex !== -1) {
|
|
2011
|
+
const { rule, index: originalIndex } = combinableRules[matchedIndex];
|
|
2012
|
+
const ruleInfo = ruleRegexes[matchedIndex];
|
|
2013
|
+
if (passesRuleConstraints(rule, pageMap.getId(m.index)) && passesPageStartGuard(rule, originalIndex, m.index)) {
|
|
2014
|
+
const sp = createSplitPointFromMatch(m, rule, ruleInfo);
|
|
2015
|
+
if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
|
|
2016
|
+
splitPointsByRule.get(originalIndex).push(sp);
|
|
2017
|
+
}
|
|
2018
|
+
}
|
|
2019
|
+
if (m[0].length === 0) combinedRegex.lastIndex++;
|
|
2020
|
+
m = combinedRegex.exec(matchContent);
|
|
2021
|
+
}
|
|
2022
|
+
};
|
|
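A standalone sketch of the combined-regex approach: each rule's pattern is wrapped in a named group keyed by its prefix, so a single pass over the content reveals which rule produced each match (the rule patterns here are illustrative):
const ruleSources = ['^باب .*', '^[\\u0660-\\u0669]+ - .*'];
const combined = new RegExp(ruleSources.map((s, i) => `(?<r${i}>${s})`).join('|'), 'gmu');
const text = '١ - حديث\nباب الإيمان';
for (const m of text.matchAll(combined)) {
    console.log(m.groups.r0 !== undefined ? 'rule 0' : 'rule 1', m.index);
}
// → 'rule 1' 0   (the numbered-line pattern matches at offset 0)
// → 'rule 0' 9   (the باب pattern matches at offset 9)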
2023
|
+
const buildRuleRegexes = (combinableRules) => combinableRules.map(({ rule, prefix }) => {
|
|
2024
|
+
const built = buildRuleRegex(rule, prefix);
|
|
2025
|
+
return {
|
|
2026
|
+
...built,
|
|
2027
|
+
prefix,
|
|
2028
|
+
source: `(?<${prefix}>${built.regex.source})`
|
|
2029
|
+
};
|
|
2030
|
+
});
|
|
2031
|
+
const processStandaloneRule = (rule, ruleIndex, matchContent, pageMap, passesPageStartGuard, splitPointsByRule) => {
|
|
2032
|
+
const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
|
|
2033
|
+
const points = filterByConstraints(findMatchesInContent(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
|
|
2034
|
+
const isLSA = usesLineStartsAfter && m.captured !== void 0;
|
|
2035
|
+
const markerLen = isLSA ? m.end - m.captured.length - m.start : 0;
|
|
2036
|
+
return {
|
|
2037
|
+
capturedContent: isLSA ? void 0 : m.captured,
|
|
2038
|
+
contentStartOffset: isLSA ? markerLen : void 0,
|
|
2039
|
+
index: (rule.split ?? "at") === "at" ? m.start : m.end,
|
|
2040
|
+
meta: rule.meta,
|
|
2041
|
+
namedCaptures: m.namedCaptures
|
|
2042
|
+
};
|
|
2043
|
+
});
|
|
2044
|
+
if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
|
|
2045
|
+
splitPointsByRule.get(ruleIndex).push(...points);
|
|
2046
|
+
};
|
|
2047
|
+
const findMatchesInContent = (content, regex, usesCapture, captureNames) => {
|
|
2048
|
+
const matches = [];
|
|
2049
|
+
let m = regex.exec(content);
|
|
2050
|
+
while (m !== null) {
|
|
2051
|
+
matches.push({
|
|
2052
|
+
captured: usesCapture ? getLastPositionalCapture(m) : void 0,
|
|
2053
|
+
end: m.index + m[0].length,
|
|
2054
|
+
namedCaptures: extractNamedCaptures(m.groups, captureNames),
|
|
2055
|
+
start: m.index
|
|
2056
|
+
});
|
|
2057
|
+
if (m[0].length === 0) regex.lastIndex++;
|
|
2058
|
+
m = regex.exec(content);
|
|
2059
|
+
}
|
|
2060
|
+
return matches;
|
|
2061
|
+
};
|
|
2062
|
+
const applyOccurrenceFilter = (rules, splitPointsByRule) => {
|
|
2063
|
+
const result = [];
|
|
2064
|
+
rules.forEach((rule, index) => {
|
|
2065
|
+
const points = splitPointsByRule.get(index);
|
|
2066
|
+
if (!points?.length) return;
|
|
2067
|
+
const filtered = rule.occurrence === "first" ? [points[0]] : rule.occurrence === "last" ? [points.at(-1)] : points;
|
|
2068
|
+
result.push(...filtered);
|
|
2069
|
+
});
|
|
2070
|
+
return result;
|
|
2071
|
+
};
|
|
2072
|
+
|
|
1850
2073
|
//#endregion
|
|
1851
2074
|
//#region src/segmentation/textUtils.ts
|
|
1852
2075
|
/**
|
|
@@ -1969,7 +2192,7 @@ const dedupeSplitPoints = (splitPoints) => {
 const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) => {
 if (segments.length > 0 || pages.length === 0) return segments;
 const firstPage = pages[0];
-const lastPage = pages
+const lastPage = pages.at(-1);
 const joinChar = pageJoiner === "newline" ? "\n" : " ";
 const allContent = normalizedContent.join(joinChar).trim();
 if (!allContent) return segments;
@@ -1980,116 +2203,22 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
 if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
 return [initialSeg];
 };
-const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
+const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
+logger?.debug?.("[segmenter] collecting split points from rules", {
+contentLength: matchContent.length,
+ruleCount: rules.length
+});
 const passesPageStartGuard = createPageStartGuardChecker(matchContent, pageMap);
 const { combinableRules, fastFuzzyRules, standaloneRules } = partitionRulesForMatching(rules);
-
-
-
-
-return {
-prefix,
-source: `(?<${prefix}>${built.regex.source})`,
-...built
-};
-});
-const combinedSource = ruleRegexes.map((r) => r.source).join("|");
-const combinedRegex = new RegExp(combinedSource, "gm");
-combinedRegex.lastIndex = 0;
-let m = combinedRegex.exec(matchContent);
-while (m !== null) {
-const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
-if (matchedRuleIndex !== -1) {
-const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
-const ruleInfo = ruleRegexes[matchedRuleIndex];
-const namedCaptures = {};
-if (m.groups) {
-for (const prefixedName of ruleInfo.captureNames) if (m.groups[prefixedName] !== void 0) {
-const cleanName = prefixedName.slice(prefix.length);
-namedCaptures[cleanName] = m.groups[prefixedName];
-}
-}
-let capturedContent;
-let contentStartOffset;
-if (ruleInfo.usesLineStartsAfter) {
-capturedContent = m.groups?.[`${prefix}__content`];
-if (capturedContent !== void 0) contentStartOffset = (m.groups?.[prefix] || m[0]).length - capturedContent.length;
-}
-const start = m.index;
-const end = m.index + m[0].length;
-const pageId = pageMap.getId(start);
-if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude)) {
-if (!passesPageStartGuard(rule, originalIndex, start)) continue;
-const sp = {
-capturedContent: void 0,
-contentStartOffset,
-index: (rule.split ?? "at") === "at" ? start : end,
-meta: rule.meta,
-namedCaptures: Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0
-};
-if (!splitPointsByRule.has(originalIndex)) splitPointsByRule.set(originalIndex, []);
-splitPointsByRule.get(originalIndex).push(sp);
-}
-}
-if (m[0].length === 0) combinedRegex.lastIndex++;
-m = combinedRegex.exec(matchContent);
-}
-}
-const collectSplitPointsFromRule = (rule, ruleIndex) => {
-const { regex, usesCapture, captureNames, usesLineStartsAfter } = buildRuleRegex(rule);
-const points = filterByConstraints(findMatches(matchContent, regex, usesCapture, captureNames), rule, pageMap.getId).filter((m) => passesPageStartGuard(rule, ruleIndex, m.start)).map((m) => {
-const isLineStartsAfter = usesLineStartsAfter && m.captured !== void 0;
-const markerLength = isLineStartsAfter ? m.end - m.captured.length - m.start : 0;
-return {
-capturedContent: isLineStartsAfter ? void 0 : m.captured,
-contentStartOffset: isLineStartsAfter ? markerLength : void 0,
-index: (rule.split ?? "at") === "at" ? m.start : m.end,
-meta: rule.meta,
-namedCaptures: m.namedCaptures
-};
-});
-if (!splitPointsByRule.has(ruleIndex)) splitPointsByRule.set(ruleIndex, []);
-splitPointsByRule.get(ruleIndex).push(...points);
-};
-standaloneRules.forEach((rule) => {
-collectSplitPointsFromRule(rule, rules.indexOf(rule));
+logger?.debug?.("[segmenter] rules partitioned", {
+combinableCount: combinableRules.length,
+fastFuzzyCount: fastFuzzyRules.length,
+standaloneCount: standaloneRules.length
 });
-const
-
-
-
-let filtered = points;
-if (rule.occurrence === "first") filtered = [points[0]];
-else if (rule.occurrence === "last") filtered = [points[points.length - 1]];
-finalSplitPoints.push(...filtered);
-});
-return finalSplitPoints;
-};
-/**
-* Executes a regex against content and extracts match results with capture information.
-*
-* @param content - Full content string to search
-* @param regex - Compiled regex with 'g' flag
-* @param usesCapture - Whether to extract captured content
-* @param captureNames - Names of expected named capture groups
-* @returns Array of match results with positions and captures
-*/
-const findMatches = (content, regex, usesCapture, captureNames) => {
-const matches = [];
-regex.lastIndex = 0;
-let m = regex.exec(content);
-while (m !== null) {
-const result = {
-end: m.index + m[0].length,
-start: m.index
-};
-result.namedCaptures = extractNamedCaptures(m.groups, captureNames);
-if (usesCapture) result.captured = getLastPositionalCapture(m);
-matches.push(result);
-if (m[0].length === 0) regex.lastIndex++;
-m = regex.exec(content);
-}
-return matches;
+const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
+if (combinableRules.length > 0) processCombinedMatches(matchContent, combinableRules, buildRuleRegexes(combinableRules), pageMap, passesPageStartGuard, splitPointsByRule, logger);
+for (const rule of standaloneRules) processStandaloneRule(rule, rules.indexOf(rule), matchContent, pageMap, passesPageStartGuard, splitPointsByRule);
+return applyOccurrenceFilter(rules, splitPointsByRule);
 };
 /**
 * Finds page breaks within a given offset range using binary search.
@@ -2205,7 +2334,7 @@ const segmentPages = (pages, options) => {
 pageIds: pageMap.pageIds,
 totalContentLength: matchContent.length
 });
-const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap);
+const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, logger);
 const unique = dedupeSplitPoints(splitPoints);
 logger?.debug?.("[segmenter] split points collected", {
 rawSplitPoints: splitPoints.length,
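The `logger` that `segmentPages` now threads into `collectSplitPointsFromRules` is optional; every call site above goes through `logger?.debug?.()` or `logger?.warn?.()`. A minimal usage sketch, not part of the diff: it assumes the logger is read from the `segmentPages` options object, and the rule and page content shown are placeholders.

import { segmentPages } from 'flappa-doormal';

const pages = [{ content: 'باب الإيمان ...', id: 1 }];

// Only `debug` and `warn` are ever invoked, both via optional chaining,
// so a partial console-backed logger is sufficient.
const segments = segmentPages(pages, {
    logger: { debug: console.debug, warn: console.warn },
    rules: [{ lineStartsWith: ['{{bab}}'], split: 'at' }],
});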
@@ -2276,7 +2405,7 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
 const result = [];
 for (let i = 0; i < splitPoints.length; i++) {
 const sp = splitPoints[i];
-const end =
+const end = splitPoints[i + 1]?.index ?? content.length;
 const s = createSegment$1(sp.index, end, sp.meta, sp.capturedContent, sp.namedCaptures, sp.contentStartOffset);
 if (s) result.push(s);
 }
@@ -2300,29 +2429,7 @@ const buildSegments = (splitPoints, content, pageMap, rules) => {
 };
 
 //#endregion
-//#region src/analysis.ts
-const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
-const stripWhitespacePlaceholders = (pattern) => pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "");
-const computeSpecificity = (pattern) => {
-const tokenCount = countTokenMarkers(pattern);
-return {
-literalLen: stripWhitespacePlaceholders(pattern).length,
-tokenCount
-};
-};
-const DEFAULT_OPTIONS = {
-includeFirstWordFallback: true,
-lineFilter: void 0,
-maxExamples: 1,
-minCount: 3,
-minLineLength: 6,
-normalizeArabicDiacritics: true,
-prefixChars: 60,
-prefixMatchers: [/^#+/u],
-sortBy: "specificity",
-topK: 40,
-whitespace: "regex"
-};
+//#region src/analysis/shared.ts
 const escapeSignatureLiteral = (s) => s.replace(/[.*+?^${}|\\{}]/g, "\\$&");
 const TOKEN_PRIORITY_ORDER$1 = [
 "basmalah",
@@ -2363,30 +2470,7 @@ const appendWs = (out, mode) => {
 if (mode === "space") return out.endsWith(" ") ? out : `${out} `;
 return out.endsWith("\\s*") ? out : `${out}\\s*`;
 };
-const
-let matchedAny = false;
-let currentPos = pos;
-let currentOut = out;
-for (const re of prefixMatchers) {
-if (currentPos >= s.length) break;
-const m = re.exec(s.slice(currentPos));
-if (!m || m.index !== 0 || !m[0]) continue;
-currentOut += escapeSignatureLiteral(m[0]);
-currentPos += m[0].length;
-matchedAny = true;
-const wsAfter = /^[ \t]+/u.exec(s.slice(currentPos));
-if (wsAfter) {
-currentPos += wsAfter[0].length;
-currentOut = appendWs(currentOut, whitespace);
-}
-}
-return {
-matchedAny,
-out: currentOut,
-pos: currentPos
-};
-};
-const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
+const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter$1) => {
 let best = null;
 for (const { token, re } of compiled) {
 re.lastIndex = pos;
@@ -2400,132 +2484,364 @@ const findBestTokenMatchAt = (s, pos, compiled, isArabicLetter) => {
 if (best?.token === "rumuz") {
 const end = pos + best.text.length;
 const next = end < s.length ? s[end] : "";
-if (next && isArabicLetter(next) && !/\s/u.test(next)) return null;
+if (next && isArabicLetter$1(next) && !/\s/u.test(next)) return null;
 }
 return best;
 };
-const
+const isArabicLetter = (ch) => /\p{Script=Arabic}/u.test(ch) && /\p{L}/u.test(ch);
+const isCommonDelimiter = (ch) => /[::\-–—ـ،؛.?!؟()[\]{}]/u.test(ch);
+
+//#endregion
+//#region src/analysis/line-starts.ts
+const resolveOptions$1 = (options = {}) => ({
+includeFirstWordFallback: options.includeFirstWordFallback ?? true,
+lineFilter: options.lineFilter,
+maxExamples: options.maxExamples ?? 1,
+minCount: options.minCount ?? 3,
+minLineLength: options.minLineLength ?? 6,
+normalizeArabicDiacritics: options.normalizeArabicDiacritics ?? true,
+prefixChars: options.prefixChars ?? 60,
+prefixMatchers: options.prefixMatchers ?? [/^#+/u],
+sortBy: options.sortBy ?? "specificity",
+topK: options.topK ?? 40,
+whitespace: options.whitespace ?? "regex"
+});
+const countTokenMarkers = (pattern) => (pattern.match(/\{\{/g) ?? []).length;
+const computeSpecificity = (pattern) => ({
+literalLen: pattern.replace(/\\s\*/g, "").replace(/[ \t]+/g, "").length,
+tokenCount: countTokenMarkers(pattern)
+});
+const compareBySpecificity = (a, b) => {
+const sa = computeSpecificity(a.pattern), sb = computeSpecificity(b.pattern);
+return sb.tokenCount - sa.tokenCount || sb.literalLen - sa.literalLen || b.count - a.count || a.pattern.localeCompare(b.pattern);
+};
+const compareByCount = (a, b) => b.count !== a.count ? b.count - a.count : compareBySpecificity(a, b);
+/** Remove trailing whitespace placeholders */
+const trimTrailingWs = (out, mode) => {
+const suffix = mode === "regex" ? "\\s*" : " ";
+while (out.endsWith(suffix)) out = out.slice(0, -suffix.length);
+return out;
+};
+/** Try to extract first word for fallback */
+const extractFirstWord = (s) => (s.match(/^[^\s:،؛.?!؟]+/u) ?? [])[0] ?? null;
+/** Consume prefix matchers at current position */
+const consumePrefixes = (s, pos, out, matchers, ws) => {
+let matched = false;
+for (const re of matchers) {
+if (pos >= s.length) break;
+const m = re.exec(s.slice(pos));
+if (!m?.index && m?.[0]) {
+out += escapeSignatureLiteral(m[0]);
+pos += m[0].length;
+matched = true;
+const wsm = /^[ \t]+/u.exec(s.slice(pos));
+if (wsm) {
+pos += wsm[0].length;
+out = appendWs(out, ws);
+}
+}
+}
+return {
+matched,
+out,
+pos
+};
+};
+/** Try to match a token at current position and append to signature */
+const tryMatchToken = (s, pos, out, compiled) => {
+const best = findBestTokenMatchAt(s, pos, compiled, isArabicLetter);
+if (!best) return {
+matched: false,
+out,
+pos
+};
+return {
+matched: true,
+out: `${out}{{${best.token}}}`,
+pos: pos + best.text.length
+};
+};
+/** Try to match a delimiter at current position */
+const tryMatchDelimiter = (s, pos, out) => {
+const ch = s[pos];
+if (!ch || !isCommonDelimiter(ch)) return {
+matched: false,
+out,
+pos
+};
+return {
+matched: true,
+out: out + escapeSignatureLiteral(ch),
+pos: pos + 1
+};
+};
+/** Skip whitespace at position */
+const skipWhitespace = (s, pos, out, ws) => {
+const m = /^[ \t]+/u.exec(s.slice(pos));
+if (!m) return {
+out,
+pos,
+skipped: false
+};
+return {
+out: appendWs(out, ws),
+pos: pos + m[0].length,
+skipped: true
+};
+};
+const tokenizeLineStart = (line, tokenNames, opts) => {
 const trimmed = collapseWhitespace(line);
 if (!trimmed) return null;
-const s = (normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, prefixChars);
-let pos = 0;
-let out = "";
-let matchedAny = false;
-let matchedToken = false;
+const s = (opts.normalizeArabicDiacritics ? stripArabicDiacritics(trimmed) : trimmed).slice(0, opts.prefixChars);
 const compiled = compileTokenRegexes(tokenNames);
-
-const
-
-
-
-
-
-
-
-
-const wsMatch = /^[ \t]+/u.exec(s.slice(pos));
-if (wsMatch) {
-pos += wsMatch[0].length;
-out = appendWs(out, whitespace);
+let pos = 0, out = "", matchedAny = false, matchedToken = false, steps = 0;
+const prefix = consumePrefixes(s, pos, out, opts.prefixMatchers, opts.whitespace);
+pos = prefix.pos;
+out = prefix.out;
+matchedAny = prefix.matched;
+while (steps < 6 && pos < s.length) {
+const ws = skipWhitespace(s, pos, out, opts.whitespace);
+if (ws.skipped) {
+pos = ws.pos;
+out = ws.out;
 continue;
 }
-const
-if (
-
-out
-matchedAny = true;
-
-pos += best.text.length;
-tokenSteps++;
+const tok = tryMatchToken(s, pos, out, compiled);
+if (tok.matched) {
+pos = tok.pos;
+out = tok.out;
+matchedAny = matchedToken = true;
+steps++;
 continue;
 }
 if (matchedAny) {
-const
-if (
-
-
+const delim = tryMatchDelimiter(s, pos, out);
+if (delim.matched) {
+pos = delim.pos;
+out = delim.out;
 continue;
 }
 }
 if (matchedAny) {
-if (includeFirstWordFallback && !matchedToken) {
-const
-if (
-
-
+if (opts.includeFirstWordFallback && !matchedToken) {
+const word$1 = extractFirstWord(s.slice(pos));
+if (word$1) {
+out += escapeSignatureLiteral(word$1);
+steps++;
+}
 }
 break;
 }
-if (!includeFirstWordFallback) return null;
-const
-if (!
-
-
-
-
-
-
-return
+if (!opts.includeFirstWordFallback) return null;
+const word = extractFirstWord(s.slice(pos));
+if (!word) return null;
+return escapeSignatureLiteral(word);
+}
+return matchedAny ? trimTrailingWs(out, opts.whitespace) : null;
+};
+const processLine = (line, pageId, tokenPriority, opts, acc) => {
+const trimmed = collapseWhitespace(line);
+if (trimmed.length < opts.minLineLength) return;
+if (opts.lineFilter && !opts.lineFilter(trimmed, pageId)) return;
+const sig = tokenizeLineStart(trimmed, tokenPriority, opts);
+if (!sig) return;
+const entry = acc.get(sig);
+if (!entry) acc.set(sig, {
+count: 1,
+examples: [{
+line: trimmed,
+pageId
+}]
+});
+else {
+entry.count++;
+if (entry.examples.length < opts.maxExamples) entry.examples.push({
+line: trimmed,
+pageId
+});
+}
+};
+const processPage = (page, tokenPriority, opts, acc) => {
+for (const line of normalizeLineEndings(page.content ?? "").split("\n")) processLine(line, page.id, tokenPriority, opts, acc);
 };
 /**
 * Analyze pages and return the most common line-start patterns (top K).
-*
-* This is a pure algorithmic heuristic: it tokenizes common prefixes into a stable
-* template-ish string using the library tokens (e.g., `{{bab}}`, `{{raqms}}`, `{{rumuz}}`).
 */
 const analyzeCommonLineStarts = (pages, options = {}) => {
-const
-...DEFAULT_OPTIONS,
-...options,
-lineFilter: options.lineFilter ?? DEFAULT_OPTIONS.lineFilter,
-prefixMatchers: options.prefixMatchers ?? DEFAULT_OPTIONS.prefixMatchers,
-whitespace: options.whitespace ?? DEFAULT_OPTIONS.whitespace
-};
+const opts = resolveOptions$1(options);
 const tokenPriority = buildTokenPriority();
-const
-for (const page of pages)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+const acc = /* @__PURE__ */ new Map();
+for (const page of pages) processPage(page, tokenPriority, opts, acc);
+const comparator = opts.sortBy === "count" ? compareByCount : compareBySpecificity;
+return [...acc.entries()].map(([pattern, v]) => ({
+count: v.count,
+examples: v.examples,
+pattern
+})).filter((p) => p.count >= opts.minCount).sort(comparator).slice(0, opts.topK);
+};
+
+//#endregion
+//#region src/analysis/repeating-sequences.ts
+const resolveOptions = (options) => {
+const minElements = Math.max(1, options?.minElements ?? 1);
+return {
+contextChars: options?.contextChars ?? 50,
+maxElements: Math.max(minElements, options?.maxElements ?? 3),
+maxExamples: options?.maxExamples ?? 3,
+maxUniquePatterns: options?.maxUniquePatterns ?? 1e3,
+minCount: Math.max(1, options?.minCount ?? 3),
+minElements,
+normalizeArabicDiacritics: options?.normalizeArabicDiacritics ?? true,
+requireToken: options?.requireToken ?? true,
+topK: Math.max(1, options?.topK ?? 20),
+whitespace: options?.whitespace ?? "regex"
+};
+};
+/** Creates a cursor that tracks position in both normalized and raw text */
+const createRawCursor = (text, normalize) => {
+let rawPos = 0;
+return {
+advance(normalizedLen) {
+if (!normalize) {
+const chunk = text.slice(rawPos, rawPos + normalizedLen);
+rawPos += normalizedLen;
+return chunk;
+}
+const start = rawPos;
+let matchedLen = 0;
+while (matchedLen < normalizedLen && rawPos < text.length) {
+if (stripArabicDiacritics(text[rawPos]).length > 0) matchedLen++;
+rawPos++;
 }
+while (rawPos < text.length && stripArabicDiacritics(text[rawPos]).length === 0) rawPos++;
+return text.slice(start, rawPos);
+},
+get pos() {
+return rawPos;
+}
+};
+};
+/** Scans text and produces a stream of tokens and literals. */
+const tokenizeContent = (text, normalize) => {
+const normalized = normalize ? stripArabicDiacritics(text) : text;
+const compiled = compileTokenRegexes(buildTokenPriority());
+const cursor = createRawCursor(text, normalize);
+const items = [];
+let pos = 0;
+while (pos < normalized.length) {
+const ws = /^\s+/u.exec(normalized.slice(pos));
+if (ws) {
+pos += ws[0].length;
+cursor.advance(ws[0].length);
+continue;
+}
+const token = findBestTokenMatchAt(normalized, pos, compiled, isArabicLetter);
+if (token) {
+const raw = cursor.advance(token.text.length);
+items.push({
+end: cursor.pos,
+raw,
+start: cursor.pos - raw.length,
+text: `{{${token.token}}}`,
+type: "token"
+});
+pos += token.text.length;
+continue;
+}
+if (isCommonDelimiter(normalized[pos])) {
+const raw = cursor.advance(1);
+items.push({
+end: cursor.pos,
+raw,
+start: cursor.pos - 1,
+text: escapeSignatureLiteral(normalized[pos]),
+type: "literal"
+});
+pos++;
+continue;
+}
+const word = /^[^\s::\-–—ـ،؛.?!؟()[\]{}]+/u.exec(normalized.slice(pos));
+if (word) {
+const raw = cursor.advance(word[0].length);
+items.push({
+end: cursor.pos,
+raw,
+start: cursor.pos - raw.length,
+text: escapeSignatureLiteral(word[0]),
+type: "literal"
+});
+pos += word[0].length;
+continue;
 }
+cursor.advance(1);
+pos++;
 }
-
-
-
-
-
-
-
+return items;
+};
+/** Build pattern string from window items */
+const buildPattern = (window, whitespace) => window.map((i) => i.text).join(whitespace === "space" ? " " : "\\s*");
+/** Check if window contains at least one token */
+const hasTokenInWindow = (window) => window.some((i) => i.type === "token");
+/** Compute token count and literal length for a window */
+const computeWindowStats = (window) => {
+let tokenCount = 0, literalLen = 0;
+for (const item of window) if (item.type === "token") tokenCount++;
+else literalLen += item.text.length;
+return {
+literalLen,
+tokenCount
 };
-
-
-
+};
+/** Build example from page content and window */
+const buildExample = (page, window, contextChars) => {
+const start = window[0].start;
+const end = window.at(-1).end;
+const ctxStart = Math.max(0, start - contextChars);
+const ctxEnd = Math.min(page.content.length, end + contextChars);
+return {
+context: (ctxStart > 0 ? "..." : "") + page.content.slice(ctxStart, ctxEnd) + (ctxEnd < page.content.length ? "..." : ""),
+pageId: page.id,
+startIndices: window.map((w) => w.start),
+text: page.content.slice(start, end)
 };
-
-
-
+};
+/** Extract N-grams from a single page */
+const extractPageNgrams = (page, items, opts, stats) => {
+for (let i = 0; i <= items.length - opts.minElements; i++) for (let n = opts.minElements; n <= Math.min(opts.maxElements, items.length - i); n++) {
+const window = items.slice(i, i + n);
+if (opts.requireToken && !hasTokenInWindow(window)) continue;
+const pattern = buildPattern(window, opts.whitespace);
+if (!stats.has(pattern)) {
+if (stats.size >= opts.maxUniquePatterns) continue;
+stats.set(pattern, {
+count: 0,
+examples: [],
+...computeWindowStats(window)
+});
+}
+const entry = stats.get(pattern);
+entry.count++;
+if (entry.examples.length < opts.maxExamples) entry.examples.push(buildExample(page, window, opts.contextChars));
+}
+};
+/**
+* Analyze pages for commonly repeating word sequences.
+*
+* Use for continuous text without line breaks. For line-based analysis,
+* use `analyzeCommonLineStarts()` instead.
+*/
+const analyzeRepeatingSequences = (pages, options) => {
+const opts = resolveOptions(options);
+const stats = /* @__PURE__ */ new Map();
+for (const page of pages) {
+if (!page.content) continue;
+extractPageNgrams(page, tokenizeContent(page.content, opts.normalizeArabicDiacritics), opts, stats);
+}
+return [...stats.entries()].filter(([, s]) => s.count >= opts.minCount).sort((a, b) => b[1].count - a[1].count || b[1].tokenCount - a[1].tokenCount || b[1].literalLen - a[1].literalLen).slice(0, opts.topK).map(([pattern, s]) => ({
+count: s.count,
+examples: s.examples,
 pattern
-}))
+}));
 };
 
 //#endregion
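For the new `analyzeRepeatingSequences()` export introduced above, a minimal sketch, not part of the diff: the option names come from `resolveOptions()` in this hunk, and the page contents are placeholders.

import { analyzeRepeatingSequences } from 'flappa-doormal';

const pages = [
    { content: '١ - حدثنا فلان قال ... ٢ - حدثنا فلان آخر قال ...', id: 1 },
    { content: '٣ - حدثنا فلان ثالث قال ...', id: 2 },
];

// Each result is { pattern, count, examples }, filtered by `minCount`,
// sorted by count (then token count, then literal length) and capped at `topK`.
const repeating = analyzeRepeatingSequences(pages, { maxElements: 3, minCount: 2, topK: 10 });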
@@ -2697,5 +3013,524 @@ const analyzeTextForRule = (text) => {
 };
 
 //#endregion
-
+//#region src/recovery.ts
+const preview = (s, max = 40) => s.length <= max ? s : `${s.slice(0, max)}…`;
+const normalizeForCompare = (s, mode) => {
+if (mode === "none") return s;
+let out = s;
+if (mode === "whitespace_and_nfkc") out = out.normalize("NFKC").replace(/(?:\u200C|\u200D|\uFEFF)/gu, "");
+out = out.replace(/\r\n?/gu, "\n").replace(/\s+/gu, " ").trim();
+return out;
+};
+const segmentRangeKey = (s) => `${s.from}|${s.to ?? s.from}`;
+const buildFixedOptions = (options, selectedRuleIndices) => {
+const fixedRules = (options.rules ?? []).map((r, idx) => {
+if (!selectedRuleIndices.has(idx)) return r;
+if (!("lineStartsAfter" in r) || !r.lineStartsAfter) return r;
+const { lineStartsAfter, ...rest } = r;
+return {
+...rest,
+lineStartsWith: lineStartsAfter
+};
+});
+return {
+...options,
+rules: fixedRules
+};
+};
+const buildPageIdToIndex = (pages) => new Map(pages.map((p, i) => [p.id, i]));
+const buildRangeContent = (processedPages, fromIdx, toIdx, pageJoiner) => {
+const parts = [];
+for (let i = fromIdx; i <= toIdx; i++) parts.push(normalizeLineEndings(processedPages[i].content));
+const matchContent = parts.join("\n");
+if (pageJoiner === "newline") return {
+matchContent,
+outputContent: matchContent
+};
+return {
+matchContent,
+outputContent: parts.join(" ")
+};
+};
+const compileMistakenRulesAsStartsWith = (options, selectedRuleIndices) => {
+const rules = options.rules ?? [];
+const compiled = [];
+for (const idx of selectedRuleIndices) {
+const r = rules[idx];
+if (!r || !("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
+const { lineStartsAfter, ...rest } = r;
+const built = buildRuleRegex({
+...rest,
+lineStartsWith: lineStartsAfter
+});
+compiled.push({
+ruleIndex: idx,
+startsWithRegex: new RegExp(built.regex.source, "mu")
+});
+}
+return compiled;
+};
+const findUniqueAnchorPos = (outputContent, segmentContent) => {
+for (const len of [
+80,
+60,
+40,
+30,
+20,
+15
+]) {
+const needle = segmentContent.slice(0, Math.min(len, segmentContent.length));
+if (!needle.trim()) continue;
+const first = outputContent.indexOf(needle);
+if (first === -1) continue;
+if (outputContent.indexOf(needle, first + 1) === -1) return first;
+}
+return null;
+};
+const findRecoveredPrefixAtLineStart = (segmentContent, matchContent, lineStart, anchorPos, compiledMistaken) => {
+const line = matchContent.slice(lineStart);
+for (const mr of compiledMistaken) {
+mr.startsWithRegex.lastIndex = 0;
+const m = mr.startsWithRegex.exec(line);
+if (!m || m.index !== 0) continue;
+const markerMatch = m[0];
+const markerEnd = lineStart + markerMatch.length;
+if (anchorPos < markerEnd) continue;
+const gap = matchContent.slice(markerEnd, anchorPos);
+const recoveredPrefix = /^\s*$/u.test(gap) ? `${markerMatch}${gap}` : markerMatch;
+if (segmentContent.startsWith(markerMatch) || segmentContent.startsWith(recoveredPrefix)) return { reason: "content already starts with selected marker" };
+return { prefix: recoveredPrefix };
+}
+return { reason: "no selected marker pattern matched at anchored line start" };
+};
+const tryBestEffortRecoverOneSegment = (segment, processedPages, pageIdToIndex, compiledMistaken, pageJoiner) => {
+const fromIdx = pageIdToIndex.get(segment.from);
+const toIdx = pageIdToIndex.get(segment.to ?? segment.from) ?? fromIdx;
+if (fromIdx === void 0 || toIdx === void 0 || fromIdx < 0 || toIdx < fromIdx) return {
+kind: "unresolved",
+reason: "segment page range not found in pages"
+};
+const { matchContent, outputContent } = buildRangeContent(processedPages, fromIdx, toIdx, pageJoiner);
+if (!segment.content) return {
+kind: "unresolved",
+reason: "empty segment content"
+};
+const anchorPos = findUniqueAnchorPos(outputContent, segment.content);
+if (anchorPos === null) return {
+kind: "unresolved",
+reason: "could not uniquely anchor segment content in page range"
+};
+const lineStart = matchContent.lastIndexOf("\n", Math.max(0, anchorPos - 1)) + 1;
+const found = findRecoveredPrefixAtLineStart(segment.content, matchContent, lineStart, anchorPos, compiledMistaken);
+if ("reason" in found) return found.reason.includes("already starts") ? { kind: "skipped_idempotent" } : {
+kind: "unresolved",
+reason: found.reason
+};
+return {
+kind: "recovered",
+recoveredContent: `${found.prefix}${segment.content}`,
+recoveredPrefix: found.prefix
+};
+};
+const resolveRuleIndicesSelector = (rules, indicesIn) => {
+const errors = [];
+const indices = /* @__PURE__ */ new Set();
+for (const idx of indicesIn) {
+if (!Number.isInteger(idx) || idx < 0 || idx >= rules.length) {
+errors.push(`Selector index out of range: ${idx}`);
+continue;
+}
+const rule = rules[idx];
+if (!rule || !("lineStartsAfter" in rule)) {
+errors.push(`Selector index ${idx} is not a lineStartsAfter rule`);
+continue;
+}
+indices.add(idx);
+}
+return {
+errors,
+indices,
+warnings: []
+};
+};
+const resolvePredicateSelector = (rules, predicate) => {
+const errors = [];
+const warnings = [];
+const indices = /* @__PURE__ */ new Set();
+rules.forEach((r, i) => {
+try {
+if (!predicate(r, i)) return;
+if ("lineStartsAfter" in r && r.lineStartsAfter?.length) {
+indices.add(i);
+return;
+}
+warnings.push(`Predicate selected rule ${i}, but it is not a lineStartsAfter rule; skipping`);
+} catch (e) {
+const msg = e instanceof Error ? e.message : String(e);
+errors.push(`Predicate threw at rule ${i}: ${msg}`);
+}
+});
+if (indices.size === 0) warnings.push("Predicate did not select any lineStartsAfter rules");
+return {
+errors,
+indices,
+warnings
+};
+};
+const resolvePatternsSelector = (rules, patterns, matchMode) => {
+const errors = [];
+const warnings = [];
+const indices = /* @__PURE__ */ new Set();
+const normalizePattern = (p) => normalizeForCompare(p, (matchMode ?? "exact") === "normalized" ? "whitespace_and_nfkc" : "none");
+const targets = patterns.map(normalizePattern);
+for (let pi = 0; pi < patterns.length; pi++) {
+const rawPattern = patterns[pi];
+const pat = targets[pi];
+const matched = [];
+for (let i = 0; i < rules.length; i++) {
+const r = rules[i];
+if (!("lineStartsAfter" in r) || !r.lineStartsAfter?.length) continue;
+if (r.lineStartsAfter.some((rp) => normalizePattern(rp) === pat)) matched.push(i);
+}
+if (matched.length === 0) {
+errors.push(`Pattern "${rawPattern}" did not match any lineStartsAfter rule`);
+continue;
+}
+if (matched.length > 1) warnings.push(`Pattern "${rawPattern}" matched multiple lineStartsAfter rules: [${matched.join(", ")}]`);
+matched.forEach((i) => {
+indices.add(i);
+});
+}
+return {
+errors,
+indices,
+warnings
+};
+};
+const resolveSelectorToRuleIndices = (options, selector) => {
+const rules = options.rules ?? [];
+if (selector.type === "rule_indices") return resolveRuleIndicesSelector(rules, selector.indices);
+if (selector.type === "predicate") return resolvePredicateSelector(rules, selector.predicate);
+return resolvePatternsSelector(rules, selector.patterns, selector.match);
+};
+const longestCommonSuffixLength = (a, b) => {
+const max = Math.min(a.length, b.length);
+let i = 0;
+while (i < max) {
+if (a[a.length - 1 - i] !== b[b.length - 1 - i]) break;
+i++;
+}
+return i;
+};
+const AMBIGUITY_SCORE_GAP = 5;
+const scoreCandidate = (orig, fixed, normalizeMode) => {
+if (fixed.content === orig.content) return {
+fixedIndex: -1,
+kind: "exact",
+score: 100
+};
+if (fixed.content.endsWith(orig.content)) {
+const markerLen = fixed.content.length - orig.content.length;
+return {
+fixedIndex: -1,
+kind: "exact_suffix",
+score: 90 + Math.min(30, markerLen)
+};
+}
+if (normalizeMode !== "none") {
+const normFixed = normalizeForCompare(fixed.content, normalizeMode);
+const normOrig = normalizeForCompare(orig.content, normalizeMode);
+if (normFixed.endsWith(normOrig) && normOrig.length > 0) {
+const overlap = longestCommonSuffixLength(normFixed, normOrig) / normOrig.length;
+return {
+fixedIndex: -1,
+kind: "normalized_suffix",
+score: 70 + Math.floor(overlap * 20)
+};
+}
+}
+return null;
+};
+const buildNoSelectionResult = (segments, reportBase, mode, selectorErrors) => {
+const warnings = [...reportBase.warnings];
+warnings.push("No lineStartsAfter rules selected for recovery; returning segments unchanged");
+const details = segments.map((s, i) => {
+const status = selectorErrors.length ? "unresolved_selector" : "unchanged";
+return {
+from: s.from,
+notes: selectorErrors.length ? ["selector did not resolve"] : void 0,
+originalStartPreview: preview(s.content),
+segmentIndex: i,
+status,
+strategy: "none",
+to: s.to
+};
+});
+return {
+report: {
+...reportBase,
+details,
+summary: {
+mode,
+recovered: 0,
+totalSegments: segments.length,
+unchanged: segments.length,
+unresolved: selectorErrors.length ? segments.length : 0
+},
+warnings
+},
+segments
+};
+};
+const runStage1IfEnabled = (pages, segments, options, selectedRuleIndices, mode) => {
+const recoveredAtIndex = /* @__PURE__ */ new Map();
+const recoveredDetailAtIndex = /* @__PURE__ */ new Map();
+if (mode !== "best_effort_then_rerun") return {
+recoveredAtIndex,
+recoveredDetailAtIndex
+};
+const processedPages = options.replace ? applyReplacements(pages, options.replace) : pages;
+const pageIdToIndex = buildPageIdToIndex(processedPages);
+const pageJoiner = options.pageJoiner ?? "space";
+const compiledMistaken = compileMistakenRulesAsStartsWith(options, selectedRuleIndices);
+for (let i = 0; i < segments.length; i++) {
+const orig = segments[i];
+const r = tryBestEffortRecoverOneSegment(orig, processedPages, pageIdToIndex, compiledMistaken, pageJoiner);
+if (r.kind !== "recovered") continue;
+const seg = {
+...orig,
+content: r.recoveredContent
+};
+recoveredAtIndex.set(i, seg);
+recoveredDetailAtIndex.set(i, {
+from: orig.from,
+originalStartPreview: preview(orig.content),
+recoveredPrefixPreview: preview(r.recoveredPrefix),
+recoveredStartPreview: preview(seg.content),
+segmentIndex: i,
+status: "recovered",
+strategy: "stage1",
+to: orig.to
+});
+}
+return {
+recoveredAtIndex,
+recoveredDetailAtIndex
+};
+};
+const buildFixedBuckets = (fixedSegments) => {
+const buckets = /* @__PURE__ */ new Map();
+for (let i = 0; i < fixedSegments.length; i++) {
+const k = segmentRangeKey(fixedSegments[i]);
+const arr = buckets.get(k);
+if (!arr) buckets.set(k, [i]);
+else arr.push(i);
+}
+return buckets;
+};
+const findBestFixedMatch = (orig, candidates, fixedSegments, usedFixed, normalizeCompare) => {
+let best = null;
+let secondBestScore = -Infinity;
+for (const fixedIdx of candidates) {
+if (usedFixed.has(fixedIdx)) continue;
+const fixed = fixedSegments[fixedIdx];
+const scored = scoreCandidate(orig, fixed, normalizeCompare);
+if (!scored) continue;
+const candidateScore = scored.score;
+if (!best || candidateScore > best.score) {
+secondBestScore = best?.score ?? -Infinity;
+best = {
+fixedIdx,
+score: candidateScore
+};
+} else if (candidateScore > secondBestScore) secondBestScore = candidateScore;
+}
+if (!best) return { kind: "none" };
+if (best.score - secondBestScore < AMBIGUITY_SCORE_GAP && candidates.length > 1) return { kind: "ambiguous" };
+return {
+fixedIdx: best.fixedIdx,
+kind: "match"
+};
+};
+const detailUnresolved = (orig, segmentIndex, notes) => ({
+from: orig.from,
+notes,
+originalStartPreview: preview(orig.content),
+segmentIndex,
+status: "unresolved_alignment",
+strategy: "rerun",
+to: orig.to
+});
+const detailSkippedIdempotent = (orig, segmentIndex, notes) => ({
+from: orig.from,
+notes,
+originalStartPreview: preview(orig.content),
+segmentIndex,
+status: "skipped_idempotent",
+strategy: "rerun",
+to: orig.to
+});
+const detailRecoveredRerun = (orig, fixed, segmentIndex) => {
+let recoveredPrefixPreview;
+if (fixed.content.endsWith(orig.content)) recoveredPrefixPreview = preview(fixed.content.slice(0, fixed.content.length - orig.content.length));
+return {
+from: orig.from,
+originalStartPreview: preview(orig.content),
+recoveredPrefixPreview,
+recoveredStartPreview: preview(fixed.content),
+segmentIndex,
+status: "recovered",
+strategy: "rerun",
+to: orig.to
+};
+};
+const mergeWithRerun = (params) => {
+const { fixedBuckets, fixedSegments, normalizeCompare, originalSegments, stage1RecoveredAtIndex, recoveredDetailAtIndex } = params;
+const usedFixed = /* @__PURE__ */ new Set();
+const out = [];
+const details = [];
+let recovered = 0;
+let unresolved = 0;
+let unchanged = 0;
+for (let i = 0; i < originalSegments.length; i++) {
+const stage1Recovered = stage1RecoveredAtIndex.get(i);
+if (stage1Recovered) {
+out.push(stage1Recovered);
+recovered++;
+details.push(recoveredDetailAtIndex.get(i) ?? {
+from: stage1Recovered.from,
+originalStartPreview: preview(originalSegments[i].content),
+recoveredStartPreview: preview(stage1Recovered.content),
+segmentIndex: i,
+status: "recovered",
+strategy: "stage1",
+to: stage1Recovered.to
+});
+continue;
+}
+const orig = originalSegments[i];
+const best = findBestFixedMatch(orig, fixedBuckets.get(segmentRangeKey(orig)) ?? [], fixedSegments, usedFixed, normalizeCompare);
+if (best.kind === "none") {
+out.push(orig);
+unresolved++;
+details.push(detailUnresolved(orig, i, ["no alignment candidate in rerun output for same (from,to)"]));
+continue;
+}
+if (best.kind === "ambiguous") {
+out.push(orig);
+unresolved++;
+details.push(detailUnresolved(orig, i, ["ambiguous alignment (score gap too small)"]));
+continue;
+}
+usedFixed.add(best.fixedIdx);
+const fixed = fixedSegments[best.fixedIdx];
+if (fixed.content === orig.content) {
+out.push(orig);
+unchanged++;
+details.push(detailSkippedIdempotent(orig, i, ["content already matches rerun output"]));
+continue;
+}
+out.push({
+...orig,
+content: fixed.content
+});
+recovered++;
+details.push(detailRecoveredRerun(orig, fixed, i));
+}
+return {
+details,
+segments: out,
+summary: {
+recovered,
+unchanged,
+unresolved
+}
+};
+};
+function recoverMistakenLineStartsAfterMarkers(pages, segments, options, selector, opts) {
+const mode = opts?.mode ?? "rerun_only";
+const normalizeCompare = opts?.normalizeCompare ?? "whitespace";
+const resolved = resolveSelectorToRuleIndices(options, selector);
+const reportBase = {
+byRun: void 0,
+errors: resolved.errors,
+warnings: resolved.warnings
+};
+if (resolved.indices.size === 0) return buildNoSelectionResult(segments, reportBase, mode, resolved.errors);
+const stage1 = runStage1IfEnabled(pages, segments, options, resolved.indices, mode);
+const fixedSegments = segmentPages(pages, buildFixedOptions(options, resolved.indices));
+const merged = mergeWithRerun({
+fixedBuckets: buildFixedBuckets(fixedSegments),
+fixedSegments,
+normalizeCompare,
+originalSegments: segments,
+recoveredDetailAtIndex: stage1.recoveredDetailAtIndex,
+stage1RecoveredAtIndex: stage1.recoveredAtIndex
+});
+return {
+report: {
+...reportBase,
+details: merged.details,
+summary: {
+mode,
+recovered: merged.summary.recovered,
+totalSegments: segments.length,
+unchanged: merged.summary.unchanged,
+unresolved: merged.summary.unresolved
+}
+},
+segments: merged.segments
+};
+}
+function recoverMistakenMarkersForRuns(runs, opts) {
+const allSegments = [];
+const byRun = [];
+const details = [];
+const warnings = [];
+const errors = [];
+let recovered = 0;
+let unchanged = 0;
+let unresolved = 0;
+let offset = 0;
+for (let i = 0; i < runs.length; i++) {
+const run = runs[i];
+const res = recoverMistakenLineStartsAfterMarkers(run.pages, run.segments, run.options, run.selector, opts);
+allSegments.push(...res.segments);
+for (const d of res.report.details) details.push({
+...d,
+segmentIndex: d.segmentIndex + offset
+});
+offset += run.segments.length;
+recovered += res.report.summary.recovered;
+unchanged += res.report.summary.unchanged;
+unresolved += res.report.summary.unresolved;
+warnings.push(...res.report.warnings);
+errors.push(...res.report.errors);
+byRun.push({
+recovered: res.report.summary.recovered,
+runIndex: i,
+totalSegments: run.segments.length,
+unresolved: res.report.summary.unresolved
+});
+}
+return {
+report: {
+byRun,
+details,
+errors,
+summary: {
+mode: opts?.mode ?? "rerun_only",
+recovered,
+totalSegments: offset,
+unchanged,
+unresolved
+},
+warnings
+},
+segments: allSegments
+};
+}
+
+//#endregion
+export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
 //# sourceMappingURL=index.mjs.map
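The other new exports, recoverMistakenLineStartsAfterMarkers and recoverMistakenMarkersForRuns, repair segments that were produced while a marker pattern was configured as lineStartsAfter (so the marker text was stripped from the segment content). A minimal sketch, not part of the diff: the argument and result shapes follow the code above, while the rule, pages, and selector values are placeholders.

import { recoverMistakenLineStartsAfterMarkers, segmentPages } from 'flappa-doormal';

const pages = [{ content: '١ - حدثنا فلان ...', id: 1 }];
const options = { rules: [{ lineStartsAfter: ['{{raqms}} '], split: 'at' }] };
const segments = segmentPages(pages, options);

// The selector can also be { type: 'patterns', patterns: [...], match: 'normalized' }
// or { type: 'predicate', predicate: (rule, index) => boolean }.
const { report, segments: repaired } = recoverMistakenLineStartsAfterMarkers(
    pages,
    segments,
    options,
    { indices: [0], type: 'rule_indices' },
    { mode: 'best_effort_then_rerun', normalizeCompare: 'whitespace' },
);
// report.summary has { mode, recovered, totalSegments, unchanged, unresolved }.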