flappa-doormal 2.6.3 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -156,1298 +156,1442 @@ const makeDiacriticInsensitive = (text) => {
156
156
  };
157
157
 
158
158
  //#endregion
159
- //#region src/segmentation/breakpoint-utils.ts
160
- const WINDOW_PREFIX_LENGTHS = [
161
- 80,
162
- 60,
163
- 40,
164
- 30,
165
- 20,
166
- 15
167
- ];
168
- const JOINER_PREFIX_LENGTHS = [
169
- 80,
170
- 60,
171
- 40,
172
- 30,
173
- 20,
174
- 15,
175
- 12,
176
- 10,
177
- 8,
178
- 6
179
- ];
159
+ //#region src/segmentation/tokens.ts
180
160
  /**
181
- * Normalizes a breakpoint to the object form.
182
- * Strings are converted to { pattern: str } with no constraints.
161
+ * Token-based template system for Arabic text pattern matching.
183
162
  *
184
- * @param bp - Breakpoint as string or object
185
- * @returns Normalized BreakpointRule object
163
+ * This module provides a human-readable way to define regex patterns using
164
+ * `{{token}}` placeholders that expand to their regex equivalents. It supports
165
+ * named capture groups for extracting matched values into metadata.
166
+ *
167
+ * @module tokens
186
168
  *
187
169
  * @example
188
- * normalizeBreakpoint('\\n\\n')
189
- * // { pattern: '\\n\\n' }
170
+ * // Simple token expansion
171
+ * expandTokens('{{raqms}} {{dash}}')
172
+ * // → '[\\u0660-\\u0669]+ [-–—ـ]'
190
173
  *
191
- * normalizeBreakpoint({ pattern: '\\n', min: 10 })
192
- * // { pattern: '\\n', min: 10 }
174
+ * @example
175
+ * // Named capture groups
176
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
177
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
193
178
  */
194
- const normalizeBreakpoint = (bp) => typeof bp === "string" ? { pattern: bp } : bp;
195
179
  /**
196
- * Checks if a page ID is in an excluded list (single pages or ranges).
180
+ * Token definitions mapping human-readable token names to regex patterns.
197
181
  *
198
- * @param pageId - Page ID to check
199
- * @param excludeList - List of page IDs or [from, to] ranges to exclude
200
- * @returns True if page is excluded
182
+ * Tokens are used in template strings with double-brace syntax:
183
+ * - `{{token}}` - Expands to the pattern (non-capturing in context)
184
+ * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
185
+ * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
201
186
  *
202
- * @example
203
- * isPageExcluded(5, [1, 5, 10])
204
- * // true
187
+ * @remarks
188
+ * These patterns are designed for Arabic text matching. For diacritic-insensitive
189
+ * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
190
+ * which applies `makeDiacriticInsensitive()` to the expanded patterns.
205
191
  *
206
- * isPageExcluded(5, [[3, 7]])
207
- * // true
192
+ * @example
193
+ * // Using tokens in a split rule
194
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
208
195
  *
209
- * isPageExcluded(5, [[10, 20]])
210
- * // false
196
+ * @example
197
+ * // Using tokens with named captures
198
+ * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
211
199
  */
212
- const isPageExcluded = (pageId, excludeList) => {
213
- if (!excludeList || excludeList.length === 0) return false;
214
- for (const item of excludeList) if (typeof item === "number") {
215
- if (pageId === item) return true;
216
- } else {
217
- const [from, to] = item;
218
- if (pageId >= from && pageId <= to) return true;
219
- }
220
- return false;
221
- };
222
200
  /**
223
- * Checks if a page ID is within a breakpoint's min/max range and not excluded.
201
+ * Escapes regex metacharacters (parentheses and brackets) in template patterns,
202
+ * but preserves content inside `{{...}}` token delimiters.
224
203
  *
225
- * @param pageId - Page ID to check
226
- * @param rule - Breakpoint rule with optional min/max/exclude constraints
227
- * @returns True if page is within valid range
204
+ * This allows users to write intuitive patterns like `({{harf}}):` instead of
205
+ * the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
206
+ * so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
207
+ *
208
+ * @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
209
+ * @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
228
210
  *
229
211
  * @example
230
- * isInBreakpointRange(50, { pattern: '\\n', min: 10, max: 100 })
231
- * // → true
212
+ * escapeTemplateBrackets('({{harf}}): ')
213
+ * // → '\\({{harf}}\\): '
232
214
  *
233
- * isInBreakpointRange(5, { pattern: '\\n', min: 10 })
234
- * // → false (below min)
215
+ * @example
216
+ * escapeTemplateBrackets('[{{raqm}}] ')
217
+ * // → '\\[{{raqm}}\\] '
218
+ *
219
+ * @example
220
+ * escapeTemplateBrackets('{{harf}}')
221
+ * // → '{{harf}}' (unchanged - no brackets outside tokens)
235
222
  */
236
- const isInBreakpointRange = (pageId, rule) => {
237
- if (rule.min !== void 0 && pageId < rule.min) return false;
238
- if (rule.max !== void 0 && pageId > rule.max) return false;
239
- return !isPageExcluded(pageId, rule.exclude);
223
/**
 * Backslash-escapes literal `(`, `)`, `[` and `]` in a template so that they
 * match themselves once the template is compiled to a regex.
 *
 * Anything wrapped in `{{...}}` is copied through untouched, so placeholders
 * survive this pass and can still be expanded afterwards.
 *
 * @param pattern - Template text, possibly containing `{{token}}` placeholders
 * @returns The template with every bracket outside a placeholder prefixed by `\`
 */
const escapeTemplateBrackets = (pattern) => {
  // Alternative 1 consumes a whole `{{...}}` placeholder; alternative 2 a single bracket.
  const placeholderOrBracket = /(\{\{[^}]*\}\})|([()[\]])/g;
  return pattern.replace(placeholderOrBracket, (_whole, placeholder, bracketChar) =>
    placeholder ? placeholder : `\\${bracketChar}`
  );
};
229
/**
 * One "rumuz" code: an alternation of fixed multi-letter codes, a single letter
 * from a fixed set, or a lone Arabic-Indic digit ٤.
 *
 * The `(?![\u064B-\u0652\u0670أ-ي])` lookahead rejects a match that is directly
 * followed by a diacritic or another letter in the أ-ي range; the digit
 * alternative is guarded on both sides so it never matches inside a longer number.
 * Longer codes are listed before their shorter prefixes (e.g. `تمييز` before `تم`).
 */
const RUMUZ_ATOM = "(?:" + [
  "تمييز(?![\\u064B-\\u0652\\u0670أ-ي])",
  "خت",
  "خغ",
  "بخ",
  "عخ",
  "مق",
  "مت",
  "عس",
  "سي",
  "سن",
  "كن",
  "مد",
  "قد",
  "خد",
  "فد",
  "دل",
  "كد",
  "غد",
  "صد",
  "دت",
  "دس",
  "تم",
  "فق",
  "دق",
  "[خرزيمنصسدفلتقع](?![\\u064B-\\u0652\\u0670أ-ي])",
  "(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669])"
].join("|") + ")";

/** One or more rumuz codes separated by whitespace. */
const RUMUZ_BLOCK = [RUMUZ_ATOM, "(?:\\s+", RUMUZ_ATOM, ")*"].join("");

/**
 * Primitive tokens: each key is a `{{name}}` usable in templates and each value
 * is the regex source text it expands to.
 */
const BASE_TOKENS = {
  // Literal word "باب"
  bab: "باب",
  // Spelled-out phrase or the single-glyph ligature
  basmalah: ["بسم الله", "﷽"].join("|"),
  // One bullet-like character
  bullet: "[•*°]",
  // Hyphen, en dash, em dash or tatweel
  dash: "[-–—ـ]",
  // Either of two heading words
  fasl: ["مسألة", "فصل"].join("|"),
  // A single letter in the أ-ي range
  harf: "[أ-ي]",
  // One or more such letters separated by whitespace
  harfs: "[أ-ي](?:\\s+[أ-ي])*",
  // Literal word "كتاب"
  kitab: "كتاب",
  // Any one of a fixed list of words, tried in the listed order
  naql: [
    "حدثني",
    "وأخبرنا",
    "حدثنا",
    "سمعت",
    "أنبأنا",
    "وحدثنا",
    "أخبرنا",
    "وحدثني",
    "وحدثنيه"
  ].join("|"),
  // A single Arabic-Indic digit (U+0660–U+0669)
  raqm: "[\\u0660-\\u0669]",
  // One or more Arabic-Indic digits
  raqms: "[\\u0660-\\u0669]+",
  // Whitespace-separated run of rumuz codes
  rumuz: RUMUZ_BLOCK,
  // One sentence-punctuation character (Latin or Arabic)
  tarqim: "[.!?؟؛]"
};
241
283
  /**
242
- * Builds an exclude set from a PageRange array for O(1) lookups.
284
+ * Composite token definitions using template syntax.
243
285
  *
244
- * @param excludeList - List of page IDs or [from, to] ranges
245
- * @returns Set of all excluded page IDs
286
+ * These tokens reference base tokens using `{{token}}` syntax and are
287
+ * automatically expanded to their final regex patterns at module load time.
246
288
  *
247
- * @remarks
248
- * This expands ranges into explicit page IDs for fast membership checks. For typical
249
- * book-scale inputs (thousands of pages), this is small and keeps downstream logic
250
- * simple and fast. If you expect extremely large ranges (e.g., millions of pages),
251
- * consider avoiding broad excludes or introducing a range-based membership structure.
289
+ * This provides better abstraction - if base tokens change, composites
290
+ * automatically update on the next build.
252
291
  *
253
- * @example
254
- * buildExcludeSet([1, 5, [10, 12]])
255
- * // → Set { 1, 5, 10, 11, 12 }
292
+ * @internal
256
293
  */
257
- const buildExcludeSet = (excludeList) => {
258
- const excludeSet = /* @__PURE__ */ new Set();
259
- for (const item of excludeList || []) if (typeof item === "number") excludeSet.add(item);
260
- else for (let i = item[0]; i <= item[1]; i++) excludeSet.add(i);
261
- return excludeSet;
262
- };
294
/**
 * Convenience tokens written as templates over other tokens rather than as raw regex.
 */
const COMPOSITE_TOKENS = {
  // Digits, a space, a dash and a trailing space
  numbered: "{{raqms}} {{dash}} ",
};
263
295
  /**
264
- * Creates a segment with optional to and meta fields.
265
- * Returns null if content is empty after trimming.
296
+ * Expands any *composite* tokens (like `{{numbered}}`) into their underlying template form
297
+ * (like `{{raqms}} {{dash}} `).
266
298
  *
267
- * @param content - Segment content
268
- * @param fromPageId - Starting page ID
269
- * @param toPageId - Optional ending page ID (omitted if same as from)
270
- * @param meta - Optional metadata to attach
271
- * @returns Segment object or null if empty
299
+ * This is useful when you want to take a signature produced by `analyzeCommonLineStarts()`
300
+ * and turn it into an editable template where you can add named captures, e.g.:
272
301
  *
273
- * @example
274
- * createSegment('Hello world', 1, 3, { chapter: 1 })
275
- * // → { content: 'Hello world', from: 1, to: 3, meta: { chapter: 1 } }
302
+ * - `{{numbered}}` → `{{raqms}} {{dash}} `
303
+ * - then: `{{raqms:num}} {{dash}} ` to capture the number
276
304
  *
277
- * createSegment(' ', 1, undefined, undefined)
278
- * // null (empty content)
305
+ * Notes:
306
+ * - This only expands the plain `{{token}}` form (not `{{token:name}}`).
307
+ * - Expansion is repeated a few times to support nested composites.
279
308
  */
280
- const createSegment = (content, fromPageId, toPageId, meta) => {
281
- const trimmed = content.trim();
282
- if (!trimmed) return null;
283
- const seg = {
284
- content: trimmed,
285
- from: fromPageId
286
- };
287
- if (toPageId !== void 0 && toPageId !== fromPageId) seg.to = toPageId;
288
- if (meta) seg.meta = meta;
289
- return seg;
309
/**
 * Rewrites composite placeholders (keys of `COMPOSITE_TOKENS`) into the template
 * text they stand for, leaving every other placeholder as written.
 *
 * Only the plain `{{token}}` form is considered; `{{token:name}}` is not touched.
 * The substitution is repeated (at most 10 passes) until the text stops changing,
 * so a composite whose definition mentions another composite is fully unfolded.
 *
 * @param template - Template string that may contain composite placeholders
 * @returns The template with composite placeholders replaced by their definitions
 */
const expandCompositeTokensInTemplate = (template) => {
  const hasOwn = Object.prototype.hasOwnProperty;
  let out = template;
  for (let i = 0; i < 10; i++) {
    const next = out.replace(/\{\{(\w+)\}\}/g, (placeholder, tokenName) => {
      // Own-property check: names such as `constructor` or `toString` would
      // otherwise resolve to inherited Object.prototype members and be
      // stringified into the template.
      if (!hasOwn.call(COMPOSITE_TOKENS, tokenName)) return placeholder;
      return COMPOSITE_TOKENS[tokenName] ?? placeholder;
    });
    if (next === out) break;
    out = next;
  }
  return out;
};
291
320
  /**
292
- * Expands breakpoint patterns and pre-computes exclude sets.
321
+ * Expands base tokens in a template string.
322
+ * Used internally to pre-expand composite tokens.
293
323
  *
294
- * @param breakpoints - Array of breakpoint patterns or rules
295
- * @param processPattern - Function to expand tokens in patterns
296
- * @returns Array of expanded breakpoints with compiled regexes
324
+ * @param template - Template string with `{{token}}` placeholders
325
+ * @returns Expanded pattern with base tokens replaced
326
+ * @internal
327
+ */
328
/**
 * Replaces each plain `{{token}}` placeholder whose name is a key of `BASE_TOKENS`
 * with that token's regex source. Placeholders with any other name are kept verbatim.
 *
 * @param template - Template string with `{{token}}` placeholders
 * @returns The template with known base tokens expanded
 */
const expandBaseTokens = (template) => {
  return template.replace(/\{\{(\w+)\}\}/g, (placeholder, tokenName) => {
    // Own-property check: without it `{{constructor}}`, `{{toString}}` etc.
    // resolve to inherited Object.prototype members, which `replace` would
    // stringify into the output.
    if (!Object.prototype.hasOwnProperty.call(BASE_TOKENS, tokenName)) return placeholder;
    return BASE_TOKENS[tokenName] ?? placeholder;
  });
};
333
+ /**
334
+ * Token definitions mapping human-readable token names to regex patterns.
335
+ *
336
+ * Tokens are used in template strings with double-brace syntax:
337
+ * - `{{token}}` - Expands to the pattern (non-capturing in context)
338
+ * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
339
+ * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
297
340
  *
298
341
  * @remarks
299
- * This function compiles regex patterns dynamically. This can be a ReDoS vector
300
- * if patterns come from untrusted sources. In typical usage, breakpoint rules
301
- * are application configuration, not user input.
302
- */
303
- const expandBreakpoints = (breakpoints, processPattern$1) => breakpoints.map((bp) => {
304
- const rule = normalizeBreakpoint(bp);
305
- const excludeSet = buildExcludeSet(rule.exclude);
306
- const skipWhenRegex = rule.skipWhen !== void 0 ? (() => {
307
- const expandedSkip = processPattern$1(rule.skipWhen);
308
- try {
309
- return new RegExp(expandedSkip, "mu");
310
- } catch (error) {
311
- const message = error instanceof Error ? error.message : String(error);
312
- throw new Error(`Invalid breakpoint skipWhen regex: ${rule.skipWhen}\n Cause: ${message}`);
313
- }
314
- })() : null;
315
- if (rule.pattern === "") return {
316
- excludeSet,
317
- regex: null,
318
- rule,
319
- skipWhenRegex
320
- };
321
- const expanded = processPattern$1(rule.pattern);
322
- try {
323
- return {
324
- excludeSet,
325
- regex: new RegExp(expanded, "gmu"),
326
- rule,
327
- skipWhenRegex
328
- };
329
- } catch (error) {
330
- const message = error instanceof Error ? error.message : String(error);
331
- throw new Error(`Invalid breakpoint regex: ${rule.pattern}\n Cause: ${message}`);
332
- }
333
- });
334
- /**
335
- * Applies a configured joiner at detected page boundaries within a multi-page content chunk.
342
+ * These patterns are designed for Arabic text matching. For diacritic-insensitive
343
+ * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
344
+ * which applies `makeDiacriticInsensitive()` to the expanded patterns.
336
345
  *
337
- * This is used for breakpoint-generated segments which don't have access to the original
338
- * `pageMap.pageBreaks` offsets. We detect page starts sequentially by searching for each page's
339
- * prefix after the previous boundary, then replace ONLY the single newline immediately before
340
- * that page start.
346
+ * @example
347
+ * // Using tokens in a split rule
348
+ * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
341
349
  *
342
- * This avoids converting real in-page newlines, while still normalizing page joins consistently.
343
- */
344
- const applyPageJoinerBetweenPages = (content, fromIdx, toIdx, pageIds, normalizedPages, joiner) => {
345
- if (joiner === "newline" || fromIdx >= toIdx || !content.includes("\n")) return content;
346
- let updated = content;
347
- let searchFrom = 0;
348
- for (let pi = fromIdx + 1; pi <= toIdx; pi++) {
349
- const pageData = normalizedPages.get(pageIds[pi]);
350
- if (!pageData) continue;
351
- const found = findPrefixPositionInContent(updated, pageData.content.trimStart(), searchFrom);
352
- if (found > 0 && updated[found - 1] === "\n") updated = `${updated.slice(0, found - 1)} ${updated.slice(found)}`;
353
- if (found > 0) searchFrom = found;
354
- }
355
- return updated;
356
- };
357
- /**
358
- * Finds the position of a page prefix in content, trying multiple prefix lengths.
359
- */
360
- const findPrefixPositionInContent = (content, trimmedPageContent, searchFrom) => {
361
- for (const len of JOINER_PREFIX_LENGTHS) {
362
- const prefix = trimmedPageContent.slice(0, Math.min(len, trimmedPageContent.length)).trim();
363
- if (!prefix) continue;
364
- const pos = content.indexOf(prefix, searchFrom);
365
- if (pos > 0) return pos;
366
- }
367
- return -1;
368
- };
369
- /**
370
- * Estimates how far into the current page `remainingContent` begins.
350
+ * @example
351
+ * // Using tokens with named captures
352
+ * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
371
353
  *
372
- * During breakpoint processing, `remainingContent` can begin mid-page after a previous split.
373
- * When that happens, raw cumulative page offsets (computed from full page starts) can overestimate
374
- * expected boundary positions. This helper computes an approximate starting offset by matching
375
- * a short prefix of `remainingContent` inside the current page content.
354
+ * @example
355
+ * // Using the numbered convenience token
356
+ * { lineStartsAfter: ['{{numbered}}'], split: 'at' }
376
357
  */
377
- const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, pageIds, normalizedPages) => {
378
- const currentPageData = normalizedPages.get(pageIds[currentFromIdx]);
379
- if (!currentPageData) return 0;
380
- const remStart = remainingContent.trimStart().slice(0, Math.min(60, remainingContent.length));
381
- const needle = remStart.slice(0, Math.min(30, remStart.length));
382
- if (!needle) return 0;
383
- const idx = currentPageData.content.indexOf(needle);
384
- return idx > 0 ? idx : 0;
358
/**
 * Full token table: every base token, plus every composite token with the base
 * placeholders inside its definition already expanded to regex source.
 * A composite entry overrides a base entry of the same name.
 */
const TOKEN_PATTERNS = Object.assign(
  {},
  BASE_TOKENS,
  Object.fromEntries(
    Object.entries(COMPOSITE_TOKENS).map(([name, compositeTemplate]) => [name, expandBaseTokens(compositeTemplate)])
  )
);
386
362
  /**
387
- * Attempts to find the start position of a target page within remainingContent,
388
- * anchored near an expected boundary position to reduce collisions.
363
+ * Regex pattern for matching tokens with optional named capture syntax.
389
364
  *
390
- * This is used to define breakpoint windows in terms of actual content being split, rather than
391
- * raw per-page offsets which can desync when structural rules strip markers.
365
+ * Matches:
366
+ * - `{{token}}` - Simple token (group 1 = token name, group 2 = empty)
367
+ * - `{{token:name}}` - Token with capture (group 1 = token, group 2 = name)
368
+ * - `{{:name}}` - Capture-only (group 1 = empty, group 2 = name)
369
+ *
370
+ * @internal
392
371
  */
393
- const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
394
- const targetPageData = normalizedPages.get(pageIds[targetPageIdx]);
395
- if (!targetPageData) return -1;
396
- const approx = Math.min(Math.max(0, expectedBoundary), remainingContent.length);
397
- const searchStart = Math.max(0, approx - 1e4);
398
- const searchEnd = Math.min(remainingContent.length, approx + 2e3);
399
- const targetTrimmed = targetPageData.content.trimStart();
400
- for (const len of WINDOW_PREFIX_LENGTHS) {
401
- const prefix = targetTrimmed.slice(0, Math.min(len, targetTrimmed.length)).trim();
402
- if (!prefix) continue;
403
- let pos = remainingContent.indexOf(prefix, searchStart);
404
- while (pos !== -1 && pos <= searchEnd) {
405
- if (pos > 0 && /\s/.test(remainingContent[pos - 1] ?? "")) return pos;
406
- pos = remainingContent.indexOf(prefix, pos + 1);
407
- }
408
- const last = remainingContent.lastIndexOf(prefix, approx);
409
- if (last > 0) return last;
410
- }
411
- return -1;
412
- };
372
// Matches `{{token}}`, `{{token:name}}` and `{{:name}}`; group 1 is the token name
// and group 2 the capture name, either of which may be the empty string (so a bare
// `{{}}` also matches). The regex is global and therefore stateful: set
// `lastIndex = 0` before each use with `exec`.
const TOKEN_WITH_CAPTURE_REGEX = /\{\{(\w*):?(\w*)\}\}/g;
413
373
  /**
414
- * Builds a boundary position map for pages within the given range.
415
- *
416
- * This function computes page boundaries once per segment and enables
417
- * O(log n) page lookups via binary search with `findPageIndexForPosition`.
418
- *
419
- * Boundaries are derived from segmentContent (post-structural-rules).
420
- * When the segment starts mid-page, an offset correction is applied to
421
- * keep boundary estimates aligned with the segment's actual content space.
374
+ * Regex pattern for simple token matching (no capture syntax).
422
375
  *
423
- * @param segmentContent - Full segment content (already processed by structural rules)
424
- * @param fromIdx - Starting page index
425
- * @param toIdx - Ending page index
426
- * @param pageIds - Array of all page IDs
427
- * @param normalizedPages - Map of page ID to normalized content
428
- * @param cumulativeOffsets - Cumulative character offsets (for estimates)
429
- * @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
430
- * with a sentinel boundary at segmentContent.length as the last element
376
+ * Matches only `{{token}}` format where token is one or more word characters.
377
+ * Used by `containsTokens()` for quick detection.
431
378
  *
432
- * @example
433
- * // For a 3-page segment:
434
- * buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
435
- * // → [0, 23, 45, 67] where 67 is content.length (sentinel)
379
+ * @internal
436
380
  */
437
- const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
438
- const boundaryPositions = [0];
439
- const startOffsetInFromPage = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
440
- for (let i = fromIdx + 1; i <= toIdx; i++) {
441
- const expectedBoundary = cumulativeOffsets[i] !== void 0 && cumulativeOffsets[fromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[i] - cumulativeOffsets[fromIdx] - startOffsetInFromPage) : segmentContent.length;
442
- const pos = findPageStartNearExpectedBoundary(segmentContent, fromIdx, i, expectedBoundary, pageIds, normalizedPages);
443
- const prevBoundary = boundaryPositions[boundaryPositions.length - 1];
444
- if (pos > 0 && pos > prevBoundary && Math.abs(pos - expectedBoundary) < 2e3) boundaryPositions.push(pos);
445
- else {
446
- const estimate = Math.max(prevBoundary + 1, expectedBoundary);
447
- boundaryPositions.push(Math.min(estimate, segmentContent.length));
448
- }
449
- }
450
- boundaryPositions.push(segmentContent.length);
451
- return boundaryPositions;
452
- };
381
// Matches only the plain `{{token}}` form (one or more word characters, no `:name`
// part). The regex is global and therefore stateful: set `lastIndex = 0` before
// calling `test`/`exec` on it.
const SIMPLE_TOKEN_REGEX = /\{\{(\w+)\}\}/g;
453
382
  /**
454
- * Binary search to find which page a position falls within.
455
- * Uses "largest i where boundaryPositions[i] <= position" semantics.
383
+ * Checks if a query string contains template tokens.
456
384
  *
457
- * @param position - Character position in segmentContent
458
- * @param boundaryPositions - Precomputed boundary positions (from buildBoundaryPositions)
459
- * @param fromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[fromIdx])
460
- * @returns Page index in pageIds array
385
+ * Performs a quick test for `{{token}}` patterns without actually
386
+ * expanding them. Useful for determining whether to apply token
387
+ * expansion to a string.
388
+ *
389
+ * @param query - String to check for tokens
390
+ * @returns `true` if the string contains at least one `{{token}}` pattern
461
391
  *
462
392
  * @example
463
- * // With boundaries [0, 20, 40, 60] and fromIdx=0:
464
- * findPageIndexForPosition(15, boundaries, 0) // → 0 (first page)
465
- * findPageIndexForPosition(25, boundaries, 0) // → 1 (second page)
466
- * findPageIndexForPosition(40, boundaries, 0) // → 2 (exactly on boundary = that page)
393
+ * containsTokens('{{raqms}} {{dash}}') // → true
394
+ * containsTokens('plain text') // → false
395
+ * containsTokens('[٠-٩]+ - ') // → false (raw regex, no tokens)
467
396
  */
468
- const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
469
- if (boundaryPositions.length <= 1) return fromIdx;
470
- let left = 0;
471
- let right = boundaryPositions.length - 2;
472
- while (left < right) {
473
- const mid = Math.ceil((left + right) / 2);
474
- if (boundaryPositions[mid] <= position) left = mid;
475
- else right = mid - 1;
476
- }
477
- return fromIdx + left;
397
/**
 * Reports whether `query` contains at least one plain `{{token}}` placeholder.
 *
 * @param query - String to inspect
 * @returns `true` when a `{{token}}` placeholder is present, otherwise `false`
 */
const containsTokens = (query) => {
  // The shared regex is global, so rewind it before testing to make the
  // result independent of any previous call.
  SIMPLE_TOKEN_REGEX.lastIndex = 0;
  const found = SIMPLE_TOKEN_REGEX.test(query);
  return found;
};
479
- /**
480
- * Finds the end position of a breakpoint window inside `remainingContent`.
481
- *
482
- * The window end is defined as the start of the page AFTER `windowEndIdx` (i.e. `windowEndIdx + 1`),
483
- * found within the actual `remainingContent` string being split. This avoids relying on raw page offsets
484
- * that can diverge when structural rules strip markers (e.g. `lineStartsAfter`).
485
- */
486
- const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
487
- if (windowEndIdx >= toIdx) return remainingContent.length;
488
- const desiredNextIdx = windowEndIdx + 1;
489
- const minNextIdx = currentFromIdx + 1;
490
- const maxNextIdx = Math.min(desiredNextIdx, toIdx);
491
- const startOffsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
492
- for (let nextIdx = maxNextIdx; nextIdx >= minNextIdx; nextIdx--) {
493
- const expectedBoundary = cumulativeOffsets[nextIdx] !== void 0 && cumulativeOffsets[currentFromIdx] !== void 0 ? Math.max(0, cumulativeOffsets[nextIdx] - cumulativeOffsets[currentFromIdx] - startOffsetInCurrentPage) : remainingContent.length;
494
- const pos = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, nextIdx, expectedBoundary, pageIds, normalizedPages);
495
- if (pos > 0) return pos;
401
/**
 * Cuts a template into an ordered list of pieces: `{ type: "token", value }` for
 * each `{{...}}` placeholder and `{ type: "text", value }` for the non-empty
 * stretches between, before and after them.
 *
 * @param query - Template string
 * @returns Segments in source order; concatenating their values reproduces `query`
 */
const splitTemplateIntoSegments = (query) => {
  const pieces = [];
  let cursor = 0;
  TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
  for (const found of query.matchAll(TOKEN_WITH_CAPTURE_REGEX)) {
    const start = found.index;
    const literal = found[0];
    // Text sitting between the previous placeholder and this one.
    if (start > cursor) pieces.push({ type: "text", value: query.slice(cursor, start) });
    pieces.push({ type: "token", value: literal });
    cursor = start + literal.length;
  }
  // Trailing text after the last placeholder (or the whole string if there was none).
  if (cursor < query.length) pieces.push({ type: "text", value: query.slice(cursor) });
  return pieces;
};
499
- /**
500
- * Finds exclusion-based break position using raw cumulative offsets.
501
- *
502
- * This is used to ensure pages excluded by breakpoints are never merged into the same output segment.
503
- * Returns a break position relative to the start of `remainingContent` (i.e. the currentFromIdx start).
504
- */
505
- const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets) => {
506
- const startingPageId = pageIds[currentFromIdx];
507
- if (expandedBreakpoints.some((bp) => bp.excludeSet.has(startingPageId)) && currentFromIdx < toIdx) return cumulativeOffsets[currentFromIdx + 1] - cumulativeOffsets[currentFromIdx];
508
- for (let pageIdx = currentFromIdx + 1; pageIdx <= windowEndIdx; pageIdx++) {
509
- const pageId = pageIds[pageIdx];
510
- if (expandedBreakpoints.some((bp) => bp.excludeSet.has(pageId))) return cumulativeOffsets[pageIdx] - cumulativeOffsets[currentFromIdx];
511
- }
512
- return -1;
423
/**
 * Runs `fuzzyTransform` over `text` only when a transform was supplied and the
 * text contains at least one character in U+0600–U+06FF; otherwise returns
 * `text` unchanged.
 *
 * @param text - Literal text taken from between placeholders
 * @param fuzzyTransform - Optional transform function
 * @returns Transformed or original text
 */
const maybeApplyFuzzyToText = (text, fuzzyTransform) => {
  if (!fuzzyTransform) return text;
  const hasArabic = /[\u0600-\u06FF]/u.test(text);
  return hasArabic ? fuzzyTransform(text) : text;
};
514
- /**
515
- * Checks if any page in a range is excluded by the given exclude set.
427
/**
 * Applies `fuzzyTransform` to a token pattern one `|`-separated piece at a time.
 * Pieces containing no character in U+0600–U+06FF are kept unchanged. With no
 * transform the pattern is returned untouched.
 *
 * @param tokenPattern - Regex source for a token
 * @param fuzzyTransform - Optional transform function
 * @returns The pattern with qualifying pieces transformed and re-joined with `|`
 */
const maybeApplyFuzzyToTokenPattern = (tokenPattern, fuzzyTransform) => {
  if (!fuzzyTransform) return tokenPattern;
  const rebuilt = [];
  for (const piece of tokenPattern.split("|")) {
    rebuilt.push(/[\u0600-\u06FF]/u.test(piece) ? fuzzyTransform(piece) : piece);
  }
  return rebuilt.join("|");
};
431
/**
 * Extracts the token name and capture name from the first placeholder found in
 * `literal`.
 *
 * @param literal - Text expected to hold a `{{token}}`, `{{token:name}}` or `{{:name}}`
 * @returns `{ tokenName, captureName }` (either may be `""`), or `null` when no
 *          placeholder is present
 */
const parseTokenLiteral = (literal) => {
  // Shared global regex: rewind before use.
  TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
  const hit = TOKEN_WITH_CAPTURE_REGEX.exec(literal);
  if (hit === null) return null;
  return { captureName: hit[2], tokenName: hit[1] };
};
441
/**
 * Creates a registry that hands out unique capture-group names.
 *
 * The first request for a base name returns it as-is; repeats get a numeric
 * suffix (`name_2`, `name_3`, …). If a suffixed candidate is already in use —
 * for example because `name_2` was registered explicitly — the counter keeps
 * advancing until a free name is found, so the registry never issues the same
 * name twice.
 *
 * @param capturePrefix - Optional string prepended to every issued name
 * @returns `captureNames` (issued names, in registration order; appended to by
 *          `register`) and `register(baseName)`, which returns the issued name
 */
const createCaptureRegistry = (capturePrefix) => {
  const captureNames = [];
  const captureNameCounts = /* @__PURE__ */ new Map();
  // Every unprefixed name issued so far, used to detect suffix collisions.
  const issuedNames = /* @__PURE__ */ new Set();
  const register = (baseName) => {
    let count = captureNameCounts.get(baseName) ?? 0;
    let uniqueName;
    do {
      count += 1;
      uniqueName = count === 1 ? baseName : `${baseName}_${count}`;
    } while (issuedNames.has(uniqueName));
    captureNameCounts.set(baseName, count);
    issuedNames.add(uniqueName);
    const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
    captureNames.push(prefixedName);
    return prefixedName;
  };
  return {
    captureNames,
    register
  };
};
457
/**
 * Expands a single placeholder literal into regex source.
 *
 * - `{{:name}}`       → `(?<name>.+)`
 * - `{{token}}`       → the token's pattern
 * - `{{token:name}}`  → `(?<name>pattern)`
 *
 * Text that is not a placeholder, or whose token name is not a key of
 * `TOKEN_PATTERNS`, is returned unchanged.
 *
 * @param literal - Placeholder text such as `{{raqms:num}}`
 * @param opts - `registerCapture(name)` returns the group name to emit;
 *               `fuzzyTransform` (optional) is applied to the token pattern
 * @returns Regex source for the placeholder, or `literal` when it cannot be expanded
 */
const expandTokenLiteral = (literal, opts) => {
  const parsed = parseTokenLiteral(literal);
  if (!parsed) return literal;
  const { tokenName, captureName } = parsed;
  if (!tokenName && captureName) return `(?<${opts.registerCapture(captureName)}>.+)`;
  // Own-property check: names like `constructor` or `toString` must not resolve
  // to inherited Object.prototype members (a function here would be stringified
  // into the pattern, or crash on `.split` when a fuzzy transform is supplied).
  if (!Object.prototype.hasOwnProperty.call(TOKEN_PATTERNS, tokenName)) return literal;
  const basePattern = TOKEN_PATTERNS[tokenName];
  if (!basePattern) return literal;
  const tokenPattern = maybeApplyFuzzyToTokenPattern(basePattern, opts.fuzzyTransform);
  if (captureName) return `(?<${opts.registerCapture(captureName)}>${tokenPattern})`;
  return tokenPattern;
};
468
+ /**
469
+ * Expands template tokens with support for named captures.
516
470
  *
517
- * @param excludeSet - Set of excluded page IDs
518
- * @param pageIds - Array of page IDs
519
- * @param fromIdx - Start index (inclusive)
520
- * @param toIdx - End index (inclusive)
521
- * @returns True if any page in range is excluded
471
+ * This is the primary token expansion function that handles all token syntax:
472
+ * - `{{token}}` Expands to the token's pattern (no capture group)
473
+ * - `{{token:name}}` Expands to `(?<name>pattern)` (named capture)
474
+ * - `{{:name}}` Expands to `(?<name>.+)` (capture anything)
475
+ *
476
+ * Unknown tokens are left as-is in the output, allowing for partial templates.
477
+ *
478
+ * @param query - The template string containing tokens
479
+ * @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
480
+ * Applied to both token patterns and plain Arabic text between tokens.
481
+ * Typically `makeDiacriticInsensitive` from the fuzzy module.
482
+ * @returns Object with expanded pattern, capture names, and capture flag
483
+ *
484
+ * @example
485
+ * // Simple token expansion
486
+ * expandTokensWithCaptures('{{raqms}} {{dash}}')
487
+ * // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
488
+ *
489
+ * @example
490
+ * // Named capture
491
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
492
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
493
+ *
494
+ * @example
495
+ * // Capture-only token
496
+ * expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
497
+ * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
498
+ *
499
+ * @example
500
+ * // With fuzzy transform
501
+ * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
502
+ * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
522
503
  */
523
- const hasExcludedPageInRange = (excludeSet, pageIds, fromIdx, toIdx) => {
524
- if (excludeSet.size === 0) return false;
525
- for (let pageIdx = fromIdx; pageIdx <= toIdx; pageIdx++) if (excludeSet.has(pageIds[pageIdx])) return true;
526
- return false;
504
/**
 * Expands every placeholder in `query` and reports which named capture groups
 * the resulting pattern contains.
 *
 * Literal text between placeholders goes through `maybeApplyFuzzyToText`;
 * placeholders go through `expandTokenLiteral`, which registers capture names
 * in a registry created for this call.
 *
 * @param query - Template string
 * @param fuzzyTransform - Optional transform forwarded to the text/token helpers
 * @param capturePrefix - Optional prefix for generated capture-group names
 * @returns `{ pattern, captureNames, hasCaptures }`
 */
const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
  const segments = splitTemplateIntoSegments(query);
  const { captureNames, register } = createCaptureRegistry(capturePrefix);
  const expandedParts = [];
  for (const { type, value } of segments) {
    if (type === "text") {
      expandedParts.push(maybeApplyFuzzyToText(value, fuzzyTransform));
      continue;
    }
    expandedParts.push(expandTokenLiteral(value, {
      capturePrefix,
      fuzzyTransform,
      registerCapture: register
    }));
  }
  const pattern = expandedParts.join("");
  return {
    captureNames,
    hasCaptures: captureNames.length > 0,
    pattern
  };
};
528
521
  /**
529
- * Finds the position of the next page content within remaining content.
530
- * Returns -1 if not found.
522
+ * Expands template tokens in a query string to their regex equivalents.
531
523
  *
532
- * @param remainingContent - Content to search in
533
- * @param nextPageData - Normalized data for the next page
534
- * @returns Position of next page content, or -1 if not found
524
+ * This is the simple version without capture support. It returns only the
525
+ * expanded pattern string, not capture metadata.
526
+ *
527
+ * Unknown tokens are left as-is, allowing for partial templates.
528
+ *
529
+ * @param query - Template string containing `{{token}}` placeholders
530
+ * @returns Expanded regex pattern string
531
+ *
532
+ * @example
533
+ * expandTokens('، {{raqms}}') // → '، [\\u0660-\\u0669]+'
534
+ * expandTokens('{{raqm}}*') // → '[\\u0660-\\u0669]*'
535
+ * expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
536
+ * expandTokens('{{unknown}}') // → '{{unknown}}' (left as-is)
537
+ *
538
+ * @see expandTokensWithCaptures for full capture group support
535
539
  */
536
- const findNextPagePosition = (remainingContent, nextPageData) => {
537
- const searchPrefix = nextPageData.content.trim().slice(0, Math.min(30, nextPageData.length));
538
- if (searchPrefix.length === 0) return -1;
539
- const pos = remainingContent.indexOf(searchPrefix);
540
- return pos > 0 ? pos : -1;
541
- };
540
+ const expandTokens = (query) => expandTokensWithCaptures(query).pattern;
542
541
  /**
543
- * Finds matches within a window and returns the selected position based on preference.
542
+ * Converts a template string to a compiled RegExp.
544
543
  *
545
- * @param windowContent - Content to search
546
- * @param regex - Regex to match
547
- * @param prefer - 'longer' for last match, 'shorter' for first match
548
- * @returns Break position after the selected match, or -1 if no matches
544
+ * Expands all tokens and attempts to compile the result as a RegExp
545
+ * with Unicode flag. Returns `null` if the resulting pattern is invalid.
546
+ *
547
+ * @remarks
548
+ * This function dynamically compiles regular expressions from template strings.
549
+ * If templates may come from untrusted sources, be aware of potential ReDoS
550
+ * (Regular Expression Denial of Service) risks due to catastrophic backtracking.
551
+ * Consider validating pattern complexity or applying execution timeouts when
552
+ * running user-submitted patterns.
553
+ *
554
+ * @param template - Template string containing `{{token}}` placeholders
555
+ * @returns Compiled RegExp with 'u' flag, or `null` if invalid
556
+ *
557
+ * @example
558
+ * templateToRegex('، {{raqms}}') // → /، [٠-٩]+/u
559
+ * templateToRegex('{{raqms}}+') // → null ('[٠-٩]++' stacks two quantifiers, which JavaScript rejects as "nothing to repeat")
560
+ * templateToRegex('(((') // → null (invalid regex)
549
561
  */
550
- const findPatternBreakPosition = (windowContent, regex, prefer) => {
551
- let first;
552
- let last;
553
- for (const m of windowContent.matchAll(regex)) {
554
- const match = {
555
- index: m.index,
556
- length: m[0].length
557
- };
558
- if (!first) first = match;
559
- last = match;
562
+ const templateToRegex = (template) => {
563
+ const expanded = expandTokens(template);
564
+ try {
565
+ return new RegExp(expanded, "u");
566
+ } catch {
567
+ return null;
560
568
  }
561
- if (!first) return -1;
562
- const selected = prefer === "longer" ? last : first;
563
- return selected.index + selected.length;
564
569
  };
565
570
  /**
566
- * Handles page boundary breakpoint (empty pattern).
567
- * Returns break position or -1 if no valid position found.
571
+ * Lists all available token names defined in `TOKEN_PATTERNS`.
572
+ *
573
+ * Useful for documentation, validation, or building user interfaces
574
+ * that show available tokens.
575
+ *
576
+ * @returns Array of token names (e.g., `['bab', 'basmalah', 'bullet', ...]`)
577
+ *
578
+ * @example
579
+ * getAvailableTokens()
580
+ * // → e.g. ['bab', 'basmalah', 'bullet', 'dash', 'fasl', 'harf', 'kitab', 'naql', 'raqm', 'raqms', ...] (illustrative; the actual list and order follow `TOKEN_PATTERNS`)
568
581
  */
569
- const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages) => {
570
- const nextPageIdx = windowEndIdx + 1;
571
- if (nextPageIdx <= toIdx) {
572
- const nextPageData = normalizedPages.get(pageIds[nextPageIdx]);
573
- if (nextPageData) {
574
- const pos = findNextPagePosition(remainingContent, nextPageData);
575
- if (pos > 0) return Math.min(pos, windowEndPosition, remainingContent.length);
576
- }
577
- }
578
- return Math.min(windowEndPosition, remainingContent.length);
579
- };
582
+ const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
580
583
  /**
581
- * Tries to find a break position within the current window using breakpoint patterns.
582
- * Returns the break position or -1 if no suitable break was found.
584
+ * Gets the regex pattern for a specific token name.
583
585
  *
584
- * @param remainingContent - Content remaining to be segmented
585
- * @param currentFromIdx - Current starting page index
586
- * @param toIdx - Ending page index
587
- * @param windowEndIdx - Maximum window end index
588
- * @param ctx - Breakpoint context with page data and patterns
589
- * @returns Break position in the content, or -1 if no break found
586
+ * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
587
+ * without any expansion or capture group wrapping.
588
+ *
589
+ * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
590
+ * @returns The regex pattern string, or `undefined` if token doesn't exist
591
+ *
592
+ * @example
593
+ * getTokenPattern('raqms') // → '[\\u0660-\\u0669]+'
594
+ * getTokenPattern('dash') // → '[-–—ـ]'
595
+ * getTokenPattern('unknown') // → undefined
590
596
  */
591
- const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
592
- const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
593
- for (const { rule, regex, excludeSet, skipWhenRegex } of expandedBreakpoints) {
594
- if (!isInBreakpointRange(pageIds[currentFromIdx], rule)) continue;
595
- if (hasExcludedPageInRange(excludeSet, pageIds, currentFromIdx, windowEndIdx)) continue;
596
- if (skipWhenRegex?.test(remainingContent)) continue;
597
- if (regex === null) return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
598
- const breakPos = findPatternBreakPosition(remainingContent.slice(0, Math.min(windowEndPosition, remainingContent.length)), regex, prefer);
599
- if (breakPos > 0) return breakPos;
600
- }
601
- return -1;
597
+ const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
598
+ /**
599
+ * Regex to detect fuzzy-default tokens in a pattern string.
600
+ * Matches {{token}} or {{token:name}} syntax.
601
+ */
602
+ const FUZZY_TOKEN_REGEX = new RegExp(`\\{\\{(?:${[
603
+ "bab",
604
+ "basmalah",
605
+ "fasl",
606
+ "kitab",
607
+ "naql"
608
+ ].join("|")})(?::\\w+)?\\}\\}`, "g");
609
+ /**
610
+ * Checks if a pattern (or array of patterns) contains tokens that should
611
+ * default to fuzzy matching.
612
+ *
613
+ * Fuzzy-default tokens are: bab, basmalah, fasl, kitab, naql
614
+ *
615
+ * @param patterns - Single pattern string or array of pattern strings
616
+ * @returns `true` if any pattern contains a fuzzy-default token
617
+ *
618
+ * @example
619
+ * shouldDefaultToFuzzy('{{bab}} الإيمان') // true
620
+ * shouldDefaultToFuzzy('{{raqms}} {{dash}}') // false
621
+ * shouldDefaultToFuzzy(['{{kitab}}', '{{raqms}}']) // true
622
+ */
623
+ const shouldDefaultToFuzzy = (patterns) => {
624
+ return (Array.isArray(patterns) ? patterns : [patterns]).some((p) => {
625
+ FUZZY_TOKEN_REGEX.lastIndex = 0;
626
+ return FUZZY_TOKEN_REGEX.test(p);
627
+ });
602
628
  };
603
629
 
604
630
  //#endregion
605
- //#region src/segmentation/breakpoint-processor.ts
631
+ //#region src/segmentation/pattern-validator.ts
606
632
  /**
607
- * Breakpoint post-processing engine extracted from segmenter.ts.
633
+ * Pattern validation utilities for detecting common mistakes in rule patterns.
608
634
  *
609
- * This module is intentionally split into small helpers to reduce cognitive complexity
610
- * and allow unit testing of tricky edge cases (window sizing, next-page advancement, etc.).
635
+ * These utilities help catch typos and issues early, before rules are used
636
+ * for segmentation.
611
637
  */
612
- const buildPageIdToIndexMap = (pageIds) => new Map(pageIds.map((id, i) => [id, i]));
613
- const buildNormalizedPagesMap = (pages, normalizedContent) => {
614
- const normalizedPages = /* @__PURE__ */ new Map();
615
- for (let i = 0; i < pages.length; i++) {
616
- const content = normalizedContent[i];
617
- normalizedPages.set(pages[i].id, {
618
- content,
619
- index: i,
620
- length: content.length
621
- });
622
- }
623
- return normalizedPages;
638
+ const KNOWN_TOKENS = new Set(getAvailableTokens());
639
+ const TOKEN_INSIDE_BRACES = /\{\{(\w+)(?::\w+)?\}\}/g;
640
+ const buildBareTokenRegex = () => {
641
+ const tokens = [...KNOWN_TOKENS].sort((a, b) => b.length - a.length);
642
+ return new RegExp(`(?<!\\{\\{)(${tokens.join("|")})(?::\\w+)?(?!\\}\\})`, "g");
624
643
  };
625
- const buildCumulativeOffsets = (pageIds, normalizedPages) => {
626
- const cumulativeOffsets = [0];
627
- let totalOffset = 0;
628
- for (let i = 0; i < pageIds.length; i++) {
629
- const pageData = normalizedPages.get(pageIds[i]);
630
- totalOffset += pageData ? pageData.length : 0;
631
- if (i < pageIds.length - 1) totalOffset += 1;
632
- cumulativeOffsets.push(totalOffset);
633
- }
634
- return cumulativeOffsets;
635
- };
636
- const hasAnyExclusionsInRange = (expandedBreakpoints, pageIds, fromIdx, toIdx) => expandedBreakpoints.some((bp) => hasExcludedPageInRange(bp.excludeSet, pageIds, fromIdx, toIdx));
637
- const computeWindowEndIdx = (currentFromIdx, toIdx, pageIds, maxPages) => {
638
- const maxWindowPageId = pageIds[currentFromIdx] + maxPages;
639
- let windowEndIdx = currentFromIdx;
640
- for (let i = currentFromIdx; i <= toIdx; i++) if (pageIds[i] <= maxWindowPageId) windowEndIdx = i;
641
- else break;
642
- return windowEndIdx;
643
- };
644
- const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => pageIds[toIdx] - pageIds[currentFromIdx];
645
- const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => createSegment(remainingContent, pageIds[currentFromIdx], currentFromIdx !== toIdx ? pageIds[toIdx] : void 0, includeMeta ? meta : void 0);
646
644
  /**
647
- * Computes the actual start and end page indices for a piece using
648
- * precomputed boundary positions and binary search.
649
- *
650
- * @param pieceStartPos - Start position of the piece in the full segment content
651
- * @param pieceEndPos - End position (exclusive) of the piece
652
- * @param boundaryPositions - Precomputed boundary positions from buildBoundaryPositions
653
- * @param baseFromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[baseFromIdx])
654
- * @param toIdx - Maximum page index
655
- * @returns Object with actualStartIdx and actualEndIdx
645
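+ * Validates a single pattern for common issues, returning the first issue found (duplicate, unknown token, or missing braces) or `undefined` if there is none.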
656
646
  */
657
- const computePiecePages = (pieceStartPos, pieceEndPos, boundaryPositions, baseFromIdx, toIdx) => {
658
- const actualStartIdx = findPageIndexForPosition(pieceStartPos, boundaryPositions, baseFromIdx);
659
- const endPos = Math.max(pieceStartPos, pieceEndPos - 1);
660
- return {
661
- actualEndIdx: Math.min(findPageIndexForPosition(endPos, boundaryPositions, baseFromIdx), toIdx),
662
- actualStartIdx
647
+ const validatePattern = (pattern, seenPatterns) => {
648
+ if (seenPatterns.has(pattern)) return {
649
+ message: `Duplicate pattern: "${pattern}"`,
650
+ type: "duplicate"
663
651
  };
664
- };
665
- const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages) => {
666
- let nextFromIdx = actualEndIdx;
667
- if (remainingContent && actualEndIdx + 1 <= toIdx) {
668
- const nextPageData = normalizedPages.get(pageIds[actualEndIdx + 1]);
669
- if (nextPageData) {
670
- const nextPrefix = nextPageData.content.slice(0, Math.min(30, nextPageData.length));
671
- const remainingPrefix = remainingContent.trimStart().slice(0, Math.min(30, remainingContent.length));
672
- if (nextPrefix && (remainingContent.startsWith(nextPrefix) || nextPageData.content.startsWith(remainingPrefix))) nextFromIdx = actualEndIdx + 1;
673
- }
652
+ seenPatterns.add(pattern);
653
+ const tokensInBraces = [...pattern.matchAll(TOKEN_INSIDE_BRACES)];
654
+ for (const match of tokensInBraces) {
655
+ const tokenName = match[1];
656
+ if (!KNOWN_TOKENS.has(tokenName)) return {
657
+ message: `Unknown token: {{${tokenName}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
658
+ suggestion: `Check spelling or use a known token`,
659
+ type: "unknown_token"
660
+ };
674
661
  }
675
- return nextFromIdx;
676
- };
677
- const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, includeMeta) => createSegment(pieceContent, pageIds[actualStartIdx], actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : void 0, includeMeta ? meta : void 0);
678
- /**
679
- * Finds the break offset within a window, trying exclusions first, then patterns.
680
- *
681
- * @returns Break offset relative to remainingContent, or windowEndPosition as fallback
682
- */
683
- const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
684
- if (hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx)) {
685
- const exclusionBreak = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
686
- if (exclusionBreak > 0) return exclusionBreak;
662
+ const bareTokenRegex = buildBareTokenRegex();
663
+ const bareMatches = [...pattern.matchAll(bareTokenRegex)];
664
+ for (const match of bareMatches) {
665
+ const tokenName = match[1];
666
+ const fullMatch = match[0];
667
+ const matchIndex = match.index;
668
+ const before = pattern.slice(Math.max(0, matchIndex - 2), matchIndex);
669
+ const after = pattern.slice(matchIndex + fullMatch.length, matchIndex + fullMatch.length + 2);
670
+ if (before !== "{{" && after !== "}}") return {
671
+ message: `Token "${tokenName}" appears to be missing {{}}. Did you mean "{{${fullMatch}}}"?`,
672
+ suggestion: `{{${fullMatch}}}`,
673
+ type: "missing_braces"
674
+ };
687
675
  }
688
- const patternBreak = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, {
689
- expandedBreakpoints,
690
- normalizedPages,
691
- pageIds,
692
- prefer
693
- });
694
- return patternBreak > 0 ? patternBreak : windowEndPosition;
695
676
  };
696
677
  /**
697
- * Advances cursor position past any leading whitespace.
678
+ * Validates an array of patterns, returning a parallel array of issues, or `undefined` when none of the patterns has an issue.
698
679
  */
699
- const skipWhitespace = (content, startPos) => {
700
- let pos = startPos;
701
- while (pos < content.length && /\s/.test(content[pos])) pos++;
702
- return pos;
680
+ const validatePatternArray = (patterns) => {
681
+ const seenPatterns = /* @__PURE__ */ new Set();
682
+ const issues = patterns.map((p) => validatePattern(p, seenPatterns));
683
+ if (issues.every((i) => i === void 0)) return;
684
+ return issues;
703
685
  };
704
686
  /**
705
- * Processes an oversized segment by iterating through the content and
706
- * breaking it into smaller pieces that fit within maxPages constraints.
687
+ * Validates split rules for common pattern issues.
707
688
  *
708
- * Uses precomputed boundary positions for O(log n) page attribution lookups.
709
- */
710
- const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
711
- const result = [];
712
- const fullContent = segment.content;
713
- let cursorPos = 0;
714
- let currentFromIdx = fromIdx;
715
- let isFirstPiece = true;
716
- const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
717
- logger?.debug?.("[breakpoints] boundaryPositions built", {
718
- boundaryPositions,
719
- fromIdx,
720
- fullContentLength: fullContent.length,
721
- toIdx
722
- });
723
- const maxIterations = 1e4;
724
- for (let i = 0; i < maxIterations && cursorPos < fullContent.length && currentFromIdx <= toIdx; i++) {
725
- const remainingContent = fullContent.slice(cursorPos);
726
- if (!remainingContent.trim()) break;
727
- const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
728
- const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
729
- if (remainingSpan <= maxPages && !remainingHasExclusions) {
730
- const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
731
- if (finalSeg) result.push(finalSeg);
732
- break;
689
+ * Checks for:
690
+ * - Missing `{{}}` around known token names (e.g., `raqms:num` instead of `{{raqms:num}}`)
691
+ * - Unknown token names inside `{{}}` (e.g., `{{nonexistent}}`)
692
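+ * - Duplicate patterns within the same pattern array of a rule (each array is checked separately, so duplicates across different arrays are not reported)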
693
+ *
694
+ * @param rules - Array of split rules to validate
695
+ * @returns Array parallel to input with validation results (undefined if no issues)
696
+ *
697
+ * @example
698
+ * const issues = validateRules([
699
+ * { lineStartsAfter: ['raqms:num'] }, // Missing braces
700
+ * { lineStartsWith: ['{{unknown}}'] }, // Unknown token
701
+ * ]);
702
+ * // issues[0]?.lineStartsAfter?.[0]?.type === 'missing_braces'
703
+ * // issues[1]?.lineStartsWith?.[0]?.type === 'unknown_token'
704
+ */
705
+ const validateRules = (rules) => {
706
+ return rules.map((rule) => {
707
+ const result = {};
708
+ let hasIssues = false;
709
+ if ("lineStartsWith" in rule && rule.lineStartsWith) {
710
+ const issues = validatePatternArray(rule.lineStartsWith);
711
+ if (issues) {
712
+ result.lineStartsWith = issues;
713
+ hasIssues = true;
714
+ }
733
715
  }
734
- const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
735
- const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
736
- logger?.debug?.(`[breakpoints] iteration=${i}`, {
737
- currentFromIdx,
738
- cursorPos,
739
- windowEndIdx
740
- });
741
- const breakOffset = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
742
- const breakPos = cursorPos + breakOffset;
743
- const pieceContent = fullContent.slice(cursorPos, breakPos).trim();
744
- const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
745
- logger?.trace?.("[breakpoints] piece", {
746
- actualEndIdx,
747
- actualStartIdx,
748
- pieceLength: pieceContent.length
749
- });
750
- if (pieceContent) {
751
- const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
752
- if (pieceSeg) result.push(pieceSeg);
716
+ if ("lineStartsAfter" in rule && rule.lineStartsAfter) {
717
+ const issues = validatePatternArray(rule.lineStartsAfter);
718
+ if (issues) {
719
+ result.lineStartsAfter = issues;
720
+ hasIssues = true;
721
+ }
753
722
  }
754
- cursorPos = skipWhitespace(fullContent, breakPos);
755
- currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
756
- isFirstPiece = false;
723
+ if ("lineEndsWith" in rule && rule.lineEndsWith) {
724
+ const issues = validatePatternArray(rule.lineEndsWith);
725
+ if (issues) {
726
+ result.lineEndsWith = issues;
727
+ hasIssues = true;
728
+ }
729
+ }
730
+ if ("template" in rule && rule.template) {
731
+ const seenPatterns = /* @__PURE__ */ new Set();
732
+ const issue = validatePattern(rule.template, seenPatterns);
733
+ if (issue) {
734
+ result.template = issue;
735
+ hasIssues = true;
736
+ }
737
+ }
738
+ return hasIssues ? result : void 0;
739
+ });
740
+ };
741
+
742
+ //#endregion
743
+ //#region src/segmentation/replace.ts
744
+ const DEFAULT_REPLACE_FLAGS = "gu";
745
+ const normalizeReplaceFlags = (flags) => {
746
+ if (!flags) return DEFAULT_REPLACE_FLAGS;
747
+ const allowed = new Set([
748
+ "g",
749
+ "i",
750
+ "m",
751
+ "s",
752
+ "u",
753
+ "y"
754
+ ]);
755
+ const set = /* @__PURE__ */ new Set();
756
+ for (const ch of flags) {
757
+ if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
758
+ set.add(ch);
757
759
  }
758
- logger?.debug?.("[breakpoints] done", { resultCount: result.length });
759
- return result;
760
+ set.add("g");
761
+ set.add("u");
762
+ return [
763
+ "g",
764
+ "i",
765
+ "m",
766
+ "s",
767
+ "y",
768
+ "u"
769
+ ].filter((c) => set.has(c)).join("");
770
+ };
771
+ const compileReplaceRules = (rules) => {
772
+ const compiled = [];
773
+ for (const r of rules) {
774
+ if (r.pageIds && r.pageIds.length === 0) continue;
775
+ const flags = normalizeReplaceFlags(r.flags);
776
+ const re = new RegExp(r.regex, flags);
777
+ compiled.push({
778
+ pageIdSet: r.pageIds ? new Set(r.pageIds) : void 0,
779
+ re,
780
+ replacement: r.replacement
781
+ });
782
+ }
783
+ return compiled;
760
784
  };
761
785
  /**
762
- * Applies breakpoints to oversized segments.
786
+ * Applies ordered regex replacements to page content (per page).
763
787
  *
764
- * Note: This is an internal engine used by `segmentPages()`.
788
+ * - Replacement rules are applied in array order.
789
+ * - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
790
+ * - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
791
+ *
792
+ * This function is intentionally **pure**:
793
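+ * it never mutates its input. The original pages array is returned as-is when there are no applicable rules or no pages; otherwise a new array is returned in which unchanged pages keep their original object identity and changed pages are shallow copies with the new content.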
765
794
  */
766
- const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
767
- const pageIds = pages.map((p) => p.id);
768
- const pageIdToIndex = buildPageIdToIndexMap(pageIds);
769
- const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
770
- const cumulativeOffsets = buildCumulativeOffsets(pageIds, normalizedPages);
771
- const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);
772
- const result = [];
773
- logger?.info?.("Starting breakpoint processing", {
774
- maxPages,
775
- segmentCount: segments.length
776
- });
777
- logger?.debug?.("[breakpoints] inputSegments", {
778
- segmentCount: segments.length,
779
- segments: segments.map((s) => ({
780
- contentLength: s.content.length,
781
- from: s.from,
782
- to: s.to
783
- }))
784
- });
785
- for (const segment of segments) {
786
- const fromIdx = pageIdToIndex.get(segment.from) ?? -1;
787
- const toIdx = segment.to !== void 0 ? pageIdToIndex.get(segment.to) ?? fromIdx : fromIdx;
788
- const segmentSpan = (segment.to ?? segment.from) - segment.from;
789
- const hasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, fromIdx, toIdx);
790
- if (segmentSpan <= maxPages && !hasExclusions) {
791
- result.push(segment);
792
- continue;
795
+ const applyReplacements = (pages, rules) => {
796
+ if (!rules || rules.length === 0 || pages.length === 0) return pages;
797
+ const compiled = compileReplaceRules(rules);
798
+ if (compiled.length === 0) return pages;
799
+ return pages.map((p) => {
800
+ let content = p.content;
801
+ for (const rule of compiled) {
802
+ if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
803
+ content = content.replace(rule.re, rule.replacement);
793
804
  }
794
- const broken = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
795
- result.push(...broken.map((s) => {
796
- const segFromIdx = pageIdToIndex.get(s.from) ?? -1;
797
- const segToIdx = s.to !== void 0 ? pageIdToIndex.get(s.to) ?? segFromIdx : segFromIdx;
798
- if (segFromIdx >= 0 && segToIdx > segFromIdx) return {
799
- ...s,
800
- content: applyPageJoinerBetweenPages(s.content, segFromIdx, segToIdx, pageIds, normalizedPages, pageJoiner)
801
- };
802
- return s;
803
- }));
804
- }
805
- logger?.info?.("Breakpoint processing completed", { resultCount: result.length });
806
- return result;
805
+ if (content === p.content) return p;
806
+ return {
807
+ ...p,
808
+ content
809
+ };
810
+ });
807
811
  };
808
812
 
809
813
  //#endregion
810
- //#region src/segmentation/match-utils.ts
811
- /**
812
- * Utility functions for regex matching and result processing.
813
- *
814
- * These functions were extracted from `segmenter.ts` to reduce complexity
815
- * and enable independent testing. They handle match filtering, capture
816
- * extraction, and occurrence-based selection.
817
- *
818
- * @module match-utils
819
- */
820
- /**
821
- * Extracts named capture groups from a regex match.
822
- *
823
- * Only includes groups that are in the `captureNames` list and have
824
- * defined values. This filters out positional captures and ensures
825
- * only explicitly requested named captures are returned.
826
- *
827
- * @param groups - The `match.groups` object from `RegExp.exec()`
828
- * @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
829
- * @returns Object with capture name → value pairs, or `undefined` if none found
814
+ //#region src/segmentation/breakpoint-utils.ts
815
+ const WINDOW_PREFIX_LENGTHS = [
816
+ 80,
817
+ 60,
818
+ 40,
819
+ 30,
820
+ 20,
821
+ 15
822
+ ];
823
+ const JOINER_PREFIX_LENGTHS = [
824
+ 80,
825
+ 60,
826
+ 40,
827
+ 30,
828
+ 20,
829
+ 15,
830
+ 12,
831
+ 10,
832
+ 8,
833
+ 6
834
+ ];
835
+ /**
836
+ * Normalizes a breakpoint to the object form.
837
+ * Strings are converted to { pattern: str } with no constraints.
830
838
  *
831
- * @example
832
- * const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
833
- * extractNamedCaptures(match.groups, ['num'])
834
- * // → { num: '٦٦٩٦' }
839
+ * @param bp - Breakpoint as string or object
840
+ * @returns Normalized BreakpointRule object
835
841
  *
836
842
  * @example
837
- * // No matching captures
838
- * extractNamedCaptures({}, ['num'])
839
- * // → undefined
843
+ * normalizeBreakpoint('\\n\\n')
844
+ * // → { pattern: '\\n\\n' }
840
845
  *
841
- * @example
842
- * // Undefined groups
843
- * extractNamedCaptures(undefined, ['num'])
844
- * // → undefined
846
+ * normalizeBreakpoint({ pattern: '\\n', min: 10 })
847
+ * // { pattern: '\\n', min: 10 }
845
848
  */
846
- const extractNamedCaptures = (groups, captureNames) => {
847
- if (!groups || captureNames.length === 0) return;
848
- const namedCaptures = {};
849
- for (const name of captureNames) if (groups[name] !== void 0) namedCaptures[name] = groups[name];
850
- return Object.keys(namedCaptures).length > 0 ? namedCaptures : void 0;
851
- };
849
+ const normalizeBreakpoint = (bp) => typeof bp === "string" ? { pattern: bp } : bp;
852
850
  /**
853
- * Gets the last defined positional capture group from a match array.
854
- *
855
- * Used for `lineStartsAfter` patterns where the content capture (`.*`)
856
- * is always at the end of the pattern. Named captures may shift the
857
- * positional indices, so we iterate backward to find the actual content.
851
+ * Checks if a page ID is in an excluded list (single pages or ranges).
858
852
  *
859
- * @param match - RegExp exec result array
860
- * @returns The last defined capture group value, or `undefined` if none
853
+ * @param pageId - Page ID to check
854
+ * @param excludeList - List of page IDs or [from, to] ranges to exclude
855
+ * @returns True if page is excluded
861
856
  *
862
857
  * @example
863
- * // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
864
- * // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
865
- * getLastPositionalCapture(match)
866
- * // → 'content'
858
+ * isPageExcluded(5, [1, 5, 10])
859
+ * // true
867
860
  *
868
- * @example
869
- * // No captures
870
- * getLastPositionalCapture(['full match'])
871
- * // undefined
861
+ * isPageExcluded(5, [[3, 7]])
862
+ * // true
863
+ *
864
+ * isPageExcluded(5, [[10, 20]])
865
+ * // → false
872
866
  */
873
- const getLastPositionalCapture = (match) => {
874
- if (match.length <= 1) return;
875
- for (let i = match.length - 1; i >= 1; i--) if (match[i] !== void 0) return match[i];
867
+ const isPageExcluded = (pageId, excludeList) => {
868
+ if (!excludeList || excludeList.length === 0) return false;
869
+ for (const item of excludeList) if (typeof item === "number") {
870
+ if (pageId === item) return true;
871
+ } else {
872
+ const [from, to] = item;
873
+ if (pageId >= from && pageId <= to) return true;
874
+ }
875
+ return false;
876
876
  };
877
877
  /**
878
- * Filters matches to only include those within page ID constraints.
879
- *
880
- * Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
881
- * matches that occur on pages outside the allowed range or explicitly excluded.
878
+ * Checks if a page ID is within a breakpoint's min/max range and not excluded.
882
879
  *
883
- * @param matches - Array of match results to filter
884
- * @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
885
- * @param getId - Function that returns the page ID for a given offset
886
- * @returns Filtered array containing only matches within constraints
880
+ * @param pageId - Page ID to check
881
+ * @param rule - Breakpoint rule with optional min/max/exclude constraints
882
+ * @returns True if page is within valid range
887
883
  *
888
884
  * @example
889
- * const matches = [
890
- * { start: 0, end: 10 }, // Page 1
891
- * { start: 100, end: 110 }, // Page 5
892
- * { start: 200, end: 210 }, // Page 10
893
- * ];
894
- * filterByConstraints(matches, { min: 3, max: 8 }, getId)
895
- * // → [{ start: 100, end: 110 }] (only page 5 match)
885
+ * isInBreakpointRange(50, { pattern: '\\n', min: 10, max: 100 })
886
+ * // true
887
+ *
888
+ * isInBreakpointRange(5, { pattern: '\\n', min: 10 })
889
+ * // → false (below min)
896
890
  */
897
- const filterByConstraints = (matches, rule, getId) => {
898
- return matches.filter((m) => {
899
- const id = getId(m.start);
900
- if (rule.min !== void 0 && id < rule.min) return false;
901
- if (rule.max !== void 0 && id > rule.max) return false;
902
- if (isPageExcluded(id, rule.exclude)) return false;
903
- return true;
904
- });
891
+ const isInBreakpointRange = (pageId, rule) => {
892
+ if (rule.min !== void 0 && pageId < rule.min) return false;
893
+ if (rule.max !== void 0 && pageId > rule.max) return false;
894
+ return !isPageExcluded(pageId, rule.exclude);
905
895
  };
906
896
  /**
907
- * Checks if any rule in the list allows the given page ID.
908
- *
909
- * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
910
- * Rules without constraints allow all page IDs.
897
+ * Builds an exclude set from a PageRange array for O(1) lookups.
911
898
  *
912
- * This is used to determine whether to create a segment for content
913
- * that appears before any split points (the "first segment").
899
+ * @param excludeList - List of page IDs or [from, to] ranges
900
+ * @returns Set of all excluded page IDs
914
901
  *
915
- * @param rules - Array of rules with optional `min` and `max` constraints
916
- * @param pageId - Page ID to check
917
- * @returns `true` if at least one rule allows the page ID
902
+ * @remarks
903
+ * This expands ranges into explicit page IDs for fast membership checks. For typical
904
+ * book-scale inputs (thousands of pages), this is small and keeps downstream logic
905
+ * simple and fast. If you expect extremely large ranges (e.g., millions of pages),
906
+ * consider avoiding broad excludes or introducing a range-based membership structure.
918
907
  *
919
908
  * @example
920
- * const rules = [
921
- * { min: 5, max: 10 }, // Allows pages 5-10
922
- * { min: 20 }, // Allows pages 20+
923
- * ];
909
+ * buildExcludeSet([1, 5, [10, 12]])
910
+ * // → Set { 1, 5, 10, 11, 12 }
911
+ */
912
+ const buildExcludeSet = (excludeList) => {
913
+ const excludeSet = /* @__PURE__ */ new Set();
914
+ for (const item of excludeList || []) if (typeof item === "number") excludeSet.add(item);
915
+ else for (let i = item[0]; i <= item[1]; i++) excludeSet.add(i);
916
+ return excludeSet;
917
+ };
918
+ /**
919
+ * Creates a segment with optional to and meta fields.
920
+ * Returns null if content is empty after trimming.
924
921
  *
925
- * anyRuleAllowsId(rules, 7) // true (first rule allows)
926
- * anyRuleAllowsId(rules, 3) // false (no rule allows)
927
- * anyRuleAllowsId(rules, 25) // true (second rule allows)
922
+ * @param content - Segment content
923
+ * @param fromPageId - Starting page ID
924
+ * @param toPageId - Optional ending page ID (omitted if same as from)
925
+ * @param meta - Optional metadata to attach
926
+ * @returns Segment object or null if empty
928
927
  *
929
928
  * @example
930
- * // Rules without constraints allow everything
931
- * anyRuleAllowsId([{}], 999) // true
929
+ * createSegment('Hello world', 1, 3, { chapter: 1 })
930
+ * // → { content: 'Hello world', from: 1, to: 3, meta: { chapter: 1 } }
931
+ *
932
+ * createSegment(' ', 1, undefined, undefined)
933
+ * // → null (empty content)
932
934
  */
933
- const anyRuleAllowsId = (rules, pageId) => {
934
- return rules.some((r) => {
935
- const minOk = r.min === void 0 || pageId >= r.min;
936
- const maxOk = r.max === void 0 || pageId <= r.max;
937
- return minOk && maxOk;
938
- });
939
- };
940
-
941
- //#endregion
942
- //#region src/segmentation/replace.ts
943
- const DEFAULT_REPLACE_FLAGS = "gu";
944
- const normalizeReplaceFlags = (flags) => {
945
- if (!flags) return DEFAULT_REPLACE_FLAGS;
946
- const allowed = new Set([
947
- "g",
948
- "i",
949
- "m",
950
- "s",
951
- "u",
952
- "y"
953
- ]);
954
- const set = /* @__PURE__ */ new Set();
955
- for (const ch of flags) {
956
- if (!allowed.has(ch)) throw new Error(`Invalid replace regex flag: "${ch}" (allowed: gimsyu)`);
957
- set.add(ch);
958
- }
959
- set.add("g");
960
- set.add("u");
961
- return [
962
- "g",
963
- "i",
964
- "m",
965
- "s",
966
- "y",
967
- "u"
968
- ].filter((c) => set.has(c)).join("");
969
- };
970
- const compileReplaceRules = (rules) => {
971
- const compiled = [];
972
- for (const r of rules) {
973
- if (r.pageIds && r.pageIds.length === 0) continue;
974
- const flags = normalizeReplaceFlags(r.flags);
975
- const re = new RegExp(r.regex, flags);
976
- compiled.push({
977
- pageIdSet: r.pageIds ? new Set(r.pageIds) : void 0,
978
- re,
979
- replacement: r.replacement
980
- });
981
- }
982
- return compiled;
935
/**
 * Builds a segment object from trimmed content.
 *
 * @param content - Raw segment content (trimmed before use)
 * @param fromPageId - Page ID where the segment starts
 * @param toPageId - Page ID where the segment ends; omitted from the result when
 * undefined or equal to `fromPageId`
 * @param meta - Metadata to attach; omitted from the result when falsy
 * @returns The segment, or null when the content is empty after trimming
 */
const createSegment = (content, fromPageId, toPageId, meta) => {
    const text = content.trim();
    if (text.length === 0) {
        return null;
    }
    const spansPages = toPageId !== undefined && toPageId !== fromPageId;
    return {
        content: text,
        from: fromPageId,
        ...(spansPages ? { to: toPageId } : {}),
        ...(meta ? { meta } : {}),
    };
};
984
946
  /**
985
- * Applies ordered regex replacements to page content (per page).
947
+ * Expands breakpoint patterns and pre-computes exclude sets.
986
948
  *
987
- * - Replacement rules are applied in array order.
988
- * - Each rule is applied globally (flag `g` enforced) with unicode mode (flag `u` enforced).
989
- * - `pageIds` can scope a rule to specific pages. `pageIds: []` skips the rule entirely.
949
+ * @param breakpoints - Array of breakpoint patterns or rules
950
+ * @param processPattern - Function to expand tokens in patterns
951
+ * @returns Array of expanded breakpoints with compiled regexes
990
952
  *
991
- * This function is intentionally **pure**:
992
- * it returns a new pages array only when changes are needed, otherwise it returns the original pages.
953
+ * @remarks
954
+ * This function compiles regex patterns dynamically. This can be a ReDoS vector
955
+ * if patterns come from untrusted sources. In typical usage, breakpoint rules
956
+ * are application configuration, not user input.
993
957
  */
994
- const applyReplacements = (pages, rules) => {
995
- if (!rules || rules.length === 0 || pages.length === 0) return pages;
996
- const compiled = compileReplaceRules(rules);
997
- if (compiled.length === 0) return pages;
998
- return pages.map((p) => {
999
- let content = p.content;
1000
- for (const rule of compiled) {
1001
- if (rule.pageIdSet && !rule.pageIdSet.has(p.id)) continue;
1002
- content = content.replace(rule.re, rule.replacement);
1003
- }
1004
- if (content === p.content) return p;
958
/**
 * Normalizes each breakpoint, expands its patterns, and compiles them to regexes.
 *
 * @param breakpoints - Breakpoint patterns (strings) or rule objects
 * @param processPattern$1 - Expands a pattern into its final regex source
 * @returns One entry per breakpoint: `{ excludeSet, regex, rule, skipWhenRegex }`.
 * `regex` is null for an empty pattern; `skipWhenRegex` is null when the rule has
 * no `skipWhen`.
 * @throws {Error} When an expanded pattern is not a valid regular expression
 */
const expandBreakpoints = (breakpoints, processPattern$1) => {
    // Compiles `source`, reporting failures against the original (unexpanded) pattern text.
    const compile = (source, flags, label, original) => {
        try {
            return new RegExp(source, flags);
        } catch (error) {
            const message = error instanceof Error ? error.message : String(error);
            throw new Error(`${label}: ${original}\n Cause: ${message}`);
        }
    };
    return breakpoints.map((bp) => {
        const rule = normalizeBreakpoint(bp);
        const excludeSet = buildExcludeSet(rule.exclude);
        let skipWhenRegex = null;
        if (rule.skipWhen !== undefined) {
            // skipWhen is only ever tested, never iterated, so no "g" flag.
            skipWhenRegex = compile(processPattern$1(rule.skipWhen), "mu", "Invalid breakpoint skipWhen regex", rule.skipWhen);
        }
        // An empty pattern is represented by a null regex.
        const regex = rule.pattern === ""
            ? null
            : compile(processPattern$1(rule.pattern), "gmu", "Invalid breakpoint regex", rule.pattern);
        return { excludeSet, regex, rule, skipWhenRegex };
    });
};
1014
989
  /**
1015
- * Token-based template system for Arabic text pattern matching.
990
+ * Applies a configured joiner at detected page boundaries within a multi-page content chunk.
1016
991
  *
1017
- * This module provides a human-readable way to define regex patterns using
1018
- * `{{token}}` placeholders that expand to their regex equivalents. It supports
1019
- * named capture groups for extracting matched values into metadata.
992
+ * This is used for breakpoint-generated segments which don't have access to the original
993
+ * `pageMap.pageBreaks` offsets. We detect page starts sequentially by searching for each page's
994
+ * prefix after the previous boundary, then replace ONLY the single newline immediately before
995
+ * that page start.
1020
996
  *
1021
- * @module tokens
997
+ * This avoids converting real in-page newlines, while still normalizing page joins consistently.
998
+ */
999
/**
 * Replaces the single newline that precedes each detected page start with a space.
 *
 * @param content - Content spanning pages `fromIdx`..`toIdx`
 * @param fromIdx - Index of the first page in the content
 * @param toIdx - Index of the last page in the content
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @param joiner - Join mode; "newline" leaves the content untouched
 * @returns Content with page-boundary newlines replaced, or the input unchanged
 */
const applyPageJoinerBetweenPages = (content, fromIdx, toIdx, pageIds, normalizedPages, joiner) => {
    const nothingToJoin = joiner === "newline" || fromIdx >= toIdx || !content.includes("\n");
    if (nothingToJoin) {
        return content;
    }
    let result = content;
    let cursor = 0;
    for (let pageIdx = fromIdx + 1; pageIdx <= toIdx; pageIdx++) {
        const page = normalizedPages.get(pageIds[pageIdx]);
        if (!page) {
            continue;
        }
        const pageStart = findPrefixPositionInContent(result, page.content.trimStart(), cursor);
        if (!(pageStart > 0)) {
            continue;
        }
        // Swapping one character for another keeps every later offset valid.
        if (result[pageStart - 1] === "\n") {
            result = `${result.slice(0, pageStart - 1)} ${result.slice(pageStart)}`;
        }
        // Later pages are searched for only after this page's start.
        cursor = pageStart;
    }
    return result;
};
1012
/**
 * Locates a page's leading text inside `content`, trying each length in
 * `JOINER_PREFIX_LENGTHS` in order until one is found.
 *
 * @param content - Content to search in
 * @param trimmedPageContent - Page content with leading whitespace removed
 * @param searchFrom - Index in `content` at which the search starts
 * @returns Position of the first prefix found at an index greater than 0, or -1
 */
const findPrefixPositionInContent = (content, trimmedPageContent, searchFrom) => {
    for (const prefixLength of JOINER_PREFIX_LENGTHS) {
        const sliceEnd = Math.min(prefixLength, trimmedPageContent.length);
        const needle = trimmedPageContent.slice(0, sliceEnd).trim();
        if (needle) {
            const at = content.indexOf(needle, searchFrom);
            // A hit at index 0 is not accepted as a page start.
            if (at > 0) {
                return at;
            }
        }
    }
    return -1;
};
1024
+ /**
1025
+ * Estimates how far into the current page `remainingContent` begins.
1022
1026
  *
1023
- * @example
1024
- * // Simple token expansion
1025
- * expandTokens('{{raqms}} {{dash}}')
1026
- * // '[\\u0660-\\u0669]+ [-–—ـ]'
1027
+ * During breakpoint processing, `remainingContent` can begin mid-page after a previous split.
1028
+ * When that happens, raw cumulative page offsets (computed from full page starts) can overestimate
1029
+ * expected boundary positions. This helper computes an approximate starting offset by matching
1030
+ * a short prefix of `remainingContent` inside the current page content.
1031
+ */
1032
/**
 * Estimates how far into the current page `remainingContent` begins by locating
 * its first 30 characters (after leading whitespace) inside that page.
 *
 * @param remainingContent - Content still to be split
 * @param currentFromIdx - Index of the current page in `pageIds`
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @returns Offset within the page, or 0 when the page or the match is missing
 */
const estimateStartOffsetInCurrentPage = (remainingContent, currentFromIdx, pageIds, normalizedPages) => {
    const page = normalizedPages.get(pageIds[currentFromIdx]);
    if (!page) {
        return 0;
    }
    const needle = remainingContent.trimStart().slice(0, 30);
    if (needle.length === 0) {
        return 0;
    }
    // indexOf yields -1 when absent; both "absent" and "at start" map to 0.
    return Math.max(0, page.content.indexOf(needle));
};
1041
+ /**
1042
+ * Attempts to find the start position of a target page within remainingContent,
1043
+ * anchored near an expected boundary position to reduce collisions.
1027
1044
  *
1028
- * @example
1029
- * // Named capture groups
1030
- * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
1031
- * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
1045
+ * This is used to define breakpoint windows in terms of actual content being split, rather than
1046
+ * raw per-page offsets which can desync when structural rules strip markers.
1032
1047
  */
1048
/**
 * Searches for the start of a target page near an expected position in the content.
 *
 * For each length in `WINDOW_PREFIX_LENGTHS`, the page's leading text is looked up
 * inside a window around the expected boundary. A hit preceded by whitespace wins;
 * otherwise the last occurrence at or before the expected boundary is used.
 *
 * @param remainingContent - Content to search in
 * @param _currentFromIdx - Unused
 * @param targetPageIdx - Index in `pageIds` of the page whose start is wanted
 * @param expectedBoundary - Estimated position of that page's start
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @returns Position of the page start, or -1 when it cannot be located
 */
const findPageStartNearExpectedBoundary = (remainingContent, _currentFromIdx, targetPageIdx, expectedBoundary, pageIds, normalizedPages) => {
    const targetPage = normalizedPages.get(pageIds[targetPageIdx]);
    if (!targetPage) {
        return -1;
    }
    const contentLength = remainingContent.length;
    const anchor = Math.min(Math.max(0, expectedBoundary), contentLength);
    // The window reaches further back (10000) than forward (2000) from the anchor.
    const windowStart = Math.max(0, anchor - 10000);
    const windowEnd = Math.min(contentLength, anchor + 2000);
    const pageText = targetPage.content.trimStart();
    for (const prefixLength of WINDOW_PREFIX_LENGTHS) {
        const needle = pageText.slice(0, Math.min(prefixLength, pageText.length)).trim();
        if (!needle) {
            continue;
        }
        for (
            let at = remainingContent.indexOf(needle, windowStart);
            at !== -1 && at <= windowEnd;
            at = remainingContent.indexOf(needle, at + 1)
        ) {
            if (at > 0 && /\s/.test(remainingContent[at - 1] ?? "")) {
                return at;
            }
        }
        const fallback = remainingContent.lastIndexOf(needle, anchor);
        if (fallback > 0) {
            return fallback;
        }
    }
    return -1;
};
1033
1068
  /**
1034
- * Token definitions mapping human-readable token names to regex patterns.
1069
+ * Builds a boundary position map for pages within the given range.
1035
1070
  *
1036
- * Tokens are used in template strings with double-brace syntax:
1037
- * - `{{token}}` - Expands to the pattern (non-capturing in context)
1038
- * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
1039
- * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
1071
+ * This function computes page boundaries once per segment and enables
1072
+ * O(log n) page lookups via binary search with `findPageIndexForPosition`.
1040
1073
  *
1041
- * @remarks
1042
- * These patterns are designed for Arabic text matching. For diacritic-insensitive
1043
- * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
1044
- * which applies `makeDiacriticInsensitive()` to the expanded patterns.
1074
+ * Boundaries are derived from segmentContent (post-structural-rules).
1075
+ * When the segment starts mid-page, an offset correction is applied to
1076
+ * keep boundary estimates aligned with the segment's actual content space.
1045
1077
  *
1046
- * @example
1047
- * // Using tokens in a split rule
1048
- * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
1078
+ * @param segmentContent - Full segment content (already processed by structural rules)
1079
+ * @param fromIdx - Starting page index
1080
+ * @param toIdx - Ending page index
1081
+ * @param pageIds - Array of all page IDs
1082
+ * @param normalizedPages - Map of page ID to normalized content
1083
+ * @param cumulativeOffsets - Cumulative character offsets (for estimates)
1084
+ * @returns Array where boundaryPositions[i] = start position of page (fromIdx + i),
1085
+ * with a sentinel boundary at segmentContent.length as the last element
1049
1086
  *
1050
1087
  * @example
1051
- * // Using tokens with named captures
1052
- * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
1088
+ * // For a 3-page segment:
1089
+ * buildBoundaryPositions(content, 0, 2, pageIds, normalizedPages, offsets)
1090
+ * // → [0, 23, 45, 67] where 67 is content.length (sentinel)
1053
1091
  */
1092
/**
 * Computes the start position of every page inside a segment's content.
 *
 * @param segmentContent - Full segment content
 * @param fromIdx - Index of the segment's first page
 * @param toIdx - Index of the segment's last page
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @param cumulativeOffsets - Cumulative character offsets used for estimates
 * @returns Array where entry i is the start of page (fromIdx + i), followed by a
 * final sentinel equal to `segmentContent.length`
 */
const buildBoundaryPositions = (segmentContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
    const contentLength = segmentContent.length;
    const offsetInFirstPage = estimateStartOffsetInCurrentPage(segmentContent, fromIdx, pageIds, normalizedPages);
    const positions = [0];
    let previous = 0;
    for (let pageIdx = fromIdx + 1; pageIdx <= toIdx; pageIdx++) {
        const haveOffsets = cumulativeOffsets[pageIdx] !== undefined && cumulativeOffsets[fromIdx] !== undefined;
        const expected = haveOffsets
            ? Math.max(0, cumulativeOffsets[pageIdx] - cumulativeOffsets[fromIdx] - offsetInFirstPage)
            : contentLength;
        const located = findPageStartNearExpectedBoundary(segmentContent, fromIdx, pageIdx, expected, pageIds, normalizedPages);
        const trusted = located > 0 && located > previous && Math.abs(located - expected) < 2000;
        // Untrusted hits fall back to the estimate, kept strictly increasing and in bounds.
        const boundary = trusted
            ? located
            : Math.min(Math.max(previous + 1, expected), contentLength);
        positions.push(boundary);
        previous = boundary;
    }
    positions.push(contentLength);
    return positions;
};
1054
1108
  /**
1055
- * Escapes regex metacharacters (parentheses and brackets) in template patterns,
1056
- * but preserves content inside `{{...}}` token delimiters.
1057
- *
1058
- * This allows users to write intuitive patterns like `({{harf}}):` instead of
1059
- * the verbose `\\({{harf}}\\):`. The escaping is applied BEFORE token expansion,
1060
- * so tokens like `{{harf}}` which expand to `[أ-ي]` work correctly.
1109
+ * Binary search to find which page a position falls within.
1110
+ * Uses "largest i where boundaryPositions[i] <= position" semantics.
1061
1111
  *
1062
- * @param pattern - Template pattern that may contain `()[]` and `{{tokens}}`
1063
- * @returns Pattern with `()[]` escaped outside of `{{...}}` delimiters
1112
+ * @param position - Character position in segmentContent
1113
+ * @param boundaryPositions - Precomputed boundary positions (from buildBoundaryPositions)
1114
+ * @param fromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[fromIdx])
1115
+ * @returns Page index in pageIds array
1064
1116
  *
1065
1117
  * @example
1066
- * escapeTemplateBrackets('({{harf}}): ')
1067
- * // → '\\({{harf}}\\): '
1118
+ * // With boundaries [0, 20, 40, 60] and fromIdx=0:
1119
+ * findPageIndexForPosition(15, boundaries, 0) // → 0 (first page)
1120
+ * findPageIndexForPosition(25, boundaries, 0) // → 1 (second page)
1121
+ * findPageIndexForPosition(40, boundaries, 0) // → 2 (exactly on boundary = that page)
1122
+ */
1123
/**
 * Binary-searches the boundary list for the page containing `position`.
 *
 * @param position - Character position in the segment content
 * @param boundaryPositions - Page start positions; the final entry is a sentinel
 * and is never returned as a page
 * @param fromIdx - Page index that `boundaryPositions[0]` corresponds to
 * @returns `fromIdx` plus the largest i with `boundaryPositions[i] <= position`
 */
const findPageIndexForPosition = (position, boundaryPositions, fromIdx) => {
    const count = boundaryPositions.length;
    if (count <= 1) {
        return fromIdx;
    }
    let lo = 0;
    let hi = count - 2; // skip the trailing sentinel
    while (lo < hi) {
        // Upper midpoint, so `lo = mid` always makes progress.
        const mid = lo + ((hi - lo + 1) >> 1);
        if (boundaryPositions[mid] > position) {
            hi = mid - 1;
        } else {
            lo = mid;
        }
    }
    return fromIdx + lo;
};
1134
+ /**
1135
+ * Finds the end position of a breakpoint window inside `remainingContent`.
1068
1136
  *
1069
- * @example
1070
- * escapeTemplateBrackets('[{{raqm}}] ')
1071
- * // '\\[{{raqm}}\\] '
1137
+ * The window end is defined as the start of the page AFTER `windowEndIdx` (i.e. `windowEndIdx + 1`),
1138
+ * found within the actual `remainingContent` string being split. This avoids relying on raw page offsets
1139
+ * that can diverge when structural rules strip markers (e.g. `lineStartsAfter`).
1140
+ */
1141
/**
 * Finds where the breakpoint window ends inside `remainingContent`.
 *
 * Tries the page after `windowEndIdx` first, then walks back one page at a time
 * towards `currentFromIdx + 1` until a page start is located.
 *
 * @param remainingContent - Content still to be split
 * @param currentFromIdx - Index of the page the content starts on
 * @param windowEndIdx - Index of the last page allowed in the window
 * @param toIdx - Index of the last page of the segment
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @param cumulativeOffsets - Cumulative character offsets used for estimates
 * @returns Window end position, or `remainingContent.length` when the window
 * reaches the last page or no page start is found
 */
const findBreakpointWindowEndPosition = (remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets) => {
    if (windowEndIdx >= toIdx) {
        return remainingContent.length;
    }
    const firstCandidate = Math.min(windowEndIdx + 1, toIdx);
    const lastCandidate = currentFromIdx + 1;
    const offsetInCurrentPage = estimateStartOffsetInCurrentPage(remainingContent, currentFromIdx, pageIds, normalizedPages);
    for (let candidate = firstCandidate; candidate >= lastCandidate; candidate -= 1) {
        const haveOffsets = cumulativeOffsets[candidate] !== undefined && cumulativeOffsets[currentFromIdx] !== undefined;
        const expected = haveOffsets
            ? Math.max(0, cumulativeOffsets[candidate] - cumulativeOffsets[currentFromIdx] - offsetInCurrentPage)
            : remainingContent.length;
        const located = findPageStartNearExpectedBoundary(remainingContent, currentFromIdx, candidate, expected, pageIds, normalizedPages);
        if (located > 0) {
            return located;
        }
    }
    return remainingContent.length;
};
1154
+ /**
1155
+ * Finds exclusion-based break position using raw cumulative offsets.
1072
1156
  *
1073
- * @example
1074
- * escapeTemplateBrackets('{{harf}}')
1075
- * // → '{{harf}}' (unchanged - no brackets outside tokens)
1157
+ * This is used to ensure pages excluded by breakpoints are never merged into the same output segment.
1158
+ * Returns a break position relative to the start of `remainingContent` (i.e. the currentFromIdx start).
1076
1159
  */
1077
- const escapeTemplateBrackets = (pattern) => {
1078
- return pattern.replace(/(\{\{[^}]*\}\})|([()[\]])/g, (_match, token, bracket) => {
1079
- if (token) return token;
1080
- return `\\${bracket}`;
1081
- });
1082
- };
1083
- const RUMUZ_ATOM = `(?:${[
1084
- "خت",
1085
- "خغ",
1086
- "بخ",
1087
- "عخ",
1088
- "مق",
1089
- "مت",
1090
- "عس",
1091
- "سي",
1092
- "سن",
1093
- "كن",
1094
- "مد",
1095
- "قد",
1096
- "خد",
1097
- "فد",
1098
- "دل",
1099
- "كد",
1100
- "غد",
1101
- "صد",
1102
- "دت",
1103
- "دس",
1104
- "تم",
1105
- "فق",
1106
- "دق",
1107
- "[خرزيمنصسدفلتقع]",
1108
- "(?<![\\u0660-\\u0669])٤(?![\\u0660-\\u0669])"
1109
- ].join("|")})`;
1110
- const RUMUZ_BLOCK = `${RUMUZ_ATOM}(?:\\s+${RUMUZ_ATOM})*`;
1111
- const BASE_TOKENS = {
1112
- bab: "باب",
1113
- basmalah: ["بسم الله", "﷽"].join("|"),
1114
- bullet: "[•*°]",
1115
- dash: "[-–—ـ]",
1116
- fasl: ["مسألة", "فصل"].join("|"),
1117
- harf: "[أ-ي]",
1118
- harfs: "[أ-ي](?:\\s+[أ-ي])*",
1119
- kitab: "كتاب",
1120
- naql: [
1121
- "حدثني",
1122
- "وأخبرنا",
1123
- "حدثنا",
1124
- "سمعت",
1125
- "أنبأنا",
1126
- "وحدثنا",
1127
- "أخبرنا",
1128
- "وحدثني",
1129
- "وحدثنيه"
1130
- ].join("|"),
1131
- raqm: "[\\u0660-\\u0669]",
1132
- raqms: "[\\u0660-\\u0669]+",
1133
- rumuz: RUMUZ_BLOCK,
1134
- tarqim: "[.!?؟؛]"
1160
/**
 * Finds a break position that separates excluded pages, using cumulative offsets.
 *
 * @param currentFromIdx - Index of the page the remaining content starts on
 * @param windowEndIdx - Index of the last page in the current window
 * @param toIdx - Index of the last page of the segment
 * @param pageIds - Array of all page IDs
 * @param expandedBreakpoints - Breakpoints, each carrying an `excludeSet`
 * @param cumulativeOffsets - Cumulative character offsets per page index
 * @returns Offset relative to the start of page `currentFromIdx`, or -1 when no
 * page in the window is excluded
 */
const findExclusionBreakPosition = (currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets) => {
    const isExcluded = (pageId) => expandedBreakpoints.some((bp) => bp.excludeSet.has(pageId));
    const offsetFromStart = (pageIdx) => cumulativeOffsets[pageIdx] - cumulativeOffsets[currentFromIdx];
    // An excluded starting page is cut off right where the next page begins.
    if (isExcluded(pageIds[currentFromIdx]) && currentFromIdx < toIdx) {
        return offsetFromStart(currentFromIdx + 1);
    }
    // Otherwise break just before the first excluded page inside the window.
    let pageIdx = currentFromIdx + 1;
    while (pageIdx <= windowEndIdx) {
        if (isExcluded(pageIds[pageIdx])) {
            return offsetFromStart(pageIdx);
        }
        pageIdx += 1;
    }
    return -1;
};
1136
1169
  /**
1137
- * Composite token definitions using template syntax.
1138
- *
1139
- * These tokens reference base tokens using `{{token}}` syntax and are
1140
- * automatically expanded to their final regex patterns at module load time.
1141
- *
1142
- * This provides better abstraction - if base tokens change, composites
1143
- * automatically update on the next build.
1170
+ * Checks if any page in a range is excluded by the given exclude set.
1144
1171
  *
1145
- * @internal
1172
+ * @param excludeSet - Set of excluded page IDs
1173
+ * @param pageIds - Array of page IDs
1174
+ * @param fromIdx - Start index (inclusive)
1175
+ * @param toIdx - End index (inclusive)
1176
+ * @returns True if any page in range is excluded
1146
1177
  */
1147
- const COMPOSITE_TOKENS = { numbered: "{{raqms}} {{dash}} " };
1178
/**
 * Reports whether any page between two indices (inclusive) is in the exclude set.
 *
 * @param excludeSet - Set of excluded page IDs
 * @param pageIds - Array of page IDs
 * @param fromIdx - Start index (inclusive)
 * @param toIdx - End index (inclusive)
 * @returns True if at least one page in the range is excluded
 */
const hasExcludedPageInRange = (excludeSet, pageIds, fromIdx, toIdx) => {
    if (excludeSet.size === 0) {
        return false;
    }
    let cursor = fromIdx;
    while (cursor <= toIdx) {
        if (excludeSet.has(pageIds[cursor])) {
            return true;
        }
        cursor += 1;
    }
    return false;
};
1148
1183
  /**
1149
- * Expands any *composite* tokens (like `{{numbered}}`) into their underlying template form
1150
- * (like `{{raqms}} {{dash}} `).
1151
- *
1152
- * This is useful when you want to take a signature produced by `analyzeCommonLineStarts()`
1153
- * and turn it into an editable template where you can add named captures, e.g.:
1184
+ * Finds the position of the next page content within remaining content.
1185
+ * Returns -1 if not found.
1154
1186
  *
1155
- * - `{{numbered}}` `{{raqms}} {{dash}} `
1156
- * - then: `{{raqms:num}} {{dash}} ` to capture the number
1187
+ * @param remainingContent - Content to search in
1188
+ * @param nextPageData - Normalized data for the next page
1189
+ * @returns Position of next page content, or -1 if not found
1190
+ */
1191
/**
 * Locates the beginning of the next page's content inside the remaining content.
 *
 * @param remainingContent - Content to search in
 * @param nextPageData - Normalized data for the next page (`content`, `length`)
 * @returns Position of the page's first (up to) 30 trimmed characters, or -1 when
 * the prefix is empty, absent, or found at index 0
 */
const findNextPagePosition = (remainingContent, nextPageData) => {
    const { content, length } = nextPageData;
    const needle = content.trim().slice(0, Math.min(30, length));
    if (needle.length === 0) {
        return -1;
    }
    const at = remainingContent.indexOf(needle);
    if (at > 0) {
        return at;
    }
    return -1;
};
1197
+ /**
1198
+ * Finds matches within a window and returns the selected position based on preference.
1157
1199
  *
1158
- * Notes:
1159
- * - This only expands the plain `{{token}}` form (not `{{token:name}}`).
1160
- * - Expansion is repeated a few times to support nested composites.
1200
+ * @param windowContent - Content to search
1201
+ * @param regex - Regex to match
1202
+ * @param prefer - 'longer' for last match, 'shorter' for first match
1203
+ * @returns Break position after the selected match, or -1 if no matches
1161
1204
  */
1162
- const expandCompositeTokensInTemplate = (template) => {
1163
- let out = template;
1164
- for (let i = 0; i < 10; i++) {
1165
- const next = out.replace(/\{\{(\w+)\}\}/g, (m, tokenName) => {
1166
- return COMPOSITE_TOKENS[tokenName] ?? m;
1167
- });
1168
- if (next === out) break;
1169
- out = next;
1205
/**
 * Picks a break position from the regex matches found in the window.
 *
 * @param windowContent - Content to search
 * @param regex - Global regex to match
 * @param prefer - "longer" selects the last match; any other value the first
 * @returns Position just after the selected match, or -1 when nothing matches
 */
const findPatternBreakPosition = (windowContent, regex, prefer) => {
    const matches = Array.from(windowContent.matchAll(regex));
    if (matches.length === 0) {
        return -1;
    }
    const chosen = prefer === "longer" ? matches[matches.length - 1] : matches[0];
    return chosen.index + chosen[0].length;
};
1173
1220
  /**
1174
- * Expands base tokens in a template string.
1175
- * Used internally to pre-expand composite tokens.
1176
- *
1177
- * @param template - Template string with `{{token}}` placeholders
1178
- * @returns Expanded pattern with base tokens replaced
1179
- * @internal
1221
+ * Handles page boundary breakpoint (empty pattern).
1222
+ * Returns break position or -1 if no valid position found.
1180
1223
  */
1181
- const expandBaseTokens = (template) => {
1182
- return template.replace(/\{\{(\w+)\}\}/g, (_, tokenName) => {
1183
- return BASE_TOKENS[tokenName] ?? `{{${tokenName}}}`;
1184
- });
1224
/**
 * Resolves the break position for a page-boundary breakpoint (empty pattern).
 *
 * @param remainingContent - Content still to be split
 * @param windowEndIdx - Index of the last page in the current window
 * @param windowEndPosition - Position where the window ends in the content
 * @param toIdx - Index of the last page of the segment
 * @param pageIds - Array of all page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @returns The located start of the next page when found, capped at the window
 * end and the content length; otherwise that cap itself
 */
const handlePageBoundaryBreak = (remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages) => {
    const cap = Math.min(windowEndPosition, remainingContent.length);
    const followingIdx = windowEndIdx + 1;
    if (followingIdx > toIdx) {
        return cap;
    }
    const followingPage = normalizedPages.get(pageIds[followingIdx]);
    if (!followingPage) {
        return cap;
    }
    const located = findNextPagePosition(remainingContent, followingPage);
    return located > 0 ? Math.min(located, cap) : cap;
};
1186
1235
  /**
1187
- * Token definitions mapping human-readable token names to regex patterns.
1188
- *
1189
- * Tokens are used in template strings with double-brace syntax:
1190
- * - `{{token}}` - Expands to the pattern (non-capturing in context)
1191
- * - `{{token:name}}` - Expands to a named capture group `(?<name>pattern)`
1192
- * - `{{:name}}` - Captures any content with the given name `(?<name>.+)`
1193
- *
1194
- * @remarks
1195
- * These patterns are designed for Arabic text matching. For diacritic-insensitive
1196
- * matching of Arabic patterns, use the `fuzzy: true` option in split rules,
1197
- * which applies `makeDiacriticInsensitive()` to the expanded patterns.
1198
- *
1199
- * @example
1200
- * // Using tokens in a split rule
1201
- * { lineStartsWith: ['{{kitab}}', '{{bab}}'], split: 'at', fuzzy: true }
1202
- *
1203
- * @example
1204
- * // Using tokens with named captures
1205
- * { lineStartsAfter: ['{{raqms:hadithNum}} {{dash}} '], split: 'at' }
1236
+ * Tries to find a break position within the current window using breakpoint patterns.
1237
+ * Returns the break position or -1 if no suitable break was found.
1206
1238
  *
1207
- * @example
1208
- * // Using the numbered convenience token
1209
- * { lineStartsAfter: ['{{numbered}}'], split: 'at' }
1239
+ * @param remainingContent - Content remaining to be segmented
1240
+ * @param currentFromIdx - Current starting page index
1241
+ * @param toIdx - Ending page index
1242
+ * @param windowEndIdx - Maximum window end index
1243
+ * @param ctx - Breakpoint context with page data and patterns
1244
+ * @returns Break position in the content, or -1 if no break found
1210
1245
  */
1211
- const TOKEN_PATTERNS = {
1212
- ...BASE_TOKENS,
1213
- ...Object.fromEntries(Object.entries(COMPOSITE_TOKENS).map(([k, v]) => [k, expandBaseTokens(v)]))
1246
/**
 * Walks the breakpoints in order and returns the first usable break position.
 *
 * A breakpoint is skipped when the starting page is outside its range, when any
 * page in the window is in its exclude set, or when its `skipWhen` regex matches
 * the remaining content.
 *
 * @param remainingContent - Content still to be split
 * @param currentFromIdx - Index of the page the content starts on
 * @param toIdx - Index of the last page of the segment
 * @param windowEndIdx - Index of the last page in the current window
 * @param windowEndPosition - Position where the window ends in the content
 * @param ctx - Context providing `pageIds`, `normalizedPages`,
 * `expandedBreakpoints` and `prefer`
 * @returns Break position in the content, or -1 if no breakpoint yields one
 */
const findBreakPosition = (remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, ctx) => {
    const { pageIds, normalizedPages, expandedBreakpoints, prefer } = ctx;
    for (const breakpoint of expandedBreakpoints) {
        const applies = isInBreakpointRange(pageIds[currentFromIdx], breakpoint.rule)
            && !hasExcludedPageInRange(breakpoint.excludeSet, pageIds, currentFromIdx, windowEndIdx)
            && !breakpoint.skipWhenRegex?.test(remainingContent);
        if (!applies) {
            continue;
        }
        // A null regex marks the page-boundary breakpoint, which always resolves.
        if (breakpoint.regex === null) {
            return handlePageBoundaryBreak(remainingContent, windowEndIdx, windowEndPosition, toIdx, pageIds, normalizedPages);
        }
        const windowLimit = Math.min(windowEndPosition, remainingContent.length);
        const position = findPatternBreakPosition(remainingContent.slice(0, windowLimit), breakpoint.regex, prefer);
        if (position > 0) {
            return position;
        }
    }
    return -1;
};
1258
+
1259
+ //#endregion
1260
+ //#region src/segmentation/breakpoint-processor.ts
1215
1261
  /**
1216
- * Regex pattern for matching tokens with optional named capture syntax.
1217
- *
1218
- * Matches:
1219
- * - `{{token}}` - Simple token (group 1 = token name, group 2 = empty)
1220
- * - `{{token:name}}` - Token with capture (group 1 = token, group 2 = name)
1221
- * - `{{:name}}` - Capture-only (group 1 = empty, group 2 = name)
1262
+ * Breakpoint post-processing engine extracted from segmenter.ts.
1222
1263
  *
1223
- * @internal
1264
+ * This module is intentionally split into small helpers to reduce cognitive complexity
1265
+ * and allow unit testing of tricky edge cases (window sizing, next-page advancement, etc.).
1224
1266
  */
1225
- const TOKEN_WITH_CAPTURE_REGEX = /\{\{(\w*):?(\w*)\}\}/g;
1267
/**
 * Builds a lookup from page ID to its index in `pageIds`.
 *
 * @param pageIds - Array of page IDs
 * @returns Map of page ID to index; for a repeated ID the last index wins
 */
const buildPageIdToIndexMap = (pageIds) => {
    const indexById = new Map();
    pageIds.forEach((pageId, position) => {
        indexById.set(pageId, position);
    });
    return indexById;
};
1268
/**
 * Pairs each page with its normalized content and indexes the result by page ID.
 *
 * @param pages - Pages, each with an `id`
 * @param normalizedContent - Normalized content strings, parallel to `pages`
 * @returns Map of page ID to `{ content, index, length }`
 */
const buildNormalizedPagesMap = (pages, normalizedContent) => new Map(
    pages.map((page, index) => {
        const content = normalizedContent[index];
        return [page.id, { content, index, length: content.length }];
    })
);
1280
/**
 * Computes the running character offset at which each page starts.
 *
 * @param pageIds - Array of page IDs in order
 * @param normalizedPages - Map of page ID to normalized page data
 * @returns Array of `pageIds.length + 1` offsets starting at 0; one separator
 * character is counted after every page except the last, and a page missing
 * from the map contributes length 0
 */
const buildCumulativeOffsets = (pageIds, normalizedPages) => {
    const offsets = [0];
    const lastIdx = pageIds.length - 1;
    let running = 0;
    pageIds.forEach((pageId, idx) => {
        const page = normalizedPages.get(pageId);
        if (page) {
            running += page.length;
        }
        if (idx < lastIdx) {
            running += 1;
        }
        offsets.push(running);
    });
    return offsets;
};
1291
/**
 * Reports whether any breakpoint excludes a page between two indices (inclusive).
 *
 * @param expandedBreakpoints - Breakpoints, each carrying an `excludeSet`
 * @param pageIds - Array of page IDs
 * @param fromIdx - Start index (inclusive)
 * @param toIdx - End index (inclusive)
 * @returns True if at least one breakpoint excludes a page in the range
 */
const hasAnyExclusionsInRange = (expandedBreakpoints, pageIds, fromIdx, toIdx) => {
    for (const { excludeSet } of expandedBreakpoints) {
        if (hasExcludedPageInRange(excludeSet, pageIds, fromIdx, toIdx)) {
            return true;
        }
    }
    return false;
};
1292
/**
 * Finds the last page index that still fits in a window of `maxPages` page IDs.
 *
 * @param currentFromIdx - Index of the first page in the window
 * @param toIdx - Highest index the window may reach
 * @param pageIds - Array of page IDs
 * @param maxPages - Maximum allowed page ID distance from the starting page
 * @returns Largest index reached by scanning forward from `currentFromIdx` while
 * page IDs stay within `pageIds[currentFromIdx] + maxPages`; never less than
 * `currentFromIdx`
 */
const computeWindowEndIdx = (currentFromIdx, toIdx, pageIds, maxPages) => {
    const limit = pageIds[currentFromIdx] + maxPages;
    let scan = currentFromIdx;
    while (scan <= toIdx && pageIds[scan] <= limit) {
        scan += 1;
    }
    // `scan` stopped one past the last fitting page (or never moved at all).
    return Math.max(currentFromIdx, scan - 1);
};
1299
/**
 * Measures the page ID distance between the current page and the last page.
 *
 * @param currentFromIdx - Index of the current starting page
 * @param toIdx - Index of the last page
 * @param pageIds - Array of page IDs
 * @returns `pageIds[toIdx] - pageIds[currentFromIdx]`
 */
const computeRemainingSpan = (currentFromIdx, toIdx, pageIds) => {
    const firstPageId = pageIds[currentFromIdx];
    const lastPageId = pageIds[toIdx];
    return lastPageId - firstPageId;
};
1300
/**
 * Creates the segment for whatever content remains at the end of processing.
 *
 * @param remainingContent - Remaining content
 * @param currentFromIdx - Index of the page the content starts on
 * @param toIdx - Index of the last page
 * @param pageIds - Array of page IDs
 * @param meta - Metadata to attach
 * @param includeMeta - Whether `meta` is passed through to the segment
 * @returns Whatever `createSegment` returns for these arguments
 */
const createFinalSegment = (remainingContent, currentFromIdx, toIdx, pageIds, meta, includeMeta) => {
    const fromPageId = pageIds[currentFromIdx];
    // The end page is passed only when the start and end indices differ.
    const toPageId = currentFromIdx !== toIdx ? pageIds[toIdx] : undefined;
    const segmentMeta = includeMeta ? meta : undefined;
    return createSegment(remainingContent, fromPageId, toPageId, segmentMeta);
};
1226
1301
  /**
1227
- * Regex pattern for simple token matching (no capture syntax).
1228
- *
1229
- * Matches only `{{token}}` format where token is one or more word characters.
1230
- * Used by `containsTokens()` for quick detection.
1302
+ * Computes the actual start and end page indices for a piece using
1303
+ * precomputed boundary positions and binary search.
1231
1304
  *
1232
- * @internal
1305
+ * @param pieceStartPos - Start position of the piece in the full segment content
1306
+ * @param pieceEndPos - End position (exclusive) of the piece
1307
+ * @param boundaryPositions - Precomputed boundary positions from buildBoundaryPositions
1308
+ * @param baseFromIdx - Base page index (boundaryPositions[0] corresponds to pageIds[baseFromIdx])
1309
+ * @param toIdx - Maximum page index
1310
+ * @returns Object with actualStartIdx and actualEndIdx
1233
1311
  */
1234
- const SIMPLE_TOKEN_REGEX = /\{\{(\w+)\}\}/g;
1312
/**
 * Resolves the first and last page indices covered by a piece of content.
 *
 * @param pieceStartPos - Start position of the piece in the segment content
 * @param pieceEndPos - End position (exclusive) of the piece
 * @param boundaryPositions - Precomputed page boundary positions
 * @param baseFromIdx - Page index that `boundaryPositions[0]` corresponds to
 * @param toIdx - Maximum page index
 * @returns Object with `actualStartIdx` and `actualEndIdx`
 */
const computePiecePages = (pieceStartPos, pieceEndPos, boundaryPositions, baseFromIdx, toIdx) => {
    const actualStartIdx = findPageIndexForPosition(pieceStartPos, boundaryPositions, baseFromIdx);
    // The end is exclusive, so look up its last character, never before the start.
    const lastCharPos = Math.max(pieceStartPos, pieceEndPos - 1);
    const endPageIdx = findPageIndexForPosition(lastCharPos, boundaryPositions, baseFromIdx);
    const actualEndIdx = Math.min(endPageIdx, toIdx);
    return { actualEndIdx, actualStartIdx };
};
1320
/**
 * Decides which page index the remaining content starts on after a split.
 *
 * @param remainingContent - Content left after the piece was cut off
 * @param actualEndIdx - Index of the page the previous piece ended on
 * @param toIdx - Index of the last page of the segment
 * @param pageIds - Array of page IDs
 * @param normalizedPages - Map of page ID to normalized page data
 * @returns `actualEndIdx + 1` when the remaining content and the next page share a
 * leading prefix (up to 30 characters), otherwise `actualEndIdx`
 */
const computeNextFromIdx = (remainingContent, actualEndIdx, toIdx, pageIds, normalizedPages) => {
    const followingIdx = actualEndIdx + 1;
    if (!remainingContent || followingIdx > toIdx) {
        return actualEndIdx;
    }
    const followingPage = normalizedPages.get(pageIds[followingIdx]);
    if (!followingPage) {
        return actualEndIdx;
    }
    const pagePrefix = followingPage.content.slice(0, Math.min(30, followingPage.length));
    if (!pagePrefix) {
        return actualEndIdx;
    }
    const remainingPrefix = remainingContent.trimStart().slice(0, 30);
    const startsOnFollowingPage = remainingContent.startsWith(pagePrefix)
        || followingPage.content.startsWith(remainingPrefix);
    return startsOnFollowingPage ? followingIdx : actualEndIdx;
};
1332
/**
 * Creates the segment for one piece produced by a breakpoint split.
 *
 * @param pieceContent - Content of the piece
 * @param actualStartIdx - Index of the page the piece starts on
 * @param actualEndIdx - Index of the page the piece ends on
 * @param pageIds - Array of page IDs
 * @param meta - Metadata to attach
 * @param includeMeta - Whether `meta` is passed through to the segment
 * @returns Whatever `createSegment` returns for these arguments
 */
const createPieceSegment = (pieceContent, actualStartIdx, actualEndIdx, pageIds, meta, includeMeta) => {
    const fromPageId = pageIds[actualStartIdx];
    // The end page is passed only when the piece ends on a later index.
    const toPageId = actualEndIdx > actualStartIdx ? pageIds[actualEndIdx] : undefined;
    const segmentMeta = includeMeta ? meta : undefined;
    return createSegment(pieceContent, fromPageId, toPageId, segmentMeta);
};
1235
1333
  /**
1236
- * Checks if a query string contains template tokens.
1237
- *
1238
- * Performs a quick test for `{{token}}` patterns without actually
1239
- * expanding them. Useful for determining whether to apply token
1240
- * expansion to a string.
1241
- *
1242
- * @param query - String to check for tokens
1243
- * @returns `true` if the string contains at least one `{{token}}` pattern
1334
+ * Finds the break offset within a window, trying exclusions first, then patterns.
1244
1335
  *
1245
- * @example
1246
- * containsTokens('{{raqms}} {{dash}}') // → true
1247
- * containsTokens('plain text') // → false
1248
- * containsTokens('[٠-٩]+ - ') // → false (raw regex, no tokens)
1336
+ * @returns Break offset relative to remainingContent, or windowEndPosition as fallback
1249
1337
  */
1250
- const containsTokens = (query) => {
1251
- SIMPLE_TOKEN_REGEX.lastIndex = 0;
1252
- return SIMPLE_TOKEN_REGEX.test(query);
1253
- };
1254
- const splitTemplateIntoSegments = (query) => {
1255
- const segments = [];
1256
- let lastIndex = 0;
1257
- TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
1258
- let match;
1259
- while ((match = TOKEN_WITH_CAPTURE_REGEX.exec(query)) !== null) {
1260
- if (match.index > lastIndex) segments.push({
1261
- type: "text",
1262
- value: query.slice(lastIndex, match.index)
1263
- });
1264
- segments.push({
1265
- type: "token",
1266
- value: match[0]
1267
- });
1268
- lastIndex = match.index + match[0].length;
1338
const findBreakOffsetForWindow = (remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer) => {
	// Exclusions take priority: the exclusion search only runs when the window
	// actually contains an excluded page.
	const windowHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, windowEndIdx);
	if (windowHasExclusions) {
		const exclusionOffset = findExclusionBreakPosition(currentFromIdx, windowEndIdx, toIdx, pageIds, expandedBreakpoints, cumulativeOffsets);
		if (exclusionOffset > 0) return exclusionOffset;
	}

	// Otherwise try the breakpoint patterns inside the window.
	const matchContext = {
		expandedBreakpoints,
		normalizedPages,
		pageIds,
		prefer
	};
	const patternOffset = findBreakPosition(remainingContent, currentFromIdx, toIdx, windowEndIdx, windowEndPosition, matchContext);
	if (patternOffset > 0) return patternOffset;

	// Nothing matched: fall back to the end of the window.
	return windowEndPosition;
};
1284
- const parseTokenLiteral = (literal) => {
1285
- TOKEN_WITH_CAPTURE_REGEX.lastIndex = 0;
1286
- const tokenMatch = TOKEN_WITH_CAPTURE_REGEX.exec(literal);
1287
- if (!tokenMatch) return null;
1288
- const [, tokenName, captureName] = tokenMatch;
1289
- return {
1290
- captureName,
1291
- tokenName
1292
- };
1351
/**
 * Returns the first position at or after `startPos` whose character is not
 * whitespace (as matched by `\s`).
 *
 * @param content - Text to scan
 * @param startPos - Position to start scanning from
 * @returns Position of the first non-whitespace character, `content.length`
 *          when only whitespace follows, or `startPos` unchanged when it is
 *          already at or past the end of `content`
 */
const skipWhitespace = (content, startPos) => {
	const isWhitespaceAt = (index) => /\s/.test(content[index]);
	let cursor = startPos;
	for (; cursor < content.length; cursor += 1) {
		if (!isWhitespaceAt(cursor)) break;
	}
	return cursor;
};
1294
- const createCaptureRegistry = (capturePrefix) => {
1295
- const captureNames = [];
1296
- const captureNameCounts = /* @__PURE__ */ new Map();
1297
- const register = (baseName) => {
1298
- const count = captureNameCounts.get(baseName) ?? 0;
1299
- captureNameCounts.set(baseName, count + 1);
1300
- const uniqueName = count === 0 ? baseName : `${baseName}_${count + 1}`;
1301
- const prefixedName = capturePrefix ? `${capturePrefix}${uniqueName}` : uniqueName;
1302
- captureNames.push(prefixedName);
1303
- return prefixedName;
1304
- };
1305
- return {
1306
- captureNames,
1307
- register
1308
- };
1359
/**
 * Breaks one segment into consecutive pieces by walking its content with a
 * cursor.
 *
 * Each pass either emits everything that is left as a single final segment
 * (when the remaining span is within `maxPages` and the remaining pages have
 * no exclusions) or picks a break offset inside the current page window and
 * emits the text in front of it. Page attribution of each piece is resolved
 * against boundary positions that are built once, before the loop starts.
 *
 * The include-meta flag passed to the segment builders is `true` only during
 * the first pass. The loop is capped at 10,000 passes.
 *
 * @param segment - Segment to split; `content` and `meta` are read
 * @param fromIdx - Page index the segment starts on
 * @param toIdx - Page index the segment ends on
 * @param pageIds - Page IDs, indexed by page index
 * @param normalizedPages - Map from page ID to normalized page data
 * @param cumulativeOffsets - Cumulative page offsets, forwarded to the helpers
 * @param expandedBreakpoints - Breakpoints after expansion
 * @param maxPages - Maximum span a piece may cover
 * @param prefer - Break preference, forwarded to the break search
 * @param logger - Optional logger; `debug` and `trace` are called when present
 * @returns The emitted pieces, in content order
 */
const processOversizedSegment = (segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger) => {
	const MAX_PASSES = 1e4;
	const { content: fullContent } = segment;
	const pieces = [];

	const boundaryPositions = buildBoundaryPositions(fullContent, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
	logger?.debug?.("[breakpoints] boundaryPositions built", {
		boundaryPositions,
		fromIdx,
		fullContentLength: fullContent.length,
		toIdx
	});

	let cursorPos = 0;
	let currentFromIdx = fromIdx;
	let pass = 0;

	while (pass < MAX_PASSES && cursorPos < fullContent.length && currentFromIdx <= toIdx) {
		// No pass is ever skipped, so "first piece" is exactly "first pass".
		const isFirstPiece = pass === 0;

		const remainingContent = fullContent.slice(cursorPos);
		if (!remainingContent.trim()) break;

		// What is left may already fit: emit it whole and stop.
		const remainingSpan = computeRemainingSpan(currentFromIdx, toIdx, pageIds);
		const remainingHasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, currentFromIdx, toIdx);
		if (remainingSpan <= maxPages && !remainingHasExclusions) {
			const finalSeg = createFinalSegment(remainingContent, currentFromIdx, toIdx, pageIds, segment.meta, isFirstPiece);
			if (finalSeg) pieces.push(finalSeg);
			break;
		}

		// Otherwise find where to cut inside the current page window.
		const windowEndIdx = computeWindowEndIdx(currentFromIdx, toIdx, pageIds, maxPages);
		const windowEndPosition = findBreakpointWindowEndPosition(remainingContent, currentFromIdx, windowEndIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets);
		logger?.debug?.(`[breakpoints] iteration=${pass}`, {
			currentFromIdx,
			cursorPos,
			windowEndIdx
		});

		const breakOffset = findBreakOffsetForWindow(remainingContent, currentFromIdx, windowEndIdx, toIdx, windowEndPosition, pageIds, expandedBreakpoints, cumulativeOffsets, normalizedPages, prefer);
		// The offset is relative to remainingContent; convert it to a position in fullContent.
		const breakPos = cursorPos + breakOffset;
		const pieceContent = fullContent.slice(cursorPos, breakPos).trim();

		// Attribution is always computed against the segment's original start page.
		const { actualEndIdx, actualStartIdx } = computePiecePages(cursorPos, breakPos, boundaryPositions, fromIdx, toIdx);
		logger?.trace?.("[breakpoints] piece", {
			actualEndIdx,
			actualStartIdx,
			pieceLength: pieceContent.length
		});

		if (pieceContent) {
			const pieceSeg = createPieceSegment(pieceContent, actualStartIdx, actualEndIdx, pageIds, segment.meta, isFirstPiece);
			if (pieceSeg) pieces.push(pieceSeg);
		}

		// Continue after the break, skipping whitespace between pieces.
		cursorPos = skipWhitespace(fullContent, breakPos);
		currentFromIdx = computeNextFromIdx(fullContent.slice(cursorPos), actualEndIdx, toIdx, pageIds, normalizedPages);
		pass += 1;
	}

	logger?.debug?.("[breakpoints] done", { resultCount: pieces.length });
	return pieces;
};
1310
- const expandTokenLiteral = (literal, opts) => {
1311
- const parsed = parseTokenLiteral(literal);
1312
- if (!parsed) return literal;
1313
- const { tokenName, captureName } = parsed;
1314
- if (!tokenName && captureName) return `(?<${opts.registerCapture(captureName)}>.+)`;
1315
- let tokenPattern = TOKEN_PATTERNS[tokenName];
1316
- if (!tokenPattern) return literal;
1317
- tokenPattern = maybeApplyFuzzyToTokenPattern(tokenPattern, opts.fuzzyTransform);
1318
- if (captureName) return `(?<${opts.registerCapture(captureName)}>${tokenPattern})`;
1319
- return tokenPattern;
1416
/**
 * Runs breakpoint processing over a list of segments.
 *
 * Segments whose page-ID span is within `maxPages` and whose page range has
 * no exclusions are passed through untouched. Every other segment is handed
 * to `processOversizedSegment`, and each resulting piece that spans more than
 * one page has its content rewritten by `applyPageJoinerBetweenPages`.
 *
 * Note: This is an internal engine used by `segmentPages()`.
 *
 * @param segments - Segments to process; `from`, `to` and `content` are read
 * @param pages - Source pages; `id` is read
 * @param normalizedContent - Normalized page content, forwarded to `buildNormalizedPagesMap`
 * @param maxPages - Maximum span a segment may cover before it is split
 * @param breakpoints - Breakpoint definitions, expanded via `expandBreakpoints`
 * @param prefer - Break preference, forwarded to the splitter
 * @param patternProcessor - Pattern processor, forwarded to `expandBreakpoints`
 * @param logger - Optional logger; `info` and `debug` are called when present
 * @param pageJoiner - Joiner forwarded to `applyPageJoinerBetweenPages` (default `"space"`)
 * @returns The resulting segments, in input order
 */
const applyBreakpoints = (segments, pages, normalizedContent, maxPages, breakpoints, prefer, patternProcessor, logger, pageJoiner = "space") => {
	const pageIds = pages.map((page) => page.id);
	const pageIdToIndex = buildPageIdToIndexMap(pageIds);
	const normalizedPages = buildNormalizedPagesMap(pages, normalizedContent);
	const cumulativeOffsets = buildCumulativeOffsets(pageIds, normalizedPages);
	const expandedBreakpoints = expandBreakpoints(breakpoints, patternProcessor);

	// Resolves a segment's from/to page IDs to indices; an unknown `from` maps to
	// -1 and a missing or unknown `to` falls back to the start index.
	const resolveIndexRange = (seg) => {
		const startIdx = pageIdToIndex.get(seg.from) ?? -1;
		const endIdx = seg.to !== undefined ? pageIdToIndex.get(seg.to) ?? startIdx : startIdx;
		return [startIdx, endIdx];
	};

	// Re-applies the page joiner to pieces that cover more than one page.
	const rejoinAcrossPages = (piece) => {
		const [pieceFromIdx, pieceToIdx] = resolveIndexRange(piece);
		if (pieceFromIdx < 0 || pieceToIdx <= pieceFromIdx) return piece;
		return {
			...piece,
			content: applyPageJoinerBetweenPages(piece.content, pieceFromIdx, pieceToIdx, pageIds, normalizedPages, pageJoiner)
		};
	};

	logger?.info?.("Starting breakpoint processing", {
		maxPages,
		segmentCount: segments.length
	});
	logger?.debug?.("[breakpoints] inputSegments", {
		segmentCount: segments.length,
		segments: segments.map((seg) => ({
			contentLength: seg.content.length,
			from: seg.from,
			to: seg.to
		}))
	});

	const output = [];
	for (const segment of segments) {
		const [fromIdx, toIdx] = resolveIndexRange(segment);
		// The span is measured on page IDs, not on page indices.
		const segmentSpan = (segment.to ?? segment.from) - segment.from;
		const hasExclusions = hasAnyExclusionsInRange(expandedBreakpoints, pageIds, fromIdx, toIdx);

		if (segmentSpan <= maxPages && !hasExclusions) {
			output.push(segment);
			continue;
		}

		const pieces = processOversizedSegment(segment, fromIdx, toIdx, pageIds, normalizedPages, cumulativeOffsets, expandedBreakpoints, maxPages, prefer, logger);
		for (const piece of pieces.map(rejoinAcrossPages)) output.push(piece);
	}

	logger?.info?.("Breakpoint processing completed", { resultCount: output.length });
	return output;
};
1463
+
1464
+ //#endregion
1465
+ //#region src/segmentation/match-utils.ts
1321
1466
  /**
1322
- * Expands template tokens with support for named captures.
1467
+ * Utility functions for regex matching and result processing.
1323
1468
  *
1324
- * This is the primary token expansion function that handles all token syntax:
1325
- * - `{{token}}` Expands to the token's pattern (no capture group)
1326
- * - `{{token:name}}` Expands to `(?<name>pattern)` (named capture)
1327
- * - `{{:name}}` → Expands to `(?<name>.+)` (capture anything)
1469
+ * These functions were extracted from `segmenter.ts` to reduce complexity
1470
+ * and enable independent testing. They handle match filtering, capture
1471
+ * extraction, and occurrence-based selection.
1328
1472
  *
1329
- * Unknown tokens are left as-is in the output, allowing for partial templates.
1473
+ * @module match-utils
1474
+ */
1475
+ /**
1476
+ * Extracts named capture groups from a regex match.
1330
1477
  *
1331
- * @param query - The template string containing tokens
1332
- * @param fuzzyTransform - Optional function to transform Arabic text for fuzzy matching.
1333
- * Applied to both token patterns and plain Arabic text between tokens.
1334
- * Typically `makeDiacriticInsensitive` from the fuzzy module.
1335
- * @returns Object with expanded pattern, capture names, and capture flag
1478
+ * Only includes groups that are in the `captureNames` list and have
1479
+ * defined values. This filters out positional captures and ensures
1480
+ * only explicitly requested named captures are returned.
1336
1481
  *
1337
- * @example
1338
- * // Simple token expansion
1339
- * expandTokensWithCaptures('{{raqms}} {{dash}}')
1340
- * // → { pattern: '[\\u0660-\\u0669]+ [-–—ـ]', captureNames: [], hasCaptures: false }
1482
+ * @param groups - The `match.groups` object from `RegExp.exec()`
1483
+ * @param captureNames - List of capture names to extract (from `{{token:name}}` syntax)
1484
+ * @returns Object with capture name → value pairs, or `undefined` if none found
1341
1485
  *
1342
1486
  * @example
1343
- * // Named capture
1344
- * expandTokensWithCaptures('{{raqms:num}} {{dash}}')
1345
- * // → { pattern: '(?<num>[\\u0660-\\u0669]+) [-–—ـ]', captureNames: ['num'], hasCaptures: true }
1487
+ * const match = /(?<num>[٠-٩]+) -/.exec('٦٦٩٦ - text');
1488
+ * extractNamedCaptures(match.groups, ['num'])
1489
+ * // → { num: '٦٦٩٦' }
1346
1490
  *
1347
1491
  * @example
1348
- * // Capture-only token
1349
- * expandTokensWithCaptures('{{raqms:num}} {{dash}} {{:content}}')
1350
- * // → { pattern: '(?<num>[٠-٩]+) [-–—ـ] (?<content>.+)', captureNames: ['num', 'content'], hasCaptures: true }
1492
+ * // No matching captures
1493
+ * extractNamedCaptures({}, ['num'])
1494
+ * // → undefined
1351
1495
  *
1352
1496
  * @example
1353
- * // With fuzzy transform
1354
- * expandTokensWithCaptures('{{bab}}', makeDiacriticInsensitive)
1355
- * // → { pattern: 'بَ?ا?بٌ?', captureNames: [], hasCaptures: false }
1497
+ * // Undefined groups
1498
+ * extractNamedCaptures(undefined, ['num'])
1499
+ * // → undefined
1356
1500
  */
1357
- const expandTokensWithCaptures = (query, fuzzyTransform, capturePrefix) => {
1358
- const segments = splitTemplateIntoSegments(query);
1359
- const registry = createCaptureRegistry(capturePrefix);
1360
- const processedParts = segments.map((segment) => {
1361
- if (segment.type === "text") return maybeApplyFuzzyToText(segment.value, fuzzyTransform);
1362
- return expandTokenLiteral(segment.value, {
1363
- capturePrefix,
1364
- fuzzyTransform,
1365
- registerCapture: registry.register
1366
- });
1367
- });
1368
- return {
1369
- captureNames: registry.captureNames,
1370
- hasCaptures: registry.captureNames.length > 0,
1371
- pattern: processedParts.join("")
1372
- };
1501
const extractNamedCaptures = (groups, captureNames) => {
	// Nothing to extract without a groups object or without requested names.
	if (!groups) return undefined;
	if (captureNames.length === 0) return undefined;

	const collected = {};
	for (let i = 0; i < captureNames.length; i += 1) {
		const name = captureNames[i];
		const value = groups[name];
		// Groups that did not participate in the match are left out.
		if (value !== undefined) collected[name] = value;
	}

	// An empty result is reported as undefined rather than as {}.
	if (Object.keys(collected).length === 0) return undefined;
	return collected;
};
1374
1507
  /**
1375
- * Expands template tokens in a query string to their regex equivalents.
1376
- *
1377
- * This is the simple version without capture support. It returns only the
1378
- * expanded pattern string, not capture metadata.
1508
+ * Gets the last defined positional capture group from a match array.
1379
1509
  *
1380
- * Unknown tokens are left as-is, allowing for partial templates.
1510
+ * Used for `lineStartsAfter` patterns where the content capture (`.*`)
1511
+ * is always at the end of the pattern. Named captures may shift the
1512
+ * positional indices, so we iterate backward to find the actual content.
1381
1513
  *
1382
- * @param query - Template string containing `{{token}}` placeholders
1383
- * @returns Expanded regex pattern string
1514
+ * @param match - RegExp exec result array
1515
+ * @returns The last defined capture group value, or `undefined` if none
1384
1516
  *
1385
1517
  * @example
1386
- * expandTokens('، {{raqms}}') // [\\u0660-\\u0669]+'
1387
- * expandTokens('{{raqm}}*') // '[\\u0660-\\u0669]*'
1388
- * expandTokens('{{dash}}{{raqm}}') // → '[-–—ـ][\\u0660-\\u0669]'
1389
- * expandTokens('{{unknown}}') // → '{{unknown}}' (left as-is)
1518
+ * // Pattern: ^(?:(?<num>[٠-٩]+) - )(.*)
1519
+ * // Match array: ['٦٦٩٦ - content', '٦٦٩٦', 'content']
1520
+ * getLastPositionalCapture(match)
1521
+ * // → 'content'
1390
1522
  *
1391
- * @see expandTokensWithCaptures for full capture group support
1523
+ * @example
1524
+ * // No captures
1525
+ * getLastPositionalCapture(['full match'])
1526
+ * // → undefined
1392
1527
  */
1393
- const expandTokens = (query) => expandTokensWithCaptures(query).pattern;
1528
const getLastPositionalCapture = (match) => {
	// Walk backward from the final group; index 0 is the full match and is
	// never considered a capture.
	let index = match.length - 1;
	while (index >= 1) {
		const value = match[index];
		if (value !== undefined) return value;
		index -= 1;
	}
	return undefined;
};
1394
1532
  /**
1395
- * Converts a template string to a compiled RegExp.
1396
- *
1397
- * Expands all tokens and attempts to compile the result as a RegExp
1398
- * with Unicode flag. Returns `null` if the resulting pattern is invalid.
1533
+ * Filters matches to only include those within page ID constraints.
1399
1534
  *
1400
- * @remarks
1401
- * This function dynamically compiles regular expressions from template strings.
1402
- * If templates may come from untrusted sources, be aware of potential ReDoS
1403
- * (Regular Expression Denial of Service) risks due to catastrophic backtracking.
1404
- * Consider validating pattern complexity or applying execution timeouts when
1405
- * running user-submitted patterns.
1535
+ * Applies the `min`, `max`, and `exclude` constraints from a rule to filter out
1536
+ * matches that occur on pages outside the allowed range or explicitly excluded.
1406
1537
  *
1407
- * @param template - Template string containing `{{token}}` placeholders
1408
- * @returns Compiled RegExp with 'u' flag, or `null` if invalid
1538
+ * @param matches - Array of match results to filter
1539
+ * @param rule - Rule containing `min`, `max`, and/or `exclude` page constraints
1540
+ * @param getId - Function that returns the page ID for a given offset
1541
+ * @returns Filtered array containing only matches within constraints
1409
1542
  *
1410
1543
  * @example
1411
- * templateToRegex('، {{raqms}}') // [٠-٩]+/u
1412
- * templateToRegex('{{raqms}}+') // /[٠-٩]++/u (might be invalid in some engines)
1413
- * templateToRegex('(((') // null (invalid regex)
1544
+ * const matches = [
1545
+ * { start: 0, end: 10 }, // Page 1
1546
+ * { start: 100, end: 110 }, // Page 5
1547
+ * { start: 200, end: 210 }, // Page 10
1548
+ * ];
1549
+ * filterByConstraints(matches, { min: 3, max: 8 }, getId)
1550
+ * // → [{ start: 100, end: 110 }] (only page 5 match)
1414
1551
  */
1415
- const templateToRegex = (template) => {
1416
- const expanded = expandTokens(template);
1417
- try {
1418
- return new RegExp(expanded, "u");
1419
- } catch {
1420
- return null;
1421
- }
1552
const filterByConstraints = (matches, rule, getId) => {
	// A match is judged by the page on which it starts.
	const isWithinConstraints = (match) => {
		const pageId = getId(match.start);
		const rejected =
			(rule.min !== undefined && pageId < rule.min) ||
			(rule.max !== undefined && pageId > rule.max) ||
			isPageExcluded(pageId, rule.exclude);
		return !rejected;
	};
	return matches.filter(isWithinConstraints);
};
1423
1561
  /**
1424
- * Lists all available token names defined in `TOKEN_PATTERNS`.
1562
+ * Checks if any rule in the list allows the given page ID.
1425
1563
  *
1426
- * Useful for documentation, validation, or building user interfaces
1427
- * that show available tokens.
1564
+ * A rule allows an ID if it falls within the rule's `min`/`max` constraints.
1565
+ * Rules without constraints allow all page IDs.
1428
1566
  *
1429
- * @returns Array of token names (e.g., `['bab', 'basmala', 'bullet', ...]`)
1567
+ * This is used to determine whether to create a segment for content
1568
+ * that appears before any split points (the "first segment").
1430
1569
  *
1431
- * @example
1432
- * getAvailableTokens()
1433
- * // ['bab', 'basmala', 'bullet', 'dash', 'harf', 'kitab', 'naql', 'raqm', 'raqms']
1434
- */
1435
- const getAvailableTokens = () => Object.keys(TOKEN_PATTERNS);
1436
- /**
1437
- * Gets the regex pattern for a specific token name.
1570
+ * @param rules - Array of rules with optional `min` and `max` constraints
1571
+ * @param pageId - Page ID to check
1572
+ * @returns `true` if at least one rule allows the page ID
1438
1573
  *
1439
- * Returns the raw pattern string as defined in `TOKEN_PATTERNS`,
1440
- * without any expansion or capture group wrapping.
1574
+ * @example
1575
+ * const rules = [
1576
+ * { min: 5, max: 10 }, // Allows pages 5-10
1577
+ * { min: 20 }, // Allows pages 20+
1578
+ * ];
1441
1579
  *
1442
- * @param tokenName - The token name to look up (e.g., 'raqms', 'dash')
1443
- * @returns The regex pattern string, or `undefined` if token doesn't exist
1580
+ * anyRuleAllowsId(rules, 7) // true (first rule allows)
1581
+ * anyRuleAllowsId(rules, 3) // false (no rule allows)
1582
+ * anyRuleAllowsId(rules, 25) // → true (second rule allows)
1444
1583
  *
1445
1584
  * @example
1446
- * getTokenPattern('raqms') // '[\\u0660-\\u0669]+'
1447
- * getTokenPattern('dash') // → '[-–—ـ]'
1448
- * getTokenPattern('unknown') // → undefined
1585
+ * // Rules without constraints allow everything
1586
+ * anyRuleAllowsId([{}], 999) // → true
1449
1587
  */
1450
- const getTokenPattern = (tokenName) => TOKEN_PATTERNS[tokenName];
1588
const anyRuleAllowsId = (rules, pageId) => {
	// A bound that is not set imposes no restriction.
	const allowsPage = ({ min, max }) => {
		if (min !== undefined && !(pageId >= min)) return false;
		return max === undefined || pageId <= max;
	};
	return rules.some(allowsPage);
};
1451
1595
 
1452
1596
  //#endregion
1453
1597
  //#region src/segmentation/rule-regex.ts
@@ -1548,7 +1692,12 @@ const determineUsesCapture = (regexSource, _captureNames) => hasCapturingGroup(r
1548
1692
  */
1549
1693
  const buildRuleRegex = (rule, capturePrefix) => {
1550
1694
  const s = { ...rule };
1551
- const fuzzy = rule.fuzzy ?? false;
1695
+ const allPatterns = [
1696
+ ...s.lineStartsWith ?? [],
1697
+ ...s.lineStartsAfter ?? [],
1698
+ ...s.lineEndsWith ?? []
1699
+ ];
1700
+ const fuzzy = rule.fuzzy ?? shouldDefaultToFuzzy(allPatterns);
1552
1701
  let allCaptureNames = [];
1553
1702
  if (s.lineStartsAfter?.length) {
1554
1703
  const { regex, captureNames } = buildLineStartsAfterRegexSource(s.lineStartsAfter, fuzzy, capturePrefix);
@@ -1836,6 +1985,7 @@ const normalizeLineEndings = (content) => {
1836
1985
  *
1837
1986
  * @module segmenter
1838
1987
  */
1988
+ const MAX_REGEX_ITERATIONS = 1e5;
1839
1989
  /**
1840
1990
  * Builds a concatenated content string and page mapping from input pages.
1841
1991
  *
@@ -1943,9 +2093,18 @@ const ensureFallbackSegment = (segments, pages, normalizedContent, pageJoiner) =
1943
2093
  if (lastPage.id !== firstPage.id) initialSeg.to = lastPage.id;
1944
2094
  return [initialSeg];
1945
2095
  };
1946
- const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
2096
+ const collectSplitPointsFromRules = (rules, matchContent, pageMap, logger) => {
2097
+ logger?.debug?.("[segmenter] collecting split points from rules", {
2098
+ contentLength: matchContent.length,
2099
+ ruleCount: rules.length
2100
+ });
1947
2101
  const passesPageStartGuard = createPageStartGuardChecker(matchContent, pageMap);
1948
2102
  const { combinableRules, fastFuzzyRules, standaloneRules } = partitionRulesForMatching(rules);
2103
+ logger?.debug?.("[segmenter] rules partitioned", {
2104
+ combinableCount: combinableRules.length,
2105
+ fastFuzzyCount: fastFuzzyRules.length,
2106
+ standaloneCount: standaloneRules.length
2107
+ });
1949
2108
  const splitPointsByRule = collectFastFuzzySplitPoints(matchContent, pageMap, fastFuzzyRules, passesPageStartGuard);
1950
2109
  if (combinableRules.length > 0) {
1951
2110
  const ruleRegexes = combinableRules.map(({ rule, prefix }) => {
@@ -1958,9 +2117,22 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
1958
2117
  });
1959
2118
  const combinedSource = ruleRegexes.map((r) => r.source).join("|");
1960
2119
  const combinedRegex = new RegExp(combinedSource, "gm");
2120
+ logger?.debug?.("[segmenter] combined regex built", {
2121
+ combinableRuleCount: combinableRules.length,
2122
+ combinedSourceLength: combinedSource.length
2123
+ });
1961
2124
  combinedRegex.lastIndex = 0;
1962
2125
  let m = combinedRegex.exec(matchContent);
2126
+ let iterationCount = 0;
1963
2127
  while (m !== null) {
2128
+ iterationCount++;
2129
+ if (iterationCount > MAX_REGEX_ITERATIONS) throw new Error(`[segmenter] Possible infinite loop detected: regex matching exceeded ${MAX_REGEX_ITERATIONS} iterations. Last match at position ${m.index} (length ${m[0].length}). Check for patterns that may match empty strings or cause catastrophic backtracking.`);
2130
+ if (iterationCount % 1e4 === 0) logger?.warn?.("[segmenter] high iteration count in regex loop", {
2131
+ iterationCount,
2132
+ lastIndex: combinedRegex.lastIndex,
2133
+ matchLength: m[0].length,
2134
+ matchPosition: m.index
2135
+ });
1964
2136
  const matchedRuleIndex = combinableRules.findIndex(({ prefix }) => m?.groups?.[prefix] !== void 0);
1965
2137
  if (matchedRuleIndex !== -1) {
1966
2138
  const { rule, prefix, index: originalIndex } = combinableRules[matchedRuleIndex];
@@ -1981,8 +2153,7 @@ const collectSplitPointsFromRules = (rules, matchContent, pageMap) => {
1981
2153
  const start = m.index;
1982
2154
  const end = m.index + m[0].length;
1983
2155
  const pageId = pageMap.getId(start);
1984
- if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude)) {
1985
- if (!passesPageStartGuard(rule, originalIndex, start)) continue;
2156
+ if ((rule.min === void 0 || pageId >= rule.min) && (rule.max === void 0 || pageId <= rule.max) && !isPageExcluded(pageId, rule.exclude) && passesPageStartGuard(rule, originalIndex, start)) {
1986
2157
  const sp = {
1987
2158
  capturedContent: void 0,
1988
2159
  contentStartOffset,
@@ -2168,7 +2339,7 @@ const segmentPages = (pages, options) => {
2168
2339
  pageIds: pageMap.pageIds,
2169
2340
  totalContentLength: matchContent.length
2170
2341
  });
2171
- const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap);
2342
+ const splitPoints = collectSplitPointsFromRules(rules, matchContent, pageMap, logger);
2172
2343
  const unique = dedupeSplitPoints(splitPoints);
2173
2344
  logger?.debug?.("[segmenter] split points collected", {
2174
2345
  rawSplitPoints: splitPoints.length,
@@ -2660,5 +2831,5 @@ const analyzeTextForRule = (text) => {
2660
2831
  };
2661
2832
 
2662
2833
  //#endregion
2663
- export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex };
2834
+ export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
2664
2835
  //# sourceMappingURL=index.mjs.map